From b536d78b574e95d76535ba9860665b10dda1651c Mon Sep 17 00:00:00 2001 From: Cyber MacGeddon Date: Wed, 20 Nov 2024 19:55:05 +0000 Subject: [PATCH 01/37] Prepare for 0.16: Change Python dep restrictions and Gitlab merge criteria --- .github/workflows/release.yaml | 2 +- trustgraph-bedrock/setup.py | 2 +- trustgraph-cli/setup.py | 2 +- trustgraph-embeddings-hf/setup.py | 4 ++-- trustgraph-flow/setup.py | 2 +- trustgraph-parquet/setup.py | 2 +- trustgraph-vertexai/setup.py | 2 +- trustgraph/setup.py | 14 +++++++------- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ca3b735..0d6d2d29 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -5,7 +5,7 @@ on: workflow_dispatch: push: tags: - - v0.15.* + - v0.16.* permissions: contents: read diff --git a/trustgraph-bedrock/setup.py b/trustgraph-bedrock/setup.py index 80cee09c..1a99e227 100644 --- a/trustgraph-bedrock/setup.py +++ b/trustgraph-bedrock/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.16,<0.17", "pulsar-client", "prometheus-client", "boto3", diff --git a/trustgraph-cli/setup.py b/trustgraph-cli/setup.py index 651fdc27..ec541c8b 100644 --- a/trustgraph-cli/setup.py +++ b/trustgraph-cli/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.16,<0.17", "requests", "pulsar-client", "rdflib", diff --git a/trustgraph-embeddings-hf/setup.py b/trustgraph-embeddings-hf/setup.py index ad01667f..2fbe079e 100644 --- a/trustgraph-embeddings-hf/setup.py +++ b/trustgraph-embeddings-hf/setup.py @@ -34,8 +34,8 @@ 
setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", - "trustgraph-flow>=0.15,<0.16", + "trustgraph-base>=0.16,<0.17", + "trustgraph-flow>=0.16,<0.17", "torch", "urllib3", "transformers", diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 8b46b2d2..8aeb7ce2 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.16,<0.17", "urllib3", "rdflib", "pymilvus", diff --git a/trustgraph-parquet/setup.py b/trustgraph-parquet/setup.py index 668cde1c..7dab60ac 100644 --- a/trustgraph-parquet/setup.py +++ b/trustgraph-parquet/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.16,<0.17", "pulsar-client", "prometheus-client", "pyarrow", diff --git a/trustgraph-vertexai/setup.py b/trustgraph-vertexai/setup.py index 0cdc3a97..d19e8c0d 100644 --- a/trustgraph-vertexai/setup.py +++ b/trustgraph-vertexai/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.16,<0.17", "pulsar-client", "google-cloud-aiplatform", "prometheus-client", diff --git a/trustgraph/setup.py b/trustgraph/setup.py index 8e50aed5..7bb8dfd3 100644 --- a/trustgraph/setup.py +++ b/trustgraph/setup.py @@ -34,13 +34,13 @@ setuptools.setup( python_requires='>=3.8', download_url = 
"https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", - "trustgraph-bedrock>=0.15,<0.16", - "trustgraph-cli>=0.15,<0.16", - "trustgraph-embeddings-hf>=0.15,<0.16", - "trustgraph-flow>=0.15,<0.16", - "trustgraph-parquet>=0.15,<0.16", - "trustgraph-vertexai>=0.15,<0.16", + "trustgraph-base>=0.16,<0.17", + "trustgraph-bedrock>=0.16,<0.17", + "trustgraph-cli>=0.16,<0.17", + "trustgraph-embeddings-hf>=0.16,<0.17", + "trustgraph-flow>=0.16,<0.17", + "trustgraph-parquet>=0.16,<0.17", + "trustgraph-vertexai>=0.16,<0.17", ], scripts=[ ] From 92b84441eb8cd5b4afffe34fe800c0744256cd37 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Wed, 20 Nov 2024 19:55:40 +0000 Subject: [PATCH 02/37] Feature/api gateway (#164) * Bare bones API gateway * Working for LLM + prompt * RAG query works * Triples query * Added agent API * Embeddings API * Put API tests in a subdir --- test-api/test-agent-api | 28 ++ test-api/test-embeddings-api | 25 ++ test-api/test-graph-rag-api | 31 ++ test-api/test-llm-api | 31 ++ test-api/test-prompt-api | 38 ++ test-api/test-prompt2-api | 39 ++ test-api/test-triples-query-api | 35 ++ trustgraph-flow/scripts/api-gateway | 540 ++++++++++++++++++++++++++++ trustgraph-flow/setup.py | 1 + 9 files changed, 768 insertions(+) create mode 100755 test-api/test-agent-api create mode 100755 test-api/test-embeddings-api create mode 100755 test-api/test-graph-rag-api create mode 100755 test-api/test-llm-api create mode 100755 test-api/test-prompt-api create mode 100755 test-api/test-prompt2-api create mode 100755 test-api/test-triples-query-api create mode 100755 trustgraph-flow/scripts/api-gateway diff --git a/test-api/test-agent-api b/test-api/test-agent-api new file mode 100755 index 00000000..f36ba196 --- /dev/null +++ b/test-api/test-agent-api @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + 
+############################################################################ + +input = { + "question": "What is the highest risk aspect of running a space shuttle program? Provide 5 detailed reasons to justify our answer.", +} + +resp = requests.post( + f"{url}agent", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["answer"]) + + diff --git a/test-api/test-embeddings-api b/test-api/test-embeddings-api new file mode 100755 index 00000000..ef9ea099 --- /dev/null +++ b/test-api/test-embeddings-api @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "text": "What is the highest risk aspect of running a space shuttle program? Provide 5 detailed reasons to justify our answer.", +} + +resp = requests.post( + f"{url}embeddings", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + diff --git a/test-api/test-graph-rag-api b/test-api/test-graph-rag-api new file mode 100755 index 00000000..c329934c --- /dev/null +++ b/test-api/test-graph-rag-api @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "query": "Give me 10 facts", +} + +resp = requests.post( + f"{url}graph-rag", + json=input, +) + +resp = resp.json() + +print(resp) +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["response"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-llm-api b/test-api/test-llm-api new file mode 100755 index 00000000..c33c6634 --- /dev/null +++ b/test-api/test-llm-api @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +import 
requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "system": "Respond in French. Use long word, form of numbers, no digits", +# "prompt": "Add 2 and 12" + "prompt": "Add 12 and 14, and then make a poem about llamas which incorporates that number. Then write a joke about llamas" +} + +resp = requests.post( + f"{url}text-completion", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["response"]) + +############################################################################ + diff --git a/test-api/test-prompt-api b/test-api/test-prompt-api new file mode 100755 index 00000000..1005bc90 --- /dev/null +++ b/test-api/test-prompt-api @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "id": "question", + "variables": { + "question": "Write a joke about llamas." + } +} + +resp = requests.post( + f"{url}prompt", + json=input, +) + +resp = resp.json() + +print(resp) +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +if "object" in resp: + print(f"Object: {resp['object']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-prompt2-api b/test-api/test-prompt2-api new file mode 100755 index 00000000..f1b80c48 --- /dev/null +++ b/test-api/test-prompt2-api @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "id": "extract-definitions", + "variables": { + "text": "A cat is a large mammal." 
+ } +} + +resp = requests.post( + f"{url}prompt", + json=input, +) + +resp = resp.json() + +print(resp) +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +if "object" in resp: + object = json.loads(resp["object"]) + print(json.dumps(object, indent=4)) + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-triples-query-api b/test-api/test-triples-query-api new file mode 100755 index 00000000..e2895a28 --- /dev/null +++ b/test-api/test-triples-query-api @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "p": "http://www.w3.org/2000/01/rdf-schema#label", + "limit": 10 +} + +resp = requests.post( + f"{url}triples-query", + json=input, +) + +print(resp.text) +resp = resp.json() + + +print(resp) +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["response"]) + +sys.exit(0) + +############################################################################ + diff --git a/trustgraph-flow/scripts/api-gateway b/trustgraph-flow/scripts/api-gateway new file mode 100755 index 00000000..748b5c7d --- /dev/null +++ b/trustgraph-flow/scripts/api-gateway @@ -0,0 +1,540 @@ +#!/usr/bin/env python3 + +import asyncio +from aiohttp import web +import json +import logging +import uuid + +import pulsar +from pulsar.asyncio import Client +from pulsar.schema import JsonSchema +import _pulsar +import aiopulsar + +from trustgraph.clients.llm_client import LlmClient +from trustgraph.clients.prompt_client import PromptClient + +from trustgraph.schema import TextCompletionRequest, TextCompletionResponse +from trustgraph.schema import text_completion_request_queue +from trustgraph.schema import text_completion_response_queue + +from trustgraph.schema import PromptRequest, 
PromptResponse +from trustgraph.schema import prompt_request_queue +from trustgraph.schema import prompt_response_queue + +from trustgraph.schema import GraphRagQuery, GraphRagResponse +from trustgraph.schema import graph_rag_request_queue +from trustgraph.schema import graph_rag_response_queue + +from trustgraph.schema import TriplesQueryRequest, TriplesQueryResponse, Value +from trustgraph.schema import triples_request_queue +from trustgraph.schema import triples_response_queue + +from trustgraph.schema import AgentRequest, AgentResponse +from trustgraph.schema import agent_request_queue +from trustgraph.schema import agent_response_queue + +from trustgraph.schema import EmbeddingsRequest, EmbeddingsResponse +from trustgraph.schema import embeddings_request_queue +from trustgraph.schema import embeddings_response_queue + +logger = logging.getLogger("api") +logger.setLevel(logging.INFO) + +pulsar_host = "pulsar://localhost:6650" +TIME_OUT = 600 + +class Publisher: + + def __init__(self, pulsar_host, topic, schema=None, max_size=10): + self.pulsar_host = pulsar_host + self.topic = topic + self.schema = schema + self.q = asyncio.Queue(maxsize=max_size) + + async def run(self): + async with aiopulsar.connect(self.pulsar_host) as client: + async with client.create_producer( + topic=self.topic, + schema=self.schema, + ) as producer: + while True: + id, item = await self.q.get() + await producer.send(item, { "id": id }) +# print("message out") + + async def send(self, id, msg): + await self.q.put((id, msg)) + +class Subscriber: + + def __init__(self, pulsar_host, topic, subscription, consumer_name, + schema=None, max_size=10): + self.pulsar_host = pulsar_host + self.topic = topic + self.subscription = subscription + self.consumer_name = consumer_name + self.schema = schema + self.q = {} + + async def run(self): + async with aiopulsar.connect(pulsar_host) as client: + async with client.subscribe( + topic=self.topic, + subscription_name=self.subscription, + 
consumer_name=self.consumer_name, + schema=self.schema, + ) as consumer: + while True: + msg = await consumer.receive() +# print("message in", self.topic) + id = msg.properties()["id"] + value = msg.value() + if id in self.q: + await self.q[id].put(value) + + async def subscribe(self, id): + q = asyncio.Queue() + self.q[id] = q + return q + + async def unsubscribe(self, id): + if id in self.q: + del self.q[id] + +class Api: + + def __init__(self, **config): + + self.port = int(config.get("port", "8088")) + self.app = web.Application(middlewares=[]) + + self.llm_out = Publisher( + pulsar_host, text_completion_request_queue, + schema=JsonSchema(TextCompletionRequest) + ) + + self.llm_in = Subscriber( + pulsar_host, text_completion_response_queue, + "api-gateway", "api-gateway", + JsonSchema(TextCompletionResponse) + ) + + self.prompt_out = Publisher( + pulsar_host, prompt_request_queue, + schema=JsonSchema(PromptRequest) + ) + + self.prompt_in = Subscriber( + pulsar_host, prompt_response_queue, + "api-gateway", "api-gateway", + JsonSchema(PromptResponse) + ) + + self.graph_rag_out = Publisher( + pulsar_host, graph_rag_request_queue, + schema=JsonSchema(GraphRagQuery) + ) + + self.graph_rag_in = Subscriber( + pulsar_host, graph_rag_response_queue, + "api-gateway", "api-gateway", + JsonSchema(GraphRagResponse) + ) + + self.triples_query_out = Publisher( + pulsar_host, triples_request_queue, + schema=JsonSchema(TriplesQueryRequest) + ) + + self.triples_query_in = Subscriber( + pulsar_host, triples_response_queue, + "api-gateway", "api-gateway", + JsonSchema(TriplesQueryResponse) + ) + + self.agent_out = Publisher( + pulsar_host, agent_request_queue, + schema=JsonSchema(AgentRequest) + ) + + self.agent_in = Subscriber( + pulsar_host, agent_response_queue, + "api-gateway", "api-gateway", + JsonSchema(AgentResponse) + ) + + self.embeddings_out = Publisher( + pulsar_host, embeddings_request_queue, + schema=JsonSchema(EmbeddingsRequest) + ) + + self.embeddings_in = 
Subscriber( + pulsar_host, embeddings_response_queue, + "api-gateway", "api-gateway", + JsonSchema(EmbeddingsResponse) + ) + + self.app.add_routes([ + web.post("/api/v1/text-completion", self.llm), + web.post("/api/v1/prompt", self.prompt), + web.post("/api/v1/graph-rag", self.graph_rag), + web.post("/api/v1/triples-query", self.triples_query), + web.post("/api/v1/agent", self.agent), + web.post("/api/v1/embeddings", self.embeddings), + ]) + + async def llm(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.llm_in.subscribe(id) + + await self.llm_out.send( + id, + TextCompletionRequest( + system=data["system"], + prompt=data["prompt"] + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), TIME_OUT) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + { "response": resp.response } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.llm_in.unsubscribe(id) + + async def prompt(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.prompt_in.subscribe(id) + + terms = { + k: json.dumps(v) + for k, v in data["variables"].items() + } + + await self.prompt_out.send( + id, + PromptRequest( + id=data["id"], + terms=terms + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), TIME_OUT) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + if resp.object: + return web.json_response( + { "object": resp.object } + ) + + return web.json_response( + { "text": resp.text } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.prompt_in.unsubscribe(id) + + async def 
graph_rag(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.graph_rag_in.subscribe(id) + + await self.graph_rag_out.send( + id, + GraphRagQuery( + query=data["query"], + user=data.get("user", "trustgraph"), + collection=data.get("collection", "default"), + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), TIME_OUT) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + { "response": resp.response } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.graph_rag_in.unsubscribe(id) + + async def triples_query(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.triples_query_in.subscribe(id) + + if "s" in data: + if data["s"].startswith("http:") or data["s"].startswith("https:"): + s = Value(value=data["s"], is_uri=True) + else: + s = Value(value=data["s"], is_uri=True) + else: + s = None + + if "p" in data: + if data["p"].startswith("http:") or data["p"].startswith("https:"): + p = Value(value=data["p"], is_uri=True) + else: + p = Value(value=data["p"], is_uri=True) + else: + p = None + + if "o" in data: + if data["o"].startswith("http:") or data["o"].startswith("https:"): + o = Value(value=data["o"], is_uri=True) + else: + o = Value(value=data["o"], is_uri=True) + else: + o = None + + limit = int(data.get("limit", 10000)) + + await self.triples_query_out.send( + id, + TriplesQueryRequest( + s = s, p = p, o = o, + limit = limit, + user = data.get("user", "trustgraph"), + collection = data.get("collection", "default"), + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), TIME_OUT) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return 
web.json_response( + { + "response": [ + { + "s": { + "v": t.s.value, + "e": t.s.is_uri, + }, + "p": { + "v": t.p.value, + "e": t.p.is_uri, + }, + "o": { + "v": t.o.value, + "e": t.o.is_uri, + } + } + for t in resp.triples + ] + } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.graph_rag_in.unsubscribe(id) + + async def agent(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.agent_in.subscribe(id) + + await self.agent_out.send( + id, + AgentRequest( + question=data["question"], + ) + ) + + while True: + try: + resp = await asyncio.wait_for(q.get(), TIME_OUT) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + if resp.answer: break + + if resp.thought: print("thought:", resp.thought) + if resp.observation: print("observation:", resp.observation) + + if resp.answer: + return web.json_response( + { "answer": resp.answer } + ) + + # Can't happen, ook at the logic + raise RuntimeError("Strange state") + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.agent_in.unsubscribe(id) + + async def embeddings(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.embeddings_in.subscribe(id) + + await self.embeddings_out.send( + id, + EmbeddingsRequest( + text=data["text"], + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), TIME_OUT) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + { "vectors": resp.vectors } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await 
self.embeddings_in.unsubscribe(id) + + async def app_factory(self): + + self.llm_pub_task = asyncio.create_task(self.llm_in.run()) + self.llm_sub_task = asyncio.create_task(self.llm_out.run()) + + self.prompt_pub_task = asyncio.create_task(self.prompt_in.run()) + self.prompt_sub_task = asyncio.create_task(self.prompt_out.run()) + + self.graph_rag_pub_task = asyncio.create_task(self.graph_rag_in.run()) + self.graph_rag_sub_task = asyncio.create_task(self.graph_rag_out.run()) + + self.triples_query_pub_task = asyncio.create_task( + self.triples_query_in.run() + ) + self.triples_query_sub_task = asyncio.create_task( + self.triples_query_out.run() + ) + + self.agent_pub_task = asyncio.create_task(self.agent_in.run()) + self.agent_sub_task = asyncio.create_task(self.agent_out.run()) + + self.embeddings_pub_task = asyncio.create_task( + self.embeddings_in.run() + ) + self.embeddings_sub_task = asyncio.create_task( + self.embeddings_out.run() + ) + + return self.app + + def run(self): + web.run_app(self.app_factory(), port=self.port) + +a = Api() +a.run() + diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 8aeb7ce2..4f7b3383 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -58,6 +58,7 @@ setuptools.setup( "google-generativeai", "ibis", "jsonschema", + "aiohttp", ], scripts=[ "scripts/agent-manager-react", From ba6d6c13afce518e9fbeba32ccbecdb89412fffd Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Wed, 20 Nov 2024 20:56:23 +0000 Subject: [PATCH 03/37] Fix API gateway integration, added to templates (#165) --- templates/components/trustgraph.jsonnet | 42 +++++++++++++++++ trustgraph-flow/scripts/api-gateway | 61 +++++++++++++++---------- trustgraph-flow/setup.py | 2 + 3 files changed, 81 insertions(+), 24 deletions(-) diff --git a/templates/components/trustgraph.jsonnet b/templates/components/trustgraph.jsonnet index e178cc27..37c05dae 100644 --- a/templates/components/trustgraph.jsonnet +++ 
b/templates/components/trustgraph.jsonnet @@ -5,9 +5,51 @@ local prompt = import "prompt-template.jsonnet"; { + "api-gateway-port":: 8088, + "api-gateway-timeout":: 600, + "chunk-size":: 250, "chunk-overlap":: 15, + "api-gateway" +: { + + create:: function(engine) + + local port = $["api-gateway-port"]; + + local container = + engine.container("api-gateway") + .with_image(images.trustgraph) + .with_command([ + "api-gateway", + "-p", + url.pulsar, + "--timeout", + std.toString($["api-gateway-timeout"]), + "--port", + std.toString(port), + ]) + .with_limits("0.5", "256M") + .with_reservations("0.1", "256M") + .with_port(8000, 8000, "metrics") + .with_port(port, port, "api"); + + local containerSet = engine.containers( + "api-gateway", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8000, 8000, "metrics") + .with_port(port, port, "api"); + + engine.resources([ + containerSet, + service, + ]) + + }, + "chunker" +: { create:: function(engine) diff --git a/trustgraph-flow/scripts/api-gateway b/trustgraph-flow/scripts/api-gateway index 748b5c7d..dd7d54ac 100755 --- a/trustgraph-flow/scripts/api-gateway +++ b/trustgraph-flow/scripts/api-gateway @@ -1,10 +1,17 @@ #!/usr/bin/env python3 +# FIXME: Subscribes to Pulsar unnecessarily, should only do it when there +# are active listeners + +# FIXME: Connection errors in publishers / subscribers cause those threads +# to fail and are not failed or retried + import asyncio from aiohttp import web import json import logging import uuid +import os import pulsar from pulsar.asyncio import Client @@ -42,7 +49,7 @@ from trustgraph.schema import embeddings_response_queue logger = logging.getLogger("api") logger.setLevel(logging.INFO) -pulsar_host = "pulsar://localhost:6650" +pulsar_host = os.getenv("PULSAR_HOST", "pulsar://pulsar:6650") TIME_OUT = 600 class Publisher: @@ -54,15 +61,18 @@ class Publisher: self.q = asyncio.Queue(maxsize=max_size) async def run(self): - async with 
aiopulsar.connect(self.pulsar_host) as client: - async with client.create_producer( - topic=self.topic, - schema=self.schema, - ) as producer: - while True: - id, item = await self.q.get() - await producer.send(item, { "id": id }) -# print("message out") + try: + async with aiopulsar.connect(self.pulsar_host) as client: + async with client.create_producer( + topic=self.topic, + schema=self.schema, + ) as producer: + while True: + id, item = await self.q.get() + await producer.send(item, { "id": id }) + # print("message out") + except Exception as e: + print("Exception:", e, flush=True) async def send(self, id, msg): await self.q.put((id, msg)) @@ -79,20 +89,23 @@ class Subscriber: self.q = {} async def run(self): - async with aiopulsar.connect(pulsar_host) as client: - async with client.subscribe( - topic=self.topic, - subscription_name=self.subscription, - consumer_name=self.consumer_name, - schema=self.schema, - ) as consumer: - while True: - msg = await consumer.receive() -# print("message in", self.topic) - id = msg.properties()["id"] - value = msg.value() - if id in self.q: - await self.q[id].put(value) + try: + async with aiopulsar.connect(pulsar_host) as client: + async with client.subscribe( + topic=self.topic, + subscription_name=self.subscription, + consumer_name=self.consumer_name, + schema=self.schema, + ) as consumer: + while True: + msg = await consumer.receive() + # print("message in", self.topic) + id = msg.properties()["id"] + value = msg.value() + if id in self.q: + await self.q[id].put(value) + except Exception as e: + print("Exception:", e, flush=True) async def subscribe(self, id): q = asyncio.Queue() diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 4f7b3383..44901119 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -59,8 +59,10 @@ setuptools.setup( "ibis", "jsonschema", "aiohttp", + "aiopulsar-py", ], scripts=[ + "scripts/api-gateway", "scripts/agent-manager-react", "scripts/chunker-recursive", 
"scripts/chunker-token", From a1e0edd96f70bbd01cea6a33e0ea671565413b16 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Wed, 20 Nov 2024 23:17:55 +0000 Subject: [PATCH 04/37] API gateway in a proper module, restarting publishers & subscribers as appropriate (#166) --- trustgraph-flow/scripts/api-gateway | 551 +-------------- trustgraph-flow/trustgraph/api/__init__.py | 0 .../trustgraph/api/gateway/__init__.py | 3 + .../trustgraph/api/gateway/__main__.py | 7 + .../trustgraph/api/gateway/service.py | 631 ++++++++++++++++++ 5 files changed, 643 insertions(+), 549 deletions(-) create mode 100644 trustgraph-flow/trustgraph/api/__init__.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/__init__.py create mode 100755 trustgraph-flow/trustgraph/api/gateway/__main__.py create mode 100755 trustgraph-flow/trustgraph/api/gateway/service.py diff --git a/trustgraph-flow/scripts/api-gateway b/trustgraph-flow/scripts/api-gateway index dd7d54ac..d28a5b8a 100755 --- a/trustgraph-flow/scripts/api-gateway +++ b/trustgraph-flow/scripts/api-gateway @@ -1,553 +1,6 @@ #!/usr/bin/env python3 -# FIXME: Subscribes to Pulsar unnecessarily, should only do it when there -# are active listeners +from trustgraph.api.gateway import run -# FIXME: Connection errors in publishers / subscribers cause those threads -# to fail and are not failed or retried - -import asyncio -from aiohttp import web -import json -import logging -import uuid -import os - -import pulsar -from pulsar.asyncio import Client -from pulsar.schema import JsonSchema -import _pulsar -import aiopulsar - -from trustgraph.clients.llm_client import LlmClient -from trustgraph.clients.prompt_client import PromptClient - -from trustgraph.schema import TextCompletionRequest, TextCompletionResponse -from trustgraph.schema import text_completion_request_queue -from trustgraph.schema import text_completion_response_queue - -from trustgraph.schema import PromptRequest, PromptResponse -from trustgraph.schema import 
prompt_request_queue -from trustgraph.schema import prompt_response_queue - -from trustgraph.schema import GraphRagQuery, GraphRagResponse -from trustgraph.schema import graph_rag_request_queue -from trustgraph.schema import graph_rag_response_queue - -from trustgraph.schema import TriplesQueryRequest, TriplesQueryResponse, Value -from trustgraph.schema import triples_request_queue -from trustgraph.schema import triples_response_queue - -from trustgraph.schema import AgentRequest, AgentResponse -from trustgraph.schema import agent_request_queue -from trustgraph.schema import agent_response_queue - -from trustgraph.schema import EmbeddingsRequest, EmbeddingsResponse -from trustgraph.schema import embeddings_request_queue -from trustgraph.schema import embeddings_response_queue - -logger = logging.getLogger("api") -logger.setLevel(logging.INFO) - -pulsar_host = os.getenv("PULSAR_HOST", "pulsar://pulsar:6650") -TIME_OUT = 600 - -class Publisher: - - def __init__(self, pulsar_host, topic, schema=None, max_size=10): - self.pulsar_host = pulsar_host - self.topic = topic - self.schema = schema - self.q = asyncio.Queue(maxsize=max_size) - - async def run(self): - try: - async with aiopulsar.connect(self.pulsar_host) as client: - async with client.create_producer( - topic=self.topic, - schema=self.schema, - ) as producer: - while True: - id, item = await self.q.get() - await producer.send(item, { "id": id }) - # print("message out") - except Exception as e: - print("Exception:", e, flush=True) - - async def send(self, id, msg): - await self.q.put((id, msg)) - -class Subscriber: - - def __init__(self, pulsar_host, topic, subscription, consumer_name, - schema=None, max_size=10): - self.pulsar_host = pulsar_host - self.topic = topic - self.subscription = subscription - self.consumer_name = consumer_name - self.schema = schema - self.q = {} - - async def run(self): - try: - async with aiopulsar.connect(pulsar_host) as client: - async with client.subscribe( - topic=self.topic, - 
subscription_name=self.subscription, - consumer_name=self.consumer_name, - schema=self.schema, - ) as consumer: - while True: - msg = await consumer.receive() - # print("message in", self.topic) - id = msg.properties()["id"] - value = msg.value() - if id in self.q: - await self.q[id].put(value) - except Exception as e: - print("Exception:", e, flush=True) - - async def subscribe(self, id): - q = asyncio.Queue() - self.q[id] = q - return q - - async def unsubscribe(self, id): - if id in self.q: - del self.q[id] - -class Api: - - def __init__(self, **config): - - self.port = int(config.get("port", "8088")) - self.app = web.Application(middlewares=[]) - - self.llm_out = Publisher( - pulsar_host, text_completion_request_queue, - schema=JsonSchema(TextCompletionRequest) - ) - - self.llm_in = Subscriber( - pulsar_host, text_completion_response_queue, - "api-gateway", "api-gateway", - JsonSchema(TextCompletionResponse) - ) - - self.prompt_out = Publisher( - pulsar_host, prompt_request_queue, - schema=JsonSchema(PromptRequest) - ) - - self.prompt_in = Subscriber( - pulsar_host, prompt_response_queue, - "api-gateway", "api-gateway", - JsonSchema(PromptResponse) - ) - - self.graph_rag_out = Publisher( - pulsar_host, graph_rag_request_queue, - schema=JsonSchema(GraphRagQuery) - ) - - self.graph_rag_in = Subscriber( - pulsar_host, graph_rag_response_queue, - "api-gateway", "api-gateway", - JsonSchema(GraphRagResponse) - ) - - self.triples_query_out = Publisher( - pulsar_host, triples_request_queue, - schema=JsonSchema(TriplesQueryRequest) - ) - - self.triples_query_in = Subscriber( - pulsar_host, triples_response_queue, - "api-gateway", "api-gateway", - JsonSchema(TriplesQueryResponse) - ) - - self.agent_out = Publisher( - pulsar_host, agent_request_queue, - schema=JsonSchema(AgentRequest) - ) - - self.agent_in = Subscriber( - pulsar_host, agent_response_queue, - "api-gateway", "api-gateway", - JsonSchema(AgentResponse) - ) - - self.embeddings_out = Publisher( - pulsar_host, 
embeddings_request_queue, - schema=JsonSchema(EmbeddingsRequest) - ) - - self.embeddings_in = Subscriber( - pulsar_host, embeddings_response_queue, - "api-gateway", "api-gateway", - JsonSchema(EmbeddingsResponse) - ) - - self.app.add_routes([ - web.post("/api/v1/text-completion", self.llm), - web.post("/api/v1/prompt", self.prompt), - web.post("/api/v1/graph-rag", self.graph_rag), - web.post("/api/v1/triples-query", self.triples_query), - web.post("/api/v1/agent", self.agent), - web.post("/api/v1/embeddings", self.embeddings), - ]) - - async def llm(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.llm_in.subscribe(id) - - await self.llm_out.send( - id, - TextCompletionRequest( - system=data["system"], - prompt=data["prompt"] - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), TIME_OUT) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - { "response": resp.response } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.llm_in.unsubscribe(id) - - async def prompt(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.prompt_in.subscribe(id) - - terms = { - k: json.dumps(v) - for k, v in data["variables"].items() - } - - await self.prompt_out.send( - id, - PromptRequest( - id=data["id"], - terms=terms - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), TIME_OUT) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - if resp.object: - return web.json_response( - { "object": resp.object } - ) - - return web.json_response( - { "text": resp.text } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { 
"error": str(e) } - ) - - finally: - await self.prompt_in.unsubscribe(id) - - async def graph_rag(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.graph_rag_in.subscribe(id) - - await self.graph_rag_out.send( - id, - GraphRagQuery( - query=data["query"], - user=data.get("user", "trustgraph"), - collection=data.get("collection", "default"), - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), TIME_OUT) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - { "response": resp.response } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.graph_rag_in.unsubscribe(id) - - async def triples_query(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.triples_query_in.subscribe(id) - - if "s" in data: - if data["s"].startswith("http:") or data["s"].startswith("https:"): - s = Value(value=data["s"], is_uri=True) - else: - s = Value(value=data["s"], is_uri=True) - else: - s = None - - if "p" in data: - if data["p"].startswith("http:") or data["p"].startswith("https:"): - p = Value(value=data["p"], is_uri=True) - else: - p = Value(value=data["p"], is_uri=True) - else: - p = None - - if "o" in data: - if data["o"].startswith("http:") or data["o"].startswith("https:"): - o = Value(value=data["o"], is_uri=True) - else: - o = Value(value=data["o"], is_uri=True) - else: - o = None - - limit = int(data.get("limit", 10000)) - - await self.triples_query_out.send( - id, - TriplesQueryRequest( - s = s, p = p, o = o, - limit = limit, - user = data.get("user", "trustgraph"), - collection = data.get("collection", "default"), - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), TIME_OUT) - except: - raise RuntimeError("Timeout waiting for response") - - if 
resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - { - "response": [ - { - "s": { - "v": t.s.value, - "e": t.s.is_uri, - }, - "p": { - "v": t.p.value, - "e": t.p.is_uri, - }, - "o": { - "v": t.o.value, - "e": t.o.is_uri, - } - } - for t in resp.triples - ] - } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.graph_rag_in.unsubscribe(id) - - async def agent(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.agent_in.subscribe(id) - - await self.agent_out.send( - id, - AgentRequest( - question=data["question"], - ) - ) - - while True: - try: - resp = await asyncio.wait_for(q.get(), TIME_OUT) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - if resp.answer: break - - if resp.thought: print("thought:", resp.thought) - if resp.observation: print("observation:", resp.observation) - - if resp.answer: - return web.json_response( - { "answer": resp.answer } - ) - - # Can't happen, ook at the logic - raise RuntimeError("Strange state") - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.agent_in.unsubscribe(id) - - async def embeddings(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.embeddings_in.subscribe(id) - - await self.embeddings_out.send( - id, - EmbeddingsRequest( - text=data["text"], - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), TIME_OUT) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - { "vectors": resp.vectors } - ) - - except Exception as e: - logging.error(f"Exception: 
{e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.embeddings_in.unsubscribe(id) - - async def app_factory(self): - - self.llm_pub_task = asyncio.create_task(self.llm_in.run()) - self.llm_sub_task = asyncio.create_task(self.llm_out.run()) - - self.prompt_pub_task = asyncio.create_task(self.prompt_in.run()) - self.prompt_sub_task = asyncio.create_task(self.prompt_out.run()) - - self.graph_rag_pub_task = asyncio.create_task(self.graph_rag_in.run()) - self.graph_rag_sub_task = asyncio.create_task(self.graph_rag_out.run()) - - self.triples_query_pub_task = asyncio.create_task( - self.triples_query_in.run() - ) - self.triples_query_sub_task = asyncio.create_task( - self.triples_query_out.run() - ) - - self.agent_pub_task = asyncio.create_task(self.agent_in.run()) - self.agent_sub_task = asyncio.create_task(self.agent_out.run()) - - self.embeddings_pub_task = asyncio.create_task( - self.embeddings_in.run() - ) - self.embeddings_sub_task = asyncio.create_task( - self.embeddings_out.run() - ) - - return self.app - - def run(self): - web.run_app(self.app_factory(), port=self.port) - -a = Api() -a.run() +run() diff --git a/trustgraph-flow/trustgraph/api/__init__.py b/trustgraph-flow/trustgraph/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trustgraph-flow/trustgraph/api/gateway/__init__.py b/trustgraph-flow/trustgraph/api/gateway/__init__.py new file mode 100644 index 00000000..ba844705 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/__init__.py @@ -0,0 +1,3 @@ + +from . service import * + diff --git a/trustgraph-flow/trustgraph/api/gateway/__main__.py b/trustgraph-flow/trustgraph/api/gateway/__main__.py new file mode 100755 index 00000000..e9136855 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . 
service import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py new file mode 100755 index 00000000..b955af1e --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -0,0 +1,631 @@ + +""" +API gateway. Offers HTTP services which are translated to interaction on the +Pulsar bus. +""" + +module = ".".join(__name__.split(".")[1:-1]) + +# FIXME: Subscribes to Pulsar unnecessarily, should only do it when there +# are active listeners + +# FIXME: Connection errors in publishers / subscribers cause those threads +# to fail and are not failed or retried + +import asyncio +import argparse +from aiohttp import web +import json +import logging +import uuid +import os + +import pulsar +from pulsar.asyncio import Client +from pulsar.schema import JsonSchema +import _pulsar +import aiopulsar +from prometheus_client import start_http_server + +from ... log_level import LogLevel + +from trustgraph.clients.llm_client import LlmClient +from trustgraph.clients.prompt_client import PromptClient + +from ... schema import TextCompletionRequest, TextCompletionResponse +from ... schema import text_completion_request_queue +from ... schema import text_completion_response_queue + +from ... schema import PromptRequest, PromptResponse +from ... schema import prompt_request_queue +from ... schema import prompt_response_queue + +from ... schema import GraphRagQuery, GraphRagResponse +from ... schema import graph_rag_request_queue +from ... schema import graph_rag_response_queue + +from ... schema import TriplesQueryRequest, TriplesQueryResponse, Value +from ... schema import triples_request_queue +from ... schema import triples_response_queue + +from ... schema import AgentRequest, AgentResponse +from ... schema import agent_request_queue +from ... schema import agent_response_queue + +from ... schema import EmbeddingsRequest, EmbeddingsResponse +from ... 
schema import embeddings_request_queue +from ... schema import embeddings_response_queue + +logger = logging.getLogger("api") +logger.setLevel(logging.INFO) + +default_pulsar_host = os.getenv("PULSAR_HOST", "pulsar://pulsar:6650") +default_timeout = 600 +default_port = 8088 + +class Publisher: + + def __init__(self, pulsar_host, topic, schema=None, max_size=10): + self.pulsar_host = pulsar_host + self.topic = topic + self.schema = schema + self.q = asyncio.Queue(maxsize=max_size) + + async def run(self): + + while True: + + try: + async with aiopulsar.connect(self.pulsar_host) as client: + async with client.create_producer( + topic=self.topic, + schema=self.schema, + ) as producer: + while True: + id, item = await self.q.get() + await producer.send(item, { "id": id }) + except Exception as e: + print("Exception:", e, flush=True) + + # If handler drops out, sleep a retry + await asyncio.sleep(2) + + async def send(self, id, msg): + await self.q.put((id, msg)) + +class Subscriber: + + def __init__(self, pulsar_host, topic, subscription, consumer_name, + schema=None, max_size=10): + self.pulsar_host = pulsar_host + self.topic = topic + self.subscription = subscription + self.consumer_name = consumer_name + self.schema = schema + self.q = {} + + async def run(self): + while True: + try: + async with aiopulsar.connect(self.pulsar_host) as client: + async with client.subscribe( + topic=self.topic, + subscription_name=self.subscription, + consumer_name=self.consumer_name, + schema=self.schema, + ) as consumer: + while True: + msg = await consumer.receive() + id = msg.properties()["id"] + value = msg.value() + if id in self.q: + await self.q[id].put(value) + except Exception as e: + print("Exception:", e, flush=True) + + # If handler drops out, sleep a retry + await asyncio.sleep(2) + + async def subscribe(self, id): + q = asyncio.Queue() + self.q[id] = q + return q + + async def unsubscribe(self, id): + if id in self.q: + del self.q[id] + +class Api: + + def 
__init__(self, **config): + + self.app = web.Application(middlewares=[]) + + self.port = int(config.get("port", default_port)) + self.timeout = int(config.get("timeout", default_timeout)) + self.pulsar_host = config.get("pulsar_host", default_pulsar_host) + + self.llm_out = Publisher( + self.pulsar_host, text_completion_request_queue, + schema=JsonSchema(TextCompletionRequest) + ) + + self.llm_in = Subscriber( + self.pulsar_host, text_completion_response_queue, + "api-gateway", "api-gateway", + JsonSchema(TextCompletionResponse) + ) + + self.prompt_out = Publisher( + self.pulsar_host, prompt_request_queue, + schema=JsonSchema(PromptRequest) + ) + + self.prompt_in = Subscriber( + self.pulsar_host, prompt_response_queue, + "api-gateway", "api-gateway", + JsonSchema(PromptResponse) + ) + + self.graph_rag_out = Publisher( + self.pulsar_host, graph_rag_request_queue, + schema=JsonSchema(GraphRagQuery) + ) + + self.graph_rag_in = Subscriber( + self.pulsar_host, graph_rag_response_queue, + "api-gateway", "api-gateway", + JsonSchema(GraphRagResponse) + ) + + self.triples_query_out = Publisher( + self.pulsar_host, triples_request_queue, + schema=JsonSchema(TriplesQueryRequest) + ) + + self.triples_query_in = Subscriber( + self.pulsar_host, triples_response_queue, + "api-gateway", "api-gateway", + JsonSchema(TriplesQueryResponse) + ) + + self.agent_out = Publisher( + self.pulsar_host, agent_request_queue, + schema=JsonSchema(AgentRequest) + ) + + self.agent_in = Subscriber( + self.pulsar_host, agent_response_queue, + "api-gateway", "api-gateway", + JsonSchema(AgentResponse) + ) + + self.embeddings_out = Publisher( + self.pulsar_host, embeddings_request_queue, + schema=JsonSchema(EmbeddingsRequest) + ) + + self.embeddings_in = Subscriber( + self.pulsar_host, embeddings_response_queue, + "api-gateway", "api-gateway", + JsonSchema(EmbeddingsResponse) + ) + + self.app.add_routes([ + web.post("/api/v1/text-completion", self.llm), + web.post("/api/v1/prompt", self.prompt), + 
web.post("/api/v1/graph-rag", self.graph_rag), + web.post("/api/v1/triples-query", self.triples_query), + web.post("/api/v1/agent", self.agent), + web.post("/api/v1/embeddings", self.embeddings), + ]) + + async def llm(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.llm_in.subscribe(id) + + await self.llm_out.send( + id, + TextCompletionRequest( + system=data["system"], + prompt=data["prompt"] + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + { "response": resp.response } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.llm_in.unsubscribe(id) + + async def prompt(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.prompt_in.subscribe(id) + + terms = { + k: json.dumps(v) + for k, v in data["variables"].items() + } + + await self.prompt_out.send( + id, + PromptRequest( + id=data["id"], + terms=terms + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + if resp.object: + return web.json_response( + { "object": resp.object } + ) + + return web.json_response( + { "text": resp.text } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.prompt_in.unsubscribe(id) + + async def graph_rag(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.graph_rag_in.subscribe(id) + + await self.graph_rag_out.send( + id, + GraphRagQuery( + query=data["query"], + 
user=data.get("user", "trustgraph"), + collection=data.get("collection", "default"), + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + { "response": resp.response } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.graph_rag_in.unsubscribe(id) + + async def triples_query(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.triples_query_in.subscribe(id) + + if "s" in data: + if data["s"].startswith("http:") or data["s"].startswith("https:"): + s = Value(value=data["s"], is_uri=True) + else: + s = Value(value=data["s"], is_uri=True) + else: + s = None + + if "p" in data: + if data["p"].startswith("http:") or data["p"].startswith("https:"): + p = Value(value=data["p"], is_uri=True) + else: + p = Value(value=data["p"], is_uri=True) + else: + p = None + + if "o" in data: + if data["o"].startswith("http:") or data["o"].startswith("https:"): + o = Value(value=data["o"], is_uri=True) + else: + o = Value(value=data["o"], is_uri=True) + else: + o = None + + limit = int(data.get("limit", 10000)) + + await self.triples_query_out.send( + id, + TriplesQueryRequest( + s = s, p = p, o = o, + limit = limit, + user = data.get("user", "trustgraph"), + collection = data.get("collection", "default"), + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + { + "response": [ + { + "s": { + "v": t.s.value, + "e": t.s.is_uri, + }, + "p": { + "v": t.p.value, + "e": t.p.is_uri, + }, + "o": { + "v": t.o.value, + "e": t.o.is_uri, + } + } + for t in 
resp.triples + ] + } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.graph_rag_in.unsubscribe(id) + + async def agent(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.agent_in.subscribe(id) + + await self.agent_out.send( + id, + AgentRequest( + question=data["question"], + ) + ) + + while True: + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + if resp.answer: break + + if resp.thought: print("thought:", resp.thought) + if resp.observation: print("observation:", resp.observation) + + if resp.answer: + return web.json_response( + { "answer": resp.answer } + ) + + # Can't happen, ook at the logic + raise RuntimeError("Strange state") + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.agent_in.unsubscribe(id) + + async def embeddings(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.embeddings_in.subscribe(id) + + await self.embeddings_out.send( + id, + EmbeddingsRequest( + text=data["text"], + ) + ) + + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + { "vectors": resp.vectors } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.embeddings_in.unsubscribe(id) + + async def app_factory(self): + + self.llm_pub_task = asyncio.create_task(self.llm_in.run()) + self.llm_sub_task = asyncio.create_task(self.llm_out.run()) + + 
self.prompt_pub_task = asyncio.create_task(self.prompt_in.run()) + self.prompt_sub_task = asyncio.create_task(self.prompt_out.run()) + + self.graph_rag_pub_task = asyncio.create_task(self.graph_rag_in.run()) + self.graph_rag_sub_task = asyncio.create_task(self.graph_rag_out.run()) + + self.triples_query_pub_task = asyncio.create_task( + self.triples_query_in.run() + ) + self.triples_query_sub_task = asyncio.create_task( + self.triples_query_out.run() + ) + + self.agent_pub_task = asyncio.create_task(self.agent_in.run()) + self.agent_sub_task = asyncio.create_task(self.agent_out.run()) + + self.embeddings_pub_task = asyncio.create_task( + self.embeddings_in.run() + ) + self.embeddings_sub_task = asyncio.create_task( + self.embeddings_out.run() + ) + + return self.app + + def run(self): + web.run_app(self.app_factory(), port=self.port) + +def run(): + + + parser = argparse.ArgumentParser( + prog="api-gateway", + description=__doc__ + ) + + parser.add_argument( + '-p', '--pulsar-host', + default=default_pulsar_host, + help=f'Pulsar host (default: {default_pulsar_host})', + ) + + parser.add_argument( + '--port', + type=int, + default=default_port, + help=f'Port number to listen on (default: {default_port})', + ) + + parser.add_argument( + '--timeout', + type=int, + default=default_timeout, + help=f'API request timeout in seconds (default: {default_timeout})', + ) + + parser.add_argument( + '-l', '--log-level', + type=LogLevel, + default=LogLevel.INFO, + choices=list(LogLevel), + help=f'Output queue (default: info)' + ) + + parser.add_argument( + '--metrics', + action=argparse.BooleanOptionalAction, + default=True, + help=f'Metrics enabled (default: true)', + ) + + parser.add_argument( + '-P', '--metrics-port', + type=int, + default=8000, + help=f'Prometheus metrics port (default: 8000)', + ) + + args = parser.parse_args() + args = vars(args) + + if args["metrics"]: + start_http_server(args["metrics_port"]) + + a = Api(**args) + a.run() + From 
dc0f54f236f4bdacfb4515ebaa2c0da0a4dcf4fc Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Thu, 21 Nov 2024 14:53:53 +0000 Subject: [PATCH 05/37] API supports doc & text load (#167) --- trustgraph-cli/scripts/tg-load-text | 1 - .../trustgraph/api/gateway/service.py | 159 ++++++++++++++++-- 2 files changed, 143 insertions(+), 17 deletions(-) diff --git a/trustgraph-cli/scripts/tg-load-text b/trustgraph-cli/scripts/tg-load-text index 88dc8e17..e49ee7a9 100755 --- a/trustgraph-cli/scripts/tg-load-text +++ b/trustgraph-cli/scripts/tg-load-text @@ -6,7 +6,6 @@ Loads a text document into TrustGraph processing. import pulsar from pulsar.schema import JsonSchema -import base64 import hashlib import argparse import os diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index b955af1e..2ac22892 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -19,6 +19,7 @@ import json import logging import uuid import os +import base64 import pulsar from pulsar.asyncio import Client @@ -32,6 +33,8 @@ from ... log_level import LogLevel from trustgraph.clients.llm_client import LlmClient from trustgraph.clients.prompt_client import PromptClient +from ... schema import Value, Metadata, Document, TextDocument, Triple + from ... schema import TextCompletionRequest, TextCompletionResponse from ... schema import text_completion_request_queue from ... schema import text_completion_response_queue @@ -44,7 +47,7 @@ from ... schema import GraphRagQuery, GraphRagResponse from ... schema import graph_rag_request_queue from ... schema import graph_rag_response_queue -from ... schema import TriplesQueryRequest, TriplesQueryResponse, Value +from ... schema import TriplesQueryRequest, TriplesQueryResponse from ... schema import triples_request_queue from ... schema import triples_response_queue @@ -56,6 +59,8 @@ from ... 
schema import EmbeddingsRequest, EmbeddingsResponse from ... schema import embeddings_request_queue from ... schema import embeddings_response_queue +from ... schema import document_ingest_queue, text_ingest_queue + logger = logging.getLogger("api") logger.setLevel(logging.INFO) @@ -63,13 +68,31 @@ default_pulsar_host = os.getenv("PULSAR_HOST", "pulsar://pulsar:6650") default_timeout = 600 default_port = 8088 +def to_value(x): + if x.startswith("http:") or x.startswith("https:"): + return Value(value=x, is_uri=True) + else: + return Value(value=x, is_uri=True) + +def to_subgraph(x): + return [ + Triple( + s=to_value(t["s"]), + p=to_value(t["p"]), + o=to_value(t["o"]) + ) + for t in x + ] + class Publisher: - def __init__(self, pulsar_host, topic, schema=None, max_size=10): + def __init__(self, pulsar_host, topic, schema=None, max_size=10, + chunking_enabled=False): self.pulsar_host = pulsar_host self.topic = topic self.schema = schema self.q = asyncio.Queue(maxsize=max_size) + self.chunking_enabled = chunking_enabled async def run(self): @@ -80,10 +103,16 @@ class Publisher: async with client.create_producer( topic=self.topic, schema=self.schema, + chunking_enabled=self.chunking_enabled, ) as producer: while True: id, item = await self.q.get() - await producer.send(item, { "id": id }) + + if id: + await producer.send(item, { "id": id }) + else: + await producer.send(item) + except Exception as e: print("Exception:", e, flush=True) @@ -139,7 +168,10 @@ class Api: def __init__(self, **config): - self.app = web.Application(middlewares=[]) + self.app = web.Application( + middlewares=[], + client_max_size=256 * 1024 * 1024 + ) self.port = int(config.get("port", default_port)) self.timeout = int(config.get("timeout", default_timeout)) @@ -211,6 +243,18 @@ class Api: JsonSchema(EmbeddingsResponse) ) + self.document_out = Publisher( + self.pulsar_host, document_ingest_queue, + schema=JsonSchema(Document), + chunking_enabled=True, + ) + + self.text_out = Publisher( + 
self.pulsar_host, text_ingest_queue, + schema=JsonSchema(TextDocument), + chunking_enabled=True, + ) + self.app.add_routes([ web.post("/api/v1/text-completion", self.llm), web.post("/api/v1/prompt", self.prompt), @@ -218,6 +262,8 @@ class Api: web.post("/api/v1/triples-query", self.triples_query), web.post("/api/v1/agent", self.agent), web.post("/api/v1/embeddings", self.embeddings), + web.post("/api/v1/load/document", self.load_document), + web.post("/api/v1/load/text", self.load_text), ]) async def llm(self, request): @@ -368,26 +414,17 @@ class Api: q = await self.triples_query_in.subscribe(id) if "s" in data: - if data["s"].startswith("http:") or data["s"].startswith("https:"): - s = Value(value=data["s"], is_uri=True) - else: - s = Value(value=data["s"], is_uri=True) + s = to_value(data["s"]) else: s = None if "p" in data: - if data["p"].startswith("http:") or data["p"].startswith("https:"): - p = Value(value=data["p"], is_uri=True) - else: - p = Value(value=data["p"], is_uri=True) + p = to_value(data["p"]) else: p = None if "o" in data: - if data["o"].startswith("http:") or data["o"].startswith("https:"): - o = Value(value=data["o"], is_uri=True) - else: - o = Value(value=data["o"], is_uri=True) + o = to_value(data["o"]) else: o = None @@ -537,6 +574,92 @@ class Api: finally: await self.embeddings_in.unsubscribe(id) + async def load_document(self, request): + + try: + + data = await request.json() + + if "metadata" in data: + metadata = to_subgraph(data["metadata"]) + else: + metadata = [] + + # Doing a base64 decode/encode here to make sure the + # content is valid base64 + doc = base64.b64decode(data["data"]) + + resp = await self.document_out.send( + None, + Document( + metadata=Metadata( + id=data.get("id"), + metadata=metadata, + user=data.get("user", "trustgraph"), + collection=data.get("collection", "default"), + ), + data=base64.b64encode(doc).decode("utf-8") + ) + ) + + print("Document loaded.") + + return web.json_response( + { } + ) + + except 
Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + async def load_text(self, request): + + try: + + data = await request.json() + + if "metadata" in data: + metadata = to_subgraph(data["metadata"]) + else: + metadata = [] + + if "charset" in data: + charset = data["charset"] + else: + charset = "utf-8" + + # Text is base64 encoded + text = base64.b64decode(data["text"]).decode(charset) + + resp = await self.text_out.send( + None, + TextDocument( + metadata=Metadata( + id=data.get("id"), + metadata=metadata, + user=data.get("user", "trustgraph"), + collection=data.get("collection", "default"), + ), + text=text, + ) + ) + + print("Text document loaded.") + + return web.json_response( + { } + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + async def app_factory(self): self.llm_pub_task = asyncio.create_task(self.llm_in.run()) @@ -565,6 +688,10 @@ class Api: self.embeddings_out.run() ) + self.doc_ingest_pub_task = asyncio.create_task(self.document_out.run()) + + self.text_ingest_pub_task = asyncio.create_task(self.text_out.run()) + return self.app def run(self): From 7a64385a575400951f266caeba576ae4bc8da9c7 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Thu, 21 Nov 2024 18:02:49 +0000 Subject: [PATCH 06/37] Fix graph query in Cassandra (#168) --- trustgraph-flow/trustgraph/api/gateway/service.py | 2 +- trustgraph-flow/trustgraph/direct/cassandra.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index 2ac22892..148bc321 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -72,7 +72,7 @@ def to_value(x): if x.startswith("http:") or x.startswith("https:"): return Value(value=x, is_uri=True) else: - return Value(value=x, is_uri=True) + return 
Value(value=x, is_uri=False) def to_subgraph(x): return [ diff --git a/trustgraph-flow/trustgraph/direct/cassandra.py b/trustgraph-flow/trustgraph/direct/cassandra.py index 2b577df1..568411a9 100644 --- a/trustgraph-flow/trustgraph/direct/cassandra.py +++ b/trustgraph-flow/trustgraph/direct/cassandra.py @@ -97,7 +97,7 @@ class TrustGraph: def get_po(self, p, o, limit=10): return self.session.execute( - f"select s from {self.table} where p = %s and o = %s allow filtering limit {limit}", + f"select s from {self.table} where p = %s and o = %s limit {limit} allow filtering", (p, o) ) From ae1264f5c4c3c351b60cde1c20986fec8551e6c1 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 22 Nov 2024 15:55:32 +0000 Subject: [PATCH 07/37] Add Python support to calling the API (#169) --- trustgraph-base/trustgraph/api/__init__.py | 3 + trustgraph-base/trustgraph/api/api.py | 336 +++++++++++++++++++++ 2 files changed, 339 insertions(+) create mode 100644 trustgraph-base/trustgraph/api/__init__.py create mode 100644 trustgraph-base/trustgraph/api/api.py diff --git a/trustgraph-base/trustgraph/api/__init__.py b/trustgraph-base/trustgraph/api/__init__.py new file mode 100644 index 00000000..daa6a964 --- /dev/null +++ b/trustgraph-base/trustgraph/api/__init__.py @@ -0,0 +1,3 @@ + +from . 
api import * + diff --git a/trustgraph-base/trustgraph/api/api.py b/trustgraph-base/trustgraph/api/api.py new file mode 100644 index 00000000..818e42c3 --- /dev/null +++ b/trustgraph-base/trustgraph/api/api.py @@ -0,0 +1,336 @@ + +import requests +import json +import dataclasses +import base64 + +from trustgraph.knowledge import hash + +class ProtocolException(Exception): + pass + +class ApplicationException(Exception): + pass + +class Uri(str): + def is_uri(self): return True + def is_literal(self): return False + +class Literal(str): + def is_uri(self): return False + def is_literal(self): return True + +@dataclasses.dataclass +class Triple: + s : str + p : str + o : str + +class Api: + + def __init__(self, url="http://localhost:8088/"): + + self.url = url + + if not url.endswith("/"): + self.url += "/" + + self.url += "api/v1/" + + def check_error(self, response): + + if "error" in response: + + try: + msg = response["error"]["message"] + tp = response["error"]["message"] + except: + raise ApplicationException( + "Error, but the error object is broken" + ) + + raise ApplicationException(f"{tp}: {msg}") + + def text_completion(self, system, prompt): + + # The input consists of system and prompt strings + input = { + "system": system, + "prompt": prompt + } + + url = f"{self.url}text-completion" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(resp) + + try: + return object["response"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def agent(self, question): + + # The input consists of a question + input = { + "question": question + } + + url = f"{self.url}agent" + + # Invoke the API, input is passed as JSON + resp = 
requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(resp) + + try: + return object["answer"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def graph_rag(self, question): + + # The input consists of a question + input = { + "query": question + } + + url = f"{self.url}graph-rag" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(resp) + + try: + return object["response"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def embeddings(self, text): + + # The input consists of a text block + input = { + "text": text + } + + url = f"{self.url}embeddings" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(resp) + + try: + return object["vectors"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def prompt(self, id, variables): + + # The input consists of system and prompt strings + input = { + "id": id, + "variables": variables + } + + url = f"{self.url}prompt" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise 
ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException("Expected JSON response") + + self.check_error(resp) + + if "text" in object: + return object["text"] + + if "object" in object: + try: + return json.loads(object["object"]) + except Exception as e: + raise ProtocolException( + "Returned object not well-formed JSON" + ) + + raise ProtocolException("Response not formatted correctly") + + def triples_query(self, s=None, p=None, o=None, limit=10000): + + # The input consists of system and prompt strings + input = { + "limit": limit + } + + if s: input["s"] = s + if p: input["p"] = p + if o: input["o"] = o + + url = f"{self.url}triples-query" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException("Expected JSON response") + + self.check_error(resp) + + if "response" not in object: + raise ProtocolException("Response not formatted correctly") + + def to_value(x): + if x["e"]: return Uri(x["v"]) + return Literal(x["v"]) + + return [ + Triple( + s=to_value(t["s"]), + p=to_value(t["p"]), + o=to_value(t["o"]) + ) + for t in object["response"] + ] + + return object["response"] + + def load_document(self, document, id=None, metadata=None): + + if id is None: + + if metadata is not None: + + # Situation makes no sense. What can the metadata possibly + # mean if the caller doesn't know the document ID. 
+ # Metadata should relate to the document by ID + raise RuntimeError("Can't specify metadata without id") + + id = hash(document) + + triples = [] + + def emit(t): + triples.append(t) + + if metadata: + metadata.emit( + lambda t: triples.append({ + "s": t.s.value, + "p": t.p.value, + "o": t.o.value + }) + ) + + input = { + "id": id, + "metadata": triples, + "data": base64.b64encode(document).decode("utf-8"), + } + + url = f"{self.url}load/document" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + def load_text(self, text, id=None, metadata=None, charset="utf-8"): + + if id is None: + + if metadata is not None: + + # Situation makes no sense. What can the metadata possibly + # mean if the caller doesn't know the document ID. + # Metadata should relate to the document by ID + raise RuntimeError("Can't specify metadata without id") + + id = hash(text) + + triples = [] + + if metadata: + metadata.emit( + lambda t: triples.append({ + "s": t.s.value, + "p": t.p.value, + "o": t.o.value + }) + ) + + input = { + "id": id, + "metadata": triples, + "charset": charset, + "text": base64.b64encode(text).decode("utf-8"), + } + + url = f"{self.url}load/text" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + From 319f9ac04a0a272f9f710d1e9abb4f77166dbbd3 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 22 Nov 2024 23:48:21 +0000 Subject: [PATCH 08/37] Feature/pinecone integration (#170) * Added Pinecone for GE write & query * Add templates * Doc embedding support --- templates/components.jsonnet | 2 + templates/components/pinecone.jsonnet | 153 ++++++++++++++++ trustgraph-flow/scripts/ge-query-pinecone | 6 + trustgraph-flow/scripts/ge-write-pinecone | 
6 + trustgraph-flow/setup.py | 3 + .../query/doc_embeddings/pinecone/__init__.py | 3 + .../query/doc_embeddings/pinecone/__main__.py | 7 + .../query/doc_embeddings/pinecone/service.py | 142 +++++++++++++++ .../graph_embeddings/pinecone/__init__.py | 3 + .../graph_embeddings/pinecone/__main__.py | 7 + .../graph_embeddings/pinecone/service.py | 156 ++++++++++++++++ .../doc_embeddings/pinecone/__init__.py | 3 + .../doc_embeddings/pinecone/__main__.py | 7 + .../storage/doc_embeddings/pinecone/write.py | 167 ++++++++++++++++++ .../graph_embeddings/pinecone/__init__.py | 3 + .../graph_embeddings/pinecone/__main__.py | 7 + .../graph_embeddings/pinecone/write.py | 167 ++++++++++++++++++ 17 files changed, 842 insertions(+) create mode 100644 templates/components/pinecone.jsonnet create mode 100755 trustgraph-flow/scripts/ge-query-pinecone create mode 100755 trustgraph-flow/scripts/ge-write-pinecone create mode 100644 trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__init__.py create mode 100755 trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__main__.py create mode 100755 trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py create mode 100644 trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__init__.py create mode 100755 trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__main__.py create mode 100755 trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/service.py create mode 100644 trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__init__.py create mode 100644 trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__main__.py create mode 100644 trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/write.py create mode 100644 trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__init__.py create mode 100755 trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__main__.py create mode 100755 trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/write.py diff --git 
a/templates/components.jsonnet b/templates/components.jsonnet index ec7f862b..26368deb 100644 --- a/templates/components.jsonnet +++ b/templates/components.jsonnet @@ -25,6 +25,7 @@ "trustgraph-base": import "components/trustgraph.jsonnet", "vector-store-milvus": import "components/milvus.jsonnet", "vector-store-qdrant": import "components/qdrant.jsonnet", + "vector-store-pinecone": import "components/pinecone.jsonnet", "vertexai": import "components/vertexai.jsonnet", "null": {}, @@ -34,6 +35,7 @@ "cassandra": import "components/cassandra.jsonnet", "neo4j": import "components/neo4j.jsonnet", "qdrant": import "components/qdrant.jsonnet", + "pinecone": import "components/pinecone.jsonnet", "milvus": import "components/milvus.jsonnet", "trustgraph": import "components/trustgraph.jsonnet", diff --git a/templates/components/pinecone.jsonnet b/templates/components/pinecone.jsonnet new file mode 100644 index 00000000..3422952a --- /dev/null +++ b/templates/components/pinecone.jsonnet @@ -0,0 +1,153 @@ +local base = import "base/base.jsonnet"; +local images = import "values/images.jsonnet"; +local url = import "values/url.jsonnet"; +local cassandra_hosts = "cassandra"; + +{ + + "pinecone-cloud":: "aws", + "pinecone-region":: "us-east-1", + + "store-graph-embeddings" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("store-graph-embeddings") + .with_image(images.trustgraph) + .with_command([ + "ge-write-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "store-graph-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + }, + + "query-graph-embeddings" +: 
{ + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("query-graph-embeddings") + .with_image(images.trustgraph) + .with_command([ + "ge-query-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "query-graph-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + }, + + "store-doc-embeddings" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("store-doc-embeddings") + .with_image(images.trustgraph) + .with_command([ + "de-write-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "store-doc-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + }, + + "query-doc-embeddings" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("query-doc-embeddings") + .with_image(images.trustgraph) + .with_command([ + "de-query-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "query-doc-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + 
.with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + + } + +} + diff --git a/trustgraph-flow/scripts/ge-query-pinecone b/trustgraph-flow/scripts/ge-query-pinecone new file mode 100755 index 00000000..b75aec78 --- /dev/null +++ b/trustgraph-flow/scripts/ge-query-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.query.graph_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/scripts/ge-write-pinecone b/trustgraph-flow/scripts/ge-write-pinecone new file mode 100755 index 00000000..802a8377 --- /dev/null +++ b/trustgraph-flow/scripts/ge-write-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.storage.graph_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 44901119..1650122f 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -60,6 +60,7 @@ setuptools.setup( "jsonschema", "aiohttp", "aiopulsar-py", + "pinecone[grpc]", ], scripts=[ "scripts/api-gateway", @@ -74,8 +75,10 @@ setuptools.setup( "scripts/embeddings-ollama", "scripts/embeddings-vectorize", "scripts/ge-query-milvus", + "scripts/ge-query-pinecone", "scripts/ge-query-qdrant", "scripts/ge-write-milvus", + "scripts/ge-write-pinecone", "scripts/ge-write-qdrant", "scripts/graph-rag", "scripts/kg-extract-definitions", diff --git a/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__init__.py b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__init__.py new file mode 100644 index 00000000..ba844705 --- /dev/null +++ b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__init__.py @@ -0,0 +1,3 @@ + +from . 
service import * + diff --git a/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__main__.py b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__main__.py new file mode 100755 index 00000000..89684e3e --- /dev/null +++ b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . hf import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py new file mode 100755 index 00000000..3fcbfb21 --- /dev/null +++ b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py @@ -0,0 +1,142 @@ + +""" +Document embeddings query service. Input is vector, output is an array +of chunks. Pinecone implementation. +""" + +from pinecone import Pinecone, ServerlessSpec +from pinecone.grpc import PineconeGRPC, GRPCClientConfig + +import uuid +import os + +from .... schema import DocumentEmbeddingsRequest, DocumentEmbeddingsResponse +from .... schema import Error, Value +from .... schema import document_embeddings_request_queue +from .... schema import document_embeddings_response_queue +from .... 
base import ConsumerProducer + +module = ".".join(__name__.split(".")[1:-1]) + +default_input_queue = document_embeddings_request_queue +default_output_queue = document_embeddings_response_queue +default_subscriber = module +default_api_key = os.getenv("PINECONE_API_KEY", "not-specified") + +class Processor(ConsumerProducer): + + def __init__(self, **params): + + input_queue = params.get("input_queue", default_input_queue) + output_queue = params.get("output_queue", default_output_queue) + subscriber = params.get("subscriber", default_subscriber) + + self.url = params.get("url", None) + self.api_key = params.get("api_key", default_api_key) + + if self.url: + + self.pinecone = PineconeGRPC( + api_key = self.api_key, + host = self.url + ) + + else: + + self.pinecone = Pinecone(api_key = self.api_key) + + super(Processor, self).__init__( + **params | { + "input_queue": input_queue, + "output_queue": output_queue, + "subscriber": subscriber, + "input_schema": DocumentEmbeddingsRequest, + "output_schema": DocumentEmbeddingsResponse, + "url": self.url, + } + ) + + def handle(self, msg): + + try: + + v = msg.value() + + # Sender-produced ID + id = msg.properties()["id"] + + print(f"Handling input {id}...", flush=True) + + chunks = [] + + for vec in v.vectors: + + dim = len(vec) + + index_name = ( + "d-" + v.user + "-" + str(dim) + ) + + index = self.pinecone.Index(index_name) + + results = index.query( + namespace=v.collection, + vector=vec, + top_k=v.limit, + include_values=False, + include_metadata=True + ) + + search_result = self.client.query_points( + collection_name=collection, + query=vec, + limit=v.limit, + with_payload=True, + ).points + + for r in results.matches: + doc = r.metadata["doc"] + chunks.add(doc) + + print("Send response...", flush=True) + r = DocumentEmbeddingsResponse(documents=chunks, error=None) + self.producer.send(r, properties={"id": id}) + + print("Done.", flush=True) + + except Exception as e: + + print(f"Exception: {e}") + + print("Send 
error response...", flush=True) + + r = DocumentEmbeddingsResponse( + error=Error( + type = "llm-error", + message = str(e), + ), + documents=None, + ) + + self.producer.send(r, properties={"id": id}) + + self.consumer.acknowledge(msg) + + @staticmethod + def add_args(parser): + + ConsumerProducer.add_args( + parser, default_input_queue, default_subscriber, + default_output_queue, + ) + + parser.add_argument( + '-t', '--store-uri', + default=default_store_uri, + help=f'Milvus store URI (default: {default_store_uri})' + ) + +def run(): + + Processor.start(module, __doc__) + diff --git a/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__init__.py b/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__init__.py new file mode 100644 index 00000000..ba844705 --- /dev/null +++ b/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__init__.py @@ -0,0 +1,3 @@ + +from . service import * + diff --git a/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__main__.py b/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__main__.py new file mode 100755 index 00000000..89684e3e --- /dev/null +++ b/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . hf import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/service.py b/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/service.py new file mode 100755 index 00000000..64ae4d32 --- /dev/null +++ b/trustgraph-flow/trustgraph/query/graph_embeddings/pinecone/service.py @@ -0,0 +1,156 @@ + +""" +Graph embeddings query service. Input is vector, output is list of +entities. Pinecone implementation. +""" + +from pinecone import Pinecone, ServerlessSpec +from pinecone.grpc import PineconeGRPC, GRPCClientConfig + +import uuid +import os + +from .... schema import GraphEmbeddingsRequest, GraphEmbeddingsResponse +from .... schema import Error, Value +from .... 
schema import graph_embeddings_request_queue +from .... schema import graph_embeddings_response_queue +from .... base import ConsumerProducer + +module = ".".join(__name__.split(".")[1:-1]) + +default_input_queue = graph_embeddings_request_queue +default_output_queue = graph_embeddings_response_queue +default_subscriber = module +default_api_key = os.getenv("PINECONE_API_KEY", "not-specified") + +class Processor(ConsumerProducer): + + def __init__(self, **params): + + input_queue = params.get("input_queue", default_input_queue) + output_queue = params.get("output_queue", default_output_queue) + subscriber = params.get("subscriber", default_subscriber) + + self.url = params.get("url", None) + self.api_key = params.get("api_key", default_api_key) + + if self.url: + + self.pinecone = PineconeGRPC( + api_key = self.api_key, + host = self.url + ) + + else: + + self.pinecone = Pinecone(api_key = self.api_key) + + super(Processor, self).__init__( + **params | { + "input_queue": input_queue, + "output_queue": output_queue, + "subscriber": subscriber, + "input_schema": GraphEmbeddingsRequest, + "output_schema": GraphEmbeddingsResponse, + "url": self.url, + } + ) + + def create_value(self, ent): + if ent.startswith("http://") or ent.startswith("https://"): + return Value(value=ent, is_uri=True) + else: + return Value(value=ent, is_uri=False) + + def handle(self, msg): + + try: + + v = msg.value() + + # Sender-produced ID + id = msg.properties()["id"] + + print(f"Handling input {id}...", flush=True) + + entities = set() + + for vec in v.vectors: + + dim = len(vec) + + index_name = ( + "t-" + v.user + "-" + str(dim) + ) + + index = self.pinecone.Index(index_name) + + results = index.query( + namespace=v.collection, + vector=vec, + top_k=v.limit, + include_values=False, + include_metadata=True + ) + + for r in results.matches: + ent = r.metadata["entity"] + entities.add(ent) + + # Convert set to list + entities = list(entities) + + ents2 = [] + + for ent in entities: + 
ents2.append(self.create_value(ent)) + + entities = ents2 + + print("Send response...", flush=True) + r = GraphEmbeddingsResponse(entities=entities, error=None) + self.producer.send(r, properties={"id": id}) + + print("Done.", flush=True) + + except Exception as e: + + print(f"Exception: {e}") + + print("Send error response...", flush=True) + + r = GraphEmbeddingsResponse( + error=Error( + type = "llm-error", + message = str(e), + ), + entities=None, + ) + + self.producer.send(r, properties={"id": id}) + + self.consumer.acknowledge(msg) + + @staticmethod + def add_args(parser): + + ConsumerProducer.add_args( + parser, default_input_queue, default_subscriber, + default_output_queue, + ) + + parser.add_argument( + '-a', '--api-key', + default=default_api_key, + help='Pinecone API key. (default from PINECONE_API_KEY)' + ) + + parser.add_argument( + '-u', '--url', + help='Pinecone URL. If unspecified, serverless is used' + ) + +def run(): + + Processor.start(module, __doc__) + diff --git a/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__init__.py b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__init__.py new file mode 100644 index 00000000..d891d55f --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__init__.py @@ -0,0 +1,3 @@ + +from . write import * + diff --git a/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__main__.py b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__main__.py new file mode 100644 index 00000000..c05d8c6d --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . 
write import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/write.py b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/write.py new file mode 100644 index 00000000..24cfcb78 --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/write.py @@ -0,0 +1,167 @@ + +""" +Accepts entity/vector pairs and writes them to a Qdrant store. +""" + +from qdrant_client import QdrantClient +from qdrant_client.models import PointStruct +from qdrant_client.models import Distance, VectorParams + +import time +import uuid +import os + +from .... schema import ChunkEmbeddings +from .... schema import chunk_embeddings_ingest_queue +from .... log_level import LogLevel +from .... base import Consumer + +module = ".".join(__name__.split(".")[1:-1]) + +default_input_queue = chunk_embeddings_ingest_queue +default_subscriber = module +default_api_key = os.getenv("PINECONE_API_KEY", "not-specified") +default_cloud = "aws" +default_region = "us-east-1" + +class Processor(Consumer): + + def __init__(self, **params): + + input_queue = params.get("input_queue", default_input_queue) + subscriber = params.get("subscriber", default_subscriber) + + self.url = params.get("url", None) + self.cloud = params.get("cloud", default_cloud) + self.region = params.get("region", default_region) + self.api_key = params.get("api_key", default_api_key) + + if self.api_key is None: + raise RuntimeError("Pinecone API key must be specified") + + if self.url: + + self.pinecone = PineconeGRPC( + api_key = self.api_key, + host = self.url + ) + + else: + + self.pinecone = Pinecone(api_key = self.api_key) + + super(Processor, self).__init__( + **params | { + "input_queue": input_queue, + "subscriber": subscriber, + "input_schema": ChunkEmbeddings, + "url": self.url, + } + ) + + self.last_index_name = None + + def handle(self, msg): + + v = msg.value() + + chunk = v.chunk.decode("utf-8") + + if chunk == "": return + + for 
vec in v.vectors: + + dim = len(vec) + collection = ( + "d-" + v.metadata.user + "-" + str(dim) + ) + + if index_name != self.last_index_name: + + if not self.pinecone.has_index(index_name): + + try: + + self.pinecone.create_index( + name = index_name, + dimension = dim, + metric = "cosine", + spec = ServerlessSpec( + cloud = self.cloud, + region = self.region, + ) + ) + + for i in range(0, 1000): + + if self.pinecone.describe_index( + index_name + ).status["ready"]: + break + + time.sleep(1) + + if not self.pinecone.describe_index( + index_name + ).status["ready"]: + raise RuntimeError( + "Gave up waiting for index creation" + ) + + except Exception as e: + print("Pinecone index creation failed") + raise e + + print(f"Index {index_name} created", flush=True) + + self.last_index_name = index_name + + index = self.pinecone.Index(index_name) + + records = [ + { + "id": id, + "values": vec, + "metadata": { "doc": chunk }, + } + ] + + index.upsert( + vectors = records, + namespace = v.metadata.collection, + ) + + @staticmethod + def add_args(parser): + + Consumer.add_args( + parser, default_input_queue, default_subscriber, + ) + + parser.add_argument( + '-a', '--api-key', + default=default_api_key, + help='Pinecone API key. (default from PINECONE_API_KEY)' + ) + + parser.add_argument( + '-u', '--url', + help='Pinecone URL. 
If unspecified, serverless is used' + ) + + parser.add_argument( + '--cloud', + default=default_cloud, + help=f'Pinecone cloud, (default: {default_cloud}' + ) + + parser.add_argument( + '--region', + default=default_region, + help=f'Pinecone region, (default: {default_region}' + ) + +def run(): + + Processor.start(module, __doc__) + diff --git a/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__init__.py b/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__init__.py new file mode 100644 index 00000000..d891d55f --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__init__.py @@ -0,0 +1,3 @@ + +from . write import * + diff --git a/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__main__.py b/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__main__.py new file mode 100755 index 00000000..c05d8c6d --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . write import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/write.py b/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/write.py new file mode 100755 index 00000000..b918c10b --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/write.py @@ -0,0 +1,167 @@ + +""" +Accepts entity/vector pairs and writes them to a Pinecone store. +""" + +from pinecone import Pinecone, ServerlessSpec +from pinecone.grpc import PineconeGRPC, GRPCClientConfig + +import time +import uuid +import os + +from .... schema import GraphEmbeddings +from .... schema import graph_embeddings_store_queue +from .... log_level import LogLevel +from .... 
base import Consumer + +module = ".".join(__name__.split(".")[1:-1]) + +default_input_queue = graph_embeddings_store_queue +default_subscriber = module +default_api_key = os.getenv("PINECONE_API_KEY", "not-specified") +default_cloud = "aws" +default_region = "us-east-1" + +class Processor(Consumer): + + def __init__(self, **params): + + input_queue = params.get("input_queue", default_input_queue) + subscriber = params.get("subscriber", default_subscriber) + + self.url = params.get("url", None) + self.cloud = params.get("cloud", default_cloud) + self.region = params.get("region", default_region) + self.api_key = params.get("api_key", default_api_key) + + if self.api_key is None: + raise RuntimeError("Pinecone API key must be specified") + + if self.url: + + self.pinecone = PineconeGRPC( + api_key = self.api_key, + host = self.url + ) + + else: + + self.pinecone = Pinecone(api_key = self.api_key) + + super(Processor, self).__init__( + **params | { + "input_queue": input_queue, + "subscriber": subscriber, + "input_schema": GraphEmbeddings, + "url": self.url, + } + ) + + self.last_index_name = None + + def handle(self, msg): + + v = msg.value() + + id = str(uuid.uuid4()) + + if v.entity.value == "" or v.entity.value is None: return + + for vec in v.vectors: + + dim = len(vec) + + index_name = ( + "t-" + v.metadata.user + "-" + str(dim) + ) + + if index_name != self.last_index_name: + + if not self.pinecone.has_index(index_name): + + try: + + self.pinecone.create_index( + name = index_name, + dimension = dim, + metric = "cosine", + spec = ServerlessSpec( + cloud = self.cloud, + region = self.region, + ) + ) + + for i in range(0, 1000): + + if self.pinecone.describe_index( + index_name + ).status["ready"]: + break + + time.sleep(1) + + if not self.pinecone.describe_index( + index_name + ).status["ready"]: + raise RuntimeError( + "Gave up waiting for index creation" + ) + + except Exception as e: + print("Pinecone index creation failed") + raise e + + print(f"Index 
{index_name} created", flush=True) + + self.last_index_name = index_name + + index = self.pinecone.Index(index_name) + + records = [ + { + "id": id, + "values": vec, + "metadata": { "entity": v.entity.value }, + } + ] + + index.upsert( + vectors = records, + namespace = v.metadata.collection, + ) + + @staticmethod + def add_args(parser): + + Consumer.add_args( + parser, default_input_queue, default_subscriber, + ) + + parser.add_argument( + '-a', '--api-key', + default=default_api_key, + help='Pinecone API key. (default from PINECONE_API_KEY)' + ) + + parser.add_argument( + '-u', '--url', + help='Pinecone URL. If unspecified, serverless is used' + ) + + parser.add_argument( + '--cloud', + default=default_cloud, + help=f'Pinecone cloud, (default: {default_cloud}' + ) + + parser.add_argument( + '--region', + default=default_region, + help=f'Pinecone region, (default: {default_region}' + ) + +def run(): + + Processor.start(module, __doc__) + From 340d7a224f51b259cbca9184131dc337b08ef59d Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Mon, 25 Nov 2024 20:46:35 +0000 Subject: [PATCH 09/37] Feature/rework kg core (#171) * Knowledge cores with msgpack * Put it in the cli package * Tidy up msgpack dumper * Created a loader --- test-api/test-embeddings-api | 3 + trustgraph-cli/scripts/tg-dump-msgpack | 34 ++ trustgraph-cli/scripts/tg-load-kg-core | 179 +++++++++++ trustgraph-cli/scripts/tg-save-kg-core | 190 +++++++++++ trustgraph-cli/setup.py | 4 + .../trustgraph/api/gateway/service.py | 294 +++++++++++++++++- 6 files changed, 700 insertions(+), 4 deletions(-) create mode 100755 trustgraph-cli/scripts/tg-dump-msgpack create mode 100755 trustgraph-cli/scripts/tg-load-kg-core create mode 100755 trustgraph-cli/scripts/tg-save-kg-core diff --git a/test-api/test-embeddings-api b/test-api/test-embeddings-api index ef9ea099..b1defd01 100755 --- a/test-api/test-embeddings-api +++ b/test-api/test-embeddings-api @@ -23,3 +23,6 @@ if "error" in resp: print(f"Error: 
{resp['error']}") sys.exit(1) +print(resp["vectors"]) + + diff --git a/trustgraph-cli/scripts/tg-dump-msgpack b/trustgraph-cli/scripts/tg-dump-msgpack new file mode 100755 index 00000000..9f91394f --- /dev/null +++ b/trustgraph-cli/scripts/tg-dump-msgpack @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import msgpack +import sys +import argparse + +def run(input_file): + + with open(input_file, 'rb') as f: + + unpacker = msgpack.Unpacker(f, raw=False) + + for unpacked in unpacker: + print(unpacked) + +def main(): + + parser = argparse.ArgumentParser( + prog='tg-load-pdf', + description=__doc__, + ) + + parser.add_argument( + '-i', '--input-file', + required=True, + help=f'Input file' + ) + + args = parser.parse_args() + + run(**vars(args)) + +main() + diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core new file mode 100755 index 00000000..2469772d --- /dev/null +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 + +import aiohttp +import asyncio +import msgpack +import json +import sys +import argparse +import os + +async def load_ge(queue, url): + + async with aiohttp.ClientSession() as session: + + async with session.ws_connect(f"{url}load/graph-embeddings") as ws: + + while True: + + msg = await queue.get() + + msg = { + "metadata": { + "id": msg["m"]["i"], + "metadata": msg["m"]["m"], + "user": msg["m"]["u"], + "collection": msg["m"]["c"], + }, + "vectors": msg["v"], + "entity": msg["e"], + } + + await ws.send_json(msg) + +async def load_triples(queue, url): + async with aiohttp.ClientSession() as session: + async with session.ws_connect(f"{url}load/triples") as ws: + + while True: + + msg = await queue.get() + + msg ={ + "metadata": { + "id": msg["m"]["i"], + "metadata": msg["m"]["m"], + "user": msg["m"]["u"], + "collection": msg["m"]["c"], + }, + "triples": msg["t"], + } + + await ws.send_json(msg) + +ge_counts = 0 +t_counts = 0 + +async def stats(): + + global t_counts + global ge_counts 
+ + while True: + await asyncio.sleep(5) + print( + f"Graph embeddings: {ge_counts:10d} Triples: {t_counts:10d}" + ) + +async def loader(ge_queue, t_queue, path, format, user, collection): + + global t_counts + global ge_counts + + if format == "json": + + raise RuntimeError("Not implemented") + + else: + + with open(path, "rb") as f: + + unpacker = msgpack.Unpacker(f, raw=False) + + for unpacked in unpacker: + + if user: + unpacked["metadata"]["user"] = user + + if collection: + unpacked["metadata"]["collection"] = collection + + + if unpacked[0] == "t": + await t_queue.put(unpacked[1]) + t_counts += 1 + else: + if unpacked[0] == "ge": + await ge_queue.put(unpacked[1]) + ge_counts += 1 + +async def run(**args): + + ge_q = asyncio.Queue() + t_q = asyncio.Queue() + + load_task = asyncio.create_task( + loader( + ge_queue=ge_q, t_queue=t_q, + path=args["input_file"], format=args["format"], + user=args["user"], collection=args["collection"], + ) + + ) + + ge_task = asyncio.create_task( + load_ge( + queue=ge_q, url=args["url"] + "api/v1/" + ) + ) + + triples_task = asyncio.create_task( + load_triples( + queue=t_q, url=args["url"] + "api/v1/" + ) + ) + + stats_task = asyncio.create_task(stats()) + + await load_task + await triples_task + await ge_task + await stats_task + +async def main(): + + parser = argparse.ArgumentParser( + prog='tg-load-pdf', + description=__doc__, + ) + + default_url = os.getenv("TRUSTGRAPH_API", "http://localhost:8088/") + default_user = "trustgraph" + collection = "default" + + parser.add_argument( + '-u', '--url', + default=default_url, + help=f'TrustGraph API URL (default: {default_url})', + ) + + parser.add_argument( + '-i', '--input-file', + # Make it mandatory, difficult to over-write an existing file + required=True, + help=f'Output file' + ) + + parser.add_argument( + '--format', + default="msgpack", + choices=["msgpack", "json"], + help=f'Output format (default: msgpack)', + ) + + parser.add_argument( + '--user', + help=f'User ID to 
load as (default: from input)' + ) + + parser.add_argument( + '--collection', + help=f'Collection ID to load as (default: from input)' + ) + + args = parser.parse_args() + + await run(**vars(args)) + +asyncio.run(main()) + diff --git a/trustgraph-cli/scripts/tg-save-kg-core b/trustgraph-cli/scripts/tg-save-kg-core new file mode 100755 index 00000000..feeea1ef --- /dev/null +++ b/trustgraph-cli/scripts/tg-save-kg-core @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 + +import aiohttp +import asyncio +import msgpack +import json +import sys +import argparse +import os + +async def fetch_ge(queue, user, collection, url): + async with aiohttp.ClientSession() as session: + async with session.ws_connect(f"{url}stream/graph-embeddings") as ws: + async for msg in ws: + if msg.type == aiohttp.WSMsgType.TEXT: + + data = msg.json() + + if user: + if data["metadata"]["user"] != user: + continue + + if collection: + if data["metadata"]["collection"] != collection: + continue + + await queue.put([ + "ge", + { + "m": { + "i": data["metadata"]["id"], + "m": data["metadata"]["metadata"], + "u": data["metadata"]["user"], + "c": data["metadata"]["collection"], + }, + "v": data["vectors"], + "e": data["entity"], + } + ]) + if msg.type == aiohttp.WSMsgType.ERROR: + print("Error") + break + +async def fetch_triples(queue, user, collection, url): + async with aiohttp.ClientSession() as session: + async with session.ws_connect(f"{url}stream/triples") as ws: + async for msg in ws: + if msg.type == aiohttp.WSMsgType.TEXT: + + data = msg.json() + + if user: + if data["metadata"]["user"] != user: + continue + + if collection: + if data["metadata"]["collection"] != collection: + continue + + await queue.put(( + "t", + { + "m": { + "i": data["metadata"]["id"], + "m": data["metadata"]["metadata"], + "u": data["metadata"]["user"], + "c": data["metadata"]["collection"], + }, + "t": data["triples"], + } + )) + if msg.type == aiohttp.WSMsgType.ERROR: + print("Error") + break + +ge_counts = 0 +t_counts = 0 
+ +async def stats(): + + global t_counts + global ge_counts + + while True: + await asyncio.sleep(5) + print( + f"Graph embeddings: {ge_counts:10d} Triples: {t_counts:10d}" + ) + +async def output(queue, path, format): + + global t_counts + global ge_counts + + with open(path, "wb") as f: + + while True: + + msg = await queue.get() + + if format == "msgpack": + f.write(msgpack.packb(msg, use_bin_type=True)) + else: + f.write(json.dumps(msg).encode("utf-8")) + + if msg[0] == "t": + t_counts += 1 + else: + if msg[0] == "ge": + ge_counts += 1 + +async def run(**args): + + q = asyncio.Queue() + + ge_task = asyncio.create_task( + fetch_ge( + queue=q, user=args["user"], collection=args["collection"], + url=args["url"] + "api/v1/" + ) + ) + + triples_task = asyncio.create_task( + fetch_triples( + queue=q, user=args["user"], collection=args["collection"], + url=args["url"] + "api/v1/" + ) + ) + + output_task = asyncio.create_task( + output( + queue=q, path=args["output_file"], format=args["format"], + ) + + ) + + stats_task = asyncio.create_task(stats()) + + await output_task + await triples_task + await ge_task + await stats_task + +async def main(): + + parser = argparse.ArgumentParser( + prog='tg-load-pdf', + description=__doc__, + ) + + default_url = os.getenv("TRUSTGRAPH_API", "http://localhost:8088/") + default_user = "trustgraph" + collection = "default" + + parser.add_argument( + '-u', '--url', + default=default_url, + help=f'TrustGraph API URL (default: {default_url})', + ) + + parser.add_argument( + '-o', '--output-file', + # Make it mandatory, difficult to over-write an existing file + required=True, + help=f'Output file' + ) + + parser.add_argument( + '--format', + default="msgpack", + choices=["msgpack", "json"], + help=f'Output format (default: msgpack)', + ) + + parser.add_argument( + '--user', + help=f'User ID to filter on (default: no filter)' + ) + + parser.add_argument( + '--collection', + help=f'Collection ID to filter on (default: no filter)' + ) + + 
args = parser.parse_args() + + await run(**vars(args)) + +asyncio.run(main()) + diff --git a/trustgraph-cli/setup.py b/trustgraph-cli/setup.py index ec541c8b..1608cfdb 100644 --- a/trustgraph-cli/setup.py +++ b/trustgraph-cli/setup.py @@ -39,6 +39,7 @@ setuptools.setup( "pulsar-client", "rdflib", "tabulate", + "msgpack", ], scripts=[ "scripts/tg-graph-show", @@ -54,5 +55,8 @@ setuptools.setup( "scripts/tg-invoke-agent", "scripts/tg-invoke-prompt", "scripts/tg-invoke-llm", + "scripts/tg-save-kg-core", + "scripts/tg-load-kg-core", + "scripts/tg-dump-msgpack", ] ) diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index 148bc321..6d5f70ce 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -14,7 +14,7 @@ module = ".".join(__name__.split(".")[1:-1]) import asyncio import argparse -from aiohttp import web +from aiohttp import web, WSMsgType import json import logging import uuid @@ -47,9 +47,13 @@ from ... schema import GraphRagQuery, GraphRagResponse from ... schema import graph_rag_request_queue from ... schema import graph_rag_response_queue -from ... schema import TriplesQueryRequest, TriplesQueryResponse +from ... schema import TriplesQueryRequest, TriplesQueryResponse, Triples from ... schema import triples_request_queue from ... schema import triples_response_queue +from ... schema import triples_store_queue + +from ... schema import GraphEmbeddings +from ... schema import graph_embeddings_store_queue from ... schema import AgentRequest, AgentResponse from ... 
schema import agent_request_queue @@ -84,6 +88,11 @@ def to_subgraph(x): for t in x ] +class Running: + def __init__(self): self.running = True + def get(self): return self.running + def stop(self): self.running = False + class Publisher: def __init__(self, pulsar_host, topic, schema=None, max_size=10, @@ -132,6 +141,7 @@ class Subscriber: self.consumer_name = consumer_name self.schema = schema self.q = {} + self.full = {} async def run(self): while True: @@ -145,10 +155,19 @@ class Subscriber: ) as consumer: while True: msg = await consumer.receive() - id = msg.properties()["id"] + + try: + id = msg.properties()["id"] + except: + id = None + value = msg.value() if id in self.q: await self.q[id].put(value) + + for q in self.full.values(): + await q.put(value) + except Exception as e: print("Exception:", e, flush=True) @@ -164,6 +183,59 @@ class Subscriber: if id in self.q: del self.q[id] + async def subscribe_all(self, id): + q = asyncio.Queue() + self.full[id] = q + return q + + async def unsubscribe_all(self, id): + if id in self.full: + del self.full[id] + +def serialize_triples(message): + return { + "metadata": { + "id": message.metadata.id, + "metadata": [ + { + "s": t.s.value, + "p": t.p.value, + "o": t.o.value, + } + for t in message.metadata.metadata + ], + "user": message.metadata.user, + "collection": message.metadata.collection, + }, + "triples": [ + { + "s": t.s.value, + "p": t.p.value, + "o": t.o.value, + } + for t in message.triples + ] + } + +def serialize_graph_embeddings(message): + return { + "metadata": { + "id": message.metadata.id, + "metadata": [ + { + "s": t.s.value, + "p": t.p.value, + "o": t.o.value, + } + for t in message.metadata.metadata + ], + "user": message.metadata.user, + "collection": message.metadata.collection, + }, + "vectors": message.vectors, + "entity": message.entity.value, + } + class Api: def __init__(self, **config): @@ -243,6 +315,28 @@ class Api: JsonSchema(EmbeddingsResponse) ) + self.triples_tap = Subscriber( + 
self.pulsar_host, triples_store_queue, + "api-gateway", "api-gateway", + schema=JsonSchema(Triples) + ) + + self.triples_pub = Publisher( + self.pulsar_host, triples_store_queue, + schema=JsonSchema(Triples) + ) + + self.graph_embeddings_tap = Subscriber( + self.pulsar_host, graph_embeddings_store_queue, + "api-gateway", "api-gateway", + schema=JsonSchema(GraphEmbeddings) + ) + + self.graph_embeddings_pub = Publisher( + self.pulsar_host, graph_embeddings_store_queue, + schema=JsonSchema(GraphEmbeddings) + ) + self.document_out = Publisher( self.pulsar_host, document_ingest_queue, schema=JsonSchema(Document), @@ -264,6 +358,20 @@ class Api: web.post("/api/v1/embeddings", self.embeddings), web.post("/api/v1/load/document", self.load_document), web.post("/api/v1/load/text", self.load_text), + web.get("/api/v1/ws", self.socket), + + web.get("/api/v1/stream/triples", self.stream_triples), + web.get( + "/api/v1/stream/graph-embeddings", + self.stream_graph_embeddings + ), + + web.get("/api/v1/load/triples", self.load_triples), + web.get( + "/api/v1/load/graph-embeddings", + self.load_graph_embeddings + ), + ]) async def llm(self, request): @@ -660,6 +768,169 @@ class Api: { "error": str(e) } ) + async def socket(self, request): + + ws = web.WebSocketResponse() + await ws.prepare(request) + + async for msg in ws: + if msg.type == WSMsgType.TEXT: + if msg.data == 'close': + await ws.close() + else: + await ws.send_str(msg.data + '/answer') + elif msg.type == WSMsgType.ERROR: + print('ws connection closed with exception %s' % + ws.exception()) + + print('websocket connection closed') + + return ws + + async def stream(self, q, ws, running, fn): + + while running.get(): + try: + resp = await asyncio.wait_for(q.get(), 0.5) + await ws.send_json(fn(resp)) + + except TimeoutError: + continue + + except Exception as e: + print(f"Exception: {str(e)}", flush=True) + + async def stream_triples(self, request): + + id = str(uuid.uuid4()) + + q = await 
self.triples_tap.subscribe_all(id) + running = Running() + + ws = web.WebSocketResponse() + await ws.prepare(request) + + tsk = asyncio.create_task(self.stream( + q, + ws, + running, + serialize_triples, + )) + + async for msg in ws: + if msg.type == WSMsgType.ERROR: + break + else: + # Ignore incoming messages + pass + + running.stop() + + await self.triples_tap.unsubscribe_all(id) + await tsk + + return ws + + async def stream_graph_embeddings(self, request): + + id = str(uuid.uuid4()) + + q = await self.graph_embeddings_tap.subscribe_all(id) + running = Running() + + ws = web.WebSocketResponse() + await ws.prepare(request) + + tsk = asyncio.create_task(self.stream( + q, + ws, + running, + serialize_graph_embeddings, + )) + + async for msg in ws: + if msg.type == WSMsgType.ERROR: + break + else: + # Ignore incoming messages + pass + + running.stop() + + await self.graph_embeddings_tap.unsubscribe_all(id) + await tsk + + return ws + + async def load_triples(self, request): + + ws = web.WebSocketResponse() + await ws.prepare(request) + + async for msg in ws: + + try: + + if msg.type == WSMsgType.TEXT: + + data = msg.json() + + elt = Triples( + metadata=Metadata( + id=data["metadata"]["id"], + metadata=to_subgraph(data["metadata"]["metadata"]), + user=data["metadata"]["user"], + collection=data["metadata"]["collection"], + ), + triples=to_subgraph(data["triples"]), + ) + + await self.triples_pub.send(None, elt) + + elif msg.type == WSMsgType.ERROR: + break + + except Exception as e: + + print("Exception:", e) + + return ws + + async def load_graph_embeddings(self, request): + + ws = web.WebSocketResponse() + await ws.prepare(request) + + async for msg in ws: + + try: + + if msg.type == WSMsgType.TEXT: + + data = msg.json() + + elt = GraphEmbeddings( + metadata=Metadata( + id=data["metadata"]["id"], + metadata=to_subgraph(data["metadata"]["metadata"]), + user=data["metadata"]["user"], + collection=data["metadata"]["collection"], + ), + 
entity=to_value(data["entity"]), + vectors=data["vectors"], + ) + + await self.graph_embeddings_pub.send(None, elt) + + elif msg.type == WSMsgType.ERROR: + break + + except Exception as e: + + print("Exception:", e) + + return ws + async def app_factory(self): self.llm_pub_task = asyncio.create_task(self.llm_in.run()) @@ -688,6 +959,22 @@ class Api: self.embeddings_out.run() ) + self.triples_tap_task = asyncio.create_task( + self.triples_tap.run() + ) + + self.triples_pub_task = asyncio.create_task( + self.triples_pub.run() + ) + + self.graph_embeddings_tap_task = asyncio.create_task( + self.graph_embeddings_tap.run() + ) + + self.graph_embeddings_pub_task = asyncio.create_task( + self.graph_embeddings_pub.run() + ) + self.doc_ingest_pub_task = asyncio.create_task(self.document_out.run()) self.text_ingest_pub_task = asyncio.create_task(self.text_out.run()) @@ -699,7 +986,6 @@ class Api: def run(): - parser = argparse.ArgumentParser( prog="api-gateway", description=__doc__ From 887fafcf8ca2c3a09df7c5092022406dbb0b4ec4 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Tue, 26 Nov 2024 16:46:38 +0000 Subject: [PATCH 10/37] Fix/core save api (#172) * Acknowledge messaages from Pulsar, doh! 
* Change API to deliver a boolean e if value is an entity * Change loaders to use new API * Changes, entity-aware API is complete --- trustgraph-base/trustgraph/api/api.py | 35 +++++---- .../trustgraph/knowledge/__init__.py | 1 + trustgraph-base/trustgraph/knowledge/defs.py | 8 +++ .../trustgraph/knowledge/document.py | 12 +++- .../trustgraph/knowledge/organization.py | 12 +++- .../trustgraph/knowledge/publication.py | 12 +++- trustgraph-cli/scripts/tg-load-pdf | 11 ++- trustgraph-cli/scripts/tg-load-text | 11 ++- .../trustgraph/api/gateway/service.py | 72 +++++++------------ 9 files changed, 104 insertions(+), 70 deletions(-) diff --git a/trustgraph-base/trustgraph/api/api.py b/trustgraph-base/trustgraph/api/api.py index 818e42c3..7942e081 100644 --- a/trustgraph-base/trustgraph/api/api.py +++ b/trustgraph-base/trustgraph/api/api.py @@ -4,7 +4,7 @@ import json import dataclasses import base64 -from trustgraph.knowledge import hash +from trustgraph.knowledge import hash, Uri, Literal class ProtocolException(Exception): pass @@ -12,14 +12,6 @@ class ProtocolException(Exception): class ApplicationException(Exception): pass -class Uri(str): - def is_uri(self): return True - def is_literal(self): return False - -class Literal(str): - def is_uri(self): return False - def is_literal(self): return True - @dataclasses.dataclass class Triple: s : str @@ -213,9 +205,16 @@ class Api: "limit": limit } - if s: input["s"] = s - if p: input["p"] = p - if o: input["o"] = o + if not isinstance(s, Uri): + raise RuntimeError("s must be Uri") + if not isinstance(p, Uri): + raise RuntimeError("p must be Uri") + if not isinstance(o, Uri) and not isinstance(o, Literal): + raise RuntimeError("o must be Uri or Literal") + + if s: input["s"] = { "v": str(s), "e": isinstance(s, Uri), } + if p: input["p"] = { "v": str(p), "e": isinstance(p, Uri), } + if o: input["o"] = { "v": str(o), "e": isinstance(o, Uri), } url = f"{self.url}triples-query" @@ -273,9 +272,9 @@ class Api: if metadata: 
metadata.emit( lambda t: triples.append({ - "s": t.s.value, - "p": t.p.value, - "o": t.o.value + "s": { "v": t["s"], "e": isinstance(t["s"], Uri) }, + "p": { "v": t["p"], "e": isinstance(t["p"], Uri) }, + "o": { "v": t["o"], "e": isinstance(t["o"], Uri) } }) ) @@ -312,9 +311,9 @@ class Api: if metadata: metadata.emit( lambda t: triples.append({ - "s": t.s.value, - "p": t.p.value, - "o": t.o.value + "s": { "v": t["s"], "e": isinstance(t["s"], Uri) }, + "p": { "v": t["p"], "e": isinstance(t["p"], Uri) }, + "o": { "v": t["o"], "e": isinstance(t["o"], Uri) } }) ) diff --git a/trustgraph-base/trustgraph/knowledge/__init__.py b/trustgraph-base/trustgraph/knowledge/__init__.py index 0ab6b5db..8349abf0 100644 --- a/trustgraph-base/trustgraph/knowledge/__init__.py +++ b/trustgraph-base/trustgraph/knowledge/__init__.py @@ -1,4 +1,5 @@ +from . defs import * from . identifier import * from . publication import * from . document import * diff --git a/trustgraph-base/trustgraph/knowledge/defs.py b/trustgraph-base/trustgraph/knowledge/defs.py index b95863c6..d6290930 100644 --- a/trustgraph-base/trustgraph/knowledge/defs.py +++ b/trustgraph-base/trustgraph/knowledge/defs.py @@ -23,3 +23,11 @@ URL = 'https://schema.org/url' IDENTIFIER = 'https://schema.org/identifier' KEYWORD = 'https://schema.org/keywords' +class Uri(str): + def is_uri(self): return True + def is_literal(self): return False + +class Literal(str): + def is_uri(self): return False + def is_literal(self): return True + diff --git a/trustgraph-base/trustgraph/knowledge/document.py b/trustgraph-base/trustgraph/knowledge/document.py index dc2f43e3..99d06c72 100644 --- a/trustgraph-base/trustgraph/knowledge/document.py +++ b/trustgraph-base/trustgraph/knowledge/document.py @@ -1,6 +1,16 @@ from . defs import * -from .. 
schema import Triple, Value + +def Value(value, is_uri): + if is_uri: + return Uri(value) + else: + return Literal(value) + +def Triple(s, p, o): + return { + "s": s, "p": p, "o": o, + } class DigitalDocument: diff --git a/trustgraph-base/trustgraph/knowledge/organization.py b/trustgraph-base/trustgraph/knowledge/organization.py index 1129dd6c..5653aa97 100644 --- a/trustgraph-base/trustgraph/knowledge/organization.py +++ b/trustgraph-base/trustgraph/knowledge/organization.py @@ -1,6 +1,16 @@ from . defs import * -from .. schema import Triple, Value + +def Value(value, is_uri): + if is_uri: + return Uri(value) + else: + return Literal(value) + +def Triple(s, p, o): + return { + "s": s, "p": p, "o": o, + } class Organization: def __init__(self, id, name=None, description=None): diff --git a/trustgraph-base/trustgraph/knowledge/publication.py b/trustgraph-base/trustgraph/knowledge/publication.py index 3c9d41c8..d197df93 100644 --- a/trustgraph-base/trustgraph/knowledge/publication.py +++ b/trustgraph-base/trustgraph/knowledge/publication.py @@ -1,6 +1,16 @@ from . defs import * -from .. 
schema import Triple, Value + +def Value(value, is_uri): + if is_uri: + return Uri(value) + else: + return Literal(value) + +def Triple(s, p, o): + return { + "s": s, "p": p, "o": o, + } class PublicationEvent: def __init__( diff --git a/trustgraph-cli/scripts/tg-load-pdf b/trustgraph-cli/scripts/tg-load-pdf index 18ac57cb..0dc8ced6 100755 --- a/trustgraph-cli/scripts/tg-load-pdf +++ b/trustgraph-cli/scripts/tg-load-pdf @@ -14,7 +14,7 @@ import time import uuid from trustgraph.schema import Document, document_ingest_queue -from trustgraph.schema import Metadata +from trustgraph.schema import Metadata, Triple, Value from trustgraph.log_level import LogLevel from trustgraph.knowledge import hash, to_uri from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG @@ -79,7 +79,14 @@ class Loader: r = Document( metadata=Metadata( id=id, - metadata=triples, + metadata=[ + Triple( + s=Value(value=t["s"]["v"], is_uri=t["s"]["e"]), + p=Value(value=t["p"]["v"], is_uri=t["p"]["e"]), + o=Value(value=t["o"]["v"], is_uri=t["o"]["e"]) + ) + for t in triples + ], user=self.user, collection=self.collection, ), diff --git a/trustgraph-cli/scripts/tg-load-text b/trustgraph-cli/scripts/tg-load-text index e49ee7a9..6ff8d09a 100755 --- a/trustgraph-cli/scripts/tg-load-text +++ b/trustgraph-cli/scripts/tg-load-text @@ -13,7 +13,7 @@ import time import uuid from trustgraph.schema import TextDocument, text_ingest_queue -from trustgraph.schema import Metadata +from trustgraph.schema import Metadata, Triple, Value from trustgraph.log_level import LogLevel from trustgraph.knowledge import hash, to_uri from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG @@ -78,7 +78,14 @@ class Loader: r = TextDocument( metadata=Metadata( id=id, - metadata=triples, + metadata=[ + Triple( + s=Value(value=t["s"]["v"], is_uri=t["s"]["e"]), + p=Value(value=t["p"]["v"], is_uri=t["p"]["e"]), + o=Value(value=t["o"]["v"], is_uri=t["o"]["e"]) + ) + for t in triples + ], user=self.user, 
collection=self.collection, ), diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index 6d5f70ce..0ae01d3a 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -73,10 +73,7 @@ default_timeout = 600 default_port = 8088 def to_value(x): - if x.startswith("http:") or x.startswith("https:"): - return Value(value=x, is_uri=True) - else: - return Value(value=x, is_uri=False) + return Value(value=x["v"], is_uri=x["e"]) def to_subgraph(x): return [ @@ -156,6 +153,9 @@ class Subscriber: while True: msg = await consumer.receive() + # Acknowledge successful reception of the message + await consumer.acknowledge(msg) + try: id = msg.properties()["id"] except: @@ -192,43 +192,41 @@ class Subscriber: if id in self.full: del self.full[id] +def serialize_value(v): + return { + "v": v.value, + "e": v.is_uri, + } + +def serialize_triple(t): + return { + "s": serialize_value(t.s), + "p": serialize_value(t.p), + "o": serialize_value(t.o) + } + +def serialize_subgraph(sg): + return [ + serialize_triple(t) + for t in sg + ] + def serialize_triples(message): return { "metadata": { "id": message.metadata.id, - "metadata": [ - { - "s": t.s.value, - "p": t.p.value, - "o": t.o.value, - } - for t in message.metadata.metadata - ], + "metadata": serialize_subgraph(message.metadata.metadata), "user": message.metadata.user, "collection": message.metadata.collection, }, - "triples": [ - { - "s": t.s.value, - "p": t.p.value, - "o": t.o.value, - } - for t in message.triples - ] + "triples": serialize_subgraph(message.triples), } def serialize_graph_embeddings(message): return { "metadata": { "id": message.metadata.id, - "metadata": [ - { - "s": t.s.value, - "p": t.p.value, - "o": t.o.value, - } - for t in message.metadata.metadata - ], + "metadata": serialize_subgraph(message.metadata.metadata), "user": message.metadata.user, "collection": message.metadata.collection, 
}, @@ -560,23 +558,7 @@ class Api: return web.json_response( { - "response": [ - { - "s": { - "v": t.s.value, - "e": t.s.is_uri, - }, - "p": { - "v": t.p.value, - "e": t.p.is_uri, - }, - "o": { - "v": t.o.value, - "e": t.o.is_uri, - } - } - for t in resp.triples - ] + "response": serialize_subgraph(resp.triples), } ) From 99e3e43f7ba621f8e28bab77d6b84d4a996053b5 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Tue, 26 Nov 2024 16:58:47 +0000 Subject: [PATCH 11/37] Fix/kg cli help (#173) * Fix kg-core-help --- trustgraph-cli/scripts/tg-dump-msgpack | 8 +++++++- trustgraph-cli/scripts/tg-load-kg-core | 7 ++++++- trustgraph-cli/scripts/tg-save-kg-core | 11 ++++++++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/trustgraph-cli/scripts/tg-dump-msgpack b/trustgraph-cli/scripts/tg-dump-msgpack index 9f91394f..dc4a8139 100755 --- a/trustgraph-cli/scripts/tg-dump-msgpack +++ b/trustgraph-cli/scripts/tg-dump-msgpack @@ -1,5 +1,11 @@ #!/usr/bin/env python3 +"" +This utility reads a knowledge core in msgpack format and outputs its +contents in JSON form to standard output. This is useful only as a +diagnostic utility. +""" + import msgpack import sys import argparse @@ -16,7 +22,7 @@ def run(input_file): def main(): parser = argparse.ArgumentParser( - prog='tg-load-pdf', + prog='tg-dump-msgpack', description=__doc__, ) diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core index 2469772d..13fac153 100755 --- a/trustgraph-cli/scripts/tg-load-kg-core +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +"""This utility takes a knowledge core and loads it into a running TrustGraph +through the API. The knowledge core should be in msgpack format, which is the +default format produce by tg-save-kg-core. 
+""" + import aiohttp import asyncio import msgpack @@ -133,7 +138,7 @@ async def run(**args): async def main(): parser = argparse.ArgumentParser( - prog='tg-load-pdf', + prog='tg-load-kg-core', description=__doc__, ) diff --git a/trustgraph-cli/scripts/tg-save-kg-core b/trustgraph-cli/scripts/tg-save-kg-core index feeea1ef..3c03383f 100755 --- a/trustgraph-cli/scripts/tg-save-kg-core +++ b/trustgraph-cli/scripts/tg-save-kg-core @@ -1,5 +1,14 @@ #!/usr/bin/env python3 +""" +This utility connects to a running TrustGraph through the API and creates +a knowledge core from the data streaming through the processing queues. +For completeness of data, tg-save-kg-core should be initiated before data +loading takes place. The default output format, msgpack should be used. +JSON output format is also available - msgpack produces a more compact +representation, which is also more performant to load. +""" + import aiohttp import asyncio import msgpack @@ -144,7 +153,7 @@ async def run(**args): async def main(): parser = argparse.ArgumentParser( - prog='tg-load-pdf', + prog='tg-save-kg-core', description=__doc__, ) From 2f7ccb2ef85fd70b08b25f5fdc4f9671db193c62 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Wed, 27 Nov 2024 17:24:27 +0000 Subject: [PATCH 12/37] - Reduce back-pressure on tg-load-kg-core (#179) - Save entity in correct format in tg-save-core --- trustgraph-cli/scripts/tg-load-kg-core | 6 ++++-- trustgraph-flow/trustgraph/api/gateway/service.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core index 13fac153..e2d0a405 100755 --- a/trustgraph-cli/scripts/tg-load-kg-core +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -104,8 +104,10 @@ async def loader(ge_queue, t_queue, path, format, user, collection): async def run(**args): - ge_q = asyncio.Queue() - t_q = asyncio.Queue() + # Maxsize on queues reduces back-pressure so tg-load-kg-core doesn't + # grow to eat all 
memory + ge_q = asyncio.Queue(maxsize=500) + t_q = asyncio.Queue(maxsize=500) load_task = asyncio.create_task( loader( diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index 0ae01d3a..7b12e1a2 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -231,7 +231,7 @@ def serialize_graph_embeddings(message): "collection": message.metadata.collection, }, "vectors": message.vectors, - "entity": message.entity.value, + "entity": message.entity, } class Api: From b2f7b3452926dd6156be9713032609ffb9c5593a Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Wed, 27 Nov 2024 17:36:15 +0000 Subject: [PATCH 13/37] Fix/pinecone integration (#180) * Add missing pinecone references * Add missing Pinecone executables --- trustgraph-flow/scripts/de-query-pinecone | 6 ++++++ trustgraph-flow/scripts/de-write-pinecone | 6 ++++++ trustgraph-flow/setup.py | 2 ++ 3 files changed, 14 insertions(+) create mode 100755 trustgraph-flow/scripts/de-query-pinecone create mode 100755 trustgraph-flow/scripts/de-write-pinecone diff --git a/trustgraph-flow/scripts/de-query-pinecone b/trustgraph-flow/scripts/de-query-pinecone new file mode 100755 index 00000000..b21d9045 --- /dev/null +++ b/trustgraph-flow/scripts/de-query-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.query.doc_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/scripts/de-write-pinecone b/trustgraph-flow/scripts/de-write-pinecone new file mode 100755 index 00000000..eb604747 --- /dev/null +++ b/trustgraph-flow/scripts/de-write-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.storage.doc_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 1650122f..2cbbdee4 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -69,8 +69,10 @@ setuptools.setup( "scripts/chunker-token", 
"scripts/de-query-milvus", "scripts/de-query-qdrant", + "scripts/de-query-pinecone", "scripts/de-write-milvus", "scripts/de-write-qdrant", + "scripts/de-write-pinecone", "scripts/document-rag", "scripts/embeddings-ollama", "scripts/embeddings-vectorize", From 9c97ca32f6e7792ecd29fbd77093126b5df84253 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Thu, 28 Nov 2024 19:21:28 +0000 Subject: [PATCH 14/37] Feature/memgraph (#182) * Add database override to bolt output, default is neo4j * Add memgraph templates --- templates/components.jsonnet | 2 + templates/components/memgraph.jsonnet | 81 +++++++++++++++++++ templates/stores/memgraph.jsonnet | 65 +++++++++++++++ templates/values/images.jsonnet | 4 +- .../trustgraph/query/triples/neo4j/service.py | 12 ++- .../trustgraph/storage/triples/neo4j/write.py | 12 ++- 6 files changed, 171 insertions(+), 5 deletions(-) create mode 100644 templates/components/memgraph.jsonnet create mode 100644 templates/stores/memgraph.jsonnet diff --git a/templates/components.jsonnet b/templates/components.jsonnet index 26368deb..1abf44a4 100644 --- a/templates/components.jsonnet +++ b/templates/components.jsonnet @@ -12,6 +12,7 @@ "graph-rag": import "components/graph-rag.jsonnet", "triple-store-cassandra": import "components/cassandra.jsonnet", "triple-store-neo4j": import "components/neo4j.jsonnet", + "triple-store-memgraph": import "components/memgraph.jsonnet", "llamafile": import "components/llamafile.jsonnet", "ollama": import "components/ollama.jsonnet", "openai": import "components/openai.jsonnet", @@ -34,6 +35,7 @@ // FIXME: Dupes "cassandra": import "components/cassandra.jsonnet", "neo4j": import "components/neo4j.jsonnet", + "memgraph": import "components/memgraph.jsonnet", "qdrant": import "components/qdrant.jsonnet", "pinecone": import "components/pinecone.jsonnet", "milvus": import "components/milvus.jsonnet", diff --git a/templates/components/memgraph.jsonnet b/templates/components/memgraph.jsonnet new file mode 100644 index 
00000000..5ec0a76e --- /dev/null +++ b/templates/components/memgraph.jsonnet @@ -0,0 +1,81 @@ +local base = import "base/base.jsonnet"; +local images = import "values/images.jsonnet"; +local url = import "values/url.jsonnet"; +local memgraph = import "stores/memgraph.jsonnet"; + +memgraph + { + + "memgraph-url":: "bolt://memgraph:7687", + "memgraph-database":: "memgraph", + + "store-triples" +: { + + create:: function(engine) + + local container = + engine.container("store-triples") + .with_image(images.trustgraph) + .with_command([ + "triples-write-neo4j", + "-p", + url.pulsar, + "-g", + $["memgraph-url"], + "--database", + $["memgraph-database"], + ]) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "store-triples", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + containerSet, + service, + ]) + + }, + + "query-triples" +: { + + create:: function(engine) + + local container = + engine.container("query-triples") + .with_image(images.trustgraph) + .with_command([ + "triples-query-neo4j", + "-p", + url.pulsar, + "-g", + $["memgraph-url"], + "--database", + $["memgraph-database"], + ]) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "query-triples", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + containerSet, + service, + ]) + + + } + +} + diff --git a/templates/stores/memgraph.jsonnet b/templates/stores/memgraph.jsonnet new file mode 100644 index 00000000..8f8b6216 --- /dev/null +++ b/templates/stores/memgraph.jsonnet @@ -0,0 +1,65 @@ +local base = import "base/base.jsonnet"; +local images = import "values/images.jsonnet"; + +{ + + "memgraph" +: { + + create:: function(engine) + + local container = + engine.container("memgraph") + 
.with_image(images.memgraph_mage) + .with_limits("1.0", "1000M") + .with_reservations("0.5", "1000M") + .with_port(7474, 7474, "api") + .with_port(7687, 7687, "api2"); + + local containerSet = engine.containers( + "memgraph", [ container ] + ); + + local service = + engine.service(containerSet) + .with_port(7474, 7474, "api") + .with_port(7687, 7687, "api2"); + + engine.resources([ + containerSet, + service, + ]) + + }, + + "memgraph-lab" +: { + + create:: function(engine) + + local container = + engine.container("lab") + .with_image(images.memgraph_lab) + .with_environment({ + QUICK_CONNECT_MG_HOST: "memgraph", + QUICK_CONNECT_MG_PORT: "7687", + }) + .with_limits("1.0", "512M") + .with_reservations("0.5", "512M") + .with_port(3010, 3000, "http"); + + local containerSet = engine.containers( + "lab", [ container ] + ); + + local service = + engine.service(containerSet) + .with_port(3010, 3010, "http"); + + engine.resources([ + containerSet, + service, + ]) + + }, + +} + diff --git a/templates/values/images.jsonnet b/templates/values/images.jsonnet index 01ecee4d..c583815b 100644 --- a/templates/values/images.jsonnet +++ b/templates/values/images.jsonnet @@ -10,5 +10,7 @@ local version = import "version.jsonnet"; prometheus: "docker.io/prom/prometheus:v2.53.2", grafana: "docker.io/grafana/grafana:11.1.4", trustgraph: "docker.io/trustgraph/trustgraph-flow:" + version, - qdrant: "docker.io/qdrant/qdrant:v1.11.1" + qdrant: "docker.io/qdrant/qdrant:v1.11.1", + memgraph_mage: "docker.io/memgraph/memgraph-mage:1.22-memgraph-2.22", + memgraph_lab: "docker.io/memgraph/lab:2.19.1", } diff --git a/trustgraph-flow/trustgraph/query/triples/neo4j/service.py b/trustgraph-flow/trustgraph/query/triples/neo4j/service.py index 9038f76d..2caa0193 100755 --- a/trustgraph-flow/trustgraph/query/triples/neo4j/service.py +++ b/trustgraph-flow/trustgraph/query/triples/neo4j/service.py @@ -21,6 +21,7 @@ default_subscriber = module default_graph_host = 'bolt://neo4j:7687' default_username = 
'neo4j' default_password = 'password' +default_database = 'neo4j' class Processor(ConsumerProducer): @@ -31,7 +32,8 @@ class Processor(ConsumerProducer): subscriber = params.get("subscriber", default_subscriber) graph_host = params.get("graph_host", default_graph_host) username = params.get("username", default_username) - password = params.get("passowrd", default_password) + password = params.get("password", default_password) + database = params.get("database", default_database) super(Processor, self).__init__( **params | { @@ -44,7 +46,7 @@ class Processor(ConsumerProducer): } ) - self.db = "neo4j" + self.db = database self.io = GraphDatabase.driver(graph_host, auth=(username, password)) @@ -342,6 +344,12 @@ class Processor(ConsumerProducer): help=f'Neo4j password (default: {default_password})' ) + parser.add_argument( + '--database', + default=default_database, + help=f'Neo4j database (default: {default_database})' + ) + def run(): Processor.start(module, __doc__) diff --git a/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py b/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py index 82302e96..929333e5 100755 --- a/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py +++ b/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py @@ -24,6 +24,7 @@ default_subscriber = module default_graph_host = 'bolt://neo4j:7687' default_username = 'neo4j' default_password = 'password' +default_database = 'neo4j' class Processor(Consumer): @@ -33,7 +34,8 @@ class Processor(Consumer): subscriber = params.get("subscriber", default_subscriber) graph_host = params.get("graph_host", default_graph_host) username = params.get("username", default_username) - password = params.get("passowrd", default_password) + password = params.get("password", default_password) + database = params.get("database", default_database) super(Processor, self).__init__( **params | { @@ -44,7 +46,7 @@ class Processor(Consumer): } ) - self.db = "neo4j" + self.db = database self.io = 
GraphDatabase.driver(graph_host, auth=(username, password)) @@ -152,6 +154,12 @@ class Processor(Consumer): help=f'Neo4j password (default: {default_password})' ) + parser.add_argument( + '--database', + default=default_database, + help=f'Neo4j database (default: {default_database})' + ) + def run(): Processor.start(module, __doc__) From c52b70c2864a6085b45c6e73432b921f539d3d80 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Thu, 28 Nov 2024 19:21:41 +0000 Subject: [PATCH 15/37] Fix metadata load format (#181) --- trustgraph-cli/scripts/tg-load-pdf | 15 ++++++++++++--- trustgraph-cli/scripts/tg-load-text | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/trustgraph-cli/scripts/tg-load-pdf b/trustgraph-cli/scripts/tg-load-pdf index 0dc8ced6..08ce6f91 100755 --- a/trustgraph-cli/scripts/tg-load-pdf +++ b/trustgraph-cli/scripts/tg-load-pdf @@ -81,9 +81,18 @@ class Loader: id=id, metadata=[ Triple( - s=Value(value=t["s"]["v"], is_uri=t["s"]["e"]), - p=Value(value=t["p"]["v"], is_uri=t["p"]["e"]), - o=Value(value=t["o"]["v"], is_uri=t["o"]["e"]) + s=Value( + value=t["s"], + is_uri=isinstance(t["s"], Uri) + ), + p=Value( + value=t["p"], + is_uri=isinstance(t["p"], Uri) + ), + o=Value( + value=t["o"], + is_uri=isinstance(t["o"], Uri) + ), ) for t in triples ], diff --git a/trustgraph-cli/scripts/tg-load-text b/trustgraph-cli/scripts/tg-load-text index 6ff8d09a..51664a1b 100755 --- a/trustgraph-cli/scripts/tg-load-text +++ b/trustgraph-cli/scripts/tg-load-text @@ -15,7 +15,7 @@ import uuid from trustgraph.schema import TextDocument, text_ingest_queue from trustgraph.schema import Metadata, Triple, Value from trustgraph.log_level import LogLevel -from trustgraph.knowledge import hash, to_uri +from trustgraph.knowledge import hash, to_uri, Literal, Uri from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG from trustgraph.knowledge import Organization, PublicationEvent from trustgraph.knowledge import DigitalDocument @@ -80,9 
+80,18 @@ class Loader: id=id, metadata=[ Triple( - s=Value(value=t["s"]["v"], is_uri=t["s"]["e"]), - p=Value(value=t["p"]["v"], is_uri=t["p"]["e"]), - o=Value(value=t["o"]["v"], is_uri=t["o"]["e"]) + s=Value( + value=t["s"], + is_uri=isinstance(t["s"], Uri) + ), + p=Value( + value=t["p"], + is_uri=isinstance(t["p"], Uri) + ), + o=Value( + value=t["o"], + is_uri=isinstance(t["o"], Uri) + ), ) for t in triples ], From cf564ed1473dfa100fe3bbab50c95300eaeec855 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Thu, 28 Nov 2024 21:26:36 +0000 Subject: [PATCH 16/37] PDF loader symbol error fixed (#183) --- trustgraph-cli/scripts/tg-load-pdf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trustgraph-cli/scripts/tg-load-pdf b/trustgraph-cli/scripts/tg-load-pdf index 08ce6f91..a0d2b3bc 100755 --- a/trustgraph-cli/scripts/tg-load-pdf +++ b/trustgraph-cli/scripts/tg-load-pdf @@ -16,7 +16,7 @@ import uuid from trustgraph.schema import Document, document_ingest_queue from trustgraph.schema import Metadata, Triple, Value from trustgraph.log_level import LogLevel -from trustgraph.knowledge import hash, to_uri +from trustgraph.knowledge import hash, to_uri, Uri from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG from trustgraph.knowledge import Organization, PublicationEvent from trustgraph.knowledge import DigitalDocument From c844d805e52c44b6a010244cd3f892497babba4c Mon Sep 17 00:00:00 2001 From: Cyber MacGeddon Date: Fri, 29 Nov 2024 17:03:31 +0000 Subject: [PATCH 17/37] Setup for release 0.17 branch --- .github/workflows/release.yaml | 2 +- trustgraph-bedrock/setup.py | 2 +- trustgraph-cli/setup.py | 2 +- trustgraph-embeddings-hf/setup.py | 4 ++-- trustgraph-flow/setup.py | 2 +- trustgraph-parquet/setup.py | 2 +- trustgraph-vertexai/setup.py | 2 +- trustgraph/setup.py | 14 +++++++------- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 0d6d2d29..fc85a6a8 100644 
--- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -5,7 +5,7 @@ on: workflow_dispatch: push: tags: - - v0.16.* + - v0.17.* permissions: contents: read diff --git a/trustgraph-bedrock/setup.py b/trustgraph-bedrock/setup.py index 1a99e227..d92cc9c7 100644 --- a/trustgraph-bedrock/setup.py +++ b/trustgraph-bedrock/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.16,<0.17", + "trustgraph-base>=0.17,<0.18", "pulsar-client", "prometheus-client", "boto3", diff --git a/trustgraph-cli/setup.py b/trustgraph-cli/setup.py index 1608cfdb..e9de429a 100644 --- a/trustgraph-cli/setup.py +++ b/trustgraph-cli/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.16,<0.17", + "trustgraph-base>=0.17,<0.18", "requests", "pulsar-client", "rdflib", diff --git a/trustgraph-embeddings-hf/setup.py b/trustgraph-embeddings-hf/setup.py index 2fbe079e..25ccfeab 100644 --- a/trustgraph-embeddings-hf/setup.py +++ b/trustgraph-embeddings-hf/setup.py @@ -34,8 +34,8 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.16,<0.17", - "trustgraph-flow>=0.16,<0.17", + "trustgraph-base>=0.17,<0.18", + "trustgraph-flow>=0.17,<0.18", "torch", "urllib3", "transformers", diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 2cbbdee4..8e81e12c 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - 
"trustgraph-base>=0.16,<0.17", + "trustgraph-base>=0.17,<0.18", "urllib3", "rdflib", "pymilvus", diff --git a/trustgraph-parquet/setup.py b/trustgraph-parquet/setup.py index 7dab60ac..dfe29653 100644 --- a/trustgraph-parquet/setup.py +++ b/trustgraph-parquet/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.16,<0.17", + "trustgraph-base>=0.17,<0.18", "pulsar-client", "prometheus-client", "pyarrow", diff --git a/trustgraph-vertexai/setup.py b/trustgraph-vertexai/setup.py index d19e8c0d..3ce10305 100644 --- a/trustgraph-vertexai/setup.py +++ b/trustgraph-vertexai/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.16,<0.17", + "trustgraph-base>=0.17,<0.18", "pulsar-client", "google-cloud-aiplatform", "prometheus-client", diff --git a/trustgraph/setup.py b/trustgraph/setup.py index 7bb8dfd3..5f9f1f2c 100644 --- a/trustgraph/setup.py +++ b/trustgraph/setup.py @@ -34,13 +34,13 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.16,<0.17", - "trustgraph-bedrock>=0.16,<0.17", - "trustgraph-cli>=0.16,<0.17", - "trustgraph-embeddings-hf>=0.16,<0.17", - "trustgraph-flow>=0.16,<0.17", - "trustgraph-parquet>=0.16,<0.17", - "trustgraph-vertexai>=0.16,<0.17", + "trustgraph-base>=0.17,<0.18", + "trustgraph-bedrock>=0.17,<0.18", + "trustgraph-cli>=0.17,<0.18", + "trustgraph-embeddings-hf>=0.17,<0.18", + "trustgraph-flow>=0.17,<0.18", + "trustgraph-parquet>=0.17,<0.18", + "trustgraph-vertexai>=0.17,<0.18", ], scripts=[ ] From 212102c61c485d4dc3a5a0bf1751539e4653e616 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 29 
Nov 2024 17:17:20 +0000 Subject: [PATCH 18/37] Tweak queue names (#184) --- trustgraph-base/trustgraph/schema/documents.py | 2 +- trustgraph-base/trustgraph/schema/graph.py | 4 ++-- trustgraph-base/trustgraph/schema/models.py | 4 ++-- trustgraph-base/trustgraph/schema/prompt.py | 2 +- trustgraph-base/trustgraph/schema/retrieval.py | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/trustgraph-base/trustgraph/schema/documents.py b/trustgraph-base/trustgraph/schema/documents.py index 59aba287..2a3d3d0c 100644 --- a/trustgraph-base/trustgraph/schema/documents.py +++ b/trustgraph-base/trustgraph/schema/documents.py @@ -60,5 +60,5 @@ document_embeddings_request_queue = topic( 'doc-embeddings', kind='non-persistent', namespace='request' ) document_embeddings_response_queue = topic( - 'doc-embeddings-response', kind='non-persistent', namespace='response', + 'doc-embeddings', kind='non-persistent', namespace='response', ) diff --git a/trustgraph-base/trustgraph/schema/graph.py b/trustgraph-base/trustgraph/schema/graph.py index 2d108a30..78c1a99c 100644 --- a/trustgraph-base/trustgraph/schema/graph.py +++ b/trustgraph-base/trustgraph/schema/graph.py @@ -34,7 +34,7 @@ graph_embeddings_request_queue = topic( 'graph-embeddings', kind='non-persistent', namespace='request' ) graph_embeddings_response_queue = topic( - 'graph-embeddings-response', kind='non-persistent', namespace='response', + 'graph-embeddings', kind='non-persistent', namespace='response' ) ############################################################################ @@ -67,5 +67,5 @@ triples_request_queue = topic( 'triples', kind='non-persistent', namespace='request' ) triples_response_queue = topic( - 'triples-response', kind='non-persistent', namespace='response', + 'triples', kind='non-persistent', namespace='response' ) diff --git a/trustgraph-base/trustgraph/schema/models.py b/trustgraph-base/trustgraph/schema/models.py index 70cb2c8f..a634e1c4 100644 --- 
a/trustgraph-base/trustgraph/schema/models.py +++ b/trustgraph-base/trustgraph/schema/models.py @@ -23,7 +23,7 @@ text_completion_request_queue = topic( 'text-completion', kind='non-persistent', namespace='request' ) text_completion_response_queue = topic( - 'text-completion-response', kind='non-persistent', namespace='response', + 'text-completion', kind='non-persistent', namespace='response' ) ############################################################################ @@ -41,5 +41,5 @@ embeddings_request_queue = topic( 'embeddings', kind='non-persistent', namespace='request' ) embeddings_response_queue = topic( - 'embeddings-response', kind='non-persistent', namespace='response' + 'embeddings', kind='non-persistent', namespace='response' ) diff --git a/trustgraph-base/trustgraph/schema/prompt.py b/trustgraph-base/trustgraph/schema/prompt.py index 9bcdf117..15eddea8 100644 --- a/trustgraph-base/trustgraph/schema/prompt.py +++ b/trustgraph-base/trustgraph/schema/prompt.py @@ -59,7 +59,7 @@ prompt_request_queue = topic( 'prompt', kind='non-persistent', namespace='request' ) prompt_response_queue = topic( - 'prompt-response', kind='non-persistent', namespace='response' + 'prompt', kind='non-persistent', namespace='response' ) ############################################################################ diff --git a/trustgraph-base/trustgraph/schema/retrieval.py b/trustgraph-base/trustgraph/schema/retrieval.py index ad860c3c..9c4361a1 100644 --- a/trustgraph-base/trustgraph/schema/retrieval.py +++ b/trustgraph-base/trustgraph/schema/retrieval.py @@ -20,7 +20,7 @@ graph_rag_request_queue = topic( 'graph-rag', kind='non-persistent', namespace='request' ) graph_rag_response_queue = topic( - 'graph-rag-response', kind='non-persistent', namespace='response' + 'graph-rag', kind='non-persistent', namespace='response' ) ############################################################################ @@ -40,5 +40,5 @@ document_rag_request_queue = topic( 'doc-rag', 
kind='non-persistent', namespace='request' ) document_rag_response_queue = topic( - 'doc-rag-response', kind='non-persistent', namespace='response' + 'doc-rag', kind='non-persistent', namespace='response' ) From 6d200c79c5796de8fd9e04b7802c83041335a711 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Mon, 2 Dec 2024 17:41:30 +0000 Subject: [PATCH 19/37] Feature/wikipedia ddg (#185) API-side support for Wikipedia, DBpedia and internet search functions This incorporates a refactor of the API code to break it up, separate classes for endpoints to reduce duplication --- templates/components/azure-openai.jsonnet | 2 +- templates/components/azure.jsonnet | 2 +- templates/components/bedrock.jsonnet | 2 +- templates/components/claude.jsonnet | 2 +- templates/components/cohere.jsonnet | 2 +- templates/components/document-rag.jsonnet | 2 +- templates/components/googleaistudio.jsonnet | 2 +- templates/components/graph-rag.jsonnet | 2 +- templates/components/llamafile.jsonnet | 2 +- templates/components/ollama.jsonnet | 2 +- templates/components/openai.jsonnet | 2 +- templates/components/prompt-template.jsonnet | 6 +- templates/components/trustgraph.jsonnet | 2 +- templates/components/vertexai.jsonnet | 2 +- test-api/test-agent2-api | 28 + test-api/test-dbpedia | 30 + test-api/test-encyclopedia | 30 + test-api/test-internet-search | 30 + test-api/test-prompt-api | 1 - test-api/test-prompt2-api | 1 - test-api/test-triples-query-api | 5 +- trustgraph-base/trustgraph/schema/__init__.py | 2 + trustgraph-base/trustgraph/schema/lookup.py | 42 + trustgraph-cli/scripts/tg-load-kg-core | 1 - trustgraph-flow/scripts/wikipedia-lookup | 6 + trustgraph-flow/setup.py | 1 + .../trustgraph/api/gateway/agent.py | 30 + .../trustgraph/api/gateway/dbpedia.py | 29 + .../trustgraph/api/gateway/embeddings.py | 27 + .../trustgraph/api/gateway/encyclopedia.py | 29 + .../trustgraph/api/gateway/endpoint.py | 153 +++ .../api/gateway/graph_embeddings_load.py | 60 ++ 
.../api/gateway/graph_embeddings_stream.py | 56 ++ .../trustgraph/api/gateway/graph_rag.py | 30 + .../trustgraph/api/gateway/internet_search.py | 29 + .../trustgraph/api/gateway/prompt.py | 41 + .../trustgraph/api/gateway/publisher.py | 41 + .../trustgraph/api/gateway/running.py | 5 + .../trustgraph/api/gateway/serialize.py | 57 ++ .../trustgraph/api/gateway/service.py | 873 ++---------------- .../trustgraph/api/gateway/socket.py | 68 ++ .../trustgraph/api/gateway/subscriber.py | 68 ++ .../trustgraph/api/gateway/text_completion.py | 28 + .../trustgraph/api/gateway/triples_load.py | 59 ++ .../trustgraph/api/gateway/triples_query.py | 53 ++ .../trustgraph/api/gateway/triples_stream.py | 56 ++ .../trustgraph/external/__init__.py | 0 .../trustgraph/external/wikipedia/__init__.py | 3 + .../trustgraph/external/wikipedia/__main__.py | 7 + .../trustgraph/external/wikipedia/service.py | 102 ++ 50 files changed, 1287 insertions(+), 826 deletions(-) create mode 100755 test-api/test-agent2-api create mode 100755 test-api/test-dbpedia create mode 100755 test-api/test-encyclopedia create mode 100755 test-api/test-internet-search create mode 100644 trustgraph-base/trustgraph/schema/lookup.py create mode 100755 trustgraph-flow/scripts/wikipedia-lookup create mode 100644 trustgraph-flow/trustgraph/api/gateway/agent.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/dbpedia.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/embeddings.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/encyclopedia.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/endpoint.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/graph_rag.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/internet_search.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/prompt.py create 
mode 100644 trustgraph-flow/trustgraph/api/gateway/publisher.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/running.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/serialize.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/socket.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/subscriber.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/text_completion.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/triples_load.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/triples_query.py create mode 100644 trustgraph-flow/trustgraph/api/gateway/triples_stream.py create mode 100644 trustgraph-flow/trustgraph/external/__init__.py create mode 100644 trustgraph-flow/trustgraph/external/wikipedia/__init__.py create mode 100644 trustgraph-flow/trustgraph/external/wikipedia/__main__.py create mode 100644 trustgraph-flow/trustgraph/external/wikipedia/service.py diff --git a/templates/components/azure-openai.jsonnet b/templates/components/azure-openai.jsonnet index cc3847c0..8afcaf11 100644 --- a/templates/components/azure-openai.jsonnet +++ b/templates/components/azure-openai.jsonnet @@ -48,7 +48,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/azure.jsonnet b/templates/components/azure.jsonnet index 82b79133..cf10dc66 100644 --- a/templates/components/azure.jsonnet +++ b/templates/components/azure.jsonnet @@ -46,7 +46,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff 
--git a/templates/components/bedrock.jsonnet b/templates/components/bedrock.jsonnet index 93978a59..6ccaa1c5 100644 --- a/templates/components/bedrock.jsonnet +++ b/templates/components/bedrock.jsonnet @@ -53,7 +53,7 @@ local chunker = import "chunker-recursive.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/claude.jsonnet b/templates/components/claude.jsonnet index c6c94e21..00e4ec79 100644 --- a/templates/components/claude.jsonnet +++ b/templates/components/claude.jsonnet @@ -45,7 +45,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/cohere.jsonnet b/templates/components/cohere.jsonnet index 11c30fbd..5bc9b39c 100644 --- a/templates/components/cohere.jsonnet +++ b/templates/components/cohere.jsonnet @@ -43,7 +43,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); diff --git a/templates/components/document-rag.jsonnet b/templates/components/document-rag.jsonnet index ac5c11ec..0a68dd52 100644 --- a/templates/components/document-rag.jsonnet +++ b/templates/components/document-rag.jsonnet @@ -19,7 +19,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "--prompt-request-queue", "non-persistent://tg/request/prompt-rag", "--prompt-response-queue", - "non-persistent://tg/response/prompt-rag-response", + 
"non-persistent://tg/response/prompt-rag", ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); diff --git a/templates/components/googleaistudio.jsonnet b/templates/components/googleaistudio.jsonnet index b6ee1d85..4088ceef 100644 --- a/templates/components/googleaistudio.jsonnet +++ b/templates/components/googleaistudio.jsonnet @@ -50,7 +50,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/graph-rag.jsonnet b/templates/components/graph-rag.jsonnet index c0200d1e..860152c9 100644 --- a/templates/components/graph-rag.jsonnet +++ b/templates/components/graph-rag.jsonnet @@ -112,7 +112,7 @@ local url = import "values/url.jsonnet"; "--prompt-request-queue", "non-persistent://tg/request/prompt-rag", "--prompt-response-queue", - "non-persistent://tg/response/prompt-rag-response", + "non-persistent://tg/response/prompt-rag", "--entity-limit", std.toString($["graph-rag-entity-limit"]), "--triple-limit", diff --git a/templates/components/llamafile.jsonnet b/templates/components/llamafile.jsonnet index d51cda61..bc1a011c 100644 --- a/templates/components/llamafile.jsonnet +++ b/templates/components/llamafile.jsonnet @@ -40,7 +40,7 @@ local prompts = import "prompts/slm.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/ollama.jsonnet b/templates/components/ollama.jsonnet index 2ae696b4..8da00848 100644 --- a/templates/components/ollama.jsonnet +++ b/templates/components/ollama.jsonnet @@ -40,7 +40,7 @@ local prompts = import 
"prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/openai.jsonnet b/templates/components/openai.jsonnet index 83cbd406..27725cb6 100644 --- a/templates/components/openai.jsonnet +++ b/templates/components/openai.jsonnet @@ -50,7 +50,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/prompt-template.jsonnet b/templates/components/prompt-template.jsonnet index ac820df6..3dadf337 100644 --- a/templates/components/prompt-template.jsonnet +++ b/templates/components/prompt-template.jsonnet @@ -53,7 +53,7 @@ local default_prompts = import "prompts/default-prompts.jsonnet"; "--text-completion-request-queue", "non-persistent://tg/request/text-completion", "--text-completion-response-queue", - "non-persistent://tg/response/text-completion-response", + "non-persistent://tg/response/text-completion", "--system-prompt", $["prompts"]["system-template"], @@ -92,11 +92,11 @@ local default_prompts = import "prompts/default-prompts.jsonnet"; "-i", "non-persistent://tg/request/prompt-rag", "-o", - "non-persistent://tg/response/prompt-rag-response", + "non-persistent://tg/response/prompt-rag", "--text-completion-request-queue", "non-persistent://tg/request/text-completion-rag", "--text-completion-response-queue", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", "--system-prompt", $["prompts"]["system-template"], diff --git a/templates/components/trustgraph.jsonnet 
b/templates/components/trustgraph.jsonnet index 37c05dae..6c60921c 100644 --- a/templates/components/trustgraph.jsonnet +++ b/templates/components/trustgraph.jsonnet @@ -186,7 +186,7 @@ local prompt = import "prompt-template.jsonnet"; "-p", url.pulsar, "-i", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); diff --git a/templates/components/vertexai.jsonnet b/templates/components/vertexai.jsonnet index 44fe27c6..ef193156 100644 --- a/templates/components/vertexai.jsonnet +++ b/templates/components/vertexai.jsonnet @@ -93,7 +93,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_limits("0.5", "256M") .with_reservations("0.1", "256M") diff --git a/test-api/test-agent2-api b/test-api/test-agent2-api new file mode 100755 index 00000000..766b16c9 --- /dev/null +++ b/test-api/test-agent2-api @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "question": "What is 14 plus 12. 
Justify your answer.", +} + +resp = requests.post( + f"{url}agent", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["answer"]) + + diff --git a/test-api/test-dbpedia b/test-api/test-dbpedia new file mode 100755 index 00000000..e361f533 --- /dev/null +++ b/test-api/test-dbpedia @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "term": "Cornwall", +} + +resp = requests.post( + f"{url}dbpedia", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-encyclopedia b/test-api/test-encyclopedia new file mode 100755 index 00000000..ad4e5b36 --- /dev/null +++ b/test-api/test-encyclopedia @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "term": "Cornwall", +} + +resp = requests.post( + f"{url}encyclopedia", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-internet-search b/test-api/test-internet-search new file mode 100755 index 00000000..8c854c77 --- /dev/null +++ b/test-api/test-internet-search @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "term": "Cornwall", +} + +resp = requests.post( + 
f"{url}internet-search", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-prompt-api b/test-api/test-prompt-api index 1005bc90..4f69f09a 100755 --- a/test-api/test-prompt-api +++ b/test-api/test-prompt-api @@ -22,7 +22,6 @@ resp = requests.post( resp = resp.json() -print(resp) if "error" in resp: print(f"Error: {resp['error']}") sys.exit(1) diff --git a/test-api/test-prompt2-api b/test-api/test-prompt2-api index f1b80c48..1e641439 100755 --- a/test-api/test-prompt2-api +++ b/test-api/test-prompt2-api @@ -22,7 +22,6 @@ resp = requests.post( resp = resp.json() -print(resp) if "error" in resp: print(f"Error: {resp['error']}") sys.exit(1) diff --git a/test-api/test-triples-query-api b/test-api/test-triples-query-api index e2895a28..1aa8a0b1 100755 --- a/test-api/test-triples-query-api +++ b/test-api/test-triples-query-api @@ -9,7 +9,10 @@ url = "http://localhost:8088/api/v1/" ############################################################################ input = { - "p": "http://www.w3.org/2000/01/rdf-schema#label", + "p": { + "v": "http://www.w3.org/2000/01/rdf-schema#label", + "e": True, + }, "limit": 10 } diff --git a/trustgraph-base/trustgraph/schema/__init__.py b/trustgraph-base/trustgraph/schema/__init__.py index 3196691b..be41b670 100644 --- a/trustgraph-base/trustgraph/schema/__init__.py +++ b/trustgraph-base/trustgraph/schema/__init__.py @@ -9,4 +9,6 @@ from . graph import * from . retrieval import * from . metadata import * from . agent import * +from . lookup import * + diff --git a/trustgraph-base/trustgraph/schema/lookup.py b/trustgraph-base/trustgraph/schema/lookup.py new file mode 100644 index 00000000..d0a0517c --- /dev/null +++ b/trustgraph-base/trustgraph/schema/lookup.py @@ -0,0 +1,42 @@ + +from pulsar.schema import Record, String + +from . 
types import Error, Value, Triple +from . topic import topic +from . metadata import Metadata + +############################################################################ + +# Lookups + +class LookupRequest(Record): + kind = String() + term = String() + +class LookupResponse(Record): + text = String() + error = Error() + +encyclopedia_lookup_request_queue = topic( + 'encyclopedia', kind='non-persistent', namespace='request' +) +encyclopedia_lookup_response_queue = topic( + 'encyclopedia', kind='non-persistent', namespace='response', +) + +dbpedia_lookup_request_queue = topic( + 'dbpedia', kind='non-persistent', namespace='request' +) +dbpedia_lookup_response_queue = topic( + 'dbpedia', kind='non-persistent', namespace='response', +) + +internet_search_request_queue = topic( + 'internet-search', kind='non-persistent', namespace='request' +) +internet_search_response_queue = topic( + 'internet-search', kind='non-persistent', namespace='response', +) + +############################################################################ + diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core index e2d0a405..4e207cf1 100755 --- a/trustgraph-cli/scripts/tg-load-kg-core +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -93,7 +93,6 @@ async def loader(ge_queue, t_queue, path, format, user, collection): if collection: unpacked["metadata"]["collection"] = collection - if unpacked[0] == "t": await t_queue.put(unpacked[1]) t_counts += 1 diff --git a/trustgraph-flow/scripts/wikipedia-lookup b/trustgraph-flow/scripts/wikipedia-lookup new file mode 100755 index 00000000..a89b1009 --- /dev/null +++ b/trustgraph-flow/scripts/wikipedia-lookup @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.external.wikipedia import run + +run() + diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 8e81e12c..65bb7326 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -106,5 +106,6 @@ setuptools.setup( 
"scripts/triples-query-neo4j", "scripts/triples-write-cassandra", "scripts/triples-write-neo4j", + "scripts/wikipedia-lookup", ] ) diff --git a/trustgraph-flow/trustgraph/api/gateway/agent.py b/trustgraph-flow/trustgraph/api/gateway/agent.py new file mode 100644 index 00000000..28a1e185 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/agent.py @@ -0,0 +1,30 @@ + +from ... schema import AgentRequest, AgentResponse +from ... schema import agent_request_queue +from ... schema import agent_response_queue + +from . endpoint import MultiResponseServiceEndpoint + +class AgentEndpoint(MultiResponseServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(AgentEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=agent_request_queue, + response_queue=agent_response_queue, + request_schema=AgentRequest, + response_schema=AgentResponse, + endpoint_path="/api/v1/agent", + timeout=timeout, + ) + + def to_request(self, body): + return AgentRequest( + question=body["question"] + ) + + def from_response(self, message): + if message.answer: + return { "answer": message.answer }, True + else: + return {}, False diff --git a/trustgraph-flow/trustgraph/api/gateway/dbpedia.py b/trustgraph-flow/trustgraph/api/gateway/dbpedia.py new file mode 100644 index 00000000..0ccb3d6b --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/dbpedia.py @@ -0,0 +1,29 @@ + +from ... schema import LookupRequest, LookupResponse +from ... schema import dbpedia_lookup_request_queue +from ... schema import dbpedia_lookup_response_queue + +from . 
endpoint import ServiceEndpoint + +class DbpediaEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(DbpediaEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=dbpedia_lookup_request_queue, + response_queue=dbpedia_lookup_response_queue, + request_schema=LookupRequest, + response_schema=LookupResponse, + endpoint_path="/api/v1/dbpedia", + timeout=timeout, + ) + + def to_request(self, body): + return LookupRequest( + term=body["term"], + kind=body.get("kind", None), + ) + + def from_response(self, message): + return { "text": message.text } + diff --git a/trustgraph-flow/trustgraph/api/gateway/embeddings.py b/trustgraph-flow/trustgraph/api/gateway/embeddings.py new file mode 100644 index 00000000..b5fcc0a4 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/embeddings.py @@ -0,0 +1,27 @@ + +from ... schema import EmbeddingsRequest, EmbeddingsResponse +from ... schema import embeddings_request_queue +from ... schema import embeddings_response_queue + +from . endpoint import ServiceEndpoint + +class EmbeddingsEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(EmbeddingsEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=embeddings_request_queue, + response_queue=embeddings_response_queue, + request_schema=EmbeddingsRequest, + response_schema=EmbeddingsResponse, + endpoint_path="/api/v1/embeddings", + timeout=timeout, + ) + + def to_request(self, body): + return EmbeddingsRequest( + text=body["text"] + ) + + def from_response(self, message): + return { "vectors": message.vectors } diff --git a/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py b/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py new file mode 100644 index 00000000..e379d7d4 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py @@ -0,0 +1,29 @@ + +from ... schema import LookupRequest, LookupResponse +from ... schema import encyclopedia_lookup_request_queue +from ... 
schema import encyclopedia_lookup_response_queue + +from . endpoint import ServiceEndpoint + +class EncyclopediaEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(EncyclopediaEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=encyclopedia_lookup_request_queue, + response_queue=encyclopedia_lookup_response_queue, + request_schema=LookupRequest, + response_schema=LookupResponse, + endpoint_path="/api/v1/encyclopedia", + timeout=timeout, + ) + + def to_request(self, body): + return LookupRequest( + term=body["term"], + kind=body.get("kind", None), + ) + + def from_response(self, message): + return { "text": message.text } + diff --git a/trustgraph-flow/trustgraph/api/gateway/endpoint.py b/trustgraph-flow/trustgraph/api/gateway/endpoint.py new file mode 100644 index 00000000..075e4a0e --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/endpoint.py @@ -0,0 +1,153 @@ + +import asyncio +from pulsar.schema import JsonSchema +from aiohttp import web +import uuid +import logging + +from . publisher import Publisher +from . 
subscriber import Subscriber + +logger = logging.getLogger("endpoint") +logger.setLevel(logging.INFO) + +class ServiceEndpoint: + + def __init__( + self, + pulsar_host, + request_queue, request_schema, + response_queue, response_schema, + endpoint_path, + subscription="api-gateway", consumer_name="api-gateway", + timeout=600, + ): + + self.pub = Publisher( + pulsar_host, request_queue, + schema=JsonSchema(request_schema) + ) + + self.sub = Subscriber( + pulsar_host, response_queue, + subscription, consumer_name, + JsonSchema(response_schema) + ) + + self.path = endpoint_path + self.timeout = timeout + + async def start(self): + + self.pub_task = asyncio.create_task(self.pub.run()) + self.sub_task = asyncio.create_task(self.sub.run()) + + def add_routes(self, app): + + app.add_routes([ + web.post(self.path, self.handle), + ]) + + def to_request(self, request): + raise RuntimeError("Not defined") + + def from_response(self, response): + raise RuntimeError("Not defined") + + async def handle(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.sub.subscribe(id) + + print(data) + + await self.pub.send( + id, + self.to_request(data), + ) + + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + print(resp) + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + return web.json_response( + self.from_response(resp) + ) + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.sub.unsubscribe(id) + + +class MultiResponseServiceEndpoint(ServiceEndpoint): + + async def handle(self, request): + + id = str(uuid.uuid4()) + + try: + + data = await request.json() + + q = await self.sub.subscribe(id) + + print(data) + + await self.pub.send( + id, + self.to_request(data), + ) + + # Keeps looking at responses... 
+ + while True: + + try: + resp = await asyncio.wait_for(q.get(), self.timeout) + except: + raise RuntimeError("Timeout waiting for response") + + print(resp) + + if resp.error: + return web.json_response( + { "error": resp.error.message } + ) + + # Until from_response says we have a finished answer + resp, fin = self.from_response(resp) + + + if fin: + return web.json_response(resp) + + # Not finished, so loop round and continue + + except Exception as e: + logging.error(f"Exception: {e}") + + return web.json_response( + { "error": str(e) } + ) + + finally: + await self.sub.unsubscribe(id) diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py new file mode 100644 index 00000000..3cc3f533 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py @@ -0,0 +1,60 @@ + +import asyncio +from pulsar.schema import JsonSchema +import uuid +from aiohttp import WSMsgType + +from ... schema import Metadata +from ... schema import GraphEmbeddings +from ... schema import graph_embeddings_store_queue + +from . publisher import Publisher +from . socket import SocketEndpoint +from . 
serialize import to_subgraph, to_value + +class GraphEmbeddingsLoadEndpoint(SocketEndpoint): + + def __init__(self, pulsar_host, path="/api/v1/load/graph-embeddings"): + + super(GraphEmbeddingsLoadEndpoint, self).__init__( + endpoint_path=path + ) + + self.pulsar_host=pulsar_host + + self.publisher = Publisher( + self.pulsar_host, graph_embeddings_store_queue, + schema=JsonSchema(GraphEmbeddings) + ) + + async def start(self): + + self.task = asyncio.create_task( + self.publisher.run() + ) + + async def listener(self, ws, running): + + async for msg in ws: + # On error, finish + if msg.type == WSMsgType.ERROR: + break + else: + + data = msg.json() + + elt = GraphEmbeddings( + metadata=Metadata( + id=data["metadata"]["id"], + metadata=to_subgraph(data["metadata"]["metadata"]), + user=data["metadata"]["user"], + collection=data["metadata"]["collection"], + ), + entity=to_value(data["entity"]), + vectors=data["vectors"], + ) + + await self.publisher.send(None, elt) + + + running.stop() diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py new file mode 100644 index 00000000..978684cf --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py @@ -0,0 +1,56 @@ + +import asyncio +from pulsar.schema import JsonSchema +import uuid + +from ... schema import GraphEmbeddings +from ... schema import graph_embeddings_store_queue + +from . subscriber import Subscriber +from . socket import SocketEndpoint +from . 
serialize import serialize_graph_embeddings + +class GraphEmbeddingsStreamEndpoint(SocketEndpoint): + + def __init__(self, pulsar_host, path="/api/v1/stream/graph-embeddings"): + + super(GraphEmbeddingsStreamEndpoint, self).__init__( + endpoint_path=path + ) + + self.pulsar_host=pulsar_host + + self.subscriber = Subscriber( + self.pulsar_host, graph_embeddings_store_queue, + "api-gateway", "api-gateway", + schema=JsonSchema(GraphEmbeddings) + ) + + async def start(self): + + self.task = asyncio.create_task( + self.subscriber.run() + ) + + async def async_thread(self, ws, running): + + id = str(uuid.uuid4()) + + q = await self.subscriber.subscribe_all(id) + + while running.get(): + try: + resp = await asyncio.wait_for(q.get(), 0.5) + await ws.send_json(serialize_graph_embeddings(resp)) + + except TimeoutError: + continue + + except Exception as e: + print(f"Exception: {str(e)}", flush=True) + break + + await self.subscriber.unsubscribe_all(id) + + running.stop() + diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_rag.py b/trustgraph-flow/trustgraph/api/gateway/graph_rag.py new file mode 100644 index 00000000..1381dc23 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/graph_rag.py @@ -0,0 +1,30 @@ + +from ... schema import GraphRagQuery, GraphRagResponse +from ... schema import graph_rag_request_queue +from ... schema import graph_rag_response_queue + +from . 
endpoint import ServiceEndpoint + +class GraphRagEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(GraphRagEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=graph_rag_request_queue, + response_queue=graph_rag_response_queue, + request_schema=GraphRagQuery, + response_schema=GraphRagResponse, + endpoint_path="/api/v1/graph-rag", + timeout=timeout, + ) + + def to_request(self, body): + return GraphRagQuery( + query=body["query"], + user=body.get("user", "trustgraph"), + collection=body.get("collection", "default"), + ) + + def from_response(self, message): + return { "response": message.response } + diff --git a/trustgraph-flow/trustgraph/api/gateway/internet_search.py b/trustgraph-flow/trustgraph/api/gateway/internet_search.py new file mode 100644 index 00000000..c84ed82a --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/internet_search.py @@ -0,0 +1,29 @@ + +from ... schema import LookupRequest, LookupResponse +from ... schema import internet_search_request_queue +from ... schema import internet_search_response_queue + +from . endpoint import ServiceEndpoint + +class InternetSearchEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(InternetSearchEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=internet_search_request_queue, + response_queue=internet_search_response_queue, + request_schema=LookupRequest, + response_schema=LookupResponse, + endpoint_path="/api/v1/internet-search", + timeout=timeout, + ) + + def to_request(self, body): + return LookupRequest( + term=body["term"], + kind=body.get("kind", None), + ) + + def from_response(self, message): + return { "text": message.text } + diff --git a/trustgraph-flow/trustgraph/api/gateway/prompt.py b/trustgraph-flow/trustgraph/api/gateway/prompt.py new file mode 100644 index 00000000..e02effb9 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/prompt.py @@ -0,0 +1,41 @@ + +import json + +from ... 
schema import PromptRequest, PromptResponse +from ... schema import prompt_request_queue +from ... schema import prompt_response_queue + +from . endpoint import ServiceEndpoint + +class PromptEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(PromptEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=prompt_request_queue, + response_queue=prompt_response_queue, + request_schema=PromptRequest, + response_schema=PromptResponse, + endpoint_path="/api/v1/prompt", + timeout=timeout, + ) + + def to_request(self, body): + return PromptRequest( + id=body["id"], + terms={ + k: json.dumps(v) + for k, v in body["variables"].items() + } + ) + + def from_response(self, message): + if message.object: + return { + "object": message.object + } + else: + return { + "text": message.text + } + diff --git a/trustgraph-flow/trustgraph/api/gateway/publisher.py b/trustgraph-flow/trustgraph/api/gateway/publisher.py new file mode 100644 index 00000000..1bff44dd --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/publisher.py @@ -0,0 +1,41 @@ + +import asyncio +import aiopulsar + +class Publisher: + + def __init__(self, pulsar_host, topic, schema=None, max_size=10, + chunking_enabled=False): + self.pulsar_host = pulsar_host + self.topic = topic + self.schema = schema + self.q = asyncio.Queue(maxsize=max_size) + self.chunking_enabled = chunking_enabled + + async def run(self): + + while True: + + try: + async with aiopulsar.connect(self.pulsar_host) as client: + async with client.create_producer( + topic=self.topic, + schema=self.schema, + chunking_enabled=self.chunking_enabled, + ) as producer: + while True: + id, item = await self.q.get() + + if id: + await producer.send(item, { "id": id }) + else: + await producer.send(item) + + except Exception as e: + print("Exception:", e, flush=True) + + # If handler drops out, sleep a retry + await asyncio.sleep(2) + + async def send(self, id, msg): + await self.q.put((id, msg)) diff --git 
a/trustgraph-flow/trustgraph/api/gateway/running.py b/trustgraph-flow/trustgraph/api/gateway/running.py new file mode 100644 index 00000000..e6a91e66 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/running.py @@ -0,0 +1,5 @@ + +class Running: + def __init__(self): self.running = True + def get(self): return self.running + def stop(self): self.running = False diff --git a/trustgraph-flow/trustgraph/api/gateway/serialize.py b/trustgraph-flow/trustgraph/api/gateway/serialize.py new file mode 100644 index 00000000..2b955645 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/serialize.py @@ -0,0 +1,57 @@ +from ... schema import Value, Triple + +def to_value(x): + return Value(value=x["v"], is_uri=x["e"]) + +def to_subgraph(x): + return [ + Triple( + s=to_value(t["s"]), + p=to_value(t["p"]), + o=to_value(t["o"]) + ) + for t in x + ] + +def serialize_value(v): + return { + "v": v.value, + "e": v.is_uri, + } + +def serialize_triple(t): + return { + "s": serialize_value(t.s), + "p": serialize_value(t.p), + "o": serialize_value(t.o) + } + +def serialize_subgraph(sg): + return [ + serialize_triple(t) + for t in sg + ] + +def serialize_triples(message): + return { + "metadata": { + "id": message.metadata.id, + "metadata": serialize_subgraph(message.metadata.metadata), + "user": message.metadata.user, + "collection": message.metadata.collection, + }, + "triples": serialize_subgraph(message.triples), + } + +def serialize_graph_embeddings(message): + return { + "metadata": { + "id": message.metadata.id, + "metadata": serialize_subgraph(message.metadata.metadata), + "user": message.metadata.user, + "collection": message.metadata.collection, + }, + "vectors": message.vectors, + "entity": serialize_value(message.entity), + } + diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index 7b12e1a2..dcdd9779 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ 
b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -1,4 +1,3 @@ - """ API gateway. Offers HTTP services which are translated to interaction on the Pulsar bus. @@ -14,57 +13,39 @@ module = ".".join(__name__.split(".")[1:-1]) import asyncio import argparse -from aiohttp import web, WSMsgType -import json +from aiohttp import web import logging -import uuid import os import base64 import pulsar -from pulsar.asyncio import Client from pulsar.schema import JsonSchema -import _pulsar -import aiopulsar from prometheus_client import start_http_server from ... log_level import LogLevel -from trustgraph.clients.llm_client import LlmClient -from trustgraph.clients.prompt_client import PromptClient - -from ... schema import Value, Metadata, Document, TextDocument, Triple - -from ... schema import TextCompletionRequest, TextCompletionResponse -from ... schema import text_completion_request_queue -from ... schema import text_completion_response_queue - -from ... schema import PromptRequest, PromptResponse -from ... schema import prompt_request_queue -from ... schema import prompt_response_queue - -from ... schema import GraphRagQuery, GraphRagResponse -from ... schema import graph_rag_request_queue -from ... schema import graph_rag_response_queue - -from ... schema import TriplesQueryRequest, TriplesQueryResponse, Triples -from ... schema import triples_request_queue -from ... schema import triples_response_queue -from ... schema import triples_store_queue - -from ... schema import GraphEmbeddings -from ... schema import graph_embeddings_store_queue - -from ... schema import AgentRequest, AgentResponse -from ... schema import agent_request_queue -from ... schema import agent_response_queue - -from ... schema import EmbeddingsRequest, EmbeddingsResponse -from ... schema import embeddings_request_queue -from ... schema import embeddings_response_queue - +from ... schema import Metadata, Document, TextDocument from ... 
schema import document_ingest_queue, text_ingest_queue +from . serialize import to_subgraph +from . running import Running +from . publisher import Publisher +from . subscriber import Subscriber +from . endpoint import ServiceEndpoint, MultiResponseServiceEndpoint +from . text_completion import TextCompletionEndpoint +from . prompt import PromptEndpoint +from . graph_rag import GraphRagEndpoint +from . triples_query import TriplesQueryEndpoint +from . embeddings import EmbeddingsEndpoint +from . encyclopedia import EncyclopediaEndpoint +from . agent import AgentEndpoint +from . dbpedia import DbpediaEndpoint +from . internet_search import InternetSearchEndpoint +from . triples_stream import TriplesStreamEndpoint +from . graph_embeddings_stream import GraphEmbeddingsStreamEndpoint +from . triples_load import TriplesLoadEndpoint +from . graph_embeddings_load import GraphEmbeddingsLoadEndpoint + logger = logging.getLogger("api") logger.setLevel(logging.INFO) @@ -72,168 +53,6 @@ default_pulsar_host = os.getenv("PULSAR_HOST", "pulsar://pulsar:6650") default_timeout = 600 default_port = 8088 -def to_value(x): - return Value(value=x["v"], is_uri=x["e"]) - -def to_subgraph(x): - return [ - Triple( - s=to_value(t["s"]), - p=to_value(t["p"]), - o=to_value(t["o"]) - ) - for t in x - ] - -class Running: - def __init__(self): self.running = True - def get(self): return self.running - def stop(self): self.running = False - -class Publisher: - - def __init__(self, pulsar_host, topic, schema=None, max_size=10, - chunking_enabled=False): - self.pulsar_host = pulsar_host - self.topic = topic - self.schema = schema - self.q = asyncio.Queue(maxsize=max_size) - self.chunking_enabled = chunking_enabled - - async def run(self): - - while True: - - try: - async with aiopulsar.connect(self.pulsar_host) as client: - async with client.create_producer( - topic=self.topic, - schema=self.schema, - chunking_enabled=self.chunking_enabled, - ) as producer: - while True: - id, item = await 
self.q.get() - - if id: - await producer.send(item, { "id": id }) - else: - await producer.send(item) - - except Exception as e: - print("Exception:", e, flush=True) - - # If handler drops out, sleep a retry - await asyncio.sleep(2) - - async def send(self, id, msg): - await self.q.put((id, msg)) - -class Subscriber: - - def __init__(self, pulsar_host, topic, subscription, consumer_name, - schema=None, max_size=10): - self.pulsar_host = pulsar_host - self.topic = topic - self.subscription = subscription - self.consumer_name = consumer_name - self.schema = schema - self.q = {} - self.full = {} - - async def run(self): - while True: - try: - async with aiopulsar.connect(self.pulsar_host) as client: - async with client.subscribe( - topic=self.topic, - subscription_name=self.subscription, - consumer_name=self.consumer_name, - schema=self.schema, - ) as consumer: - while True: - msg = await consumer.receive() - - # Acknowledge successful reception of the message - await consumer.acknowledge(msg) - - try: - id = msg.properties()["id"] - except: - id = None - - value = msg.value() - if id in self.q: - await self.q[id].put(value) - - for q in self.full.values(): - await q.put(value) - - except Exception as e: - print("Exception:", e, flush=True) - - # If handler drops out, sleep a retry - await asyncio.sleep(2) - - async def subscribe(self, id): - q = asyncio.Queue() - self.q[id] = q - return q - - async def unsubscribe(self, id): - if id in self.q: - del self.q[id] - - async def subscribe_all(self, id): - q = asyncio.Queue() - self.full[id] = q - return q - - async def unsubscribe_all(self, id): - if id in self.full: - del self.full[id] - -def serialize_value(v): - return { - "v": v.value, - "e": v.is_uri, - } - -def serialize_triple(t): - return { - "s": serialize_value(t.s), - "p": serialize_value(t.p), - "o": serialize_value(t.o) - } - -def serialize_subgraph(sg): - return [ - serialize_triple(t) - for t in sg - ] - -def serialize_triples(message): - return { - 
"metadata": { - "id": message.metadata.id, - "metadata": serialize_subgraph(message.metadata.metadata), - "user": message.metadata.user, - "collection": message.metadata.collection, - }, - "triples": serialize_subgraph(message.triples), - } - -def serialize_graph_embeddings(message): - return { - "metadata": { - "id": message.metadata.id, - "metadata": serialize_subgraph(message.metadata.metadata), - "user": message.metadata.user, - "collection": message.metadata.collection, - }, - "vectors": message.vectors, - "entity": message.entity, - } - class Api: def __init__(self, **config): @@ -247,93 +66,47 @@ class Api: self.timeout = int(config.get("timeout", default_timeout)) self.pulsar_host = config.get("pulsar_host", default_pulsar_host) - self.llm_out = Publisher( - self.pulsar_host, text_completion_request_queue, - schema=JsonSchema(TextCompletionRequest) - ) - - self.llm_in = Subscriber( - self.pulsar_host, text_completion_response_queue, - "api-gateway", "api-gateway", - JsonSchema(TextCompletionResponse) - ) - - self.prompt_out = Publisher( - self.pulsar_host, prompt_request_queue, - schema=JsonSchema(PromptRequest) - ) - - self.prompt_in = Subscriber( - self.pulsar_host, prompt_response_queue, - "api-gateway", "api-gateway", - JsonSchema(PromptResponse) - ) - - self.graph_rag_out = Publisher( - self.pulsar_host, graph_rag_request_queue, - schema=JsonSchema(GraphRagQuery) - ) - - self.graph_rag_in = Subscriber( - self.pulsar_host, graph_rag_response_queue, - "api-gateway", "api-gateway", - JsonSchema(GraphRagResponse) - ) - - self.triples_query_out = Publisher( - self.pulsar_host, triples_request_queue, - schema=JsonSchema(TriplesQueryRequest) - ) - - self.triples_query_in = Subscriber( - self.pulsar_host, triples_response_queue, - "api-gateway", "api-gateway", - JsonSchema(TriplesQueryResponse) - ) - - self.agent_out = Publisher( - self.pulsar_host, agent_request_queue, - schema=JsonSchema(AgentRequest) - ) - - self.agent_in = Subscriber( - self.pulsar_host, 
agent_response_queue, - "api-gateway", "api-gateway", - JsonSchema(AgentResponse) - ) - - self.embeddings_out = Publisher( - self.pulsar_host, embeddings_request_queue, - schema=JsonSchema(EmbeddingsRequest) - ) - - self.embeddings_in = Subscriber( - self.pulsar_host, embeddings_response_queue, - "api-gateway", "api-gateway", - JsonSchema(EmbeddingsResponse) - ) - - self.triples_tap = Subscriber( - self.pulsar_host, triples_store_queue, - "api-gateway", "api-gateway", - schema=JsonSchema(Triples) - ) - - self.triples_pub = Publisher( - self.pulsar_host, triples_store_queue, - schema=JsonSchema(Triples) - ) - - self.graph_embeddings_tap = Subscriber( - self.pulsar_host, graph_embeddings_store_queue, - "api-gateway", "api-gateway", - schema=JsonSchema(GraphEmbeddings) - ) - - self.graph_embeddings_pub = Publisher( - self.pulsar_host, graph_embeddings_store_queue, - schema=JsonSchema(GraphEmbeddings) - ) + self.endpoints = [ + TextCompletionEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + PromptEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + GraphRagEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + TriplesQueryEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + EmbeddingsEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + AgentEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + EncyclopediaEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + DbpediaEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + InternetSearchEndpoint( + pulsar_host=self.pulsar_host, timeout=self.timeout, + ), + TriplesStreamEndpoint( + pulsar_host=self.pulsar_host + ), + GraphEmbeddingsStreamEndpoint( + pulsar_host=self.pulsar_host + ), + TriplesLoadEndpoint( + pulsar_host=self.pulsar_host + ), + GraphEmbeddingsLoadEndpoint( + pulsar_host=self.pulsar_host + ), + ] self.document_out = Publisher( self.pulsar_host, document_ingest_queue, @@ 
-347,323 +120,14 @@ class Api: chunking_enabled=True, ) + for ep in self.endpoints: + ep.add_routes(self.app) + self.app.add_routes([ - web.post("/api/v1/text-completion", self.llm), - web.post("/api/v1/prompt", self.prompt), - web.post("/api/v1/graph-rag", self.graph_rag), - web.post("/api/v1/triples-query", self.triples_query), - web.post("/api/v1/agent", self.agent), - web.post("/api/v1/embeddings", self.embeddings), web.post("/api/v1/load/document", self.load_document), web.post("/api/v1/load/text", self.load_text), - web.get("/api/v1/ws", self.socket), - - web.get("/api/v1/stream/triples", self.stream_triples), - web.get( - "/api/v1/stream/graph-embeddings", - self.stream_graph_embeddings - ), - - web.get("/api/v1/load/triples", self.load_triples), - web.get( - "/api/v1/load/graph-embeddings", - self.load_graph_embeddings - ), - ]) - async def llm(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.llm_in.subscribe(id) - - await self.llm_out.send( - id, - TextCompletionRequest( - system=data["system"], - prompt=data["prompt"] - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - { "response": resp.response } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.llm_in.unsubscribe(id) - - async def prompt(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.prompt_in.subscribe(id) - - terms = { - k: json.dumps(v) - for k, v in data["variables"].items() - } - - await self.prompt_out.send( - id, - PromptRequest( - id=data["id"], - terms=terms - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: - raise RuntimeError("Timeout waiting for response") - 
- if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - if resp.object: - return web.json_response( - { "object": resp.object } - ) - - return web.json_response( - { "text": resp.text } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.prompt_in.unsubscribe(id) - - async def graph_rag(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.graph_rag_in.subscribe(id) - - await self.graph_rag_out.send( - id, - GraphRagQuery( - query=data["query"], - user=data.get("user", "trustgraph"), - collection=data.get("collection", "default"), - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - { "response": resp.response } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.graph_rag_in.unsubscribe(id) - - async def triples_query(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.triples_query_in.subscribe(id) - - if "s" in data: - s = to_value(data["s"]) - else: - s = None - - if "p" in data: - p = to_value(data["p"]) - else: - p = None - - if "o" in data: - o = to_value(data["o"]) - else: - o = None - - limit = int(data.get("limit", 10000)) - - await self.triples_query_out.send( - id, - TriplesQueryRequest( - s = s, p = p, o = o, - limit = limit, - user = data.get("user", "trustgraph"), - collection = data.get("collection", "default"), - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - 
return web.json_response( - { - "response": serialize_subgraph(resp.triples), - } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.graph_rag_in.unsubscribe(id) - - async def agent(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.agent_in.subscribe(id) - - await self.agent_out.send( - id, - AgentRequest( - question=data["question"], - ) - ) - - while True: - try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - if resp.answer: break - - if resp.thought: print("thought:", resp.thought) - if resp.observation: print("observation:", resp.observation) - - if resp.answer: - return web.json_response( - { "answer": resp.answer } - ) - - # Can't happen, ook at the logic - raise RuntimeError("Strange state") - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.agent_in.unsubscribe(id) - - async def embeddings(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = await self.embeddings_in.subscribe(id) - - await self.embeddings_out.send( - id, - EmbeddingsRequest( - text=data["text"], - ) - ) - - try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - { "vectors": resp.vectors } - ) - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - await self.embeddings_in.unsubscribe(id) - async def load_document(self, request): try: @@ -750,215 +214,12 @@ class Api: { "error": str(e) } ) - async def 
socket(self, request): - - ws = web.WebSocketResponse() - await ws.prepare(request) - - async for msg in ws: - if msg.type == WSMsgType.TEXT: - if msg.data == 'close': - await ws.close() - else: - await ws.send_str(msg.data + '/answer') - elif msg.type == WSMsgType.ERROR: - print('ws connection closed with exception %s' % - ws.exception()) - - print('websocket connection closed') - - return ws - - async def stream(self, q, ws, running, fn): - - while running.get(): - try: - resp = await asyncio.wait_for(q.get(), 0.5) - await ws.send_json(fn(resp)) - - except TimeoutError: - continue - - except Exception as e: - print(f"Exception: {str(e)}", flush=True) - - async def stream_triples(self, request): - - id = str(uuid.uuid4()) - - q = await self.triples_tap.subscribe_all(id) - running = Running() - - ws = web.WebSocketResponse() - await ws.prepare(request) - - tsk = asyncio.create_task(self.stream( - q, - ws, - running, - serialize_triples, - )) - - async for msg in ws: - if msg.type == WSMsgType.ERROR: - break - else: - # Ignore incoming messages - pass - - running.stop() - - await self.triples_tap.unsubscribe_all(id) - await tsk - - return ws - - async def stream_graph_embeddings(self, request): - - id = str(uuid.uuid4()) - - q = await self.graph_embeddings_tap.subscribe_all(id) - running = Running() - - ws = web.WebSocketResponse() - await ws.prepare(request) - - tsk = asyncio.create_task(self.stream( - q, - ws, - running, - serialize_graph_embeddings, - )) - - async for msg in ws: - if msg.type == WSMsgType.ERROR: - break - else: - # Ignore incoming messages - pass - - running.stop() - - await self.graph_embeddings_tap.unsubscribe_all(id) - await tsk - - return ws - - async def load_triples(self, request): - - ws = web.WebSocketResponse() - await ws.prepare(request) - - async for msg in ws: - - try: - - if msg.type == WSMsgType.TEXT: - - data = msg.json() - - elt = Triples( - metadata=Metadata( - id=data["metadata"]["id"], - 
metadata=to_subgraph(data["metadata"]["metadata"]), - user=data["metadata"]["user"], - collection=data["metadata"]["collection"], - ), - triples=to_subgraph(data["triples"]), - ) - - await self.triples_pub.send(None, elt) - - elif msg.type == WSMsgType.ERROR: - break - - except Exception as e: - - print("Exception:", e) - - return ws - - async def load_graph_embeddings(self, request): - - ws = web.WebSocketResponse() - await ws.prepare(request) - - async for msg in ws: - - try: - - if msg.type == WSMsgType.TEXT: - - data = msg.json() - - elt = GraphEmbeddings( - metadata=Metadata( - id=data["metadata"]["id"], - metadata=to_subgraph(data["metadata"]["metadata"]), - user=data["metadata"]["user"], - collection=data["metadata"]["collection"], - ), - entity=to_value(data["entity"]), - vectors=data["vectors"], - ) - - await self.graph_embeddings_pub.send(None, elt) - - elif msg.type == WSMsgType.ERROR: - break - - except Exception as e: - - print("Exception:", e) - - return ws - async def app_factory(self): - self.llm_pub_task = asyncio.create_task(self.llm_in.run()) - self.llm_sub_task = asyncio.create_task(self.llm_out.run()) - - self.prompt_pub_task = asyncio.create_task(self.prompt_in.run()) - self.prompt_sub_task = asyncio.create_task(self.prompt_out.run()) - - self.graph_rag_pub_task = asyncio.create_task(self.graph_rag_in.run()) - self.graph_rag_sub_task = asyncio.create_task(self.graph_rag_out.run()) - - self.triples_query_pub_task = asyncio.create_task( - self.triples_query_in.run() - ) - self.triples_query_sub_task = asyncio.create_task( - self.triples_query_out.run() - ) - - self.agent_pub_task = asyncio.create_task(self.agent_in.run()) - self.agent_sub_task = asyncio.create_task(self.agent_out.run()) - - self.embeddings_pub_task = asyncio.create_task( - self.embeddings_in.run() - ) - self.embeddings_sub_task = asyncio.create_task( - self.embeddings_out.run() - ) - - self.triples_tap_task = asyncio.create_task( - self.triples_tap.run() - ) - - 
self.triples_pub_task = asyncio.create_task( - self.triples_pub.run() - ) - - self.graph_embeddings_tap_task = asyncio.create_task( - self.graph_embeddings_tap.run() - ) - - self.graph_embeddings_pub_task = asyncio.create_task( - self.graph_embeddings_pub.run() - ) + for ep in self.endpoints: + await ep.start() self.doc_ingest_pub_task = asyncio.create_task(self.document_out.run()) - self.text_ingest_pub_task = asyncio.create_task(self.text_out.run()) return self.app diff --git a/trustgraph-flow/trustgraph/api/gateway/socket.py b/trustgraph-flow/trustgraph/api/gateway/socket.py new file mode 100644 index 00000000..235bfd21 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/socket.py @@ -0,0 +1,68 @@ + +import asyncio +from aiohttp import web, WSMsgType +import logging + +from . running import Running + +logger = logging.getLogger("socket") +logger.setLevel(logging.INFO) + +class SocketEndpoint: + + def __init__( + self, + endpoint_path="/api/v1/socket", + ): + + self.path = endpoint_path + + async def listener(self, ws, running): + + async for msg in ws: + # On error, finish + if msg.type == WSMsgType.ERROR: + break + else: + # Ignore incoming messages + pass + + running.stop() + + async def async_thread(self, ws, running): + + while running.get(): + try: + await asyncio.sleep(1) + + except TimeoutError: + continue + + except Exception as e: + print(f"Exception: {str(e)}", flush=True) + + async def handle(self, request): + + running = Running() + ws = web.WebSocketResponse() + await ws.prepare(request) + + task = asyncio.create_task(self.async_thread(ws, running)) + + await self.listener(ws, running) + + await task + + running.stop() + + return ws + + async def start(self): + pass + + def add_routes(self, app): + + app.add_routes([ + web.get(self.path, self.handle), + ]) + diff --git a/trustgraph-flow/trustgraph/api/gateway/subscriber.py b/trustgraph-flow/trustgraph/api/gateway/subscriber.py new file mode 100644 index 00000000..3d8840f6 --- /dev/null +++ 
b/trustgraph-flow/trustgraph/api/gateway/subscriber.py @@ -0,0 +1,68 @@ + +import asyncio +import aiopulsar + +class Subscriber: + + def __init__(self, pulsar_host, topic, subscription, consumer_name, + schema=None, max_size=10): + self.pulsar_host = pulsar_host + self.topic = topic + self.subscription = subscription + self.consumer_name = consumer_name + self.schema = schema + self.q = {} + self.full = {} + + async def run(self): + while True: + try: + async with aiopulsar.connect(self.pulsar_host) as client: + async with client.subscribe( + topic=self.topic, + subscription_name=self.subscription, + consumer_name=self.consumer_name, + schema=self.schema, + ) as consumer: + while True: + msg = await consumer.receive() + + # Acknowledge successful reception of the message + await consumer.acknowledge(msg) + + try: + id = msg.properties()["id"] + except: + id = None + + value = msg.value() + if id in self.q: + await self.q[id].put(value) + + for q in self.full.values(): + await q.put(value) + + except Exception as e: + print("Exception:", e, flush=True) + + # If handler drops out, sleep a retry + await asyncio.sleep(2) + + async def subscribe(self, id): + q = asyncio.Queue() + self.q[id] = q + return q + + async def unsubscribe(self, id): + if id in self.q: + del self.q[id] + + async def subscribe_all(self, id): + q = asyncio.Queue() + self.full[id] = q + return q + + async def unsubscribe_all(self, id): + if id in self.full: + del self.full[id] + diff --git a/trustgraph-flow/trustgraph/api/gateway/text_completion.py b/trustgraph-flow/trustgraph/api/gateway/text_completion.py new file mode 100644 index 00000000..04dbc9c8 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/text_completion.py @@ -0,0 +1,28 @@ + +from ... schema import TextCompletionRequest, TextCompletionResponse +from ... schema import text_completion_request_queue +from ... schema import text_completion_response_queue + +from . 
endpoint import ServiceEndpoint + +class TextCompletionEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(TextCompletionEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=text_completion_request_queue, + response_queue=text_completion_response_queue, + request_schema=TextCompletionRequest, + response_schema=TextCompletionResponse, + endpoint_path="/api/v1/text-completion", + timeout=timeout, + ) + + def to_request(self, body): + return TextCompletionRequest( + system=body["system"], + prompt=body["prompt"] + ) + + def from_response(self, message): + return { "response": message.response } diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_load.py b/trustgraph-flow/trustgraph/api/gateway/triples_load.py new file mode 100644 index 00000000..d835a363 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/triples_load.py @@ -0,0 +1,59 @@ + +import asyncio +from pulsar.schema import JsonSchema +import uuid +from aiohttp import WSMsgType + +from ... schema import Metadata +from ... schema import Triples +from ... schema import triples_store_queue + +from . publisher import Publisher +from . socket import SocketEndpoint +from . 
serialize import to_subgraph + +class TriplesLoadEndpoint(SocketEndpoint): + + def __init__(self, pulsar_host, path="/api/v1/load/triples"): + + super(TriplesLoadEndpoint, self).__init__( + endpoint_path=path + ) + + self.pulsar_host=pulsar_host + + self.publisher = Publisher( + self.pulsar_host, triples_store_queue, + schema=JsonSchema(Triples) + ) + + async def start(self): + + self.task = asyncio.create_task( + self.publisher.run() + ) + + async def listener(self, ws, running): + + async for msg in ws: + # On error, finish + if msg.type == WSMsgType.ERROR: + break + else: + + data = msg.json() + + elt = Triples( + metadata=Metadata( + id=data["metadata"]["id"], + metadata=to_subgraph(data["metadata"]["metadata"]), + user=data["metadata"]["user"], + collection=data["metadata"]["collection"], + ), + triples=to_subgraph(data["triples"]), + ) + + await self.publisher.send(None, elt) + + + running.stop() diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_query.py b/trustgraph-flow/trustgraph/api/gateway/triples_query.py new file mode 100644 index 00000000..8b4192d8 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/triples_query.py @@ -0,0 +1,53 @@ + +from ... schema import TriplesQueryRequest, TriplesQueryResponse, Triples +from ... schema import triples_request_queue +from ... schema import triples_response_queue + +from . endpoint import ServiceEndpoint +from . 
serialize import to_value, serialize_subgraph + +class TriplesQueryEndpoint(ServiceEndpoint): + def __init__(self, pulsar_host, timeout): + + super(TriplesQueryEndpoint, self).__init__( + pulsar_host=pulsar_host, + request_queue=triples_request_queue, + response_queue=triples_response_queue, + request_schema=TriplesQueryRequest, + response_schema=TriplesQueryResponse, + endpoint_path="/api/v1/triples-query", + timeout=timeout, + ) + + def to_request(self, body): + + if "s" in body: + s = to_value(body["s"]) + else: + s = None + + if "p" in body: + p = to_value(body["p"]) + else: + p = None + + if "o" in body: + o = to_value(body["o"]) + else: + o = None + + limit = int(body.get("limit", 10000)) + + return TriplesQueryRequest( + s = s, p = p, o = o, + limit = limit, + user = body.get("user", "trustgraph"), + collection = body.get("collection", "default"), + ) + + def from_response(self, message): + print(message) + return { + "response": serialize_subgraph(message.triples) + } + diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py new file mode 100644 index 00000000..e8b538a4 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py @@ -0,0 +1,56 @@ + +import asyncio +from pulsar.schema import JsonSchema +import uuid + +from ... schema import Triples +from ... schema import triples_store_queue + +from . subscriber import Subscriber +from . socket import SocketEndpoint +from . 
serialize import serialize_triples + +class TriplesStreamEndpoint(SocketEndpoint): + + def __init__(self, pulsar_host, path="/api/v1/stream/triples"): + + super(TriplesStreamEndpoint, self).__init__( + endpoint_path=path + ) + + self.pulsar_host=pulsar_host + + self.subscriber = Subscriber( + self.pulsar_host, triples_store_queue, + "api-gateway", "api-gateway", + schema=JsonSchema(Triples) + ) + + async def start(self): + + self.task = asyncio.create_task( + self.subscriber.run() + ) + + async def async_thread(self, ws, running): + + id = str(uuid.uuid4()) + + q = await self.subscriber.subscribe_all(id) + + while running.get(): + try: + resp = await asyncio.wait_for(q.get(), 0.5) + await ws.send_json(serialize_triples(resp)) + + except TimeoutError: + continue + + except Exception as e: + print(f"Exception: {str(e)}", flush=True) + break + + await self.subscriber.unsubscribe_all(id) + + running.stop() + diff --git a/trustgraph-flow/trustgraph/external/__init__.py b/trustgraph-flow/trustgraph/external/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trustgraph-flow/trustgraph/external/wikipedia/__init__.py b/trustgraph-flow/trustgraph/external/wikipedia/__init__.py new file mode 100644 index 00000000..ba844705 --- /dev/null +++ b/trustgraph-flow/trustgraph/external/wikipedia/__init__.py @@ -0,0 +1,3 @@ + +from . service import * + diff --git a/trustgraph-flow/trustgraph/external/wikipedia/__main__.py b/trustgraph-flow/trustgraph/external/wikipedia/__main__.py new file mode 100644 index 00000000..e9136855 --- /dev/null +++ b/trustgraph-flow/trustgraph/external/wikipedia/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . 
service import run
+
+if __name__ == '__main__':
+    run()
+
diff --git a/trustgraph-flow/trustgraph/external/wikipedia/service.py b/trustgraph-flow/trustgraph/external/wikipedia/service.py
new file mode 100644
index 00000000..932e1213
--- /dev/null
+++ b/trustgraph-flow/trustgraph/external/wikipedia/service.py
@@ -0,0 +1,102 @@
+
+"""
+Wikipedia lookup service. Fetches an extract from the Wikipedia page
+using the API.
+"""
+
+from trustgraph.schema import LookupRequest, LookupResponse, Error
+from trustgraph.schema import encyclopedia_lookup_request_queue
+from trustgraph.schema import encyclopedia_lookup_response_queue
+from trustgraph.log_level import LogLevel
+from trustgraph.base import ConsumerProducer
+import requests
+
+module = ".".join(__name__.split(".")[1:-1])
+
+default_input_queue = encyclopedia_lookup_request_queue
+default_output_queue = encyclopedia_lookup_response_queue
+default_subscriber = module
+default_url="https://en.wikipedia.org"
+
+class Processor(ConsumerProducer):
+
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+        url = params.get("url", default_url)
+
+        super(Processor, self).__init__(
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": LookupRequest,
+                "output_schema": LookupResponse,
+            }
+        )
+
+        self.url = url
+
+    def handle(self, msg):
+
+        v = msg.value()
+
+        # Sender-produced ID
+        id = msg.properties()["id"]
+
+        print(f"Handling {v.kind} / {v.term}...", flush=True)
+
+        try:
+
+            url = f"{self.url}/api/rest_v1/page/summary/{v.term}"
+
+            resp = requests.get(url).json()
+            resp = resp["extract"]
+
+            r = LookupResponse(
+                error=None,
+                text=resp
+            )
+
+            self.producer.send(r, properties={"id": id})
+
+            self.consumer.acknowledge(msg)
+
+            return
+
+        except Exception as e:
+
+            r = 
LookupResponse( + error=Error( + type = "lookup-error", + message = str(e), + ), + text=None, + ) + self.producer.send(r, properties={"id": id}) + + self.consumer.acknowledge(msg) + + return + + + @staticmethod + def add_args(parser): + + ConsumerProducer.add_args( + parser, default_input_queue, default_subscriber, + default_output_queue, + ) + + parser.add_argument( + '-u', '--url', + default=default_url, + help=f'LLM model (default: {default_url})' + ) + +def run(): + + Processor.start(module, __doc__) + From 1b9c6be4fc3175e90c11719d820ddc3b146cd33c Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Mon, 2 Dec 2024 19:57:21 +0000 Subject: [PATCH 20/37] Feature/gateway auth (#186) * Added auth module, just a simple token at this stage * Pass auth token GATEWAY_SECRET through * Auth token not mandatory, can be provided in env var --- templates/components/trustgraph.jsonnet | 5 +++ .../trustgraph/api/gateway/agent.py | 3 +- .../trustgraph/api/gateway/auth.py | 22 +++++++++++ .../trustgraph/api/gateway/dbpedia.py | 3 +- .../trustgraph/api/gateway/embeddings.py | 3 +- .../trustgraph/api/gateway/encyclopedia.py | 3 +- .../trustgraph/api/gateway/endpoint.py | 24 ++++++++---- .../api/gateway/graph_embeddings_load.py | 6 ++- .../api/gateway/graph_embeddings_stream.py | 6 ++- .../trustgraph/api/gateway/graph_rag.py | 3 +- .../trustgraph/api/gateway/internet_search.py | 3 +- .../trustgraph/api/gateway/prompt.py | 3 +- .../trustgraph/api/gateway/service.py | 37 +++++++++++++++++-- .../trustgraph/api/gateway/socket.py | 24 ++++++++++-- .../trustgraph/api/gateway/text_completion.py | 3 +- .../trustgraph/api/gateway/triples_load.py | 4 +- .../trustgraph/api/gateway/triples_query.py | 3 +- .../trustgraph/api/gateway/triples_stream.py | 4 +- 18 files changed, 126 insertions(+), 33 deletions(-) create mode 100644 trustgraph-flow/trustgraph/api/gateway/auth.py diff --git a/templates/components/trustgraph.jsonnet b/templates/components/trustgraph.jsonnet index 6c60921c..31ae420e 
100644 --- a/templates/components/trustgraph.jsonnet +++ b/templates/components/trustgraph.jsonnet @@ -15,6 +15,9 @@ local prompt = import "prompt-template.jsonnet"; create:: function(engine) + local envSecrets = engine.envSecrets("gateway-secret") + .with_env_var("GATEWAY_SECRET", "gateway-secret"); + local port = $["api-gateway-port"]; local container = @@ -29,6 +32,7 @@ local prompt = import "prompt-template.jsonnet"; "--port", std.toString(port), ]) + .with_env_var_secrets(envSecrets) .with_limits("0.5", "256M") .with_reservations("0.1", "256M") .with_port(8000, 8000, "metrics") @@ -44,6 +48,7 @@ local prompt = import "prompt-template.jsonnet"; .with_port(port, port, "api"); engine.resources([ + envSecrets, containerSet, service, ]) diff --git a/trustgraph-flow/trustgraph/api/gateway/agent.py b/trustgraph-flow/trustgraph/api/gateway/agent.py index 28a1e185..40586133 100644 --- a/trustgraph-flow/trustgraph/api/gateway/agent.py +++ b/trustgraph-flow/trustgraph/api/gateway/agent.py @@ -6,7 +6,7 @@ from ... schema import agent_response_queue from . 
endpoint import MultiResponseServiceEndpoint class AgentEndpoint(MultiResponseServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(AgentEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -16,6 +16,7 @@ class AgentEndpoint(MultiResponseServiceEndpoint): response_schema=AgentResponse, endpoint_path="/api/v1/agent", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/auth.py b/trustgraph-flow/trustgraph/api/gateway/auth.py new file mode 100644 index 00000000..a693ca32 --- /dev/null +++ b/trustgraph-flow/trustgraph/api/gateway/auth.py @@ -0,0 +1,22 @@ + +class Authenticator: + + def __init__(self, token=None, allow_all=False): + + if not allow_all and token is None: + raise RuntimeError("Need a token") + + if not allow_all and token == "": + raise RuntimeError("Need a token") + + self.token = token + self.allow_all = allow_all + + def permitted(self, token, roles): + + if self.allow_all: return True + + if self.token != token: return False + + return True + diff --git a/trustgraph-flow/trustgraph/api/gateway/dbpedia.py b/trustgraph-flow/trustgraph/api/gateway/dbpedia.py index 0ccb3d6b..4fa7336b 100644 --- a/trustgraph-flow/trustgraph/api/gateway/dbpedia.py +++ b/trustgraph-flow/trustgraph/api/gateway/dbpedia.py @@ -6,7 +6,7 @@ from ... schema import dbpedia_lookup_response_queue from . 
endpoint import ServiceEndpoint class DbpediaEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(DbpediaEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -16,6 +16,7 @@ class DbpediaEndpoint(ServiceEndpoint): response_schema=LookupResponse, endpoint_path="/api/v1/dbpedia", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/embeddings.py b/trustgraph-flow/trustgraph/api/gateway/embeddings.py index b5fcc0a4..7c4b578d 100644 --- a/trustgraph-flow/trustgraph/api/gateway/embeddings.py +++ b/trustgraph-flow/trustgraph/api/gateway/embeddings.py @@ -6,7 +6,7 @@ from ... schema import embeddings_response_queue from . endpoint import ServiceEndpoint class EmbeddingsEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(EmbeddingsEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -16,6 +16,7 @@ class EmbeddingsEndpoint(ServiceEndpoint): response_schema=EmbeddingsResponse, endpoint_path="/api/v1/embeddings", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py b/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py index e379d7d4..c6041cb2 100644 --- a/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py +++ b/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py @@ -6,7 +6,7 @@ from ... schema import encyclopedia_lookup_response_queue from . 
endpoint import ServiceEndpoint class EncyclopediaEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(EncyclopediaEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -16,6 +16,7 @@ class EncyclopediaEndpoint(ServiceEndpoint): response_schema=LookupResponse, endpoint_path="/api/v1/encyclopedia", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/endpoint.py b/trustgraph-flow/trustgraph/api/gateway/endpoint.py index 075e4a0e..af7a5070 100644 --- a/trustgraph-flow/trustgraph/api/gateway/endpoint.py +++ b/trustgraph-flow/trustgraph/api/gateway/endpoint.py @@ -19,6 +19,7 @@ class ServiceEndpoint: request_queue, request_schema, response_queue, response_schema, endpoint_path, + auth, subscription="api-gateway", consumer_name="api-gateway", timeout=600, ): @@ -36,6 +37,9 @@ class ServiceEndpoint: self.path = endpoint_path self.timeout = timeout + self.auth = auth + + self.operation = "service" async def start(self): @@ -58,14 +62,24 @@ class ServiceEndpoint: id = str(uuid.uuid4()) + try: + ht = request.headers["Authorization"] + tokens = ht.split(" ", 2) + if tokens[0] != "Bearer": + return web.HTTPUnauthorized() + token = tokens[1] + except: + token = "" + + if not self.auth.permitted(token, self.operation): + return web.HTTPUnauthorized() + try: data = await request.json() q = await self.sub.subscribe(id) - print(data) - await self.pub.send( id, self.to_request(data), @@ -76,8 +90,6 @@ class ServiceEndpoint: except: raise RuntimeError("Timeout waiting for response") - print(resp) - if resp.error: return web.json_response( { "error": resp.error.message } @@ -110,8 +122,6 @@ class MultiResponseServiceEndpoint(ServiceEndpoint): q = await self.sub.subscribe(id) - print(data) - await self.pub.send( id, self.to_request(data), @@ -126,8 +136,6 @@ class MultiResponseServiceEndpoint(ServiceEndpoint): except: raise RuntimeError("Timeout waiting 
for response") - print(resp) - if resp.error: return web.json_response( { "error": resp.error.message } diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py index 3cc3f533..15efdf5b 100644 --- a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py @@ -14,10 +14,12 @@ from . serialize import to_subgraph, to_value class GraphEmbeddingsLoadEndpoint(SocketEndpoint): - def __init__(self, pulsar_host, path="/api/v1/load/graph-embeddings"): + def __init__( + self, pulsar_host, auth, path="/api/v1/load/graph-embeddings", + ): super(GraphEmbeddingsLoadEndpoint, self).__init__( - endpoint_path=path + endpoint_path=path, auth=auth, ) self.pulsar_host=pulsar_host diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py index 978684cf..7f3e5e18 100644 --- a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py @@ -12,10 +12,12 @@ from . serialize import serialize_graph_embeddings class GraphEmbeddingsStreamEndpoint(SocketEndpoint): - def __init__(self, pulsar_host, path="/api/v1/stream/graph-embeddings"): + def __init__( + self, pulsar_host, auth, path="/api/v1/stream/graph-embeddings" + ): super(GraphEmbeddingsStreamEndpoint, self).__init__( - endpoint_path=path + endpoint_path=path, auth=auth, ) self.pulsar_host=pulsar_host diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_rag.py b/trustgraph-flow/trustgraph/api/gateway/graph_rag.py index 1381dc23..d33090ca 100644 --- a/trustgraph-flow/trustgraph/api/gateway/graph_rag.py +++ b/trustgraph-flow/trustgraph/api/gateway/graph_rag.py @@ -6,7 +6,7 @@ from ... schema import graph_rag_response_queue from . 
endpoint import ServiceEndpoint class GraphRagEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(GraphRagEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -16,6 +16,7 @@ class GraphRagEndpoint(ServiceEndpoint): response_schema=GraphRagResponse, endpoint_path="/api/v1/graph-rag", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/internet_search.py b/trustgraph-flow/trustgraph/api/gateway/internet_search.py index c84ed82a..f55a4a3e 100644 --- a/trustgraph-flow/trustgraph/api/gateway/internet_search.py +++ b/trustgraph-flow/trustgraph/api/gateway/internet_search.py @@ -6,7 +6,7 @@ from ... schema import internet_search_response_queue from . endpoint import ServiceEndpoint class InternetSearchEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(InternetSearchEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -16,6 +16,7 @@ class InternetSearchEndpoint(ServiceEndpoint): response_schema=LookupResponse, endpoint_path="/api/v1/internet-search", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/prompt.py b/trustgraph-flow/trustgraph/api/gateway/prompt.py index e02effb9..d19005bc 100644 --- a/trustgraph-flow/trustgraph/api/gateway/prompt.py +++ b/trustgraph-flow/trustgraph/api/gateway/prompt.py @@ -8,7 +8,7 @@ from ... schema import prompt_response_queue from . 
endpoint import ServiceEndpoint class PromptEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(PromptEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -18,6 +18,7 @@ class PromptEndpoint(ServiceEndpoint): response_schema=PromptResponse, endpoint_path="/api/v1/prompt", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index dcdd9779..a25dd9dc 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -45,6 +45,7 @@ from . triples_stream import TriplesStreamEndpoint from . graph_embeddings_stream import GraphEmbeddingsStreamEndpoint from . triples_load import TriplesLoadEndpoint from . graph_embeddings_load import GraphEmbeddingsLoadEndpoint +from . auth import Authenticator logger = logging.getLogger("api") logger.setLevel(logging.INFO) @@ -52,6 +53,7 @@ logger.setLevel(logging.INFO) default_pulsar_host = os.getenv("PULSAR_HOST", "pulsar://pulsar:6650") default_timeout = 600 default_port = 8088 +default_api_token = os.getenv("GATEWAY_SECRET", "") class Api: @@ -66,45 +68,66 @@ class Api: self.timeout = int(config.get("timeout", default_timeout)) self.pulsar_host = config.get("pulsar_host", default_pulsar_host) + api_token = config.get("api_token", default_api_token) + + # Token not set, or token equal empty string means no auth + if api_token: + self.auth = Authenticator(token=api_token) + else: + self.auth = Authenticator(allow_all=True) + self.endpoints = [ TextCompletionEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), PromptEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), GraphRagEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), TriplesQueryEndpoint( pulsar_host=self.pulsar_host, 
timeout=self.timeout, + auth = self.auth, ), EmbeddingsEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), AgentEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), EncyclopediaEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), DbpediaEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), InternetSearchEndpoint( pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, ), TriplesStreamEndpoint( - pulsar_host=self.pulsar_host + pulsar_host=self.pulsar_host, + auth = self.auth, ), GraphEmbeddingsStreamEndpoint( - pulsar_host=self.pulsar_host + pulsar_host=self.pulsar_host, + auth = self.auth, ), TriplesLoadEndpoint( - pulsar_host=self.pulsar_host + pulsar_host=self.pulsar_host, + auth = self.auth, ), GraphEmbeddingsLoadEndpoint( - pulsar_host=self.pulsar_host + pulsar_host=self.pulsar_host, + auth = self.auth, ), ] @@ -254,6 +277,12 @@ def run(): help=f'API request timeout in seconds (default: {default_timeout})', ) + parser.add_argument( + '--api-token', + default=default_api_token, + help=f'Secret API token (default: no auth)', + ) + parser.add_argument( '-l', '--log-level', type=LogLevel, diff --git a/trustgraph-flow/trustgraph/api/gateway/socket.py b/trustgraph-flow/trustgraph/api/gateway/socket.py index 235bfd21..869792b7 100644 --- a/trustgraph-flow/trustgraph/api/gateway/socket.py +++ b/trustgraph-flow/trustgraph/api/gateway/socket.py @@ -11,11 +11,12 @@ logger.setLevel(logging.INFO) class SocketEndpoint: def __init__( - self, - endpoint_path="/api/v1/socket", + self, endpoint_path, auth, ): self.path = endpoint_path + self.auth = auth + self.operation = "socket" async def listener(self, ws, running): @@ -43,18 +44,33 @@ class SocketEndpoint: async def handle(self, request): + try: + token = request.query['token'] + except: + token = "" + + if not self.auth.permitted(token, self.operation): + return 
web.HTTPUnauthorized() + running = Running() ws = web.WebSocketResponse() await ws.prepare(request) task = asyncio.create_task(self.async_thread(ws, running)) - await self.listener(ws, running) + try: - await task + await self.listener(ws, running) + + except Exception as e: + print(e, flush=True) running.stop() + await ws.close() + + await task + return ws async def start(self): diff --git a/trustgraph-flow/trustgraph/api/gateway/text_completion.py b/trustgraph-flow/trustgraph/api/gateway/text_completion.py index 04dbc9c8..d9f69b7e 100644 --- a/trustgraph-flow/trustgraph/api/gateway/text_completion.py +++ b/trustgraph-flow/trustgraph/api/gateway/text_completion.py @@ -6,7 +6,7 @@ from ... schema import text_completion_response_queue from . endpoint import ServiceEndpoint class TextCompletionEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(TextCompletionEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -16,6 +16,7 @@ class TextCompletionEndpoint(ServiceEndpoint): response_schema=TextCompletionResponse, endpoint_path="/api/v1/text-completion", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_load.py b/trustgraph-flow/trustgraph/api/gateway/triples_load.py index d835a363..7f4561b1 100644 --- a/trustgraph-flow/trustgraph/api/gateway/triples_load.py +++ b/trustgraph-flow/trustgraph/api/gateway/triples_load.py @@ -14,10 +14,10 @@ from . 
serialize import to_subgraph class TriplesLoadEndpoint(SocketEndpoint): - def __init__(self, pulsar_host, path="/api/v1/load/triples"): + def __init__(self, pulsar_host, auth, path="/api/v1/load/triples"): super(TriplesLoadEndpoint, self).__init__( - endpoint_path=path + endpoint_path=path, auth=auth, ) self.pulsar_host=pulsar_host diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_query.py b/trustgraph-flow/trustgraph/api/gateway/triples_query.py index 8b4192d8..9c5939c8 100644 --- a/trustgraph-flow/trustgraph/api/gateway/triples_query.py +++ b/trustgraph-flow/trustgraph/api/gateway/triples_query.py @@ -7,7 +7,7 @@ from . endpoint import ServiceEndpoint from . serialize import to_value, serialize_subgraph class TriplesQueryEndpoint(ServiceEndpoint): - def __init__(self, pulsar_host, timeout): + def __init__(self, pulsar_host, timeout, auth): super(TriplesQueryEndpoint, self).__init__( pulsar_host=pulsar_host, @@ -17,6 +17,7 @@ class TriplesQueryEndpoint(ServiceEndpoint): response_schema=TriplesQueryResponse, endpoint_path="/api/v1/triples-query", timeout=timeout, + auth=auth, ) def to_request(self, body): diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py index e8b538a4..6ecd2bdb 100644 --- a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py +++ b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py @@ -12,10 +12,10 @@ from . 
serialize import serialize_triples class TriplesStreamEndpoint(SocketEndpoint): - def __init__(self, pulsar_host, path="/api/v1/stream/triples"): + def __init__(self, pulsar_host, auth, path="/api/v1/stream/triples"): super(TriplesStreamEndpoint, self).__init__( - endpoint_path=path + endpoint_path=path, auth=auth, ) self.pulsar_host=pulsar_host From f24eed3023412a99f2baad6d3a67f7bac5de05af Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Tue, 3 Dec 2024 09:51:33 +0000 Subject: [PATCH 21/37] Fix/pinecone de (#187) * Fix Goog AI studio settings * Fix pinecone startup params --- templates/components/googleaistudio.jsonnet | 2 +- .../query/doc_embeddings/pinecone/service.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/templates/components/googleaistudio.jsonnet b/templates/components/googleaistudio.jsonnet index 4088ceef..c2a40f2c 100644 --- a/templates/components/googleaistudio.jsonnet +++ b/templates/components/googleaistudio.jsonnet @@ -13,7 +13,7 @@ local prompts = import "prompts/mixtral.jsonnet"; create:: function(engine) - local envSecrets = engine.envSecrets("bedrock-credentials") + local envSecrets = engine.envSecrets("googleaistudio-key") .with_env_var("GOOGLE_AI_STUDIO_KEY", "googleaistudio-key"); local container = diff --git a/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py index 3fcbfb21..b8502143 100755 --- a/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py +++ b/trustgraph-flow/trustgraph/query/doc_embeddings/pinecone/service.py @@ -131,9 +131,14 @@ class Processor(ConsumerProducer): ) parser.add_argument( - '-t', '--store-uri', - default=default_store_uri, - help=f'Milvus store URI (default: {default_store_uri})' + '-a', '--api-key', + default=default_api_key, + help='Pinecone API key. (default from PINECONE_API_KEY)' + ) + + parser.add_argument( + '-u', '--url', + help='Pinecone URL. 
If unspecified, serverless is used' ) def run(): From df23e29971ceaea0cd198160b27654488631a1e9 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Tue, 3 Dec 2024 09:52:00 +0000 Subject: [PATCH 22/37] Add debug to endpoint (#188) --- trustgraph-flow/trustgraph/api/gateway/endpoint.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/trustgraph-flow/trustgraph/api/gateway/endpoint.py b/trustgraph-flow/trustgraph/api/gateway/endpoint.py index af7a5070..dc380f4b 100644 --- a/trustgraph-flow/trustgraph/api/gateway/endpoint.py +++ b/trustgraph-flow/trustgraph/api/gateway/endpoint.py @@ -62,6 +62,8 @@ class ServiceEndpoint: id = str(uuid.uuid4()) + print(request.path, "...") + try: ht = request.headers["Authorization"] tokens = ht.split(" ", 2) @@ -78,23 +80,31 @@ class ServiceEndpoint: data = await request.json() + print(data) + q = await self.sub.subscribe(id) await self.pub.send( id, self.to_request(data), ) + print("Request sent") try: resp = await asyncio.wait_for(q.get(), self.timeout) except: raise RuntimeError("Timeout waiting for response") + print("Response got") + if resp.error: + print("Error") return web.json_response( { "error": resp.error.message } ) + print("Send response") + return web.json_response( self.from_response(resp) ) From 7e78aa6d91aba84fcfc66db157a89b76847b1586 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Tue, 3 Dec 2024 14:13:40 +0000 Subject: [PATCH 23/37] Reduc pulsar connections (#189) --- .../trustgraph/api/gateway/endpoint.py | 11 +++-- .../api/gateway/graph_embeddings_load.py | 4 +- .../api/gateway/graph_embeddings_stream.py | 4 +- .../trustgraph/api/gateway/publisher.py | 26 ++++++------ .../trustgraph/api/gateway/service.py | 33 ++++++++++++--- .../trustgraph/api/gateway/socket.py | 6 +++ .../trustgraph/api/gateway/subscriber.py | 42 +++++++++---------- .../trustgraph/api/gateway/triples_load.py | 4 +- .../trustgraph/api/gateway/triples_stream.py | 4 +- 9 files changed, 82 insertions(+), 52 deletions(-) diff --git 
a/trustgraph-flow/trustgraph/api/gateway/endpoint.py b/trustgraph-flow/trustgraph/api/gateway/endpoint.py index dc380f4b..c7cd6b04 100644 --- a/trustgraph-flow/trustgraph/api/gateway/endpoint.py +++ b/trustgraph-flow/trustgraph/api/gateway/endpoint.py @@ -41,10 +41,15 @@ class ServiceEndpoint: self.operation = "service" - async def start(self): + async def start(self, client): - self.pub_task = asyncio.create_task(self.pub.run()) - self.sub_task = asyncio.create_task(self.sub.run()) + self.pub_task = asyncio.create_task(self.pub.run(client)) + self.sub_task = asyncio.create_task(self.sub.run(client)) + + async def join(self): + + await self.pub_task + await self.sub_task def add_routes(self, app): diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py index 15efdf5b..764e7210 100644 --- a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py @@ -29,10 +29,10 @@ class GraphEmbeddingsLoadEndpoint(SocketEndpoint): schema=JsonSchema(GraphEmbeddings) ) - async def start(self): + async def start(self, client): self.task = asyncio.create_task( - self.publisher.run() + self.publisher.run(client) ) async def listener(self, ws, running): diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py index 7f3e5e18..12647547 100644 --- a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py @@ -28,10 +28,10 @@ class GraphEmbeddingsStreamEndpoint(SocketEndpoint): schema=JsonSchema(GraphEmbeddings) ) - async def start(self): + async def start(self, client): self.task = asyncio.create_task( - self.subscriber.run() + self.subscriber.run(client) ) async def async_thread(self, ws, running): diff --git 
a/trustgraph-flow/trustgraph/api/gateway/publisher.py b/trustgraph-flow/trustgraph/api/gateway/publisher.py index 1bff44dd..2bbf05d9 100644 --- a/trustgraph-flow/trustgraph/api/gateway/publisher.py +++ b/trustgraph-flow/trustgraph/api/gateway/publisher.py @@ -1,6 +1,5 @@ import asyncio -import aiopulsar class Publisher: @@ -12,24 +11,23 @@ class Publisher: self.q = asyncio.Queue(maxsize=max_size) self.chunking_enabled = chunking_enabled - async def run(self): + async def run(self, client): while True: try: - async with aiopulsar.connect(self.pulsar_host) as client: - async with client.create_producer( - topic=self.topic, - schema=self.schema, - chunking_enabled=self.chunking_enabled, - ) as producer: - while True: - id, item = await self.q.get() + async with client.create_producer( + topic=self.topic, + schema=self.schema, + chunking_enabled=self.chunking_enabled, + ) as producer: + while True: + id, item = await self.q.get() - if id: - await producer.send(item, { "id": id }) - else: - await producer.send(item) + if id: + await producer.send(item, { "id": id }) + else: + await producer.send(item) except Exception as e: print("Exception:", e, flush=True) diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index a25dd9dc..38a86a51 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -17,6 +17,7 @@ from aiohttp import web import logging import os import base64 +import aiopulsar import pulsar from pulsar.schema import JsonSchema @@ -237,13 +238,35 @@ class Api: { "error": str(e) } ) + async def run_endpoints(self): + + async with aiopulsar.connect(self.pulsar_host) as client: + + for ep in self.endpoints: + await ep.start(client) + + self.doc_ingest_pub_task = asyncio.create_task( + self.document_out.run(client) + ) + + self.text_ingest_pub_task = asyncio.create_task( + self.text_out.run(client) + ) + + print("Endpoints are running...") + 
+ # They never exit + for ep in self.endpoints: + await ep.join() + + await self.doc_ingest_pub_task + await self.text_ingest_pub_task + + print("Endpoints are stopped.") + async def app_factory(self): - for ep in self.endpoints: - await ep.start() - - self.doc_ingest_pub_task = asyncio.create_task(self.document_out.run()) - self.text_ingest_pub_task = asyncio.create_task(self.text_out.run()) + self.endpoint_task = asyncio.create_task(self.run_endpoints()) return self.app diff --git a/trustgraph-flow/trustgraph/api/gateway/socket.py b/trustgraph-flow/trustgraph/api/gateway/socket.py index 869792b7..a4cb0feb 100644 --- a/trustgraph-flow/trustgraph/api/gateway/socket.py +++ b/trustgraph-flow/trustgraph/api/gateway/socket.py @@ -76,6 +76,12 @@ class SocketEndpoint: async def start(self): pass + async def join(self): + + # Nothing to wait for + while True: + await asyncio.sleep(100) + def add_routes(self, app): app.add_routes([ diff --git a/trustgraph-flow/trustgraph/api/gateway/subscriber.py b/trustgraph-flow/trustgraph/api/gateway/subscriber.py index 3d8840f6..ba53bab6 100644 --- a/trustgraph-flow/trustgraph/api/gateway/subscriber.py +++ b/trustgraph-flow/trustgraph/api/gateway/subscriber.py @@ -1,6 +1,5 @@ import asyncio -import aiopulsar class Subscriber: @@ -14,33 +13,32 @@ class Subscriber: self.q = {} self.full = {} - async def run(self): + async def run(self, client): while True: try: - async with aiopulsar.connect(self.pulsar_host) as client: - async with client.subscribe( - topic=self.topic, - subscription_name=self.subscription, - consumer_name=self.consumer_name, - schema=self.schema, - ) as consumer: - while True: - msg = await consumer.receive() + async with client.subscribe( + topic=self.topic, + subscription_name=self.subscription, + consumer_name=self.consumer_name, + schema=self.schema, + ) as consumer: + while True: + msg = await consumer.receive() - # Acknowledge successful reception of the message - await consumer.acknowledge(msg) + # Acknowledge 
successful reception of the message + await consumer.acknowledge(msg) - try: - id = msg.properties()["id"] - except: - id = None + try: + id = msg.properties()["id"] + except: + id = None - value = msg.value() - if id in self.q: - await self.q[id].put(value) + value = msg.value() + if id in self.q: + await self.q[id].put(value) - for q in self.full.values(): - await q.put(value) + for q in self.full.values(): + await q.put(value) except Exception as e: print("Exception:", e, flush=True) diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_load.py b/trustgraph-flow/trustgraph/api/gateway/triples_load.py index 7f4561b1..0460d1e4 100644 --- a/trustgraph-flow/trustgraph/api/gateway/triples_load.py +++ b/trustgraph-flow/trustgraph/api/gateway/triples_load.py @@ -27,10 +27,10 @@ class TriplesLoadEndpoint(SocketEndpoint): schema=JsonSchema(Triples) ) - async def start(self): + async def start(self, client): self.task = asyncio.create_task( - self.publisher.run() + self.publisher.run(client) ) async def listener(self, ws, running): diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py index 6ecd2bdb..571d5e61 100644 --- a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py +++ b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py @@ -26,10 +26,10 @@ class TriplesStreamEndpoint(SocketEndpoint): schema=JsonSchema(Triples) ) - async def start(self): + async def start(self, client): self.task = asyncio.create_task( - self.subscriber.run() + self.subscriber.run(client) ) async def async_thread(self, ws, running): From 26865a515caae6a260834dfddbcce1aaea47e4b4 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Tue, 3 Dec 2024 18:03:00 +0000 Subject: [PATCH 24/37] Fix/async problem (#190) * Back out previous change * To multithreads * Remove aiopulsar dependency --- trustgraph-flow/setup.py | 1 - .../trustgraph/api/gateway/endpoint.py | 44 +++----- .../api/gateway/graph_embeddings_load.py | 
8 +- .../api/gateway/graph_embeddings_stream.py | 15 ++- .../trustgraph/api/gateway/publisher.py | 48 ++++++--- .../trustgraph/api/gateway/service.py | 39 ++----- .../trustgraph/api/gateway/socket.py | 6 -- .../trustgraph/api/gateway/subscriber.py | 101 +++++++++++++----- .../trustgraph/api/gateway/triples_load.py | 8 +- .../trustgraph/api/gateway/triples_stream.py | 15 ++- 10 files changed, 149 insertions(+), 136 deletions(-) diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 65bb7326..e6c732a3 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -59,7 +59,6 @@ setuptools.setup( "ibis", "jsonschema", "aiohttp", - "aiopulsar-py", "pinecone[grpc]", ], scripts=[ diff --git a/trustgraph-flow/trustgraph/api/gateway/endpoint.py b/trustgraph-flow/trustgraph/api/gateway/endpoint.py index c7cd6b04..2b246361 100644 --- a/trustgraph-flow/trustgraph/api/gateway/endpoint.py +++ b/trustgraph-flow/trustgraph/api/gateway/endpoint.py @@ -41,15 +41,10 @@ class ServiceEndpoint: self.operation = "service" - async def start(self, client): + async def start(self): - self.pub_task = asyncio.create_task(self.pub.run(client)) - self.sub_task = asyncio.create_task(self.sub.run(client)) - - async def join(self): - - await self.pub_task - await self.sub_task + self.pub.start() + self.sub.start() def add_routes(self, app): @@ -87,20 +82,18 @@ class ServiceEndpoint: print(data) - q = await self.sub.subscribe(id) + q = self.sub.subscribe(id) - await self.pub.send( - id, - self.to_request(data), + await asyncio.to_thread( + self.pub.send, id, self.to_request(data) ) - print("Request sent") try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: - raise RuntimeError("Timeout waiting for response") + resp = await asyncio.to_thread(q.get, timeout=self.timeout) + except Exception as e: + raise RuntimeError("Timeout") - print("Response got") + print(resp) if resp.error: print("Error") @@ -108,8 +101,6 @@ class ServiceEndpoint: { "error": 
resp.error.message } ) - print("Send response") - return web.json_response( self.from_response(resp) ) @@ -122,7 +113,7 @@ class ServiceEndpoint: ) finally: - await self.sub.unsubscribe(id) + self.sub.unsubscribe(id) class MultiResponseServiceEndpoint(ServiceEndpoint): @@ -135,11 +126,10 @@ class MultiResponseServiceEndpoint(ServiceEndpoint): data = await request.json() - q = await self.sub.subscribe(id) + q = self.sub.subscribe(id) - await self.pub.send( - id, - self.to_request(data), + await asyncio.to_thread( + self.pub.send, id, self.to_request(data) ) # Keeps looking at responses... @@ -147,8 +137,8 @@ class MultiResponseServiceEndpoint(ServiceEndpoint): while True: try: - resp = await asyncio.wait_for(q.get(), self.timeout) - except: + resp = await asyncio.to_thread(q.get, timeout=self.timeout) + except Exception as e: raise RuntimeError("Timeout waiting for response") if resp.error: @@ -173,4 +163,4 @@ class MultiResponseServiceEndpoint(ServiceEndpoint): ) finally: - await self.sub.unsubscribe(id) + self.sub.unsubscribe(id) diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py index 764e7210..81fb6647 100644 --- a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py @@ -29,11 +29,9 @@ class GraphEmbeddingsLoadEndpoint(SocketEndpoint): schema=JsonSchema(GraphEmbeddings) ) - async def start(self, client): + async def start(self): - self.task = asyncio.create_task( - self.publisher.run(client) - ) + self.publisher.start() async def listener(self, ws, running): @@ -56,7 +54,7 @@ class GraphEmbeddingsLoadEndpoint(SocketEndpoint): vectors=data["vectors"], ) - await self.publisher.send(None, elt) + self.publisher.send(None, elt) running.stop() diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py index 
12647547..3d4efd45 100644 --- a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py +++ b/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py @@ -1,5 +1,6 @@ import asyncio +import queue from pulsar.schema import JsonSchema import uuid @@ -28,31 +29,29 @@ class GraphEmbeddingsStreamEndpoint(SocketEndpoint): schema=JsonSchema(GraphEmbeddings) ) - async def start(self, client): + async def start(self): - self.task = asyncio.create_task( - self.subscriber.run(client) - ) + self.subscriber.start() async def async_thread(self, ws, running): id = str(uuid.uuid4()) - q = await self.subscriber.subscribe_all(id) + q = self.subscriber.subscribe_all(id) while running.get(): try: - resp = await asyncio.wait_for(q.get(), 0.5) + resp = await asyncio.to_thread(q.get, timeout=0.5) await ws.send_json(serialize_graph_embeddings(resp)) - except TimeoutError: + except queue.Empty: continue except Exception as e: print(f"Exception: {str(e)}", flush=True) break - await self.subscriber.unsubscribe_all(id) + self.subscriber.unsubscribe_all(id) running.stop() diff --git a/trustgraph-flow/trustgraph/api/gateway/publisher.py b/trustgraph-flow/trustgraph/api/gateway/publisher.py index 2bbf05d9..89c612ce 100644 --- a/trustgraph-flow/trustgraph/api/gateway/publisher.py +++ b/trustgraph-flow/trustgraph/api/gateway/publisher.py @@ -1,5 +1,8 @@ -import asyncio +import queue +import time +import pulsar +import threading class Publisher: @@ -8,32 +11,43 @@ class Publisher: self.pulsar_host = pulsar_host self.topic = topic self.schema = schema - self.q = asyncio.Queue(maxsize=max_size) + self.q = queue.Queue(maxsize=max_size) self.chunking_enabled = chunking_enabled - async def run(self, client): + def start(self): + self.task = threading.Thread(target=self.run) + self.task.start() + + def run(self): while True: try: - async with client.create_producer( - topic=self.topic, - schema=self.schema, - chunking_enabled=self.chunking_enabled, - ) as producer: - while True: - id, 
item = await self.q.get() - if id: - await producer.send(item, { "id": id }) - else: - await producer.send(item) + client = pulsar.Client( + self.pulsar_host, + ) + + producer = client.create_producer( + topic=self.topic, + schema=self.schema, + chunking_enabled=self.chunking_enabled, + ) + + while True: + + id, item = self.q.get() + + if id: + producer.send(item, { "id": id }) + else: + producer.send(item) except Exception as e: print("Exception:", e, flush=True) # If handler drops out, sleep a retry - await asyncio.sleep(2) + time.sleep(2) - async def send(self, id, msg): - await self.q.put((id, msg)) + def send(self, id, msg): + self.q.put((id, msg)) diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index 38a86a51..38ff8291 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -17,7 +17,6 @@ from aiohttp import web import logging import os import base64 -import aiopulsar import pulsar from pulsar.schema import JsonSchema @@ -167,7 +166,8 @@ class Api: # content is valid base64 doc = base64.b64decode(data["data"]) - resp = await self.document_out.send( + resp = await asyncio.to_thread( + self.document_out.send, None, Document( metadata=Metadata( @@ -212,7 +212,8 @@ class Api: # Text is base64 encoded text = base64.b64decode(data["text"]).decode(charset) - resp = await self.text_out.send( + resp = asyncio.to_thread( + self.text_out.send, None, TextDocument( metadata=Metadata( @@ -238,35 +239,13 @@ class Api: { "error": str(e) } ) - async def run_endpoints(self): - - async with aiopulsar.connect(self.pulsar_host) as client: - - for ep in self.endpoints: - await ep.start(client) - - self.doc_ingest_pub_task = asyncio.create_task( - self.document_out.run(client) - ) - - self.text_ingest_pub_task = asyncio.create_task( - self.text_out.run(client) - ) - - print("Endpoints are running...") - - # They never exit - for ep in self.endpoints: 
- await ep.join() - - await self.doc_ingest_pub_task - await self.text_ingest_pub_task - - print("Endpoints are stopped.") - async def app_factory(self): - self.endpoint_task = asyncio.create_task(self.run_endpoints()) + for ep in self.endpoints: + await ep.start() + + self.document_out.start() + self.text_out.start() return self.app diff --git a/trustgraph-flow/trustgraph/api/gateway/socket.py b/trustgraph-flow/trustgraph/api/gateway/socket.py index a4cb0feb..869792b7 100644 --- a/trustgraph-flow/trustgraph/api/gateway/socket.py +++ b/trustgraph-flow/trustgraph/api/gateway/socket.py @@ -76,12 +76,6 @@ class SocketEndpoint: async def start(self): pass - async def join(self): - - # Nothing to wait for - while True: - await asyncio.sleep(100) - def add_routes(self, app): app.add_routes([ diff --git a/trustgraph-flow/trustgraph/api/gateway/subscriber.py b/trustgraph-flow/trustgraph/api/gateway/subscriber.py index ba53bab6..cccfc5b4 100644 --- a/trustgraph-flow/trustgraph/api/gateway/subscriber.py +++ b/trustgraph-flow/trustgraph/api/gateway/subscriber.py @@ -1,10 +1,13 @@ -import asyncio +import queue +import pulsar +import threading +import time class Subscriber: def __init__(self, pulsar_host, topic, subscription, consumer_name, - schema=None, max_size=10): + schema=None, max_size=100): self.pulsar_host = pulsar_host self.topic = topic self.subscription = subscription @@ -12,55 +15,95 @@ class Subscriber: self.schema = schema self.q = {} self.full = {} + self.max_size = max_size + self.lock = threading.Lock() + + def start(self): + self.task = threading.Thread(target=self.run) + self.task.start() + + def run(self): - async def run(self, client): while True: + try: - async with client.subscribe( + + client = pulsar.Client( + self.pulsar_host, + ) + + consumer = client.subscribe( topic=self.topic, subscription_name=self.subscription, consumer_name=self.consumer_name, schema=self.schema, - ) as consumer: - while True: - msg = await consumer.receive() + ) - # 
Acknowledge successful reception of the message - await consumer.acknowledge(msg) + while True: - try: - id = msg.properties()["id"] - except: - id = None + msg = consumer.receive() + + # Acknowledge successful reception of the message + consumer.acknowledge(msg) + + try: + id = msg.properties()["id"] + except: + id = None + + value = msg.value() + + with self.lock: - value = msg.value() if id in self.q: - await self.q[id].put(value) + try: + self.q[id].put(value, timeout=0.5) + except: + pass for q in self.full.values(): - await q.put(value) + try: + q.put(value, timeout=0.5) + except: + pass except Exception as e: print("Exception:", e, flush=True) # If handler drops out, sleep a retry - await asyncio.sleep(2) + time.sleep(2) + + def subscribe(self, id): + + with self.lock: + + q = queue.Queue(maxsize=self.max_size) + self.q[id] = q - async def subscribe(self, id): - q = asyncio.Queue() - self.q[id] = q return q - async def unsubscribe(self, id): - if id in self.q: - del self.q[id] + def unsubscribe(self, id): + + with self.lock: + + if id in self.q: +# self.q[id].shutdown(immediate=True) + del self.q[id] - async def subscribe_all(self, id): - q = asyncio.Queue() - self.full[id] = q + def subscribe_all(self, id): + + with self.lock: + + q = queue.Queue(maxsize=self.max_size) + self.full[id] = q + return q - async def unsubscribe_all(self, id): - if id in self.full: - del self.full[id] + def unsubscribe_all(self, id): + + with self.lock: + + if id in self.full: +# self.full[id].shutdown(immediate=True) + del self.full[id] diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_load.py b/trustgraph-flow/trustgraph/api/gateway/triples_load.py index 0460d1e4..dbb3e617 100644 --- a/trustgraph-flow/trustgraph/api/gateway/triples_load.py +++ b/trustgraph-flow/trustgraph/api/gateway/triples_load.py @@ -27,11 +27,9 @@ class TriplesLoadEndpoint(SocketEndpoint): schema=JsonSchema(Triples) ) - async def start(self, client): + async def start(self): - self.task = 
asyncio.create_task( - self.publisher.run(client) - ) + self.publisher.start() async def listener(self, ws, running): @@ -53,7 +51,7 @@ class TriplesLoadEndpoint(SocketEndpoint): triples=to_subgraph(data["triples"]), ) - await self.publisher.send(None, elt) + self.publisher.send(None, elt) running.stop() diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py index 571d5e61..4638e08d 100644 --- a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py +++ b/trustgraph-flow/trustgraph/api/gateway/triples_stream.py @@ -1,5 +1,6 @@ import asyncio +import queue from pulsar.schema import JsonSchema import uuid @@ -26,31 +27,29 @@ class TriplesStreamEndpoint(SocketEndpoint): schema=JsonSchema(Triples) ) - async def start(self, client): + async def start(self): - self.task = asyncio.create_task( - self.subscriber.run(client) - ) + self.subscriber.start() async def async_thread(self, ws, running): id = str(uuid.uuid4()) - q = await self.subscriber.subscribe_all(id) + q = self.subscriber.subscribe_all(id) while running.get(): try: - resp = await asyncio.wait_for(q.get(), 0.5) + resp = await asyncio.to_thread(q.get, timeout=0.5) await ws.send_json(serialize_triples(resp)) - except TimeoutError: + except queue.Empty: continue except Exception as e: print(f"Exception: {str(e)}", flush=True) break - await self.subscriber.unsubscribe_all(id) + self.subscriber.unsubscribe_all(id) running.stop() From 5770af51ef61d5464cd275bba503de6a6cd901c0 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Tue, 3 Dec 2024 21:30:14 +0000 Subject: [PATCH 25/37] Fix async problem on text load (#191) --- trustgraph-flow/trustgraph/api/gateway/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/api/gateway/service.py index 38ff8291..faa250dc 100755 --- a/trustgraph-flow/trustgraph/api/gateway/service.py +++ 
b/trustgraph-flow/trustgraph/api/gateway/service.py @@ -212,7 +212,7 @@ class Api: # Text is base64 encoded text = base64.b64decode(data["text"]).decode(charset) - resp = asyncio.to_thread( + resp = await asyncio.to_thread( self.text_out.send, None, TextDocument( From e3d06ab80b075ab28f699f45ac61521dbf71001f Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Wed, 4 Dec 2024 14:42:55 +0000 Subject: [PATCH 26/37] Fix isinstance test on null values (#192) Co-authored-by: Mark Adams --- trustgraph-base/trustgraph/api/api.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/trustgraph-base/trustgraph/api/api.py b/trustgraph-base/trustgraph/api/api.py index 7942e081..de96499c 100644 --- a/trustgraph-base/trustgraph/api/api.py +++ b/trustgraph-base/trustgraph/api/api.py @@ -205,16 +205,20 @@ class Api: "limit": limit } - if not isinstance(s, Uri): - raise RuntimeError("s must be Uri") - if not isinstance(p, Uri): - raise RuntimeError("p must be Uri") - if not isinstance(o, Uri) and not isinstance(o, Literal): - raise RuntimeError("o must be Uri or Literal") + if s: + if not isinstance(s, Uri): + raise RuntimeError("s must be Uri") + input["s"] = { "v": str(s), "e": isinstance(s, Uri), } + + if p: + if not isinstance(p, Uri): + raise RuntimeError("p must be Uri") + input["p"] = { "v": str(p), "e": isinstance(p, Uri), } - if s: input["s"] = { "v": str(s), "e": isinstance(s, Uri), } - if p: input["p"] = { "v": str(p), "e": isinstance(p, Uri), } - if o: input["o"] = { "v": str(o), "e": isinstance(o, Uri), } + if o: + if not isinstance(o, Uri) and not isinstance(o, Literal): + raise RuntimeError("o must be Uri or Literal") + input["o"] = { "v": str(o), "e": isinstance(o, Uri), } url = f"{self.url}triples-query" From bffaf62490c5b8339995b14e02e1a53215358337 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 6 Dec 2024 00:12:49 +0000 Subject: [PATCH 27/37] Feature/memgraph optim (#193) * Separate memgraph query/write modules to 
optimise for memgraph * Used 1GB memory for Memgraph * Deployed specialised memgraph query/write processors, created memgraph indexes * One triple is loaded as a single transaction * Fixed index creation --- templates/components/memgraph.jsonnet | 4 +- templates/stores/memgraph.jsonnet | 3 + .../scripts/triples-query-memgraph | 6 + .../scripts/triples-write-memgraph | 6 + trustgraph-flow/setup.py | 2 + .../query/triples/memgraph/__init__.py | 3 + .../query/triples/memgraph/__main__.py | 7 + .../query/triples/memgraph/service.py | 357 ++++++++++++++++++ .../trustgraph/query/triples/neo4j/service.py | 5 +- .../storage/triples/memgraph/__init__.py | 3 + .../storage/triples/memgraph/__main__.py | 7 + .../storage/triples/memgraph/write.py | 252 +++++++++++++ 12 files changed, 651 insertions(+), 4 deletions(-) create mode 100755 trustgraph-flow/scripts/triples-query-memgraph create mode 100755 trustgraph-flow/scripts/triples-write-memgraph create mode 100644 trustgraph-flow/trustgraph/query/triples/memgraph/__init__.py create mode 100755 trustgraph-flow/trustgraph/query/triples/memgraph/__main__.py create mode 100755 trustgraph-flow/trustgraph/query/triples/memgraph/service.py create mode 100644 trustgraph-flow/trustgraph/storage/triples/memgraph/__init__.py create mode 100755 trustgraph-flow/trustgraph/storage/triples/memgraph/__main__.py create mode 100755 trustgraph-flow/trustgraph/storage/triples/memgraph/write.py diff --git a/templates/components/memgraph.jsonnet b/templates/components/memgraph.jsonnet index 5ec0a76e..609da3a2 100644 --- a/templates/components/memgraph.jsonnet +++ b/templates/components/memgraph.jsonnet @@ -16,7 +16,7 @@ memgraph + { engine.container("store-triples") .with_image(images.trustgraph) .with_command([ - "triples-write-neo4j", + "triples-write-memgraph", "-p", url.pulsar, "-g", @@ -50,7 +50,7 @@ memgraph + { engine.container("query-triples") .with_image(images.trustgraph) .with_command([ - "triples-query-neo4j", + 
"triples-query-memgraph", "-p", url.pulsar, "-g", diff --git a/templates/stores/memgraph.jsonnet b/templates/stores/memgraph.jsonnet index 8f8b6216..75faf5f0 100644 --- a/templates/stores/memgraph.jsonnet +++ b/templates/stores/memgraph.jsonnet @@ -10,6 +10,9 @@ local images = import "values/images.jsonnet"; local container = engine.container("memgraph") .with_image(images.memgraph_mage) + .with_environment({ + MEMGRAPH: "--storage-properties-on-edges=true --storage-enable-edges-metadata=true" + }) .with_limits("1.0", "1000M") .with_reservations("0.5", "1000M") .with_port(7474, 7474, "api") diff --git a/trustgraph-flow/scripts/triples-query-memgraph b/trustgraph-flow/scripts/triples-query-memgraph new file mode 100755 index 00000000..443929e4 --- /dev/null +++ b/trustgraph-flow/scripts/triples-query-memgraph @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.query.triples.memgraph import run + +run() + diff --git a/trustgraph-flow/scripts/triples-write-memgraph b/trustgraph-flow/scripts/triples-write-memgraph new file mode 100755 index 00000000..3d94a576 --- /dev/null +++ b/trustgraph-flow/scripts/triples-write-memgraph @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.storage.triples.memgraph import run + +run() + diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index e6c732a3..c53f96e7 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -103,8 +103,10 @@ setuptools.setup( "scripts/text-completion-openai", "scripts/triples-query-cassandra", "scripts/triples-query-neo4j", + "scripts/triples-query-memgraph", "scripts/triples-write-cassandra", "scripts/triples-write-neo4j", + "scripts/triples-write-memgraph", "scripts/wikipedia-lookup", ] ) diff --git a/trustgraph-flow/trustgraph/query/triples/memgraph/__init__.py b/trustgraph-flow/trustgraph/query/triples/memgraph/__init__.py new file mode 100644 index 00000000..ba844705 --- /dev/null +++ b/trustgraph-flow/trustgraph/query/triples/memgraph/__init__.py @@ -0,0 +1,3 
@@ + +from . service import * + diff --git a/trustgraph-flow/trustgraph/query/triples/memgraph/__main__.py b/trustgraph-flow/trustgraph/query/triples/memgraph/__main__.py new file mode 100755 index 00000000..89684e3e --- /dev/null +++ b/trustgraph-flow/trustgraph/query/triples/memgraph/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . hf import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/query/triples/memgraph/service.py b/trustgraph-flow/trustgraph/query/triples/memgraph/service.py new file mode 100755 index 00000000..5144f781 --- /dev/null +++ b/trustgraph-flow/trustgraph/query/triples/memgraph/service.py @@ -0,0 +1,357 @@ + +""" +Triples query service for memgraph. +Input is a (s, p, o) triple, some values may be null. Output is a list of +triples. +""" + +from neo4j import GraphDatabase + +from .... schema import TriplesQueryRequest, TriplesQueryResponse, Error +from .... schema import Value, Triple +from .... schema import triples_request_queue +from .... schema import triples_response_queue +from .... 
base import ConsumerProducer + +module = ".".join(__name__.split(".")[1:-1]) + +default_input_queue = triples_request_queue +default_output_queue = triples_response_queue +default_subscriber = module + +default_graph_host = 'bolt://memgraph:7687' +default_username = 'memgraph' +default_password = 'password' +default_database = 'memgraph' + +class Processor(ConsumerProducer): + + def __init__(self, **params): + + input_queue = params.get("input_queue", default_input_queue) + output_queue = params.get("output_queue", default_output_queue) + subscriber = params.get("subscriber", default_subscriber) + graph_host = params.get("graph_host", default_graph_host) + username = params.get("username", default_username) + password = params.get("password", default_password) + database = params.get("database", default_database) + + super(Processor, self).__init__( + **params | { + "input_queue": input_queue, + "output_queue": output_queue, + "subscriber": subscriber, + "input_schema": TriplesQueryRequest, + "output_schema": TriplesQueryResponse, + "graph_host": graph_host, + } + ) + + self.db = database + + self.io = GraphDatabase.driver(graph_host, auth=(username, password)) + + def create_value(self, ent): + + if ent.startswith("http://") or ent.startswith("https://"): + return Value(value=ent, is_uri=True) + else: + return Value(value=ent, is_uri=False) + + def handle(self, msg): + + try: + + v = msg.value() + + # Sender-produced ID + id = msg.properties()["id"] + + print(f"Handling input {id}...", flush=True) + + triples = [] + + if v.s is not None: + if v.p is not None: + if v.o is not None: + + # SPO + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Literal {value: $value}) " + "RETURN $src as src", + src=v.s.value, rel=v.p.value, value=v.o.value, + database_=self.db, + ) + + for rec in records: + triples.append((v.s.value, v.p.value, v.o.value)) + + records, summary, keys = self.io.execute_query( + "MATCH 
(src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Node {uri: $uri}) " + "RETURN $src as src", + src=v.s.value, rel=v.p.value, uri=v.o.value, + database_=self.db, + ) + + for rec in records: + triples.append((v.s.value, v.p.value, v.o.value)) + + else: + + # SP + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Literal) " + "RETURN dest.value as dest", + src=v.s.value, rel=v.p.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((v.s.value, v.p.value, data["dest"])) + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Node) " + "RETURN dest.uri as dest", + src=v.s.value, rel=v.p.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((v.s.value, v.p.value, data["dest"])) + + else: + + if v.o is not None: + + # SO + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Literal {value: $value}) " + "RETURN rel.uri as rel", + src=v.s.value, value=v.o.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((v.s.value, data["rel"], v.o.value)) + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Node {uri: $uri}) " + "RETURN rel.uri as rel", + src=v.s.value, uri=v.o.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((v.s.value, data["rel"], v.o.value)) + + else: + + # S + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Literal) " + "RETURN rel.uri as rel, dest.value as dest", + src=v.s.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((v.s.value, data["rel"], data["dest"])) + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Node) " + "RETURN rel.uri as rel, 
dest.uri as dest", + src=v.s.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((v.s.value, data["rel"], data["dest"])) + + + else: + + if v.p is not None: + + if v.o is not None: + + # PO + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Literal {value: $value}) " + "RETURN src.uri as src", + uri=v.p.value, value=v.o.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((data["src"], v.p.value, v.o.value)) + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Node {uri: $uri}) " + "RETURN src.uri as src", + uri=v.p.value, dest=v.o.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((data["src"], v.p.value, v.o.value)) + + else: + + # P + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Literal) " + "RETURN src.uri as src, dest.value as dest", + uri=v.p.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((data["src"], v.p.value, data["dest"])) + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Node) " + "RETURN src.uri as src, dest.uri as dest", + uri=v.p.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((data["src"], v.p.value, data["dest"])) + + else: + + if v.o is not None: + + # O + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel]->(dest:Literal {value: $value}) " + "RETURN src.uri as src, rel.uri as rel", + value=v.o.value, + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((data["src"], data["rel"], v.o.value)) + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel]->(dest:Node {uri: $uri}) " + "RETURN src.uri as src, rel.uri as rel", + uri=v.o.value, + database_=self.db, 
+ ) + + for rec in records: + data = rec.data() + triples.append((data["src"], data["rel"], v.o.value)) + + else: + + # * + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel]->(dest:Literal) " + "RETURN src.uri as src, rel.uri as rel, dest.value as dest", + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((data["src"], data["rel"], data["dest"])) + + records, summary, keys = self.io.execute_query( + "MATCH (src:Node)-[rel:Rel]->(dest:Node) " + "RETURN src.uri as src, rel.uri as rel, dest.uri as dest", + database_=self.db, + ) + + for rec in records: + data = rec.data() + triples.append((data["src"], data["rel"], data["dest"])) + + triples = [ + Triple( + s=self.create_value(t[0]), + p=self.create_value(t[1]), + o=self.create_value(t[2]) + ) + for t in triples + ] + + print("Send response...", flush=True) + r = TriplesQueryResponse(triples=triples, error=None) + self.producer.send(r, properties={"id": id}) + + print("Done.", flush=True) + + except Exception as e: + + print(f"Exception: {e}") + + print("Send error response...", flush=True) + + r = TriplesQueryResponse( + error=Error( + type = "llm-error", + message = str(e), + ), + response=None, + ) + + self.producer.send(r, properties={"id": id}) + + self.consumer.acknowledge(msg) + + @staticmethod + def add_args(parser): + + ConsumerProducer.add_args( + parser, default_input_queue, default_subscriber, + default_output_queue, + ) + + parser.add_argument( + '-g', '--graph-host', + default=default_graph_host, + help=f'Graph host (default: {default_graph_host})' + ) + + parser.add_argument( + '--username', + default=default_username, + help=f'Memgraph username (default: {default_username})' + ) + + parser.add_argument( + '--password', + default=default_password, + help=f'Memgraph password (default: {default_password})' + ) + + parser.add_argument( + '--database', + default=default_database, + help=f'Memgraph database (default: {default_database})' + ) + 
+def run(): + + Processor.start(module, __doc__) + diff --git a/trustgraph-flow/trustgraph/query/triples/neo4j/service.py b/trustgraph-flow/trustgraph/query/triples/neo4j/service.py index 2caa0193..d60bc4f4 100755 --- a/trustgraph-flow/trustgraph/query/triples/neo4j/service.py +++ b/trustgraph-flow/trustgraph/query/triples/neo4j/service.py @@ -1,7 +1,8 @@ """ -Triples query service. Input is a (s, p, o) triple, some values may be -null. Output is a list of triples. +Triples query service for neo4j. +Input is a (s, p, o) triple, some values may be null. Output is a list of +triples. """ from neo4j import GraphDatabase diff --git a/trustgraph-flow/trustgraph/storage/triples/memgraph/__init__.py b/trustgraph-flow/trustgraph/storage/triples/memgraph/__init__.py new file mode 100644 index 00000000..d891d55f --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/triples/memgraph/__init__.py @@ -0,0 +1,3 @@ + +from . write import * + diff --git a/trustgraph-flow/trustgraph/storage/triples/memgraph/__main__.py b/trustgraph-flow/trustgraph/storage/triples/memgraph/__main__.py new file mode 100755 index 00000000..c05d8c6d --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/triples/memgraph/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . write import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/storage/triples/memgraph/write.py b/trustgraph-flow/trustgraph/storage/triples/memgraph/write.py new file mode 100755 index 00000000..17e8c67e --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/triples/memgraph/write.py @@ -0,0 +1,252 @@ + +""" +Graph writer. Input is graph edge. Writes edges to Cassandra graph. +""" + +import pulsar +import base64 +import os +import argparse +import time + +from neo4j import GraphDatabase + +from .... schema import Triples +from .... schema import triples_store_queue +from .... log_level import LogLevel +from .... 
base import Consumer + +module = ".".join(__name__.split(".")[1:-1]) + +default_input_queue = triples_store_queue +default_subscriber = module + +default_graph_host = 'bolt://memgraph:7687' +default_username = 'memgraph' +default_password = 'password' +default_database = 'memgraph' + +class Processor(Consumer): + + def __init__(self, **params): + + input_queue = params.get("input_queue", default_input_queue) + subscriber = params.get("subscriber", default_subscriber) + graph_host = params.get("graph_host", default_graph_host) + username = params.get("username", default_username) + password = params.get("password", default_password) + database = params.get("database", default_database) + + super(Processor, self).__init__( + **params | { + "input_queue": input_queue, + "subscriber": subscriber, + "input_schema": Triples, + "graph_host": graph_host, + } + ) + + self.db = database + + self.io = GraphDatabase.driver(graph_host, auth=(username, password)) + + with self.io.session(database=self.db) as session: + self.create_indexes(session) + + def create_indexes(self, session): + + print("Create indexes...", flush=True) + + try: + session.run( + "CREATE INDEX ON :Node", + ) + except Exception as e: + print(e, flush=True) + # Maybe index already exists + print("Index create failure ignored", flush=True) + + try: + session.run( + "CREATE INDEX ON :Node(uri)" + ) + except Exception as e: + print(e, flush=True) + # Maybe index already exists + print("Index create failure ignored", flush=True) + + try: + session.run( + "CREATE INDEX ON :Literal", + ) + except Exception as e: + print(e, flush=True) + # Maybe index already exists + print("Index create failure ignored", flush=True) + + try: + session.run( + "CREATE INDEX ON :Literal(value)" + ) + except Exception as e: + print(e, flush=True) + # Maybe index already exists + print("Index create failure ignored", flush=True) + + print("Index creation done", flush=True) + + def create_node(self, uri): + + print("Create node", uri) 
+ + summary = self.io.execute_query( + "MERGE (n:Node {uri: $uri})", + uri=uri, + database_=self.db, + ).summary + + print("Created {nodes_created} nodes in {time} ms.".format( + nodes_created=summary.counters.nodes_created, + time=summary.result_available_after + )) + + def create_literal(self, value): + + print("Create literal", value) + + summary = self.io.execute_query( + "MERGE (n:Literal {value: $value})", + value=value, + database_=self.db, + ).summary + + print("Created {nodes_created} nodes in {time} ms.".format( + nodes_created=summary.counters.nodes_created, + time=summary.result_available_after + )) + + def relate_node(self, src, uri, dest): + + print("Create node rel", src, uri, dest) + + summary = self.io.execute_query( + "MATCH (src:Node {uri: $src}) " + "MATCH (dest:Node {uri: $dest}) " + "MERGE (src)-[:Rel {uri: $uri}]->(dest)", + src=src, dest=dest, uri=uri, + database_=self.db, + ).summary + + print("Created {nodes_created} nodes in {time} ms.".format( + nodes_created=summary.counters.nodes_created, + time=summary.result_available_after + )) + + def relate_literal(self, src, uri, dest): + + print("Create literal rel", src, uri, dest) + + summary = self.io.execute_query( + "MATCH (src:Node {uri: $src}) " + "MATCH (dest:Literal {value: $dest}) " + "MERGE (src)-[:Rel {uri: $uri}]->(dest)", + src=src, dest=dest, uri=uri, + database_=self.db, + ).summary + + print("Created {nodes_created} nodes in {time} ms.".format( + nodes_created=summary.counters.nodes_created, + time=summary.result_available_after + )) + + def create_triple(self, tx, t): + + # Create new s node with given uri, if not exists + result = tx.run( + "MERGE (n:Node {uri: $uri})", + uri=t.s.value + ) + + if t.o.is_uri: + + # Create new o node with given uri, if not exists + result = tx.run( + "MERGE (n:Node {uri: $uri})", + uri=t.o.value + ) + + result = tx.run( + "MATCH (src:Node {uri: $src}) " + "MATCH (dest:Node {uri: $dest}) " + "MERGE (src)-[:Rel {uri: $uri}]->(dest)", + 
src=t.s.value, dest=t.o.value, uri=t.p.value, + ) + + else: + + # Create new o literal with given uri, if not exists + result = tx.run( + "MERGE (n:Literal {value: $value})", + value=t.o.value + ) + + result = tx.run( + "MATCH (src:Node {uri: $src}) " + "MATCH (dest:Literal {value: $dest}) " + "MERGE (src)-[:Rel {uri: $uri}]->(dest)", + src=t.s.value, dest=t.o.value, uri=t.p.value, + ) + + def handle(self, msg): + + v = msg.value() + + for t in v.triples: + + # self.create_node(t.s.value) + + # if t.o.is_uri: + # self.create_node(t.o.value) + # self.relate_node(t.s.value, t.p.value, t.o.value) + # else: + # self.create_literal(t.o.value) + # self.relate_literal(t.s.value, t.p.value, t.o.value) + + with self.io.session(database=self.db) as session: + session.execute_write(self.create_triple, t) + + @staticmethod + def add_args(parser): + + Consumer.add_args( + parser, default_input_queue, default_subscriber, + ) + + parser.add_argument( + '-g', '--graph_host', + default=default_graph_host, + help=f'Graph host (default: {default_graph_host})' + ) + + parser.add_argument( + '--username', + default=default_username, + help=f'Memgraph username (default: {default_username})' + ) + + parser.add_argument( + '--password', + default=default_password, + help=f'Memgraph password (default: {default_password})' + ) + + parser.add_argument( + '--database', + default=default_database, + help=f'Memgraph database (default: {default_database})' + ) + +def run(): + + Processor.start(module, __doc__) + From 2818ec9f236ed9373f639a3c809c38cf3bd688b6 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 6 Dec 2024 08:50:49 +0000 Subject: [PATCH 28/37] Fix header (#194) --- trustgraph-cli/scripts/tg-dump-msgpack | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trustgraph-cli/scripts/tg-dump-msgpack b/trustgraph-cli/scripts/tg-dump-msgpack index dc4a8139..2be950db 100755 --- a/trustgraph-cli/scripts/tg-dump-msgpack +++ b/trustgraph-cli/scripts/tg-dump-msgpack @@ -1,6 +1,6 
@@ #!/usr/bin/env python3 -"" +""" This utility reads a knowledge core in msgpack format and outputs its contents in JSON form to standard output. This is useful only as a diagnostic utility. From 7df7843dad25bf643734522245b5c7eb5c59976e Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 6 Dec 2024 08:51:10 +0000 Subject: [PATCH 29/37] Main/remove parquet (#195) * Remove Parquet code, and package build --- .github/workflows/release.yaml | 24 +-- Containerfile | 5 +- Makefile | 3 - trustgraph-parquet/README.md | 1 - trustgraph-parquet/scripts/concat-parquet | 45 ----- trustgraph-parquet/scripts/dump-parquet | 24 --- trustgraph-parquet/scripts/ge-dump-parquet | 6 - .../scripts/load-graph-embeddings | 170 ----------------- trustgraph-parquet/scripts/load-triples | 180 ------------------ .../scripts/triples-dump-parquet | 6 - trustgraph-parquet/setup.py | 51 ----- .../trustgraph/dump/__init__.py | 0 .../dump/graph_embeddings/__init__.py | 0 .../dump/graph_embeddings/parquet/__init__.py | 3 - .../dump/graph_embeddings/parquet/__main__.py | 7 - .../graph_embeddings/parquet/processor.py | 85 --------- .../dump/graph_embeddings/parquet/writer.py | 94 --------- .../trustgraph/dump/triples/__init__.py | 0 .../dump/triples/parquet/__init__.py | 3 - .../dump/triples/parquet/__main__.py | 7 - .../dump/triples/parquet/processor.py | 87 --------- .../trustgraph/dump/triples/parquet/writer.py | 96 ---------- 22 files changed, 11 insertions(+), 886 deletions(-) delete mode 100644 trustgraph-parquet/README.md delete mode 100755 trustgraph-parquet/scripts/concat-parquet delete mode 100755 trustgraph-parquet/scripts/dump-parquet delete mode 100755 trustgraph-parquet/scripts/ge-dump-parquet delete mode 100755 trustgraph-parquet/scripts/load-graph-embeddings delete mode 100755 trustgraph-parquet/scripts/load-triples delete mode 100755 trustgraph-parquet/scripts/triples-dump-parquet delete mode 100644 trustgraph-parquet/setup.py delete mode 100644 
trustgraph-parquet/trustgraph/dump/__init__.py delete mode 100644 trustgraph-parquet/trustgraph/dump/graph_embeddings/__init__.py delete mode 100644 trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__init__.py delete mode 100755 trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__main__.py delete mode 100755 trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/processor.py delete mode 100644 trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/writer.py delete mode 100644 trustgraph-parquet/trustgraph/dump/triples/__init__.py delete mode 100644 trustgraph-parquet/trustgraph/dump/triples/parquet/__init__.py delete mode 100755 trustgraph-parquet/trustgraph/dump/triples/parquet/__main__.py delete mode 100755 trustgraph-parquet/trustgraph/dump/triples/parquet/processor.py delete mode 100644 trustgraph-parquet/trustgraph/dump/triples/parquet/writer.py diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index fc85a6a8..30fc70ff 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -48,20 +48,6 @@ jobs: - name: Publish release distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - - name: Create deploy bundle - run: templates/generate-all deploy.zip ${{ steps.version.outputs.VERSION }} - - - uses: ncipollo/release-action@v1 - with: - artifacts: deploy.zip - generateReleaseNotes: true - makeLatest: false - prerelease: true - skipIfReleaseExists: true - - - name: Build container - run: make container VERSION=${{ steps.version.outputs.VERSION }} - - name: Extract metadata for container id: meta uses: docker/metadata-action@v4 @@ -84,3 +70,13 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + - name: Create deploy bundle + run: templates/generate-all deploy.zip ${{ steps.version.outputs.VERSION }} + + - uses: ncipollo/release-action@v1 + with: + artifacts: deploy.zip + generateReleaseNotes: true + makeLatest: false + prerelease: true + 
skipIfReleaseExists: true diff --git a/Containerfile b/Containerfile index 0d6d357b..c2735feb 100644 --- a/Containerfile +++ b/Containerfile @@ -16,7 +16,7 @@ RUN pip3 install torch --index-url https://download.pytorch.org/whl/cpu RUN pip3 install anthropic boto3 cohere openai google-cloud-aiplatform ollama google-generativeai \ langchain langchain-core langchain-huggingface langchain-text-splitters \ langchain-community pymilvus sentence-transformers transformers \ - huggingface-hub pulsar-client cassandra-driver pyarrow pyyaml \ + huggingface-hub pulsar-client cassandra-driver pyyaml \ neo4j tiktoken && \ pip3 cache purge @@ -32,7 +32,6 @@ COPY trustgraph-base/ /root/build/trustgraph-base/ COPY trustgraph-flow/ /root/build/trustgraph-flow/ COPY trustgraph-vertexai/ /root/build/trustgraph-vertexai/ COPY trustgraph-bedrock/ /root/build/trustgraph-bedrock/ -COPY trustgraph-parquet/ /root/build/trustgraph-parquet/ COPY trustgraph-embeddings-hf/ /root/build/trustgraph-embeddings-hf/ COPY trustgraph-cli/ /root/build/trustgraph-cli/ @@ -42,7 +41,6 @@ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-base/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-flow/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-vertexai/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-bedrock/ -RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-parquet/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-embeddings-hf/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-cli/ @@ -61,7 +59,6 @@ RUN \ pip3 install /root/wheels/trustgraph_flow-* && \ pip3 install /root/wheels/trustgraph_vertexai-* && \ pip3 install /root/wheels/trustgraph_bedrock-* && \ - pip3 install /root/wheels/trustgraph_parquet-* && \ pip3 install /root/wheels/trustgraph_embeddings_hf-* && \ pip3 install /root/wheels/trustgraph_cli-* && \ pip3 cache purge && \ diff --git a/Makefile b/Makefile index 0fb4b175..72d144a9 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,6 @@ wheels: pip3 wheel 
--no-deps --wheel-dir dist trustgraph-flow/ pip3 wheel --no-deps --wheel-dir dist trustgraph-vertexai/ pip3 wheel --no-deps --wheel-dir dist trustgraph-bedrock/ - pip3 wheel --no-deps --wheel-dir dist trustgraph-parquet/ pip3 wheel --no-deps --wheel-dir dist trustgraph-embeddings-hf/ pip3 wheel --no-deps --wheel-dir dist trustgraph-cli/ @@ -25,7 +24,6 @@ packages: update-package-versions cd trustgraph-flow && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-vertexai && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-bedrock && python3 setup.py sdist --dist-dir ../dist/ - cd trustgraph-parquet && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-embeddings-hf && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-cli && python3 setup.py sdist --dist-dir ../dist/ @@ -41,7 +39,6 @@ update-package-versions: echo __version__ = \"${VERSION}\" > trustgraph-flow/trustgraph/flow_version.py echo __version__ = \"${VERSION}\" > trustgraph-vertexai/trustgraph/vertexai_version.py echo __version__ = \"${VERSION}\" > trustgraph-bedrock/trustgraph/bedrock_version.py - echo __version__ = \"${VERSION}\" > trustgraph-parquet/trustgraph/parquet_version.py echo __version__ = \"${VERSION}\" > trustgraph-embeddings-hf/trustgraph/embeddings_hf_version.py echo __version__ = \"${VERSION}\" > trustgraph-cli/trustgraph/cli_version.py echo __version__ = \"${VERSION}\" > trustgraph/trustgraph/trustgraph_version.py diff --git a/trustgraph-parquet/README.md b/trustgraph-parquet/README.md deleted file mode 100644 index 7a2ce130..00000000 --- a/trustgraph-parquet/README.md +++ /dev/null @@ -1 +0,0 @@ -See https://trustgraph.ai/ diff --git a/trustgraph-parquet/scripts/concat-parquet b/trustgraph-parquet/scripts/concat-parquet deleted file mode 100755 index 7943d436..00000000 --- a/trustgraph-parquet/scripts/concat-parquet +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 - -""" -Concatenates multiple parquet files into a single parquet output -""" - -import pyarrow 
as pa -import pyarrow.parquet as pq -import pandas as pd -import sys -import argparse - -parser = argparse.ArgumentParser( - prog="combine-parquet", - description=__doc__ -) - -parser.add_argument( - '-i', '--input', - nargs='*', - help=f'Input files' -) - -parser.add_argument( - '-o', '--output', - help=f'Output files' -) - -args = parser.parse_args() - -df = None - -for file in args.input: - - part = pq.read_table(file).to_pandas() - - if df is None: - df = part - else: - df = pd.concat([df, part], ignore_index=True) - -if df is not None: - - table = pa.Table.from_pandas(df) - pq.write_table(table, args.output) diff --git a/trustgraph-parquet/scripts/dump-parquet b/trustgraph-parquet/scripts/dump-parquet deleted file mode 100755 index 62b28998..00000000 --- a/trustgraph-parquet/scripts/dump-parquet +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -import pyarrow as pa -import pyarrow.csv as pc -import pyarrow.parquet as pq -import pandas as pd -import sys - -df = None - -for file in sys.argv[1:]: - - part = pq.read_table(file).to_pandas() - - if df is None: - df = part - else: - df = pd.concat([df, part], ignore_index=True) - -if df is not None: - - table = pa.Table.from_pandas(df) - pc.write_csv(table, sys.stdout.buffer) - diff --git a/trustgraph-parquet/scripts/ge-dump-parquet b/trustgraph-parquet/scripts/ge-dump-parquet deleted file mode 100755 index c2b29c51..00000000 --- a/trustgraph-parquet/scripts/ge-dump-parquet +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from trustgraph.dump.graph_embeddings.parquet import run - -run() - diff --git a/trustgraph-parquet/scripts/load-graph-embeddings b/trustgraph-parquet/scripts/load-graph-embeddings deleted file mode 100755 index 0e6ecf93..00000000 --- a/trustgraph-parquet/scripts/load-graph-embeddings +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -""" -Loads Graph embeddings into TrustGraph processing. 
-""" - -import pulsar -from pulsar.schema import JsonSchema -from trustgraph.schema import GraphEmbeddings, Value, Metadata -from trustgraph.schema import graph_embeddings_store_queue -import argparse -import os -import time -import pyarrow as pa -import pyarrow.parquet as pq - -from trustgraph.log_level import LogLevel - -class Loader: - - def __init__( - self, - pulsar_host, - output_queue, - log_level, - file, - user, - collection, - ): - - self.client = pulsar.Client( - pulsar_host, - logger=pulsar.ConsoleLogger(log_level.to_pulsar()) - ) - - self.producer = self.client.create_producer( - topic=output_queue, - schema=JsonSchema(GraphEmbeddings), - chunking_enabled=True, - ) - - self.file = file - self.user = user - self.collection = collection - - def run(self): - - try: - - path = self.file - - print("Reading file...") - table = pq.read_table(path) - print("Loaded.") - - names = set(table.column_names) - - if "embeddings" not in names: - print("No 'embeddings' column") - - if "entity" not in names: - print("No 'entity' column") - - embc = table.column("embeddings") - entc = table.column("entity") - - for emb, ent in zip(embc, entc): - - b = emb.as_py() - n = ent.as_py() - - r = GraphEmbeddings( - metadata=Metadata( - metadata=[], - user=self.user, - collection=self.collection, - ), - vectors=b, - entity=Value( - value=n, - is_uri=n.startswith("https:") - ), - ) - - self.producer.send(r) - - except Exception as e: - print(e, flush=True) - - def __del__(self): - self.client.close() - -def main(): - - parser = argparse.ArgumentParser( - prog='loader', - description=__doc__, - ) - - default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650') - default_output_queue = graph_embeddings_store_queue - default_user = 'trustgraph' - default_collection = 'default' - - parser.add_argument( - '-p', '--pulsar-host', - default=default_pulsar_host, - help=f'Pulsar host (default: {default_pulsar_host})', - ) - - parser.add_argument( - '-o', '--output-queue', - 
default=default_output_queue, - help=f'Output queue (default: {default_output_queue})' - ) - - parser.add_argument( - '-u', '--user', - default=default_user, - help=f'User ID (default: {default_user})' - ) - - parser.add_argument( - '-c', '--collection', - default=default_collection, - help=f'Collection ID (default: {default_collection})' - ) - - parser.add_argument( - '-l', '--log-level', - type=LogLevel, - default=LogLevel.ERROR, - choices=list(LogLevel), - help=f'Output queue (default: info)' - ) - - parser.add_argument( - '-f', '--file', - required=True, - help=f'File to load' - ) - - args = parser.parse_args() - - while True: - - try: - p = Loader( - pulsar_host=args.pulsar_host, - output_queue=args.output_queue, - log_level=args.log_level, - file=args.file, - user=args.user, - collection=args.collection, - ) - - p.run() - - print("File loaded.") - break - - except Exception as e: - - print("Exception:", e, flush=True) - print("Will retry...", flush=True) - - time.sleep(10) - -main() - diff --git a/trustgraph-parquet/scripts/load-triples b/trustgraph-parquet/scripts/load-triples deleted file mode 100755 index e6bb0ff7..00000000 --- a/trustgraph-parquet/scripts/load-triples +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 - -""" -Loads Graph embeddings into TrustGraph processing. 
-""" - -import pulsar -from pulsar.schema import JsonSchema -from trustgraph.schema import Triples, Triple, Value, Metadata -from trustgraph.schema import triples_store_queue -import argparse -import os -import time -import pyarrow as pa -import pyarrow.parquet as pq - -from trustgraph.log_level import LogLevel - -class Loader: - - def __init__( - self, - pulsar_host, - output_queue, - log_level, - file, - user, - collection, - ): - - self.client = pulsar.Client( - pulsar_host, - logger=pulsar.ConsoleLogger(log_level.to_pulsar()) - ) - - self.producer = self.client.create_producer( - topic=output_queue, - schema=JsonSchema(Triples), - chunking_enabled=True, - ) - - self.file = file - self.user = user - self.collection = collection - - def run(self): - - try: - - path = self.file - - print("Reading file...") - table = pq.read_table(path) - print("Loaded.") - - names = set(table.column_names) - - if "s" not in names: - print("No 's' column") - - if "p" not in names: - print("No 'p' column") - - if "o" not in names: - print("No 'o' column") - - sc = table.column("s") - pc = table.column("p") - oc = table.column("o") - - for s, p, o in zip(sc, pc, oc): - - r = Triples( - metadata=Metadata( - metadata=[], - user=self.user, - collection=self.collection, - ), - triples=[ - Triple( - s=Value( - value=s.as_py(), is_uri=True - ), - p=Value( - value=p.as_py(), is_uri=True - ), - o=Value( - value=o.as_py(), - is_uri=o.as_py().startswith("https:") - ) - ) - ] - ) - - self.producer.send(r) - - except Exception as e: - print(e, flush=True) - - def __del__(self): - self.client.close() - -def main(): - - parser = argparse.ArgumentParser( - prog='loader', - description=__doc__, - ) - - default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650') - default_output_queue = triples_store_queue - default_user = 'trustgraph' - default_collection = 'default' - - parser.add_argument( - '-p', '--pulsar-host', - default=default_pulsar_host, - help=f'Pulsar host (default: 
{default_pulsar_host})', - ) - - parser.add_argument( - '-o', '--output-queue', - default=default_output_queue, - help=f'Output queue (default: {default_output_queue})' - ) - - parser.add_argument( - '-u', '--user', - default=default_user, - help=f'User ID (default: {default_user})' - ) - - parser.add_argument( - '-c', '--collection', - default=default_collection, - help=f'Collection ID (default: {default_collection})' - ) - - parser.add_argument( - '-l', '--log-level', - type=LogLevel, - default=LogLevel.ERROR, - choices=list(LogLevel), - help=f'Output queue (default: info)' - ) - - parser.add_argument( - '-f', '--file', - required=True, - help=f'File to load' - ) - - args = parser.parse_args() - - while True: - - try: - p = Loader( - pulsar_host=args.pulsar_host, - output_queue=args.output_queue, - log_level=args.log_level, - file=args.file, - user=args.user, - collection=args.collection, - ) - - p.run() - - print("File loaded.") - break - - except Exception as e: - - print("Exception:", e, flush=True) - print("Will retry...", flush=True) - - time.sleep(10) - -main() - diff --git a/trustgraph-parquet/scripts/triples-dump-parquet b/trustgraph-parquet/scripts/triples-dump-parquet deleted file mode 100755 index 78d79196..00000000 --- a/trustgraph-parquet/scripts/triples-dump-parquet +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from trustgraph.dump.triples.parquet import run - -run() - diff --git a/trustgraph-parquet/setup.py b/trustgraph-parquet/setup.py deleted file mode 100644 index dfe29653..00000000 --- a/trustgraph-parquet/setup.py +++ /dev/null @@ -1,51 +0,0 @@ -import setuptools -import os -import importlib - -with open("README.md", "r") as fh: - long_description = fh.read() - -# Load a version number module -spec = importlib.util.spec_from_file_location( - 'version', 'trustgraph/parquet_version.py' -) -version_module = importlib.util.module_from_spec(spec) -spec.loader.exec_module(version_module) - -version = version_module.__version__ - 
-setuptools.setup( - name="trustgraph-parquet", - version=version, - author="trustgraph.ai", - author_email="security@trustgraph.ai", - description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/trustgraph-ai/trustgraph", - packages=setuptools.find_namespace_packages( - where='./', - ), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Operating System :: OS Independent", - ], - python_requires='>=3.8', - download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", - install_requires=[ - "trustgraph-base>=0.17,<0.18", - "pulsar-client", - "prometheus-client", - "pyarrow", - "pandas", - ], - scripts=[ - "scripts/concat-parquet", - "scripts/dump-parquet", - "scripts/ge-dump-parquet", - "scripts/triples-dump-parquet", - "scripts/load-graph-embeddings", - "scripts/load-triples", - ] -) diff --git a/trustgraph-parquet/trustgraph/dump/__init__.py b/trustgraph-parquet/trustgraph/dump/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/__init__.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__init__.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__init__.py deleted file mode 100644 index 9d16af90..00000000 --- a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ - -from . 
processor import * - diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__main__.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__main__.py deleted file mode 100755 index c05d8c6d..00000000 --- a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__main__.py +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env python3 - -from . write import run - -if __name__ == '__main__': - run() - diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/processor.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/processor.py deleted file mode 100755 index 795f3351..00000000 --- a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/processor.py +++ /dev/null @@ -1,85 +0,0 @@ - -""" -Write graph embeddings to parquet files in a directory. -""" - -import pulsar -import base64 -import os -import argparse -import time - -from .... schema import GraphEmbeddings -from .... schema import graph_embeddings_store_queue -from .... base import Consumer - -from . writer import ParquetWriter - -module = ".".join(__name__.split(".")[1:-1]) - -default_input_queue = graph_embeddings_store_queue -default_subscriber = module -default_graph_host='localhost' -default_directory = "." 
-default_file_template = "graph-embeds-{id}.parquet" -default_rotation_time = 60 - -class Processor(Consumer): - - def __init__(self, **params): - - input_queue = params.get("input_queue", default_input_queue) - subscriber = params.get("subscriber", default_subscriber) - directory = params.get("directory", default_directory) - file_template = params.get("file_template", default_file_template) - rotation_time = params.get("rotation_time", default_rotation_time) - - super(Processor, self).__init__( - **params | { - "input_queue": input_queue, - "subscriber": subscriber, - "input_schema": GraphEmbeddings, - } - ) - - self.writer = ParquetWriter(directory, file_template, rotation_time) - - def __del__(self): - if hasattr(self, "writer"): - del self.writer - - def handle(self, msg): - - v = msg.value() - self.writer.write(v.vectors, v.entity.value) - - @staticmethod - def add_args(parser): - - Consumer.add_args( - parser, default_input_queue, default_subscriber, - ) - - parser.add_argument( - '-d', '--directory', - default=default_directory, - help=f'Directory to write to (default: {default_directory})' - ) - - parser.add_argument( - '-f', '--file-template', - default=default_file_template, - help=f'Directory to write to (default: {default_file_template})' - ) - - parser.add_argument( - '-t', '--rotation-time', - type=int, - default=default_rotation_time, - help=f'Rotation time / seconds (default: {default_rotation_time})' - ) - -def run(): - - Processor.start(module, __doc__) - diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/writer.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/writer.py deleted file mode 100644 index 1844cdd1..00000000 --- a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/writer.py +++ /dev/null @@ -1,94 +0,0 @@ - -import threading -import queue -import time -import uuid -import pyarrow as pa -import pyarrow.parquet as pq - -class ParquetWriter: - - def __init__(self, directory, file_template, 
rotation_time): - self.directory = directory - self.file_template = file_template - self.rotation_time = rotation_time - - self.q = queue.Queue() - - self.running = True - - self.thread = threading.Thread(target=(self.writer_thread)) - self.thread.start() - - def writer_thread(self): - - items = [] - - timeout = None - - while self.running: - - try: - - item = self.q.get(timeout=1) - - if timeout == None: - timeout = time.time() + self.rotation_time - - items.append(item) - - except queue.Empty: - pass - - if timeout: - if time.time() > timeout: - - self.write_file(items) - timeout = None - items = [] - - def write_file(self, items): - - try: - - schema = pa.schema([ - pa.field('embeddings', pa.list_(pa.list_(pa.float64()))), - pa.field('entity', pa.string()), - ]) - - fname = self.file_template.format(id=str(uuid.uuid4())) - path = f"{self.directory}/{fname}" - - writer = pq.ParquetWriter(path, schema) - - batch = pa.record_batch( - [ - [i[0] for i in items], - [i[1] for i in items], - ], - names=['embeddings', 'entity'] - ) - - writer.write_batch(batch) - - writer.close() - - print(f"Wrote {path}.") - - except Exception as e: - - print("Parquet write:", e) - - def write(self, embeds, ent): - self.q.put((embeds, ent)) - - def __del__(self): - - self.running = False - - if hasattr(self, "q"): - self.thread.join() - - - - diff --git a/trustgraph-parquet/trustgraph/dump/triples/__init__.py b/trustgraph-parquet/trustgraph/dump/triples/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/trustgraph-parquet/trustgraph/dump/triples/parquet/__init__.py b/trustgraph-parquet/trustgraph/dump/triples/parquet/__init__.py deleted file mode 100644 index 9d16af90..00000000 --- a/trustgraph-parquet/trustgraph/dump/triples/parquet/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ - -from . 
processor import * - diff --git a/trustgraph-parquet/trustgraph/dump/triples/parquet/__main__.py b/trustgraph-parquet/trustgraph/dump/triples/parquet/__main__.py deleted file mode 100755 index c05d8c6d..00000000 --- a/trustgraph-parquet/trustgraph/dump/triples/parquet/__main__.py +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env python3 - -from . write import run - -if __name__ == '__main__': - run() - diff --git a/trustgraph-parquet/trustgraph/dump/triples/parquet/processor.py b/trustgraph-parquet/trustgraph/dump/triples/parquet/processor.py deleted file mode 100755 index dc15d8a9..00000000 --- a/trustgraph-parquet/trustgraph/dump/triples/parquet/processor.py +++ /dev/null @@ -1,87 +0,0 @@ - -""" -Write graphs triples to parquet files in a directory. -""" - -import pulsar -import base64 -import os -import argparse -import time - -from .... schema import Triples -from .... schema import triples_store_queue -from .... base import Consumer - -from . writer import ParquetWriter - -module = ".".join(__name__.split(".")[1:-1]) - -default_input_queue = triples_store_queue -default_subscriber = module -default_graph_host='localhost' -default_directory = "." 
-default_file_template = "triples-{id}.parquet" -default_rotation_time = 60 - -class Processor(Consumer): - - def __init__(self, **params): - - input_queue = params.get("input_queue", default_input_queue) - subscriber = params.get("subscriber", default_subscriber) - directory = params.get("directory", default_directory) - file_template = params.get("file_template", default_file_template) - rotation_time = params.get("rotation_time", default_rotation_time) - - super(Processor, self).__init__( - **params | { - "input_queue": input_queue, - "subscriber": subscriber, - "input_schema": Triples, - } - ) - - self.writer = ParquetWriter(directory, file_template, rotation_time) - - def __del__(self): - if hasattr(self, "writer"): - del self.writer - - def handle(self, msg): - - v = msg.value() - - for t in v.triples: - self.writer.write(t.s.value, t.p.value, t.o.value) - - @staticmethod - def add_args(parser): - - Consumer.add_args( - parser, default_input_queue, default_subscriber, - ) - - parser.add_argument( - '-d', '--directory', - default=default_directory, - help=f'Directory to write to (default: {default_directory})' - ) - - parser.add_argument( - '-f', '--file-template', - default=default_file_template, - help=f'Directory to write to (default: {default_file_template})' - ) - - parser.add_argument( - '-t', '--rotation-time', - type=int, - default=default_rotation_time, - help=f'Rotation time / seconds (default: {default_rotation_time})' - ) - -def run(): - - Processor.start(module, __doc__) - diff --git a/trustgraph-parquet/trustgraph/dump/triples/parquet/writer.py b/trustgraph-parquet/trustgraph/dump/triples/parquet/writer.py deleted file mode 100644 index e68bf342..00000000 --- a/trustgraph-parquet/trustgraph/dump/triples/parquet/writer.py +++ /dev/null @@ -1,96 +0,0 @@ - -import threading -import queue -import time -import uuid -import pyarrow as pa -import pyarrow.parquet as pq - -class ParquetWriter: - - def __init__(self, directory, file_template, 
rotation_time): - self.directory = directory - self.file_template = file_template - self.rotation_time = rotation_time - - self.q = queue.Queue() - - self.running = True - - self.thread = threading.Thread(target=(self.writer_thread)) - self.thread.start() - - def writer_thread(self): - - triples = [] - - timeout = None - - while self.running: - - try: - - item = self.q.get(timeout=1) - - if timeout == None: - timeout = time.time() + self.rotation_time - - triples.append(item) - - except queue.Empty: - pass - - if timeout: - if time.time() > timeout: - - self.write_file(triples) - timeout = None - triples = [] - - def write_file(self, triples): - - try: - - schema = pa.schema([ - pa.field('s', pa.string()), - pa.field('p', pa.string()), - pa.field('o', pa.string()), - ]) - - fname = self.file_template.format(id=str(uuid.uuid4())) - path = f"{self.directory}/{fname}" - - writer = pq.ParquetWriter(path, schema) - - batch = pa.record_batch( - [ - [tpl[0] for tpl in triples], - [tpl[1] for tpl in triples], - [tpl[2] for tpl in triples], - ], - names=['s', 'p', 'o'] - ) - - writer.write_batch(batch) - - writer.close() - - print(f"Wrote {path}.") - - except Exception as e: - - print("Parquet write:", e) - - def write(self, s, p, o): - self.q.put((s, p, o)) - - def __del__(self): - - self.running = False - - if hasattr(self, "q"): - self.thread.join() - - - - From 67d69b5285d437577d30b487977672fcb0b48834 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 6 Dec 2024 13:05:56 +0000 Subject: [PATCH 30/37] Fixed a problem with the packages, api/__init__.py appeared in both (#196) trustgraph-flow and trustgraph-base, moved the gateway stuff into a different directory. 
--- trustgraph-flow/scripts/api-gateway | 2 +- trustgraph-flow/trustgraph/api/__init__.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/__init__.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/__main__.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/agent.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/auth.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/dbpedia.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/embeddings.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/encyclopedia.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/endpoint.py | 0 .../trustgraph/{api => }/gateway/graph_embeddings_load.py | 0 .../trustgraph/{api => }/gateway/graph_embeddings_stream.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/graph_rag.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/internet_search.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/prompt.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/publisher.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/running.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/serialize.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/service.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/socket.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/subscriber.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/text_completion.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/triples_load.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/triples_query.py | 0 trustgraph-flow/trustgraph/{api => }/gateway/triples_stream.py | 0 25 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 trustgraph-flow/trustgraph/api/__init__.py rename trustgraph-flow/trustgraph/{api => }/gateway/__init__.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/__main__.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/agent.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/auth.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/dbpedia.py (100%) rename 
trustgraph-flow/trustgraph/{api => }/gateway/embeddings.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/encyclopedia.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/endpoint.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/graph_embeddings_load.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/graph_embeddings_stream.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/graph_rag.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/internet_search.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/prompt.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/publisher.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/running.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/serialize.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/service.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/socket.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/subscriber.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/text_completion.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/triples_load.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/triples_query.py (100%) rename trustgraph-flow/trustgraph/{api => }/gateway/triples_stream.py (100%) diff --git a/trustgraph-flow/scripts/api-gateway b/trustgraph-flow/scripts/api-gateway index d28a5b8a..f7ba0fda 100755 --- a/trustgraph-flow/scripts/api-gateway +++ b/trustgraph-flow/scripts/api-gateway @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -from trustgraph.api.gateway import run +from trustgraph.gateway import run run() diff --git a/trustgraph-flow/trustgraph/api/__init__.py b/trustgraph-flow/trustgraph/api/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/trustgraph-flow/trustgraph/api/gateway/__init__.py b/trustgraph-flow/trustgraph/gateway/__init__.py similarity index 100% rename from 
trustgraph-flow/trustgraph/api/gateway/__init__.py rename to trustgraph-flow/trustgraph/gateway/__init__.py diff --git a/trustgraph-flow/trustgraph/api/gateway/__main__.py b/trustgraph-flow/trustgraph/gateway/__main__.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/__main__.py rename to trustgraph-flow/trustgraph/gateway/__main__.py diff --git a/trustgraph-flow/trustgraph/api/gateway/agent.py b/trustgraph-flow/trustgraph/gateway/agent.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/agent.py rename to trustgraph-flow/trustgraph/gateway/agent.py diff --git a/trustgraph-flow/trustgraph/api/gateway/auth.py b/trustgraph-flow/trustgraph/gateway/auth.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/auth.py rename to trustgraph-flow/trustgraph/gateway/auth.py diff --git a/trustgraph-flow/trustgraph/api/gateway/dbpedia.py b/trustgraph-flow/trustgraph/gateway/dbpedia.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/dbpedia.py rename to trustgraph-flow/trustgraph/gateway/dbpedia.py diff --git a/trustgraph-flow/trustgraph/api/gateway/embeddings.py b/trustgraph-flow/trustgraph/gateway/embeddings.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/embeddings.py rename to trustgraph-flow/trustgraph/gateway/embeddings.py diff --git a/trustgraph-flow/trustgraph/api/gateway/encyclopedia.py b/trustgraph-flow/trustgraph/gateway/encyclopedia.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/encyclopedia.py rename to trustgraph-flow/trustgraph/gateway/encyclopedia.py diff --git a/trustgraph-flow/trustgraph/api/gateway/endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/endpoint.py rename to trustgraph-flow/trustgraph/gateway/endpoint.py diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py 
b/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/graph_embeddings_load.py rename to trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py b/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/graph_embeddings_stream.py rename to trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py diff --git a/trustgraph-flow/trustgraph/api/gateway/graph_rag.py b/trustgraph-flow/trustgraph/gateway/graph_rag.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/graph_rag.py rename to trustgraph-flow/trustgraph/gateway/graph_rag.py diff --git a/trustgraph-flow/trustgraph/api/gateway/internet_search.py b/trustgraph-flow/trustgraph/gateway/internet_search.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/internet_search.py rename to trustgraph-flow/trustgraph/gateway/internet_search.py diff --git a/trustgraph-flow/trustgraph/api/gateway/prompt.py b/trustgraph-flow/trustgraph/gateway/prompt.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/prompt.py rename to trustgraph-flow/trustgraph/gateway/prompt.py diff --git a/trustgraph-flow/trustgraph/api/gateway/publisher.py b/trustgraph-flow/trustgraph/gateway/publisher.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/publisher.py rename to trustgraph-flow/trustgraph/gateway/publisher.py diff --git a/trustgraph-flow/trustgraph/api/gateway/running.py b/trustgraph-flow/trustgraph/gateway/running.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/running.py rename to trustgraph-flow/trustgraph/gateway/running.py diff --git a/trustgraph-flow/trustgraph/api/gateway/serialize.py b/trustgraph-flow/trustgraph/gateway/serialize.py similarity index 100% rename from 
trustgraph-flow/trustgraph/api/gateway/serialize.py rename to trustgraph-flow/trustgraph/gateway/serialize.py diff --git a/trustgraph-flow/trustgraph/api/gateway/service.py b/trustgraph-flow/trustgraph/gateway/service.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/service.py rename to trustgraph-flow/trustgraph/gateway/service.py diff --git a/trustgraph-flow/trustgraph/api/gateway/socket.py b/trustgraph-flow/trustgraph/gateway/socket.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/socket.py rename to trustgraph-flow/trustgraph/gateway/socket.py diff --git a/trustgraph-flow/trustgraph/api/gateway/subscriber.py b/trustgraph-flow/trustgraph/gateway/subscriber.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/subscriber.py rename to trustgraph-flow/trustgraph/gateway/subscriber.py diff --git a/trustgraph-flow/trustgraph/api/gateway/text_completion.py b/trustgraph-flow/trustgraph/gateway/text_completion.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/text_completion.py rename to trustgraph-flow/trustgraph/gateway/text_completion.py diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_load.py b/trustgraph-flow/trustgraph/gateway/triples_load.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/triples_load.py rename to trustgraph-flow/trustgraph/gateway/triples_load.py diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_query.py b/trustgraph-flow/trustgraph/gateway/triples_query.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/triples_query.py rename to trustgraph-flow/trustgraph/gateway/triples_query.py diff --git a/trustgraph-flow/trustgraph/api/gateway/triples_stream.py b/trustgraph-flow/trustgraph/gateway/triples_stream.py similarity index 100% rename from trustgraph-flow/trustgraph/api/gateway/triples_stream.py rename to trustgraph-flow/trustgraph/gateway/triples_stream.py From 
55c5c398b60c4bbfa7b8d563e8d205a886b8edba Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 6 Dec 2024 13:37:44 +0000 Subject: [PATCH 31/37] Fix relative imports (#197) --- trustgraph-flow/trustgraph/gateway/agent.py | 6 +++--- trustgraph-flow/trustgraph/gateway/dbpedia.py | 6 +++--- trustgraph-flow/trustgraph/gateway/embeddings.py | 6 +++--- trustgraph-flow/trustgraph/gateway/encyclopedia.py | 6 +++--- trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py | 6 +++--- .../trustgraph/gateway/graph_embeddings_stream.py | 4 ++-- trustgraph-flow/trustgraph/gateway/graph_rag.py | 6 +++--- trustgraph-flow/trustgraph/gateway/internet_search.py | 6 +++--- trustgraph-flow/trustgraph/gateway/prompt.py | 6 +++--- trustgraph-flow/trustgraph/gateway/serialize.py | 2 +- trustgraph-flow/trustgraph/gateway/service.py | 6 +++--- trustgraph-flow/trustgraph/gateway/text_completion.py | 6 +++--- trustgraph-flow/trustgraph/gateway/triples_load.py | 6 +++--- trustgraph-flow/trustgraph/gateway/triples_query.py | 6 +++--- trustgraph-flow/trustgraph/gateway/triples_stream.py | 4 ++-- 15 files changed, 41 insertions(+), 41 deletions(-) diff --git a/trustgraph-flow/trustgraph/gateway/agent.py b/trustgraph-flow/trustgraph/gateway/agent.py index 40586133..e8fd0e72 100644 --- a/trustgraph-flow/trustgraph/gateway/agent.py +++ b/trustgraph-flow/trustgraph/gateway/agent.py @@ -1,7 +1,7 @@ -from ... schema import AgentRequest, AgentResponse -from ... schema import agent_request_queue -from ... schema import agent_response_queue +from .. schema import AgentRequest, AgentResponse +from .. schema import agent_request_queue +from .. schema import agent_response_queue from . endpoint import MultiResponseServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/dbpedia.py b/trustgraph-flow/trustgraph/gateway/dbpedia.py index 4fa7336b..a61292a6 100644 --- a/trustgraph-flow/trustgraph/gateway/dbpedia.py +++ b/trustgraph-flow/trustgraph/gateway/dbpedia.py @@ -1,7 +1,7 @@ -from ... 
schema import LookupRequest, LookupResponse -from ... schema import dbpedia_lookup_request_queue -from ... schema import dbpedia_lookup_response_queue +from .. schema import LookupRequest, LookupResponse +from .. schema import dbpedia_lookup_request_queue +from .. schema import dbpedia_lookup_response_queue from . endpoint import ServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/embeddings.py b/trustgraph-flow/trustgraph/gateway/embeddings.py index 7c4b578d..6d3a9fe6 100644 --- a/trustgraph-flow/trustgraph/gateway/embeddings.py +++ b/trustgraph-flow/trustgraph/gateway/embeddings.py @@ -1,7 +1,7 @@ -from ... schema import EmbeddingsRequest, EmbeddingsResponse -from ... schema import embeddings_request_queue -from ... schema import embeddings_response_queue +from .. schema import EmbeddingsRequest, EmbeddingsResponse +from .. schema import embeddings_request_queue +from .. schema import embeddings_response_queue from . endpoint import ServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/encyclopedia.py b/trustgraph-flow/trustgraph/gateway/encyclopedia.py index c6041cb2..32eb5cd1 100644 --- a/trustgraph-flow/trustgraph/gateway/encyclopedia.py +++ b/trustgraph-flow/trustgraph/gateway/encyclopedia.py @@ -1,7 +1,7 @@ -from ... schema import LookupRequest, LookupResponse -from ... schema import encyclopedia_lookup_request_queue -from ... schema import encyclopedia_lookup_response_queue +from .. schema import LookupRequest, LookupResponse +from .. schema import encyclopedia_lookup_request_queue +from .. schema import encyclopedia_lookup_response_queue from . 
endpoint import ServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py b/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py index 81fb6647..18a2e6fe 100644 --- a/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py +++ b/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py @@ -4,9 +4,9 @@ from pulsar.schema import JsonSchema import uuid from aiohttp import WSMsgType -from ... schema import Metadata -from ... schema import GraphEmbeddings -from ... schema import graph_embeddings_store_queue +from .. schema import Metadata +from .. schema import GraphEmbeddings +from .. schema import graph_embeddings_store_queue from . publisher import Publisher from . socket import SocketEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py b/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py index 3d4efd45..f0b4dd86 100644 --- a/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py +++ b/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py @@ -4,8 +4,8 @@ import queue from pulsar.schema import JsonSchema import uuid -from ... schema import GraphEmbeddings -from ... schema import graph_embeddings_store_queue +from .. schema import GraphEmbeddings +from .. schema import graph_embeddings_store_queue from . subscriber import Subscriber from . socket import SocketEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/graph_rag.py b/trustgraph-flow/trustgraph/gateway/graph_rag.py index d33090ca..58679004 100644 --- a/trustgraph-flow/trustgraph/gateway/graph_rag.py +++ b/trustgraph-flow/trustgraph/gateway/graph_rag.py @@ -1,7 +1,7 @@ -from ... schema import GraphRagQuery, GraphRagResponse -from ... schema import graph_rag_request_queue -from ... schema import graph_rag_response_queue +from .. schema import GraphRagQuery, GraphRagResponse +from .. schema import graph_rag_request_queue +from .. schema import graph_rag_response_queue from . 
endpoint import ServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/internet_search.py b/trustgraph-flow/trustgraph/gateway/internet_search.py index f55a4a3e..5a5dc948 100644 --- a/trustgraph-flow/trustgraph/gateway/internet_search.py +++ b/trustgraph-flow/trustgraph/gateway/internet_search.py @@ -1,7 +1,7 @@ -from ... schema import LookupRequest, LookupResponse -from ... schema import internet_search_request_queue -from ... schema import internet_search_response_queue +from .. schema import LookupRequest, LookupResponse +from .. schema import internet_search_request_queue +from .. schema import internet_search_response_queue from . endpoint import ServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/prompt.py b/trustgraph-flow/trustgraph/gateway/prompt.py index d19005bc..f09a0e0e 100644 --- a/trustgraph-flow/trustgraph/gateway/prompt.py +++ b/trustgraph-flow/trustgraph/gateway/prompt.py @@ -1,9 +1,9 @@ import json -from ... schema import PromptRequest, PromptResponse -from ... schema import prompt_request_queue -from ... schema import prompt_response_queue +from .. schema import PromptRequest, PromptResponse +from .. schema import prompt_request_queue +from .. schema import prompt_response_queue from . endpoint import ServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/serialize.py b/trustgraph-flow/trustgraph/gateway/serialize.py index 2b955645..35932382 100644 --- a/trustgraph-flow/trustgraph/gateway/serialize.py +++ b/trustgraph-flow/trustgraph/gateway/serialize.py @@ -1,4 +1,4 @@ -from ... schema import Value, Triple +from .. 
schema import Value, Triple def to_value(x): return Value(value=x["v"], is_uri=x["e"]) diff --git a/trustgraph-flow/trustgraph/gateway/service.py b/trustgraph-flow/trustgraph/gateway/service.py index faa250dc..e927ecf6 100755 --- a/trustgraph-flow/trustgraph/gateway/service.py +++ b/trustgraph-flow/trustgraph/gateway/service.py @@ -22,10 +22,10 @@ import pulsar from pulsar.schema import JsonSchema from prometheus_client import start_http_server -from ... log_level import LogLevel +from .. log_level import LogLevel -from ... schema import Metadata, Document, TextDocument -from ... schema import document_ingest_queue, text_ingest_queue +from .. schema import Metadata, Document, TextDocument +from .. schema import document_ingest_queue, text_ingest_queue from . serialize import to_subgraph from . running import Running diff --git a/trustgraph-flow/trustgraph/gateway/text_completion.py b/trustgraph-flow/trustgraph/gateway/text_completion.py index d9f69b7e..d59737f0 100644 --- a/trustgraph-flow/trustgraph/gateway/text_completion.py +++ b/trustgraph-flow/trustgraph/gateway/text_completion.py @@ -1,7 +1,7 @@ -from ... schema import TextCompletionRequest, TextCompletionResponse -from ... schema import text_completion_request_queue -from ... schema import text_completion_response_queue +from .. schema import TextCompletionRequest, TextCompletionResponse +from .. schema import text_completion_request_queue +from .. schema import text_completion_response_queue from . endpoint import ServiceEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/triples_load.py b/trustgraph-flow/trustgraph/gateway/triples_load.py index dbb3e617..2689f3ad 100644 --- a/trustgraph-flow/trustgraph/gateway/triples_load.py +++ b/trustgraph-flow/trustgraph/gateway/triples_load.py @@ -4,9 +4,9 @@ from pulsar.schema import JsonSchema import uuid from aiohttp import WSMsgType -from ... schema import Metadata -from ... schema import Triples -from ... schema import triples_store_queue +from .. 
schema import Metadata +from .. schema import Triples +from .. schema import triples_store_queue from . publisher import Publisher from . socket import SocketEndpoint diff --git a/trustgraph-flow/trustgraph/gateway/triples_query.py b/trustgraph-flow/trustgraph/gateway/triples_query.py index 9c5939c8..5a0cfff8 100644 --- a/trustgraph-flow/trustgraph/gateway/triples_query.py +++ b/trustgraph-flow/trustgraph/gateway/triples_query.py @@ -1,7 +1,7 @@ -from ... schema import TriplesQueryRequest, TriplesQueryResponse, Triples -from ... schema import triples_request_queue -from ... schema import triples_response_queue +from .. schema import TriplesQueryRequest, TriplesQueryResponse, Triples +from .. schema import triples_request_queue +from .. schema import triples_response_queue from . endpoint import ServiceEndpoint from . serialize import to_value, serialize_subgraph diff --git a/trustgraph-flow/trustgraph/gateway/triples_stream.py b/trustgraph-flow/trustgraph/gateway/triples_stream.py index 4638e08d..92ada132 100644 --- a/trustgraph-flow/trustgraph/gateway/triples_stream.py +++ b/trustgraph-flow/trustgraph/gateway/triples_stream.py @@ -4,8 +4,8 @@ import queue from pulsar.schema import JsonSchema import uuid -from ... schema import Triples -from ... schema import triples_store_queue +from .. schema import Triples +from .. schema import triples_store_queue from . subscriber import Subscriber from . 
socket import SocketEndpoint From fd3db3c925bf7ef28ef795291627ebc885edf7d2 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 6 Dec 2024 15:16:09 +0000 Subject: [PATCH 32/37] Feature/tidy kg load save (#198) * Clean exit on ctrl-C * More functionality in dump * Dump some metadata --- test-api/test-llm-api | 3 + trustgraph-cli/scripts/tg-dump-msgpack | 56 +++++++++++++++++- trustgraph-cli/scripts/tg-load-kg-core | 77 +++++++++++++++++++------ trustgraph-cli/scripts/tg-save-kg-core | 78 ++++++++++++++++++++------ 4 files changed, 177 insertions(+), 37 deletions(-) diff --git a/test-api/test-llm-api b/test-api/test-llm-api index c33c6634..6bee2048 100755 --- a/test-api/test-llm-api +++ b/test-api/test-llm-api @@ -19,6 +19,9 @@ resp = requests.post( json=input, ) +if resp.status_code != 200: + raise RuntimeError(f"Status code: {resp.status_code}") + resp = resp.json() if "error" in resp: diff --git a/trustgraph-cli/scripts/tg-dump-msgpack b/trustgraph-cli/scripts/tg-dump-msgpack index 2be950db..18819649 100755 --- a/trustgraph-cli/scripts/tg-dump-msgpack +++ b/trustgraph-cli/scripts/tg-dump-msgpack @@ -10,7 +10,7 @@ import msgpack import sys import argparse -def run(input_file): +def dump(input_file, action): with open(input_file, 'rb') as f: @@ -19,6 +19,43 @@ def run(input_file): for unpacked in unpacker: print(unpacked) +def summary(input_file, action): + + vector_dim = None + + triples = set() + + max_records = 1000000 + + with open(input_file, 'rb') as f: + + unpacker = msgpack.Unpacker(f, raw=False) + + rec_count = 0 + + for msg in unpacker: + + if msg[0] == "ge": + vector_dim = len(msg[1]["v"][0]) + + if msg[0] == "t": + + for elt in msg[1]["m"]["m"]: + triples.add(( + elt["s"]["v"], + elt["p"]["v"], + elt["o"]["v"], + )) + + if rec_count > max_records: break + rec_count += 1 + + print("Vector dimension:", vector_dim) + + for t in triples: + if t[1] == "http://www.w3.org/2000/01/rdf-schema#label": + print("-", t[2]) + def main(): parser = 
argparse.ArgumentParser( @@ -32,9 +69,24 @@ def main(): help=f'Input file' ) + parser.add_argument( + '-s', '--summary', action="store_const", const="summary", + dest="action", + help=f'Show a summary' + ) + + parser.add_argument( + '-r', '--records', action="store_const", const="records", + dest="action", + help=f'Dump individual records' + ) + args = parser.parse_args() - run(**vars(args)) + if args.action == "summary": + summary(**vars(args)) + else: + dump(**vars(args)) main() diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core index 4e207cf1..5c2ae140 100755 --- a/trustgraph-cli/scripts/tg-load-kg-core +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -12,16 +12,25 @@ import json import sys import argparse import os +import signal -async def load_ge(queue, url): +class Running: + def __init__(self): self.running = True + def get(self): return self.running + def stop(self): self.running = False + +async def load_ge(running, queue, url): async with aiohttp.ClientSession() as session: async with session.ws_connect(f"{url}load/graph-embeddings") as ws: - while True: + while running.get(): - msg = await queue.get() + try: + msg = await asyncio.wait_for(queue.get(), 1) + except TimeoutError: + continue msg = { "metadata": { @@ -36,13 +45,18 @@ async def load_ge(queue, url): await ws.send_json(msg) -async def load_triples(queue, url): +async def load_triples(running, queue, url): + async with aiohttp.ClientSession() as session: + async with session.ws_connect(f"{url}load/triples") as ws: - while True: + while running.get(): - msg = await queue.get() + try: + msg = await asyncio.wait_for(queue.get(), 1) + except TimeoutError: + continue msg ={ "metadata": { @@ -59,18 +73,18 @@ async def load_triples(queue, url): ge_counts = 0 t_counts = 0 -async def stats(): +async def stats(running): global t_counts global ge_counts - while True: - await asyncio.sleep(5) + while running.get(): + await asyncio.sleep(2) print( f"Graph embeddings: 
{ge_counts:10d} Triples: {t_counts:10d}" ) -async def loader(ge_queue, t_queue, path, format, user, collection): +async def loader(running, ge_queue, t_queue, path, format, user, collection): global t_counts global ge_counts @@ -85,7 +99,12 @@ async def loader(ge_queue, t_queue, path, format, user, collection): unpacker = msgpack.Unpacker(f, raw=False) - for unpacked in unpacker: + while running.get(): + + try: + unpacked = unpacker.unpack() + except: + break if user: unpacked["metadata"]["user"] = user @@ -94,14 +113,25 @@ async def loader(ge_queue, t_queue, path, format, user, collection): unpacked["metadata"]["collection"] = collection if unpacked[0] == "t": - await t_queue.put(unpacked[1]) + qtype = t_queue t_counts += 1 else: if unpacked[0] == "ge": - await ge_queue.put(unpacked[1]) + qtype = ge_queue ge_counts += 1 -async def run(**args): + while running.get(): + + try: + await asyncio.wait_for(qtype.put(unpacked[1]), 0.5) + except TimeoutError: + continue + + if not running.get(): break + + running.stop() + +async def run(running, **args): # Maxsize on queues reduces back-pressure so tg-load-kg-core doesn't # grow to eat all memory @@ -110,6 +140,7 @@ async def run(**args): load_task = asyncio.create_task( loader( + running=running, ge_queue=ge_q, t_queue=t_q, path=args["input_file"], format=args["format"], user=args["user"], collection=args["collection"], @@ -119,24 +150,26 @@ async def run(**args): ge_task = asyncio.create_task( load_ge( + running=running, queue=ge_q, url=args["url"] + "api/v1/" ) ) triples_task = asyncio.create_task( load_triples( + running=running, queue=t_q, url=args["url"] + "api/v1/" ) ) - stats_task = asyncio.create_task(stats()) + stats_task = asyncio.create_task(stats(running)) await load_task await triples_task await ge_task await stats_task -async def main(): +async def main(running): parser = argparse.ArgumentParser( prog='tg-load-kg-core', @@ -179,7 +212,15 @@ async def main(): args = parser.parse_args() - await 
run(**vars(args)) + await run(running, **vars(args)) -asyncio.run(main()) +running = Running() + +def interrupt(sig, frame): + running.stop() + print('Interrupt') + +signal.signal(signal.SIGINT, interrupt) + +asyncio.run(main(running)) diff --git a/trustgraph-cli/scripts/tg-save-kg-core b/trustgraph-cli/scripts/tg-save-kg-core index 3c03383f..f2509dba 100755 --- a/trustgraph-cli/scripts/tg-save-kg-core +++ b/trustgraph-cli/scripts/tg-save-kg-core @@ -16,11 +16,26 @@ import json import sys import argparse import os +import signal + +class Running: + def __init__(self): self.running = True + def get(self): return self.running + def stop(self): self.running = False + +async def fetch_ge(running, queue, user, collection, url): -async def fetch_ge(queue, user, collection, url): async with aiohttp.ClientSession() as session: + async with session.ws_connect(f"{url}stream/graph-embeddings") as ws: - async for msg in ws: + + while running.get(): + + try: + msg = await asyncio.wait_for(ws.receive(), 1) + except: + continue + if msg.type == aiohttp.WSMsgType.TEXT: data = msg.json() @@ -50,10 +65,19 @@ async def fetch_ge(queue, user, collection, url): print("Error") break -async def fetch_triples(queue, user, collection, url): +async def fetch_triples(running, queue, user, collection, url): + async with aiohttp.ClientSession() as session: + async with session.ws_connect(f"{url}stream/triples") as ws: - async for msg in ws: + + while running.get(): + + try: + msg = await asyncio.wait_for(ws.receive(), 1) + except: + continue + if msg.type == aiohttp.WSMsgType.TEXT: data = msg.json() @@ -85,27 +109,32 @@ async def fetch_triples(queue, user, collection, url): ge_counts = 0 t_counts = 0 -async def stats(): +async def stats(running): global t_counts global ge_counts - while True: - await asyncio.sleep(5) + while running.get(): + + await asyncio.sleep(2) + print( f"Graph embeddings: {ge_counts:10d} Triples: {t_counts:10d}" ) -async def output(queue, path, format): +async def 
output(running, queue, path, format): global t_counts global ge_counts with open(path, "wb") as f: - while True: + while running.get(): - msg = await queue.get() + try: + msg = await asyncio.wait_for(queue.get(), 0.5) + except TimeoutError: + continue if format == "msgpack": f.write(msgpack.packb(msg, use_bin_type=True)) @@ -118,12 +147,15 @@ async def output(queue, path, format): if msg[0] == "ge": ge_counts += 1 -async def run(**args): + print("Output file closed") + +async def run(running, **args): q = asyncio.Queue() ge_task = asyncio.create_task( fetch_ge( + running=running, queue=q, user=args["user"], collection=args["collection"], url=args["url"] + "api/v1/" ) @@ -131,26 +163,30 @@ async def run(**args): triples_task = asyncio.create_task( fetch_triples( - queue=q, user=args["user"], collection=args["collection"], + running=running, queue=q, + user=args["user"], collection=args["collection"], url=args["url"] + "api/v1/" ) ) output_task = asyncio.create_task( output( - queue=q, path=args["output_file"], format=args["format"], + running=running, queue=q, + path=args["output_file"], format=args["format"], ) ) - stats_task = asyncio.create_task(stats()) + stats_task = asyncio.create_task(stats(running)) await output_task await triples_task await ge_task await stats_task -async def main(): + print("Exiting") + +async def main(running): parser = argparse.ArgumentParser( prog='tg-save-kg-core', @@ -193,7 +229,15 @@ async def main(): args = parser.parse_args() - await run(**vars(args)) + await run(running, **vars(args)) -asyncio.run(main()) +running = Running() + +def interrupt(sig, frame): + running.stop() + print('Interrupt') + +signal.signal(signal.SIGINT, interrupt) + +asyncio.run(main(running)) From 656dcb22a92a77a1effda9530d3cb1c72c13199d Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 6 Dec 2024 23:56:10 +0000 Subject: [PATCH 33/37] Feature/general websocket (#199) * Split API endpoint into endpoint and requestor * Service/endpoint separation * Call 
out to multiple services working * Add ID field * Add mux service on websocket, calls out to all services --- trustgraph-flow/trustgraph/gateway/agent.py | 27 ++-- trustgraph-flow/trustgraph/gateway/dbpedia.py | 9 +- .../trustgraph/gateway/embeddings.py | 11 +- .../trustgraph/gateway/encyclopedia.py | 9 +- .../trustgraph/gateway/endpoint.py | 115 ++--------------- .../trustgraph/gateway/graph_rag.py | 9 +- .../trustgraph/gateway/internet_search.py | 9 +- trustgraph-flow/trustgraph/gateway/mux.py | 94 ++++++++++++++ trustgraph-flow/trustgraph/gateway/prompt.py | 11 +- .../trustgraph/gateway/requestor.py | 88 +++++++++++++ trustgraph-flow/trustgraph/gateway/service.py | 122 ++++++++++++------ .../trustgraph/gateway/text_completion.py | 10 +- .../trustgraph/gateway/triples_query.py | 9 +- 13 files changed, 330 insertions(+), 193 deletions(-) create mode 100644 trustgraph-flow/trustgraph/gateway/mux.py create mode 100644 trustgraph-flow/trustgraph/gateway/requestor.py diff --git a/trustgraph-flow/trustgraph/gateway/agent.py b/trustgraph-flow/trustgraph/gateway/agent.py index e8fd0e72..c7af947b 100644 --- a/trustgraph-flow/trustgraph/gateway/agent.py +++ b/trustgraph-flow/trustgraph/gateway/agent.py @@ -3,20 +3,19 @@ from .. schema import AgentRequest, AgentResponse from .. schema import agent_request_queue from .. schema import agent_response_queue -from . endpoint import MultiResponseServiceEndpoint +from . endpoint import ServiceEndpoint +from . 
requestor import ServiceRequestor -class AgentEndpoint(MultiResponseServiceEndpoint): +class AgentRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(AgentEndpoint, self).__init__( + super(AgentRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=agent_request_queue, response_queue=agent_response_queue, request_schema=AgentRequest, response_schema=AgentResponse, - endpoint_path="/api/v1/agent", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -25,7 +24,19 @@ class AgentEndpoint(MultiResponseServiceEndpoint): ) def from_response(self, message): + resp = { + } + if message.answer: - return { "answer": message.answer }, True - else: - return {}, False + resp["answer"] = message.answer + + if message.thought: + resp["thought"] = message.thought + + if message.observation: + resp["observation"] = message.observation + + # The 2nd boolean expression indicates whether we're done responding + return resp, (message.answer is not None) + + diff --git a/trustgraph-flow/trustgraph/gateway/dbpedia.py b/trustgraph-flow/trustgraph/gateway/dbpedia.py index a61292a6..8ae4f695 100644 --- a/trustgraph-flow/trustgraph/gateway/dbpedia.py +++ b/trustgraph-flow/trustgraph/gateway/dbpedia.py @@ -4,19 +4,18 @@ from .. schema import dbpedia_lookup_request_queue from .. schema import dbpedia_lookup_response_queue from . endpoint import ServiceEndpoint +from . 
requestor import ServiceRequestor -class DbpediaEndpoint(ServiceEndpoint): +class DbpediaRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(DbpediaEndpoint, self).__init__( + super(DbpediaRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=dbpedia_lookup_request_queue, response_queue=dbpedia_lookup_response_queue, request_schema=LookupRequest, response_schema=LookupResponse, - endpoint_path="/api/v1/dbpedia", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -26,5 +25,5 @@ class DbpediaEndpoint(ServiceEndpoint): ) def from_response(self, message): - return { "text": message.text } + return { "text": message.text }, True diff --git a/trustgraph-flow/trustgraph/gateway/embeddings.py b/trustgraph-flow/trustgraph/gateway/embeddings.py index 6d3a9fe6..d0f3e1ef 100644 --- a/trustgraph-flow/trustgraph/gateway/embeddings.py +++ b/trustgraph-flow/trustgraph/gateway/embeddings.py @@ -4,19 +4,18 @@ from .. schema import embeddings_request_queue from .. schema import embeddings_response_queue from . endpoint import ServiceEndpoint +from . 
requestor import ServiceRequestor -class EmbeddingsEndpoint(ServiceEndpoint): +class EmbeddingsRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(EmbeddingsEndpoint, self).__init__( + super(EmbeddingsRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=embeddings_request_queue, response_queue=embeddings_response_queue, request_schema=EmbeddingsRequest, response_schema=EmbeddingsResponse, - endpoint_path="/api/v1/embeddings", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -25,4 +24,6 @@ class EmbeddingsEndpoint(ServiceEndpoint): ) def from_response(self, message): - return { "vectors": message.vectors } + return { "vectors": message.vectors }, True + + diff --git a/trustgraph-flow/trustgraph/gateway/encyclopedia.py b/trustgraph-flow/trustgraph/gateway/encyclopedia.py index 32eb5cd1..3f4dad79 100644 --- a/trustgraph-flow/trustgraph/gateway/encyclopedia.py +++ b/trustgraph-flow/trustgraph/gateway/encyclopedia.py @@ -4,19 +4,18 @@ from .. schema import encyclopedia_lookup_request_queue from .. schema import encyclopedia_lookup_response_queue from . endpoint import ServiceEndpoint +from . 
requestor import ServiceRequestor -class EncyclopediaEndpoint(ServiceEndpoint): +class EncyclopediaRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(EncyclopediaEndpoint, self).__init__( + super(EncyclopediaRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=encyclopedia_lookup_request_queue, response_queue=encyclopedia_lookup_response_queue, request_schema=LookupRequest, response_schema=LookupResponse, - endpoint_path="/api/v1/encyclopedia", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -26,5 +25,5 @@ class EncyclopediaEndpoint(ServiceEndpoint): ) def from_response(self, message): - return { "text": message.text } + return { "text": message.text }, True diff --git a/trustgraph-flow/trustgraph/gateway/endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint.py index 2b246361..6d6ca8d5 100644 --- a/trustgraph-flow/trustgraph/gateway/endpoint.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint.py @@ -13,38 +13,17 @@ logger.setLevel(logging.INFO) class ServiceEndpoint: - def __init__( - self, - pulsar_host, - request_queue, request_schema, - response_queue, response_schema, - endpoint_path, - auth, - subscription="api-gateway", consumer_name="api-gateway", - timeout=600, - ): - - self.pub = Publisher( - pulsar_host, request_queue, - schema=JsonSchema(request_schema) - ) - - self.sub = Subscriber( - pulsar_host, response_queue, - subscription, consumer_name, - JsonSchema(response_schema) - ) + def __init__(self, endpoint_path, auth, requestor): self.path = endpoint_path - self.timeout = timeout - self.auth = auth + self.auth = auth self.operation = "service" - async def start(self): + self.requestor = requestor - self.pub.start() - self.sub.start() + async def start(self): + await self.requestor.start() def add_routes(self, app): @@ -52,16 +31,8 @@ class ServiceEndpoint: web.post(self.path, self.handle), ]) - def to_request(self, request): - raise RuntimeError("Not defined") - - def from_response(self, 
response): - raise RuntimeError("Not defined") - async def handle(self, request): - id = str(uuid.uuid4()) - print(request.path, "...") try: @@ -82,28 +53,12 @@ class ServiceEndpoint: print(data) - q = self.sub.subscribe(id) + def responder(x, fin): + print(x) - await asyncio.to_thread( - self.pub.send, id, self.to_request(data) - ) + resp, fin = await self.requestor.process(data, responder) - try: - resp = await asyncio.to_thread(q.get, timeout=self.timeout) - except Exception as e: - raise RuntimeError("Timeout") - - print(resp) - - if resp.error: - print("Error") - return web.json_response( - { "error": resp.error.message } - ) - - return web.json_response( - self.from_response(resp) - ) + return web.json_response(resp) except Exception as e: logging.error(f"Exception: {e}") @@ -112,55 +67,3 @@ class ServiceEndpoint: { "error": str(e) } ) - finally: - self.sub.unsubscribe(id) - - -class MultiResponseServiceEndpoint(ServiceEndpoint): - - async def handle(self, request): - - id = str(uuid.uuid4()) - - try: - - data = await request.json() - - q = self.sub.subscribe(id) - - await asyncio.to_thread( - self.pub.send, id, self.to_request(data) - ) - - # Keeps looking at responses... 
- - while True: - - try: - resp = await asyncio.to_thread(q.get, timeout=self.timeout) - except Exception as e: - raise RuntimeError("Timeout waiting for response") - - if resp.error: - return web.json_response( - { "error": resp.error.message } - ) - - # Until from_response says we have a finished answer - resp, fin = self.from_response(resp) - - - if fin: - return web.json_response(resp) - - # Not finished, so loop round and continue - - except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - - finally: - self.sub.unsubscribe(id) diff --git a/trustgraph-flow/trustgraph/gateway/graph_rag.py b/trustgraph-flow/trustgraph/gateway/graph_rag.py index 58679004..55fd5d2f 100644 --- a/trustgraph-flow/trustgraph/gateway/graph_rag.py +++ b/trustgraph-flow/trustgraph/gateway/graph_rag.py @@ -4,19 +4,18 @@ from .. schema import graph_rag_request_queue from .. schema import graph_rag_response_queue from . endpoint import ServiceEndpoint +from . 
requestor import ServiceRequestor -class GraphRagEndpoint(ServiceEndpoint): +class GraphRagRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(GraphRagEndpoint, self).__init__( + super(GraphRagRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=graph_rag_request_queue, response_queue=graph_rag_response_queue, request_schema=GraphRagQuery, response_schema=GraphRagResponse, - endpoint_path="/api/v1/graph-rag", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -27,5 +26,5 @@ class GraphRagEndpoint(ServiceEndpoint): ) def from_response(self, message): - return { "response": message.response } + return { "response": message.response }, True diff --git a/trustgraph-flow/trustgraph/gateway/internet_search.py b/trustgraph-flow/trustgraph/gateway/internet_search.py index 5a5dc948..127cd5d1 100644 --- a/trustgraph-flow/trustgraph/gateway/internet_search.py +++ b/trustgraph-flow/trustgraph/gateway/internet_search.py @@ -4,19 +4,18 @@ from .. schema import internet_search_request_queue from .. schema import internet_search_response_queue from . endpoint import ServiceEndpoint +from . 
requestor import ServiceRequestor -class InternetSearchEndpoint(ServiceEndpoint): +class InternetSearchRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(InternetSearchEndpoint, self).__init__( + super(InternetSearchRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=internet_search_request_queue, response_queue=internet_search_response_queue, request_schema=LookupRequest, response_schema=LookupResponse, - endpoint_path="/api/v1/internet-search", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -26,5 +25,5 @@ class InternetSearchEndpoint(ServiceEndpoint): ) def from_response(self, message): - return { "text": message.text } + return { "text": message.text }, True diff --git a/trustgraph-flow/trustgraph/gateway/mux.py b/trustgraph-flow/trustgraph/gateway/mux.py new file mode 100644 index 00000000..cd5ddfba --- /dev/null +++ b/trustgraph-flow/trustgraph/gateway/mux.py @@ -0,0 +1,94 @@ + +import asyncio +import queue +from pulsar.schema import JsonSchema +import uuid +from aiohttp import web, WSMsgType + +from . socket import SocketEndpoint +from . 
text_completion import TextCompletionRequestor + +class MuxEndpoint(SocketEndpoint): + + def __init__( + self, pulsar_host, auth, + services, + path="/api/v1/mux", + ): + + super(MuxEndpoint, self).__init__( + endpoint_path=path, auth=auth, + ) + + self.q = asyncio.Queue(maxsize=10) + + self.services = services + + async def start(self): + pass + + async def async_thread(self, ws, running): + + while running.get(): + + try: + id, svc, request = await asyncio.wait_for(self.q.get(), 1) + except TimeoutError: + continue + except Exception as e: + await ws.send_json({"id": id, "error": str(e)}) + + try: + + print(svc, request) + + requestor = self.services[svc] + + async def responder(resp, fin): + await ws.send_json({ + "id": id, + "response": resp, + "complete": fin, + }) + + resp = await requestor.process(request, responder) + + except Exception as e: + + await ws.send_json({"error": str(e)}) + + running.stop() + + async def listener(self, ws, running): + + async for msg in ws: + + # On error, finish + if msg.type == WSMsgType.ERROR: + break + else: + + try: + + data = msg.json() + + if data["service"] not in self.services: + raise RuntimeError("Bad service") + + if "request" not in data: + raise RuntimeError("Bad message") + + if "id" not in data: + raise RuntimeError("Bad message") + + await self.q.put( + (data["id"], data["service"], data["request"]) + ) + + except Exception as e: + + await ws.send_json({"error": str(e)}) + continue + + running.stop() + diff --git a/trustgraph-flow/trustgraph/gateway/prompt.py b/trustgraph-flow/trustgraph/gateway/prompt.py index f09a0e0e..080d5618 100644 --- a/trustgraph-flow/trustgraph/gateway/prompt.py +++ b/trustgraph-flow/trustgraph/gateway/prompt.py @@ -6,19 +6,18 @@ from .. schema import prompt_request_queue from .. schema import prompt_response_queue from . endpoint import ServiceEndpoint +from . 
requestor import ServiceRequestor -class PromptEndpoint(ServiceEndpoint): +class PromptRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(PromptEndpoint, self).__init__( + super(PromptRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=prompt_request_queue, response_queue=prompt_response_queue, request_schema=PromptRequest, response_schema=PromptResponse, - endpoint_path="/api/v1/prompt", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -34,9 +33,9 @@ class PromptEndpoint(ServiceEndpoint): if message.object: return { "object": message.object - } + }, True else: return { "text": message.text - } + }, True diff --git a/trustgraph-flow/trustgraph/gateway/requestor.py b/trustgraph-flow/trustgraph/gateway/requestor.py new file mode 100644 index 00000000..5f6e2692 --- /dev/null +++ b/trustgraph-flow/trustgraph/gateway/requestor.py @@ -0,0 +1,88 @@ + +import asyncio +from pulsar.schema import JsonSchema +import uuid +import logging + +from . publisher import Publisher +from . 
subscriber import Subscriber + +logger = logging.getLogger("requestor") +logger.setLevel(logging.INFO) + +class ServiceRequestor: + + def __init__( + self, + pulsar_host, + request_queue, request_schema, + response_queue, response_schema, + subscription="api-gateway", consumer_name="api-gateway", + timeout=600, + ): + + self.pub = Publisher( + pulsar_host, request_queue, + schema=JsonSchema(request_schema) + ) + + self.sub = Subscriber( + pulsar_host, response_queue, + subscription, consumer_name, + JsonSchema(response_schema) + ) + + self.timeout = timeout + + async def start(self): + + self.pub.start() + self.sub.start() + + def to_request(self, request): + raise RuntimeError("Not defined") + + def from_response(self, response): + raise RuntimeError("Not defined") + + async def process(self, request, responder=None): + + id = str(uuid.uuid4()) + + try: + + q = self.sub.subscribe(id) + + await asyncio.to_thread( + self.pub.send, id, self.to_request(request) + ) + + while True: + + try: + resp = await asyncio.to_thread(q.get, timeout=self.timeout) + except Exception as e: + raise RuntimeError("Timeout") + + if resp.error: + return { "error": resp.error.message } + + resp, fin = self.from_response(resp) + + print(resp, fin) + + if responder: + await responder(resp, fin) + + if fin: + return resp + + except Exception as e: + + logging.error(f"Exception: {e}") + + return { "error": str(e) } + + finally: + self.sub.unsubscribe(id) + diff --git a/trustgraph-flow/trustgraph/gateway/service.py b/trustgraph-flow/trustgraph/gateway/service.py index e927ecf6..6a8a62eb 100755 --- a/trustgraph-flow/trustgraph/gateway/service.py +++ b/trustgraph-flow/trustgraph/gateway/service.py @@ -31,20 +31,22 @@ from . serialize import to_subgraph from . running import Running from . publisher import Publisher from . subscriber import Subscriber -from . endpoint import ServiceEndpoint, MultiResponseServiceEndpoint -from . text_completion import TextCompletionEndpoint -from . 
prompt import PromptEndpoint -from . graph_rag import GraphRagEndpoint -from . triples_query import TriplesQueryEndpoint -from . embeddings import EmbeddingsEndpoint -from . encyclopedia import EncyclopediaEndpoint -from . agent import AgentEndpoint -from . dbpedia import DbpediaEndpoint -from . internet_search import InternetSearchEndpoint +from . text_completion import TextCompletionRequestor +from . prompt import PromptRequestor +from . graph_rag import GraphRagRequestor +from . triples_query import TriplesQueryRequestor +from . embeddings import EmbeddingsRequestor +from . encyclopedia import EncyclopediaRequestor +from . agent import AgentRequestor +from . dbpedia import DbpediaRequestor +from . internet_search import InternetSearchRequestor from . triples_stream import TriplesStreamEndpoint from . graph_embeddings_stream import GraphEmbeddingsStreamEndpoint from . triples_load import TriplesLoadEndpoint from . graph_embeddings_load import GraphEmbeddingsLoadEndpoint +from . mux import MuxEndpoint + +from . endpoint import ServiceEndpoint from . 
auth import Authenticator logger = logging.getLogger("api") @@ -76,42 +78,81 @@ class Api: else: self.auth = Authenticator(allow_all=True) + self.services = { + "text-completion": TextCompletionRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "prompt": PromptRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "graph-rag": GraphRagRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "triples-query": TriplesQueryRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "embeddings": EmbeddingsRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "agent": AgentRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "encyclopedia": EncyclopediaRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "dbpedia": DbpediaRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + "internet-search": InternetSearchRequestor( + pulsar_host=self.pulsar_host, timeout=self.timeout, + auth = self.auth, + ), + } + self.endpoints = [ - TextCompletionEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/text-completion", auth=self.auth, + requestor = self.services["text-completion"], ), - PromptEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/prompt", auth=self.auth, + requestor = self.services["prompt"], ), - GraphRagEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/graph-rag", auth=self.auth, + requestor = self.services["graph-rag"], ), - TriplesQueryEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + 
endpoint_path = "/api/v1/triples-query", auth=self.auth, + requestor = self.services["triples-query"], ), - EmbeddingsEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/embeddings", auth=self.auth, + requestor = self.services["embeddings"], ), - AgentEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/agent", auth=self.auth, + requestor = self.services["agent"], ), - EncyclopediaEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/encyclopedia", auth=self.auth, + requestor = self.services["encyclopedia"], ), - DbpediaEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/dbpedia", auth=self.auth, + requestor = self.services["dbpedia"], ), - InternetSearchEndpoint( - pulsar_host=self.pulsar_host, timeout=self.timeout, - auth = self.auth, + ServiceEndpoint( + endpoint_path = "/api/v1/internet-search", auth=self.auth, + requestor = self.services["internet-search"], ), TriplesStreamEndpoint( pulsar_host=self.pulsar_host, @@ -129,6 +170,11 @@ pulsar_host=self.pulsar_host, auth = self.auth, ), + MuxEndpoint( + pulsar_host=self.pulsar_host, + auth = self.auth, + services = self.services, + ), ] self.document_out = Publisher( @@ -162,7 +208,7 @@ else: metadata = [] - # Doing a base64 decode/encode here to make sure the + # Doing a base64 decode/encode here to make sure the content is valid base64 doc = base64.b64decode(data["data"]) diff --git a/trustgraph-flow/trustgraph/gateway/text_completion.py b/trustgraph-flow/trustgraph/gateway/text_completion.py index d59737f0..7291fc88 100644 --- a/trustgraph-flow/trustgraph/gateway/text_completion.py +++ b/trustgraph-flow/trustgraph/gateway/text_completion.py @@ -4,19 +4,18 @@ from ..
schema import text_completion_request_queue from .. schema import text_completion_response_queue from . endpoint import ServiceEndpoint +from . requestor import ServiceRequestor -class TextCompletionEndpoint(ServiceEndpoint): +class TextCompletionRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(TextCompletionEndpoint, self).__init__( + super(TextCompletionRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=text_completion_request_queue, response_queue=text_completion_response_queue, request_schema=TextCompletionRequest, response_schema=TextCompletionResponse, - endpoint_path="/api/v1/text-completion", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -26,4 +25,5 @@ class TextCompletionEndpoint(ServiceEndpoint): ) def from_response(self, message): - return { "response": message.response } + return { "response": message.response }, True + diff --git a/trustgraph-flow/trustgraph/gateway/triples_query.py b/trustgraph-flow/trustgraph/gateway/triples_query.py index 5a0cfff8..0ea7cd8d 100644 --- a/trustgraph-flow/trustgraph/gateway/triples_query.py +++ b/trustgraph-flow/trustgraph/gateway/triples_query.py @@ -4,20 +4,19 @@ from .. schema import triples_request_queue from .. schema import triples_response_queue from . endpoint import ServiceEndpoint +from . requestor import ServiceRequestor from . 
serialize import to_value, serialize_subgraph -class TriplesQueryEndpoint(ServiceEndpoint): +class TriplesQueryRequestor(ServiceRequestor): def __init__(self, pulsar_host, timeout, auth): - super(TriplesQueryEndpoint, self).__init__( + super(TriplesQueryRequestor, self).__init__( pulsar_host=pulsar_host, request_queue=triples_request_queue, response_queue=triples_response_queue, request_schema=TriplesQueryRequest, response_schema=TriplesQueryResponse, - endpoint_path="/api/v1/triples-query", timeout=timeout, - auth=auth, ) def to_request(self, body): @@ -50,5 +49,5 @@ class TriplesQueryEndpoint(ServiceEndpoint): print(message) return { "response": serialize_subgraph(message.triples) - } + }, True From a714221b2291286b51bc612cd005c0125c445e2d Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Sat, 7 Dec 2024 00:16:52 +0000 Subject: [PATCH 34/37] Add memgraph cypher LIMIT support (#200) --- .../query/triples/memgraph/service.py | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/trustgraph-flow/trustgraph/query/triples/memgraph/service.py b/trustgraph-flow/trustgraph/query/triples/memgraph/service.py index 5144f781..46dd19e3 100755 --- a/trustgraph-flow/trustgraph/query/triples/memgraph/service.py +++ b/trustgraph-flow/trustgraph/query/triples/memgraph/service.py @@ -79,7 +79,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Literal {value: $value}) " - "RETURN $src as src", + "RETURN $src as src " + "LIMIT " + str(v.limit), src=v.s.value, rel=v.p.value, value=v.o.value, database_=self.db, ) @@ -89,7 +90,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Node {uri: $uri}) " - "RETURN $src as src", + "RETURN $src as src " + "LIMIT " + str(v.limit), src=v.s.value, rel=v.p.value, uri=v.o.value, database_=self.db, ) @@ -103,7 +105,8 @@ class 
Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Literal) " - "RETURN dest.value as dest", + "RETURN dest.value as dest " + "LIMIT " + str(v.limit), src=v.s.value, rel=v.p.value, database_=self.db, ) @@ -114,7 +117,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Node) " - "RETURN dest.uri as dest", + "RETURN dest.uri as dest " + "LIMIT " + str(v.limit), src=v.s.value, rel=v.p.value, database_=self.db, ) @@ -131,7 +135,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Literal {value: $value}) " - "RETURN rel.uri as rel", + "RETURN rel.uri as rel " + "LIMIT " + str(v.limit), src=v.s.value, value=v.o.value, database_=self.db, ) @@ -142,7 +147,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Node {uri: $uri}) " - "RETURN rel.uri as rel", + "RETURN rel.uri as rel " + "LIMIT " + str(v.limit), src=v.s.value, uri=v.o.value, database_=self.db, ) @@ -157,7 +163,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Literal) " - "RETURN rel.uri as rel, dest.value as dest", + "RETURN rel.uri as rel, dest.value as dest " + "LIMIT " + str(v.limit), src=v.s.value, database_=self.db, ) @@ -168,7 +175,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node {uri: $src})-[rel:Rel]->(dest:Node) " - "RETURN rel.uri as rel, dest.uri as dest", + "RETURN rel.uri as rel, dest.uri as dest " + "LIMIT " + str(v.limit), src=v.s.value, database_=self.db, ) @@ -188,7 +196,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Literal 
{value: $value}) " - "RETURN src.uri as src", + "RETURN src.uri as src " + "LIMIT " + str(v.limit), uri=v.p.value, value=v.o.value, database_=self.db, ) @@ -199,7 +208,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Node {uri: $uri}) " - "RETURN src.uri as src", + "RETURN src.uri as src " + "LIMIT " + str(v.limit), uri=v.p.value, dest=v.o.value, database_=self.db, ) @@ -214,7 +224,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Literal) " - "RETURN src.uri as src, dest.value as dest", + "RETURN src.uri as src, dest.value as dest " + "LIMIT " + str(v.limit), uri=v.p.value, database_=self.db, ) @@ -225,7 +236,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel {uri: $uri}]->(dest:Node) " - "RETURN src.uri as src, dest.uri as dest", + "RETURN src.uri as src, dest.uri as dest " + "LIMIT " + str(v.limit), uri=v.p.value, database_=self.db, ) @@ -242,7 +254,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel]->(dest:Literal {value: $value}) " - "RETURN src.uri as src, rel.uri as rel", + "RETURN src.uri as src, rel.uri as rel " + "LIMIT " + str(v.limit), value=v.o.value, database_=self.db, ) @@ -253,7 +266,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel]->(dest:Node {uri: $uri}) " - "RETURN src.uri as src, rel.uri as rel", + "RETURN src.uri as src, rel.uri as rel " + "LIMIT " + str(v.limit), uri=v.o.value, database_=self.db, ) @@ -268,7 +282,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel]->(dest:Literal) " - "RETURN src.uri as src, rel.uri as rel, dest.value as dest", + "RETURN src.uri as src, rel.uri as rel, dest.value as dest " + "LIMIT " + 
str(v.limit), database_=self.db, ) @@ -278,7 +293,8 @@ class Processor(ConsumerProducer): records, summary, keys = self.io.execute_query( "MATCH (src:Node)-[rel:Rel]->(dest:Node) " - "RETURN src.uri as src, rel.uri as rel, dest.uri as dest", + "RETURN src.uri as src, rel.uri as rel, dest.uri as dest " + "LIMIT " + str(v.limit), database_=self.db, ) @@ -292,7 +308,7 @@ class Processor(ConsumerProducer): p=self.create_value(t[1]), o=self.create_value(t[2]) ) - for t in triples + for t in triples[:v.limit] ] print("Send response...", flush=True) From cf8c76b5c60d2f45dc8bc848be0f01727b1fd46e Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Mon, 9 Dec 2024 00:01:01 +0000 Subject: [PATCH 35/37] Fix/save core hang (#201) * Working around an exception class change in Python 3.11 --- trustgraph-cli/scripts/tg-load-kg-core | 8 ++++++-- trustgraph-cli/scripts/tg-save-kg-core | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core index 5c2ae140..cada13a7 100755 --- a/trustgraph-cli/scripts/tg-load-kg-core +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -29,7 +29,9 @@ async def load_ge(running, queue, url): try: msg = await asyncio.wait_for(queue.get(), 1) - except TimeoutError: + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. continue msg = { @@ -55,7 +57,9 @@ async def load_triples(running, queue, url): try: msg = await asyncio.wait_for(queue.get(), 1) - except TimeoutError: + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. 
continue msg ={ diff --git a/trustgraph-cli/scripts/tg-save-kg-core b/trustgraph-cli/scripts/tg-save-kg-core index f2509dba..e52cd7dc 100755 --- a/trustgraph-cli/scripts/tg-save-kg-core +++ b/trustgraph-cli/scripts/tg-save-kg-core @@ -133,7 +133,9 @@ async def output(running, queue, path, format): try: msg = await asyncio.wait_for(queue.get(), 0.5) - except TimeoutError: + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. continue if format == "msgpack": From 803f11089107a1bcf3fd702a0f05eec50032c122 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Mon, 9 Dec 2024 00:31:03 +0000 Subject: [PATCH 36/37] Timeout alias error (#202) --- trustgraph-cli/scripts/tg-load-kg-core | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core index cada13a7..3d31dd25 100755 --- a/trustgraph-cli/scripts/tg-load-kg-core +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -128,7 +128,9 @@ async def loader(running, ge_queue, t_queue, path, format, user, collection): try: await asyncio.wait_for(qtype.put(unpacked[1]), 0.5) - except TimeoutError: + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. 
continue if not running.get(): break From 61031270e4cddd5615a00ff17fb8847d2d932db9 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Mon, 9 Dec 2024 12:44:30 +0000 Subject: [PATCH 37/37] Fix loop logic flaws in loader (#203) --- trustgraph-cli/scripts/tg-load-kg-core | 82 +++++++++++++++++++++----- 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/trustgraph-cli/scripts/tg-load-kg-core b/trustgraph-cli/scripts/tg-load-kg-core index 3d31dd25..4e76e525 100755 --- a/trustgraph-cli/scripts/tg-load-kg-core +++ b/trustgraph-cli/scripts/tg-load-kg-core @@ -19,8 +19,13 @@ class Running: def get(self): return self.running def stop(self): self.running = False +ge_counts = 0 +t_counts = 0 + async def load_ge(running, queue, url): + global ge_counts + async with aiohttp.ClientSession() as session: async with session.ws_connect(f"{url}load/graph-embeddings") as ws: @@ -29,6 +34,11 @@ async def load_ge(running, queue, url): try: msg = await asyncio.wait_for(queue.get(), 1) + + # End of load + if msg is None: + break + except: # Hopefully it's TimeoutError. Annoying to match since # it changed in 3.11. @@ -45,10 +55,17 @@ async def load_ge(running, queue, url): "entity": msg["e"], } - await ws.send_json(msg) + try: + await ws.send_json(msg) + except Exception as e: + print(e) + + ge_counts += 1 async def load_triples(running, queue, url): + global t_counts + async with aiohttp.ClientSession() as session: async with session.ws_connect(f"{url}load/triples") as ws: @@ -57,6 +74,11 @@ async def load_triples(running, queue, url): try: msg = await asyncio.wait_for(queue.get(), 1) + + # End of load + if msg is None: + break + except: # Hopefully it's TimeoutError. Annoying to match since # it changed in 3.11. 
@@ -72,10 +94,12 @@ async def load_triples(running, queue, url): "triples": msg["t"], } - await ws.send_json(msg) + try: + await ws.send_json(msg) + except Exception as e: + print(e) -ge_counts = 0 -t_counts = 0 + t_counts += 1 async def stats(running): @@ -83,16 +107,15 @@ async def stats(running): global ge_counts while running.get(): + await asyncio.sleep(2) + print( f"Graph embeddings: {ge_counts:10d} Triples: {t_counts:10d}" ) async def loader(running, ge_queue, t_queue, path, format, user, collection): - global t_counts - global ge_counts - if format == "json": raise RuntimeError("Not implemented") @@ -118,16 +141,18 @@ async def loader(running, ge_queue, t_queue, path, format, user, collection): if unpacked[0] == "t": qtype = t_queue - t_counts += 1 else: if unpacked[0] == "ge": qtype = ge_queue - ge_counts += 1 while running.get(): try: await asyncio.wait_for(qtype.put(unpacked[1]), 0.5) + + # Successful put message, move on + break + except: # Hopefully it's TimeoutError. Annoying to match since # it changed in 3.11. @@ -135,14 +160,40 @@ async def loader(running, ge_queue, t_queue, path, format, user, collection): if not running.get(): break - running.stop() - + # Put 'None' on end of queue to finish + while running.get(): + + try: + await asyncio.wait_for(t_queue.put(None), 1) + + # Successful put message, move on + break + + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. + continue + + # Put 'None' on end of queue to finish + while running.get(): + + try: + await asyncio.wait_for(ge_queue.put(None), 1) + + # Successful put message, move on + break + + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. 
+ continue + async def run(running, **args): # Maxsize on queues reduces back-pressure so tg-load-kg-core doesn't # grow to eat all memory - ge_q = asyncio.Queue(maxsize=500) - t_q = asyncio.Queue(maxsize=500) + ge_q = asyncio.Queue(maxsize=10) + t_q = asyncio.Queue(maxsize=10) load_task = asyncio.create_task( loader( @@ -170,9 +221,12 @@ async def run(running, **args): stats_task = asyncio.create_task(stats(running)) - await load_task await triples_task await ge_task + + running.stop() + + await load_task await stats_task async def main(running):