diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ca3b735..30fc70ff 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -5,7 +5,7 @@ on: workflow_dispatch: push: tags: - - v0.15.* + - v0.17.* permissions: contents: read @@ -48,20 +48,6 @@ jobs: - name: Publish release distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - - name: Create deploy bundle - run: templates/generate-all deploy.zip ${{ steps.version.outputs.VERSION }} - - - uses: ncipollo/release-action@v1 - with: - artifacts: deploy.zip - generateReleaseNotes: true - makeLatest: false - prerelease: true - skipIfReleaseExists: true - - - name: Build container - run: make container VERSION=${{ steps.version.outputs.VERSION }} - - name: Extract metadata for container id: meta uses: docker/metadata-action@v4 @@ -84,3 +70,13 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + - name: Create deploy bundle + run: templates/generate-all deploy.zip ${{ steps.version.outputs.VERSION }} + + - uses: ncipollo/release-action@v1 + with: + artifacts: deploy.zip + generateReleaseNotes: true + makeLatest: false + prerelease: true + skipIfReleaseExists: true diff --git a/Containerfile b/Containerfile index 0d6d357b..c2735feb 100644 --- a/Containerfile +++ b/Containerfile @@ -16,7 +16,7 @@ RUN pip3 install torch --index-url https://download.pytorch.org/whl/cpu RUN pip3 install anthropic boto3 cohere openai google-cloud-aiplatform ollama google-generativeai \ langchain langchain-core langchain-huggingface langchain-text-splitters \ langchain-community pymilvus sentence-transformers transformers \ - huggingface-hub pulsar-client cassandra-driver pyarrow pyyaml \ + huggingface-hub pulsar-client cassandra-driver pyyaml \ neo4j tiktoken && \ pip3 cache purge @@ -32,7 +32,6 @@ COPY trustgraph-base/ /root/build/trustgraph-base/ COPY trustgraph-flow/ /root/build/trustgraph-flow/ COPY trustgraph-vertexai/ 
/root/build/trustgraph-vertexai/ COPY trustgraph-bedrock/ /root/build/trustgraph-bedrock/ -COPY trustgraph-parquet/ /root/build/trustgraph-parquet/ COPY trustgraph-embeddings-hf/ /root/build/trustgraph-embeddings-hf/ COPY trustgraph-cli/ /root/build/trustgraph-cli/ @@ -42,7 +41,6 @@ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-base/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-flow/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-vertexai/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-bedrock/ -RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-parquet/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-embeddings-hf/ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-cli/ @@ -61,7 +59,6 @@ RUN \ pip3 install /root/wheels/trustgraph_flow-* && \ pip3 install /root/wheels/trustgraph_vertexai-* && \ pip3 install /root/wheels/trustgraph_bedrock-* && \ - pip3 install /root/wheels/trustgraph_parquet-* && \ pip3 install /root/wheels/trustgraph_embeddings_hf-* && \ pip3 install /root/wheels/trustgraph_cli-* && \ pip3 cache purge && \ diff --git a/Makefile b/Makefile index 0fb4b175..72d144a9 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,6 @@ wheels: pip3 wheel --no-deps --wheel-dir dist trustgraph-flow/ pip3 wheel --no-deps --wheel-dir dist trustgraph-vertexai/ pip3 wheel --no-deps --wheel-dir dist trustgraph-bedrock/ - pip3 wheel --no-deps --wheel-dir dist trustgraph-parquet/ pip3 wheel --no-deps --wheel-dir dist trustgraph-embeddings-hf/ pip3 wheel --no-deps --wheel-dir dist trustgraph-cli/ @@ -25,7 +24,6 @@ packages: update-package-versions cd trustgraph-flow && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-vertexai && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-bedrock && python3 setup.py sdist --dist-dir ../dist/ - cd trustgraph-parquet && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-embeddings-hf && python3 setup.py sdist --dist-dir ../dist/ cd trustgraph-cli && python3 setup.py 
sdist --dist-dir ../dist/ @@ -41,7 +39,6 @@ update-package-versions: echo __version__ = \"${VERSION}\" > trustgraph-flow/trustgraph/flow_version.py echo __version__ = \"${VERSION}\" > trustgraph-vertexai/trustgraph/vertexai_version.py echo __version__ = \"${VERSION}\" > trustgraph-bedrock/trustgraph/bedrock_version.py - echo __version__ = \"${VERSION}\" > trustgraph-parquet/trustgraph/parquet_version.py echo __version__ = \"${VERSION}\" > trustgraph-embeddings-hf/trustgraph/embeddings_hf_version.py echo __version__ = \"${VERSION}\" > trustgraph-cli/trustgraph/cli_version.py echo __version__ = \"${VERSION}\" > trustgraph/trustgraph/trustgraph_version.py diff --git a/templates/components.jsonnet b/templates/components.jsonnet index ec7f862b..1abf44a4 100644 --- a/templates/components.jsonnet +++ b/templates/components.jsonnet @@ -12,6 +12,7 @@ "graph-rag": import "components/graph-rag.jsonnet", "triple-store-cassandra": import "components/cassandra.jsonnet", "triple-store-neo4j": import "components/neo4j.jsonnet", + "triple-store-memgraph": import "components/memgraph.jsonnet", "llamafile": import "components/llamafile.jsonnet", "ollama": import "components/ollama.jsonnet", "openai": import "components/openai.jsonnet", @@ -25,6 +26,7 @@ "trustgraph-base": import "components/trustgraph.jsonnet", "vector-store-milvus": import "components/milvus.jsonnet", "vector-store-qdrant": import "components/qdrant.jsonnet", + "vector-store-pinecone": import "components/pinecone.jsonnet", "vertexai": import "components/vertexai.jsonnet", "null": {}, @@ -33,7 +35,9 @@ // FIXME: Dupes "cassandra": import "components/cassandra.jsonnet", "neo4j": import "components/neo4j.jsonnet", + "memgraph": import "components/memgraph.jsonnet", "qdrant": import "components/qdrant.jsonnet", + "pinecone": import "components/pinecone.jsonnet", "milvus": import "components/milvus.jsonnet", "trustgraph": import "components/trustgraph.jsonnet", diff --git a/templates/components/azure-openai.jsonnet 
b/templates/components/azure-openai.jsonnet index cc3847c0..8afcaf11 100644 --- a/templates/components/azure-openai.jsonnet +++ b/templates/components/azure-openai.jsonnet @@ -48,7 +48,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/azure.jsonnet b/templates/components/azure.jsonnet index 82b79133..cf10dc66 100644 --- a/templates/components/azure.jsonnet +++ b/templates/components/azure.jsonnet @@ -46,7 +46,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/bedrock.jsonnet b/templates/components/bedrock.jsonnet index 93978a59..6ccaa1c5 100644 --- a/templates/components/bedrock.jsonnet +++ b/templates/components/bedrock.jsonnet @@ -53,7 +53,7 @@ local chunker = import "chunker-recursive.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/claude.jsonnet b/templates/components/claude.jsonnet index c6c94e21..00e4ec79 100644 --- a/templates/components/claude.jsonnet +++ b/templates/components/claude.jsonnet @@ -45,7 +45,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) 
.with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/cohere.jsonnet b/templates/components/cohere.jsonnet index 11c30fbd..5bc9b39c 100644 --- a/templates/components/cohere.jsonnet +++ b/templates/components/cohere.jsonnet @@ -43,7 +43,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); diff --git a/templates/components/document-rag.jsonnet b/templates/components/document-rag.jsonnet index ac5c11ec..0a68dd52 100644 --- a/templates/components/document-rag.jsonnet +++ b/templates/components/document-rag.jsonnet @@ -19,7 +19,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "--prompt-request-queue", "non-persistent://tg/request/prompt-rag", "--prompt-response-queue", - "non-persistent://tg/response/prompt-rag-response", + "non-persistent://tg/response/prompt-rag", ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); diff --git a/templates/components/googleaistudio.jsonnet b/templates/components/googleaistudio.jsonnet index b6ee1d85..c2a40f2c 100644 --- a/templates/components/googleaistudio.jsonnet +++ b/templates/components/googleaistudio.jsonnet @@ -13,7 +13,7 @@ local prompts = import "prompts/mixtral.jsonnet"; create:: function(engine) - local envSecrets = engine.envSecrets("bedrock-credentials") + local envSecrets = engine.envSecrets("googleaistudio-key") .with_env_var("GOOGLE_AI_STUDIO_KEY", "googleaistudio-key"); local container = @@ -50,7 +50,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git 
a/templates/components/graph-rag.jsonnet b/templates/components/graph-rag.jsonnet index c0200d1e..860152c9 100644 --- a/templates/components/graph-rag.jsonnet +++ b/templates/components/graph-rag.jsonnet @@ -112,7 +112,7 @@ local url = import "values/url.jsonnet"; "--prompt-request-queue", "non-persistent://tg/request/prompt-rag", "--prompt-response-queue", - "non-persistent://tg/response/prompt-rag-response", + "non-persistent://tg/response/prompt-rag", "--entity-limit", std.toString($["graph-rag-entity-limit"]), "--triple-limit", diff --git a/templates/components/llamafile.jsonnet b/templates/components/llamafile.jsonnet index d51cda61..bc1a011c 100644 --- a/templates/components/llamafile.jsonnet +++ b/templates/components/llamafile.jsonnet @@ -40,7 +40,7 @@ local prompts = import "prompts/slm.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/memgraph.jsonnet b/templates/components/memgraph.jsonnet new file mode 100644 index 00000000..609da3a2 --- /dev/null +++ b/templates/components/memgraph.jsonnet @@ -0,0 +1,81 @@ +local base = import "base/base.jsonnet"; +local images = import "values/images.jsonnet"; +local url = import "values/url.jsonnet"; +local memgraph = import "stores/memgraph.jsonnet"; + +memgraph + { + + "memgraph-url":: "bolt://memgraph:7687", + "memgraph-database":: "memgraph", + + "store-triples" +: { + + create:: function(engine) + + local container = + engine.container("store-triples") + .with_image(images.trustgraph) + .with_command([ + "triples-write-memgraph", + "-p", + url.pulsar, + "-g", + $["memgraph-url"], + "--database", + $["memgraph-database"], + ]) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "store-triples", [ container ] + ); + + 
local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + containerSet, + service, + ]) + + }, + + "query-triples" +: { + + create:: function(engine) + + local container = + engine.container("query-triples") + .with_image(images.trustgraph) + .with_command([ + "triples-query-memgraph", + "-p", + url.pulsar, + "-g", + $["memgraph-url"], + "--database", + $["memgraph-database"], + ]) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "query-triples", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + containerSet, + service, + ]) + + + } + +} + diff --git a/templates/components/ollama.jsonnet b/templates/components/ollama.jsonnet index 2ae696b4..8da00848 100644 --- a/templates/components/ollama.jsonnet +++ b/templates/components/ollama.jsonnet @@ -40,7 +40,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/openai.jsonnet b/templates/components/openai.jsonnet index 83cbd406..27725cb6 100644 --- a/templates/components/openai.jsonnet +++ b/templates/components/openai.jsonnet @@ -50,7 +50,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_env_var_secrets(envSecrets) .with_limits("0.5", "128M") diff --git a/templates/components/pinecone.jsonnet b/templates/components/pinecone.jsonnet new file mode 100644 index 00000000..3422952a --- /dev/null +++ b/templates/components/pinecone.jsonnet @@ -0,0 
+1,153 @@ +local base = import "base/base.jsonnet"; +local images = import "values/images.jsonnet"; +local url = import "values/url.jsonnet"; +local cassandra_hosts = "cassandra"; + +{ + + "pinecone-cloud":: "aws", + "pinecone-region":: "us-east-1", + + "store-graph-embeddings" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("store-graph-embeddings") + .with_image(images.trustgraph) + .with_command([ + "ge-write-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "store-graph-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + }, + + "query-graph-embeddings" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("query-graph-embeddings") + .with_image(images.trustgraph) + .with_command([ + "ge-query-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "query-graph-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + }, + + "store-doc-embeddings" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("store-doc-embeddings") + .with_image(images.trustgraph) + .with_command([ + 
"de-write-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "store-doc-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + }, + + "query-doc-embeddings" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("pinecone-api-key") + .with_env_var("PINECONE_API_KEY", "pinecone-api-key"); + + local container = + engine.container("query-doc-embeddings") + .with_image(images.trustgraph) + .with_command([ + "de-query-pinecone", + "-p", + url.pulsar, + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "128M") + .with_reservations("0.1", "128M"); + + local containerSet = engine.containers( + "query-doc-embeddings", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8080, 8080, "metrics"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + + } + +} + diff --git a/templates/components/prompt-template.jsonnet b/templates/components/prompt-template.jsonnet index ac820df6..3dadf337 100644 --- a/templates/components/prompt-template.jsonnet +++ b/templates/components/prompt-template.jsonnet @@ -53,7 +53,7 @@ local default_prompts = import "prompts/default-prompts.jsonnet"; "--text-completion-request-queue", "non-persistent://tg/request/text-completion", "--text-completion-response-queue", - "non-persistent://tg/response/text-completion-response", + "non-persistent://tg/response/text-completion", "--system-prompt", $["prompts"]["system-template"], @@ -92,11 +92,11 @@ local default_prompts = import "prompts/default-prompts.jsonnet"; "-i", "non-persistent://tg/request/prompt-rag", "-o", - "non-persistent://tg/response/prompt-rag-response", + "non-persistent://tg/response/prompt-rag", 
"--text-completion-request-queue", "non-persistent://tg/request/text-completion-rag", "--text-completion-response-queue", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", "--system-prompt", $["prompts"]["system-template"], diff --git a/templates/components/trustgraph.jsonnet b/templates/components/trustgraph.jsonnet index e178cc27..31ae420e 100644 --- a/templates/components/trustgraph.jsonnet +++ b/templates/components/trustgraph.jsonnet @@ -5,9 +5,56 @@ local prompt = import "prompt-template.jsonnet"; { + "api-gateway-port":: 8088, + "api-gateway-timeout":: 600, + "chunk-size":: 250, "chunk-overlap":: 15, + "api-gateway" +: { + + create:: function(engine) + + local envSecrets = engine.envSecrets("gateway-secret") + .with_env_var("GATEWAY_SECRET", "gateway-secret"); + + local port = $["api-gateway-port"]; + + local container = + engine.container("api-gateway") + .with_image(images.trustgraph) + .with_command([ + "api-gateway", + "-p", + url.pulsar, + "--timeout", + std.toString($["api-gateway-timeout"]), + "--port", + std.toString(port), + ]) + .with_env_var_secrets(envSecrets) + .with_limits("0.5", "256M") + .with_reservations("0.1", "256M") + .with_port(8000, 8000, "metrics") + .with_port(port, port, "api"); + + local containerSet = engine.containers( + "api-gateway", [ container ] + ); + + local service = + engine.internalService(containerSet) + .with_port(8000, 8000, "metrics") + .with_port(port, port, "api"); + + engine.resources([ + envSecrets, + containerSet, + service, + ]) + + }, + "chunker" +: { create:: function(engine) @@ -144,7 +191,7 @@ local prompt = import "prompt-template.jsonnet"; "-p", url.pulsar, "-i", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_limits("0.5", "128M") .with_reservations("0.1", "128M"); diff --git a/templates/components/vertexai.jsonnet b/templates/components/vertexai.jsonnet index 
44fe27c6..ef193156 100644 --- a/templates/components/vertexai.jsonnet +++ b/templates/components/vertexai.jsonnet @@ -93,7 +93,7 @@ local prompts = import "prompts/mixtral.jsonnet"; "-i", "non-persistent://tg/request/text-completion-rag", "-o", - "non-persistent://tg/response/text-completion-rag-response", + "non-persistent://tg/response/text-completion-rag", ]) .with_limits("0.5", "256M") .with_reservations("0.1", "256M") diff --git a/templates/stores/memgraph.jsonnet b/templates/stores/memgraph.jsonnet new file mode 100644 index 00000000..75faf5f0 --- /dev/null +++ b/templates/stores/memgraph.jsonnet @@ -0,0 +1,68 @@ +local base = import "base/base.jsonnet"; +local images = import "values/images.jsonnet"; + +{ + + "memgraph" +: { + + create:: function(engine) + + local container = + engine.container("memgraph") + .with_image(images.memgraph_mage) + .with_environment({ + MEMGRAPH: "--storage-properties-on-edges=true --storage-enable-edges-metadata=true" + }) + .with_limits("1.0", "1000M") + .with_reservations("0.5", "1000M") + .with_port(7474, 7474, "api") + .with_port(7687, 7687, "api2"); + + local containerSet = engine.containers( + "memgraph", [ container ] + ); + + local service = + engine.service(containerSet) + .with_port(7474, 7474, "api") + .with_port(7687, 7687, "api2"); + + engine.resources([ + containerSet, + service, + ]) + + }, + + "memgraph-lab" +: { + + create:: function(engine) + + local container = + engine.container("lab") + .with_image(images.memgraph_lab) + .with_environment({ + QUICK_CONNECT_MG_HOST: "memgraph", + QUICK_CONNECT_MG_PORT: "7687", + }) + .with_limits("1.0", "512M") + .with_reservations("0.5", "512M") + .with_port(3010, 3000, "http"); + + local containerSet = engine.containers( + "lab", [ container ] + ); + + local service = + engine.service(containerSet) + .with_port(3010, 3010, "http"); + + engine.resources([ + containerSet, + service, + ]) + + }, + +} + diff --git a/templates/values/images.jsonnet 
b/templates/values/images.jsonnet index 01ecee4d..c583815b 100644 --- a/templates/values/images.jsonnet +++ b/templates/values/images.jsonnet @@ -10,5 +10,7 @@ local version = import "version.jsonnet"; prometheus: "docker.io/prom/prometheus:v2.53.2", grafana: "docker.io/grafana/grafana:11.1.4", trustgraph: "docker.io/trustgraph/trustgraph-flow:" + version, - qdrant: "docker.io/qdrant/qdrant:v1.11.1" + qdrant: "docker.io/qdrant/qdrant:v1.11.1", + memgraph_mage: "docker.io/memgraph/memgraph-mage:1.22-memgraph-2.22", + memgraph_lab: "docker.io/memgraph/lab:2.19.1", } diff --git a/test-api/test-agent-api b/test-api/test-agent-api new file mode 100755 index 00000000..f36ba196 --- /dev/null +++ b/test-api/test-agent-api @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "question": "What is the highest risk aspect of running a space shuttle program? Provide 5 detailed reasons to justify our answer.", +} + +resp = requests.post( + f"{url}agent", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["answer"]) + + diff --git a/test-api/test-agent2-api b/test-api/test-agent2-api new file mode 100755 index 00000000..766b16c9 --- /dev/null +++ b/test-api/test-agent2-api @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "question": "What is 14 plus 12. 
Justify your answer.", +} + +resp = requests.post( + f"{url}agent", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["answer"]) + + diff --git a/test-api/test-dbpedia b/test-api/test-dbpedia new file mode 100755 index 00000000..e361f533 --- /dev/null +++ b/test-api/test-dbpedia @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "term": "Cornwall", +} + +resp = requests.post( + f"{url}dbpedia", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-embeddings-api b/test-api/test-embeddings-api new file mode 100755 index 00000000..b1defd01 --- /dev/null +++ b/test-api/test-embeddings-api @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "text": "What is the highest risk aspect of running a space shuttle program? 
Provide 5 detailed reasons to justify our answer.", +} + +resp = requests.post( + f"{url}embeddings", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["vectors"]) + + diff --git a/test-api/test-encyclopedia b/test-api/test-encyclopedia new file mode 100755 index 00000000..ad4e5b36 --- /dev/null +++ b/test-api/test-encyclopedia @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "term": "Cornwall", +} + +resp = requests.post( + f"{url}encyclopedia", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-graph-rag-api b/test-api/test-graph-rag-api new file mode 100755 index 00000000..c329934c --- /dev/null +++ b/test-api/test-graph-rag-api @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "query": "Give me 10 facts", +} + +resp = requests.post( + f"{url}graph-rag", + json=input, +) + +resp = resp.json() + +print(resp) +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["response"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-internet-search b/test-api/test-internet-search new file mode 100755 index 00000000..8c854c77 --- /dev/null +++ b/test-api/test-internet-search @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + 
+############################################################################ + +input = { + "term": "Cornwall", +} + +resp = requests.post( + f"{url}internet-search", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-llm-api b/test-api/test-llm-api new file mode 100755 index 00000000..6bee2048 --- /dev/null +++ b/test-api/test-llm-api @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "system": "Respond in French. Use long word, form of numbers, no digits", +# "prompt": "Add 2 and 12" + "prompt": "Add 12 and 14, and then make a poem about llamas which incorporates that number. Then write a joke about llamas" +} + +resp = requests.post( + f"{url}text-completion", + json=input, +) + +if resp.status_code != 200: + raise RuntimeError(f"Status code: {resp.status_code}") + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["response"]) + +############################################################################ + diff --git a/test-api/test-prompt-api b/test-api/test-prompt-api new file mode 100755 index 00000000..4f69f09a --- /dev/null +++ b/test-api/test-prompt-api @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "id": "question", + "variables": { + "question": "Write a joke about llamas." 
+ } +} + +resp = requests.post( + f"{url}prompt", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +if "object" in resp: + print(f"Object: {resp['object']}") + sys.exit(1) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-prompt2-api b/test-api/test-prompt2-api new file mode 100755 index 00000000..1e641439 --- /dev/null +++ b/test-api/test-prompt2-api @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "id": "extract-definitions", + "variables": { + "text": "A cat is a large mammal." + } +} + +resp = requests.post( + f"{url}prompt", + json=input, +) + +resp = resp.json() + +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +if "object" in resp: + object = json.loads(resp["object"]) + print(json.dumps(object, indent=4)) + sys.exit(0) + +print(resp["text"]) + +sys.exit(0) +############################################################################ + diff --git a/test-api/test-triples-query-api b/test-api/test-triples-query-api new file mode 100755 index 00000000..1aa8a0b1 --- /dev/null +++ b/test-api/test-triples-query-api @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + +import requests +import json +import sys + +url = "http://localhost:8088/api/v1/" + +############################################################################ + +input = { + "p": { + "v": "http://www.w3.org/2000/01/rdf-schema#label", + "e": True, + }, + "limit": 10 +} + +resp = requests.post( + f"{url}triples-query", + json=input, +) + +print(resp.text) +resp = resp.json() + + +print(resp) +if "error" in resp: + print(f"Error: {resp['error']}") + sys.exit(1) + +print(resp["response"]) + +sys.exit(0) +
+############################################################################ + diff --git a/trustgraph-base/trustgraph/api/__init__.py b/trustgraph-base/trustgraph/api/__init__.py new file mode 100644 index 00000000..daa6a964 --- /dev/null +++ b/trustgraph-base/trustgraph/api/__init__.py @@ -0,0 +1,3 @@ + +from . api import * + diff --git a/trustgraph-base/trustgraph/api/api.py b/trustgraph-base/trustgraph/api/api.py new file mode 100644 index 00000000..de96499c --- /dev/null +++ b/trustgraph-base/trustgraph/api/api.py @@ -0,0 +1,339 @@ + +import requests +import json +import dataclasses +import base64 + +from trustgraph.knowledge import hash, Uri, Literal + +class ProtocolException(Exception): + pass + +class ApplicationException(Exception): + pass + +@dataclasses.dataclass +class Triple: + s : str + p : str + o : str + +class Api: + + def __init__(self, url="http://localhost:8088/"): + + self.url = url + + if not url.endswith("/"): + self.url += "/" + + self.url += "api/v1/" + + def check_error(self, response): + + if "error" in response: + + try: + msg = response["error"]["message"] + tp = response["error"]["type"] + except: + raise ApplicationException( + "Error, but the error object is broken" + ) + + raise ApplicationException(f"{tp}: {msg}") + + def text_completion(self, system, prompt): + + # The input consists of system and prompt strings + input = { + "system": system, + "prompt": prompt + } + + url = f"{self.url}text-completion" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(object) + + try: + return object["response"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def agent(self, question): + + # The input consists
of a question + input = { + "question": question + } + + url = f"{self.url}agent" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(resp) + + try: + return object["answer"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def graph_rag(self, question): + + # The input consists of a question + input = { + "query": question + } + + url = f"{self.url}graph-rag" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(resp) + + try: + return object["response"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def embeddings(self, text): + + # The input consists of a text block + input = { + "text": text + } + + url = f"{self.url}embeddings" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException(f"Expected JSON response") + + self.check_error(resp) + + try: + return object["vectors"] + except: + raise ProtocolException(f"Response not formatted correctly") + + def prompt(self, id, variables): + + # The input consists of system and prompt strings + input = { + "id": id, + "variables": variables + } + + url = f"{self.url}prompt" + + # Invoke the API, input is passed as 
JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException("Expected JSON response") + + self.check_error(resp) + + if "text" in object: + return object["text"] + + if "object" in object: + try: + return json.loads(object["object"]) + except Exception as e: + raise ProtocolException( + "Returned object not well-formed JSON" + ) + + raise ProtocolException("Response not formatted correctly") + + def triples_query(self, s=None, p=None, o=None, limit=10000): + + # The input consists of system and prompt strings + input = { + "limit": limit + } + + if s: + if not isinstance(s, Uri): + raise RuntimeError("s must be Uri") + input["s"] = { "v": str(s), "e": isinstance(s, Uri), } + + if p: + if not isinstance(p, Uri): + raise RuntimeError("p must be Uri") + input["p"] = { "v": str(p), "e": isinstance(p, Uri), } + + if o: + if not isinstance(o, Uri) and not isinstance(o, Literal): + raise RuntimeError("o must be Uri or Literal") + input["o"] = { "v": str(o), "e": isinstance(o, Uri), } + + url = f"{self.url}triples-query" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + try: + # Parse the response as JSON + object = resp.json() + except: + raise ProtocolException("Expected JSON response") + + self.check_error(resp) + + if "response" not in object: + raise ProtocolException("Response not formatted correctly") + + def to_value(x): + if x["e"]: return Uri(x["v"]) + return Literal(x["v"]) + + return [ + Triple( + s=to_value(t["s"]), + p=to_value(t["p"]), + o=to_value(t["o"]) + ) + for t in object["response"] + ] + + return object["response"] + + def load_document(self, document, id=None, metadata=None): + + if 
id is None: + + if metadata is not None: + + # Situation makes no sense. What can the metadata possibly + # mean if the caller doesn't know the document ID. + # Metadata should relate to the document by ID + raise RuntimeError("Can't specify metadata without id") + + id = hash(document) + + triples = [] + + def emit(t): + triples.append(t) + + if metadata: + metadata.emit( + lambda t: triples.append({ + "s": { "v": t["s"], "e": isinstance(t["s"], Uri) }, + "p": { "v": t["p"], "e": isinstance(t["p"], Uri) }, + "o": { "v": t["o"], "e": isinstance(t["o"], Uri) } + }) + ) + + input = { + "id": id, + "metadata": triples, + "data": base64.b64encode(document).decode("utf-8"), + } + + url = f"{self.url}load/document" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + + def load_text(self, text, id=None, metadata=None, charset="utf-8"): + + if id is None: + + if metadata is not None: + + # Situation makes no sense. What can the metadata possibly + # mean if the caller doesn't know the document ID. 
+ # Metadata should relate to the document by ID + raise RuntimeError("Can't specify metadata without id") + + id = hash(text) + + triples = [] + + if metadata: + metadata.emit( + lambda t: triples.append({ + "s": { "v": t["s"], "e": isinstance(t["s"], Uri) }, + "p": { "v": t["p"], "e": isinstance(t["p"], Uri) }, + "o": { "v": t["o"], "e": isinstance(t["o"], Uri) } + }) + ) + + input = { + "id": id, + "metadata": triples, + "charset": charset, + "text": base64.b64encode(text).decode("utf-8"), + } + + url = f"{self.url}load/text" + + # Invoke the API, input is passed as JSON + resp = requests.post(url, json=input) + + # Should be a 200 status code + if resp.status_code != 200: + raise ProtocolException(f"Status code {resp.status_code}") + diff --git a/trustgraph-base/trustgraph/knowledge/__init__.py b/trustgraph-base/trustgraph/knowledge/__init__.py index 0ab6b5db..8349abf0 100644 --- a/trustgraph-base/trustgraph/knowledge/__init__.py +++ b/trustgraph-base/trustgraph/knowledge/__init__.py @@ -1,4 +1,5 @@ +from . defs import * from . identifier import * from . publication import * from . document import * diff --git a/trustgraph-base/trustgraph/knowledge/defs.py b/trustgraph-base/trustgraph/knowledge/defs.py index b95863c6..d6290930 100644 --- a/trustgraph-base/trustgraph/knowledge/defs.py +++ b/trustgraph-base/trustgraph/knowledge/defs.py @@ -23,3 +23,11 @@ URL = 'https://schema.org/url' IDENTIFIER = 'https://schema.org/identifier' KEYWORD = 'https://schema.org/keywords' +class Uri(str): + def is_uri(self): return True + def is_literal(self): return False + +class Literal(str): + def is_uri(self): return False + def is_literal(self): return True + diff --git a/trustgraph-base/trustgraph/knowledge/document.py b/trustgraph-base/trustgraph/knowledge/document.py index dc2f43e3..99d06c72 100644 --- a/trustgraph-base/trustgraph/knowledge/document.py +++ b/trustgraph-base/trustgraph/knowledge/document.py @@ -1,6 +1,16 @@ from . defs import * -from .. 
def Value(value, is_uri):
    # Wrap the raw value in the str subclass that records its kind:
    # Uri when is_uri is truthy, Literal otherwise.
    wrapper = Uri if is_uri else Literal
    return wrapper(value)

def Triple(s, p, o):
    # A triple is represented as a plain dict keyed by "s", "p" and "o".
    return dict(s=s, p=p, o=o)
lookup import * + diff --git a/trustgraph-base/trustgraph/schema/documents.py b/trustgraph-base/trustgraph/schema/documents.py index 59aba287..2a3d3d0c 100644 --- a/trustgraph-base/trustgraph/schema/documents.py +++ b/trustgraph-base/trustgraph/schema/documents.py @@ -60,5 +60,5 @@ document_embeddings_request_queue = topic( 'doc-embeddings', kind='non-persistent', namespace='request' ) document_embeddings_response_queue = topic( - 'doc-embeddings-response', kind='non-persistent', namespace='response', + 'doc-embeddings', kind='non-persistent', namespace='response', ) diff --git a/trustgraph-base/trustgraph/schema/graph.py b/trustgraph-base/trustgraph/schema/graph.py index 2d108a30..78c1a99c 100644 --- a/trustgraph-base/trustgraph/schema/graph.py +++ b/trustgraph-base/trustgraph/schema/graph.py @@ -34,7 +34,7 @@ graph_embeddings_request_queue = topic( 'graph-embeddings', kind='non-persistent', namespace='request' ) graph_embeddings_response_queue = topic( - 'graph-embeddings-response', kind='non-persistent', namespace='response', + 'graph-embeddings', kind='non-persistent', namespace='response' ) ############################################################################ @@ -67,5 +67,5 @@ triples_request_queue = topic( 'triples', kind='non-persistent', namespace='request' ) triples_response_queue = topic( - 'triples-response', kind='non-persistent', namespace='response', + 'triples', kind='non-persistent', namespace='response' ) diff --git a/trustgraph-base/trustgraph/schema/lookup.py b/trustgraph-base/trustgraph/schema/lookup.py new file mode 100644 index 00000000..d0a0517c --- /dev/null +++ b/trustgraph-base/trustgraph/schema/lookup.py @@ -0,0 +1,42 @@ + +from pulsar.schema import Record, String + +from . types import Error, Value, Triple +from . topic import topic +from . 
############################################################################

# Lookups

# Request record for the lookup services: two string fields.
# NOTE(review): presumably `kind` selects what sort of lookup to perform and
# `term` is the text to look up - confirm against the consuming services.
class LookupRequest(Record):
    kind = String()
    term = String()

# Response record for the lookup services: a text result plus an Error
# record for reporting failures.
class LookupResponse(Record):
    text = String()
    error = Error()

# Each lookup service gets a request topic and a response topic that share
# a name and differ only in namespace.  All are non-persistent.

encyclopedia_lookup_request_queue = topic(
    'encyclopedia', kind='non-persistent', namespace='request'
)
encyclopedia_lookup_response_queue = topic(
    'encyclopedia', kind='non-persistent', namespace='response',
)

dbpedia_lookup_request_queue = topic(
    'dbpedia', kind='non-persistent', namespace='request'
)
dbpedia_lookup_response_queue = topic(
    'dbpedia', kind='non-persistent', namespace='response',
)

internet_search_request_queue = topic(
    'internet-search', kind='non-persistent', namespace='request'
)
internet_search_response_queue = topic(
    'internet-search', kind='non-persistent', namespace='response',
)

############################################################################
100644 --- a/trustgraph-base/trustgraph/schema/prompt.py +++ b/trustgraph-base/trustgraph/schema/prompt.py @@ -59,7 +59,7 @@ prompt_request_queue = topic( 'prompt', kind='non-persistent', namespace='request' ) prompt_response_queue = topic( - 'prompt-response', kind='non-persistent', namespace='response' + 'prompt', kind='non-persistent', namespace='response' ) ############################################################################ diff --git a/trustgraph-base/trustgraph/schema/retrieval.py b/trustgraph-base/trustgraph/schema/retrieval.py index ad860c3c..9c4361a1 100644 --- a/trustgraph-base/trustgraph/schema/retrieval.py +++ b/trustgraph-base/trustgraph/schema/retrieval.py @@ -20,7 +20,7 @@ graph_rag_request_queue = topic( 'graph-rag', kind='non-persistent', namespace='request' ) graph_rag_response_queue = topic( - 'graph-rag-response', kind='non-persistent', namespace='response' + 'graph-rag', kind='non-persistent', namespace='response' ) ############################################################################ @@ -40,5 +40,5 @@ document_rag_request_queue = topic( 'doc-rag', kind='non-persistent', namespace='request' ) document_rag_response_queue = topic( - 'doc-rag-response', kind='non-persistent', namespace='response' + 'doc-rag', kind='non-persistent', namespace='response' ) diff --git a/trustgraph-bedrock/setup.py b/trustgraph-bedrock/setup.py index 80cee09c..d92cc9c7 100644 --- a/trustgraph-bedrock/setup.py +++ b/trustgraph-bedrock/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.17,<0.18", "pulsar-client", "prometheus-client", "boto3", diff --git a/trustgraph-cli/scripts/tg-dump-msgpack b/trustgraph-cli/scripts/tg-dump-msgpack new file mode 100755 index 00000000..18819649 --- /dev/null +++ b/trustgraph-cli/scripts/tg-dump-msgpack @@ -0,0 +1,92 @@ 
#!/usr/bin/env python3

"""
This utility reads a knowledge core in msgpack format and outputs its
contents in JSON form to standard output.  This is useful only as a
diagnostic utility.
"""

import msgpack
import sys
import argparse

def dump(input_file, action):
    # Print every record in the file, one per line, as unpacked.
    # `action` is accepted only because main() forwards all parsed args.

    with open(input_file, 'rb') as handle:
        for record in msgpack.Unpacker(handle, raw=False):
            print(record)

def summary(input_file, action):
    # Print the graph-embedding vector dimension, then the object of every
    # rdfs:label triple found in the metadata of the triple records.
    # `action` is accepted only because main() forwards all parsed args.

    label_predicate = "http://www.w3.org/2000/01/rdf-schema#label"
    max_records = 1000000

    vector_dim = None
    triples = set()

    with open(input_file, 'rb') as handle:

        records = msgpack.Unpacker(handle, raw=False)

        for index, msg in enumerate(records):

            kind = msg[0]

            if kind == "ge":
                # Length of the first vector of the latest "ge" record
                vector_dim = len(msg[1]["v"][0])

            elif kind == "t":
                triples.update(
                    (elt["s"]["v"], elt["p"]["v"], elt["o"]["v"])
                    for elt in msg[1]["m"]["m"]
                )

            # Safety cap on the number of records examined
            if index > max_records:
                break

    print("Vector dimension:", vector_dim)

    for subj, pred, obj in triples:
        if pred == label_predicate:
            print("-", obj)

def main():
    # Parse the command line and run the selected action (default: dump).

    parser = argparse.ArgumentParser(
        prog='tg-dump-msgpack',
        description=__doc__,
    )

    parser.add_argument(
        '-i', '--input-file',
        required=True,
        help='Input file'
    )

    parser.add_argument(
        '-s', '--summary', action="store_const", const="summary",
        dest="action",
        help='Show a summary'
    )

    parser.add_argument(
        '-r', '--records', action="store_const", const="records",
        dest="action",
        help='Dump individual records'
    )

    args = parser.parse_args()

    handler = summary if args.action == "summary" else dump
    handler(**vars(args))

main()
The knowledge core should be in msgpack format, which is the +default format produce by tg-save-kg-core. +""" + +import aiohttp +import asyncio +import msgpack +import json +import sys +import argparse +import os +import signal + +class Running: + def __init__(self): self.running = True + def get(self): return self.running + def stop(self): self.running = False + +ge_counts = 0 +t_counts = 0 + +async def load_ge(running, queue, url): + + global ge_counts + + async with aiohttp.ClientSession() as session: + + async with session.ws_connect(f"{url}load/graph-embeddings") as ws: + + while running.get(): + + try: + msg = await asyncio.wait_for(queue.get(), 1) + + # End of load + if msg is None: + break + + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. + continue + + msg = { + "metadata": { + "id": msg["m"]["i"], + "metadata": msg["m"]["m"], + "user": msg["m"]["u"], + "collection": msg["m"]["c"], + }, + "vectors": msg["v"], + "entity": msg["e"], + } + + try: + await ws.send_json(msg) + except Exception as e: + print(e) + + ge_counts += 1 + +async def load_triples(running, queue, url): + + global t_counts + + async with aiohttp.ClientSession() as session: + + async with session.ws_connect(f"{url}load/triples") as ws: + + while running.get(): + + try: + msg = await asyncio.wait_for(queue.get(), 1) + + # End of load + if msg is None: + break + + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. 
+ continue + + msg ={ + "metadata": { + "id": msg["m"]["i"], + "metadata": msg["m"]["m"], + "user": msg["m"]["u"], + "collection": msg["m"]["c"], + }, + "triples": msg["t"], + } + + try: + await ws.send_json(msg) + except Exception as e: + print(e) + + t_counts += 1 + +async def stats(running): + + global t_counts + global ge_counts + + while running.get(): + + await asyncio.sleep(2) + + print( + f"Graph embeddings: {ge_counts:10d} Triples: {t_counts:10d}" + ) + +async def loader(running, ge_queue, t_queue, path, format, user, collection): + + if format == "json": + + raise RuntimeError("Not implemented") + + else: + + with open(path, "rb") as f: + + unpacker = msgpack.Unpacker(f, raw=False) + + while running.get(): + + try: + unpacked = unpacker.unpack() + except: + break + + if user: + unpacked["metadata"]["user"] = user + + if collection: + unpacked["metadata"]["collection"] = collection + + if unpacked[0] == "t": + qtype = t_queue + else: + if unpacked[0] == "ge": + qtype = ge_queue + + while running.get(): + + try: + await asyncio.wait_for(qtype.put(unpacked[1]), 0.5) + + # Successful put message, move on + break + + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. + continue + + if not running.get(): break + + # Put 'None' on end of queue to finish + while running.get(): + + try: + await asyncio.wait_for(t_queue.put(None), 1) + + # Successful put message, move on + break + + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. + continue + + # Put 'None' on end of queue to finish + while running.get(): + + try: + await asyncio.wait_for(ge_queue.put(None), 1) + + # Successful put message, move on + break + + except: + # Hopefully it's TimeoutError. Annoying to match since + # it changed in 3.11. 
+ continue + +async def run(running, **args): + + # Maxsize on queues reduces back-pressure so tg-load-kg-core doesn't + # grow to eat all memory + ge_q = asyncio.Queue(maxsize=10) + t_q = asyncio.Queue(maxsize=10) + + load_task = asyncio.create_task( + loader( + running=running, + ge_queue=ge_q, t_queue=t_q, + path=args["input_file"], format=args["format"], + user=args["user"], collection=args["collection"], + ) + + ) + + ge_task = asyncio.create_task( + load_ge( + running=running, + queue=ge_q, url=args["url"] + "api/v1/" + ) + ) + + triples_task = asyncio.create_task( + load_triples( + running=running, + queue=t_q, url=args["url"] + "api/v1/" + ) + ) + + stats_task = asyncio.create_task(stats(running)) + + await triples_task + await ge_task + + running.stop() + + await load_task + await stats_task + +async def main(running): + + parser = argparse.ArgumentParser( + prog='tg-load-kg-core', + description=__doc__, + ) + + default_url = os.getenv("TRUSTGRAPH_API", "http://localhost:8088/") + default_user = "trustgraph" + collection = "default" + + parser.add_argument( + '-u', '--url', + default=default_url, + help=f'TrustGraph API URL (default: {default_url})', + ) + + parser.add_argument( + '-i', '--input-file', + # Make it mandatory, difficult to over-write an existing file + required=True, + help=f'Output file' + ) + + parser.add_argument( + '--format', + default="msgpack", + choices=["msgpack", "json"], + help=f'Output format (default: msgpack)', + ) + + parser.add_argument( + '--user', + help=f'User ID to load as (default: from input)' + ) + + parser.add_argument( + '--collection', + help=f'Collection ID to load as (default: from input)' + ) + + args = parser.parse_args() + + await run(running, **vars(args)) + +running = Running() + +def interrupt(sig, frame): + running.stop() + print('Interrupt') + +signal.signal(signal.SIGINT, interrupt) + +asyncio.run(main(running)) + diff --git a/trustgraph-cli/scripts/tg-load-pdf b/trustgraph-cli/scripts/tg-load-pdf index 
18ac57cb..a0d2b3bc 100755 --- a/trustgraph-cli/scripts/tg-load-pdf +++ b/trustgraph-cli/scripts/tg-load-pdf @@ -14,9 +14,9 @@ import time import uuid from trustgraph.schema import Document, document_ingest_queue -from trustgraph.schema import Metadata +from trustgraph.schema import Metadata, Triple, Value from trustgraph.log_level import LogLevel -from trustgraph.knowledge import hash, to_uri +from trustgraph.knowledge import hash, to_uri, Uri from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG from trustgraph.knowledge import Organization, PublicationEvent from trustgraph.knowledge import DigitalDocument @@ -79,7 +79,23 @@ class Loader: r = Document( metadata=Metadata( id=id, - metadata=triples, + metadata=[ + Triple( + s=Value( + value=t["s"], + is_uri=isinstance(t["s"], Uri) + ), + p=Value( + value=t["p"], + is_uri=isinstance(t["p"], Uri) + ), + o=Value( + value=t["o"], + is_uri=isinstance(t["o"], Uri) + ), + ) + for t in triples + ], user=self.user, collection=self.collection, ), diff --git a/trustgraph-cli/scripts/tg-load-text b/trustgraph-cli/scripts/tg-load-text index 88dc8e17..51664a1b 100755 --- a/trustgraph-cli/scripts/tg-load-text +++ b/trustgraph-cli/scripts/tg-load-text @@ -6,7 +6,6 @@ Loads a text document into TrustGraph processing. 
#!/usr/bin/env python3

"""
This utility connects to a running TrustGraph through the API and creates
a knowledge core from the data streaming through the processing queues.
For completeness of data, tg-save-kg-core should be initiated before data
loading takes place.  The default output format, msgpack should be used.
JSON output format is also available - msgpack produces a more compact
representation, which is also more performant to load.
"""

import aiohttp
import asyncio
import msgpack
import json
import sys
import argparse
import os
import signal

class Running:
    # Shared run flag: cleared by the SIGINT handler so that every task
    # loop can exit.
    def __init__(self): self.running = True
    def get(self): return self.running
    def stop(self): self.running = False

def _metadata(data):
    # Compact form of the message metadata as stored in the core file.
    md = data["metadata"]
    return {
        "i": md["id"],
        "m": md["metadata"],
        "u": md["user"],
        "c": md["collection"],
    }

async def _stream(running, queue, user, collection, ws_url, convert):
    # Receive JSON text messages from a websocket, drop those not matching
    # the user / collection filters, and queue convert(message) for output.

    closing = (
        aiohttp.WSMsgType.CLOSE,
        aiohttp.WSMsgType.CLOSING,
        aiohttp.WSMsgType.CLOSED,
    )

    async with aiohttp.ClientSession() as session:

        async with session.ws_connect(ws_url) as ws:

            while running.get():

                try:
                    msg = await asyncio.wait_for(ws.receive(), 1)
                except asyncio.TimeoutError:
                    # asyncio.TimeoutError is catchable on all Python
                    # versions; since 3.11 it aliases builtin TimeoutError.
                    continue

                if msg.type == aiohttp.WSMsgType.TEXT:

                    data = msg.json()

                    if user and data["metadata"]["user"] != user:
                        continue

                    if collection and \
                       data["metadata"]["collection"] != collection:
                        continue

                    await queue.put(convert(data))

                elif msg.type == aiohttp.WSMsgType.ERROR:
                    print("Error")
                    break

                elif msg.type in closing:
                    # The server closed the stream.  Without this the loop
                    # would keep receiving close frames and spin.
                    break

async def fetch_ge(running, queue, user, collection, url):
    # Stream graph embeddings and queue them as ["ge", record].

    await _stream(
        running, queue, user, collection,
        f"{url}stream/graph-embeddings",
        lambda data: [
            "ge",
            {
                "m": _metadata(data),
                "v": data["vectors"],
                "e": data["entity"],
            }
        ],
    )

async def fetch_triples(running, queue, user, collection, url):
    # Stream triples and queue them as ("t", record).

    await _stream(
        running, queue, user, collection,
        f"{url}stream/triples",
        lambda data: (
            "t",
            {
                "m": _metadata(data),
                "t": data["triples"],
            }
        ),
    )

# Number of records written, reported by stats()
ge_counts = 0
t_counts = 0

async def stats(running):
    # Print the running totals every two seconds while running.

    global t_counts
    global ge_counts

    while running.get():

        await asyncio.sleep(2)

        print(
            f"Graph embeddings: {ge_counts:10d}  Triples: {t_counts:10d}"
        )

async def output(running, queue, path, format):
    # Write queued records to the output file until the run flag clears.

    global t_counts
    global ge_counts

    with open(path, "wb") as f:

        while running.get():

            try:
                msg = await asyncio.wait_for(queue.get(), 0.5)
            except asyncio.TimeoutError:
                continue

            if format == "msgpack":
                f.write(msgpack.packb(msg, use_bin_type=True))
            else:
                # One JSON document per line so records stay separable
                f.write(json.dumps(msg).encode("utf-8") + b"\n")

            if msg[0] == "t":
                t_counts += 1
            elif msg[0] == "ge":
                ge_counts += 1

    print("Output file closed")

async def run(running, **args):
    # Start both stream readers, the file writer and the stats reporter,
    # then wait for all of them to finish.

    url = args["url"]
    if not url.endswith("/"):
        url += "/"
    url += "api/v1/"

    q = asyncio.Queue()

    ge_task = asyncio.create_task(
        fetch_ge(
            running=running,
            queue=q, user=args["user"], collection=args["collection"],
            url=url
        )
    )

    triples_task = asyncio.create_task(
        fetch_triples(
            running=running, queue=q,
            user=args["user"], collection=args["collection"],
            url=url
        )
    )

    output_task = asyncio.create_task(
        output(
            running=running, queue=q,
            path=args["output_file"], format=args["format"],
        )
    )

    stats_task = asyncio.create_task(stats(running))

    await output_task
    await triples_task
    await ge_task
    await stats_task

    print("Exiting")

async def main(running):
    # Parse the command line and run the save.

    parser = argparse.ArgumentParser(
        prog='tg-save-kg-core',
        description=__doc__,
    )

    default_url = os.getenv("TRUSTGRAPH_API", "http://localhost:8088/")

    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'TrustGraph API URL (default: {default_url})',
    )

    parser.add_argument(
        '-o', '--output-file',
        # Make it mandatory, difficult to over-write an existing file
        required=True,
        help='Output file'
    )

    parser.add_argument(
        '--format',
        default="msgpack",
        choices=["msgpack", "json"],
        help='Output format (default: msgpack)',
    )

    parser.add_argument(
        '--user',
        help='User ID to filter on (default: no filter)'
    )

    parser.add_argument(
        '--collection',
        help='Collection ID to filter on (default: no filter)'
    )

    args = parser.parse_args()

    await run(running, **vars(args))

running = Running()

def interrupt(sig, frame):
    # SIGINT handler: ask every task loop to finish
    running.stop()
    print('Interrupt')

signal.signal(signal.SIGINT, interrupt)

asyncio.run(main(running))
a/trustgraph-flow/scripts/de-query-pinecone b/trustgraph-flow/scripts/de-query-pinecone new file mode 100755 index 00000000..b21d9045 --- /dev/null +++ b/trustgraph-flow/scripts/de-query-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.query.doc_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/scripts/de-write-pinecone b/trustgraph-flow/scripts/de-write-pinecone new file mode 100755 index 00000000..eb604747 --- /dev/null +++ b/trustgraph-flow/scripts/de-write-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.storage.doc_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/scripts/ge-query-pinecone b/trustgraph-flow/scripts/ge-query-pinecone new file mode 100755 index 00000000..b75aec78 --- /dev/null +++ b/trustgraph-flow/scripts/ge-query-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.query.graph_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/scripts/ge-write-pinecone b/trustgraph-flow/scripts/ge-write-pinecone new file mode 100755 index 00000000..802a8377 --- /dev/null +++ b/trustgraph-flow/scripts/ge-write-pinecone @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.storage.graph_embeddings.pinecone import run + +run() + diff --git a/trustgraph-flow/scripts/triples-query-memgraph b/trustgraph-flow/scripts/triples-query-memgraph new file mode 100755 index 00000000..443929e4 --- /dev/null +++ b/trustgraph-flow/scripts/triples-query-memgraph @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.query.triples.memgraph import run + +run() + diff --git a/trustgraph-flow/scripts/triples-write-memgraph b/trustgraph-flow/scripts/triples-write-memgraph new file mode 100755 index 00000000..3d94a576 --- /dev/null +++ b/trustgraph-flow/scripts/triples-write-memgraph @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.storage.triples.memgraph import run + +run() + diff --git a/trustgraph-flow/scripts/wikipedia-lookup 
b/trustgraph-flow/scripts/wikipedia-lookup new file mode 100755 index 00000000..a89b1009 --- /dev/null +++ b/trustgraph-flow/scripts/wikipedia-lookup @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.external.wikipedia import run + +run() + diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 8b46b2d2..c53f96e7 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.17,<0.18", "urllib3", "rdflib", "pymilvus", @@ -58,21 +58,28 @@ setuptools.setup( "google-generativeai", "ibis", "jsonschema", + "aiohttp", + "pinecone[grpc]", ], scripts=[ + "scripts/api-gateway", "scripts/agent-manager-react", "scripts/chunker-recursive", "scripts/chunker-token", "scripts/de-query-milvus", "scripts/de-query-qdrant", + "scripts/de-query-pinecone", "scripts/de-write-milvus", "scripts/de-write-qdrant", + "scripts/de-write-pinecone", "scripts/document-rag", "scripts/embeddings-ollama", "scripts/embeddings-vectorize", "scripts/ge-query-milvus", + "scripts/ge-query-pinecone", "scripts/ge-query-qdrant", "scripts/ge-write-milvus", + "scripts/ge-write-pinecone", "scripts/ge-write-qdrant", "scripts/graph-rag", "scripts/kg-extract-definitions", @@ -96,7 +103,10 @@ setuptools.setup( "scripts/text-completion-openai", "scripts/triples-query-cassandra", "scripts/triples-query-neo4j", + "scripts/triples-query-memgraph", "scripts/triples-write-cassandra", "scripts/triples-write-neo4j", + "scripts/triples-write-memgraph", + "scripts/wikipedia-lookup", ] ) diff --git a/trustgraph-flow/trustgraph/direct/cassandra.py b/trustgraph-flow/trustgraph/direct/cassandra.py index 2b577df1..568411a9 100644 --- a/trustgraph-flow/trustgraph/direct/cassandra.py +++ b/trustgraph-flow/trustgraph/direct/cassandra.py @@ -97,7 +97,7 @@ class 
TrustGraph: def get_po(self, p, o, limit=10): return self.session.execute( - f"select s from {self.table} where p = %s and o = %s allow filtering limit {limit}", + f"select s from {self.table} where p = %s and o = %s limit {limit} allow filtering", (p, o) ) diff --git a/trustgraph-parquet/trustgraph/dump/__init__.py b/trustgraph-flow/trustgraph/external/__init__.py similarity index 100% rename from trustgraph-parquet/trustgraph/dump/__init__.py rename to trustgraph-flow/trustgraph/external/__init__.py diff --git a/trustgraph-flow/trustgraph/external/wikipedia/__init__.py b/trustgraph-flow/trustgraph/external/wikipedia/__init__.py new file mode 100644 index 00000000..ba844705 --- /dev/null +++ b/trustgraph-flow/trustgraph/external/wikipedia/__init__.py @@ -0,0 +1,3 @@ + +from . service import * + diff --git a/trustgraph-flow/trustgraph/external/wikipedia/__main__.py b/trustgraph-flow/trustgraph/external/wikipedia/__main__.py new file mode 100644 index 00000000..e9136855 --- /dev/null +++ b/trustgraph-flow/trustgraph/external/wikipedia/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . service import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/external/wikipedia/service.py b/trustgraph-flow/trustgraph/external/wikipedia/service.py new file mode 100644 index 00000000..932e1213 --- /dev/null +++ b/trustgraph-flow/trustgraph/external/wikipedia/service.py @@ -0,0 +1,102 @@ + +""" +Wikipedia lookup service. Fetches an extract from the Wikipedia page +using the API.
+"""
+
+from urllib.parse import quote
+
+from trustgraph.schema import LookupRequest, LookupResponse, Error
+from trustgraph.schema import encyclopedia_lookup_request_queue
+from trustgraph.schema import encyclopedia_lookup_response_queue
+from trustgraph.log_level import LogLevel
+from trustgraph.base import ConsumerProducer
+import requests
+
+module = ".".join(__name__.split(".")[1:-1])
+
+default_input_queue = encyclopedia_lookup_request_queue
+default_output_queue = encyclopedia_lookup_response_queue
+default_subscriber = module
+default_url="https://en.wikipedia.org/"
+
+# Upper bound, in seconds, on one Wikipedia HTTP request so that an
+# unresponsive server cannot stall the consumer indefinitely
+request_timeout = 30
+
+class Processor(ConsumerProducer):
+    """
+    Consumes LookupRequest messages, fetches the page summary for the
+    requested term over HTTP and produces a LookupResponse carrying either
+    the summary's "extract" text or an error.
+    """
+
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+        url = params.get("url", default_url)
+
+        super(Processor, self).__init__(
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": LookupRequest,
+                "output_schema": LookupResponse,
+            }
+        )
+
+        self.url = url
+
+    def handle(self, msg):
+        """
+        Answer one lookup request.  Exactly one response, tagged with the
+        sender's id, is produced and the message is then acknowledged,
+        whether or not the lookup succeeded.
+        """
+
+        v = msg.value()
+
+        # Sender-produced ID
+        id = msg.properties()["id"]
+
+        print(f"Handling {v.kind} / {v.term}...", flush=True)
+
+        try:
+
+            # The configured base URL may or may not end in "/"; strip it
+            # so the request path never starts with "//"
+            base = self.url.rstrip("/")
+
+            # The term comes from the message, so escape everything
+            # (including "/", "?" and "#") to keep it a single path segment
+            term = quote(v.term, safe="")
+
+            url = f"{base}/api/rest_v1/page/summary/{term}"
+
+            resp = requests.get(url, timeout=request_timeout)
+
+            # Report HTTP failures as such rather than as a missing key
+            resp.raise_for_status()
+
+            r = LookupResponse(
+                error=None,
+                text=resp.json()["extract"]
+            )
+
+        except Exception as e:
+
+            r = LookupResponse(
+                error=Error(
+                    type = "lookup-error",
+                    message = str(e),
+                ),
+                text=None,
+            )
+
+        self.producer.send(r, properties={"id": id})
+
+        self.consumer.acknowledge(msg)
+
+    @staticmethod
+    def add_args(parser):
+
+        ConsumerProducer.add_args(
+            parser, default_input_queue, default_subscriber,
+            default_output_queue,
+        )
+
+        parser.add_argument(
+            '-u', '--url',
+            default=default_url,
+            help=f'Wikipedia base URL (default: {default_url})'
+        )
+
+def run():
+
+    Processor.start(module, __doc__)
+
diff --git a/trustgraph-flow/trustgraph/gateway/__init__.py b/trustgraph-flow/trustgraph/gateway/__init__.py
new file mode 100644
index 00000000..ba844705
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/__init__.py
@@ -0,0 +1,3 @@
+
+from . service import *
+
diff --git a/trustgraph-flow/trustgraph/gateway/__main__.py b/trustgraph-flow/trustgraph/gateway/__main__.py
new file mode 100755
index 00000000..e9136855
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/__main__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+from . service import run
+
+if __name__ == '__main__':
+    run()
+
diff --git a/trustgraph-flow/trustgraph/gateway/agent.py b/trustgraph-flow/trustgraph/gateway/agent.py
new file mode 100644
index 00000000..c7af947b
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/agent.py
@@ -0,0 +1,42 @@
+
+from .. schema import AgentRequest, AgentResponse
+from .. schema import agent_request_queue
+from .. schema import agent_response_queue
+
+from . endpoint import ServiceEndpoint
+from .
requestor import ServiceRequestor
+
+class AgentRequestor(ServiceRequestor):
+    """Gateway requestor for the agent request/response queues."""
+
+    def __init__(self, pulsar_host, timeout, auth):
+
+        # auth is accepted as part of the constructor signature but is not
+        # passed on or stored here
+        super(AgentRequestor, self).__init__(
+            pulsar_host=pulsar_host,
+            request_queue=agent_request_queue,
+            response_queue=agent_response_queue,
+            request_schema=AgentRequest,
+            response_schema=AgentResponse,
+            timeout=timeout,
+        )
+
+    def to_request(self, body):
+        """Build an AgentRequest from the body's "question" field."""
+        return AgentRequest(
+            question=body["question"]
+        )
+
+    def from_response(self, message):
+        """
+        Return (dict holding whichever of answer / thought / observation
+        are non-empty, completion flag).
+        """
+        resp = {
+        }
+
+        if message.answer:
+            resp["answer"] = message.answer
+
+        if message.thought:
+            resp["thought"] = message.thought
+
+        if message.observation:
+            resp["observation"] = message.observation
+
+        # The 2nd boolean expression indicates whether we're done responding
+        return resp, (message.answer is not None)
+
+
diff --git a/trustgraph-flow/trustgraph/gateway/auth.py b/trustgraph-flow/trustgraph/gateway/auth.py
new file mode 100644
index 00000000..a693ca32
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/auth.py
@@ -0,0 +1,40 @@
+
+import hmac
+
+class Authenticator:
+    """
+    Shared-secret authenticator for gateway requests.
+
+    With allow_all set every request is permitted; otherwise a request is
+    permitted only when it presents the configured token.
+    """
+
+    def __init__(self, token=None, allow_all=False):
+
+        # A missing or empty token is only acceptable when auth is disabled
+        if not allow_all and (token is None or token == ""):
+            raise RuntimeError("Need a token")
+
+        self.token = token
+        self.allow_all = allow_all
+
+    def permitted(self, token, roles):
+        """
+        Return True if a request presenting `token` may proceed.
+
+        `roles` is accepted but not consulted by this implementation.
+        """
+
+        if self.allow_all: return True
+
+        # Compare in constant time so response timing does not reveal how
+        # much of a guessed token matched.  compare_digest() rejects
+        # non-ASCII str, hence the explicit encoding to bytes.
+        if isinstance(self.token, str) and isinstance(token, str):
+            return hmac.compare_digest(
+                self.token.encode("utf-8"), token.encode("utf-8")
+            )
+
+        # Non-string operands: fall back to plain equality
+        return self.token == token
+
diff --git a/trustgraph-flow/trustgraph/gateway/dbpedia.py b/trustgraph-flow/trustgraph/gateway/dbpedia.py
new file mode 100644
index 00000000..8ae4f695
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/dbpedia.py
@@ -0,0 +1,29 @@
+
+from .. schema import LookupRequest, LookupResponse
+from .. schema import dbpedia_lookup_request_queue
+from .. schema import dbpedia_lookup_response_queue
+
+from . endpoint import ServiceEndpoint
+from .
requestor import ServiceRequestor
+
+class DbpediaRequestor(ServiceRequestor):
+    """Requestor bridging gateway calls onto the DBpedia lookup queues."""
+
+    def __init__(self, pulsar_host, timeout, auth):
+
+        # auth is part of the common constructor signature; unused here
+        super().__init__(
+            pulsar_host=pulsar_host,
+            timeout=timeout,
+            request_queue=dbpedia_lookup_request_queue,
+            request_schema=LookupRequest,
+            response_queue=dbpedia_lookup_response_queue,
+            response_schema=LookupResponse,
+        )
+
+    def to_request(self, body):
+        """Build a LookupRequest; "term" is required, "kind" is optional."""
+        term = body["term"]
+        kind = body.get("kind")
+        return LookupRequest(term=term, kind=kind)
+
+    def from_response(self, message):
+        """Return the text payload, flagged as the final response."""
+        payload = { "text": message.text }
+        return payload, True
+
diff --git a/trustgraph-flow/trustgraph/gateway/embeddings.py b/trustgraph-flow/trustgraph/gateway/embeddings.py
new file mode 100644
index 00000000..d0f3e1ef
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/embeddings.py
@@ -0,0 +1,34 @@
+
+from .. schema import EmbeddingsRequest, EmbeddingsResponse
+from .. schema import embeddings_request_queue
+from .. schema import embeddings_response_queue
+
+from . endpoint import ServiceEndpoint
+from . requestor import ServiceRequestor
+
+class EmbeddingsRequestor(ServiceRequestor):
+    """Requestor bridging gateway calls onto the embeddings queues."""
+
+    def __init__(self, pulsar_host, timeout, auth):
+
+        # auth is part of the common constructor signature; unused here
+        super().__init__(
+            pulsar_host=pulsar_host,
+            timeout=timeout,
+            request_queue=embeddings_request_queue,
+            request_schema=EmbeddingsRequest,
+            response_queue=embeddings_response_queue,
+            response_schema=EmbeddingsResponse,
+        )
+
+    def to_request(self, body):
+        """Wrap the body's "text" field in an EmbeddingsRequest."""
+        text = body["text"]
+        return EmbeddingsRequest(text=text)
+
+    def from_response(self, message):
+        """Return the vectors payload, flagged as the final response."""
+        payload = { "vectors": message.vectors }
+        return payload, True
+
+
diff --git a/trustgraph-flow/trustgraph/gateway/encyclopedia.py b/trustgraph-flow/trustgraph/gateway/encyclopedia.py
new file mode 100644
index 00000000..3f4dad79
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/encyclopedia.py
@@ -0,0 +1,29 @@
+
+from .. schema import LookupRequest, LookupResponse
+from .. schema import encyclopedia_lookup_request_queue
+from ..
schema import encyclopedia_lookup_response_queue
+
+from . endpoint import ServiceEndpoint
+from . requestor import ServiceRequestor
+
+class EncyclopediaRequestor(ServiceRequestor):
+    """Requestor bridging gateway calls onto the encyclopedia lookup queues."""
+
+    def __init__(self, pulsar_host, timeout, auth):
+
+        # auth is part of the common constructor signature; unused here
+        super().__init__(
+            pulsar_host=pulsar_host,
+            timeout=timeout,
+            request_queue=encyclopedia_lookup_request_queue,
+            request_schema=LookupRequest,
+            response_queue=encyclopedia_lookup_response_queue,
+            response_schema=LookupResponse,
+        )
+
+    def to_request(self, body):
+        """Build a LookupRequest; "term" is required, "kind" is optional."""
+        term = body["term"]
+        kind = body.get("kind")
+        return LookupRequest(term=term, kind=kind)
+
+    def from_response(self, message):
+        """Return the text payload, flagged as the final response."""
+        payload = { "text": message.text }
+        return payload, True
+
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint.py
new file mode 100644
index 00000000..6d6ca8d5
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/endpoint.py
@@ -0,0 +1,69 @@
+
+import asyncio
+from pulsar.schema import JsonSchema
+from aiohttp import web
+import uuid
+import logging
+
+from . publisher import Publisher
+from .
subscriber import Subscriber
+
+logger = logging.getLogger("endpoint")
+logger.setLevel(logging.INFO)
+
+class ServiceEndpoint:
+    """
+    HTTP POST endpoint which hands the JSON request body to a requestor
+    and returns the requestor's result as a JSON response.
+    """
+
+    def __init__(self, endpoint_path, auth, requestor):
+
+        self.path = endpoint_path
+
+        self.auth = auth
+        self.operation = "service"
+
+        self.requestor = requestor
+
+    async def start(self):
+        await self.requestor.start()
+
+    def add_routes(self, app):
+
+        app.add_routes([
+            web.post(self.path, self.handle),
+        ])
+
+    async def handle(self, request):
+        """
+        Authenticate the caller from the "Authorization: Bearer <token>"
+        header, then run the JSON body through the requestor.  Failures
+        while processing are returned as {"error": ...}.
+        """
+
+        print(request.path, "...")
+
+        try:
+            ht = request.headers["Authorization"]
+            tokens = ht.split(" ", 2)
+            if tokens[0] != "Bearer":
+                return web.HTTPUnauthorized()
+            token = tokens[1]
+        except (KeyError, IndexError):
+            # No header, or a scheme with no credential after it: treat as
+            # an empty token and let the authenticator decide
+            token = ""
+
+        if not self.auth.permitted(token, self.operation):
+            return web.HTTPUnauthorized()
+
+        try:
+
+            data = await request.json()
+
+            print(data)
+
+            # ServiceRequestor.process() awaits its responder, so this has
+            # to be a coroutine function
+            async def responder(x, fin):
+                print(x)
+
+            # process() returns a single response object, not a pair
+            resp = await self.requestor.process(data, responder)
+
+            return web.json_response(resp)
+
+        except Exception as e:
+            logger.error(f"Exception: {e}")
+
+            return web.json_response(
+                { "error": str(e) }
+            )
+
diff --git a/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py b/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py
new file mode 100644
index 00000000..18a2e6fe
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/graph_embeddings_load.py
@@ -0,0 +1,60 @@
+
+import asyncio
+from pulsar.schema import JsonSchema
+import uuid
+from aiohttp import WSMsgType
+
+from .. schema import Metadata
+from .. schema import GraphEmbeddings
+from .. schema import graph_embeddings_store_queue
+
+from . publisher import Publisher
+from . socket import SocketEndpoint
+from .
serialize import to_subgraph, to_value
+
+class GraphEmbeddingsLoadEndpoint(SocketEndpoint):
+    """
+    Websocket endpoint which accepts graph-embeddings messages as JSON and
+    publishes them to the graph embeddings store queue.
+    """
+
+    def __init__(
+            self, pulsar_host, auth, path="/api/v1/load/graph-embeddings",
+    ):
+
+        super(GraphEmbeddingsLoadEndpoint, self).__init__(
+            endpoint_path=path, auth=auth,
+        )
+
+        self.pulsar_host=pulsar_host
+
+        self.publisher = Publisher(
+            self.pulsar_host, graph_embeddings_store_queue,
+            schema=JsonSchema(GraphEmbeddings)
+        )
+
+    async def start(self):
+
+        self.publisher.start()
+
+    async def listener(self, ws, running):
+        """
+        Read messages from the socket until it closes or reports an error,
+        converting each one to a GraphEmbeddings record and publishing it.
+        """
+
+        async for msg in ws:
+
+            # On error, finish
+            if msg.type == WSMsgType.ERROR:
+                break
+
+            data = msg.json()
+
+            elt = GraphEmbeddings(
+                metadata=Metadata(
+                    id=data["metadata"]["id"],
+                    metadata=to_subgraph(data["metadata"]["metadata"]),
+                    user=data["metadata"]["user"],
+                    collection=data["metadata"]["collection"],
+                ),
+                entity=to_value(data["entity"]),
+                vectors=data["vectors"],
+            )
+
+            # Publisher.send() is a blocking put on a bounded queue, so run
+            # it in a worker thread instead of stalling the event loop when
+            # the queue is full
+            await asyncio.to_thread(self.publisher.send, None, elt)
+
+        running.stop()
diff --git a/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py b/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py
new file mode 100644
index 00000000..f0b4dd86
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/graph_embeddings_stream.py
@@ -0,0 +1,57 @@
+
+import asyncio
+import queue
+from pulsar.schema import JsonSchema
+import uuid
+
+from .. schema import GraphEmbeddings
+from .. schema import graph_embeddings_store_queue
+
+from . subscriber import Subscriber
+from . socket import SocketEndpoint
+from .
serialize import serialize_graph_embeddings
+
+class GraphEmbeddingsStreamEndpoint(SocketEndpoint):
+    """
+    Websocket endpoint which forwards every message seen on the graph
+    embeddings store queue to the connected client as JSON.
+    """
+
+    def __init__(
+            self, pulsar_host, auth, path="/api/v1/stream/graph-embeddings"
+    ):
+
+        super().__init__(endpoint_path=path, auth=auth)
+
+        self.pulsar_host = pulsar_host
+
+        self.subscriber = Subscriber(
+            self.pulsar_host, graph_embeddings_store_queue,
+            "api-gateway", "api-gateway",
+            schema=JsonSchema(GraphEmbeddings)
+        )
+
+    async def start(self):
+        self.subscriber.start()
+
+    async def async_thread(self, ws, running):
+        """Pump messages from the subscriber to ws until told to stop."""
+
+        sub_id = str(uuid.uuid4())
+        inbox = self.subscriber.subscribe_all(sub_id)
+
+        while running.get():
+
+            try:
+                # Short timeout so the running flag is re-checked regularly
+                item = await asyncio.to_thread(inbox.get, timeout=0.5)
+                await ws.send_json(serialize_graph_embeddings(item))
+
+            except queue.Empty:
+                continue
+
+            except Exception as e:
+                print(f"Exception: {str(e)}", flush=True)
+                break
+
+        self.subscriber.unsubscribe_all(sub_id)
+
+        running.stop()
+
diff --git a/trustgraph-flow/trustgraph/gateway/graph_rag.py b/trustgraph-flow/trustgraph/gateway/graph_rag.py
new file mode 100644
index 00000000..55fd5d2f
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/graph_rag.py
@@ -0,0 +1,30 @@
+
+from .. schema import GraphRagQuery, GraphRagResponse
+from .. schema import graph_rag_request_queue
+from .. schema import graph_rag_response_queue
+
+from . endpoint import ServiceEndpoint
+from .
requestor import ServiceRequestor
+
+class GraphRagRequestor(ServiceRequestor):
+    """Requestor bridging gateway calls onto the graph RAG queues."""
+
+    def __init__(self, pulsar_host, timeout, auth):
+
+        # auth is part of the common constructor signature; unused here
+        super().__init__(
+            pulsar_host=pulsar_host,
+            timeout=timeout,
+            request_queue=graph_rag_request_queue,
+            request_schema=GraphRagQuery,
+            response_queue=graph_rag_response_queue,
+            response_schema=GraphRagResponse,
+        )
+
+    def to_request(self, body):
+        """
+        Build a GraphRagQuery.  "query" is required; "user" and
+        "collection" fall back to "trustgraph" and "default".
+        """
+        query = body["query"]
+        user = body.get("user", "trustgraph")
+        collection = body.get("collection", "default")
+        return GraphRagQuery(query=query, user=user, collection=collection)
+
+    def from_response(self, message):
+        """Return the response payload, flagged as the final response."""
+        payload = { "response": message.response }
+        return payload, True
+
diff --git a/trustgraph-flow/trustgraph/gateway/internet_search.py b/trustgraph-flow/trustgraph/gateway/internet_search.py
new file mode 100644
index 00000000..127cd5d1
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/internet_search.py
@@ -0,0 +1,29 @@
+
+from .. schema import LookupRequest, LookupResponse
+from .. schema import internet_search_request_queue
+from .. schema import internet_search_response_queue
+
+from . endpoint import ServiceEndpoint
+from .
requestor import ServiceRequestor
+
+class InternetSearchRequestor(ServiceRequestor):
+    """Requestor bridging gateway calls onto the internet search queues."""
+
+    def __init__(self, pulsar_host, timeout, auth):
+
+        # auth is part of the common constructor signature; unused here
+        super(InternetSearchRequestor, self).__init__(
+            pulsar_host=pulsar_host,
+            request_queue=internet_search_request_queue,
+            response_queue=internet_search_response_queue,
+            request_schema=LookupRequest,
+            response_schema=LookupResponse,
+            timeout=timeout,
+        )
+
+    def to_request(self, body):
+        """Build a LookupRequest; "term" is required, "kind" is optional."""
+        return LookupRequest(
+            term=body["term"],
+            kind=body.get("kind", None),
+        )
+
+    def from_response(self, message):
+        """Return the text payload, flagged as the final response."""
+        return { "text": message.text }, True
+
diff --git a/trustgraph-flow/trustgraph/gateway/mux.py b/trustgraph-flow/trustgraph/gateway/mux.py
new file mode 100644
index 00000000..cd5ddfba
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/mux.py
@@ -0,0 +1,113 @@
+
+import asyncio
+import queue
+from pulsar.schema import JsonSchema
+import uuid
+from aiohttp import web, WSMsgType
+
+from . socket import SocketEndpoint
+from . text_completion import TextCompletionRequestor
+
+class MuxEndpoint(SocketEndpoint):
+    """
+    Websocket endpoint multiplexing requests for several services over a
+    single socket.  Each incoming JSON message carries "service", "id" and
+    "request"; the id is echoed on every response to that request.
+    """
+
+    def __init__(
+            self, pulsar_host, auth,
+            services,
+            path="/api/v1/mux",
+    ):
+
+        super(MuxEndpoint, self).__init__(
+            endpoint_path=path, auth=auth,
+        )
+
+        # NOTE(review): this queue is created once per endpoint, not per
+        # connection, so concurrent sockets appear to share it - confirm
+        # that a request cannot be answered on another client's socket
+        self.q = asyncio.Queue(maxsize=10)
+
+        self.services = services
+
+    async def start(self):
+        pass
+
+    async def async_thread(self, ws, running):
+        """Worker: dequeue requests and stream their responses to ws."""
+
+        while running.get():
+
+            try:
+                id, svc, request = await asyncio.wait_for(self.q.get(), 1)
+            except asyncio.TimeoutError:
+                # Nothing queued: go round again to re-check the running
+                # flag.  The builtin TimeoutError only matches what
+                # wait_for() raises from Python 3.11 onwards.
+                continue
+            except Exception as e:
+                # Nothing was dequeued, so there is no request id to
+                # report against and nothing to process
+                await ws.send_json({"error": str(e)})
+                continue
+
+            try:
+
+                print(svc, request)
+
+                requestor = self.services[svc]
+
+                async def responder(resp, fin):
+                    await ws.send_json({
+                        "id": id,
+                        "response": resp,
+                        "complete": fin,
+                    })
+
+                # NOTE(review): when process() returns an error dict it has
+                # not called responder, so that error is not sent here -
+                # confirm whether clients should be told
+                resp = await requestor.process(request, responder)
+
+            except Exception as e:
+
+                # Echo the id so the client can correlate the failure
+                await ws.send_json({"id": id, "error": str(e)})
+
+        running.stop()
+
+    async def listener(self, ws, running):
+        """Reader: validate incoming messages and queue them for the worker."""
+
+        async for msg in ws:
+
+            # On error, finish
+            if msg.type == WSMsgType.ERROR:
+                break
+
+            try:
+
+                data = msg.json()
+
+                if data["service"] not in self.services:
+                    raise RuntimeError("Bad service")
+
+                if "request" not in data:
+                    raise RuntimeError("Bad message")
+
+                if "id" not in data:
+                    raise RuntimeError("Bad message")
+
+                await self.q.put(
+                    (data["id"], data["service"], data["request"])
+                )
+
+            except Exception as e:
+
+                await ws.send_json({"error": str(e)})
+                continue
+
+        running.stop()
+
diff --git a/trustgraph-flow/trustgraph/gateway/prompt.py b/trustgraph-flow/trustgraph/gateway/prompt.py
new file mode 100644
index 00000000..080d5618
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/prompt.py
@@ -0,0 +1,45 @@
+
+import json
+
+from .. schema import PromptRequest, PromptResponse
+from .. schema import prompt_request_queue
+from .. schema import prompt_response_queue
+
+from . endpoint import ServiceEndpoint
+from . requestor import ServiceRequestor
+
+class PromptRequestor(ServiceRequestor):
+    """Requestor bridging gateway calls onto the prompt service queues."""
+
+    def __init__(self, pulsar_host, timeout, auth):
+
+        super(PromptRequestor, self).__init__(
+            pulsar_host=pulsar_host,
+            request_queue=prompt_request_queue,
+            response_queue=prompt_response_queue,
+            request_schema=PromptRequest,
+            response_schema=PromptResponse,
+            timeout=timeout,
+        )
+
+    def to_request(self, body):
+        # Each variable is JSON-encoded individually into the terms map
+        return PromptRequest(
+            id=body["id"],
+            terms={
+                k: json.dumps(v)
+                for k, v in body["variables"].items()
+            }
+        )
+
+    def from_response(self, message):
+        # Prefer the object payload when present, else the text payload
+        if message.object:
+            return {
+                "object": message.object
+            }, True
+        else:
+            return {
+                "text": message.text
+            }, True
+
diff --git a/trustgraph-flow/trustgraph/gateway/publisher.py b/trustgraph-flow/trustgraph/gateway/publisher.py
new file mode 100644
index 00000000..89c612ce
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/publisher.py
@@ -0,0 +1,53 @@
+
+import queue
+import time
+import pulsar
+import threading
+
+class Publisher:
+
+    def __init__(self, pulsar_host, topic, schema=None, max_size=10,
+
chunking_enabled=False):
+        self.pulsar_host = pulsar_host
+        self.topic = topic
+        self.schema = schema
+        self.q = queue.Queue(maxsize=max_size)
+        self.chunking_enabled = chunking_enabled
+
+    def start(self):
+        """Start the background thread which runs the publishing loop."""
+        self.task = threading.Thread(target=self.run)
+        self.task.start()
+
+    def run(self):
+        """
+        Publishing loop: connect, then forward queued items to the topic
+        for as long as the connection works.  On any failure the client is
+        closed and the whole sequence is retried after a short sleep.
+        """
+
+        while True:
+
+            client = None
+
+            try:
+
+                client = pulsar.Client(
+                    self.pulsar_host,
+                )
+
+                producer = client.create_producer(
+                    topic=self.topic,
+                    schema=self.schema,
+                    chunking_enabled=self.chunking_enabled,
+                )
+
+                while True:
+
+                    id, item = self.q.get()
+
+                    # Only attach an "id" property when an id was given
+                    if id:
+                        producer.send(item, { "id": id })
+                    else:
+                        producer.send(item)
+
+            except Exception as e:
+                print("Exception:", e, flush=True)
+
+            finally:
+                # Release the failed connection before the next attempt so
+                # that retries do not accumulate open clients
+                if client is not None:
+                    try:
+                        client.close()
+                    except Exception as e:
+                        print("Exception:", e, flush=True)
+
+            # If handler drops out, sleep a retry
+            time.sleep(2)
+
+    def send(self, id, msg):
+        """Queue (id, msg) for publishing; blocks while the queue is full."""
+        self.q.put((id, msg))
diff --git a/trustgraph-flow/trustgraph/gateway/requestor.py b/trustgraph-flow/trustgraph/gateway/requestor.py
new file mode 100644
index 00000000..5f6e2692
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/requestor.py
@@ -0,0 +1,88 @@
+
+import asyncio
+from pulsar.schema import JsonSchema
+import uuid
+import logging
+
+from . publisher import Publisher
+from .
subscriber import Subscriber
+
+logger = logging.getLogger("requestor")
+logger.setLevel(logging.INFO)
+
+class ServiceRequestor:
+    """
+    Base class for request/response exchanges over a pair of Pulsar
+    queues.  Subclasses supply to_request() and from_response().
+    """
+
+    def __init__(
+            self,
+            pulsar_host,
+            request_queue, request_schema,
+            response_queue, response_schema,
+            subscription="api-gateway", consumer_name="api-gateway",
+            timeout=600,
+    ):
+
+        self.pub = Publisher(
+            pulsar_host, request_queue,
+            schema=JsonSchema(request_schema)
+        )
+
+        self.sub = Subscriber(
+            pulsar_host, response_queue,
+            subscription, consumer_name,
+            JsonSchema(response_schema)
+        )
+
+        self.timeout = timeout
+
+    async def start(self):
+
+        self.pub.start()
+        self.sub.start()
+
+    def to_request(self, request):
+        """Convert a decoded JSON body to a request schema object."""
+        raise RuntimeError("Not defined")
+
+    def from_response(self, response):
+        """Convert a response message to (JSON-able dict, done flag)."""
+        raise RuntimeError("Not defined")
+
+    async def process(self, request, responder=None):
+        """
+        Send one request and collect its responses.
+
+        Every converted response is passed to `responder` (a coroutine
+        function taking the response dict and the done flag) when one is
+        given.  Returns the final response dict, or {"error": message} if
+        the service reported an error, no response arrived within the
+        timeout, or anything else failed.
+        """
+
+        # Imported here because this module does not import queue itself.
+        # Assumes subscribe() hands back a queue.Queue - TODO confirm
+        import queue
+
+        id = str(uuid.uuid4())
+
+        try:
+
+            q = self.sub.subscribe(id)
+
+            await asyncio.to_thread(
+                self.pub.send, id, self.to_request(request)
+            )
+
+            while True:
+
+                try:
+                    resp = await asyncio.to_thread(q.get, timeout=self.timeout)
+                except queue.Empty:
+                    # Only an expired wait is a timeout; any other failure
+                    # is reported with its own message by the handler below
+                    raise RuntimeError("Timeout")
+
+                if resp.error:
+                    return { "error": resp.error.message }
+
+                resp, fin = self.from_response(resp)
+
+                print(resp, fin)
+
+                if responder:
+                    await responder(resp, fin)
+
+                if fin:
+                    return resp
+
+        except Exception as e:
+
+            logger.error(f"Exception: {e}")
+
+            return { "error": str(e) }
+
+        finally:
+            self.sub.unsubscribe(id)
+
diff --git a/trustgraph-flow/trustgraph/gateway/running.py b/trustgraph-flow/trustgraph/gateway/running.py
new file mode 100644
index 00000000..e6a91e66
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/running.py
@@ -0,0 +1,6 @@
+
+class Running:
+    """Simple stop flag: get() is true until stop() has been called."""
+    def __init__(self): self.running = True
+    def get(self): return self.running
+    def stop(self): self.running = False
diff --git a/trustgraph-flow/trustgraph/gateway/serialize.py b/trustgraph-flow/trustgraph/gateway/serialize.py
new file mode 100644
index 00000000..35932382
--- /dev/null
+++
b/trustgraph-flow/trustgraph/gateway/serialize.py
@@ -0,0 +1,58 @@
+from .. schema import Value, Triple
+
+def to_value(x):
+    """Build a Value from its wire form: {"v": value, "e": is-URI flag}."""
+    return Value(value=x["v"], is_uri=x["e"])
+
+def to_subgraph(x):
+    """Build a list of Triples from a list of {"s", "p", "o"} wire dicts."""
+    triples = []
+    for t in x:
+        s, p, o = to_value(t["s"]), to_value(t["p"]), to_value(t["o"])
+        triples.append(Triple(s=s, p=p, o=o))
+    return triples
+
+def serialize_value(v):
+    """Wire form of a Value: {"v": value, "e": is-URI flag}."""
+    return dict(v=v.value, e=v.is_uri)
+
+def serialize_triple(t):
+    """Wire form of a Triple: its s, p and o values, each serialized."""
+    return {
+        part: serialize_value(getattr(t, part))
+        for part in ("s", "p", "o")
+    }
+
+def serialize_subgraph(sg):
+    """Wire form of an iterable of Triples, as a list."""
+    return list(map(serialize_triple, sg))
+
+def serialize_triples(message):
+    """Wire form of a triples message: its metadata plus its triples."""
+    md = message.metadata
+    envelope = {
+        "id": md.id,
+        "metadata": serialize_subgraph(md.metadata),
+        "user": md.user,
+        "collection": md.collection,
+    }
+    return {
+        "metadata": envelope,
+        "triples": serialize_subgraph(message.triples),
+    }
+
+def serialize_graph_embeddings(message):
+    """Wire form of a graph-embeddings message: metadata, vectors, entity."""
+    md = message.metadata
+    envelope = {
+        "id": md.id,
+        "metadata": serialize_subgraph(md.metadata),
+        "user": md.user,
+        "collection": md.collection,
+    }
+    return {
+        "metadata": envelope,
+        "vectors": message.vectors,
+        "entity": serialize_value(message.entity),
+    }
+
diff --git a/trustgraph-flow/trustgraph/gateway/service.py b/trustgraph-flow/trustgraph/gateway/service.py
new file mode 100755
index 00000000..6a8a62eb
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/service.py
@@ -0,0 +1,364 @@
+"""
+API gateway. Offers HTTP services which are translated to interaction on the
+Pulsar bus.
+""" + +module = ".".join(__name__.split(".")[1:-1]) + +# FIXME: Subscribes to Pulsar unnecessarily, should only do it when there +# are active listeners + +# FIXME: Connection errors in publishers / subscribers cause those threads +# to fail and are not failed or retried + +import asyncio +import argparse +from aiohttp import web +import logging +import os +import base64 + +import pulsar +from pulsar.schema import JsonSchema +from prometheus_client import start_http_server + +from .. log_level import LogLevel + +from .. schema import Metadata, Document, TextDocument +from .. schema import document_ingest_queue, text_ingest_queue + +from . serialize import to_subgraph +from . running import Running +from . publisher import Publisher +from . subscriber import Subscriber +from . text_completion import TextCompletionRequestor +from . prompt import PromptRequestor +from . graph_rag import GraphRagRequestor +from . triples_query import TriplesQueryRequestor +from . embeddings import EmbeddingsRequestor +from . encyclopedia import EncyclopediaRequestor +from . agent import AgentRequestor +from . dbpedia import DbpediaRequestor +from . internet_search import InternetSearchRequestor +from . triples_stream import TriplesStreamEndpoint +from . graph_embeddings_stream import GraphEmbeddingsStreamEndpoint +from . triples_load import TriplesLoadEndpoint +from . graph_embeddings_load import GraphEmbeddingsLoadEndpoint +from . mux import MuxEndpoint + +from . endpoint import ServiceEndpoint +from . 
auth import Authenticator
+
+logger = logging.getLogger("api")
+logger.setLevel(logging.INFO)
+
+default_pulsar_host = os.getenv("PULSAR_HOST", "pulsar://pulsar:6650")
+default_timeout = 600
+default_port = 8088
+default_api_token = os.getenv("GATEWAY_SECRET", "")
+
+class Api:
+    """
+    The gateway application: builds the aiohttp app, one requestor per
+    service, and the HTTP / websocket endpoints in front of them.
+    """
+
+    def __init__(self, **config):
+
+        self.app = web.Application(
+            middlewares=[],
+            client_max_size=256 * 1024 * 1024
+        )
+
+        self.port = int(config.get("port", default_port))
+        self.timeout = int(config.get("timeout", default_timeout))
+        self.pulsar_host = config.get("pulsar_host", default_pulsar_host)
+
+        api_token = config.get("api_token", default_api_token)
+
+        # Token not set, or token equal empty string means no auth
+        if api_token:
+            self.auth = Authenticator(token=api_token)
+        else:
+            self.auth = Authenticator(allow_all=True)
+
+        # Service name -> requestor class.  The name is the key used by
+        # the mux endpoint and also the last element of the HTTP path.
+        requestors = {
+            "text-completion": TextCompletionRequestor,
+            "prompt": PromptRequestor,
+            "graph-rag": GraphRagRequestor,
+            "triples-query": TriplesQueryRequestor,
+            "embeddings": EmbeddingsRequestor,
+            "agent": AgentRequestor,
+            "encyclopedia": EncyclopediaRequestor,
+            "dbpedia": DbpediaRequestor,
+            "internet-search": InternetSearchRequestor,
+        }
+
+        self.services = {
+            name: cls(
+                pulsar_host=self.pulsar_host, timeout=self.timeout,
+                auth = self.auth,
+            )
+            for name, cls in requestors.items()
+        }
+
+        # One HTTP endpoint per service, in the same order as above
+        self.endpoints = [
+            ServiceEndpoint(
+                endpoint_path = f"/api/v1/{name}", auth=self.auth,
+                requestor = requestor,
+            )
+            for name, requestor in self.services.items()
+        ]
+
+        # Websocket endpoints
+        self.endpoints += [
+            TriplesStreamEndpoint(
+                pulsar_host=self.pulsar_host,
+                auth = self.auth,
+            ),
+            GraphEmbeddingsStreamEndpoint(
+                pulsar_host=self.pulsar_host,
+                auth = self.auth,
+            ),
+            TriplesLoadEndpoint(
+                pulsar_host=self.pulsar_host,
+                auth = self.auth,
+            ),
+            GraphEmbeddingsLoadEndpoint(
+                pulsar_host=self.pulsar_host,
+                auth = self.auth,
+            ),
+            MuxEndpoint(
+                pulsar_host=self.pulsar_host,
+                auth = self.auth,
+                services = self.services,
+            ),
+        ]
+
+        self.document_out = Publisher(
+            self.pulsar_host, document_ingest_queue,
+            schema=JsonSchema(Document),
+            chunking_enabled=True,
+        )
+
+        self.text_out = Publisher(
+            self.pulsar_host, text_ingest_queue,
+            schema=JsonSchema(TextDocument),
+            chunking_enabled=True,
+        )
+
+        for ep in self.endpoints:
+            ep.add_routes(self.app)
+
+        self.app.add_routes([
+            web.post("/api/v1/load/document", self.load_document),
+            web.post("/api/v1/load/text", self.load_text),
+        ])
+
+    async def load_document(self, request):
+        """
+        Handle POST /api/v1/load/document.  The body's "data" field holds
+        the base64-encoded document; it is queued for ingest and an empty
+        JSON object is returned, or {"error": ...} on failure.
+        """
+
+        try:
+
+            data = await request.json()
+
+            if "metadata" in data:
+                metadata = to_subgraph(data["metadata"])
+            else:
+                metadata = []
+
+            # Doing a base64 decode/encode here to make sure the
+            # content is valid base64
+            doc = base64.b64decode(data["data"])
+
+            # send() can block on a full queue, so keep it off the loop
+            await asyncio.to_thread(
+                self.document_out.send,
+                None,
+                Document(
+                    metadata=Metadata(
+                        id=data.get("id"),
+                        metadata=metadata,
+                        user=data.get("user", "trustgraph"),
+                        collection=data.get("collection", "default"),
+                    ),
+                    data=base64.b64encode(doc).decode("utf-8")
+                )
+            )
+
+            print("Document loaded.")
+
+            return web.json_response(
+                { }
+            )
+
+        except Exception as e:
+            logger.error(f"Exception: {e}")
+
+            return web.json_response(
+                { "error": str(e) }
+            )
+
+    async def load_text(self, request):
+        """
+        Handle POST /api/v1/load/text.  The body's "text" field holds
+        base64-encoded text in the given "charset" (utf-8 when absent);
+        it is decoded and queued for ingest.  Returns an empty JSON
+        object, or {"error": ...} on failure.
+        """
+
+        try:
+
+            data = await request.json()
+
+            if "metadata" in data:
+                metadata = to_subgraph(data["metadata"])
+            else:
+                metadata = []
+
+            if "charset" in data:
+                charset = data["charset"]
+            else:
+                charset = "utf-8"
+
+            # Text is base64 encoded
+            text = base64.b64decode(data["text"]).decode(charset)
+
+            # send() can block on a full queue, so keep it off the loop
+            await asyncio.to_thread(
+                self.text_out.send,
+                None,
+                TextDocument(
+                    metadata=Metadata(
+                        id=data.get("id"),
+                        metadata=metadata,
+                        user=data.get("user", "trustgraph"),
+                        collection=data.get("collection", "default"),
+                    ),
+                    text=text,
+                )
+            )
+
+            print("Text document loaded.")
+
+            return web.json_response(
+                { }
+            )
+
+        except Exception as e:
+            logger.error(f"Exception: {e}")
+
+            return web.json_response(
+                { "error": str(e) }
+            )
+
+    async def app_factory(self):
+        """Start every endpoint and publisher, then hand back the app."""
+
+        for ep in self.endpoints:
+            await ep.start()
+
+        self.document_out.start()
+        self.text_out.start()
+
+        return self.app
+
+    def run(self):
+        web.run_app(self.app_factory(), port=self.port)
+
+def run():
+    """Command-line entry point: parse arguments and run the gateway."""
+
+    parser = argparse.ArgumentParser(
+        prog="api-gateway",
+        description=__doc__
+    )
+
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
+
+    parser.add_argument(
+        '--port',
+        type=int,
+        default=default_port,
+        help=f'Port number to listen on (default: {default_port})',
+    )
+
+    parser.add_argument(
+        '--timeout',
+        type=int,
+        default=default_timeout,
+        help=f'API request timeout in seconds (default: {default_timeout})',
+    )
+
+    parser.add_argument(
+        '--api-token',
+        default=default_api_token,
+        help=f'Secret API token (default: no auth)',
+    )
+
+    parser.add_argument(
+        '-l', '--log-level',
+        type=LogLevel,
+        default=LogLevel.INFO,
+        choices=list(LogLevel),
+        help=f'Log level (default: info)'
+    )
+
+    parser.add_argument(
+        '--metrics',
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=f'Metrics enabled (default: true)',
+    )
+
+    parser.add_argument(
+        '-P', '--metrics-port',
+        type=int,
+        default=8000,
+        help=f'Prometheus metrics port (default: 8000)',
+    )
+
+    args = parser.parse_args()
+    args = vars(args)
+
+    if args["metrics"]:
+        start_http_server(args["metrics_port"])
+
+    a = Api(**args)
+    a.run()
+
diff --git a/trustgraph-flow/trustgraph/gateway/socket.py b/trustgraph-flow/trustgraph/gateway/socket.py
new file mode 100644
index 00000000..869792b7
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/socket.py
@@ -0,0 +1,84 @@
+
+import asyncio
+from aiohttp import web, WSMsgType
+import logging
+
+from .
class SocketEndpoint:
    """Base class for websocket endpoints served by the gateway.

    Each connection gets a listener (incoming messages) and a background
    task (async_thread).  Subclasses override one or both; the versions
    here ignore incoming messages and idle in the background.
    """

    def __init__(
            self, endpoint_path, auth,
    ):

        self.path = endpoint_path
        self.auth = auth
        # Operation name passed to the authenticator for permission checks
        self.operation = "socket"

    async def listener(self, ws, running):
        """Consume incoming websocket messages until the stream ends or an
        error message arrives, then signal the background task to stop."""

        async for msg in ws:
            # On error, finish.  Any other incoming message is ignored.
            if msg.type == WSMsgType.ERROR:
                break

        running.stop()

    async def async_thread(self, ws, running):
        """Background task for the connection; idles until `running` is
        stopped."""

        while running.get():
            try:
                await asyncio.sleep(1)

            except TimeoutError:
                continue

            except Exception as e:
                print(f"Exception: {str(e)}", flush=True)

    async def handle(self, request):
        """Authenticate the request, upgrade it to a websocket and run the
        listener and background task until the connection finishes.

        Returns an HTTPUnauthorized response when the token is not
        permitted, otherwise the websocket response object.
        """

        # A missing token is treated as the empty token.  An explicit
        # default lookup replaces the previous bare `except:`, which would
        # also have hidden unrelated errors.
        token = request.query.get("token", "")

        if not self.auth.permitted(token, self.operation):
            return web.HTTPUnauthorized()

        running = Running()
        ws = web.WebSocketResponse()
        await ws.prepare(request)

        task = asyncio.create_task(self.async_thread(ws, running))

        try:

            await self.listener(ws, running)

        except Exception as e:
            print(e, flush=True)

        # Make sure the background task is told to stop even when the
        # listener failed before doing so itself
        running.stop()

        await ws.close()

        await task

        return ws

    async def start(self):
        """Start-up hook; nothing to do for the base endpoint."""
        pass

    def add_routes(self, app):
        """Register this endpoint's GET route on the web application."""

        app.add_routes([
            web.get(self.path, self.handle),
        ])
def run(self):
    """Receive loop, executed on the thread created by start().

    Connects to Pulsar, subscribes to the topic and forwards every
    received value to the queue registered under the message's "id"
    property (if any) and to every queue registered with subscribe_all().
    Messages are acknowledged on receipt, before delivery.  If anything
    fails the error is printed, the client is closed, and the connection
    is retried after a two second pause.
    """

    while True:

        client = None

        try:

            client = pulsar.Client(
                self.pulsar_host,
            )

            consumer = client.subscribe(
                topic=self.topic,
                subscription_name=self.subscription,
                consumer_name=self.consumer_name,
                schema=self.schema,
            )

            while True:

                msg = consumer.receive()

                # Acknowledge successful reception of the message
                consumer.acknowledge(msg)

                # Messages without an "id" property are only delivered
                # to the subscribe_all() queues
                id = msg.properties().get("id")

                value = msg.value()

                with self.lock:

                    if id in self.q:
                        try:
                            self.q[id].put(value, timeout=0.5)
                        except queue.Full:
                            # Slow consumer: drop the value rather than
                            # stall the receive loop
                            pass

                    for q in self.full.values():
                        try:
                            q.put(value, timeout=0.5)
                        except queue.Full:
                            pass

        except Exception as e:
            print("Exception:", e, flush=True)

        finally:
            # Release the connection before retrying; previously every
            # retry created a new client and left the old one open.
            if client is not None:
                try:
                    client.close()
                except Exception as e:
                    print("Exception:", e, flush=True)

        # If handler drops out, sleep a retry
        time.sleep(2)
class TextCompletionRequestor(ServiceRequestor):
    """Requestor that maps gateway JSON bodies onto the text-completion
    request/response queues."""

    def __init__(self, pulsar_host, timeout, auth):
        # `auth` is accepted but not used by this requestor
        settings = dict(
            pulsar_host=pulsar_host,
            request_queue=text_completion_request_queue,
            response_queue=text_completion_response_queue,
            request_schema=TextCompletionRequest,
            response_schema=TextCompletionResponse,
            timeout=timeout,
        )
        super(TextCompletionRequestor, self).__init__(**settings)

    def to_request(self, body):
        """Build the request from the "system" and "prompt" keys of the
        body; both are required."""
        system_text = body["system"]
        prompt_text = body["prompt"]
        return TextCompletionRequest(system=system_text, prompt=prompt_text)

    def from_response(self, message):
        """Return the (payload, flag) pair for a response message; the flag
        is always True."""
        payload = {"response": message.response}
        return payload, True
class TriplesLoadEndpoint(SocketEndpoint):
    """Websocket endpoint that accepts triples as JSON messages and
    publishes them to the triples store queue."""

    def __init__(self, pulsar_host, auth, path="/api/v1/load/triples"):

        super(TriplesLoadEndpoint, self).__init__(
            endpoint_path=path, auth=auth,
        )

        self.pulsar_host=pulsar_host

        self.publisher = Publisher(
            self.pulsar_host, triples_store_queue,
            schema=JsonSchema(Triples)
        )

    async def start(self):
        """Start the publisher."""

        self.publisher.start()

    async def listener(self, ws, running):
        """Convert each incoming JSON message into a Triples record and
        publish it, until the stream ends or an error message arrives.

        Each message must carry metadata.id, metadata.metadata,
        metadata.user, metadata.collection and triples.
        """

        async for msg in ws:

            # On error, finish
            if msg.type == WSMsgType.ERROR:
                break

            data = msg.json()

            elt = Triples(
                metadata=Metadata(
                    id=data["metadata"]["id"],
                    metadata=to_subgraph(data["metadata"]["metadata"]),
                    user=data["metadata"]["user"],
                    collection=data["metadata"]["collection"],
                ),
                triples=to_subgraph(data["triples"]),
            )

            # Hand the send to a worker thread, as the gateway's HTTP
            # load handlers do, so the event loop is not held up.
            # NOTE(review): assumes Publisher.send can block — confirm.
            await asyncio.to_thread(self.publisher.send, None, elt)

        running.stop()
class TriplesQueryRequestor(ServiceRequestor):
    """Requestor that maps gateway JSON bodies onto the triples query
    request/response queues."""

    def __init__(self, pulsar_host, timeout, auth):
        # `auth` is accepted but not used by this requestor
        settings = dict(
            pulsar_host=pulsar_host,
            request_queue=triples_request_queue,
            response_queue=triples_response_queue,
            request_schema=TriplesQueryRequest,
            response_schema=TriplesQueryResponse,
            timeout=timeout,
        )
        super(TriplesQueryRequestor, self).__init__(**settings)

    def to_request(self, body):
        """Build a query from the optional "s", "p", "o" keys (absent keys
        become None), plus "limit", "user" and "collection" with
        defaults."""

        subject = to_value(body["s"]) if "s" in body else None
        predicate = to_value(body["p"]) if "p" in body else None
        obj = to_value(body["o"]) if "o" in body else None

        max_results = int(body.get("limit", 10000))

        return TriplesQueryRequest(
            s=subject, p=predicate, o=obj,
            limit=max_results,
            user=body.get("user", "trustgraph"),
            collection=body.get("collection", "default"),
        )

    def from_response(self, message):
        """Print the message, then return the serialized triples and a
        flag that is always True."""
        print(message)
        payload = {"response": serialize_subgraph(message.triples)}
        return payload, True
class TriplesStreamEndpoint(SocketEndpoint):
    """Websocket endpoint that streams every Triples message seen on the
    triples store queue to the connected client."""

    def __init__(self, pulsar_host, auth, path="/api/v1/stream/triples"):

        super(TriplesStreamEndpoint, self).__init__(
            endpoint_path=path, auth=auth,
        )

        self.pulsar_host=pulsar_host

        self.subscriber = Subscriber(
            self.pulsar_host, triples_store_queue,
            "api-gateway", "api-gateway",
            schema=JsonSchema(Triples)
        )

    async def start(self):
        """Start the shared subscriber."""

        self.subscriber.start()

    async def async_thread(self, ws, running):
        """Forward messages from a per-connection queue to the websocket
        until `running` is stopped or sending fails."""

        id = str(uuid.uuid4())

        q = self.subscriber.subscribe_all(id)

        try:

            while running.get():
                try:
                    # Poll with a short timeout so the running flag is
                    # re-checked regularly
                    resp = await asyncio.to_thread(q.get, timeout=0.5)
                    await ws.send_json(serialize_triples(resp))

                except queue.Empty:
                    continue

                except Exception as e:
                    print(f"Exception: {str(e)}", flush=True)
                    break

        finally:
            # Always release the subscription, including when the task is
            # cancelled; otherwise the queue stays registered with the
            # subscriber after the connection has gone.
            self.subscriber.unsubscribe_all(id)

            running.stop()
class Processor(ConsumerProducer):
    """Document embeddings query processor backed by Pinecone.

    For each request vector, queries the index named
    "d-<user>-<dimension>" in the namespace given by the request's
    collection, and returns the "doc" metadata of the matches.
    """

    def __init__(self, **params):

        input_queue = params.get("input_queue", default_input_queue)
        output_queue = params.get("output_queue", default_output_queue)
        subscriber = params.get("subscriber", default_subscriber)

        self.url = params.get("url", None)
        self.api_key = params.get("api_key", default_api_key)

        # An explicit URL selects the gRPC client against that host;
        # otherwise the default client is used.
        if self.url:

            self.pinecone = PineconeGRPC(
                api_key = self.api_key,
                host = self.url
            )

        else:

            self.pinecone = Pinecone(api_key = self.api_key)

        super(Processor, self).__init__(
            **params | {
                "input_queue": input_queue,
                "output_queue": output_queue,
                "subscriber": subscriber,
                "input_schema": DocumentEmbeddingsRequest,
                "output_schema": DocumentEmbeddingsResponse,
                "url": self.url,
            }
        )

    def handle(self, msg):
        """Answer one embeddings query.

        Sends a DocumentEmbeddingsResponse carrying the matching documents,
        or an error response if anything fails, tagged with the request's
        "id" property.  The message is acknowledged afterwards.
        """

        try:

            v = msg.value()

            # Sender-produced ID
            id = msg.properties()["id"]

            print(f"Handling input {id}...", flush=True)

            chunks = []

            for vec in v.vectors:

                dim = len(vec)

                index_name = (
                    "d-" + v.user + "-" + str(dim)
                )

                index = self.pinecone.Index(index_name)

                results = index.query(
                    namespace=v.collection,
                    vector=vec,
                    top_k=v.limit,
                    include_values=False,
                    include_metadata=True
                )

                # A second query through self.client.query_points(...),
                # left over from the Qdrant implementation, was removed:
                # neither self.client nor `collection` exists here, so it
                # failed on every request.

                for r in results.matches:
                    doc = r.metadata["doc"]
                    # chunks is a list; the previous chunks.add() raised
                    chunks.append(doc)

            print("Send response...", flush=True)
            r = DocumentEmbeddingsResponse(documents=chunks, error=None)
            self.producer.send(r, properties={"id": id})

            print("Done.", flush=True)

        except Exception as e:

            print(f"Exception: {e}")

            print("Send error response...", flush=True)

            r = DocumentEmbeddingsResponse(
                error=Error(
                    type = "llm-error",
                    message = str(e),
                ),
                documents=None,
            )

            self.producer.send(r, properties={"id": id})

        self.consumer.acknowledge(msg)

    @staticmethod
    def add_args(parser):
        """Register the queue arguments plus the Pinecone API key and URL
        options on the parser."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-a', '--api-key',
            default=default_api_key,
            help='Pinecone API key. (default from PINECONE_API_KEY)'
        )

        parser.add_argument(
            '-u', '--url',
            help='Pinecone URL.  If unspecified, serverless is used'
        )
class Processor(ConsumerProducer):
    """Graph embeddings query processor backed by Pinecone.

    For each request vector, queries the index named
    "t-<user>-<dimension>" in the namespace given by the request's
    collection, and returns the distinct "entity" metadata values of the
    matches.
    """

    def __init__(self, **params):

        self.url = params.get("url", None)
        self.api_key = params.get("api_key", default_api_key)

        # An explicit URL selects the gRPC client against that host
        if self.url:
            self.pinecone = PineconeGRPC(
                api_key=self.api_key,
                host=self.url,
            )
        else:
            self.pinecone = Pinecone(api_key=self.api_key)

        settings = params | {
            "input_queue": params.get("input_queue", default_input_queue),
            "output_queue": params.get("output_queue", default_output_queue),
            "subscriber": params.get("subscriber", default_subscriber),
            "input_schema": GraphEmbeddingsRequest,
            "output_schema": GraphEmbeddingsResponse,
            "url": self.url,
        }

        super(Processor, self).__init__(**settings)

    def create_value(self, ent):
        """Wrap an entity string in a Value, flagging http(s) strings as
        URIs."""
        looks_like_uri = ent.startswith(("http://", "https://"))
        return Value(value=ent, is_uri=looks_like_uri)

    def handle(self, msg):
        """Answer one embeddings query with the matching entities, or with
        an error response on failure, then acknowledge the message."""

        try:

            request = msg.value()

            # Sender-produced ID
            id = msg.properties()["id"]

            print(f"Handling input {id}...", flush=True)

            # A set, so an entity matched by several vectors appears once
            found = set()

            for vec in request.vectors:

                index_name = "t-" + request.user + "-" + str(len(vec))

                matches = self.pinecone.Index(index_name).query(
                    namespace=request.collection,
                    vector=vec,
                    top_k=request.limit,
                    include_values=False,
                    include_metadata=True,
                ).matches

                for match in matches:
                    found.add(match.metadata["entity"])

            entities = [self.create_value(ent) for ent in found]

            print("Send response...", flush=True)
            r = GraphEmbeddingsResponse(entities=entities, error=None)
            self.producer.send(r, properties={"id": id})

            print("Done.", flush=True)

        except Exception as e:

            print(f"Exception: {e}")

            print("Send error response...", flush=True)

            failure = Error(type="llm-error", message=str(e))
            r = GraphEmbeddingsResponse(error=failure, entities=None)

            self.producer.send(r, properties={"id": id})

        self.consumer.acknowledge(msg)

    @staticmethod
    def add_args(parser):
        """Register the queue arguments plus the Pinecone API key and URL
        options on the parser."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-a', '--api-key',
            default=default_api_key,
            help='Pinecone API key. (default from PINECONE_API_KEY)',
        )

        parser.add_argument(
            '-u', '--url',
            help='Pinecone URL.  If unspecified, serverless is used',
        )
class Processor(ConsumerProducer):
    """Triples query processor backed by Memgraph.

    A request carries optional s, p and o values; each one that is present
    constrains the match.  Objects are stored either as Literal nodes
    (matched on `value`) or as Node nodes (matched on `uri`), so every
    request runs two queries, literals first.
    """

    def __init__(self, **params):

        input_queue = params.get("input_queue", default_input_queue)
        output_queue = params.get("output_queue", default_output_queue)
        subscriber = params.get("subscriber", default_subscriber)
        graph_host = params.get("graph_host", default_graph_host)
        username = params.get("username", default_username)
        password = params.get("password", default_password)
        database = params.get("database", default_database)

        super(Processor, self).__init__(
            **params | {
                "input_queue": input_queue,
                "output_queue": output_queue,
                "subscriber": subscriber,
                "input_schema": TriplesQueryRequest,
                "output_schema": TriplesQueryResponse,
                "graph_host": graph_host,
            }
        )

        self.db = database

        self.io = GraphDatabase.driver(graph_host, auth=(username, password))

    def create_value(self, ent):
        """Wrap a string in a Value, flagging http(s) strings as URIs."""

        if ent.startswith("http://") or ent.startswith("https://"):
            return Value(value=ent, is_uri=True)
        else:
            return Value(value=ent, is_uri=False)

    def _select(self, match, returns, limit, **params):
        """Run "MATCH <match> RETURN <returns> LIMIT <limit>" with the
        given query parameters and return one dict per record."""

        records, _summary, _keys = self.io.execute_query(
            f"MATCH {match} RETURN {returns} LIMIT {limit}",
            database_=self.db,
            **params,
        )

        return [rec.data() for rec in records]

    def handle(self, msg):
        """Answer one triples query.

        Sends a TriplesQueryResponse with at most `limit` triples, or an
        error response if anything fails, tagged with the request's "id"
        property.  The message is acknowledged afterwards.
        """

        try:

            v = msg.value()

            # Sender-produced ID
            id = msg.properties()["id"]

            print(f"Handling input {id}...", flush=True)

            triples = []

            if v.s is not None:
                if v.p is not None:
                    if v.o is not None:

                        # SPO: every element is known, so each matching
                        # record just confirms the triple exists

                        for _ in self._select(
                            "(src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Literal {value: $value})",
                            "$src as src", v.limit,
                            src=v.s.value, rel=v.p.value, value=v.o.value,
                        ):
                            triples.append((v.s.value, v.p.value, v.o.value))

                        for _ in self._select(
                            "(src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Node {uri: $uri})",
                            "$src as src", v.limit,
                            src=v.s.value, rel=v.p.value, uri=v.o.value,
                        ):
                            triples.append((v.s.value, v.p.value, v.o.value))

                    else:

                        # SP

                        for row in self._select(
                            "(src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Literal)",
                            "dest.value as dest", v.limit,
                            src=v.s.value, rel=v.p.value,
                        ):
                            triples.append((v.s.value, v.p.value, row["dest"]))

                        for row in self._select(
                            "(src:Node {uri: $src})-[rel:Rel {uri: $rel}]->(dest:Node)",
                            "dest.uri as dest", v.limit,
                            src=v.s.value, rel=v.p.value,
                        ):
                            triples.append((v.s.value, v.p.value, row["dest"]))

                else:

                    if v.o is not None:

                        # SO

                        for row in self._select(
                            "(src:Node {uri: $src})-[rel:Rel]->(dest:Literal {value: $value})",
                            "rel.uri as rel", v.limit,
                            src=v.s.value, value=v.o.value,
                        ):
                            triples.append((v.s.value, row["rel"], v.o.value))

                        for row in self._select(
                            "(src:Node {uri: $src})-[rel:Rel]->(dest:Node {uri: $uri})",
                            "rel.uri as rel", v.limit,
                            src=v.s.value, uri=v.o.value,
                        ):
                            triples.append((v.s.value, row["rel"], v.o.value))

                    else:

                        # S

                        for row in self._select(
                            "(src:Node {uri: $src})-[rel:Rel]->(dest:Literal)",
                            "rel.uri as rel, dest.value as dest", v.limit,
                            src=v.s.value,
                        ):
                            triples.append((v.s.value, row["rel"], row["dest"]))

                        for row in self._select(
                            "(src:Node {uri: $src})-[rel:Rel]->(dest:Node)",
                            "rel.uri as rel, dest.uri as dest", v.limit,
                            src=v.s.value,
                        ):
                            triples.append((v.s.value, row["rel"], row["dest"]))

            else:

                if v.p is not None:

                    if v.o is not None:

                        # PO

                        for row in self._select(
                            "(src:Node)-[rel:Rel {uri: $uri}]->(dest:Literal {value: $value})",
                            "src.uri as src", v.limit,
                            uri=v.p.value, value=v.o.value,
                        ):
                            triples.append((row["src"], v.p.value, v.o.value))

                        # The object node must be matched on its own
                        # parameter.  This query used to bind $uri for both
                        # the predicate and the object, so it looked for an
                        # object whose URI equals the predicate's.
                        for row in self._select(
                            "(src:Node)-[rel:Rel {uri: $uri}]->(dest:Node {uri: $dest})",
                            "src.uri as src", v.limit,
                            uri=v.p.value, dest=v.o.value,
                        ):
                            triples.append((row["src"], v.p.value, v.o.value))

                    else:

                        # P

                        for row in self._select(
                            "(src:Node)-[rel:Rel {uri: $uri}]->(dest:Literal)",
                            "src.uri as src, dest.value as dest", v.limit,
                            uri=v.p.value,
                        ):
                            triples.append((row["src"], v.p.value, row["dest"]))

                        for row in self._select(
                            "(src:Node)-[rel:Rel {uri: $uri}]->(dest:Node)",
                            "src.uri as src, dest.uri as dest", v.limit,
                            uri=v.p.value,
                        ):
                            triples.append((row["src"], v.p.value, row["dest"]))

                else:

                    if v.o is not None:

                        # O

                        for row in self._select(
                            "(src:Node)-[rel:Rel]->(dest:Literal {value: $value})",
                            "src.uri as src, rel.uri as rel", v.limit,
                            value=v.o.value,
                        ):
                            triples.append((row["src"], row["rel"], v.o.value))

                        for row in self._select(
                            "(src:Node)-[rel:Rel]->(dest:Node {uri: $uri})",
                            "src.uri as src, rel.uri as rel", v.limit,
                            uri=v.o.value,
                        ):
                            triples.append((row["src"], row["rel"], v.o.value))

                    else:

                        # *

                        for row in self._select(
                            "(src:Node)-[rel:Rel]->(dest:Literal)",
                            "src.uri as src, rel.uri as rel, dest.value as dest",
                            v.limit,
                        ):
                            triples.append((row["src"], row["rel"], row["dest"]))

                        for row in self._select(
                            "(src:Node)-[rel:Rel]->(dest:Node)",
                            "src.uri as src, rel.uri as rel, dest.uri as dest",
                            v.limit,
                        ):
                            triples.append((row["src"], row["rel"], row["dest"]))

            # Each of the two queries is limited separately, so the
            # combined list is trimmed to the limit again
            triples = [
                Triple(
                    s=self.create_value(t[0]),
                    p=self.create_value(t[1]),
                    o=self.create_value(t[2])
                )
                for t in triples[:v.limit]
            ]

            print("Send response...", flush=True)
            r = TriplesQueryResponse(triples=triples, error=None)
            self.producer.send(r, properties={"id": id})

            print("Done.", flush=True)

        except Exception as e:

            print(f"Exception: {e}")

            print("Send error response...", flush=True)

            # Uses the same `triples` field as the success path; this
            # previously passed `response=None`.
            r = TriplesQueryResponse(
                error=Error(
                    type = "llm-error",
                    message = str(e),
                ),
                triples=None,
            )

            self.producer.send(r, properties={"id": id})

        self.consumer.acknowledge(msg)

    @staticmethod
    def add_args(parser):
        """Register the queue arguments plus the Memgraph connection
        options on the parser."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-g', '--graph-host',
            default=default_graph_host,
            help=f'Graph host (default: {default_graph_host})'
        )

        parser.add_argument(
            '--username',
            default=default_username,
            help=f'Memgraph username (default: {default_username})'
        )

        parser.add_argument(
            '--password',
            default=default_password,
            help=f'Memgraph password (default: {default_password})'
        )

        parser.add_argument(
            '--database',
            default=default_database,
            help=f'Memgraph database (default: {default_database})'
        )
""" from neo4j import GraphDatabase @@ -21,6 +22,7 @@ default_subscriber = module default_graph_host = 'bolt://neo4j:7687' default_username = 'neo4j' default_password = 'password' +default_database = 'neo4j' class Processor(ConsumerProducer): @@ -31,7 +33,8 @@ class Processor(ConsumerProducer): subscriber = params.get("subscriber", default_subscriber) graph_host = params.get("graph_host", default_graph_host) username = params.get("username", default_username) - password = params.get("passowrd", default_password) + password = params.get("password", default_password) + database = params.get("database", default_database) super(Processor, self).__init__( **params | { @@ -44,7 +47,7 @@ class Processor(ConsumerProducer): } ) - self.db = "neo4j" + self.db = database self.io = GraphDatabase.driver(graph_host, auth=(username, password)) @@ -342,6 +345,12 @@ class Processor(ConsumerProducer): help=f'Neo4j password (default: {default_password})' ) + parser.add_argument( + '--database', + default=default_database, + help=f'Neo4j database (default: {default_database})' + ) + def run(): Processor.start(module, __doc__) diff --git a/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__init__.py b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__init__.py new file mode 100644 index 00000000..d891d55f --- /dev/null +++ b/trustgraph-flow/trustgraph/storage/doc_embeddings/pinecone/__init__.py @@ -0,0 +1,3 @@ + +from . 
from pinecone import Pinecone, ServerlessSpec
from pinecone.grpc import PineconeGRPC

# NOTE(review): the qdrant_client imports look like leftovers from the Qdrant
# writer this module was derived from; nothing in this module appears to use
# them -- confirm and remove.
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from qdrant_client.models import Distance, VectorParams

import time
import uuid
import os

from .... schema import ChunkEmbeddings
from .... schema import chunk_embeddings_ingest_queue
from .... log_level import LogLevel
from .... base import Consumer
class Processor(Consumer):
    """Writes document chunk embeddings to Pinecone.

    Each vector of an incoming ChunkEmbeddings message is upserted into the
    index named "d-<user>-<dimension>", in the namespace given by the
    message's collection, with the chunk text stored as "doc" metadata.
    Missing indexes are created on demand.
    """

    def __init__(self, **params):

        input_queue = params.get("input_queue", default_input_queue)
        subscriber = params.get("subscriber", default_subscriber)

        self.url = params.get("url", None)
        self.cloud = params.get("cloud", default_cloud)
        self.region = params.get("region", default_region)
        self.api_key = params.get("api_key", default_api_key)

        if self.api_key is None:
            raise RuntimeError("Pinecone API key must be specified")

        # An explicit URL selects the gRPC client against that host;
        # otherwise the default client is used.
        if self.url:

            self.pinecone = PineconeGRPC(
                api_key = self.api_key,
                host = self.url
            )

        else:

            self.pinecone = Pinecone(api_key = self.api_key)

        super(Processor, self).__init__(
            **params | {
                "input_queue": input_queue,
                "subscriber": subscriber,
                "input_schema": ChunkEmbeddings,
                "url": self.url,
            }
        )

        # Name of the index most recently checked, so the existence check
        # is skipped while consecutive vectors target the same index
        self.last_index_name = None

    def _create_index(self, index_name, dim):
        """Create a serverless cosine index and wait for it to be ready.

        Raises RuntimeError if the index is still not ready after the
        polling loop; any failure is reported and re-raised.
        """

        try:

            self.pinecone.create_index(
                name = index_name,
                dimension = dim,
                metric = "cosine",
                spec = ServerlessSpec(
                    cloud = self.cloud,
                    region = self.region,
                )
            )

            # Poll once a second, up to 1000 times
            for i in range(0, 1000):

                if self.pinecone.describe_index(
                        index_name
                ).status["ready"]:
                    break

                time.sleep(1)

            if not self.pinecone.describe_index(
                    index_name
            ).status["ready"]:
                raise RuntimeError(
                    "Gave up waiting for index creation"
                )

        except Exception as e:
            print("Pinecone index creation failed")
            raise e

        print(f"Index {index_name} created", flush=True)

    def handle(self, msg):
        """Store the vectors of one chunk; empty chunks are skipped."""

        v = msg.value()

        chunk = v.chunk.decode("utf-8")

        if chunk == "": return

        for vec in v.vectors:

            dim = len(vec)

            # This name used to be assigned to `collection` while the code
            # below read the undefined `index_name`
            index_name = (
                "d-" + v.metadata.user + "-" + str(dim)
            )

            if index_name != self.last_index_name:

                if not self.pinecone.has_index(index_name):
                    self._create_index(index_name, dim)

                self.last_index_name = index_name

            index = self.pinecone.Index(index_name)

            records = [
                {
                    # A fresh ID per record; `id` was previously undefined
                    # here, and a shared ID would make vectors of the same
                    # dimension overwrite one another
                    "id": str(uuid.uuid4()),
                    "values": vec,
                    "metadata": { "doc": chunk },
                }
            ]

            index.upsert(
                vectors = records,
                namespace = v.metadata.collection,
            )

    @staticmethod
    def add_args(parser):
        """Register the queue arguments plus the Pinecone API key, URL,
        cloud and region options on the parser."""

        Consumer.add_args(
            parser, default_input_queue, default_subscriber,
        )

        parser.add_argument(
            '-a', '--api-key',
            default=default_api_key,
            help='Pinecone API key. (default from PINECONE_API_KEY)'
        )

        parser.add_argument(
            '-u', '--url',
            help='Pinecone URL.  If unspecified, serverless is used'
        )

        # Closing parentheses added to the two help strings below
        parser.add_argument(
            '--cloud',
            default=default_cloud,
            help=f'Pinecone cloud, (default: {default_cloud})'
        )

        parser.add_argument(
            '--region',
            default=default_region,
            help=f'Pinecone region, (default: {default_region})'
        )
+"""
+
+from pinecone import Pinecone, ServerlessSpec
+from pinecone.grpc import PineconeGRPC, GRPCClientConfig
+
+import time
+import uuid
+import os
+
+from .... schema import GraphEmbeddings
+from .... schema import graph_embeddings_store_queue
+from .... log_level import LogLevel
+from .... base import Consumer
+
+module = ".".join(__name__.split(".")[1:-1])
+
+default_input_queue = graph_embeddings_store_queue
+default_subscriber = module
+default_api_key = os.getenv("PINECONE_API_KEY", "not-specified")
+default_cloud = "aws"
+default_region = "us-east-1"
+
+class Processor(Consumer):
+    """Consumer which stores graph entity embeddings in Pinecone."""
+
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+
+        self.url = params.get("url", None)
+        self.cloud = params.get("cloud", default_cloud)
+        self.region = params.get("region", default_region)
+        self.api_key = params.get("api_key", default_api_key)
+
+        if self.api_key is None:
+            raise RuntimeError("Pinecone API key must be specified")
+
+        # An explicit URL selects the gRPC client bound to that host
+        if self.url:
+            self.pinecone = PineconeGRPC(
+                api_key = self.api_key,
+                host = self.url
+            )
+        else:
+            self.pinecone = Pinecone(api_key = self.api_key)
+
+        super(Processor, self).__init__(
+            **params | {
+                "input_queue": input_queue,
+                "subscriber": subscriber,
+                "input_schema": GraphEmbeddings,
+                "url": self.url,
+            }
+        )
+
+        # Last index name seen; skips the existence check on repeats
+        self.last_index_name = None
+
+    def handle(self, msg):
+        """Upserts each vector of the message under a fresh record ID."""
+
+        v = msg.value()
+
+        # Nothing to store without an entity value
+        if v.entity.value == "" or v.entity.value is None: return
+
+        for vec in v.vectors:
+
+            dim = len(vec)
+
+            index_name = (
+                "t-" + v.metadata.user + "-" + str(dim)
+            )
+
+            if index_name != self.last_index_name:
+
+                if not self.pinecone.has_index(index_name):
+                    try:
+                        self.pinecone.create_index(
+                            name = index_name,
+                            dimension = dim,
+                            metric = "cosine",
+                            spec = ServerlessSpec(
+                                cloud = self.cloud,
+                                region = self.region,
+                            )
+                        )
+
+                        # Poll once a second until the index is ready
+                        for i in range(0, 1000):
+                            if self.pinecone.describe_index(
+                                    index_name
+                            ).status["ready"]:
+                                break
+
+                            time.sleep(1)
+
+                        if not self.pinecone.describe_index(
+                                index_name
+                        ).status["ready"]:
+                            raise RuntimeError(
+                                "Gave up waiting for index creation"
+                            )
+
+                    except Exception as e:
+                        print("Pinecone index creation failed")
+                        raise
+
+                    print(f"Index {index_name} created", flush=True)
+
+                self.last_index_name = index_name
+
+            index = self.pinecone.Index(index_name)
+
+            # The ID is generated per vector: records sharing an ID in
+            # the same index and namespace overwrite each other on upsert
+            records = [
+                {
+                    "id": str(uuid.uuid4()),
+                    "values": vec,
+                    "metadata": { "entity": v.entity.value },
+                }
+            ]
+
+            index.upsert(
+                vectors = records,
+                namespace = v.metadata.collection,
+            )
+
+    @staticmethod
+    def add_args(parser):
+
+        Consumer.add_args(
+            parser, default_input_queue, default_subscriber,
+        )
+
+        parser.add_argument(
+            '-a', '--api-key',
+            default=default_api_key,
+            help='Pinecone API key. (default from PINECONE_API_KEY)'
+        )
+
+        parser.add_argument(
+            '-u', '--url',
+            help='Pinecone URL. If unspecified, serverless is used'
+        )
+
+        parser.add_argument(
+            '--cloud',
+            default=default_cloud,
+            help=f'Pinecone cloud, (default: {default_cloud})'
+        )
+
+        parser.add_argument(
+            '--region',
+            default=default_region,
+            help=f'Pinecone region, (default: {default_region})'
+        )
+
+def run():
+
+    Processor.start(module, __doc__)
+
diff --git a/trustgraph-flow/trustgraph/storage/triples/memgraph/__init__.py b/trustgraph-flow/trustgraph/storage/triples/memgraph/__init__.py
new file mode 100644
index 00000000..d891d55f
--- /dev/null
+++ b/trustgraph-flow/trustgraph/storage/triples/memgraph/__init__.py
@@ -0,0 +1,3 @@
+
+from . write import *
+
diff --git a/trustgraph-flow/trustgraph/storage/triples/memgraph/__main__.py b/trustgraph-flow/trustgraph/storage/triples/memgraph/__main__.py
new file mode 100755
index 00000000..c05d8c6d
--- /dev/null
+++ b/trustgraph-flow/trustgraph/storage/triples/memgraph/__main__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+from . 
write import run
+
+if __name__ == '__main__':
+    run()
+
diff --git a/trustgraph-flow/trustgraph/storage/triples/memgraph/write.py b/trustgraph-flow/trustgraph/storage/triples/memgraph/write.py
new file mode 100755
index 00000000..17e8c67e
--- /dev/null
+++ b/trustgraph-flow/trustgraph/storage/triples/memgraph/write.py
@@ -0,0 +1,239 @@
+
+"""
+Graph writer. Input is graph edge. Writes edges to Memgraph graph.
+"""
+
+import pulsar
+import base64
+import os
+import argparse
+import time
+
+from neo4j import GraphDatabase
+
+from .... schema import Triples
+from .... schema import triples_store_queue
+from .... log_level import LogLevel
+from .... base import Consumer
+
+module = ".".join(__name__.split(".")[1:-1])
+
+default_input_queue = triples_store_queue
+default_subscriber = module
+
+default_graph_host = 'bolt://memgraph:7687'
+default_username = 'memgraph'
+default_password = 'password'
+default_database = 'memgraph'
+
+class Processor(Consumer):
+    """
+    Consumer which writes each triple of a Triples message to Memgraph
+    as Node/Literal vertices joined by a Rel edge.
+    """
+
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+        graph_host = params.get("graph_host", default_graph_host)
+        username = params.get("username", default_username)
+        password = params.get("password", default_password)
+        database = params.get("database", default_database)
+
+        super(Processor, self).__init__(
+            **params | {
+                "input_queue": input_queue,
+                "subscriber": subscriber,
+                "input_schema": Triples,
+                "graph_host": graph_host,
+            }
+        )
+
+        self.db = database
+
+        self.io = GraphDatabase.driver(graph_host, auth=(username, password))
+
+        with self.io.session(database=self.db) as session:
+            self.create_indexes(session)
+
+    def create_indexes(self, session):
+        """
+        Runs the CREATE INDEX statements for Node and Literal.
+        Failures are printed and ignored.
+        """
+
+        print("Create indexes...", flush=True)
+
+        statements = [
+            "CREATE INDEX ON :Node",
+            "CREATE INDEX ON :Node(uri)",
+            "CREATE INDEX ON :Literal",
+            "CREATE INDEX ON :Literal(value)",
+        ]
+
+        for statement in statements:
+            try:
+                session.run(statement)
+            except Exception as e:
+                print(e, flush=True)
+                # Maybe index already exists
+                print("Index create failure ignored", flush=True)
+
+        print("Index creation done", flush=True)
+
+    def create_node(self, uri):
+        """Merges a Node vertex with the given uri."""
+
+        print("Create node", uri)
+
+        summary = self.io.execute_query(
+            "MERGE (n:Node {uri: $uri})",
+            uri=uri,
+            database_=self.db,
+        ).summary
+
+        print("Created {nodes_created} nodes in {time} ms.".format(
+            nodes_created=summary.counters.nodes_created,
+            time=summary.result_available_after
+        ))
+
+    def create_literal(self, value):
+        """Merges a Literal vertex with the given value."""
+
+        print("Create literal", value)
+
+        summary = self.io.execute_query(
+            "MERGE (n:Literal {value: $value})",
+            value=value,
+            database_=self.db,
+        ).summary
+
+        print("Created {nodes_created} nodes in {time} ms.".format(
+            nodes_created=summary.counters.nodes_created,
+            time=summary.result_available_after
+        ))
+
+    def relate_node(self, src, uri, dest):
+        """Merges a Rel edge between two existing Node vertices."""
+
+        print("Create node rel", src, uri, dest)
+
+        summary = self.io.execute_query(
+            "MATCH (src:Node {uri: $src}) "
+            "MATCH (dest:Node {uri: $dest}) "
+            "MERGE (src)-[:Rel {uri: $uri}]->(dest)",
+            src=src, dest=dest, uri=uri,
+            database_=self.db,
+        ).summary
+
+        print("Created {nodes_created} nodes in {time} ms.".format(
+            nodes_created=summary.counters.nodes_created,
+            time=summary.result_available_after
+        ))
+
+    def relate_literal(self, src, uri, dest):
+        """Merges a Rel edge from an existing Node to a Literal."""
+
+        print("Create literal rel", src, uri, dest)
+
+        summary = self.io.execute_query(
+            "MATCH (src:Node {uri: $src}) "
+            "MATCH (dest:Literal {value: $dest}) "
+            "MERGE (src)-[:Rel {uri: $uri}]->(dest)",
+            src=src, dest=dest, uri=uri,
+            database_=self.db,
+        ).summary
+
+        print("Created {nodes_created} nodes in {time} ms.".format(
+            nodes_created=summary.counters.nodes_created,
+            time=summary.result_available_after
+        ))
+
+    def create_triple(self, tx, t):
+        """
+        Transaction function: merges the subject, the object and the
+        edge between them for one triple.
+        """
+
+        # Create new s node with given uri, if not exists
+        tx.run(
+            "MERGE (n:Node {uri: $uri})",
+            uri=t.s.value
+        )
+
+        if t.o.is_uri:
+
+            # Create new o node with given uri, if not exists
+            tx.run(
+                "MERGE (n:Node {uri: $uri})",
+                uri=t.o.value
+            )
+
+            tx.run(
+                "MATCH (src:Node {uri: $src}) "
+                "MATCH (dest:Node {uri: $dest}) "
+                "MERGE (src)-[:Rel {uri: $uri}]->(dest)",
+                src=t.s.value, dest=t.o.value, uri=t.p.value,
+            )
+
+        else:
+
+            # Create new o literal with given value, if not exists
+            tx.run(
+                "MERGE (n:Literal {value: $value})",
+                value=t.o.value
+            )
+
+            tx.run(
+                "MATCH (src:Node {uri: $src}) "
+                "MATCH (dest:Literal {value: $dest}) "
+                "MERGE (src)-[:Rel {uri: $uri}]->(dest)",
+                src=t.s.value, dest=t.o.value, uri=t.p.value,
+            )
+
+    def handle(self, msg):
+        """Writes every triple of the message, one write call each."""
+
+        v = msg.value()
+
+        # One session serves the whole message rather than one per triple
+        with self.io.session(database=self.db) as session:
+            for t in v.triples:
+                session.execute_write(self.create_triple, t)
+
+    @staticmethod
+    def add_args(parser):
+
+        Consumer.add_args(
+            parser, default_input_queue, default_subscriber,
+        )
+
+        parser.add_argument(
+            '-g', '--graph_host',
+            default=default_graph_host,
+            help=f'Graph host (default: {default_graph_host})'
+        )
+
+        parser.add_argument(
+            '--username',
+            default=default_username,
+            help=f'Memgraph username (default: {default_username})'
+        )
+
+        parser.add_argument(
+            '--password',
+            default=default_password,
+            help=f'Memgraph password (default: {default_password})'
+        )
+
+        parser.add_argument(
+            '--database',
+            default=default_database,
+            help=f'Memgraph database (default: {default_database})'
+        )
+
+def run():
+
+    Processor.start(module, __doc__)
+
diff --git a/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py b/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py
index 82302e96..929333e5 100755
--- a/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py
+++ b/trustgraph-flow/trustgraph/storage/triples/neo4j/write.py
@@ -24,6 +24,7 @@ default_subscriber = module
 default_graph_host = 'bolt://neo4j:7687'
 default_username = 'neo4j'
 default_password = 'password'
+default_database = 'neo4j'
 
 class Processor(Consumer):
 
@@ -33,7 +34,8 @@ class Processor(Consumer):
         subscriber = params.get("subscriber", default_subscriber)
         graph_host = params.get("graph_host", default_graph_host)
         username = params.get("username", default_username)
-        password = params.get("passowrd", default_password)
+        password = params.get("password", default_password)
+        database = params.get("database", default_database)
 
         super(Processor, self).__init__(
             **params | {
@@ -44,7 +46,7 @@ class Processor(Consumer):
             }
         )
 
-        self.db = "neo4j"
+        self.db = database
 
         self.io = GraphDatabase.driver(graph_host, auth=(username, password))
 
@@ -152,6 +154,12 @@ class Processor(Consumer):
             help=f'Neo4j password (default: {default_password})'
         )
 
+        parser.add_argument(
+            '--database',
+            default=default_database,
+            help=f'Neo4j database (default: {default_database})'
+        )
+
 def run():
 
     Processor.start(module, __doc__)
diff --git a/trustgraph-parquet/README.md b/trustgraph-parquet/README.md
deleted file mode 100644
index 7a2ce130..00000000
--- a/trustgraph-parquet/README.md
+++ /dev/null
@@ -1 +0,0 @@
-See https://trustgraph.ai/
diff --git a/trustgraph-parquet/scripts/concat-parquet b/trustgraph-parquet/scripts/concat-parquet
deleted file mode 100755
index 7943d436..00000000
--- a/trustgraph-parquet/scripts/concat-parquet
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Concatenates multiple 
parquet files into a single parquet output -""" - -import pyarrow as pa -import pyarrow.parquet as pq -import pandas as pd -import sys -import argparse - -parser = argparse.ArgumentParser( - prog="combine-parquet", - description=__doc__ -) - -parser.add_argument( - '-i', '--input', - nargs='*', - help=f'Input files' -) - -parser.add_argument( - '-o', '--output', - help=f'Output files' -) - -args = parser.parse_args() - -df = None - -for file in args.input: - - part = pq.read_table(file).to_pandas() - - if df is None: - df = part - else: - df = pd.concat([df, part], ignore_index=True) - -if df is not None: - - table = pa.Table.from_pandas(df) - pq.write_table(table, args.output) diff --git a/trustgraph-parquet/scripts/dump-parquet b/trustgraph-parquet/scripts/dump-parquet deleted file mode 100755 index 62b28998..00000000 --- a/trustgraph-parquet/scripts/dump-parquet +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -import pyarrow as pa -import pyarrow.csv as pc -import pyarrow.parquet as pq -import pandas as pd -import sys - -df = None - -for file in sys.argv[1:]: - - part = pq.read_table(file).to_pandas() - - if df is None: - df = part - else: - df = pd.concat([df, part], ignore_index=True) - -if df is not None: - - table = pa.Table.from_pandas(df) - pc.write_csv(table, sys.stdout.buffer) - diff --git a/trustgraph-parquet/scripts/ge-dump-parquet b/trustgraph-parquet/scripts/ge-dump-parquet deleted file mode 100755 index c2b29c51..00000000 --- a/trustgraph-parquet/scripts/ge-dump-parquet +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from trustgraph.dump.graph_embeddings.parquet import run - -run() - diff --git a/trustgraph-parquet/scripts/load-graph-embeddings b/trustgraph-parquet/scripts/load-graph-embeddings deleted file mode 100755 index 0e6ecf93..00000000 --- a/trustgraph-parquet/scripts/load-graph-embeddings +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -""" -Loads Graph embeddings into TrustGraph processing. 
-""" - -import pulsar -from pulsar.schema import JsonSchema -from trustgraph.schema import GraphEmbeddings, Value, Metadata -from trustgraph.schema import graph_embeddings_store_queue -import argparse -import os -import time -import pyarrow as pa -import pyarrow.parquet as pq - -from trustgraph.log_level import LogLevel - -class Loader: - - def __init__( - self, - pulsar_host, - output_queue, - log_level, - file, - user, - collection, - ): - - self.client = pulsar.Client( - pulsar_host, - logger=pulsar.ConsoleLogger(log_level.to_pulsar()) - ) - - self.producer = self.client.create_producer( - topic=output_queue, - schema=JsonSchema(GraphEmbeddings), - chunking_enabled=True, - ) - - self.file = file - self.user = user - self.collection = collection - - def run(self): - - try: - - path = self.file - - print("Reading file...") - table = pq.read_table(path) - print("Loaded.") - - names = set(table.column_names) - - if "embeddings" not in names: - print("No 'embeddings' column") - - if "entity" not in names: - print("No 'entity' column") - - embc = table.column("embeddings") - entc = table.column("entity") - - for emb, ent in zip(embc, entc): - - b = emb.as_py() - n = ent.as_py() - - r = GraphEmbeddings( - metadata=Metadata( - metadata=[], - user=self.user, - collection=self.collection, - ), - vectors=b, - entity=Value( - value=n, - is_uri=n.startswith("https:") - ), - ) - - self.producer.send(r) - - except Exception as e: - print(e, flush=True) - - def __del__(self): - self.client.close() - -def main(): - - parser = argparse.ArgumentParser( - prog='loader', - description=__doc__, - ) - - default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650') - default_output_queue = graph_embeddings_store_queue - default_user = 'trustgraph' - default_collection = 'default' - - parser.add_argument( - '-p', '--pulsar-host', - default=default_pulsar_host, - help=f'Pulsar host (default: {default_pulsar_host})', - ) - - parser.add_argument( - '-o', '--output-queue', - 
default=default_output_queue, - help=f'Output queue (default: {default_output_queue})' - ) - - parser.add_argument( - '-u', '--user', - default=default_user, - help=f'User ID (default: {default_user})' - ) - - parser.add_argument( - '-c', '--collection', - default=default_collection, - help=f'Collection ID (default: {default_collection})' - ) - - parser.add_argument( - '-l', '--log-level', - type=LogLevel, - default=LogLevel.ERROR, - choices=list(LogLevel), - help=f'Output queue (default: info)' - ) - - parser.add_argument( - '-f', '--file', - required=True, - help=f'File to load' - ) - - args = parser.parse_args() - - while True: - - try: - p = Loader( - pulsar_host=args.pulsar_host, - output_queue=args.output_queue, - log_level=args.log_level, - file=args.file, - user=args.user, - collection=args.collection, - ) - - p.run() - - print("File loaded.") - break - - except Exception as e: - - print("Exception:", e, flush=True) - print("Will retry...", flush=True) - - time.sleep(10) - -main() - diff --git a/trustgraph-parquet/scripts/load-triples b/trustgraph-parquet/scripts/load-triples deleted file mode 100755 index e6bb0ff7..00000000 --- a/trustgraph-parquet/scripts/load-triples +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 - -""" -Loads Graph embeddings into TrustGraph processing. 
-""" - -import pulsar -from pulsar.schema import JsonSchema -from trustgraph.schema import Triples, Triple, Value, Metadata -from trustgraph.schema import triples_store_queue -import argparse -import os -import time -import pyarrow as pa -import pyarrow.parquet as pq - -from trustgraph.log_level import LogLevel - -class Loader: - - def __init__( - self, - pulsar_host, - output_queue, - log_level, - file, - user, - collection, - ): - - self.client = pulsar.Client( - pulsar_host, - logger=pulsar.ConsoleLogger(log_level.to_pulsar()) - ) - - self.producer = self.client.create_producer( - topic=output_queue, - schema=JsonSchema(Triples), - chunking_enabled=True, - ) - - self.file = file - self.user = user - self.collection = collection - - def run(self): - - try: - - path = self.file - - print("Reading file...") - table = pq.read_table(path) - print("Loaded.") - - names = set(table.column_names) - - if "s" not in names: - print("No 's' column") - - if "p" not in names: - print("No 'p' column") - - if "o" not in names: - print("No 'o' column") - - sc = table.column("s") - pc = table.column("p") - oc = table.column("o") - - for s, p, o in zip(sc, pc, oc): - - r = Triples( - metadata=Metadata( - metadata=[], - user=self.user, - collection=self.collection, - ), - triples=[ - Triple( - s=Value( - value=s.as_py(), is_uri=True - ), - p=Value( - value=p.as_py(), is_uri=True - ), - o=Value( - value=o.as_py(), - is_uri=o.as_py().startswith("https:") - ) - ) - ] - ) - - self.producer.send(r) - - except Exception as e: - print(e, flush=True) - - def __del__(self): - self.client.close() - -def main(): - - parser = argparse.ArgumentParser( - prog='loader', - description=__doc__, - ) - - default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650') - default_output_queue = triples_store_queue - default_user = 'trustgraph' - default_collection = 'default' - - parser.add_argument( - '-p', '--pulsar-host', - default=default_pulsar_host, - help=f'Pulsar host (default: 
{default_pulsar_host})', - ) - - parser.add_argument( - '-o', '--output-queue', - default=default_output_queue, - help=f'Output queue (default: {default_output_queue})' - ) - - parser.add_argument( - '-u', '--user', - default=default_user, - help=f'User ID (default: {default_user})' - ) - - parser.add_argument( - '-c', '--collection', - default=default_collection, - help=f'Collection ID (default: {default_collection})' - ) - - parser.add_argument( - '-l', '--log-level', - type=LogLevel, - default=LogLevel.ERROR, - choices=list(LogLevel), - help=f'Output queue (default: info)' - ) - - parser.add_argument( - '-f', '--file', - required=True, - help=f'File to load' - ) - - args = parser.parse_args() - - while True: - - try: - p = Loader( - pulsar_host=args.pulsar_host, - output_queue=args.output_queue, - log_level=args.log_level, - file=args.file, - user=args.user, - collection=args.collection, - ) - - p.run() - - print("File loaded.") - break - - except Exception as e: - - print("Exception:", e, flush=True) - print("Will retry...", flush=True) - - time.sleep(10) - -main() - diff --git a/trustgraph-parquet/scripts/triples-dump-parquet b/trustgraph-parquet/scripts/triples-dump-parquet deleted file mode 100755 index 78d79196..00000000 --- a/trustgraph-parquet/scripts/triples-dump-parquet +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from trustgraph.dump.triples.parquet import run - -run() - diff --git a/trustgraph-parquet/setup.py b/trustgraph-parquet/setup.py deleted file mode 100644 index 668cde1c..00000000 --- a/trustgraph-parquet/setup.py +++ /dev/null @@ -1,51 +0,0 @@ -import setuptools -import os -import importlib - -with open("README.md", "r") as fh: - long_description = fh.read() - -# Load a version number module -spec = importlib.util.spec_from_file_location( - 'version', 'trustgraph/parquet_version.py' -) -version_module = importlib.util.module_from_spec(spec) -spec.loader.exec_module(version_module) - -version = version_module.__version__ - 
-setuptools.setup( - name="trustgraph-parquet", - version=version, - author="trustgraph.ai", - author_email="security@trustgraph.ai", - description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/trustgraph-ai/trustgraph", - packages=setuptools.find_namespace_packages( - where='./', - ), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Operating System :: OS Independent", - ], - python_requires='>=3.8', - download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", - install_requires=[ - "trustgraph-base>=0.15,<0.16", - "pulsar-client", - "prometheus-client", - "pyarrow", - "pandas", - ], - scripts=[ - "scripts/concat-parquet", - "scripts/dump-parquet", - "scripts/ge-dump-parquet", - "scripts/triples-dump-parquet", - "scripts/load-graph-embeddings", - "scripts/load-triples", - ] -) diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/__init__.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__init__.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__init__.py deleted file mode 100644 index 9d16af90..00000000 --- a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ - -from . 
processor import * - diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/processor.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/processor.py deleted file mode 100755 index 795f3351..00000000 --- a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/processor.py +++ /dev/null @@ -1,85 +0,0 @@ - -""" -Write graph embeddings to parquet files in a directory. -""" - -import pulsar -import base64 -import os -import argparse -import time - -from .... schema import GraphEmbeddings -from .... schema import graph_embeddings_store_queue -from .... base import Consumer - -from . writer import ParquetWriter - -module = ".".join(__name__.split(".")[1:-1]) - -default_input_queue = graph_embeddings_store_queue -default_subscriber = module -default_graph_host='localhost' -default_directory = "." -default_file_template = "graph-embeds-{id}.parquet" -default_rotation_time = 60 - -class Processor(Consumer): - - def __init__(self, **params): - - input_queue = params.get("input_queue", default_input_queue) - subscriber = params.get("subscriber", default_subscriber) - directory = params.get("directory", default_directory) - file_template = params.get("file_template", default_file_template) - rotation_time = params.get("rotation_time", default_rotation_time) - - super(Processor, self).__init__( - **params | { - "input_queue": input_queue, - "subscriber": subscriber, - "input_schema": GraphEmbeddings, - } - ) - - self.writer = ParquetWriter(directory, file_template, rotation_time) - - def __del__(self): - if hasattr(self, "writer"): - del self.writer - - def handle(self, msg): - - v = msg.value() - self.writer.write(v.vectors, v.entity.value) - - @staticmethod - def add_args(parser): - - Consumer.add_args( - parser, default_input_queue, default_subscriber, - ) - - parser.add_argument( - '-d', '--directory', - default=default_directory, - help=f'Directory to write to (default: {default_directory})' - ) - - parser.add_argument( - '-f', 
'--file-template', - default=default_file_template, - help=f'Directory to write to (default: {default_file_template})' - ) - - parser.add_argument( - '-t', '--rotation-time', - type=int, - default=default_rotation_time, - help=f'Rotation time / seconds (default: {default_rotation_time})' - ) - -def run(): - - Processor.start(module, __doc__) - diff --git a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/writer.py b/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/writer.py deleted file mode 100644 index 1844cdd1..00000000 --- a/trustgraph-parquet/trustgraph/dump/graph_embeddings/parquet/writer.py +++ /dev/null @@ -1,94 +0,0 @@ - -import threading -import queue -import time -import uuid -import pyarrow as pa -import pyarrow.parquet as pq - -class ParquetWriter: - - def __init__(self, directory, file_template, rotation_time): - self.directory = directory - self.file_template = file_template - self.rotation_time = rotation_time - - self.q = queue.Queue() - - self.running = True - - self.thread = threading.Thread(target=(self.writer_thread)) - self.thread.start() - - def writer_thread(self): - - items = [] - - timeout = None - - while self.running: - - try: - - item = self.q.get(timeout=1) - - if timeout == None: - timeout = time.time() + self.rotation_time - - items.append(item) - - except queue.Empty: - pass - - if timeout: - if time.time() > timeout: - - self.write_file(items) - timeout = None - items = [] - - def write_file(self, items): - - try: - - schema = pa.schema([ - pa.field('embeddings', pa.list_(pa.list_(pa.float64()))), - pa.field('entity', pa.string()), - ]) - - fname = self.file_template.format(id=str(uuid.uuid4())) - path = f"{self.directory}/{fname}" - - writer = pq.ParquetWriter(path, schema) - - batch = pa.record_batch( - [ - [i[0] for i in items], - [i[1] for i in items], - ], - names=['embeddings', 'entity'] - ) - - writer.write_batch(batch) - - writer.close() - - print(f"Wrote {path}.") - - except Exception as e: - - 
print("Parquet write:", e) - - def write(self, embeds, ent): - self.q.put((embeds, ent)) - - def __del__(self): - - self.running = False - - if hasattr(self, "q"): - self.thread.join() - - - - diff --git a/trustgraph-parquet/trustgraph/dump/triples/__init__.py b/trustgraph-parquet/trustgraph/dump/triples/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/trustgraph-parquet/trustgraph/dump/triples/parquet/__init__.py b/trustgraph-parquet/trustgraph/dump/triples/parquet/__init__.py deleted file mode 100644 index 9d16af90..00000000 --- a/trustgraph-parquet/trustgraph/dump/triples/parquet/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ - -from . processor import * - diff --git a/trustgraph-parquet/trustgraph/dump/triples/parquet/processor.py b/trustgraph-parquet/trustgraph/dump/triples/parquet/processor.py deleted file mode 100755 index dc15d8a9..00000000 --- a/trustgraph-parquet/trustgraph/dump/triples/parquet/processor.py +++ /dev/null @@ -1,87 +0,0 @@ - -""" -Write graphs triples to parquet files in a directory. -""" - -import pulsar -import base64 -import os -import argparse -import time - -from .... schema import Triples -from .... schema import triples_store_queue -from .... base import Consumer - -from . writer import ParquetWriter - -module = ".".join(__name__.split(".")[1:-1]) - -default_input_queue = triples_store_queue -default_subscriber = module -default_graph_host='localhost' -default_directory = "." 
-default_file_template = "triples-{id}.parquet" -default_rotation_time = 60 - -class Processor(Consumer): - - def __init__(self, **params): - - input_queue = params.get("input_queue", default_input_queue) - subscriber = params.get("subscriber", default_subscriber) - directory = params.get("directory", default_directory) - file_template = params.get("file_template", default_file_template) - rotation_time = params.get("rotation_time", default_rotation_time) - - super(Processor, self).__init__( - **params | { - "input_queue": input_queue, - "subscriber": subscriber, - "input_schema": Triples, - } - ) - - self.writer = ParquetWriter(directory, file_template, rotation_time) - - def __del__(self): - if hasattr(self, "writer"): - del self.writer - - def handle(self, msg): - - v = msg.value() - - for t in v.triples: - self.writer.write(t.s.value, t.p.value, t.o.value) - - @staticmethod - def add_args(parser): - - Consumer.add_args( - parser, default_input_queue, default_subscriber, - ) - - parser.add_argument( - '-d', '--directory', - default=default_directory, - help=f'Directory to write to (default: {default_directory})' - ) - - parser.add_argument( - '-f', '--file-template', - default=default_file_template, - help=f'Directory to write to (default: {default_file_template})' - ) - - parser.add_argument( - '-t', '--rotation-time', - type=int, - default=default_rotation_time, - help=f'Rotation time / seconds (default: {default_rotation_time})' - ) - -def run(): - - Processor.start(module, __doc__) - diff --git a/trustgraph-parquet/trustgraph/dump/triples/parquet/writer.py b/trustgraph-parquet/trustgraph/dump/triples/parquet/writer.py deleted file mode 100644 index e68bf342..00000000 --- a/trustgraph-parquet/trustgraph/dump/triples/parquet/writer.py +++ /dev/null @@ -1,96 +0,0 @@ - -import threading -import queue -import time -import uuid -import pyarrow as pa -import pyarrow.parquet as pq - -class ParquetWriter: - - def __init__(self, directory, file_template, 
rotation_time): - self.directory = directory - self.file_template = file_template - self.rotation_time = rotation_time - - self.q = queue.Queue() - - self.running = True - - self.thread = threading.Thread(target=(self.writer_thread)) - self.thread.start() - - def writer_thread(self): - - triples = [] - - timeout = None - - while self.running: - - try: - - item = self.q.get(timeout=1) - - if timeout == None: - timeout = time.time() + self.rotation_time - - triples.append(item) - - except queue.Empty: - pass - - if timeout: - if time.time() > timeout: - - self.write_file(triples) - timeout = None - triples = [] - - def write_file(self, triples): - - try: - - schema = pa.schema([ - pa.field('s', pa.string()), - pa.field('p', pa.string()), - pa.field('o', pa.string()), - ]) - - fname = self.file_template.format(id=str(uuid.uuid4())) - path = f"{self.directory}/{fname}" - - writer = pq.ParquetWriter(path, schema) - - batch = pa.record_batch( - [ - [tpl[0] for tpl in triples], - [tpl[1] for tpl in triples], - [tpl[2] for tpl in triples], - ], - names=['s', 'p', 'o'] - ) - - writer.write_batch(batch) - - writer.close() - - print(f"Wrote {path}.") - - except Exception as e: - - print("Parquet write:", e) - - def write(self, s, p, o): - self.q.put((s, p, o)) - - def __del__(self): - - self.running = False - - if hasattr(self, "q"): - self.thread.join() - - - - diff --git a/trustgraph-vertexai/setup.py b/trustgraph-vertexai/setup.py index 0cdc3a97..3ce10305 100644 --- a/trustgraph-vertexai/setup.py +++ b/trustgraph-vertexai/setup.py @@ -34,7 +34,7 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", + "trustgraph-base>=0.17,<0.18", "pulsar-client", "google-cloud-aiplatform", "prometheus-client", diff --git a/trustgraph/setup.py b/trustgraph/setup.py index 8e50aed5..5f9f1f2c 100644 --- a/trustgraph/setup.py +++ 
b/trustgraph/setup.py @@ -34,13 +34,12 @@ setuptools.setup( python_requires='>=3.8', download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz", install_requires=[ - "trustgraph-base>=0.15,<0.16", - "trustgraph-bedrock>=0.15,<0.16", - "trustgraph-cli>=0.15,<0.16", - "trustgraph-embeddings-hf>=0.15,<0.16", - "trustgraph-flow>=0.15,<0.16", - "trustgraph-parquet>=0.15,<0.16", - "trustgraph-vertexai>=0.15,<0.16", + "trustgraph-base>=0.17,<0.18", + "trustgraph-bedrock>=0.17,<0.18", + "trustgraph-cli>=0.17,<0.18", + "trustgraph-embeddings-hf>=0.17,<0.18", + "trustgraph-flow>=0.17,<0.18", + "trustgraph-vertexai>=0.17,<0.18", ], scripts=[ ]