Add a docker-compose for just the stores (#13)

* - Added docker-compose-storage.yaml, just the infrastructure bits
- Tidied storage invocation

* Util, sits on chunker output and reports histogram of chunk sizes
This commit is contained in:
cybermaggedon 2024-08-21 16:20:21 +01:00 committed by GitHub
parent b0fdb4f314
commit 0e2db095e3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 391 additions and 121 deletions

View file

@ -1,6 +1,6 @@
# VERSION=$(shell git describe | sed 's/^v//')
VERSION=0.6.6
VERSION=0.6.7
DOCKER=podman
@ -33,7 +33,9 @@ set-version:
# sed -i 's/trustgraph-flow:[0-9]*\.[0-9]*\.[0-9]*/trustgraph-flow:'${VERSION}'/' docker-compose*.yaml
echo '"${VERSION}"' > templates/components/version.jsonnet
TEMPLATES=azure bedrock claude cohere mix ollama openai vertexai openai-neo4j
TEMPLATES=azure bedrock claude cohere mix ollama openai vertexai \
openai-neo4j storage
DCS=$(foreach template,${TEMPLATES},${template:%=docker-compose-%.yaml})
update-templates: set-version ${DCS}

115
docker-compose-storage.yaml Normal file
View file

@ -0,0 +1,115 @@
"services":
"cassandra":
"image": "docker.io/cassandra:4.1.5"
"ports":
- "9042:9042"
"restart": "on-failure:100"
"volumes":
- "cassandra:/var/lib/cassandra"
"etcd":
"command":
- "etcd"
- "-advertise-client-urls=http://127.0.0.1:2379"
- "-listen-client-urls"
- "http://0.0.0.0:2379"
- "--data-dir"
- "/etcd"
"environment":
"ETCD_AUTO_COMPACTION_MODE": "revision"
"ETCD_AUTO_COMPACTION_RETENTION": "1000"
"ETCD_QUOTA_BACKEND_BYTES": "4294967296"
"ETCD_SNAPSHOT_COUNT": "50000"
"image": "quay.io/coreos/etcd:v3.5.5"
"ports":
- "2379:2379"
"restart": "on-failure:100"
"volumes":
- "etcd:/etcd"
"grafana":
"environment":
"GF_ORG_NAME": "trustgraph.ai"
"image": "docker.io/grafana/grafana:10.0.0"
"ports":
- "3000:3000"
"restart": "on-failure:100"
"volumes":
- "grafana-storage:/var/lib/grafana"
- "./grafana/dashboard.yml:/etc/grafana/provisioning/dashboards/dashboard.yml"
- "./grafana/datasource.yml:/etc/grafana/provisioning/datasources/datasource.yml"
- "./grafana/dashboard.json:/var/lib/grafana/dashboards/dashboard.json"
"init-pulsar":
"command":
- "sh"
- "-c"
- "pulsar-admin --admin-url http://pulsar:8080 tenants create tg && pulsar-admin --admin-url http://pulsar:8080 namespaces create tg/flow && pulsar-admin --admin-url http://pulsar:8080 namespaces create tg/request && pulsar-admin --admin-url http://pulsar:8080 namespaces create tg/response && pulsar-admin --admin-url http://pulsar:8080 namespaces set-retention --size -1 --time 3m tg/response"
"depends_on":
"pulsar":
"condition": "service_started"
"image": "docker.io/apachepulsar/pulsar:3.3.0"
"restart": "on-failure:100"
"milvus":
"command":
- "milvus"
- "run"
- "standalone"
"environment":
"ETCD_ENDPOINTS": "etcd:2379"
"MINIO_ADDRESS": "minio:9000"
"image": "docker.io/milvusdb/milvus:v2.4.5"
"ports":
- "9091:9091"
- "19530:19530"
"restart": "on-failure:100"
"volumes":
- "milvus:/var/lib/milvus"
"minio":
"command":
- "minio"
- "server"
- "/minio_data"
- "--console-address"
- ":9001"
"environment":
"MINIO_ROOT_PASSWORD": "minioadmin"
"MINIO_ROOT_USER": "minioadmin"
"image": "docker.io/minio/minio:RELEASE.2024-07-04T14-25-45Z"
"ports":
- "9001:9001"
"restart": "on-failure:100"
"volumes":
- "minio-data:/minio_data"
"prometheus":
"image": "docker.io/prom/prometheus:v2.53.1"
"ports":
- "9090:9090"
"restart": "on-failure:100"
"volumes":
- "./prometheus:/etc/prometheus"
- "prometheus-data:/prometheus"
"pulsar":
"command": "bin/pulsar standalone"
"image": "docker.io/apachepulsar/pulsar:3.3.0"
"ports":
- "6650:6650"
- "8080:8080"
"restart": "on-failure:100"
"volumes":
- "pulsar-conf:/pulsar/conf"
- "pulsar-data:/pulsar/data"
"pulsar-manager":
"environment":
"SPRING_CONFIGURATION_FILE": "/pulsar-manager/pulsar-manager/application.properties"
"image": "docker.io/apachepulsar/pulsar-manager:v0.3.0"
"ports":
- "9527:9527"
- "7750:7750"
"restart": "on-failure:100"
"volumes":
"cassandra": {}
"etcd": {}
"grafana-storage": {}
"milvus": {}
"minio-data": {}
"prometheus-data": {}
"pulsar-conf": {}
"pulsar-data": {}

View file

@ -4,7 +4,7 @@ import os
with open("README.md", "r") as fh:
long_description = fh.read()
version = "0.6.6"
version = "0.6.7"
setuptools.setup(
name="trustgraph",

View file

@ -2,21 +2,11 @@ local base = import "base.jsonnet";
local images = import "images.jsonnet";
local url = import "url.jsonnet";
local cassandra_hosts = "cassandra";
{
volumes +: {
cassandra: {},
},
services +: {
local cassandra = import "stores/cassandra.jsonnet";
cassandra: base + {
image: images.cassandra,
ports: [
"9042:9042"
],
volumes: [
"cassandra:/var/lib/cassandra"
],
},
cassandra + {
services +: {
"store-triples": base + {
image: images.trustgraph,

View file

@ -1,71 +1,36 @@
local base = import "base.jsonnet";
local images = import "images.jsonnet";
{
volumes +: {
etcd: {},
"minio-data": {},
milvus: {},
},
local url = import "url.jsonnet";
local milvus = import "stores/milvus.jsonnet";
milvus + {
services +: {
etcd: base + {
image: images.etcd,
"store-graph-embeddings": base + {
image: images.trustgraph,
command: [
"etcd",
"-advertise-client-urls=http://127.0.0.1:2379",
"-listen-client-urls",
"http://0.0.0.0:2379",
"--data-dir",
"/etcd",
],
environment: {
ETCD_AUTO_COMPACTION_MODE: "revision",
ETCD_AUTO_COMPACTION_RETENTION: "1000",
ETCD_QUOTA_BACKEND_BYTES: "4294967296",
ETCD_SNAPSHOT_COUNT: "50000"
},
ports: [
"2379:2379",
],
volumes: [
"etcd:/etcd"
"ge-write-milvus",
"-p",
url.pulsar,
"-t",
url.milvus,
],
},
minio: base + {
image: images.minio,
"query-graph-embeddings": base + {
image: images.trustgraph,
command: [
"minio",
"server",
"/minio_data",
"--console-address",
":9001",
],
environment: {
MINIO_ROOT_USER: "minioadmin",
MINIO_ROOT_PASSWORD: "minioadmin",
},
ports: [
"9001:9001",
],
volumes: [
"minio-data:/minio_data",
],
},
milvus: base + {
image: images.milvus,
command: [
"milvus", "run", "standalone"
],
environment: {
ETCD_ENDPOINTS: "etcd:2379",
MINIO_ADDRESS: "minio:9000",
},
ports: [
"9091:9091",
"19530:19530",
],
volumes: [
"milvus:/var/lib/milvus"
"ge-query-milvus",
"-p",
url.pulsar,
"-t",
url.milvus,
],
},
},
}
}

View file

@ -1,31 +1,12 @@
local base = import "base.jsonnet";
local images = import "images.jsonnet";
local url = import "url.jsonnet";
{
local neo4j = import "stores/neo4j.jsonnet";
volumes +: {
neo4j: {},
},
neo4j + {
services +: {
neo4j: base + {
image: images.neo4j,
ports: [
"7474:7474",
"7687:7687",
],
environment: {
NEO4J_AUTH: "neo4j/password",
// NEO4J_server_bolt_listen__address: "0.0.0.0:7687",
// NEO4J_server_default__listen__address: "0.0.0.0",
// NEO4J_server_http_listen__address: "0.0.0.0:7474",
},
volumes: [
"neo4j:/data"
],
},
"query-triples": base + {
image: images.trustgraph,
command: [

View file

@ -0,0 +1,20 @@
// Cassandra store component: adds the `cassandra` service and the named
// volume that holds its data directory.
local images = import "../images.jsonnet";
local base = import "../base.jsonnet";

// Service definition, layered on top of `base`.
local service = base + {
    image: images.cassandra,
    ports: ["9042:9042"],
    volumes: ["cassandra:/var/lib/cassandra"],
};

{
    volumes +: { cassandra: {} },
    services +: { cassandra: service },
}

View file

@ -0,0 +1,79 @@
// Milvus store component: the `milvus` service plus the two services its
// environment points at, etcd (ETCD_ENDPOINTS) and MinIO (MINIO_ADDRESS),
// each with its own named volume.
local images = import "../images.jsonnet";
local base = import "../base.jsonnet";

local etcd = base + {
    image: images.etcd,
    command: [
        "etcd",
        "-advertise-client-urls=http://127.0.0.1:2379",
        "-listen-client-urls", "http://0.0.0.0:2379",
        "--data-dir", "/etcd",
    ],
    environment: {
        ETCD_AUTO_COMPACTION_MODE: "revision",
        ETCD_AUTO_COMPACTION_RETENTION: "1000",
        ETCD_QUOTA_BACKEND_BYTES: "4294967296",
        ETCD_SNAPSHOT_COUNT: "50000",
    },
    ports: ["2379:2379"],
    volumes: ["etcd:/etcd"],
};

local minio = base + {
    image: images.minio,
    command: ["minio", "server", "/minio_data", "--console-address", ":9001"],
    environment: {
        MINIO_ROOT_USER: "minioadmin",
        MINIO_ROOT_PASSWORD: "minioadmin",
    },
    // Only the console port (see --console-address above) is published.
    ports: ["9001:9001"],
    volumes: ["minio-data:/minio_data"],
};

local milvus = base + {
    image: images.milvus,
    command: ["milvus", "run", "standalone"],
    environment: {
        ETCD_ENDPOINTS: "etcd:2379",
        MINIO_ADDRESS: "minio:9000",
    },
    ports: ["9091:9091", "19530:19530"],
    volumes: ["milvus:/var/lib/milvus"],
};

{
    volumes +: {
        etcd: {},
        "minio-data": {},
        milvus: {},
    },
    services +: {
        etcd: etcd,
        minio: minio,
        milvus: milvus,
    },
}

View file

@ -0,0 +1,30 @@
// Neo4j store component: adds the `neo4j` service and the named volume
// mounted at /data.
local images = import "../images.jsonnet";
local base = import "../base.jsonnet";

local service = base + {
    image: images.neo4j,
    ports: ["7474:7474", "7687:7687"],
    environment: {
        NEO4J_AUTH: "neo4j/password",
        // Listen-address overrides, currently disabled:
        // NEO4J_server_bolt_listen__address: "0.0.0.0:7687",
        // NEO4J_server_default__listen__address: "0.0.0.0",
        // NEO4J_server_http_listen__address: "0.0.0.0:7474",
    },
    volumes: ["neo4j:/data"],
};

{
    volumes +: { neo4j: {} },
    services +: { neo4j: service },
}

View file

@ -64,28 +64,6 @@ local url = import "url.jsonnet";
],
},
"store-graph-embeddings": base + {
image: images.trustgraph,
command: [
"ge-write-milvus",
"-p",
url.pulsar,
"-t",
url.milvus,
],
},
"query-graph-embeddings": base + {
image: images.trustgraph,
command: [
"ge-query-milvus",
"-p",
url.pulsar,
"-t",
url.milvus,
],
},
"graph-rag": base + {
image: images.trustgraph,
command: [

View file

@ -0,0 +1,10 @@
// Storage-only deployment: composes the Cassandra, Pulsar, Milvus and
// Grafana components and renders the result as a YAML document.
local cassandra = import "components/stores/cassandra.jsonnet";
local pulsar = import "components/pulsar.jsonnet";
local milvus = import "components/stores/milvus.jsonnet";
local grafana = import "components/grafana.jsonnet";

// Operands are merged left to right, so their order is kept as-is.
std.manifestYamlDoc(cassandra + pulsar + milvus + grafana)

100
tests/report-chunk-sizes Executable file
View file

@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
Consumes chunks from a queue (by default the chunk ingest queue) and
prints a histogram of chunk sizes, in characters, once a second.
"""
from trustgraph.schema import Chunk
from trustgraph.schema import chunk_ingest_queue
from trustgraph.log_level import LogLevel
from trustgraph.base import Consumer
from threading import Thread, Lock
import time
# Name passed to Processor.start(); also reused as the default subscriber.
module = "test-chunk-size"
# Defaults applied when no input queue / subscriber is supplied.
default_input_queue = chunk_ingest_queue
default_subscriber = module
# NOTE(review): not referenced anywhere else in this script — presumably a
# leftover from the code this was adapted from; confirm before removing.
default_store_uri = 'http://localhost:19530'
class Processor(Consumer):
    """
    Consumer that tallies the size of every received chunk into fixed-width
    buckets and prints the resulting histogram once a second from a
    background thread.
    """

    # Default bucket width, shared by __init__ and the --width argument.
    DEFAULT_WIDTH = 200

    def __init__(self, **params):
        """
        Set up the consumer and start the reporting thread.

        Recognised params (all optional; everything is also passed on to
        Consumer):
          input_queue -- queue to read from (default: default_input_queue)
          subscriber  -- subscriber name (default: default_subscriber)
          width       -- histogram bucket width (default: DEFAULT_WIDTH)

        Raises ValueError if width is not positive.
        """

        input_queue = params.get("input_queue", default_input_queue)
        subscriber = params.get("subscriber", default_subscriber)
        width = params.get("width", self.DEFAULT_WIDTH)

        # Fail here rather than with a ZeroDivisionError on the first
        # message, or with a silently meaningless histogram.
        if width <= 0:
            raise ValueError(f"width must be positive, got {width}")

        super(Processor, self).__init__(
            **params | {
                "input_queue": input_queue,
                "subscriber": subscriber,
                "input_schema": Chunk,
            }
        )

        # Maps (low, high) bucket bounds to the number of chunks seen.
        self.sizes = {}
        self.width = width

        # Guards self.sizes, which handle() and report() touch from
        # different threads.
        self.lock = Lock()

        # Daemon thread: report() never returns, so a non-daemon thread
        # would keep the process alive after the main thread has exited.
        Thread(target=self.report, daemon=True).start()

    def report(self):
        """
        Loop forever, printing the histogram and the total chunk count
        once a second. Only buckets that have been seen are listed, in
        ascending order.
        """

        while True:

            time.sleep(1)

            # Copy under the lock and print outside it, so handle() is
            # never held up by output.
            with self.lock:
                buckets = sorted(self.sizes.items())

            print()

            total = 0

            # Walk the buckets actually recorded rather than a fixed
            # range, so arbitrarily large chunks are reported too.
            for (low, high), count in buckets:
                print(f"{low:5d} ..{high:5d}: {count}")
                total += count

            print(f"{'Total':13s}: {total}")

    def handle(self, msg):
        """
        Record one chunk: decode it as UTF-8 and count it in the bucket
        covering its length in characters.
        """

        v = msg.value()
        chunk = v.chunk.decode("utf-8")

        # Round the length down to a multiple of the bucket width.
        low = (len(chunk) // self.width) * self.width
        key = (low, low + self.width)

        with self.lock:
            self.sizes[key] = self.sizes.get(key, 0) + 1

    @staticmethod
    def add_args(parser):
        """
        Register the Consumer arguments plus --width on the given parser.
        """

        def positive_int(text):
            # argparse turns a ValueError raised here into a usage error.
            value = int(text)
            if value <= 0:
                raise ValueError("width must be positive")
            return value

        Consumer.add_args(
            parser, default_input_queue, default_subscriber,
        )

        parser.add_argument(
            '--width',
            type=positive_int,
            default=Processor.DEFAULT_WIDTH,
            help=f'Histogram width (default: {Processor.DEFAULT_WIDTH})',
        )
def run():
    """Hand the module name and module docstring to Processor.start()."""
    Processor.start(module, __doc__)


# Invoked unconditionally at load time.
run()