mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-03 15:01:00 +02:00
git-subtree-dir: ai-context/trustgraph-templates git-subtree-split: 42a5fd1b678f32be378062e30451e2052ccb95dd
446 lines
19 KiB
Jsonnet
446 lines
19 KiB
Jsonnet
|
|
//
|
|
// WARNING: this configuration is known not to work; the notes below explain why.
|
|
//
|
|
|
|
// This configuration fails primarily because it tries to treat Ceph
|
|
// like a stateless web app. You are currently pointing mon host to a
|
|
// generic service name (ceph-mon), but you aren't telling the Monitor
|
|
// process to assume that service identity.
|
|
|
|
// To make this work with the "Service Name" approach across any
|
|
// engine, you need to fix the binding logic and the messenger protocol.
|
|
|
|
// 1. The ceph.conf Fix
|
|
|
|
// You need to enable Messenger v2 (the modern protocol) and tell the cluster to use
|
|
// the service names for its initial quorum.
|
|
|
|
// Change this section:
|
|
// Ini, TOML
|
|
|
|
// [global]
|
|
// fsid = %s
|
|
// mon initial members = mon0
|
|
// mon host = ceph-mon:6789
|
|
// ...
|
|
|
|
// To this:
|
|
// Ini, TOML
|
|
|
|
// [global]
|
|
// fsid = %s
|
|
// # Use the actual service names as members
|
|
// mon initial members = ceph-mon
|
|
// # Explicitly use the Service name (VIP)
|
|
// mon host = ceph-mon
|
|
// # Enable the modern msgr2 protocol (msgr1 is left enabled as well)
|
|
// ms_bind_msgr2 = true
|
|
// ms_bind_msgr1 = true
|
|
|
|
// 2. The MON Environment & Command Fix
|
|
|
|
// This is the most critical part. Your current config sets MON_IP:
|
|
// "0.0.0.0". This causes the MON to bind to the Pod IP, which breaks
|
|
// when the Pod restarts. You must force it to bind to the Service IP.
|
|
|
|
// Update mon_env:
|
|
// Code snippet
|
|
|
|
// local mon_env = cluster_env + {
|
|
// CEPH_DAEMON: "MON",
|
|
// MON_NAME: "mon0",
|
|
// # Remove MON_IP: "0.0.0.0"
|
|
// # Add these:
|
|
// MON_ADDR: "ceph-mon", // This says to resolve the service name
|
|
// };
|
|
|
|
// Update mon_container command: You are currently wiping the MON data
|
|
// on every start (rm -rf /var/lib/ceph/mon/*). Stop doing that. If you
|
|
// wipe the data, you lose the cluster state and the OSDs will refuse to
|
|
// talk to the "new" MON.
|
|
|
|
// Code snippet
|
|
|
|
// .with_command([
|
|
// "bash", "-c",
|
|
// # 1. Resolve the Service IP at runtime
|
|
// "export MON_IP=$(getent hosts ceph-mon | awk '{ print $1 }'); " +
|
|
// inject_mon_config +
|
|
// # 2. Start the daemon telling it its PUBLIC address is the Service VIP
|
|
// "exec /opt/ceph-container/bin/entrypoint.sh"
|
|
// ])
|
|
|
|
// 3. Why your current config "Majorly does not work"
|
|
|
|
// The "Wipe" Logic: By running rm -rf /var/lib/ceph/mon/* in the
|
|
// MON container, you are creating a "New Cluster" every time the
|
|
// container starts. Since the OSDs store the fsid and cluster
|
|
// secrets, they will reject the "new" MON.
|
|
|
|
// DNS Race Condition: Your OSD/MGR/RGW containers wait for
|
|
// ceph-mon DNS, which is good. However, if ceph-mon resolves to a
|
|
// Round Robin IP (multiple pods) rather than a stable ClusterIP, the
|
|
// connection will be flaky.
|
|
|
|
// Messenger Protocol: Without ms_bind_msgr2, Ceph defaults to the
|
|
// old v1 protocol which is much more sensitive to NAT/Container IP
|
|
// mismatches.
|
|
|
|
// SUMMARY
|
|
|
|
// Component: ceph.conf
|
|
// Change: Add ms_bind_msgr2 = true
|
|
// Why: Supports modern container networking better.
|
|
|
|
// Component: MON Start
|
|
// Change: Remove rm -rf
|
|
// Why: Ceph MONs must keep their database to maintain the cluster.
|
|
|
|
// Component: MON Address
|
|
// Change: Use getent hosts ceph-mon
|
|
// Why: Forces the MON to advertise the Service VIP instead of its own Pod IP.
|
|
|
|
// Component: MON Keyring
|
|
// Change: Ensure /etc/ceph/ceph.mon.keyring exists
|
|
// Why: MONs need their specific key to start.
|
|
|
|
local images = import "values/images.jsonnet";
|
|
|
|
{
|
|
// Builder-style override: returns a copy of this object in which the hidden
// field "ceph-<key>" is set to `value` (e.g. with("fsid", x) sets "ceph-fsid").
with(key, value)::
    local field_name = "ceph-" + key;
    self { [field_name]:: value },
|
|
|
|
// Default settings. All fields are hidden (::) and can be replaced through
// with(<suffix>, value), which writes the matching "ceph-<suffix>" field.

// Credentials of the S3 user provisioned by the init container.
'ceph-access-key':: 'object-user',
'ceph-secret-key':: 'object-password',

// Cluster identity: name passed to the daemons as CLUSTER, and the fsid
// rendered into ceph.conf and passed as FSID.
'ceph-cluster-id':: 'ceph',
'ceph-fsid':: 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa',

// Pool redundancy, rendered into ceph.conf as
// "osd pool default size" / "osd pool default min size".
//   size 2     -> two replicas of every object
//   min_size 1 -> degraded I/O is still allowed with a single replica left
// NOTE(review): this file defines a single ceph-osd container set; confirm
// that a replica count of 2 can actually be satisfied by the deployment.
'ceph-pool-size':: '2',
'ceph-pool-min-size':: '1',
|
|
|
|
ceph +: {
|
|
create:: function(engine)
|
|
// Pre-shared cephx key material (config-as-code): the same keys are rendered
// into the keyrings handed to the daemons, so they all agree on them
// without any key exchange at runtime.
//
// The literals below are only defaults. Like every other setting they can be
// overridden through the hidden "ceph-*" fields, e.g.
//   .with("admin-key", "<base64 key>").with("mon-key", "<base64 key>")
// so that a real deployment does not have to run on secrets committed to
// source control. std.objectHasAll is used because the fields are hidden.
local admin_key =
    if std.objectHasAll($, "ceph-admin-key")
    then $["ceph-admin-key"]
    else "AQBpxSBlAAAAABAAU99V6D8vS7Uu9y1S8W0iBg==";
local mon_key =
    if std.objectHasAll($, "ceph-mon-key")
    then $["ceph-mon-key"]
    else "AQBpxSBlAAAAABAAn7pL/pG9oT+X6vO7V1S6bg==";
|
|
|
|
// Contents of ceph.conf. The fsid and the default pool replication settings
// are substituted from the hidden "ceph-*" fields of the root object; every
// other line is a fixed literal.
local ceph_conf = |||
    [global]
    fsid = %(fsid)s
    mon initial members = mon0
    mon host = ceph-mon:6789
    public network = 0.0.0.0/0
    cluster network = 0.0.0.0/0
    osd pool default size = %(pool_size)s
    osd pool default min size = %(pool_min_size)s
    osd crush chooseleaf type = 0
    auth cluster required = cephx
    auth service required = cephx
    auth client required = cephx
||| % {
    fsid: $["ceph-fsid"],
    pool_size: $["ceph-pool-size"],
    pool_min_size: $["ceph-pool-min-size"],
};
|
|
|
|
// Keyring text for client.admin: the pre-shared admin key plus "allow *"
// capabilities on mds, mgr, mon and osd. It is written to
// /etc/ceph/ceph.client.admin.keyring by the config injection command.
// std.lines terminates every entry with a newline.
local admin_keyring = std.lines([
    "[client.admin]",
    "key = " + admin_key,
    'caps mds = "allow *"',
    'caps mgr = "allow *"',
    'caps mon = "allow *"',
    'caps osd = "allow *"',
]);
|
|
|
|
// Keyring text for the monitor identity ("mon."): the pre-shared monitor key
// with "allow *" on mon. It is only injected into the MON container.
// std.lines terminates every entry with a newline.
local mon_keyring = std.lines([
    "[mon.]",
    "key = " + mon_key,
    'caps mon = "allow *"',
]);
|
|
|
|
// Shell fragments that write the rendered config files into /etc/ceph before
// the image entrypoint runs. Each fragment ends in "; " so it can be
// concatenated in front of further shell commands.
//
// The file content is passed to printf as an ARGUMENT of a fixed '%s'
// format, never as the format string itself, so a '%' or backslash in a
// value cannot be interpreted by printf. Embedded single quotes are escaped
// as '\'' so the content cannot terminate the shell quoting early.
//
// NOTE(review): the file names are fixed to "ceph.*" although the cluster
// name ("ceph-cluster-id") is configurable; confirm the image still finds
// these files when a non-default cluster name is used.

// Wraps `text` in single quotes for safe use as one shell word.
local sh_single_quote(text) = "'" + std.strReplace(text, "'", "'\\''") + "'";

// Shell command that writes `content` verbatim to `path`.
local write_file_cmd(path, content) =
    "printf '%%s' %s > %s; " % [sh_single_quote(content), path];

// ceph.conf + admin keyring: needed by every container.
local inject_config =
    write_file_cmd("/etc/ceph/ceph.conf", ceph_conf) +
    write_file_cmd("/etc/ceph/ceph.client.admin.keyring", admin_keyring);

// Same as inject_config, plus the monitor keyring (MON container only).
local inject_mon_config =
    inject_config +
    write_file_cmd("/etc/ceph/ceph.mon.keyring", mon_keyring);
|
|
|
|
// Data volumes, one per daemon; each is mounted at that daemon's
// /var/lib/ceph/<daemon> directory by the container definitions.
local data_volume(name, size) = engine.volume(name).with_size(size);

local vol_mon = data_volume("ceph-mon", "20G");
local vol_mgr = data_volume("ceph-mgr", "20G");
local vol_osd = data_volume("ceph-osd", "100G");
local vol_rgw = data_volume("ceph-rgw", "20G");
|
|
|
|
// Per-daemon config volumes mounted at /etc/ceph. Every container gets its
// own 500M volume named "ceph-<daemon>-config"; nothing is shared, so the
// daemons do not need a common volume and can be scheduled independently.
local config_volume(daemon) =
    engine.volume("ceph-" + daemon + "-config").with_size("500M");

local vol_mon_config = config_volume("mon");
local vol_mgr_config = config_volume("mgr");
local vol_osd_config = config_volume("osd");
local vol_rgw_config = config_volume("rgw");
local vol_init_config = config_volume("init");
|
|
|
|
// Environment variables common to all Ceph daemon containers. The config
// files themselves are injected by the container command, so only the
// cluster identity is passed here.
local cluster_env = {
    // Cluster name and fsid, taken from the overridable "ceph-*" fields.
    CLUSTER: $["ceph-cluster-id"],
    FSID: $["ceph-fsid"],
    // "none": no external coordination (key/value) backend is used.
    KV_TYPE: "none",
};
|
|
|
|
// Environment for the MON container: the shared cluster variables plus the
// monitor's name, port and network settings.
//
// Per the original author's notes (entrypoint behaviour is not visible in
// this file - confirm against the image): MON_DATA_AVAIL acts as a gate in
// the ceph/daemon entrypoint (variables_stack.sh):
//   "0" -> run mkfs and bootstrap a new cluster with the configured FSID
//   "1" -> try to join an existing cluster (observed as an endless probe loop)
// The network-related variables feed monmap generation.
local mon_env = cluster_env {
    CEPH_DAEMON: "MON",
    MON_NAME: "mon0",
    MON_PORT: "6789",
    MON_DATA_AVAIL: "0",
    MON_IP: "0.0.0.0",
    NETWORK_AUTO_DETECT: "4",
    CEPH_PUBLIC_NETWORK: "0.0.0.0/0",
};
|
|
|
|
// Simplified daemon environments - Config-as-Code model
|
|
// All daemons receive config via injection, not fetch from MON
|
|
// This eliminates "static mode" errors and networking complexity
|
|
|
|
// Environment for the MGR container: shared cluster variables plus the
// daemon type selector and the manager's name.
local mgr_env = cluster_env {
    CEPH_DAEMON: "MGR",
    MGR_NAME: "mgr0",
};
|
|
|
|
// Environment for the OSD container: shared cluster variables plus the
// daemon type selector and the OSD type ("directory").
local osd_env = cluster_env {
    CEPH_DAEMON: "OSD",
    OSD_TYPE: "directory",
};
|
|
|
|
// Environment for the RGW container: shared cluster variables plus the
// daemon type selector, the gateway's name and its frontend port (7480,
// the same port the container and service expose as "s3").
local rgw_env = cluster_env {
    CEPH_DAEMON: "RGW",
    RGW_NAME: "rgw0",
    RGW_FRONTEND_PORT: "7480",
};
|
|
|
|
// MON (monitor) container.
//
// Startup sequence, run through `bash -c`:
//   1. delete everything under /var/lib/ceph/mon
//   2. write ceph.conf, the admin keyring and the mon keyring to /etc/ceph
//   3. exec the image entrypoint
//
// The wipe in step 1 is intentional in this design: it forces a fresh
// bootstrap on every start so the MON always comes up with the configured
// FSID instead of inheriting earlier cluster state.
// NOTE(review): /var/lib/ceph/mon is backed by the persistent vol_mon volume,
// so the wipe discards monitor state on every restart - confirm this is
// still wanted.
local mon_container = (
    local startup_script =
        "rm -rf /var/lib/ceph/mon/*; " +
        inject_mon_config +
        "exec /opt/ceph-container/bin/entrypoint.sh";

    engine.container("ceph-mon")
    .with_image(images.ceph)
    .with_environment(mon_env)
    .with_command(["bash", "-c", startup_script])
    .with_limits("1.0", "1536M")
    .with_reservations("0.5", "1024M")
    .with_port(6789, 6789, "mon")
    .with_port(3300, 3300, "mon-msgr2")
    .with_volume_mount(vol_mon, "/var/lib/ceph/mon")
    .with_volume_mount(vol_mon_config, "/etc/ceph")
);
|
|
|
|
// MGR (manager) container; exposes the "mgr", "dashboard" and "prometheus"
// ports.
//
// Startup sequence, run through `bash -c`:
//   1. loop until the name "ceph-mon" resolves (getent hosts), retrying
//      every 2 seconds
//   2. write ceph.conf and the admin keyring to /etc/ceph
//   3. exec the image entrypoint
local mgr_container = (
    local startup_script =
        "until getent hosts ceph-mon; do echo 'Waiting for MON DNS...'; sleep 2; done; " +
        inject_config +
        "exec /opt/ceph-container/bin/entrypoint.sh";

    engine.container("ceph-mgr")
    .with_image(images.ceph)
    .with_environment(mgr_env)
    .with_command(["bash", "-c", startup_script])
    .with_limits("1.0", "1536M")
    .with_reservations("0.5", "1024M")
    .with_port(7000, 7000, "mgr")
    .with_port(8443, 8443, "dashboard")
    .with_port(9283, 9283, "prometheus")
    .with_volume_mount(vol_mgr, "/var/lib/ceph/mgr")
    .with_volume_mount(vol_mgr_config, "/etc/ceph")
);
|
|
|
|
// OSD (object storage daemon) container - holds the actual object data.
// It is given larger limits/reservations than the other daemons to avoid
// running out of memory during recovery.
//
// Startup sequence, run through `bash -c`:
//   1. loop until the name "ceph-mon" resolves (getent hosts), retrying
//      every 2 seconds
//   2. write ceph.conf and the admin keyring to /etc/ceph
//   3. exec the image entrypoint
local osd_container = (
    local startup_script =
        "until getent hosts ceph-mon; do echo 'Waiting for MON DNS...'; sleep 2; done; " +
        inject_config +
        "exec /opt/ceph-container/bin/entrypoint.sh";

    engine.container("ceph-osd")
    .with_image(images.ceph)
    .with_environment(osd_env)
    .with_command(["bash", "-c", startup_script])
    .with_limits("2.0", "4096M")
    .with_reservations("1.0", "2048M")
    .with_port(6800, 6800, "osd")
    .with_volume_mount(vol_osd, "/var/lib/ceph/osd")
    .with_volume_mount(vol_osd_config, "/etc/ceph")
);
|
|
|
|
// RGW (RADOS gateway) container - serves the S3 API on port 7480 ("s3").
//
// Startup sequence, run through `bash -c`:
//   1. loop until the name "ceph-mon" resolves (getent hosts), retrying
//      every 2 seconds
//   2. write ceph.conf and the admin keyring to /etc/ceph
//   3. exec the image entrypoint
local rgw_container = (
    local startup_script =
        "until getent hosts ceph-mon; do echo 'Waiting for MON DNS...'; sleep 2; done; " +
        inject_config +
        "exec /opt/ceph-container/bin/entrypoint.sh";

    engine.container("ceph-rgw")
    .with_image(images.ceph)
    .with_environment(rgw_env)
    .with_command(["bash", "-c", startup_script])
    .with_limits("1.0", "1536M")
    .with_reservations("0.5", "1024M")
    .with_port(7480, 7480, "s3")
    .with_volume_mount(vol_rgw, "/var/lib/ceph/radosgw")
    .with_volume_mount(vol_rgw_config, "/etc/ceph")
);
|
|
|
|
// Init container - one-shot provisioning of the S3 user.
//
// IMPORTANT: the script exits with status 0 when it is done, so the
// orchestrator must NOT restart this container (run it as a K8s Job, or
// with restart: "no" under Compose).
//
// The command first writes ceph.conf and the admin keyring (inject_config),
// then runs the provisioning script below:
//   1. poll `ceph health` until it reports HEALTH_OK or HEALTH_WARN
//   2. poll http://ceph-rgw:7480 until the gateway answers
//   3. create the S3 user unless `radosgw-admin user info` already finds it
// Both polls give up with exit status 1 after 60 attempts, 5 seconds apart.
local init_container = (
    // Shared cluster variables plus the S3 credentials consumed by the script.
    local init_env = cluster_env {
        RGW_ACCESS_KEY: $["ceph-access-key"],
        RGW_SECRET_KEY: $["ceph-secret-key"],
    };

    local provisioning_script = |||
        set -e

        # Wait for cluster health
        echo "Waiting for Ceph cluster to be healthy..."
        MAX_ATTEMPTS=60
        ATTEMPT=0
        until ceph --cluster ${CLUSTER} health 2>/dev/null | grep -q "HEALTH_OK\|HEALTH_WARN"; do
          ATTEMPT=$((ATTEMPT+1))
          if [ $ATTEMPT -ge $MAX_ATTEMPTS ]; then
            echo "ERROR: Cluster failed to become healthy after ${MAX_ATTEMPTS} attempts"
            exit 1
          fi
          echo "Attempt ${ATTEMPT}/${MAX_ATTEMPTS}: Cluster not ready, retrying in 5s..."
          sleep 5
        done
        echo "Cluster is healthy."

        # Wait for RGW availability
        echo "Waiting for RGW to be ready..."
        ATTEMPT=0
        until curl -sf http://ceph-rgw:7480 >/dev/null 2>&1; do
          ATTEMPT=$((ATTEMPT+1))
          if [ $ATTEMPT -ge $MAX_ATTEMPTS ]; then
            echo "ERROR: RGW failed to become ready after ${MAX_ATTEMPTS} attempts"
            exit 1
          fi
          echo "Attempt ${ATTEMPT}/${MAX_ATTEMPTS}: RGW not ready, retrying in 5s..."
          sleep 5
        done
        echo "RGW is ready."

        # Idempotent S3 user creation
        echo "Provisioning S3 user: ${RGW_ACCESS_KEY}"
        if radosgw-admin --cluster ${CLUSTER} user info --uid="${RGW_ACCESS_KEY}" >/dev/null 2>&1; then
          echo "User ${RGW_ACCESS_KEY} already exists, skipping creation."
        else
          echo "Creating new S3 user: ${RGW_ACCESS_KEY}"
          radosgw-admin --cluster ${CLUSTER} user create \
            --uid="${RGW_ACCESS_KEY}" \
            --display-name="Object Storage User" \
            --access-key="${RGW_ACCESS_KEY}" \
            --secret-key="${RGW_SECRET_KEY}"
          echo "S3 user created successfully."
        fi

        echo "Initialization complete. Exiting."
        exit 0
    |||;

    engine.container("ceph-init")
    .with_image(images.ceph)
    .with_environment(init_env)
    .with_limits("0.5", "512M")
    .with_reservations("0.25", "256M")
    .with_volume_mount(vol_init_config, "/etc/ceph")
    .with_command(["bash", "-c", inject_config + provisioning_script])
);
|
|
|
|
// One container set per daemon, each holding exactly one container, so the
// orchestrator can place every daemon independently (e.g. on different
// K8s nodes).
local single_container_set(name, container) = engine.containers(name, [container]);

local mon_containerSet = single_container_set("ceph-mon", mon_container);
local mgr_containerSet = single_container_set("ceph-mgr", mgr_container);
local osd_containerSet = single_container_set("ceph-osd", osd_container);
local rgw_containerSet = single_container_set("ceph-rgw", rgw_container);
local init_containerSet = single_container_set("ceph-init", init_container);
|
|
|
|
// Services exposing the daemon ports to the other daemons. Each service
// publishes the same ports, under the same names, as its container.
// The init container set gets no service.

// Monitor: port 6789 ("mon") and port 3300 ("mon-msgr2").
local mon_service = (
    engine.service(mon_containerSet)
    .with_port(6789, 6789, "mon")
    .with_port(3300, 3300, "mon-msgr2")
);

// Manager: "mgr", "dashboard" and "prometheus" ports.
local mgr_service = (
    engine.service(mgr_containerSet)
    .with_port(7000, 7000, "mgr")
    .with_port(8443, 8443, "dashboard")
    .with_port(9283, 9283, "prometheus")
);

// Object storage daemon.
local osd_service = (
    engine.service(osd_containerSet)
    .with_port(6800, 6800, "osd")
);

// RADOS gateway: S3 endpoint.
local rgw_service = (
    engine.service(rgw_containerSet)
    .with_port(7480, 7480, "s3")
);
|
|
|
|
// Result of create(): every resource of the deployment, handed to the
// engine in the order volumes -> container sets -> services.
local data_volumes = [vol_mon, vol_mgr, vol_osd, vol_rgw];

// Config volumes (isolated, no sharing)
local config_volumes = [
    vol_mon_config,
    vol_mgr_config,
    vol_osd_config,
    vol_rgw_config,
    vol_init_config,
];

local container_sets = [
    mon_containerSet,
    mgr_containerSet,
    osd_containerSet,
    rgw_containerSet,
    init_containerSet,
];

local services = [mon_service, mgr_service, osd_service, rgw_service];

engine.resources(data_volumes + config_volumes + container_sets + services)
|
|
},
|
|
}
|