Initial open-source release

This commit is contained in:
Andrey Avtomonov 2026-05-10 23:12:26 +02:00
commit 1a42152e6f
1199 changed files with 257054 additions and 0 deletions

View file

@ -0,0 +1,115 @@
# Postgres Historic SQL Example
This example is a manual smoke test for Postgres historic-SQL ingest through
`pg_stat_statements`. It starts Postgres 14 with the extension preloaded,
generates a query workload under separate users, runs `klo setup` with
`--enable-historic-sql`, and verifies three local ingest runs:
- first run creates a fresh PGSS baseline
- second run emits only positive deltas
- reset run treats `pg_stat_statements_reset()` as a fresh baseline
## Prerequisites
- Docker with Compose v2
- Node and pnpm matching the KLO workspace
- `python-service/.venv` already created, or `KLO_SQL_ANALYSIS_URL` pointing at
a running service that exposes `/api/sql/analyze-for-fingerprint`
## Run
From the KLO repository root:
```bash
examples/postgres-historic/scripts/smoke.sh
```
The smoke creates a temporary KLO project, starts Postgres on
`127.0.0.1:55432`, and uses this connection URL:
```bash
postgresql://klo_reader:klo_reader@127.0.0.1:55432/analytics # pragma: allowlist secret
```
Set `KLO_POSTGRES_HISTORIC_KEEP_DOCKER=1` to leave the container running after
the script exits.
The smoke validates the historic-SQL raw snapshot path without requiring LLM
credentials. It uses KLO's local stage-only ingest API after `klo setup` so the
PGSS baseline and delta behavior can be checked independently from curation.
## Manual Commands
Start Postgres and generate the base workload:
```bash
docker compose -f examples/postgres-historic/docker-compose.yml up -d --wait
examples/postgres-historic/scripts/generate-workload.sh base
```
Create a project and enable historic SQL:
```bash
export WAREHOUSE_DATABASE_URL=postgresql://klo_reader:klo_reader@127.0.0.1:55432/analytics # pragma: allowlist secret
pnpm --filter @klo/cli run build
node packages/cli/dist/bin.js --project-dir /tmp/klo-postgres-historic setup \
--new \
--skip-agents \
--skip-llm \
--skip-embeddings \
--skip-sources \
--database postgres \
--new-database-connection-id warehouse \
--database-url env:WAREHOUSE_DATABASE_URL \
--database-schema public \
--enable-historic-sql \
--historic-sql-min-calls 2 \
--yes \
--no-input
```
### Readiness check
```bash
pnpm run klo -- dev doctor --project-dir /tmp/klo-postgres-historic --no-input
```
The installed CLI form is `klo dev doctor --project-dir
/tmp/klo-postgres-historic --no-input`. Expected output includes `PASS Postgres
Historic SQL (warehouse)` when `pg_stat_statements` is installed,
`pg_read_all_stats` is granted, tracking is enabled, and
`pg_stat_statements.max` is at least 5000.
Run local historic-SQL ingest:
```bash
node packages/cli/dist/bin.js --project-dir /tmp/klo-postgres-historic dev ingest run \
--connection-id warehouse \
--adapter historic-sql \
--plain \
--no-input
```
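Unlike the smoke script's stage-only ingest, the full `dev ingest run` path
also runs curation work units, so it requires a configured LLM provider. The
`setup` command above used `--skip-llm`, so configure a provider before running
it.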
Inspect the latest manifest:
```bash
find /tmp/klo-postgres-historic/raw-sources/warehouse/historic-sql -name manifest.json | sort | tail -n 1
```
The manifest should have `dialect: "postgres"`, `degraded: true`,
`baselineFirstRun: true` on the first run, and populated `pgServerVersion` and
`statsResetAt`.
## Troubleshooting
- Missing extension: confirm `shared_preload_libraries=pg_stat_statements` and
`CREATE EXTENSION pg_stat_statements;` both happened in the `analytics`
database.
- Missing grants: confirm `GRANT pg_read_all_stats TO klo_reader;`.
- Empty templates: rerun
`examples/postgres-historic/scripts/generate-workload.sh base` from the KLO
repository root and keep `--historic-sql-min-calls 2` for the smoke.
- SQL-analysis failures: set `KLO_SQL_ANALYSIS_URL` to the running service URL
or create `python-service/.venv` before running
`examples/postgres-historic/scripts/smoke.sh`.

View file

@ -0,0 +1,24 @@
# Postgres 14 with pg_stat_statements preloaded, used by the historic-SQL
# smoke example. Scripts in ./init are mounted into the image's
# /docker-entrypoint-initdb.d directory.
services:
  postgres:
    image: postgres:14
    command:
      - postgres
      - -c
      # The extension must be preloaded at server start to collect statistics.
      - shared_preload_libraries=pg_stat_statements
      - -c
      - pg_stat_statements.track=top
      - -c
      # The README's readiness check expects pg_stat_statements.max >= 5000.
      - pg_stat_statements.max=10000
    environment:
      POSTGRES_DB: analytics
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres # pragma: allowlist secret
    ports:
      # Publish on loopback only: the example uses well-known demo credentials
      # and every documented connection URL targets 127.0.0.1:55432.
      - "127.0.0.1:55432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres -d analytics"]
      interval: 2s
      timeout: 5s
      retries: 30
    volumes:
      - ./init:/docker-entrypoint-initdb.d:ro

View file

@ -0,0 +1,51 @@
-- Bootstrap script for the historic-SQL example database.
-- Installs pg_stat_statements, creates the example roles, and seeds three
-- small tables that the workload generator queries.

CREATE EXTENSION IF NOT EXISTS pg_stat_statements;

-- app_user / etl_user issue the workload queries; klo_reader is the role used
-- in the example connection URL.
CREATE ROLE app_user LOGIN PASSWORD 'app_pass';
CREATE ROLE etl_user LOGIN PASSWORD 'etl_pass';
CREATE ROLE klo_reader LOGIN PASSWORD 'klo_reader';

-- Lets klo_reader read statistics for statements run by the other roles.
GRANT pg_read_all_stats TO klo_reader;

-- Schema ---------------------------------------------------------------------

CREATE TABLE customers (
    id     integer PRIMARY KEY,
    region text    NOT NULL,
    plan   text    NOT NULL
);

CREATE TABLE orders (
    id          integer        PRIMARY KEY,
    customer_id integer        NOT NULL REFERENCES customers(id),
    status      text           NOT NULL,
    total       numeric(12, 2) NOT NULL,
    created_at  timestamptz    NOT NULL
);

CREATE TABLE events (
    id          integer     PRIMARY KEY,
    customer_id integer     NOT NULL REFERENCES customers(id),
    event_name  text        NOT NULL,
    occurred_at timestamptz NOT NULL
);

-- Seed data ------------------------------------------------------------------
-- Timestamps are relative to now() so the rows always fall in recent windows.

INSERT INTO customers (id, region, plan) VALUES
    (1, 'na', 'enterprise'),
    (2, 'na', 'team'),
    (3, 'eu', 'enterprise'),
    (4, 'apac', 'team');

INSERT INTO orders (id, customer_id, status, total, created_at) VALUES
    (1, 1, 'paid', 125.50, now() - interval '9 days'),
    (2, 1, 'paid', 89.00, now() - interval '4 days'),
    (3, 2, 'pending', 42.00, now() - interval '2 days'),
    (4, 3, 'paid', 301.25, now() - interval '1 day'),
    (5, 4, 'refunded', 77.70, now() - interval '3 hours');

INSERT INTO events (id, customer_id, event_name, occurred_at) VALUES
    (1, 1, 'dashboard_viewed', now() - interval '1 day'),
    (2, 1, 'export_started', now() - interval '8 hours'),
    (3, 2, 'dashboard_viewed', now() - interval '7 hours'),
    (4, 3, 'sync_completed', now() - interval '6 hours'),
    (5, 4, 'dashboard_viewed', now() - interval '5 hours');

-- Read-only access for all three roles ---------------------------------------

GRANT USAGE ON SCHEMA public TO app_user, etl_user, klo_reader;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO app_user, etl_user, klo_reader;

View file

@ -0,0 +1,33 @@
#!/usr/bin/env bash
#
# Runs a fixed set of analytic queries against the example Postgres container
# under different database users.
#
# Usage: generate-workload.sh [base|extra]   (defaults to "base")
set -o errexit -o nounset -o pipefail

# Absolute directory of this script, and its parent (the example root).
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXAMPLE_DIR=$(cd "${SCRIPT_DIR}/.." && pwd)
COMPOSE_FILE="${EXAMPLE_DIR}/docker-compose.yml"

# Workload selector taken from the first positional argument.
MODE=${1:-base}
#######################################
# Runs one SQL statement with psql inside the compose "postgres" service.
# Globals:   COMPOSE_FILE (read)
# Arguments: $1 - database user
#            $2 - password, passed to the container as PGPASSWORD
#            $3 - SQL text to execute
# Outputs:   nothing on stdout (psql output is discarded)
# Returns:   exit status of the docker compose / psql invocation
#######################################
run_sql() {
  local db_user="$1" db_password="$2" statement="$3"
  local -a psql_cmd=(
    psql -h 127.0.0.1 -U "$db_user" -d analytics
    -v ON_ERROR_STOP=1 -c "$statement"
  )

  docker compose -f "$COMPOSE_FILE" exec -T -e "PGPASSWORD=${db_password}" postgres \
    "${psql_cmd[@]}" >/dev/null
}
# Reject unknown modes up front. Without this, any argument other than "extra"
# (for example a typo) would silently run only the base workload.
case "$MODE" in
  base | extra) ;;
  *)
    printf 'Usage: %s [base|extra] (got: %s)\n' "${0##*/}" "$MODE" >&2
    exit 2
    ;;
esac

# Base workload: each statement is repeated several times so its call count in
# pg_stat_statements goes above one.

# Paid orders per region (app_user, 12 calls).
for _ in {1..12}; do
  run_sql app_user app_pass "SELECT c.region, count(*) AS order_count FROM orders o JOIN customers c ON c.id = o.customer_id WHERE o.status = 'paid' GROUP BY c.region ORDER BY c.region"
done

# Revenue per plan over the last 14 days (app_user, 7 calls).
for _ in {1..7}; do
  run_sql app_user app_pass "SELECT c.plan, sum(o.total) AS revenue FROM orders o JOIN customers c ON c.id = o.customer_id WHERE o.created_at >= now() - interval '14 days' GROUP BY c.plan ORDER BY revenue DESC"
done

# Event counts for the "na" region (etl_user, 5 calls).
for _ in {1..5}; do
  run_sql etl_user etl_pass "SELECT e.event_name, count(*) AS event_count FROM events e JOIN customers c ON c.id = e.customer_id WHERE c.region = 'na' GROUP BY e.event_name ORDER BY event_count DESC"
done

# Extra workload: one additional statement, run on top of the base workload.
if [[ "$MODE" == "extra" ]]; then
  # Average non-refunded order total per region (etl_user, 4 calls).
  for _ in {1..4}; do
    run_sql etl_user etl_pass "SELECT c.region, avg(o.total) AS avg_total FROM orders o JOIN customers c ON c.id = o.customer_id WHERE o.status <> 'refunded' GROUP BY c.region ORDER BY avg_total DESC"
  done
fi

View file

@ -0,0 +1,152 @@
#!/usr/bin/env bash
#
# Smoke script for Postgres historic-SQL ingest. Resolves the paths and
# settings shared by the rest of the script.
#
# Environment:
#   KLO_POSTGRES_HISTORIC_PROJECT_PARENT  directory that will hold the project
#                                         (default: a fresh `mktemp -d` dir)
set -o errexit -o nounset -o pipefail

# Directory layout, resolved to absolute paths from this script's location.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXAMPLE_DIR=$(cd "${SCRIPT_DIR}/.." && pwd)
KLO_ROOT=$(cd "${EXAMPLE_DIR}/../.." && pwd)
# NOTE(review): REPO_ROOT is the parent of KLO_ROOT and is where
# python-service/ is looked up; confirm this matches the released layout.
REPO_ROOT=$(cd "${KLO_ROOT}/.." && pwd)

COMPOSE_FILE="${EXAMPLE_DIR}/docker-compose.yml"
KLO_BIN="${KLO_ROOT}/packages/cli/dist/bin.js"

# Project location and the log file for the SQL-analysis service.
PROJECT_PARENT=${KLO_POSTGRES_HISTORIC_PROJECT_PARENT:-$(mktemp -d)}
PROJECT_DIR="${PROJECT_PARENT}/postgres-historic-klo"
PYTHON_SERVICE_LOG="${PROJECT_PARENT}/python-service.log"

# Set once the SQL-analysis service has been started by this script.
PYTHON_SERVICE_PID=""
cleanup() {
if [[ -n "$PYTHON_SERVICE_PID" ]]; then
kill "$PYTHON_SERVICE_PID" >/dev/null 2>&1 || true
fi
if [[ "${KLO_POSTGRES_HISTORIC_KEEP_DOCKER:-0}" != "1" ]]; then
docker compose -f "$COMPOSE_FILE" down -v >/dev/null 2>&1 || true
fi
}
trap cleanup EXIT
#######################################
# Ensures a SQL-analysis service is reachable for the smoke run.
# If KLO_SQL_ANALYSIS_URL is already set, nothing is started. Otherwise uvicorn
# is launched from python-service/.venv on 127.0.0.1:18081 and polled on
# /health for up to 60 attempts, one second apart.
# Globals:   REPO_ROOT (read), PYTHON_SERVICE_LOG (read),
#            PYTHON_SERVICE_PID (written), KLO_SQL_ANALYSIS_URL (read/exported)
# Outputs:   diagnostics on stderr when the service cannot be started
# Returns:   0 once the service is healthy; exits 1 otherwise
#######################################
start_sql_analysis_if_needed() {
  if [[ -n "${KLO_SQL_ANALYSIS_URL:-}" ]]; then
    return
  fi
  if [[ ! -d "$REPO_ROOT/python-service/.venv" ]]; then
    echo "Set KLO_SQL_ANALYSIS_URL or create python-service/.venv before running this smoke." >&2
    exit 1
  fi
  (
    cd "$REPO_ROOT/python-service"
    # shellcheck disable=SC1091 — the venv only exists at runtime
    source .venv/bin/activate
    # exec makes uvicorn replace this subshell, so $! below is uvicorn's own
    # PID. Otherwise a later kill could stop only the wrapper subshell and
    # leave the server running.
    exec uvicorn app.main:app --host 127.0.0.1 --port 18081 >"$PYTHON_SERVICE_LOG" 2>&1
  ) &
  PYTHON_SERVICE_PID="$!"
  export KLO_SQL_ANALYSIS_URL="http://127.0.0.1:18081"
  for _ in {1..60}; do
    if curl -fsS "$KLO_SQL_ANALYSIS_URL/health" >/dev/null 2>&1; then
      return
    fi
    # Fail fast if the service has already exited, rather than polling a dead
    # process for the full minute.
    if ! kill -0 "$PYTHON_SERVICE_PID" >/dev/null 2>&1; then
      echo "SQL analysis service exited before becoming healthy. Log: $PYTHON_SERVICE_LOG" >&2
      exit 1
    fi
    sleep 1
  done
  echo "SQL analysis service did not become healthy. Log: $PYTHON_SERVICE_LOG" >&2
  exit 1
}
#######################################
# Prints the path of the manifest.json under the project's historic-sql raw
# source directory that sorts last. Prints nothing when none is found.
# Globals:   PROJECT_DIR (read)
# Outputs:   one path on stdout
#######################################
latest_manifest() {
  local search_root="${PROJECT_DIR}/raw-sources/warehouse/historic-sql"

  find "$search_root" -name manifest.json \
    | sort \
    | tail -n 1
}
#######################################
# Validates a historic-SQL manifest with an inline Node script.
# Checks, in order: dialect is "postgres", degraded is true, baselineFirstRun
# matches the expectation, pgServerVersion and statsResetAt are present, and
# templates is a non-empty array. The first failed check throws.
# Arguments: $1 - path to manifest.json
#            $2 - "true" if baselineFirstRun is expected to be true;
#                 any other value means false
# Returns:   node's exit status (non-zero when a check fails)
#######################################
assert_manifest() {
  local manifest_file="$1"
  local want_first_run="$2"

  node - "$manifest_file" "$want_first_run" <<'NODE'
const { readFileSync } = require('node:fs');

const [manifestPath, firstRunFlag] = process.argv.slice(2);
const expectedFirstRun = firstRunFlag === 'true';
const manifest = JSON.parse(readFileSync(manifestPath, 'utf8'));

// Ordered [passed, message] pairs; the first failing entry aborts the run.
const checks = [
  [manifest.dialect === 'postgres', `Expected dialect postgres, got ${manifest.dialect}`],
  [manifest.degraded === true, 'Expected degraded:true for Postgres PGSS v1'],
  [
    manifest.baselineFirstRun === expectedFirstRun,
    `Expected baselineFirstRun:${expectedFirstRun}, got ${manifest.baselineFirstRun}`,
  ],
  [Boolean(manifest.pgServerVersion), 'Expected pgServerVersion'],
  [Boolean(manifest.statsResetAt), 'Expected statsResetAt'],
  [
    Array.isArray(manifest.templates) && manifest.templates.length > 0,
    'Expected at least one staged historic-SQL template',
  ],
];

for (const [passed, message] of checks) {
  if (!passed) throw new Error(message);
}
NODE
}
#######################################
# Runs one stage-only historic-SQL ingest for the "warehouse" connection using
# the built KLO packages, then calls the adapter's onPullSucceeded hook if it
# defines one.
# Globals:   KLO_ROOT (read), PROJECT_DIR (read)
# Arguments: $1 - job id; also names the staged directory under
#                 .klo/cache/local-ingest/<job id>/staged
# Outputs:   the resulting sync id on stdout
# Returns:   node's exit status
#######################################
run_historic_stage_only() {
  local job_id="$1"
  # The inline script uses top-level await, which is only valid in an ES
  # module. Script text read from stdin defaults to CommonJS, so request
  # module mode explicitly rather than relying on Node's syntax detection.
  node --input-type=module - "$KLO_ROOT" "$PROJECT_DIR" "$job_id" <<'NODE'
const { join } = await import('node:path');
const kloRoot = process.argv[2];
const projectDir = process.argv[3];
const jobId = process.argv[4];
const { loadKloProject } = await import(join(kloRoot, 'packages/context/dist/project/index.js'));
const { runLocalStageOnlyIngest } = await import(join(kloRoot, 'packages/context/dist/ingest/index.js'));
const { createKloCliLocalIngestAdapters } = await import(join(kloRoot, 'packages/cli/dist/local-adapters.js'));
const project = await loadKloProject({ projectDir });
const adapters = createKloCliLocalIngestAdapters(project, { historicSqlConnectionId: 'warehouse' });
const adapter = adapters.find((candidate) => candidate.source === 'historic-sql');
if (!adapter) throw new Error('historic-sql adapter was not registered for local run');
const record = await runLocalStageOnlyIngest({
  project,
  adapters,
  adapter: 'historic-sql',
  connectionId: 'warehouse',
  trigger: 'manual_resync',
  jobId,
});
// NOTE(review): presumably this hook is what persists the PGSS baseline for
// the next run; confirm against the adapter implementation.
await adapter.onPullSucceeded?.({
  connectionId: 'warehouse',
  sourceKey: 'historic-sql',
  syncId: record.syncId,
  trigger: 'manual_resync',
  completedAt: new Date(record.completedAt),
  stagedDir: join(project.projectDir, '.klo/cache/local-ingest', jobId, 'staged'),
});
console.log(record.syncId);
NODE
}
# --- Build and start dependencies ---------------------------------------------
cd "$KLO_ROOT"

# Both packages are loaded from their dist/ output further down.
for pkg in @klo/context @klo/cli; do
  pnpm --filter "$pkg" run build
done

start_sql_analysis_if_needed
docker compose -f "$COMPOSE_FILE" up -d --wait
"$EXAMPLE_DIR/scripts/generate-workload.sh" base

# --- Create the project with historic SQL enabled -----------------------------
export WAREHOUSE_DATABASE_URL="${WAREHOUSE_DATABASE_URL:-postgresql://klo_reader:klo_reader@127.0.0.1:55432/analytics}" # pragma: allowlist secret

setup_args=(
  --new
  --skip-agents
  --skip-llm
  --skip-embeddings
  --skip-sources
  --database postgres
  --new-database-connection-id warehouse
  --database-url env:WAREHOUSE_DATABASE_URL
  --database-schema public
  --enable-historic-sql
  --historic-sql-min-calls 2
  --yes
  --no-input
)
node "$KLO_BIN" --project-dir "$PROJECT_DIR" setup "${setup_args[@]}"

# --- Run 1: expected to be a first-run baseline ------------------------------
run_historic_stage_only "historic-first-$$"
FIRST_MANIFEST="$(latest_manifest)"
assert_manifest "$FIRST_MANIFEST" true

# --- Run 2: more workload, expected not to be a first run --------------------
"$EXAMPLE_DIR/scripts/generate-workload.sh" extra
run_historic_stage_only "historic-second-$$"
SECOND_MANIFEST="$(latest_manifest)"
assert_manifest "$SECOND_MANIFEST" false

# --- Run 3: reset statistics, expected to be a first-run baseline again ------
reset_sql="SELECT pg_stat_statements_reset();"
docker compose -f "$COMPOSE_FILE" exec -T postgres \
  psql -U postgres -d analytics -v ON_ERROR_STOP=1 -c "$reset_sql" >/dev/null
"$EXAMPLE_DIR/scripts/generate-workload.sh" extra
run_historic_stage_only "historic-reset-$$"
RESET_MANIFEST="$(latest_manifest)"
assert_manifest "$RESET_MANIFEST" true

printf '%s\n' \
  "Postgres historic SQL smoke passed" \
  "Project dir: $PROJECT_DIR"