mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-16 08:25:14 +02:00
Polish documentation copy (#98)
This commit is contained in:
parent
ce23aca4c4
commit
372c90b533
65 changed files with 478 additions and 478 deletions
|
|
@ -36,7 +36,7 @@ describe('renderDemoBanner', () => {
|
|||
- [ ] **Step 2: Run the test to verify it fails**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run test -- --testPathPattern setup-demo-tour`
|
||||
Expected: FAIL — module not found
|
||||
Expected: FAIL - module not found
|
||||
|
||||
- [ ] **Step 3: Implement `renderDemoBanner` and `waitForDemoNavigation`**
|
||||
|
||||
|
|
@ -58,7 +58,7 @@ function dim(text: string): string {
|
|||
export function renderDemoBanner(): string {
|
||||
const lines = [
|
||||
'',
|
||||
`┌ ${cyan('Demo mode')} — data has been pre-processed and KTX context is already built.`,
|
||||
`┌ ${cyan('Demo mode')} - data has been pre-processed and KTX context is already built.`,
|
||||
`│ This walkthrough illustrates the setup steps. Selections are pre-filled and read-only.`,
|
||||
'',
|
||||
];
|
||||
|
|
@ -145,7 +145,7 @@ describe('renderDemoCardContent', () => {
|
|||
- [ ] **Step 2: Run the test to verify it fails**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run test -- --testPathPattern setup-demo-tour`
|
||||
Expected: FAIL — `renderDemoCardContent` not exported
|
||||
Expected: FAIL - `renderDemoCardContent` not exported
|
||||
|
||||
- [ ] **Step 3: Implement `renderDemoCardContent` and `renderDemoCard`**
|
||||
|
||||
|
|
@ -243,7 +243,7 @@ describe('DEMO_REPLAY_TARGETS', () => {
|
|||
- [ ] **Step 2: Run the test to verify it fails**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run test -- --testPathPattern setup-demo-tour`
|
||||
Expected: FAIL — exports not found
|
||||
Expected: FAIL - exports not found
|
||||
|
||||
- [ ] **Step 3: Implement replay timeline and target definitions**
|
||||
|
||||
|
|
@ -388,7 +388,7 @@ function renderDemoContextCompletionSummary(): string {
|
|||
'',
|
||||
`${cyan('★')} KTX finished ingesting demo data`,
|
||||
'',
|
||||
' Placeholder — final counts will come from pre-packaged demo results.',
|
||||
' Placeholder - final counts will come from pre-packaged demo results.',
|
||||
'',
|
||||
` ${dim('Press Enter to continue, Escape to go back')}`,
|
||||
'',
|
||||
|
|
@ -459,7 +459,7 @@ describe('renderDemoCompletionSummary', () => {
|
|||
- [ ] **Step 2: Run the test to verify it fails**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run test -- --testPathPattern setup-demo-tour`
|
||||
Expected: FAIL — exports not found
|
||||
Expected: FAIL - exports not found
|
||||
|
||||
- [ ] **Step 3: Implement transition and completion rendering**
|
||||
|
||||
|
|
@ -469,7 +469,7 @@ Add to `setup-demo-tour.ts`:
|
|||
export function renderDemoAgentTransition(): string {
|
||||
const lines = [
|
||||
'',
|
||||
`┌ Demo project is ready — let's connect your agent`,
|
||||
`┌ Demo project is ready - let's connect your agent`,
|
||||
'│',
|
||||
'│ Your KTX context has been built with demo data.',
|
||||
'│ Select an agent to start using it.',
|
||||
|
|
@ -583,7 +583,7 @@ describe('runDemoTour', () => {
|
|||
- [ ] **Step 2: Run the test to verify it fails**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run test -- --testPathPattern setup-demo-tour`
|
||||
Expected: FAIL — `runDemoTour` not exported or wrong signature
|
||||
Expected: FAIL - `runDemoTour` not exported or wrong signature
|
||||
|
||||
- [ ] **Step 3: Implement `runDemoTour`**
|
||||
|
||||
|
|
@ -677,7 +677,7 @@ Expected: PASS
|
|||
- [ ] **Step 5: Run type-check**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run type-check`
|
||||
Expected: PASS — all types align with existing interfaces
|
||||
Expected: PASS - all types align with existing interfaces
|
||||
|
||||
- [ ] **Step 6: Commit**
|
||||
|
||||
|
|
@ -736,7 +736,7 @@ async function runKtxSetupDemoFromEntryMenu(
|
|||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Update imports — remove unused `defaultDemoProjectDir` import if no longer needed elsewhere in setup.ts**
|
||||
- [ ] **Step 3: Update imports - remove unused `defaultDemoProjectDir` import if no longer needed elsewhere in setup.ts**
|
||||
|
||||
Check if `defaultDemoProjectDir` is used elsewhere in `setup.ts`. If it's only used
|
||||
in `runKtxSetupDemoFromEntryMenu`, remove the import. If used elsewhere, keep it.
|
||||
|
|
@ -749,7 +749,7 @@ called from the entry menu path.
|
|||
- [ ] **Step 4: Run type-check and tests**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run type-check && pnpm --filter @ktx/cli run test`
|
||||
Expected: PASS — existing tests continue to work, demo tour is now wired in
|
||||
Expected: PASS - existing tests continue to work, demo tour is now wired in
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
|
|
@ -807,7 +807,7 @@ git commit -m "fix(cli): demo tour adjustments from smoke test"
|
|||
|
||||
When the user provides the real pre-packaged demo results, update these locations:
|
||||
|
||||
1. **`renderDemoContextCompletionSummary()`** in `setup-demo-tour.ts` — replace placeholder text with actual counts (business areas, query definitions, knowledge pages) from the demo data
|
||||
2. **`buildDemoReplayTimeline()`** in `setup-demo-tour.ts` — adjust timing and progress details to match the real ingestion profile
|
||||
3. **`demo-assets.ts`** — update `REQUIRED_SEEDED_ASSET_PATHS` and `demoConfig()` if the demo dataset changes from SQLite/Orbit to Postgres/dbt/Metabase/Notion
|
||||
4. **Pre-packaged asset files** in `packages/cli/assets/demo/` — replace with the new demo dataset
|
||||
1. **`renderDemoContextCompletionSummary()`** in `setup-demo-tour.ts` - replace placeholder text with actual counts (business areas, query definitions, knowledge pages) from the demo data
|
||||
2. **`buildDemoReplayTimeline()`** in `setup-demo-tour.ts` - adjust timing and progress details to match the real ingestion profile
|
||||
3. **`demo-assets.ts`** - update `REQUIRED_SEEDED_ASSET_PATHS` and `demoConfig()` if the demo dataset changes from SQLite/Orbit to Postgres/dbt/Metabase/Notion
|
||||
4. **Pre-packaged asset files** in `packages/cli/assets/demo/` - replace with the new demo dataset
|
||||
|
|
|
|||
|
|
@ -654,11 +654,11 @@ In `docs/content/docs/cli-reference/ktx-setup.mdx`, replace the Historic SQL fla
|
|||
```markdown
|
||||
| `--enable-historic-sql` | Enable Historic SQL when the selected database supports it | `false` |
|
||||
| `--disable-historic-sql` | Disable Historic SQL for the selected database | `false` |
|
||||
| `--historic-sql-window-days <number>` | Historic SQL query-history window in days | — |
|
||||
| `--historic-sql-min-executions <number>` | Minimum executions for a Historic SQL template | — |
|
||||
| `--historic-sql-min-calls <number>` | Alias for `--historic-sql-min-executions` for one release | — |
|
||||
| `--historic-sql-service-account-pattern <pattern>` | Historic SQL service-account regex; repeatable | — |
|
||||
| `--historic-sql-redaction-pattern <pattern>` | Historic SQL SQL-literal redaction regex; repeatable | — |
|
||||
| `--historic-sql-window-days <number>` | Historic SQL query-history window in days | - |
|
||||
| `--historic-sql-min-executions <number>` | Minimum executions for a Historic SQL template | - |
|
||||
| `--historic-sql-min-calls <number>` | Alias for `--historic-sql-min-executions` for one release | - |
|
||||
| `--historic-sql-service-account-pattern <pattern>` | Historic SQL service-account regex; repeatable | - |
|
||||
| `--historic-sql-redaction-pattern <pattern>` | Historic SQL redaction regex for SQL literals; repeatable | - |
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Update primary source Historic SQL docs**
|
||||
|
|
|
|||
|
|
@ -874,7 +874,7 @@ Expected: PASS. The output includes `# fail 0`.
|
|||
|
||||
- [ ] **Step 2: Verify stale artifact strings are gone from production/docs files**
|
||||
|
||||
Run (scans only production and docs files, not test files — test files keep guard assertions that reference the removed strings):
|
||||
Run (scans only production and docs files, not test files - test files keep guard assertions that reference the removed strings):
|
||||
|
||||
```bash
|
||||
rg -n "uv', \\['build', '--package', 'ktx-sl'|uv', \\['build', '--package', 'ktx-daemon'|ktx_sl-0\\.1\\.0|ktx_daemon-0\\.1\\.0|pythonArtifactInstallArgs|pythonVerifySource|verifyPythonArtifacts|standalone Python distributions|installs the Python artifacts directly" scripts/package-artifacts.mjs scripts/release-readiness.mjs README.md examples/package-artifacts/README.md release-policy.json
|
||||
|
|
|
|||
|
|
@ -199,7 +199,7 @@ Modify the raw schema markdown in
|
|||
.slice(0, limit)
|
||||
.map(
|
||||
(hit) =>
|
||||
`- ${hit.kind}: ${hit.display} [connectionName=${hit.connectionName}] (matched on ${hit.matchedOn}) — ` +
|
||||
`- ${hit.kind}: ${hit.display} [connectionName=${hit.connectionName}] (matched on ${hit.matchedOn}) - ` +
|
||||
`follow up with \`entity_details({connectionName: "${hit.connectionName}", targets: [{display: "${hit.display}"}]})\``,
|
||||
)
|
||||
.join('\n'),
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@
|
|||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Add a build-time script that prints the full `ktx` CLI command tree (name, aliases, description per node) as an indented text tree, for docs and discovery — without adding a runtime `ktx` subcommand.
|
||||
**Goal:** Add a build-time script that prints the full `ktx` CLI command tree (name, aliases, description per node) as an indented text tree, for docs and discovery - without adding a runtime `ktx` subcommand.
|
||||
|
||||
**Architecture:** Commander.js exposes every registered command as a `Command` instance with `.commands`, `.name()`, `.aliases()`, `.description()` — we walk that tree. The current `runCommanderKtxCli` in `packages/cli/src/cli-program.ts` builds the program inline; we extract that assembly into a pure `buildKtxProgram(...)` helper that any caller can use to materialize the configured root `Command` without parsing argv. A new pure module `command-tree.ts` walks the `Command` into plain data and renders it as indented text. A new TypeScript entrypoint `print-command-tree.ts` compiles alongside `bin.ts` into `dist/print-command-tree.js`, instantiates the program with stub IO/deps, and writes the rendered tree to stdout. A pnpm script under `@ktx/cli` exposes it as `pnpm --filter @ktx/cli run docs:commands`.
|
||||
**Architecture:** Commander.js exposes every registered command as a `Command` instance with `.commands`, `.name()`, `.aliases()`, `.description()` - we walk that tree. The current `runCommanderKtxCli` in `packages/cli/src/cli-program.ts` builds the program inline; we extract that assembly into a pure `buildKtxProgram(...)` helper that any caller can use to materialize the configured root `Command` without parsing argv. A new pure module `command-tree.ts` walks the `Command` into plain data and renders it as indented text. A new TypeScript entrypoint `print-command-tree.ts` compiles alongside `bin.ts` into `dist/print-command-tree.js`, instantiates the program with stub IO/deps, and writes the rendered tree to stdout. A pnpm script under `@ktx/cli` exposes it as `pnpm --filter @ktx/cli run docs:commands`.
|
||||
|
||||
**Tech Stack:** TypeScript (NodeNext ESM), Node 22, Commander 14 via `@commander-js/extra-typings`, vitest 4.
|
||||
|
||||
|
|
@ -12,14 +12,14 @@
|
|||
|
||||
## File Map
|
||||
|
||||
- **Modify:** `packages/cli/src/cli-program.ts` — extract `buildKtxProgram` from `runCommanderKtxCli`.
|
||||
- **Create:** `packages/cli/src/cli-program.test.ts` — vitest tests for the new helper.
|
||||
- **Create:** `packages/cli/src/command-tree.ts` — pure `walkCommandTree` + `formatCommandTree`.
|
||||
- **Create:** `packages/cli/src/command-tree.test.ts` — vitest tests against ad-hoc Command trees.
|
||||
- **Create:** `packages/cli/src/print-command-tree.ts` — script entrypoint; thin glue.
|
||||
- **Create:** `packages/cli/src/print-command-tree.test.ts` — vitest test that calls the script's exported `main()` with a fake stdout and asserts the rendered tree includes known top-level commands.
|
||||
- **Modify:** `packages/cli/package.json` — add `docs:commands` script and include the new entry in tsc build output (no change needed if `tsconfig` already globs `src/**/*.ts`, but verify).
|
||||
- **Modify:** `packages/cli/README.md` (if it exists; otherwise skip) — document `pnpm run docs:commands`.
|
||||
- **Modify:** `packages/cli/src/cli-program.ts` - extract `buildKtxProgram` from `runCommanderKtxCli`.
|
||||
- **Create:** `packages/cli/src/cli-program.test.ts` - vitest tests for the new helper.
|
||||
- **Create:** `packages/cli/src/command-tree.ts` - pure `walkCommandTree` + `formatCommandTree`.
|
||||
- **Create:** `packages/cli/src/command-tree.test.ts` - vitest tests against ad-hoc Command trees.
|
||||
- **Create:** `packages/cli/src/print-command-tree.ts` - script entrypoint; thin glue.
|
||||
- **Create:** `packages/cli/src/print-command-tree.test.ts` - vitest test that calls the script's exported `main()` with a fake stdout and asserts the rendered tree includes known top-level commands.
|
||||
- **Modify:** `packages/cli/package.json` - add `docs:commands` script and include the new entry in tsc build output (no change needed if `tsconfig` already globs `src/**/*.ts`, but verify).
|
||||
- **Modify:** `packages/cli/README.md` (if it exists; otherwise skip) - document `pnpm run docs:commands`.
|
||||
|
||||
Files that change together (cli-program + its test, command-tree + its test, print-command-tree + its test) live next to each other under `packages/cli/src/`, matching the existing convention (e.g. `bin.ts`, `cli-runtime.ts`, `runtime.ts` + `runtime.test.ts`).
|
||||
|
||||
|
|
@ -27,7 +27,7 @@ Files that change together (cli-program + its test, command-tree + its test, pri
|
|||
|
||||
## Task 1: Extract `buildKtxProgram` from `runCommanderKtxCli`
|
||||
|
||||
Refactor only — no behavior change. The current code in `cli-program.ts` interleaves program construction with `parseAsync` dispatch. Splitting them lets the new script reuse construction without invoking the CLI.
|
||||
Refactor only - no behavior change. The current code in `cli-program.ts` interleaves program construction with `parseAsync` dispatch. Splitting them lets the new script reuse construction without invoking the CLI.
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/cli/src/cli-program.ts:197-275` (function `runCommanderKtxCli`)
|
||||
|
|
@ -88,7 +88,7 @@ describe('buildKtxProgram', () => {
|
|||
|
||||
Run: `pnpm --filter @ktx/cli exec vitest run src/cli-program.test.ts`
|
||||
|
||||
Expected: FAIL — `buildKtxProgram is not exported from './cli-program.js'` (or similar TS/ESM error).
|
||||
Expected: FAIL - `buildKtxProgram is not exported from './cli-program.js'` (or similar TS/ESM error).
|
||||
|
||||
- [ ] **Step 3: Extract `buildKtxProgram` from `runCommanderKtxCli`**
|
||||
|
||||
|
|
@ -160,19 +160,19 @@ Then rewrite the body of `runCommanderKtxCli` (lines 197-275) to delegate progra
|
|||
};
|
||||
```
|
||||
|
||||
Keep the `context` re-declaration only if subsequent code (the `if (argv.length === 0)` branch that calls `runBareInteractiveCommand(program, io, context)`) still needs it. It does — `runBareInteractiveCommand` consumes `context`. Keep `context` exactly as it was after the deletion; do not change `runBareInteractiveCommand`'s signature or behavior. Drop the now-removed individual `register*` calls and their `profileMark` lines from `runCommanderKtxCli`.
|
||||
Keep the `context` re-declaration only if subsequent code (the `if (argv.length === 0)` branch that calls `runBareInteractiveCommand(program, io, context)`) still needs it. It does - `runBareInteractiveCommand` consumes `context`. Keep `context` exactly as it was after the deletion; do not change `runBareInteractiveCommand`'s signature or behavior. Drop the now-removed individual `register*` calls and their `profileMark` lines from `runCommanderKtxCli`.
|
||||
|
||||
- [ ] **Step 4: Run the new test to verify it passes**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli exec vitest run src/cli-program.test.ts`
|
||||
|
||||
Expected: PASS — both `it` blocks green.
|
||||
Expected: PASS - both `it` blocks green.
|
||||
|
||||
- [ ] **Step 5: Run the full CLI test suite to confirm no regression**
|
||||
|
||||
Run: `pnpm --filter @ktx/cli run test 2>&1 | tee /tmp/ktx-cli-test-output.log`
|
||||
|
||||
Expected: PASS overall. Inspect the log if any previously-passing test now fails — most likely a missing register call (compare to lines 221-249 of the pre-change file).
|
||||
Expected: PASS overall. Inspect the log if any previously-passing test now fails - most likely a missing register call (compare to lines 221-249 of the pre-change file).
|
||||
|
||||
- [ ] **Step 6: Type-check**
|
||||
|
||||
|
|
@ -191,7 +191,7 @@ git commit -m "refactor(cli): extract buildKtxProgram for reuse outside runComma
|
|||
|
||||
## Task 2: Pure tree walker `walkCommandTree`
|
||||
|
||||
Take a Commander `Command` and produce plain data: `{ name, description, aliases, children }`. No formatting yet. Pure function — depends only on the public `Command` API.
|
||||
Take a Commander `Command` and produce plain data: `{ name, description, aliases, children }`. No formatting yet. Pure function - depends only on the public `Command` API.
|
||||
|
||||
**Files:**
|
||||
- Create: `packages/cli/src/command-tree.ts`
|
||||
|
|
@ -254,7 +254,7 @@ describe('walkCommandTree', () => {
|
|||
|
||||
Run: `pnpm --filter @ktx/cli exec vitest run src/command-tree.test.ts`
|
||||
|
||||
Expected: FAIL — `walkCommandTree` cannot be resolved.
|
||||
Expected: FAIL - `walkCommandTree` cannot be resolved.
|
||||
|
||||
- [ ] **Step 3: Implement `walkCommandTree`**
|
||||
|
||||
|
|
@ -296,7 +296,7 @@ Expected: no errors.
|
|||
|
||||
## Task 3: Indented-text renderer `formatCommandTree`
|
||||
|
||||
Render a `CommandTreeNode` as plain text. Each node on its own line: `<indent><name>[ (alias1, alias2)][ — description]`. Indent is two spaces per depth level. Children sorted alphabetically by name to keep output stable across changes that reorder registrar calls.
|
||||
Render a `CommandTreeNode` as plain text. Each node on its own line: `<indent><name>[ (alias1, alias2)][ - description]`. Indent is two spaces per depth level. Children sorted alphabetically by name to keep output stable across changes that reorder registrar calls.
|
||||
|
||||
**Files:**
|
||||
- Modify: `packages/cli/src/command-tree.ts`
|
||||
|
|
@ -312,12 +312,12 @@ import { formatCommandTree } from './command-tree.js';
|
|||
describe('formatCommandTree', () => {
|
||||
it('renders a single node with no children', () => {
|
||||
const node = { name: 'solo', description: 'just me', aliases: [], children: [] };
|
||||
expect(formatCommandTree(node)).toBe('solo — just me\n');
|
||||
expect(formatCommandTree(node)).toBe('solo - just me\n');
|
||||
});
|
||||
|
||||
it('renders aliases in parentheses before the description', () => {
|
||||
const node = { name: 'cmd', description: 'does things', aliases: ['c', 'co'], children: [] };
|
||||
expect(formatCommandTree(node)).toBe('cmd (c, co) — does things\n');
|
||||
expect(formatCommandTree(node)).toBe('cmd (c, co) - does things\n');
|
||||
});
|
||||
|
||||
it('omits the dash when description is empty', () => {
|
||||
|
|
@ -338,10 +338,10 @@ describe('formatCommandTree', () => {
|
|||
],
|
||||
};
|
||||
expect(formatCommandTree(tree)).toBe(
|
||||
'root — top\n' +
|
||||
' alpha (al) — a\n' +
|
||||
' inner — i\n' +
|
||||
' beta — b\n',
|
||||
'root - top\n' +
|
||||
' alpha (al) - a\n' +
|
||||
' inner - i\n' +
|
||||
' beta - b\n',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
@ -351,7 +351,7 @@ describe('formatCommandTree', () => {
|
|||
|
||||
Run: `pnpm --filter @ktx/cli exec vitest run src/command-tree.test.ts`
|
||||
|
||||
Expected: FAIL — `formatCommandTree` is not exported.
|
||||
Expected: FAIL - `formatCommandTree` is not exported.
|
||||
|
||||
- [ ] **Step 3: Implement `formatCommandTree`**
|
||||
|
||||
|
|
@ -367,7 +367,7 @@ export function formatCommandTree(node: CommandTreeNode): string {
|
|||
function appendNode(node: CommandTreeNode, depth: number, lines: string[]): void {
|
||||
const indent = ' '.repeat(depth);
|
||||
const aliasPart = node.aliases.length > 0 ? ` (${node.aliases.join(', ')})` : '';
|
||||
const descPart = node.description.length > 0 ? ` — ${node.description}` : '';
|
||||
const descPart = node.description.length > 0 ? ` - ${node.description}` : '';
|
||||
lines.push(`${indent}${node.name}${aliasPart}${descPart}`);
|
||||
|
||||
const sortedChildren = [...node.children].sort((a, b) => a.name.localeCompare(b.name));
|
||||
|
|
@ -419,7 +419,7 @@ describe('renderKtxCommandTree', () => {
|
|||
const output = renderKtxCommandTree();
|
||||
|
||||
const lines = output.split('\n');
|
||||
expect(lines[0]).toMatch(/^ktx( |$|\s—)/);
|
||||
expect(lines[0]).toMatch(/^ktx( |$|\s-)/);
|
||||
|
||||
// Top-level commands are indented exactly two spaces.
|
||||
const topLevel = lines
|
||||
|
|
@ -443,7 +443,7 @@ describe('renderKtxCommandTree', () => {
|
|||
|
||||
Run: `pnpm --filter @ktx/cli exec vitest run src/print-command-tree.test.ts`
|
||||
|
||||
Expected: FAIL — module not found.
|
||||
Expected: FAIL - module not found.
|
||||
|
||||
- [ ] **Step 3: Implement the script**
|
||||
|
||||
|
|
@ -495,7 +495,7 @@ if (invokedAsScript) {
|
|||
|
||||
Run: `pnpm --filter @ktx/cli exec vitest run src/print-command-tree.test.ts`
|
||||
|
||||
Expected: PASS — both assertions green.
|
||||
Expected: PASS - both assertions green.
|
||||
|
||||
- [ ] **Step 5: Type-check**
|
||||
|
||||
|
|
@ -572,9 +572,9 @@ git commit -m "chore(cli): add docs:commands pnpm script"
|
|||
|
||||
After all tasks, confirm:
|
||||
|
||||
- [ ] `pnpm --filter @ktx/cli run type-check` — clean
|
||||
- [ ] `pnpm --filter @ktx/cli run test` — green, including new tests in `cli-program.test.ts`, `command-tree.test.ts`, `print-command-tree.test.ts`
|
||||
- [ ] `pnpm --filter @ktx/cli run docs:commands` — prints `ktx` followed by indented subcommand tree
|
||||
- [ ] `git status --short` — only the files listed in the File Map are modified or created; no incidental edits
|
||||
- [ ] `pnpm --filter @ktx/cli run type-check` - clean
|
||||
- [ ] `pnpm --filter @ktx/cli run test` - green, including new tests in `cli-program.test.ts`, `command-tree.test.ts`, `print-command-tree.test.ts`
|
||||
- [ ] `pnpm --filter @ktx/cli run docs:commands` - prints `ktx` followed by indented subcommand tree
|
||||
- [ ] `git status --short` - only the files listed in the File Map are modified or created; no incidental edits
|
||||
|
||||
If any check fails, fix in place and re-run before declaring done.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# Demo Guided Tour — Design Spec
|
||||
# Demo Guided Tour - Design Spec
|
||||
|
||||
## Problem
|
||||
|
||||
|
|
@ -40,7 +40,7 @@ Copy pre-packaged assets (demo DB, replay, context artifacts)
|
|||
┌────────────────────────────────────────────────────────────────┐
|
||||
│ Demo banner (persistent, shown on every step) │
|
||||
│ │
|
||||
│ Demo mode — data has been pre-processed and KTX context is │
|
||||
│ Demo mode - data has been pre-processed and KTX context is │
|
||||
│ already built. This walkthrough illustrates the setup steps. │
|
||||
│ Selections are pre-filled and read-only. │
|
||||
└────────────────────────────────────────────────────────────────┘
|
||||
|
|
@ -67,7 +67,7 @@ Context build replay
|
|||
│
|
||||
▼
|
||||
Transition message:
|
||||
"Demo project is ready — let's connect your agent"
|
||||
"Demo project is ready - let's connect your agent"
|
||||
│
|
||||
▼
|
||||
Interactive agents step (real runKtxSetupAgentsStep())
|
||||
|
|
@ -89,7 +89,7 @@ Final summary:
|
|||
Shown at the top of every read-only step. Uses clack box-drawing style:
|
||||
|
||||
```
|
||||
┌ Demo mode — data has been pre-processed and KTX context is already built.
|
||||
┌ Demo mode - data has been pre-processed and KTX context is already built.
|
||||
│ This walkthrough illustrates the setup steps. Selections are pre-filled and read-only.
|
||||
```
|
||||
|
||||
|
|
@ -170,7 +170,7 @@ Completion summary uses the existing format:
|
|||
★ KTX finished ingesting your data
|
||||
|
||||
✓ Analyzed X business areas
|
||||
✓ Reconciled — 0 conflicts
|
||||
✓ Reconciled - 0 conflicts
|
||||
|
||||
KTX created:
|
||||
📊 X query definitions
|
||||
|
|
@ -187,7 +187,7 @@ The exact counts and artifact names come from the pre-packaged demo results
|
|||
A brief message bridges from the read-only tour to the interactive step:
|
||||
|
||||
```
|
||||
┌ Demo project is ready — let's connect your agent
|
||||
┌ Demo project is ready - let's connect your agent
|
||||
│
|
||||
│ Your KTX context has been built with demo data.
|
||||
│ Select an agent to start using it.
|
||||
|
|
@ -240,13 +240,13 @@ the pre-packaged replay file at an accelerated playback rate.
|
|||
| `packages/cli/src/setup.ts` | Add `demoMode` flag to setup loop; skip models/embeddings; dispatch to demo cards for databases/sources; show demo banner; demo completion summary |
|
||||
| `packages/cli/src/setup-demo-cards.ts` | New file: `renderDemoCard()` helper, demo banner renderer, demo step definitions |
|
||||
| `packages/cli/src/setup-context.ts` | Support replay mode for demo: feed pre-packaged events at accelerated pace through existing progress view |
|
||||
| `packages/cli/src/demo.ts` | Remove or simplify `runKtxSetupDemoFromEntryMenu()` — now dispatches to the main setup loop with `demoMode: true` |
|
||||
| `packages/cli/src/demo.ts` | Remove or simplify `runKtxSetupDemoFromEntryMenu()` - now dispatches to the main setup loop with `demoMode: true` |
|
||||
| `packages/cli/src/demo-assets.ts` | Update asset list if new demo data is provided; ensure demo project setup writes valid `ktx.yaml` for agent use |
|
||||
|
||||
## Open Items
|
||||
|
||||
- **Demo data**: User will provide improved pre-packaged results (Postgres,
|
||||
dbt, Metabase, Notion). Current demo assets may need updating.
|
||||
- **Replay speed**: Exact acceleration factor TBD — should feel brisk but
|
||||
- **Replay speed**: Exact acceleration factor TBD - should feel brisk but
|
||||
give users time to read source names and status transitions. Start with
|
||||
~2x real-time and adjust.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# Historic SQL Ingestion — Redesign
|
||||
# Historic SQL Ingestion - Redesign
|
||||
|
||||
**Status:** draft
|
||||
**Date:** 2026-05-11
|
||||
|
|
@ -16,12 +16,12 @@ Concrete pain points observed:
|
|||
- The output is **rigid and shallow**: deterministic slot classification (constant / categorical / runtime) and triage-signal buckets do not produce narrative an agent can use. The current downstream skills (`historic_sql_ingest`, `historic_sql_curator`) try to recover narrative from these templates but at high cost.
|
||||
- Lots of moving parts (baseline files, reset detection, atomic per-connection commit, slot heuristics, ranking formula) for what is fundamentally "find interesting queries and tell agents about them."
|
||||
|
||||
The end goal — per the user — is for ingested content to be **searchable by `ktx wiki search` and `ktx sl search` to help consumer research agents do data analysis and agentic BI**.
|
||||
The end goal - per the user - is for ingested content to be **searchable by `ktx wiki search` and `ktx sl search` to help consumer research agents do data analysis and agentic BI**.
|
||||
|
||||
## 2. Design principles
|
||||
|
||||
1. **LLMs are the right tool for narrative and clustering.** Deterministic heuristics (slot classification, ranking formulas, categorical expansion) get replaced by LLM judgement applied to aggregated, bucketed inputs.
|
||||
2. **The adapter stays LLM-free.** The existing convention — adapters are deterministic, skills do LLM work — is preserved.
|
||||
2. **The adapter stays LLM-free.** The existing convention - adapters are deterministic, skills do LLM work - is preserved.
|
||||
3. **One pipeline across dialects.** A single reader interface, a single staging shape, a single set of skills. Dialect-specific behavior lives only in the snapshot query.
|
||||
4. **No work where no signal changed.** Daily reruns should LLM only the things that actually changed.
|
||||
5. **Lean context for caller agents.** Each retrieval tier (search hit → source read → pattern read) carries only what the agent needs to make the next decision. The principle lives in prompt instructions, not in defensive schema constraints.
|
||||
|
|
@ -53,9 +53,9 @@ Reader (unified) ─▶ Aggregated snapshot ─▶ Batch SQL parse ─▶ Bu
|
|||
└──────────────────────────┬───────────────────────────────────────────────────┘
|
||||
▼
|
||||
onPullSucceeded() projection (no LLM):
|
||||
Pass A — merge `usage` into _schema/{shard}.yaml (per-shard atomic, scan-managed keys)
|
||||
Pass B — write/update pattern wiki pages (slug stability + stale handling)
|
||||
Pass C — trigger SL search re-index for changed sources
|
||||
Pass A - merge `usage` into _schema/{shard}.yaml (per-shard atomic, scan-managed keys)
|
||||
Pass B - write/update pattern wiki pages (slug stability + stale handling)
|
||||
Pass C - trigger SL search re-index for changed sources
|
||||
```
|
||||
|
||||
## 4. Hot path (LLM-free)
|
||||
|
|
@ -78,7 +78,7 @@ interface HistoricSqlReader {
|
|||
|
||||
### 4.2 Snapshot queries (one per dialect)
|
||||
|
||||
**Postgres** — `pg_stat_statements` collapsed to `queryid`:
|
||||
**Postgres** - `pg_stat_statements` collapsed to `queryid`:
|
||||
|
||||
```sql
|
||||
SELECT queryid::text AS template_id,
|
||||
|
|
@ -95,7 +95,7 @@ HAVING SUM(calls) >= @min_executions
|
|||
|
||||
`firstSeen` derives from `pg_stat_statements_info.stats_reset`; `lastSeen` is `now()`. `p50RuntimeMs` / `p95RuntimeMs` collapse to `mean_ms`. `errorRate = 0` (Postgres does not track failures in `pg_stat_statements`).
|
||||
|
||||
**BigQuery** — warehouse-side aggregation over `INFORMATION_SCHEMA.JOBS_BY_PROJECT`:
|
||||
**BigQuery** - warehouse-side aggregation over `INFORMATION_SCHEMA.JOBS_BY_PROJECT`:
|
||||
|
||||
```sql
|
||||
SELECT query_hash AS template_id,
|
||||
|
|
@ -115,7 +115,7 @@ GROUP BY query_hash
|
|||
HAVING COUNT(*) >= @min_executions
|
||||
```
|
||||
|
||||
**Snowflake** — analogous, over `SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY`:
|
||||
**Snowflake** - analogous, over `SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY`:
|
||||
|
||||
```sql
|
||||
SELECT query_hash AS template_id,
|
||||
|
|
@ -154,22 +154,22 @@ Per-row parse failures are non-fatal: the template loses table grounding (exclud
|
|||
|
||||
### 4.4 Filtering (three layers)
|
||||
|
||||
**Layer A — Warehouse-side (in the SQL above):**
|
||||
**Layer A - Warehouse-side (in the SQL above):**
|
||||
|
||||
- Noise prefixes (`SHOW`, `DESCRIBE`, `EXPLAIN`, `USE`, `SET`).
|
||||
- System catalogs (`INFORMATION_SCHEMA`, `SNOWFLAKE.ACCOUNT_USAGE`, `pg_*`, `system.*`).
|
||||
- DDL / non-analytical statement types via `statement_type` / `query_type` columns (PG falls back to prefix regex).
|
||||
- Trivial probes (`SELECT 1`, `SELECT NOW()`, `SELECT VERSION()`) — configurable.
|
||||
- Trivial probes (`SELECT 1`, `SELECT NOW()`, `SELECT VERSION()`) - configurable.
|
||||
- Minimum executions threshold (`@min_executions`, default 5).
|
||||
- Trailing window (`@window_days`, default 90) — BQ/SF only.
|
||||
- Trailing window (`@window_days`, default 90) - BQ/SF only.
|
||||
|
||||
**Layer B — Post-fetch, in-memory:**
|
||||
**Layer B - Post-fetch, in-memory:**
|
||||
|
||||
- Service-account exclusion/inclusion via configurable regex patterns; three modes (`exclude` default, `include`, `mark-only`).
|
||||
- Orchestrator boilerplate (dbt/Looker/Metabase markers) — default `mark-only` (do not drop; dbt-generated queries are often the actual business logic).
|
||||
- Orchestrator boilerplate (dbt/Looker/Metabase markers) - default `mark-only` (do not drop; dbt-generated queries are often the actual business logic).
|
||||
- Failed-query filter (BQ/SF only): templates with `errorRate > 0.9 AND executions < 10`.
|
||||
|
||||
**Layer C — Post-parse:**
|
||||
**Layer C - Post-parse:**
|
||||
|
||||
- Zero-table templates (parsed cleanly but touch no real tables) are dropped from per-table bucketization and from patterns.
|
||||
|
||||
|
|
@ -187,7 +187,7 @@ In-memory pass: a single template touching N tables ends up in N table buckets.
|
|||
patterns-input.json
|
||||
```
|
||||
|
||||
`manifest.json` is small (summary, window, counts, warnings — schema in §9).
|
||||
`manifest.json` is small (summary, window, counts, warnings - schema in §9).
|
||||
|
||||
`tables/{schema}.{name}.json` contains **bucketed** content so that DiffSet content hashes are stable when nothing material changed:
|
||||
|
||||
|
|
@ -223,7 +223,7 @@ Bucket bands are defined deterministically in code (e.g. `executionsBucket`: `<1
|
|||
|
||||
### 4.7 `chunk()` (trivial, convention-following)
|
||||
|
||||
One `WorkUnit` per `tables/*.json` file (handled by `historic_sql_table_digest`) + one `WorkUnit` referencing `patterns-input.json` (handled by `historic_sql_patterns`). No custom diff logic — the framework's `DiffSetComputerPort` already filters to changed files.
|
||||
One `WorkUnit` per `tables/*.json` file (handled by `historic_sql_table_digest`) + one `WorkUnit` referencing `patterns-input.json` (handled by `historic_sql_patterns`). No custom diff logic - the framework's `DiffSetComputerPort` already filters to changed files.
|
||||
|
||||
## 5. Cold path (LLM, via skills)
|
||||
|
||||
|
|
@ -262,7 +262,7 @@ No hard length/cap constraints in the schema. Concision is a behavioral instruct
|
|||
|
||||
### 5.2 `historic_sql_patterns`
|
||||
|
||||
One invocation per run (or a small handful if `patterns-input.json` exceeds a context budget — split deterministically by `tablesTouched` cardinality stratification).
|
||||
One invocation per run (or a small handful if `patterns-input.json` exceeds a context budget - split deterministically by `tablesTouched` cardinality stratification).
|
||||
|
||||
**Prompt:** identifies recurring analytical intents that span ≥2 tables with ≥mid executionsBucket and ≥2-5 distinct users. Output is a list of `PatternOutput`.
|
||||
|
||||
|
|
@ -288,18 +288,18 @@ export const patternOutputSchema = z.object({
|
|||
|
||||
After all skills complete and evidence is committed, run two passes. Both are pure data transformations, no LLM calls.
|
||||
|
||||
**Pass A — `_schema` shard reconciliation:**
|
||||
**Pass A - `_schema` shard reconciliation:**
|
||||
|
||||
1. Collect all `historic_sql_table_usage` evidence written this run.
|
||||
2. Group by `shardKey` (`catalog.schema`).
|
||||
3. For each shard:
|
||||
- Load existing `_schema/{shardKey}.yaml`.
|
||||
- For each table entry: if new evidence exists, merge under `usage` via `mergeUsagePreservingExternal()` (only `historicSql`-managed keys touched; user-added keys preserved — same pattern as `mergeDescriptionsPreservingExternal` at `local-enrichment-artifacts.ts:237-242`).
|
||||
- For each table entry: if new evidence exists, merge under `usage` via `mergeUsagePreservingExternal()` (only `historicSql`-managed keys touched; user-added keys preserved - same pattern as `mergeDescriptionsPreservingExternal` at `local-enrichment-artifacts.ts:237-242`).
|
||||
- For tables previously present with `historicSql`-managed `usage` but absent from this run's snapshot: set `usage.staleSince = lastSnapshotSeenAt`, clear other historicSql-managed fields.
|
||||
- Atomic write to `_schema/{shardKey}.yaml`.
|
||||
4. Trigger SL search re-index for changed sources via the existing flow (`sl-search.service.ts:91-99` detects search-text drift).
|
||||
|
||||
**Pass B — wiki pattern pages:**
|
||||
**Pass B - wiki pattern pages:**
|
||||
|
||||
1. Collect all `historic_sql_pattern` evidence written this run.
|
||||
2. Load existing wiki pages with tags `['historic-sql', 'pattern']` for this connection.
|
||||
|
|
@ -312,21 +312,21 @@ After all skills complete and evidence is committed, run two passes. Both are pu
|
|||
|
||||
## 6. Search-surface plumbing
|
||||
|
||||
### 6.1 `ktx wiki search` — no plumbing required
|
||||
### 6.1 `ktx wiki search` - no plumbing required
|
||||
|
||||
Pattern pages are written to `knowledge/global/historic-sql/{slug}.md` and are discovered by the existing `searchLocalKnowledgePages()` walk. Tags `['historic-sql', 'pattern']` enable faceted search.
|
||||
|
||||
### 6.2 `ktx sl search` — small extension
|
||||
### 6.2 `ktx sl search` - small extension
|
||||
|
||||
**6.2.1 — `SemanticLayerSource.usage` field**
|
||||
**6.2.1 - `SemanticLayerSource.usage` field**
|
||||
|
||||
Add an optional `usage` field to `SemanticLayerSource` in `packages/context/src/sl/schemas.ts`, reusing the same `tableUsageOutputSchema` from `skill-schemas.ts`. Single source of truth end-to-end.
|
||||
|
||||
**6.2.2 — `_schema` → `SemanticLayerSource` projection carries `usage`**
|
||||
**6.2.2 - `_schema` → `SemanticLayerSource` projection carries `usage`**
|
||||
|
||||
The existing projection step in `local-sl.ts` (or wherever the manifest reader builds `SemanticLayerSource` objects) needs one new field copy: `entry.usage → source.usage`.
|
||||
|
||||
**6.2.3 — `buildSemanticLayerSourceSearchText()` extension**
|
||||
**6.2.3 - `buildSemanticLayerSourceSearchText()` extension**
|
||||
|
||||
Extend the function at `sl-search.service.ts:8-74` to include usage content in the FTS5/embedding text:
|
||||
|
||||
|
|
@ -344,11 +344,11 @@ if (source.usage) {
|
|||
}
|
||||
```
|
||||
|
||||
**6.2.4 — Re-index trigger**
|
||||
**6.2.4 - Re-index trigger**
|
||||
|
||||
Already wired. Per-source content-hash detection at `sl-search.service.ts:91-99` ensures only sources whose `usage` changed re-embed.
|
||||
|
||||
**6.2.5 — Query-mode result enrichment**
|
||||
**6.2.5 - Query-mode result enrichment**
|
||||
|
||||
Extend the search result shape returned by `agent sl list --query` to include `score` and an FTS5 `snippet()` per hit. Implementation: small SQL change in `sqlite-sl-sources-index.ts` to select `snippet(local_sl_sources_fts, ...)` alongside the source row.
|
||||
|
||||
|
|
@ -510,7 +510,7 @@ export const stagedManifestSchema = z.object({
|
|||
});
|
||||
```
|
||||
|
||||
In `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts` — the **single source of truth for LLM I/O shapes**, imported by the prompt builder, the evidence parser, the projection step, the `SemanticLayerSource` type, and the `_schema` manifest entry type:
|
||||
In `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts` - the **single source of truth for LLM I/O shapes**, imported by the prompt builder, the evidence parser, the projection step, the `SemanticLayerSource` type, and the `_schema` manifest entry type:
|
||||
|
||||
```typescript
|
||||
export const tableUsageOutputSchema = z.object({
|
||||
|
|
@ -541,10 +541,10 @@ export type PatternOutput = z.infer<typeof patternOutputSchema>;
|
|||
|
||||
**Extensions to existing types:**
|
||||
|
||||
- `packages/context/src/sl/schemas.ts` — `SemanticLayerSource.usage: tableUsageOutputSchema.optional()`.
|
||||
- `packages/context/src/ingest/adapters/live-database/manifest.ts` — `LiveDatabaseManifestTableEntry.usage?: TableUsageOutput`.
|
||||
- `packages/context/src/sl/schemas.ts` - `SemanticLayerSource.usage: tableUsageOutputSchema.optional()`.
|
||||
- `packages/context/src/ingest/adapters/live-database/manifest.ts` - `LiveDatabaseManifestTableEntry.usage?: TableUsageOutput`.
|
||||
|
||||
The `_schema/{shard}.yaml` manifest version need not bump — `usage` is an additive, optional field. Validators must allow unknown future keys (audit during step 1 of §10).
|
||||
The `_schema/{shard}.yaml` manifest version need not bump - `usage` is an additive, optional field. Validators must allow unknown future keys (audit during step 1 of §10).
|
||||
|
||||
## 10. Cutover plan
|
||||
|
||||
|
|
@ -554,22 +554,22 @@ Hard cutover. No parallel codepaths. Single coordinated PR (or PR train).
|
|||
|
||||
Within `packages/context/src/ingest/adapters/historic-sql/`:
|
||||
|
||||
- `stage.ts` — rewritten
|
||||
- `stage-pgss.ts` — **deleted** (no baseline tracking)
|
||||
- `stage-pgss.test.ts`, `stage-pgss-golden.test.ts` — **deleted**
|
||||
- `historic-sql.adapter.ts` — rewritten
|
||||
- `historic-sql.adapter.test.ts` — rewritten
|
||||
- `chunk.ts` / `chunk.test.ts` — rewritten (becomes trivial)
|
||||
- `detect.ts` / `detect.test.ts` — trivial update
|
||||
- `postgres-pgss-query-history-reader.ts` — rewritten as `postgres-pgss-reader.ts`; baseline-tracking code removed
|
||||
- `bigquery-query-history-reader.ts` / `snowflake-query-history-reader.ts` — rewritten; cursor logic removed; warehouse-side GROUP BY
|
||||
- `types.ts` — rewritten
|
||||
- `stage.ts` - rewritten
|
||||
- `stage-pgss.ts` - **deleted** (no baseline tracking)
|
||||
- `stage-pgss.test.ts`, `stage-pgss-golden.test.ts` - **deleted**
|
||||
- `historic-sql.adapter.ts` - rewritten
|
||||
- `historic-sql.adapter.test.ts` - rewritten
|
||||
- `chunk.ts` / `chunk.test.ts` - rewritten (becomes trivial)
|
||||
- `detect.ts` / `detect.test.ts` - trivial update
|
||||
- `postgres-pgss-query-history-reader.ts` - rewritten as `postgres-pgss-reader.ts`; baseline-tracking code removed
|
||||
- `bigquery-query-history-reader.ts` / `snowflake-query-history-reader.ts` - rewritten; cursor logic removed; warehouse-side GROUP BY
|
||||
- `types.ts` - rewritten
|
||||
- **new** `skill-schemas.ts`
|
||||
- `errors.ts` — keep (probe errors); prune unused
|
||||
- `errors.ts` - keep (probe errors); prune unused
|
||||
|
||||
Old skills `historic_sql_ingest` and `historic_sql_curator` — audit; if only consumed by historic-sql, delete.
|
||||
Old skills `historic_sql_ingest` and `historic_sql_curator` - audit; if only consumed by historic-sql, delete.
|
||||
|
||||
`expandCategoricalTemplates`, `classifySlot`, `rankTemplate`, slot-related types — gone.
|
||||
`expandCategoricalTemplates`, `classifySlot`, `rankTemplate`, slot-related types - gone.
|
||||
|
||||
### 10.2 Existing artifacts
|
||||
|
||||
|
|
@ -596,7 +596,7 @@ Old skills `historic_sql_ingest` and `historic_sql_curator` — audit; if only c
|
|||
- `historic_sql_table_digest` + `historic_sql_patterns`.
|
||||
- `onPullSucceeded` projection passes.
|
||||
- One-time legacy cleanup.
|
||||
5. **Delete the old codepath** — same PR as step 3, ideally.
|
||||
5. **Delete the old codepath** - same PR as step 3, ideally.
|
||||
6. **Docs + setup wizard** updates.
|
||||
|
||||
### 10.4 Verification before merging
|
||||
|
|
@ -612,11 +612,11 @@ Old skills `historic_sql_ingest` and `historic_sql_curator` — audit; if only c
|
|||
### 10.5 Out of scope
|
||||
|
||||
- Embedding-based pattern clustering (rejected in favor of LLM-driven intent detection).
|
||||
- Wiki shard pages (rejected — patterns are sparse; per-page is correct).
|
||||
- Wiki shard pages (rejected - patterns are sparse; per-page is correct).
|
||||
- Incremental dialect-by-dialect rollout behind a flag.
|
||||
- A `ktx historic-sql migrate` command — cleanup runs automatically once.
|
||||
- A `ktx historic-sql migrate` command - cleanup runs automatically once.
|
||||
- Framework-level `raw-sources/` retention policy (separate concern; not introduced here).
|
||||
- Per-table wiki pages (the very problem `_schema` shards exist to avoid — see §11).
|
||||
- Per-table wiki pages (the very problem `_schema` shards exist to avoid - see §11).
|
||||
|
||||
### 10.6 Risks
|
||||
|
||||
|
|
@ -632,23 +632,23 @@ Old skills `historic_sql_ingest` and `historic_sql_curator` — audit; if only c
|
|||
|
||||
Documented so future readers don't relitigate.
|
||||
|
||||
**Per-table wiki pages** — one `.md` per table under `knowledge/global/historic-sql/`. Rejected: reintroduces the per-table-file proliferation problem (`writeLocalKnowledgePage` writes one file per page) that `_schema` shards exist to avoid. ~800 markdown files for a 1000-table warehouse, ~100 churning daily.
|
||||
**Per-table wiki pages** - one `.md` per table under `knowledge/global/historic-sql/`. Rejected: reintroduces the per-table-file proliferation problem (`writeLocalKnowledgePage` writes one file per page) that `_schema` shards exist to avoid. ~800 markdown files for a 1000-table warehouse, ~100 churning daily.
|
||||
|
||||
**Single-file all-usage page** — one giant page containing every table. Rejected: ~700 KB blob; FTS5 snippets all come from the same source; `wiki read` returns an unusable mass.
|
||||
**Single-file all-usage page** - one giant page containing every table. Rejected: ~700 KB blob; FTS5 snippets all come from the same source; `wiki read` returns an unusable mass.
|
||||
|
||||
**One file per table in a new `_usage/` directory** — same file-count problem as per-table wiki, plus needs new search plumbing.
|
||||
**One file per table in a new `_usage/` directory** - same file-count problem as per-table wiki, plus needs new search plumbing.
|
||||
|
||||
**New parallel `_usage/{shard}.yaml` shards** — same sharding benefit as merging into `_schema` but without riding SL search. Plumbing required without offsetting win.
|
||||
**New parallel `_usage/{shard}.yaml` shards** - same sharding benefit as merging into `_schema` but without riding SL search. Plumbing required without offsetting win.
|
||||
|
||||
**One wiki page per `catalog.schema`** — workable, but pages get large (200 tables per page) and only rides wiki search, not SL search. The chosen design rides both.
|
||||
**One wiki page per `catalog.schema`** - workable, but pages get large (200 tables per page) and only rides wiki search, not SL search. The chosen design rides both.
|
||||
|
||||
**Single staged `snapshot.json`** — to reduce `raw-sources/` accumulation. Rejected: required custom diff logic in `chunk()`, broke framework convention, saved bounded disk for a framework-level concern (sync retention). Per-table staged files with bucketed content is cleaner.
|
||||
**Single staged `snapshot.json`** - to reduce `raw-sources/` accumulation. Rejected: required custom diff logic in `chunk()`, broke framework convention, saved bounded disk for a framework-level concern (sync retention). Per-table staged files with bucketed content is cleaner.
|
||||
|
||||
**Embedding-based pattern clustering** — using sentence-transformer embeddings to cluster templates into themes before naming via LLM. Rejected: reintroduces clustering hyperparameters and determinism the redesign aims to avoid. The LLM does the grouping in one call from the full template list, no embedding step.
|
||||
**Embedding-based pattern clustering** - using sentence-transformer embeddings to cluster templates into themes before naming via LLM. Rejected: reintroduces clustering hyperparameters and determinism the redesign aims to avoid. The LLM does the grouping in one call from the full template list, no embedding step.
|
||||
|
||||
**Skip pattern pages entirely** — ship only `_schema` enrichment for a leaner v1. Rejected: leaves `ktx wiki search` empty of historic-sql content (loses one of two stated consumption surfaces) and forces agents to synthesize cross-cutting intents from fragmented per-table mentions.
|
||||
**Skip pattern pages entirely** - ship only `_schema` enrichment for a leaner v1. Rejected: leaves `ktx wiki search` empty of historic-sql content (loses one of two stated consumption surfaces) and forces agents to synthesize cross-cutting intents from fragmented per-table mentions.
|
||||
|
||||
**TypeScript-native SQL parser** instead of sqlglot via daemon — `node-sql-parser`, `pgsql-parser` (WASM), etc. Rejected: materially worse dialect coverage on Snowflake/BigQuery edge cases; duplicates parser logic when KTX already uses sqlglot elsewhere (`python/ktx-daemon/src/ktx_daemon/lookml.py`); AGENTS.md explicitly mandates sqlglot. Batch endpoint on the existing daemon achieves the perf win.
|
||||
**TypeScript-native SQL parser** instead of sqlglot via daemon - `node-sql-parser`, `pgsql-parser` (WASM), etc. Rejected: materially worse dialect coverage on Snowflake/BigQuery edge cases; duplicates parser logic when KTX already uses sqlglot elsewhere (`python/ktx-daemon/src/ktx_daemon/lookml.py`); AGENTS.md explicitly mandates sqlglot. Batch endpoint on the existing daemon achieves the perf win.
|
||||
|
||||
**Hard length/count caps in zod output schemas** (e.g. `narrative.max(250)`, `commonFilters.max(5)`). Rejected: arbitrary thresholds, brittle retry-on-violation paths, defensive coding for a soft concern. Concision belongs in prompt instructions; the schema validates shape.
|
||||
|
||||
|
|
@ -671,7 +671,7 @@ For a large warehouse (~800 touched tables): first-run ~$20–30, daily ~$0.20
|
|||
|
||||
## 13. Open questions
|
||||
|
||||
- Exact bucket thresholds for `executionsBucket`, `distinctUsersBucket`, etc. — to be chosen during implementation based on what produces stable hashes in practice.
|
||||
- Exact bucket thresholds for `executionsBucket`, `distinctUsersBucket`, etc. - to be chosen during implementation based on what produces stable hashes in practice.
|
||||
- Final naming of the daemon endpoint (`/sql/analyze-batch` vs alternatives).
|
||||
- Whether `historic_sql_ingest` / `historic_sql_curator` skills are consumed elsewhere — audit during step 1.
|
||||
- Whether to delete legacy wiki pages automatically or behind a confirmation flag — design assumes automatic.
|
||||
- Whether `historic_sql_ingest` / `historic_sql_curator` skills are consumed elsewhere - audit during step 1.
|
||||
- Whether to delete legacy wiki pages automatically or behind a confirmation flag - design assumes automatic.
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
**Date:** 2026-05-12
|
||||
**Author:** Andrey Avtomonov
|
||||
**Status:** Design — pending implementation plan
|
||||
**Status:** Design - pending implementation plan
|
||||
|
||||
## Background and motivation
|
||||
|
||||
|
|
@ -16,7 +16,7 @@ A real-world inspection (project `/tmp/ktx-proj-1`) surfaced two failure modes t
|
|||
Root cause analysis (`packages/context/skills/notion_synthesize/SKILL.md`, `packages/context/src/ingest/tools/emit-unmapped-fallback.tool.ts`, `packages/context/src/wiki/tools/wiki-write.tool.ts`) showed three contributing factors:
|
||||
|
||||
- The synthesis LLM has no verification primitive that distinguishes a real warehouse identifier from a fabricated one. `sl_discover` only finds objects already promoted into the semantic layer; raw warehouse scans (which already exist on disk under `raw-sources/<conn>/live-database/<sync>/`) are not surfaced to the LLM at all.
|
||||
- `wiki_write` performs no body-text validation — anything the LLM emits is written.
|
||||
- `wiki_write` performs no body-text validation - anything the LLM emits is written.
|
||||
- The skill prompt itself uses `orbit_analytics.customer` as a canonical example string (`SKILL.md:70`), reinforcing the same fictional name the LLM ends up emitting.
|
||||
|
||||
Kaelio's server-side ingest WU agent (`/Users/andrey/conductor/workspaces/kaelio-main2/douala/server/src/tools/toolset-factory.service.ts`) had four verification tools that KTX dropped during the open-source extraction: `discover_data`, `entity_details`, `dictionary_search`, and `sql_execution`. The underlying connector infrastructure (`KtxScanConnector`, dialect classes, `assertReadOnlySql`, `SemanticLayerService.executeQuery`) is present in KTX, so the gap is at the tool layer, not the platform layer.
|
||||
|
|
@ -115,7 +115,7 @@ export type SupportedDriver = 'postgres'|'postgresql'|'mysql'|'sqlserver'|'snowf
|
|||
export function getDialectForDriver(driver: SupportedDriver): KtxDialect;
|
||||
```
|
||||
|
||||
Sync dispatch. The connectors' existing dialect classes already expose the same shape — `formatTableName(KtxTableRef)`, `quoteIdentifier(string)`, `mapToDimensionType(nativeType)`. The implementation plan introduces a minimal `KtxDialect` interface that these classes already satisfy structurally; no connector-internal changes required. Used by tools only for display-string parsing and error-message formatting; tools never construct executable SQL.
|
||||
Sync dispatch. The connectors' existing dialect classes already expose the same shape - `formatTableName(KtxTableRef)`, `quoteIdentifier(string)`, `mapToDimensionType(nativeType)`. The implementation plan introduces a minimal `KtxDialect` interface that these classes already satisfy structurally; no connector-internal changes required. Used by tools only for display-string parsing and error-message formatting; tools never construct executable SQL.
|
||||
|
||||
## Tool contracts
|
||||
|
||||
|
|
@ -139,7 +139,7 @@ Type: table | Native columns: 11 | PK: account_id | FKs: parent_account_id → o
|
|||
Description: One row per customer account…
|
||||
|
||||
Columns:
|
||||
- account_id (text, nullable=false, PK) — sample: ["acct_001","acct_002",…]
|
||||
- account_id (text, nullable=false, PK) - sample: ["acct_001","acct_002",…]
|
||||
- parent_account_id (text, nullable=true, FK → orbit_raw.accounts.account_id)
|
||||
- account_name (text, nullable=false)
|
||||
- …
|
||||
|
|
@ -147,7 +147,7 @@ Columns:
|
|||
Profile: rowCount=4321 distinctCount(account_id)=4321 nullRate(parent_account_id)=0.62
|
||||
```
|
||||
|
||||
When `column` is provided in a target, output is scoped to that one column. When a target doesn't resolve, output is `Not found in scan. Closest matches: …` with up to 5 candidates from `searchByName`. When the connection has no `live-database` scan, output is `No live-database scan available for connection "<name>"; run \`ktx scan\` first.` — distinct from the "not found" state.
|
||||
When `column` is provided in a target, output is scoped to that one column. When a target doesn't resolve, output is `Not found in scan. Closest matches: …` with up to 5 candidates from `searchByName`. When the connection has no `live-database` scan, output is ``No live-database scan available for connection "<name>"; run `ktx scan` first.`` - distinct from the "not found" state.
|
||||
|
||||
Structured output: `{ resolved: TableDetail[], missing: Array<{target, candidates}>, scanAvailable: boolean }`.
|
||||
|
||||
|
|
@ -165,14 +165,14 @@ input = {
|
|||
|
||||
Pipeline:
|
||||
|
||||
1. `assertReadOnlySql(sql)` — regex rejects anything starting with `insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh`.
|
||||
2. `limitSqlForExecution(sql, rowLimit)` — wraps as `select * from (<llm_sql>) as ktx_query_result limit N`.
|
||||
1. `assertReadOnlySql(sql)` - regex rejects anything starting with `insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh`.
|
||||
2. `limitSqlForExecution(sql, rowLimit)` - wraps as `select * from (<llm_sql>) as ktx_query_result limit N`.
|
||||
3. `SemanticLayerService.executeQuery(connectionName, wrappedSql)`.
|
||||
4. Format as markdown table; first ~20 rows inline; if truncated, append `… +N more rows`.
|
||||
|
||||
Structured output: `{ headers, rows, rowCount, truncated, sql, wrappedSql }`.
|
||||
|
||||
Connector errors surface verbatim (e.g., Postgres `relation "orbit_analytics.customer" does not exist`). That error message is the most valuable verification signal — it tells the LLM the identifier is fictional.
|
||||
Connector errors surface verbatim (e.g., Postgres `relation "orbit_analytics.customer" does not exist`). That error message is the most valuable verification signal - it tells the LLM the identifier is fictional.
|
||||
|
||||
Refuses `connectionName` not in `allowedConnectionNames`. Each connector's driver-level read-only enforcement (Postgres read-only transaction, BigQuery query-only jobs) is a second defence under the regex gate.
|
||||
|
||||
|
|
@ -189,9 +189,9 @@ input = {
|
|||
|
||||
Composes three searches and groups output into three sections, omitting empty sections:
|
||||
|
||||
1. **Wiki Pages** — `wiki_search({query, limit})`. Routing hint: *use `wiki_read(blockKey)` for full content*.
|
||||
2. **Semantic Layer Sources** — `sl_discover({query, connectionName})`. Routing hint: *use `sl_read_source(sourceName)` for the YAML, or `entity_details` for warehouse-shape details*.
|
||||
3. **Raw Warehouse Schema** — `WarehouseCatalogService.searchByName(connectionName, query, limit)`. Routing hint: *use `entity_details({connectionName, targets: [{display}]})` for full DDL + sample values*.
|
||||
1. **Wiki Pages** - `wiki_search({query, limit})`. Routing hint: *use `wiki_read(blockKey)` for full content*.
|
||||
2. **Semantic Layer Sources** - `sl_discover({query, connectionName})`. Routing hint: *use `sl_read_source(sourceName)` for the YAML, or `entity_details` for warehouse-shape details*.
|
||||
3. **Raw Warehouse Schema** - `WarehouseCatalogService.searchByName(connectionName, query, limit)`. Routing hint: *use `entity_details({connectionName, targets: [{display}]})` for full DDL + sample values*.
|
||||
|
||||
When `sourceName` is set, delegates entirely to `sl_discover` inspect mode and skips other sections. When all three sections are empty, output is `No matches for "<query>" across wiki, semantic layer, or raw warehouse schema. Try broader terms; this concept may not exist yet.`
|
||||
|
||||
|
|
@ -215,7 +215,7 @@ const warehouseTools = createWarehouseVerificationTools({
|
|||
// alongside emit_unmapped_fallback.
|
||||
```
|
||||
|
||||
`createWarehouseVerificationTools` returns `Record<string, Tool>` with three keys. The set is wired into every adapter's synthesis stage — no per-adapter opt-in.
|
||||
`createWarehouseVerificationTools` returns `Record<string, Tool>` with three keys. The set is wired into every adapter's synthesis stage - no per-adapter opt-in.
|
||||
|
||||
## Skill-prompt updates
|
||||
|
||||
|
|
@ -227,12 +227,12 @@ const warehouseTools = createWarehouseVerificationTools({
|
|||
## Identifier Verification Protocol
|
||||
|
||||
Before writing a wiki page or SL source on any topic:
|
||||
1. `discover_data({query: "<topic>"})` — see what wikis, SL sources, and raw tables
|
||||
1. `discover_data({query: "<topic>"})` - see what wikis, SL sources, and raw tables
|
||||
already exist. Prefer updating existing pages over creating new ones.
|
||||
|
||||
Before emitting any `schema.table` or `schema.table.column` into a wiki body,
|
||||
SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`:
|
||||
2. `entity_details({connectionName, targets: [{display: "<identifier>"}]})` —
|
||||
2. `entity_details({connectionName, targets: [{display: "<identifier>"}]})` -
|
||||
confirm the identifier resolves; inspect native types, FK/PK, and sampleValues.
|
||||
3. For literal values from the source (status codes, plan tiers): check whether
|
||||
they appear in `entity_details`' `sampleValues` for the relevant column.
|
||||
|
|
@ -241,7 +241,7 @@ SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`:
|
|||
4. If the candidate identifier still doesn't resolve, do one of:
|
||||
(a) Use `sql_execution` with `SELECT 1 FROM <ref> LIMIT 0`. If it errors,
|
||||
the identifier is fictional.
|
||||
(b) Wrap the identifier in `[unverified — from <rawPath>]` in the wiki body,
|
||||
(b) Wrap the identifier in `[unverified - from <rawPath>]` in the wiki body,
|
||||
citing the exact raw path that mentioned it.
|
||||
(c) When recording `emit_unmapped_fallback` with `no_physical_table`,
|
||||
include the failing probe error in `clarification`.
|
||||
|
|
@ -271,10 +271,10 @@ Two skills are deliberately excluded from updates: `ingest_triage` (read-only tr
|
|||
|
||||
### Cleanups beyond the three-tool addition
|
||||
|
||||
- `notion_synthesize/SKILL.md:70` — remove `orbit_analytics.customer` (placeholder).
|
||||
- `packages/context/src/ingest/tools/emit-unmapped-fallback.tool.ts:67` — same example string in the Zod `.describe()` — replace with `<schema>.<table>`.
|
||||
- `dbt_ingest/SKILL.md:24` — fix `wiki_sl_search` and `sl_describe_table` (neither tool exists in KTX).
|
||||
- `packages/context/src/sl/tools/sl-warehouse-validation.ts:93` — inline error message references the non-existent `sl_describe_table`. Replace with `sl_read_source`.
|
||||
- `notion_synthesize/SKILL.md:70` - remove `orbit_analytics.customer` (placeholder).
|
||||
- `packages/context/src/ingest/tools/emit-unmapped-fallback.tool.ts:67` - same example string in the Zod `.describe()` - replace with `<schema>.<table>`.
|
||||
- `dbt_ingest/SKILL.md:24` - fix `wiki_sl_search` and `sl_describe_table` (neither tool exists in KTX).
|
||||
- `packages/context/src/sl/tools/sl-warehouse-validation.ts:93` - inline error message references the non-existent `sl_describe_table`. Replace with `sl_read_source`.
|
||||
|
||||
## Testing strategy
|
||||
|
||||
|
|
@ -294,7 +294,7 @@ Two skills are deliberately excluded from updates: `ingest_triage` (read-only tr
|
|||
|
||||
- Extend `packages/context/src/ingest/ingest-bundle.runner.test.ts` to verify the three new tools are present in both WU-stage and reconcile-stage tool maps and refuse out-of-scope `connectionName` values.
|
||||
- New fixture-based test: stage a small `raw-sources/<conn>/live-database/<sync>/` directory with 2 tables + 1 enrichment profile, then call each tool through the runner's tool map and assert the markdown contains the expected fields. Uses the same fake-LLM harness as `notion.adapter.test.ts`.
|
||||
- One end-to-end regression test reproducing the `orbit_analytics.customer` hallucination: a fake Notion page mentioning the fictional table is fed to the synthesis stage; the run produces a wiki page where the fictional name is wrapped in `[unverified — …]` or omitted, not promoted to `tables:` frontmatter.
|
||||
- One end-to-end regression test reproducing the `orbit_analytics.customer` hallucination: a fake Notion page mentioning the fictional table is fed to the synthesis stage; the run produces a wiki page where the fictional name is wrapped in `[unverified - …]` or omitted, not promoted to `tables:` frontmatter.
|
||||
|
||||
### Prompt-bundling tests
|
||||
|
||||
|
|
@ -306,7 +306,7 @@ Extend `packages/context/src/memory/memory-runtime-assets.test.ts`:
|
|||
|
||||
### Performance guards
|
||||
|
||||
`WarehouseCatalogService` caches the per-connection table list per stage (one WorkUnit's lifetime). Tests assert second call is a cache hit. No DB index for `searchByName` in this iteration — linear scan over scan artefacts is acceptable up to ~50K columns. If volume warrants it later, a follow-up PR adds a SQLite FTS index.
|
||||
`WarehouseCatalogService` caches the per-connection table list per stage (one WorkUnit's lifetime). Tests assert second call is a cache hit. No DB index for `searchByName` in this iteration - linear scan over scan artefacts is acceptable up to ~50K columns. If volume warrants it later, a follow-up PR adds a SQLite FTS index.
|
||||
|
||||
## Rollout
|
||||
|
||||
|
|
@ -323,7 +323,7 @@ Skill prompts land last so they can reference the tools that already exist.
|
|||
|
||||
## Out of scope
|
||||
|
||||
- **Hard write-time validation in `wiki_write` / `emit_unmapped_fallback`.** A complementary spec covers regex-based identifier validation at the write boundary. Defence-in-depth — separate concern.
|
||||
- **Hard write-time validation in `wiki_write` / `emit_unmapped_fallback`.** A complementary spec covers regex-based identifier validation at the write boundary. Defence-in-depth - separate concern.
|
||||
- **SQLite FTS index for `searchByName`.** Deferred until the linear scan benchmark fails.
|
||||
- **`raw_schema_search` as a standalone tool.** `discover_data`'s raw section covers the concept-search case.
|
||||
- **`semantic_query` in the synthesis toolset.** `semantic_query` will exist in KTX for the research/chat-time agent; it is deliberately excluded from synthesis because synthesis creates SL sources rather than queries them.
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
**Date:** 2026-05-13
|
||||
**Author:** Andrey Avtomonov
|
||||
**Status:** Design — pending implementation plan
|
||||
**Status:** Design - pending implementation plan
|
||||
|
||||
## Background
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue