diff --git a/docs-site/content/docs/cli-reference/ktx-setup.mdx b/docs-site/content/docs/cli-reference/ktx-setup.mdx index 77f8b359..943033b5 100644 --- a/docs-site/content/docs/cli-reference/ktx-setup.mdx +++ b/docs-site/content/docs/cli-reference/ktx-setup.mdx @@ -90,7 +90,8 @@ ktx setup [options] | `--enable-historic-sql` | Enable Historic SQL when the selected database supports it | `false` | | `--disable-historic-sql` | Disable Historic SQL for the selected database | `false` | | `--historic-sql-window-days ` | Historic SQL query-history window in days | — | -| `--historic-sql-min-calls ` | Postgres `pg_stat_statements` minimum calls floor | — | +| `--historic-sql-min-executions ` | Minimum executions for a Historic SQL template | — | +| `--historic-sql-min-calls ` | Alias for `--historic-sql-min-executions` for one release | — | | `--historic-sql-service-account-pattern ` | Historic SQL service-account regex; repeatable | — | | `--historic-sql-redaction-pattern ` | Historic SQL SQL-literal redaction regex; repeatable | — | diff --git a/docs-site/content/docs/integrations/primary-sources.mdx b/docs-site/content/docs/integrations/primary-sources.mdx index 8f8c1391..be71cba0 100644 --- a/docs-site/content/docs/integrations/primary-sources.mdx +++ b/docs-site/content/docs/integrations/primary-sources.mdx @@ -76,8 +76,11 @@ PostgreSQL Historic SQL mines real query patterns from `pg_stat_statements`. 
Thi ```yaml historicSql: - minCalls: 5 # Minimum call count to include a query template - maxTemplatesPerRun: 5000 + enabled: true + dialect: postgres + minExecutions: 5 + filters: + dropTrivialProbes: true ``` ### Dialect notes @@ -134,18 +137,27 @@ For multiple schemas: | Foreign keys | No | Not available in Snowflake | | Row count estimates | Yes | From `INFORMATION_SCHEMA.TABLES.ROW_COUNT` | | Column statistics | No | — | -| Historic SQL | Configurable | Query-history settings can be stored; local CLI Historic SQL ingest currently uses the Postgres path | +| Historic SQL | Yes | Via `SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY` when enabled | | Table sampling | Yes | — | ### Historic SQL -Snowflake Historic SQL settings describe how query history should be sampled when that runtime path is available. +Snowflake Historic SQL reads aggregated query-history templates from +`SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY` and feeds the same unified staged +artifact shape as Postgres and BigQuery. ```yaml historicSql: + enabled: true + dialect: snowflake windowDays: 90 + minExecutions: 5 + filters: + dropTrivialProbes: true + serviceAccounts: + patterns: ['^svc_'] + mode: exclude redactionPatterns: [] - serviceAccountUserPatterns: [] ``` ### Dialect notes @@ -200,18 +212,27 @@ The project ID is extracted automatically from the service account JSON file. | Foreign keys | No | Not available in BigQuery | | Row count estimates | Yes | From table metadata | | Column statistics | No | — | -| Historic SQL | Configurable | Query-history settings can be stored; local CLI Historic SQL ingest currently uses the Postgres path | +| Historic SQL | Yes | Via region-scoped `INFORMATION_SCHEMA.JOBS_BY_PROJECT` when enabled | | Table sampling | Yes | — | ### Historic SQL -BigQuery Historic SQL settings describe how `INFORMATION_SCHEMA.JOBS_BY_PROJECT` should be sampled when that runtime path is available. 
+BigQuery Historic SQL reads aggregated query-history templates from +region-scoped `INFORMATION_SCHEMA.JOBS_BY_PROJECT` and feeds the same unified +staged artifact shape as Postgres and Snowflake. ```yaml historicSql: + enabled: true + dialect: bigquery windowDays: 90 + minExecutions: 5 + filters: + dropTrivialProbes: true + serviceAccounts: + patterns: ['@bot\.'] + mode: exclude redactionPatterns: [] - serviceAccountUserPatterns: [] ``` ### Dialect notes diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md b/docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md new file mode 100644 index 00000000..a7a5cc6c --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md @@ -0,0 +1,1277 @@ +# Historic SQL Cross-Dialect Readiness Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make the redesigned historic-SQL adapter usable through the local CLI for Postgres, BigQuery, and Snowflake, with a truthful probe contract and Postgres doctor severity that matches the redesign. + +**Architecture:** Keep the unified hot path and skills/projection code intact. Normalize every historic-SQL reader to return a deterministic probe object, allow the local adapter factory to inject any `HistoricSqlReader` plus matching query client, and let the CLI choose the reader/query client from the configured connection dialect. Postgres `pg_stat_statements.max` becomes informational while `pg_stat_statements.track = none` remains a warning. + +**Tech Stack:** TypeScript ESM/NodeNext, zod 4, Vitest, existing KTX connector scan interfaces, existing managed daemon SQL-analysis port.
+ +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans found that are based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md` + +Implemented status verified in this worktree: + +- `2026-05-11-historic-sql-foundations.md` is implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `SqlAnalysisPort.analyzeBatch()` in `packages/context/src/sql-analysis/ports.ts`, `/sql/analyze-batch` in `python/ktx-daemon/src/ktx_daemon/app.py`, `SemanticLayerSource.usage` in `packages/context/src/sl/types.ts`, and `mergeUsagePreservingExternal()` in `packages/context/src/ingest/adapters/live-database/manifest.ts`. +- `2026-05-11-historic-sql-search-enrichment.md` is implemented. Evidence: `packages/context/src/sl/sl-search.service.ts` indexes `source.usage`, `packages/context/src/sl/sqlite-sl-sources-index.ts` selects FTS snippets, and local/MCP list surfaces expose `frequencyTier` and `snippet`. +- `2026-05-11-historic-sql-unified-hot-path.md` is implemented for the shared stager/chunker and Postgres reader. Evidence: `stageHistoricSqlAggregatedSnapshot()`, `chunkHistoricSqlUnifiedStagedDir()`, `PostgresPgssReader`, aggregate BigQuery/Snowflake `fetchAggregated()` methods, unified schemas, and exports exist. +- `2026-05-11-historic-sql-skills-projection-cutover.md` is implemented for the production adapter, skills, evidence tool, projection post-processor, and old code deletion. 
Evidence: `HistoricSqlSourceAdapter` uses `stageHistoricSqlAggregatedSnapshot()` and `chunkHistoricSqlUnifiedStagedDir()`, `packages/context/skills/historic_sql_table_digest/` and `packages/context/skills/historic_sql_patterns/` exist, `HistoricSqlProjectionPostProcessor` is wired in `local-bundle-runtime.ts`, and old `historic_sql_ingest` / `historic_sql_curator` skill directories are absent. + +Remaining core gaps from the spec: + +- `BigQueryHistoricSqlQueryHistoryReader.probe()` and `SnowflakeHistoricSqlQueryHistoryReader.probe()` return `void`, but `stageHistoricSqlAggregatedSnapshot()` reads `probe.warnings`. A BigQuery or Snowflake historic-SQL run would fail before staging. +- `createKtxCliLocalIngestAdapters()` only registers a historic-SQL adapter when the target connection is Postgres, while `ktx setup` can enable `historicSql` for BigQuery and Snowflake. +- `PostgresPgssReader.probe()` still reports low `pg_stat_statements.max` as a warning, but the spec says that check is informational after baseline tracking was removed. + +This plan does not update `examples/postgres-historic/README.md` or `examples/postgres-historic/scripts/smoke.sh`. Those still describe the legacy baseline/delta/reset behavior and should be handled in a separate documentation/acceptance plan after this cross-dialect code path is fixed. + +## File Structure + +Modify: + +- `packages/context/src/ingest/adapters/historic-sql/types.ts` + Adds optional probe `info` notes and lets injected historic-SQL dependencies use any reader/query client pair while preserving the existing Postgres-specific option. +- `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts` + Moves low `pg_stat_statements.max` from `warnings` to `info`. +- `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts` + Locks `track = none` as warning and low `max` as info. 
+- `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts` + Returns `{ warnings: [], info: [] }` from `probe()`. +- `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts` + Locks the BigQuery probe return object. +- `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts` + Returns `{ warnings: [], info: [] }` from `probe()`. +- `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts` + Locks the Snowflake probe return object. +- `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` + Updates test readers to return the normalized probe shape. +- `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts` + Updates test readers to return the normalized probe shape. +- `packages/context/src/ingest/local-adapters.ts` + Accepts generic historic-SQL reader/query-client dependencies while keeping `postgresQueryClient` as the compatibility input used by current callers. +- `packages/context/src/ingest/local-adapters.test.ts` + Verifies generic reader/query-client injection and the existing Postgres compatibility path. +- `packages/cli/src/local-adapters.ts` + Chooses Postgres, BigQuery, or Snowflake historic-SQL readers/query clients from the configured connection. +- `packages/cli/src/local-adapters.test.ts` + Adds direct tests for CLI local adapter registration for Postgres, BigQuery, and Snowflake. +- `packages/cli/src/historic-sql-doctor.ts` + Treats info-only Postgres probe notes as a passing doctor check, and warnings as warnings. +- `packages/cli/src/historic-sql-doctor.test.ts` + Verifies low `pg_stat_statements.max` is pass/detail, while `track = none` remains warn. +- `packages/cli/src/doctor.test.ts` + Updates the project doctor integration expectation for the new info-only behavior. 
+ +## Task 1: Normalize Historic-SQL Probe Results + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/types.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts` + +- [ ] **Step 1: Update failing reader probe tests** + +In `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts`, replace the existing successful probe assertion: + +```typescript +await expect(reader.probe(client)).resolves.toBeUndefined(); +``` + +with: + +```typescript +await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] }); +``` + +In `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts`, replace the existing successful probe assertion: + +```typescript +await expect(reader.probe(client)).resolves.toBeUndefined(); +``` + +with: + +```typescript +await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] }); +``` + +In `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts`, change the successful probe expectation to include `info: []`: + +```typescript +await expect(reader.probe(client)).resolves.toEqual({ + pgServerVersion: 'PostgreSQL 16.4', + warnings: [], + info: [], +}); +``` + +In the `returns a warning instead of failing when 
pg_stat_statements.track is none` test, change the expected object to: + +```typescript +await expect(reader.probe(client)).resolves.toEqual({ + pgServerVersion: 'PostgreSQL 16.4', + warnings: [ + 'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config', + ], + info: [], +}); +``` + +Rename the low-max test from: + +```typescript +it('warns when pg_stat_statements.max is below the recommended floor', async () => { +``` + +to: + +```typescript +it('returns an info note when pg_stat_statements.max is below the recommended floor', async () => { +``` + +and change its expected object to: + +```typescript +await expect(reader.probe(client)).resolves.toEqual({ + pgServerVersion: 'PostgreSQL 16.4', + warnings: [], + info: [ + 'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + ], +}); +``` + +In `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts`, change the test reader probe from: + +```typescript +async probe() { + return { warnings: ['pg_stat_statements.max is low; aggregation still proceeds'] }; +}, +``` + +to: + +```typescript +async probe() { + return { warnings: ['pg_stat_statements.track is none; aggregation still proceeds'], info: [] }; +}, +``` + +and update the manifest expectation from: + +```typescript +probeWarnings: ['pg_stat_statements.max is low; aggregation still proceeds'], +``` + +to: + +```typescript +probeWarnings: ['pg_stat_statements.track is none; aggregation still proceeds'], +``` + +In `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts`, replace every test reader probe result: + +```typescript +return { warnings: [] }; +``` + +with: + +```typescript +return { warnings: [], info: [] }; +``` + +- [ ] **Step 2: Run reader tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts \ + 
src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/stage-unified.test.ts \ + src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +``` + +Expected: FAIL. The failure should show missing `info` fields and BigQuery/Snowflake probes resolving to `undefined`. + +- [ ] **Step 3: Update probe contracts and implementations** + +In `packages/context/src/ingest/adapters/historic-sql/types.ts`, replace: + +```typescript +export interface HistoricSqlProbeResult { + warnings: string[]; +} +``` + +with: + +```typescript +export interface HistoricSqlProbeResult { + warnings: string[]; + info?: string[]; +} +``` + +In the same file, replace: + +```typescript +export interface PostgresPgssProbeResult { + pgServerVersion: string; + warnings: string[]; +} +``` + +with: + +```typescript +export interface PostgresPgssProbeResult extends HistoricSqlProbeResult { + pgServerVersion: string; + warnings: string[]; + info: string[]; +} +``` + +In `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts`, replace the warning construction block: + +```typescript +const warnings: string[] = []; +if (track === 'none') { + warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'); +} +if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) { + warnings.push( + `pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`, + ); +} + +return { pgServerVersion, warnings }; +``` + +with: + +```typescript +const warnings: string[] = []; +const info: string[] = []; +if (track === 'none') { + warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'); +} +if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) { + info.push( + `pg_stat_statements.max is ${pgssMax}; set it to 
at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`, + ); +} + +return { pgServerVersion, warnings, info }; +``` + +In `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts`, replace the successful end of `probe()`: + +```typescript +if (result.error) { + throw grantsError(result.error); +} +``` + +with: + +```typescript +if (result.error) { + throw grantsError(result.error); +} +return { warnings: [], info: [] }; +``` + +and change the method signature from: + +```typescript +async probe(client: unknown): Promise<void> { +``` + +to: + +```typescript +async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> { +``` + +In `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts`, make the same signature and return changes: + +```typescript +async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> { + let result: QueryResultLike; + try { + result = await queryClient(client).executeQuery(PROBE_SQL); + } catch (error) { + throw grantsError(error); + } + if (result.error) { + throw grantsError(result.error); + } + return { warnings: [], info: [] }; +} +``` + +- [ ] **Step 4: Run reader tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts \ + src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/stage-unified.test.ts \ + src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +``` + +Expected: PASS.
+ +- [ ] **Step 5: Commit** + +```bash +git add \ + packages/context/src/ingest/adapters/historic-sql/types.ts \ + packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts \ + packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts \ + packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts \ + packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts \ + packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts \ + packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts \ + packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts \ + packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +git commit -m "fix: normalize historic sql probe results" +``` + +## Task 2: Allow Generic Historic-SQL Reader Injection + +**Files:** +- Modify: `packages/context/src/ingest/local-adapters.ts` +- Modify: `packages/context/src/ingest/local-adapters.test.ts` + +- [ ] **Step 1: Write failing context adapter injection tests** + +In `packages/context/src/ingest/local-adapters.test.ts`, add `HistoricSqlReader` to the existing imports from `./adapters/historic-sql/types.js` if that import exists, or add this import near the other ingest imports: + +```typescript +import type { HistoricSqlReader } from './adapters/historic-sql/types.js'; +``` + +Add this test after `registers historic-sql locally when Postgres historic-SQL deps are provided`: + +```typescript +it('registers historic-sql with an injected non-Postgres reader and query client', () => { + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() {}, + }; + const queryClient = { executeQuery: async () => ({ headers: [], rows: [], totalRows: 0 }) }; + + const adapters = createDefaultLocalIngestAdapters(project, { + historicSql: { + sqlAnalysis: { + async 
analyzeForFingerprint(sql) { + return { + fingerprint: 'fp', + normalizedSql: sql, + tablesTouched: [], + literalSlots: [], + }; + }, + async analyzeBatch() { + return new Map(); + }, + }, + reader, + queryClient, + }, + }); + + const adapter = adapters.find((candidate) => candidate.source === 'historic-sql'); + expect(adapter).toBeDefined(); + expect(adapter?.fetch).toBeTypeOf('function'); +}); +``` + +Add this assertion inside the existing `registers historic-sql locally when Postgres historic-SQL deps are provided` test after the adapter lookup assertion: + +```typescript +expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', +]); +``` + +- [ ] **Step 2: Run context adapter tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/local-adapters.test.ts +``` + +Expected: FAIL with TypeScript or runtime errors because `DefaultLocalIngestAdaptersOptions['historicSql']` does not accept `reader` or `queryClient`. 
+ +- [ ] **Step 3: Update local adapter dependency shape** + +In `packages/context/src/ingest/local-adapters.ts`, add `HistoricSqlReader` to the historic-SQL type imports: + +```typescript +import { + HISTORIC_SQL_SOURCE_KEY, + historicSqlUnifiedPullConfigSchema, + type HistoricSqlReader, + type KtxPostgresQueryClient, +} from './adapters/historic-sql/types.js'; +``` + +Replace the `historicSql` option block in `DefaultLocalIngestAdaptersOptions`: + +```typescript +historicSql?: { + sqlAnalysis: SqlAnalysisPort; + postgresQueryClient: KtxPostgresQueryClient; + postgresBaselineRootDir?: string; + now?: () => Date; +}; +``` + +with: + +```typescript +historicSql?: { + sqlAnalysis: SqlAnalysisPort; + reader?: HistoricSqlReader; + queryClient?: unknown; + postgresQueryClient?: KtxPostgresQueryClient; + postgresBaselineRootDir?: string; + now?: () => Date; +}; +``` + +Replace the historic-SQL adapter construction block: + +```typescript +if (options.historicSql) { + adapters.push( + new HistoricSqlSourceAdapter({ + sqlAnalysis: options.historicSql.sqlAnalysis, + reader: new PostgresPgssReader(), + queryClient: options.historicSql.postgresQueryClient, + legacyPostgresBaselineRootDir: options.historicSql.postgresBaselineRootDir, + now: options.historicSql.now, + }), + ); +} +``` + +with: + +```typescript +if (options.historicSql) { + const queryClient = options.historicSql.queryClient ?? options.historicSql.postgresQueryClient; + if (!queryClient) { + throw new Error('Historic SQL local adapter requires queryClient or postgresQueryClient'); + } + adapters.push( + new HistoricSqlSourceAdapter({ + sqlAnalysis: options.historicSql.sqlAnalysis, + reader: options.historicSql.reader ?? 
new PostgresPgssReader(), + queryClient, + legacyPostgresBaselineRootDir: options.historicSql.postgresBaselineRootDir, + now: options.historicSql.now, + }), + ); +} +``` + +- [ ] **Step 4: Run context adapter tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/local-adapters.test.ts +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add packages/context/src/ingest/local-adapters.ts packages/context/src/ingest/local-adapters.test.ts +git commit -m "feat: allow generic historic sql readers locally" +``` + +## Task 3: Register BigQuery And Snowflake Historic SQL In The CLI + +**Files:** +- Create: `packages/cli/src/local-adapters.test.ts` +- Modify: `packages/cli/src/local-adapters.ts` + +- [ ] **Step 1: Write failing CLI local adapter tests** + +Create `packages/cli/src/local-adapters.test.ts`: + +```typescript +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { loadKtxProject } from '@ktx/context/project'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { createKtxCliLocalIngestAdapters } from './local-adapters.js'; + +function sqlAnalysisStub() { + return { + async analyzeForFingerprint(sql: string) { + return { + fingerprint: 'fp', + normalizedSql: sql, + tablesTouched: [], + literalSlots: [], + }; + }, + async analyzeBatch() { + return new Map(); + }, + }; +} + +async function writeProject(projectDir: string, body: string): Promise<void> { + await writeFile(join(projectDir, 'ktx.yaml'), body, 'utf-8'); +} + +describe('CLI local ingest adapters', () => { + let tempDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-cli-local-adapters-')); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('registers Postgres historic SQL from the requested connection', async () => { + await writeProject( + tempDir, + [
+ 'project: warehouse', + 'connections:', + ' warehouse:', + ' driver: postgres', + ' url: env:WAREHOUSE_DATABASE_URL', + ' readonly: true', + ' historicSql:', + ' enabled: true', + ' dialect: postgres', + 'ingest:', + ' adapters:', + ' - historic-sql', + '', + ].join('\n'), + ); + const project = await loadKtxProject({ projectDir: tempDir }); + + const adapters = createKtxCliLocalIngestAdapters(project, { + historicSqlConnectionId: 'warehouse', + sqlAnalysis: sqlAnalysisStub(), + }); + + expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', + ]); + }); + + it('registers BigQuery historic SQL from the requested connection', async () => { + await writeProject( + tempDir, + [ + 'project: warehouse', + 'connections:', + ' bq:', + ' driver: bigquery', + ' readonly: true', + ' dataset_id: analytics', + ' location: us', + ' credentials_json: \'{"project_id":"demo-project"}\'', + ' historicSql:', + ' enabled: true', + ' dialect: bigquery', + 'ingest:', + ' adapters:', + ' - historic-sql', + '', + ].join('\n'), + ); + const project = await loadKtxProject({ projectDir: tempDir }); + + const adapters = createKtxCliLocalIngestAdapters(project, { + historicSqlConnectionId: 'bq', + sqlAnalysis: sqlAnalysisStub(), + }); + + expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', + ]); + }); + + it('registers Snowflake historic SQL from the requested connection', async () => { + await writeProject( + tempDir, + [ + 'project: warehouse', + 'connections:', + ' sf:', + ' driver: snowflake', + ' readonly: true', + ' account: acct', + ' warehouse: wh', + ' database: ANALYTICS', + ' schema_name: PUBLIC', + ' username: reader', + ' password: env:SNOWFLAKE_PASSWORD', + ' historicSql:', + ' enabled: true', + ' dialect: snowflake', + 'ingest:', + ' adapters:', + ' - historic-sql', + '', + ].join('\n'), 
+ ); + const project = await loadKtxProject({ projectDir: tempDir }); + + const adapters = createKtxCliLocalIngestAdapters(project, { + historicSqlConnectionId: 'sf', + sqlAnalysis: sqlAnalysisStub(), + }); + + expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', + ]); + }); +}); +``` + +- [ ] **Step 2: Run the new CLI adapter test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/local-adapters.test.ts +``` + +Expected: FAIL. BigQuery and Snowflake cases should not find a `historic-sql` adapter. + +- [ ] **Step 3: Add cross-dialect query clients and reader selection** + +In `packages/cli/src/local-adapters.ts`, replace the BigQuery import: + +```typescript +import { createBigQueryLiveDatabaseIntrospection, isKtxBigQueryConnectionConfig } from '@ktx/connector-bigquery'; +``` + +with: + +```typescript +import { + createBigQueryLiveDatabaseIntrospection, + isKtxBigQueryConnectionConfig, + KtxBigQueryScanConnector, + type KtxBigQueryConnectionConfig, +} from '@ktx/connector-bigquery'; +``` + +Replace the context ingest import block: + +```typescript +import { + createDaemonLiveDatabaseIntrospection, + createDefaultLocalIngestAdapters, + type DefaultLocalIngestAdaptersOptions, + type LiveDatabaseIntrospectionPort, + LiveDatabaseSourceAdapter, + type SourceAdapter, +} from '@ktx/context/ingest'; +``` + +with: + +```typescript +import { + BigQueryHistoricSqlQueryHistoryReader, + createDaemonLiveDatabaseIntrospection, + createDefaultLocalIngestAdapters, + type DefaultLocalIngestAdaptersOptions, + type HistoricSqlReader, + type LiveDatabaseIntrospectionPort, + LiveDatabaseSourceAdapter, + PostgresPgssReader, + SnowflakeHistoricSqlQueryHistoryReader, + type SourceAdapter, +} from '@ktx/context/ingest'; +``` + +Replace the SQL-analysis import: + +```typescript +import { createHttpSqlAnalysisPort } from '@ktx/context/sql-analysis'; +``` + +with: + 
+```typescript +import { createHttpSqlAnalysisPort, type SqlAnalysisPort } from '@ktx/context/sql-analysis'; +``` + +Add this top-level Snowflake type alias below `hasSnowflakeDriver()`: + +```typescript +type SnowflakeConnectorModule = typeof import('@ktx/connector-snowflake'); +``` + +Add an injectable SQL-analysis port to `KtxCliLocalIngestAdaptersOptions`: + +```typescript +export interface KtxCliLocalIngestAdaptersOptions extends DefaultLocalIngestAdaptersOptions { + historicSqlConnectionId?: string; + sqlAnalysis?: SqlAnalysisPort; + sqlAnalysisUrl?: string; + managedDaemon?: ManagedPythonCoreDaemonOptions; +} +``` + +Add this as the first branch in `ktxCliHistoricSqlAnalysis()`: + +```typescript +if (options.sqlAnalysis) { + return options.sqlAnalysis; +} +``` + +Replace `isEnabledPostgresHistoricSqlConnection()` with these helpers: + +```typescript +function historicSqlRecord(connection: unknown): Record<string, unknown> | null { + if ( + connection && + typeof connection === 'object' && + 'historicSql' in connection && + typeof (connection as { historicSql?: unknown }).historicSql === 'object' && + (connection as { historicSql?: unknown }).historicSql !== null && + !Array.isArray((connection as { historicSql?: unknown }).historicSql) + ) { + return (connection as { historicSql: Record<string, unknown> }).historicSql; + } + return null; +} + +function enabledHistoricSqlDialect(connection: unknown): 'postgres' | 'bigquery' | 'snowflake' | null { + const historicSql = historicSqlRecord(connection); + if (historicSql?.enabled !== true) { + return null; + } + const dialect = String(historicSql.dialect ?? '').toLowerCase(); + return dialect === 'postgres' || dialect === 'bigquery' || dialect === 'snowflake' ?
dialect : null; +} +``` + +Keep `createEphemeralPostgresHistoricSqlClient()` and add these two query-client helpers below it: + +```typescript +function createEphemeralBigQueryHistoricSqlClient(project: KtxLocalProject, connectionId: string) { + const connection = project.config.connections[connectionId] as KtxBigQueryConnectionConfig | undefined; + if (!isKtxBigQueryConnectionConfig(connection)) { + throw new Error( + `Historic SQL local ingest requires a BigQuery connection, got ${String(connection?.driver ?? 'unknown')}`, + ); + } + return { + async executeQuery(query: string) { + const connector = new KtxBigQueryScanConnector({ + connectionId, + connection, + }); + try { + const result = await connector.executeReadOnly({ connectionId, sql: query }, {} as never); + return { + headers: result.headers, + rows: result.rows, + totalRows: result.totalRows, + }; + } finally { + await connector.cleanup(); + } + }, + }; +} + +async function createEphemeralSnowflakeHistoricSqlClient( + project: KtxLocalProject, + connectionId: string, + connectorModule: SnowflakeConnectorModule, +) { + const connection = project.config.connections[connectionId]; + if (!connectorModule.isKtxSnowflakeConnectionConfig(connection)) { + throw new Error( + `Historic SQL local ingest requires a Snowflake connection, got ${String(connection?.driver ?? 
'unknown')}`, + ); + } + return { + async executeQuery(query: string) { + const connector = new connectorModule.KtxSnowflakeScanConnector({ + connectionId, + connection, + }); + try { + const result = await connector.executeReadOnly({ connectionId, sql: query }, {} as never); + return { + headers: result.headers, + rows: result.rows, + totalRows: result.totalRows, + }; + } finally { + await connector.cleanup(); + } + }, + }; +} +``` + +Replace `historicSqlOptionsForLocalRun()` with: + +```typescript +function bigQueryProjectId(connection: KtxBigQueryConnectionConfig, env: NodeJS.ProcessEnv): string { + const raw = typeof connection.credentials_json === 'string' ? connection.credentials_json : ''; + const resolved = raw.startsWith('env:') ? env[raw.slice('env:'.length)] ?? '' : raw; + const parsed = JSON.parse(resolved) as { project_id?: unknown }; + if (typeof parsed.project_id !== 'string' || parsed.project_id.trim().length === 0) { + throw new Error('Historic SQL BigQuery connection requires credentials_json.project_id'); + } + return parsed.project_id; +} + +function bigQueryRegion(connection: KtxBigQueryConnectionConfig): string { + return typeof connection.location === 'string' && connection.location.trim().length > 0 + ? 
connection.location.trim() + : 'us'; +} + +function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCliLocalIngestAdaptersOptions) { + const connectionId = options.historicSqlConnectionId; + if (!connectionId) { + return undefined; + } + const connection = project.config.connections[connectionId]; + const dialect = enabledHistoricSqlDialect(connection); + if (!dialect) { + return undefined; + } + + const base = { + sqlAnalysis: ktxCliHistoricSqlAnalysis(options), + postgresBaselineRootDir: join(project.projectDir, '.ktx/cache/historic-sql'), + }; + + if (dialect === 'postgres') { + return { + ...base, + reader: new PostgresPgssReader() satisfies HistoricSqlReader, + queryClient: createEphemeralPostgresHistoricSqlClient(project, connectionId), + }; + } + + if (dialect === 'bigquery') { + if (!isKtxBigQueryConnectionConfig(connection)) { + throw new Error( + `Historic SQL local ingest requires a BigQuery connection, got ${String(connection?.driver ?? 'unknown')}`, + ); + } + return { + ...base, + reader: new BigQueryHistoricSqlQueryHistoryReader({ + projectId: bigQueryProjectId(connection, process.env), + region: bigQueryRegion(connection), + }) satisfies HistoricSqlReader, + queryClient: createEphemeralBigQueryHistoricSqlClient(project, connectionId), + }; + } + + return { + ...base, + reader: new SnowflakeHistoricSqlQueryHistoryReader() satisfies HistoricSqlReader, + queryClient: { + async executeQuery(query: string) { + const connectorModule = await import('@ktx/connector-snowflake'); + const client = await createEphemeralSnowflakeHistoricSqlClient(project, connectionId, connectorModule); + return client.executeQuery(query); + }, + }, + }; +} +``` + +- [ ] **Step 4: Run CLI adapter tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/local-adapters.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Run existing ingest wiring tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/ingest.test.ts +pnpm --filter @ktx/context exec vitest run src/ingest/local-adapters.test.ts +``` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +git add packages/cli/src/local-adapters.ts packages/cli/src/local-adapters.test.ts +git commit -m "feat: wire historic sql readers for bigquery and snowflake" +``` + +## Task 4: Downgrade Low PGSS Max To Informational Doctor Output + +**Files:** +- Modify: `packages/cli/src/historic-sql-doctor.ts` +- Modify: `packages/cli/src/historic-sql-doctor.test.ts` +- Modify: `packages/cli/src/doctor.test.ts` + +- [ ] **Step 1: Write failing doctor severity tests** + +In `packages/cli/src/historic-sql-doctor.test.ts`, replace the existing low-max warning test with: + +```typescript +it('passes with an informational note when only pg_stat_statements.max is below the recommended floor', async () => { + const checks = await runPostgresHistoricSqlDoctorChecks( + projectWithConnections({ + warehouse: { + driver: 'postgres', + url: 'env:WAREHOUSE_DATABASE_URL', + readonly: true, + historicSql: { enabled: true, dialect: 'postgres' }, + }, + }), + { + postgresHistoricSqlProbe: async () => ({ + pgServerVersion: 'PostgreSQL 16.4', + warnings: [], + info: [ + 'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + ], + }), + }, + ); + + expect(checks).toEqual([ + { + id: 'historic-sql-postgres-warehouse', + label: 'Postgres Historic SQL (warehouse)', + status: 'pass', + detail: + 'pg_stat_statements ready (PostgreSQL 16.4); info: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + }, + ]); +}); +``` + +Add this test immediately after it: + +```typescript +it('warns when pg_stat_statements tracking is disabled', async () => { + const checks = await runPostgresHistoricSqlDoctorChecks( + projectWithConnections({ + warehouse: { + 
driver: 'postgres', + url: 'env:WAREHOUSE_DATABASE_URL', + readonly: true, + historicSql: { enabled: true, dialect: 'postgres' }, + }, + }), + { + postgresHistoricSqlProbe: async () => ({ + pgServerVersion: 'PostgreSQL 16.4', + warnings: [ + 'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config', + ], + info: [ + 'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + ], + }), + }, + ); + + expect(checks).toEqual([ + { + id: 'historic-sql-postgres-warehouse', + label: 'Postgres Historic SQL (warehouse)', + status: 'warn', + detail: + 'pg_stat_statements ready (PostgreSQL 16.4) with warnings: pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config; info: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + fix: 'Update the Postgres parameter group or config, then rerun `ktx dev doctor --project-dir /tmp/ktx-project`', + }, + ]); +}); +``` + +In `packages/cli/src/doctor.test.ts`, replace the `includes Postgres historic-SQL readiness in project doctor output` test's fake historic-SQL check with a pass/info check: + +```typescript +const runHistoricSqlDoctorChecks = vi.fn(async () => [ + { + id: 'historic-sql-postgres-warehouse', + label: 'Postgres Historic SQL (warehouse)', + status: 'pass' as const, + detail: + 'pg_stat_statements ready (PostgreSQL 16.4); info: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + }, +]); +``` + +and replace the output assertions: + +```typescript +expect(testIo.stdout()).toContain('WARN Postgres Historic SQL (warehouse): pg_stat_statements ready'); +expect(testIo.stdout()).toContain('Fix: Update the Postgres parameter group or config'); +``` + +with: + +```typescript +expect(testIo.stdout()).toContain('PASS Postgres Historic SQL (warehouse): pg_stat_statements ready'); +expect(testIo.stdout()).toContain('info: 
pg_stat_statements.max is 1000'); +expect(testIo.stdout()).not.toContain('Fix: Update the Postgres parameter group or config'); +``` + +- [ ] **Step 2: Run doctor tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/historic-sql-doctor.test.ts src/doctor.test.ts +``` + +Expected: FAIL. The current doctor still treats any probe note as `warn`. + +- [ ] **Step 3: Update doctor probe and rendering logic** + +In `packages/cli/src/historic-sql-doctor.ts`, replace: + +```typescript +export interface PostgresHistoricSqlDoctorProbeResult { + pgServerVersion: string; + warnings: string[]; +} +``` + +with: + +```typescript +export interface PostgresHistoricSqlDoctorProbeResult { + pgServerVersion: string; + warnings: string[]; + info?: string[]; +} +``` + +Add this helper below `failureDetail()`: + +```typescript +function readinessDetail(result: PostgresHistoricSqlDoctorProbeResult): string { + const warningText = result.warnings.length > 0 ? ` with warnings: ${result.warnings.join('; ')}` : ''; + const info = result.info ?? []; + const infoText = info.length > 0 ? 
`; info: ${info.join('; ')}` : ''; + return `pg_stat_statements ready (${result.pgServerVersion})${warningText}${infoText}`; +} +``` + +Replace this block: + +```typescript +if (result.warnings.length > 0) { + checks.push( + check( + 'warn', + checkId(connectionId), + label, + `pg_stat_statements ready (${result.pgServerVersion}) with warnings: ${result.warnings.join('; ')}`, + `Update the Postgres parameter group or config, then rerun \`ktx dev doctor --project-dir ${project.projectDir}\``, + ), + ); +} else { + checks.push( + check('pass', checkId(connectionId), label, `pg_stat_statements ready (${result.pgServerVersion})`), + ); +} +``` + +with: + +```typescript +if (result.warnings.length > 0) { + checks.push( + check( + 'warn', + checkId(connectionId), + label, + readinessDetail(result), + `Update the Postgres parameter group or config, then rerun \`ktx dev doctor --project-dir ${project.projectDir}\``, + ), + ); +} else { + checks.push(check('pass', checkId(connectionId), label, readinessDetail(result))); +} +``` + +- [ ] **Step 4: Run doctor tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/historic-sql-doctor.test.ts src/doctor.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add packages/cli/src/historic-sql-doctor.ts packages/cli/src/historic-sql-doctor.test.ts packages/cli/src/doctor.test.ts +git commit -m "fix: make pgss max advisory informational" +``` + +## Task 5: Final Verification + +**Files:** +- Verify: `packages/context/src/ingest/adapters/historic-sql/*` +- Verify: `packages/context/src/ingest/local-adapters.ts` +- Verify: `packages/cli/src/local-adapters.ts` +- Verify: `packages/cli/src/historic-sql-doctor.ts` + +- [ ] **Step 1: Run focused historic-SQL test suites** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/adapters/historic-sql/types.test.ts \ + src/ingest/adapters/historic-sql/buckets.test.ts \ + src/ingest/adapters/historic-sql/stage-unified.test.ts \ + src/ingest/adapters/historic-sql/chunk-unified.test.ts \ + src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts \ + src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts \ + src/ingest/local-adapters.test.ts +pnpm --filter @ktx/cli exec vitest run \ + src/local-adapters.test.ts \ + src/historic-sql-doctor.test.ts \ + src/doctor.test.ts \ + src/ingest.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run package type checks** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +pnpm --filter @ktx/cli run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Run the no-old-code grep** + +Run: + +```bash +rg -n "stagePgStatStatementsTemplates|expandCategoricalTemplates|classifySlot|pgss-baseline|historic_sql_ingest|historic_sql_curator|PostgresPgssQueryHistoryReader|historic_sql_template" packages/context packages/cli +``` + +Expected: no matches. 
+ +- [ ] **Step 4: Run pre-commit for touched files** + +Run with the actual touched file list: + +```bash +uv run pre-commit run --files $(git diff --name-only) +``` + +Expected: PASS. If local `uv` refuses due to the repo's exact uv pin, activate `.venv` and run the nearest available checks, then record the exact uv version mismatch in the implementation summary. + +- [ ] **Step 5: Confirm verification did not create unintended changes** + +Run: + +```bash +git status --short +``` + +Expected: the only changed files are the files committed in Tasks 1-4. If a verification command changed another tracked file, inspect it with `git diff -- <path>` and either commit it with the task that intentionally owns that file or revert only that verification-generated file after confirming it was not user-authored work. + +## Self-Review + +Spec coverage: + +- One pipeline across dialects: Task 1 fixes reader probe compatibility; Task 3 wires BigQuery and Snowflake into the CLI local adapter path. +- Unified reader interface: Task 1 makes every reader return the probe result shape consumed by the stager. +- Doctor command severity: Task 4 implements the spec's downgrade of low `pg_stat_statements.max` from warning to informational note. +- Hard cutover and old-code deletion: Task 5 keeps the no-old-code grep in verification. +- Search surfaces, skills, evidence projection, wiki pattern pages, and old skill deletion are already implemented by earlier plans and intentionally unchanged here. +- Postgres example smoke/docs are outside this plan because they are documentation/acceptance assets, not cross-dialect adapter plumbing. The next plan should update `examples/postgres-historic/scripts/smoke.sh`, `examples/postgres-historic/README.md`, `examples/README.md`, and `scripts/examples-docs.test.mjs` from legacy baseline/delta/reset assertions to unified `manifest.json`, `tables/*.json`, `patterns-input.json`, and no-WorkUnit idempotency assertions. 
+ +Plan-quality scan: + +- No unresolved marker text from the forbidden-pattern list is present. +- Every code-changing task names exact files, includes concrete test snippets or replacement blocks, and specifies commands plus expected outcomes. + +Type consistency: + +- `HistoricSqlProbeResult.info` is optional for the generic reader interface. +- `PostgresPgssProbeResult.info` is required because the doctor consumes Postgres-specific info notes. +- `DefaultLocalIngestAdaptersOptions.historicSql.reader` and `.queryClient` align with `HistoricSqlSourceAdapterDeps`. +- CLI query-client helpers return the `headers`, `rows`, and `totalRows` shape already consumed by BigQuery and Snowflake historic-SQL readers. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md b/docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md new file mode 100644 index 00000000..6e6c6aa8 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md @@ -0,0 +1,886 @@ +# Historic SQL Docs Smoke And Config Cleanup Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Finish the historic-SQL redesign follow-through by making setup emit the canonical config shape and replacing stale PGSS baseline/delta/reset example docs with unified artifact and no-WorkUnit idempotency checks. 
+ +**Architecture:** This is the acceptance/documentation slice after the adapter cutover. Product code changes are limited to `ktx setup` Historic SQL config serialization; the Postgres example smoke remains a deterministic stage-only path that uses the real local adapter, managed daemon, Docker Postgres, and raw artifact diffing without requiring LLM credentials. Public docs are updated to match the unified Postgres, BigQuery, and Snowflake reader behavior already present in source. + +**Tech Stack:** TypeScript, Vitest, Bash, Node.js ESM, `node:test`, pnpm, Docker Compose, KTX local stage-only ingest, managed `ktx-daemon`. + +--- + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans already based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` - implemented in source: `skill-schemas.ts`, `SemanticLayerSource.usage`, `mergeUsagePreservingExternal()`, `/sql/analyze-batch`, and `SqlAnalysisPort.analyzeBatch()`. +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` - implemented in source: usage fields in `buildSemanticLayerSourceSearchText()`, SQLite FTS snippets, query-mode `score`, `frequencyTier`, and agent/MCP list propagation. +- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` - implemented in source: unified config/types, bucket helpers, `stage-unified.ts`, aggregate readers, and `chunk-unified.ts`. +- `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md` - implemented in source: replacement skills, evidence tool, projection, post-processor wiring, production adapter cutover, legacy source deletion, and `minExecutions` alias support. +- `docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md` - implemented in source: cross-dialect CLI wiring, generic reader injection, probe result normalization, and PGSS max informational doctor output. 
+ +Remaining gap this plan covers: + +- `examples/postgres-historic/scripts/smoke.sh`, `examples/postgres-historic/README.md`, `examples/README.md`, and `scripts/examples-docs.test.mjs` still describe the legacy baseline/delta/reset model. +- Public docs still mention `minCalls` and say BigQuery/Snowflake local CLI Historic SQL uses the Postgres path. +- `packages/cli/src/setup-databases.ts` still writes `serviceAccountUserPatterns` for new setup output even though the redesign's canonical runtime config is `filters.serviceAccounts`. + +## File Structure + +- Modify `packages/cli/src/setup-databases.ts`: write canonical `historicSql.filters.serviceAccounts` blocks from setup flags while keeping existing parser compatibility in `packages/context/src/ingest/adapters/historic-sql/types.ts`. +- Modify `packages/cli/src/setup-databases.test.ts`: assert generated YAML uses `filters` and no longer writes `serviceAccountUserPatterns`. +- Modify `scripts/examples-docs.test.mjs`: lock public example docs and smoke script to the unified artifact contract. +- Modify `examples/postgres-historic/scripts/smoke.sh`: assert `manifest.json`, `tables/*.json`, `patterns-input.json`, per-run `workUnitCount`, and stage-only runtime under 60 seconds after runtime warm-up. +- Modify `examples/postgres-historic/README.md`: replace baseline/delta/reset instructions with unified artifact, no-WorkUnit idempotency, and `minExecutions` language. +- Modify `examples/README.md`: replace the stale one-paragraph summary. +- Modify `docs/content/docs/integrations/primary-sources.mdx`: update Postgres, Snowflake, and BigQuery Historic SQL docs to the unified config and current support status. +- Modify `docs/content/docs/cli-reference/ktx-setup.mdx`: document `--historic-sql-min-executions` as primary and `--historic-sql-min-calls` as the one-release alias. 
+ +### Task 1: Emit Canonical Historic SQL Setup Config + +**Files:** +- Modify: `packages/cli/src/setup-databases.test.ts` +- Modify: `packages/cli/src/setup-databases.ts` + +- [ ] **Step 1: Update failing setup config assertions** + +In `packages/cli/src/setup-databases.test.ts`, update the Snowflake expectation in `writes Historic SQL config for supported Snowflake databases after validation succeeds` to: + +```typescript + expect(config.connections.snowflake).toMatchObject({ + driver: 'snowflake', + authMethod: 'password', + historicSql: { + enabled: true, + dialect: 'snowflake', + windowDays: 30, + filters: { + dropTrivialProbes: true, + serviceAccounts: { + patterns: ['^svc_'], + mode: 'exclude', + }, + }, + redactionPatterns: ['(?i)secret'], + }, + }); + expect(config.connections.snowflake.historicSql).not.toHaveProperty('serviceAccountUserPatterns'); +``` + +In the same file, update the Postgres expectation in `writes Postgres Historic SQL config with minExecutions and ignores window/redaction output` to: + +```typescript + expect(config.connections.warehouse).toMatchObject({ + driver: 'postgres', + url: 'env:DATABASE_URL', + schemas: ['public'], + historicSql: { + enabled: true, + dialect: 'postgres', + minExecutions: 12, + filters: { + dropTrivialProbes: true, + serviceAccounts: { + patterns: ['^svc_'], + mode: 'exclude', + }, + }, + }, + }); + expect(config.connections.warehouse.historicSql).not.toHaveProperty('minCalls'); + expect(config.connections.warehouse.historicSql).not.toHaveProperty('windowDays'); + expect(config.connections.warehouse.historicSql).not.toHaveProperty('redactionPatterns'); + expect(config.connections.warehouse.historicSql).not.toHaveProperty('serviceAccountUserPatterns'); +``` + +Update the existing BigQuery connection expectation in `writes Historic SQL config for supported existing database connections` to: + +```typescript + expect(config.connections.analytics).toMatchObject({ + historicSql: { + enabled: true, + dialect: 
'bigquery', + windowDays: 45, + filters: { + dropTrivialProbes: true, + }, + redactionPatterns: [], + }, + }); + expect(config.connections.analytics.historicSql).not.toHaveProperty('serviceAccountUserPatterns'); +``` + +Update the existing Postgres connection expectation in `enables Historic SQL on an existing Postgres connection` to: + +```typescript + expect(config.connections.warehouse).toMatchObject({ + historicSql: { + enabled: true, + dialect: 'postgres', + minExecutions: 8, + filters: { + dropTrivialProbes: true, + }, + }, + }); + expect(config.connections.warehouse.historicSql).not.toHaveProperty('serviceAccountUserPatterns'); +``` + +- [ ] **Step 2: Run setup tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-databases.test.ts --testNamePattern "Historic SQL" +``` + +Expected: FAIL because `historicSql.serviceAccountUserPatterns` is still written and `historicSql.filters` is missing from generated setup YAML. + +- [ ] **Step 3: Write canonical setup config** + +In `packages/cli/src/setup-databases.ts`, add this helper near `maybeApplyHistoricSqlConfig()`: + +```typescript +function historicSqlFiltersForSetup(patterns: string[] | undefined) { + const serviceAccountPatterns = patterns ?? []; + return { + dropTrivialProbes: true, + ...(serviceAccountPatterns.length > 0 + ? { + serviceAccounts: { + patterns: serviceAccountPatterns, + mode: 'exclude' as const, + }, + } + : {}), + }; +} +``` + +Then replace the `common` object inside `maybeApplyHistoricSqlConfig()` with: + +```typescript + const common: Record<string, unknown> = { + ...existing, + enabled: true, + dialect, + filters: historicSqlFiltersForSetup(input.args.historicSqlServiceAccountPatterns), + }; + delete common.serviceAccountUserPatterns; +``` + +Keep the existing `minExecutions`, `windowDays`, and `redactionPatterns` branches unchanged after this object replacement. 
+ +- [ ] **Step 4: Run setup tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-databases.test.ts --testNamePattern "Historic SQL" +``` + +Expected: PASS for all Historic SQL setup tests in `src/setup-databases.test.ts`. + +- [ ] **Step 5: Commit** + +```bash +git add packages/cli/src/setup-databases.ts packages/cli/src/setup-databases.test.ts +git commit -m "fix: write canonical historic sql setup filters" +``` + +### Task 2: Lock Example Docs To Unified Historic SQL Terms + +**Files:** +- Modify: `scripts/examples-docs.test.mjs` + +- [ ] **Step 1: Update the failing example docs test** + +Replace the `documents the Postgres historic SQL smoke example` test body in `scripts/examples-docs.test.mjs` with: + +```javascript + it('documents the Postgres historic SQL smoke example', async () => { + const examples = await readText('examples/README.md'); + const readme = await readText('examples/postgres-historic/README.md'); + const compose = await readText('examples/postgres-historic/docker-compose.yml'); + const initSql = await readText('examples/postgres-historic/init/001-schema.sql'); + const workload = await readText('examples/postgres-historic/scripts/generate-workload.sh'); + const smoke = await readText('examples/postgres-historic/scripts/smoke.sh'); + + assert.match(examples, /postgres-historic/); + assert.match(examples, /unified Historic SQL artifacts/); + assert.match(readme, /--enable-historic-sql/); + assert.match(readme, /--historic-sql-min-executions 2/); + assert.match(readme, /ktx dev doctor --project-dir/); + assert.match(readme, /Postgres Historic SQL/); + assert.match(readme, /manifest\.json/); + assert.match(readme, /tables\/\*\.json/); + assert.match(readme, /patterns-input\.json/); + assert.match(readme, /workUnitCount: 0/); + assert.match(compose, /postgres:14/); + assert.match(compose, /shared_preload_libraries=pg_stat_statements/); + assert.match(compose, /pg_stat_statements.track=top/); + 
assert.match(initSql, /CREATE EXTENSION IF NOT EXISTS pg_stat_statements/); + assert.match(initSql, /GRANT pg_read_all_stats TO ktx_reader/); + assert.match(workload, /JOIN customers/); + assert.match(workload, /app_user/); + assert.match(workload, /etl_user/); + assert.match(smoke, /assert_unified_snapshot/); + assert.match(smoke, /assert_stage_record "\$UNCHANGED_RECORD" unchanged zero/); + assert.match(smoke, /--historic-sql-min-executions 2/); + assert.match(smoke, /KTX_RUNTIME_ROOT/); + assert.match(smoke, /managedDaemon/); + assert.match(smoke, /installPolicy: 'auto'/); + assert.match(smoke, /getKtxCliPackageInfo/); + assert.doesNotMatch(smoke, /python-service/); + assert.doesNotMatch(smoke, /PYTHON_SERVICE/); + assert.doesNotMatch(smoke, /uvicorn app\.main:app/); + assert.doesNotMatch(smoke, /export KTX_SQL_ANALYSIS_URL/); + assert.doesNotMatch(smoke, /baselineFirstRun|degraded|statsResetAt|assert_manifest/); + assert.doesNotMatch(readme, /python-service/); + assert.doesNotMatch(readme, /KTX_SQL_ANALYSIS_URL/); + assert.doesNotMatch(readme, /baselineFirstRun|degraded: true|statsResetAt|fresh PGSS baseline|delta-only/); + assert.doesNotMatch(readme, /--historic-sql-min-calls/); + }); +``` + +- [ ] **Step 2: Run the docs test to verify it fails** + +Run: + +```bash +node --test scripts/examples-docs.test.mjs +``` + +Expected: FAIL because the current README and smoke script still mention `--historic-sql-min-calls`, `baselineFirstRun`, `degraded`, and the legacy `assert_manifest` helper. 
+ +- [ ] **Step 3: Commit the failing test** + +```bash +git add scripts/examples-docs.test.mjs +git commit -m "test: expect unified historic sql example docs" +``` + +### Task 3: Rewrite The Postgres Historic SQL Smoke + +**Files:** +- Modify: `examples/postgres-historic/scripts/smoke.sh` + +- [ ] **Step 1: Replace the smoke script with unified artifact assertions** + +Replace `examples/postgres-historic/scripts/smoke.sh` with: + +```bash +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXAMPLE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +KTX_ROOT="$(cd "$EXAMPLE_DIR/../.." && pwd)" +COMPOSE_FILE="$EXAMPLE_DIR/docker-compose.yml" +PROJECT_PARENT="${KTX_POSTGRES_HISTORIC_PROJECT_PARENT:-$(mktemp -d)}" +PROJECT_DIR="$PROJECT_PARENT/postgres-historic-ktx" +KTX_BIN="$KTX_ROOT/packages/cli/dist/bin.js" +MAX_STAGE_SECONDS="${KTX_POSTGRES_HISTORIC_MAX_STAGE_SECONDS:-60}" +export KTX_RUNTIME_ROOT="$PROJECT_PARENT/managed-runtime" +unset KTX_DAEMON_URL +unset KTX_SQL_ANALYSIS_URL + +cleanup() { + if [[ -f "$KTX_BIN" ]]; then + node "$KTX_BIN" runtime stop >/dev/null 2>&1 || true + fi + if [[ "${KTX_POSTGRES_HISTORIC_KEEP_DOCKER:-0}" != "1" ]]; then + docker compose -f "$COMPOSE_FILE" down -v >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +latest_manifest() { + find "$PROJECT_DIR/raw-sources/warehouse/historic-sql" -name manifest.json | sort | tail -n 1 +} + +assert_unified_snapshot() { + local manifest_path="$1" + node - "$manifest_path" <<'NODE' +const { dirname, join } = require('node:path'); +const { readFileSync, readdirSync } = require('node:fs'); + +const manifestPath = process.argv[2]; +const manifest = JSON.parse(readFileSync(manifestPath, 'utf8')); +function assert(condition, message) { + if (!condition) throw new Error(message); +} + +assert(manifest.source === 'historic-sql', `Expected source historic-sql, got ${manifest.source}`); +assert(manifest.dialect === 'postgres', `Expected dialect postgres, got 
${manifest.dialect}`); +assert(Number.isInteger(manifest.snapshotRowCount) && manifest.snapshotRowCount > 0, 'Expected snapshotRowCount > 0'); +assert(Number.isInteger(manifest.touchedTableCount) && manifest.touchedTableCount > 0, 'Expected touchedTableCount > 0'); +assert(Number.isInteger(manifest.parseFailures), 'Expected numeric parseFailures'); +assert(Array.isArray(manifest.warnings), 'Expected warnings array'); +assert(Array.isArray(manifest.probeWarnings), 'Expected probeWarnings array'); +for (const legacyKey of ['degraded', 'baselineFirstRun', 'pgServerVersion', 'statsResetAt', 'templates']) { + assert(!(legacyKey in manifest), `Legacy manifest key is still present: ${legacyKey}`); +} + +const root = dirname(manifestPath); +const tableDir = join(root, 'tables'); +const tableFiles = readdirSync(tableDir).filter((file) => file.endsWith('.json')).sort(); +assert(tableFiles.length === manifest.touchedTableCount, `Expected ${manifest.touchedTableCount} table files, got ${tableFiles.length}`); + +const firstTable = JSON.parse(readFileSync(join(tableDir, tableFiles[0]), 'utf8')); +assert(typeof firstTable.table === 'string' && firstTable.table.length > 0, 'Expected staged table name'); +assert(firstTable.stats && typeof firstTable.stats.executionsBucket === 'string', 'Expected bucketed table stats'); +assert(firstTable.columnsByClause && typeof firstTable.columnsByClause === 'object', 'Expected columnsByClause object'); +assert(Array.isArray(firstTable.observedJoins), 'Expected observedJoins array'); +assert(Array.isArray(firstTable.topTemplates) && firstTable.topTemplates.length > 0, 'Expected topTemplates'); + +const patterns = JSON.parse(readFileSync(join(root, 'patterns-input.json'), 'utf8')); +assert(Array.isArray(patterns.templates) && patterns.templates.length > 0, 'Expected patterns-input templates'); +assert( + patterns.templates.every((template) => Array.isArray(template.tablesTouched) && template.tablesTouched.length > 0), + 'Expected every pattern 
template to have touched tables', +); +NODE +} + +assert_stage_record() { + local record_path="$1" + local label="$2" + local expected_work_units="$3" + node - "$record_path" "$label" "$expected_work_units" "$MAX_STAGE_SECONDS" <<'NODE' +const { readFileSync } = require('node:fs'); + +const record = JSON.parse(readFileSync(process.argv[2], 'utf8')); +const label = process.argv[3]; +const expectedWorkUnits = process.argv[4]; +const maxSeconds = Number(process.argv[5]); +function assert(condition, message) { + if (!condition) throw new Error(message); +} + +assert(record.status === 'done', `${label}: expected status done, got ${record.status}`); +assert(record.adapter === 'historic-sql', `${label}: expected historic-sql adapter`); +assert(record.connectionId === 'warehouse', `${label}: expected warehouse connection`); +assert(record.rawFileCount >= 3, `${label}: expected manifest, patterns input, and at least one table file`); +assert(Array.isArray(record.errors) && record.errors.length === 0, `${label}: expected no errors`); + +if (expectedWorkUnits === 'zero') { + assert(record.workUnitCount === 0, `${label}: expected zero WorkUnits, got ${record.workUnitCount}`); + assert(Array.isArray(record.workUnits) && record.workUnits.length === 0, `${label}: expected empty workUnits`); +} else if (expectedWorkUnits === 'nonzero') { + assert(record.workUnitCount > 0, `${label}: expected nonzero WorkUnits`); + assert(record.workUnits.some((unit) => unit.unitKey === 'historic-sql-patterns'), `${label}: expected patterns WorkUnit`); + assert(record.workUnits.some((unit) => unit.unitKey.startsWith('historic-sql-table-')), `${label}: expected table WorkUnit`); +} else { + throw new Error(`${label}: unknown expected work unit mode ${expectedWorkUnits}`); +} + +const elapsedMs = Date.parse(record.completedAt) - Date.parse(record.startedAt); +assert(Number.isFinite(elapsedMs) && elapsedMs >= 0, `${label}: invalid elapsed time`); +assert(elapsedMs <= maxSeconds * 1000, `${label}: 
stage-only ingest took ${elapsedMs}ms, over ${maxSeconds}s`); +NODE +} + +run_historic_stage_only() { + local job_id="$1" + local record_path="$2" + node - "$KTX_ROOT" "$PROJECT_DIR" "$job_id" "$record_path" <<'NODE' +const { writeFile } = await import('node:fs/promises'); +const { join } = await import('node:path'); + +const ktxRoot = process.argv[2]; +const projectDir = process.argv[3]; +const jobId = process.argv[4]; +const recordPath = process.argv[5]; +const { loadKtxProject } = await import(join(ktxRoot, 'packages/context/dist/project/index.js')); +const { runLocalStageOnlyIngest } = await import(join(ktxRoot, 'packages/context/dist/ingest/index.js')); +const { createKtxCliLocalIngestAdapters } = await import(join(ktxRoot, 'packages/cli/dist/local-adapters.js')); +const { getKtxCliPackageInfo } = await import(join(ktxRoot, 'packages/cli/dist/index.js')); + +const project = await loadKtxProject({ projectDir }); +const cliVersion = getKtxCliPackageInfo().version; +const managedRuntimeIo = { stdout: process.stdout, stderr: process.stderr }; +const adapters = createKtxCliLocalIngestAdapters(project, { + historicSqlConnectionId: 'warehouse', + managedDaemon: { + cliVersion, + installPolicy: 'auto', + io: managedRuntimeIo, + }, +}); +const adapter = adapters.find((candidate) => candidate.source === 'historic-sql'); +if (!adapter) throw new Error('historic-sql adapter was not registered for local run'); +const record = await runLocalStageOnlyIngest({ + project, + adapters, + adapter: 'historic-sql', + connectionId: 'warehouse', + trigger: 'manual_resync', + jobId, +}); +await writeFile(recordPath, `${JSON.stringify(record, null, 2)}\n`, 'utf8'); +console.log(`${record.syncId} workUnits=${record.workUnitCount}`); +NODE +} + +cd "$KTX_ROOT" +pnpm --filter @ktx/context run build +pnpm --filter @ktx/cli run build + +docker compose -f "$COMPOSE_FILE" up -d --wait +"$EXAMPLE_DIR/scripts/generate-workload.sh" base + +export 
WAREHOUSE_DATABASE_URL="${WAREHOUSE_DATABASE_URL:-postgresql://ktx_reader:ktx_reader@127.0.0.1:55432/analytics}" # pragma: allowlist secret +node "$KTX_BIN" --project-dir "$PROJECT_DIR" setup \ + --new \ + --skip-agents \ + --skip-llm \ + --skip-embeddings \ + --skip-sources \ + --database postgres \ + --new-database-connection-id warehouse \ + --database-url env:WAREHOUSE_DATABASE_URL \ + --database-schema public \ + --enable-historic-sql \ + --historic-sql-min-executions 2 \ + --yes \ + --no-input + +node "$KTX_BIN" runtime install --yes +node "$KTX_BIN" runtime start + +FIRST_RECORD="$PROJECT_PARENT/first-record.json" +run_historic_stage_only "historic-first-$$" "$FIRST_RECORD" +FIRST_MANIFEST="$(latest_manifest)" +assert_unified_snapshot "$FIRST_MANIFEST" +assert_stage_record "$FIRST_RECORD" first nonzero + +UNCHANGED_RECORD="$PROJECT_PARENT/unchanged-record.json" +run_historic_stage_only "historic-unchanged-$$" "$UNCHANGED_RECORD" +UNCHANGED_MANIFEST="$(latest_manifest)" +assert_unified_snapshot "$UNCHANGED_MANIFEST" +assert_stage_record "$UNCHANGED_RECORD" unchanged zero + +"$EXAMPLE_DIR/scripts/generate-workload.sh" extra +CHANGED_RECORD="$PROJECT_PARENT/changed-record.json" +run_historic_stage_only "historic-changed-$$" "$CHANGED_RECORD" +CHANGED_MANIFEST="$(latest_manifest)" +assert_unified_snapshot "$CHANGED_MANIFEST" +assert_stage_record "$CHANGED_RECORD" changed nonzero + +echo "Postgres historic SQL smoke passed" +echo "Project dir: $PROJECT_DIR" +``` + +- [ ] **Step 2: Run the docs test to verify smoke-script assertions now pass or expose remaining README failures** + +Run: + +```bash +node --test scripts/examples-docs.test.mjs +``` + +Expected: still FAIL, because `examples/postgres-historic/README.md`, `examples/README.md`, and the public docs have not been rewritten yet. The smoke-specific assertions for `assert_unified_snapshot`, `assert_stage_record`, and `--historic-sql-min-executions 2` should now pass.
+ +- [ ] **Step 3: Commit** + +```bash +git add examples/postgres-historic/scripts/smoke.sh +git commit -m "test: assert unified postgres historic sql smoke" +``` + +### Task 4: Update Example And Public Docs + +**Files:** +- Modify: `examples/postgres-historic/README.md` +- Modify: `examples/README.md` +- Modify: `docs/content/docs/integrations/primary-sources.mdx` +- Modify: `docs/content/docs/cli-reference/ktx-setup.mdx` + +- [ ] **Step 1: Replace the Postgres historic README** + +Replace `examples/postgres-historic/README.md` with: + +````markdown +# Postgres Historic SQL Example + +This example is a manual smoke for the redesigned Postgres historic-SQL ingest +path through `pg_stat_statements`. It starts Postgres 14 with the extension +preloaded, generates query workload under separate users, runs `ktx setup` with +`--enable-historic-sql`, and verifies the unified staged artifacts: + +- `manifest.json` +- `tables/*.json` +- `patterns-input.json` + +The smoke also runs the same workload twice and verifies the second stage-only +run has `workUnitCount: 0`, which proves unchanged bucketed table and pattern +inputs do not schedule LLM work. + +## Prerequisites + +- Docker with Compose v2 +- Node and pnpm matching the KTX workspace +- `uv` on `PATH` so the KTX-managed Python runtime can install the bundled + runtime wheel + +## Run + +From the KTX repository root: + +```bash +examples/postgres-historic/scripts/smoke.sh +``` + +The smoke creates a temporary KTX project, isolates the managed Python runtime +under the temporary project parent, starts Postgres on `127.0.0.1:55432`, and +uses this connection URL: + +```bash +postgresql://ktx_reader:ktx_reader@127.0.0.1:55432/analytics # pragma: allowlist secret +``` + +Set `KTX_POSTGRES_HISTORIC_KEEP_DOCKER=1` to leave the container running after +the script exits. + +The smoke validates the historic-SQL raw snapshot path without requiring LLM +credentials. 
It uses KTX's local stage-only ingest API after `ktx setup`, so the +deterministic reader, batch SQL parser, stable artifact writer, and diff-based +WorkUnit planning are checked independently from curation. + +## Manual Commands + +Start Postgres and generate the base workload: + +```bash +docker compose -f examples/postgres-historic/docker-compose.yml up -d --wait +examples/postgres-historic/scripts/generate-workload.sh base +``` + +Create a project and enable historic SQL: + +```bash +export WAREHOUSE_DATABASE_URL=postgresql://ktx_reader:ktx_reader@127.0.0.1:55432/analytics # pragma: allowlist secret +pnpm --filter @ktx/cli run build +node packages/cli/dist/bin.js --project-dir /tmp/ktx-postgres-historic setup \ + --new \ + --skip-agents \ + --skip-llm \ + --skip-embeddings \ + --skip-sources \ + --database postgres \ + --new-database-connection-id warehouse \ + --database-url env:WAREHOUSE_DATABASE_URL \ + --database-schema public \ + --enable-historic-sql \ + --historic-sql-min-executions 2 \ + --yes \ + --no-input +``` + +### Readiness check + +```bash +pnpm run ktx -- dev doctor --project-dir /tmp/ktx-postgres-historic --no-input +``` + +The installed CLI form is: + +```bash +ktx dev doctor --project-dir /tmp/ktx-postgres-historic --no-input +``` + +Expected output includes `PASS Postgres Historic SQL (warehouse)` when +`pg_stat_statements` is installed, `pg_read_all_stats` is granted, and tracking +is enabled. A low `pg_stat_statements.max` value is reported as an informational +note, not a warning. + +Run local historic-SQL ingest: + +```bash +pnpm run ktx -- dev ingest run --project-dir /tmp/ktx-postgres-historic \ + --connection-id warehouse \ + --adapter historic-sql \ + --plain \ + --yes \ + --no-input +``` + +The full `dev ingest run` path also runs curation WorkUnits, so it requires a +configured LLM provider. 
+ +Inspect the latest manifest: + +```bash +find /tmp/ktx-postgres-historic/raw-sources/warehouse/historic-sql -name manifest.json | sort | tail -n 1 +``` + +The manifest should have `source: "historic-sql"`, `dialect: "postgres"`, +positive `snapshotRowCount`, positive `touchedTableCount`, numeric +`parseFailures`, and array-valued `warnings` and `probeWarnings`. The same +directory should contain `patterns-input.json` and one `tables/*.json` file per +touched table. + +## Troubleshooting + +- Missing extension: confirm `shared_preload_libraries=pg_stat_statements` and + `CREATE EXTENSION pg_stat_statements;` both happened in the `analytics` + database. +- Missing grants: confirm `GRANT pg_read_all_stats TO ktx_reader;`. +- Empty snapshot: rerun `scripts/generate-workload.sh base` and keep + `--historic-sql-min-executions 2` for the smoke. +- SQL-analysis failures: run `pnpm run ktx -- runtime doctor` from the KTX + repository root and confirm `uv`, the bundled Python wheel, and the managed + runtime all pass. +```` + +- [ ] **Step 2: Update the examples index paragraph** + +In `examples/README.md`, replace the `postgres-historic` paragraph with: + +```markdown +## postgres-historic + +`postgres-historic/` is a manual Docker-backed smoke for Postgres +historic-SQL ingest via `pg_stat_statements`. It verifies setup, unified +Historic SQL artifacts, managed daemon batch SQL analysis, and no-WorkUnit +idempotency for unchanged bucketed table and pattern inputs.
+``` + +- [ ] **Step 3: Update the setup CLI reference** + +In `docs/content/docs/cli-reference/ktx-setup.mdx`, replace the Historic SQL flag rows with: + +```markdown +| `--enable-historic-sql` | Enable Historic SQL when the selected database supports it | `false` | +| `--disable-historic-sql` | Disable Historic SQL for the selected database | `false` | +| `--historic-sql-window-days ` | Historic SQL query-history window in days | — | +| `--historic-sql-min-executions ` | Minimum executions for a Historic SQL template | — | +| `--historic-sql-min-calls ` | Alias for `--historic-sql-min-executions` for one release | — | +| `--historic-sql-service-account-pattern ` | Historic SQL service-account regex; repeatable | — | +| `--historic-sql-redaction-pattern ` | Historic SQL SQL-literal redaction regex; repeatable | — | +``` + +- [ ] **Step 4: Update primary source Historic SQL docs** + +In `docs/content/docs/integrations/primary-sources.mdx`, replace the Postgres Historic SQL config block with: + +````markdown +```yaml +historicSql: + enabled: true + dialect: postgres + minExecutions: 5 + filters: + dropTrivialProbes: true +``` +```` + +Replace the Snowflake Historic SQL feature row with: + +```markdown +| Historic SQL | Yes | Via `SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY` when enabled | +``` + +Replace the Snowflake Historic SQL paragraph and config block with: + +````markdown +Snowflake Historic SQL reads aggregated query-history templates from +`SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY` and feeds the same unified staged +artifact shape as Postgres and BigQuery. 
+ +```yaml +historicSql: + enabled: true + dialect: snowflake + windowDays: 90 + minExecutions: 5 + filters: + dropTrivialProbes: true + serviceAccounts: + patterns: ['^svc_'] + mode: exclude + redactionPatterns: [] +``` +```` + +Replace the BigQuery Historic SQL feature row with: + +```markdown +| Historic SQL | Yes | Via region-scoped `INFORMATION_SCHEMA.JOBS_BY_PROJECT` when enabled | +``` + +Replace the BigQuery Historic SQL paragraph and config block with: + +````markdown +BigQuery Historic SQL reads aggregated query-history templates from +region-scoped `INFORMATION_SCHEMA.JOBS_BY_PROJECT` and feeds the same unified +staged artifact shape as Postgres and Snowflake. + +```yaml +historicSql: + enabled: true + dialect: bigquery + windowDays: 90 + minExecutions: 5 + filters: + dropTrivialProbes: true + serviceAccounts: + patterns: ['@bot\\.'] + mode: exclude + redactionPatterns: [] +``` +```` + +- [ ] **Step 5: Run docs tests to verify they pass** + +Run: + +```bash +node --test scripts/examples-docs.test.mjs +``` + +Expected: PASS. The Postgres historic example test now sees unified artifact language and no legacy baseline/delta/reset wording. 
+ +- [ ] **Step 6: Commit** + +```bash +git add examples/postgres-historic/README.md examples/README.md docs/content/docs/integrations/primary-sources.mdx docs/content/docs/cli-reference/ktx-setup.mdx +git commit -m "docs: refresh historic sql setup and smoke docs" +``` + +### Task 5: Final Verification + +**Files:** +- Verify: `packages/cli/src/setup-databases.ts` +- Verify: `packages/cli/src/setup-databases.test.ts` +- Verify: `scripts/examples-docs.test.mjs` +- Verify: `examples/postgres-historic/scripts/smoke.sh` +- Verify: `examples/postgres-historic/README.md` +- Verify: `examples/README.md` +- Verify: `docs/content/docs/integrations/primary-sources.mdx` +- Verify: `docs/content/docs/cli-reference/ktx-setup.mdx` + +- [ ] **Step 1: Run focused setup tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-databases.test.ts --testNamePattern "Historic SQL" +``` + +Expected: PASS. + +- [ ] **Step 2: Run example docs tests** + +Run: + +```bash +node --test scripts/examples-docs.test.mjs +``` + +Expected: PASS. + +- [ ] **Step 3: Run CLI type check** + +Run: + +```bash +pnpm --filter @ktx/cli run type-check +``` + +Expected: PASS. + +- [ ] **Step 4: Run grep checks for stale legacy wording** + +Run: + +```bash +rg -n "baselineFirstRun|fresh PGSS baseline|delta-only|--historic-sql-min-calls 2|local CLI Historic SQL ingest currently uses the Postgres path" examples docs/content scripts packages/cli/src/setup-databases.test.ts +``` + +Expected: no matches. + +Run: + +```bash +rg -n "serviceAccountUserPatterns" packages/cli/src/setup-databases.ts packages/cli/src/setup-databases.test.ts docs/content examples +``` + +Expected: no matches. Existing runtime compatibility in `packages/context/src/ingest/adapters/historic-sql/types.ts` must remain untouched, so do not run this grep across `packages/context`. 
+ +- [ ] **Step 5: Run the Docker-backed smoke when Docker is available** + +Run: + +```bash +examples/postgres-historic/scripts/smoke.sh +``` + +Expected: PASS with `Postgres historic SQL smoke passed`. If Docker is not running or unavailable, record the exact Docker error and still run Steps 1-4. + +- [ ] **Step 6: Run pre-commit for touched files** + +Run: + +```bash +uv run pre-commit run --files \ + packages/cli/src/setup-databases.ts \ + packages/cli/src/setup-databases.test.ts \ + scripts/examples-docs.test.mjs \ + examples/postgres-historic/scripts/smoke.sh \ + examples/postgres-historic/README.md \ + examples/README.md \ + docs/content/docs/integrations/primary-sources.mdx \ + docs/content/docs/cli-reference/ktx-setup.mdx +``` + +Expected: PASS when pre-commit is configured. If pre-commit is not configured or this workspace lacks the required hook environment, keep the output and rely on Steps 1-5 plus `git diff --check`. + +- [ ] **Step 7: Run whitespace check** + +Run: + +```bash +git diff --check +``` + +Expected: no output. + +- [ ] **Step 8: Commit verification fixes only if verification changed files** + +If any verification step required an edit, commit the exact touched files: + +```bash +git add packages/cli/src/setup-databases.ts packages/cli/src/setup-databases.test.ts scripts/examples-docs.test.mjs examples/postgres-historic/scripts/smoke.sh examples/postgres-historic/README.md examples/README.md docs/content/docs/integrations/primary-sources.mdx docs/content/docs/cli-reference/ktx-setup.mdx +git commit -m "test: verify historic sql docs and smoke cleanup" +``` + +If verification made no edits, do not create an empty commit. + +## Self-Review + +Spec coverage: + +- Spec §8 setup config is covered by Task 1 and Task 4. +- Spec §10.3 docs and setup wizard updates are covered by Tasks 1 and 4. +- Spec §10.4 demo DB acceptance is covered by Task 3 and Task 5. 
+- The prior implemented plans already cover daemon batch analysis, unified staging, skills/projection, search enrichment, old-code deletion, and cross-dialect local adapter wiring. + +Placeholder scan: + +- This plan contains concrete file paths, exact replacement snippets, exact commands, and expected outcomes for every step. + +Type consistency: + +- `filters.dropTrivialProbes`, `filters.serviceAccounts.patterns`, and `filters.serviceAccounts.mode` match `historicSqlUnifiedPullConfigSchema`. +- `workUnitCount`, `rawFileCount`, `startedAt`, and `completedAt` match `LocalIngestRunRecord`. +- `manifest.json`, `tables/*.json`, and `patterns-input.json` match the unified staged artifact names from `stage-unified.ts`. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md b/docs/superpowers/plans/2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md new file mode 100644 index 00000000..c9c40fd9 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md @@ -0,0 +1,452 @@ +# Historic SQL End-To-End Retrieval Acceptance Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add one focused regression test that proves the redesigned historic-SQL pipeline reaches both agent retrieval surfaces after a real scheduled local ingest run. 
+ +**Architecture:** All historic-SQL redesign implementation slices are already present. This plan adds acceptance coverage around the existing production `HistoricSqlSourceAdapter`: a fake aggregate reader and fake batch SQL analysis drive the deterministic hot path, a fake `AgentRunnerService` emits typed table and pattern evidence through `emit_historic_sql_evidence`, and the normal local ingest runner performs projection, squash, wiki indexing, and semantic-layer reindexing. + +**Tech Stack:** TypeScript ESM/NodeNext, Vitest, YAML, SQLite FTS5 local search, existing local ingest runner, existing historic-SQL adapter. + +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans found that are based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md` + +Implemented status verified from this worktree: + +- `2026-05-11-historic-sql-foundations.md` is implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `packages/context/src/sql-analysis/ports.ts` exposes `analyzeBatch()`, `python/ktx-daemon/src/ktx_daemon/app.py` registers `/sql/analyze-batch`, `packages/context/src/sl/types.ts` has `SemanticLayerSource.usage`, and `packages/context/src/ingest/adapters/live-database/manifest.ts` has `mergeUsagePreservingExternal()`. +- `2026-05-11-historic-sql-search-enrichment.md` is implemented. 
Evidence: `packages/context/src/sl/sl-search.service.ts` indexes `source.usage`, `packages/context/src/sl/sqlite-sl-sources-index.ts` selects FTS snippets, and local/MCP list surfaces expose `frequencyTier` and `snippet`. +- `2026-05-11-historic-sql-unified-hot-path.md` is implemented. Evidence: `stageHistoricSqlAggregatedSnapshot()`, `chunkHistoricSqlUnifiedStagedDir()`, `PostgresPgssReader`, aggregate BigQuery/Snowflake `fetchAggregated()` methods, unified schemas, and package exports exist. +- `2026-05-11-historic-sql-skills-projection-cutover.md` is implemented. Evidence: `HistoricSqlSourceAdapter` uses the unified stager/chunker, `packages/context/skills/historic_sql_table_digest/` and `packages/context/skills/historic_sql_patterns/` exist, `emit_historic_sql_evidence` exists, `HistoricSqlProjectionPostProcessor` is wired in `packages/context/src/ingest/local-bundle-runtime.ts`, and legacy skill names no longer grep in `packages/context` or `packages/cli`. +- `2026-05-11-historic-sql-cross-dialect-readiness.md` is implemented. Evidence: `packages/cli/src/local-adapters.test.ts` covers Postgres, BigQuery, and Snowflake historic-SQL registration, and `packages/cli/src/historic-sql-doctor.test.ts` covers low `pg_stat_statements.max` as informational output. +- `2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` is implemented. Evidence: `packages/cli/src/setup-databases.test.ts` expects canonical `historicSql.filters.serviceAccounts`, `examples/postgres-historic/scripts/smoke.sh` asserts unified `manifest.json`, `tables/*.json`, `patterns-input.json`, and zero WorkUnits on the unchanged run, and public docs use `minExecutions`. +- `2026-05-11-historic-sql-projection-archive-hardening.md` is implemented. Evidence: `projection.ts` has `isArchivedPatternPage()`, excludes archived pages from active slug matching, and `projection.test.ts` covers reappearing archived patterns, stable archived pages, stale table marking, and legacy query-page deletion. 
+ +Remaining acceptance gap this plan covers: + +- The current Postgres example smoke is intentionally stage-only, so it verifies raw artifacts and zero unchanged WorkUnits but does not prove table/pattern evidence projection and retrieval. +- `packages/context/src/ingest/local-bundle-ingest.test.ts` verifies the historic-SQL post-processor with a source-dir test adapter, but it does not exercise the production `HistoricSqlSourceAdapter` scheduled-pull path or the `historic_sql_patterns` WorkUnit. +- Existing SL and wiki search tests prove the search layers independently, but no single regression proves spec §7's retrieval chain after historic-SQL ingest writes `_schema` usage and `knowledge/global/historic-sql/*.md`. + +## File Structure + +Create: + +- `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts` + Owns the end-to-end local regression for the redesigned historic-SQL pipeline. It uses the real adapter and local ingest runner, with fake deterministic reader/analysis/agent components so the test does not need a live database or LLM provider. + +## Task 1: Add Real-Adapter Local Ingest Acceptance Coverage + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts` + +- [ ] **Step 1: Verify the acceptance test does not exist yet** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +``` + +Expected: FAIL with "No test files found" because no end-to-end historic-SQL retrieval acceptance test exists yet. 
+ +- [ ] **Step 2: Write the acceptance test** + +Create `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts`: + +```typescript +import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import YAML from 'yaml'; +import { AgentRunnerService } from '../../../agent/index.js'; +import { initKtxProject, loadKtxProject, type KtxLocalProject } from '../../../project/index.js'; +import { type SqlAnalysisPort } from '../../../sql-analysis/index.js'; +import { searchLocalSlSources } from '../../../sl/local-sl.js'; +import { searchLocalKnowledgePages } from '../../../wiki/local-knowledge.js'; +import { runLocalIngest } from '../../local-ingest.js'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js'; +import type { AggregatedTemplate, HistoricSqlReader, HistoricSqlUnifiedPullConfig } from './types.js'; + +class AcceptanceHistoricSqlReader implements HistoricSqlReader { + async probe() { + return { warnings: [], info: [] }; + } + + async *fetchAggregated( + _client: unknown, + _window: { start: Date; end: Date }, + _config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable<AggregatedTemplate> { + yield { + templateId: 'pg:orders-lifecycle', + canonicalSql: + 'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.status = $1 group by o.status, c.segment', + dialect: 'postgres', + stats: { + executions: 42, + distinctUsers: 4, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 18, + p95RuntimeMs: 84, + errorRate: 0, + rowsProduced: 420, + }, + topUsers: [{ user: 'analyst@example.test', executions: 42 }], + }; + } +} + +class HistoricSqlAcceptanceAgentRunner extends AgentRunnerService { + override runLoop = vi.fn(async (params: any) => { + if (params.telemetryTags?.operationName !==
'ingest-bundle-wu') { + return { stopReason: 'natural' as const }; + } + + const emitEvidence = params.toolSet.emit_historic_sql_evidence; + if (!emitEvidence?.execute) { + throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit'); + } + + if (params.telemetryTags.unitKey === 'historic-sql-table-public-orders') { + const result = await emitEvidence.execute( + { + kind: 'table_usage', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Analysts repeatedly inspect paid order lifecycle by customer segment.', + frequencyTier: 'high', + commonFilters: ['status'], + commonGroupBys: ['status', 'segment'], + commonJoins: [{ table: 'public.customers', on: ['customer_id', 'id'] }], + staleSince: null, + }, + }, + { toolCallId: 'historic-sql-orders-usage' }, + ); + if (!String(result).includes('Recorded historic-SQL table_usage evidence')) { + throw new Error(`Unexpected orders evidence result: ${String(result)}`); + } + } + + if (params.telemetryTags.unitKey === 'historic-sql-table-public-customers') { + const result = await emitEvidence.execute( + { + kind: 'table_usage', + table: 'public.customers', + rawPath: 'tables/public.customers.json', + usage: { + narrative: 'Customers provide segment context for paid order lifecycle analysis.', + frequencyTier: 'mid', + commonFilters: [], + commonGroupBys: ['segment'], + commonJoins: [{ table: 'public.orders', on: ['id', 'customer_id'] }], + staleSince: null, + }, + }, + { toolCallId: 'historic-sql-customers-usage' }, + ); + if (!String(result).includes('Recorded historic-SQL table_usage evidence')) { + throw new Error(`Unexpected customers evidence result: ${String(result)}`); + } + } + + if (params.telemetryTags.unitKey === 'historic-sql-patterns') { + const result = await emitEvidence.execute( + { + kind: 'pattern', + rawPath: 'patterns-input.json', + pattern: { + slug: 'paid-order-lifecycle', + title: 'Paid Order Lifecycle', + narrative: 'Analysts 
join orders and customers to compare paid order lifecycle by segment.', + definitionSql: + 'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status, c.segment', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:orders-lifecycle'], + }, + }, + { toolCallId: 'historic-sql-pattern' }, + ); + if (!String(result).includes('Recorded historic-SQL pattern evidence')) { + throw new Error(`Unexpected pattern evidence result: ${String(result)}`); + } + } + + return { stopReason: 'natural' as const }; + }); + + constructor() { + super({ llmProvider: { getModel: () => ({}) as never } as never }); + } +} + +function acceptanceSqlAnalysis(): SqlAnalysisPort { + return { + analyzeForFingerprint: async () => { + throw new Error('analyzeForFingerprint should not be used by unified historic-SQL ingest'); + }, + analyzeBatch: vi.fn(async (items) => { + return new Map( + items.map((item) => [ + item.id, + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: ['status', 'segment'], + where: ['status'], + join: ['customer_id', 'id'], + groupBy: ['status', 'segment'], + }, + }, + ]), + ); + }), + }; +} + +async function writeHistoricSqlProject(project: KtxLocalProject): Promise<KtxLocalProject> { + await writeFile( + join(project.projectDir, 'ktx.yaml'), + [ + 'project: warehouse', + 'connections:', + ' warehouse:', + ' driver: postgres', + ' historicSql:', + ' enabled: true', + ' dialect: postgres', + ' minExecutions: 2', + 'ingest:', + ' adapters:', + ' - historic-sql', + ' embeddings:', + ' backend: deterministic', + 'storage:', + ' state: sqlite', + ' search: sqlite-fts5', + ' git:', + ' auto_commit: false', + ' author: KTX Test <system@ktx.local>', + '', + ].join('\n'), + 'utf-8', + ); + + const loaded = await loadKtxProject({ projectDir: project.projectDir }); + await loaded.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml',
+ YAML.stringify({ + tables: { + orders: { + table: 'public.orders', + columns: [ + { name: 'id', type: 'string' }, + { name: 'status', type: 'string' }, + { name: 'customer_id', type: 'string' }, + ], + }, + customers: { + table: 'public.customers', + columns: [ + { name: 'id', type: 'string' }, + { name: 'segment', type: 'string' }, + ], + }, + }, + }), + 'KTX Test', + 'system@ktx.local', + 'Seed schema shard', + ); + return loaded; +} + +describe('historic-SQL local ingest retrieval acceptance', () => { + let tempDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-historic-sql-acceptance-')); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('projects table and pattern evidence into semantic-layer and wiki retrieval surfaces', async () => { + const initialized = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' }); + const project = await writeHistoricSqlProject(initialized); + const sqlAnalysis = acceptanceSqlAnalysis(); + const agentRunner = new HistoricSqlAcceptanceAgentRunner(); + const adapter = new HistoricSqlSourceAdapter({ + reader: new AcceptanceHistoricSqlReader(), + queryClient: {}, + sqlAnalysis, + now: () => new Date('2026-05-11T00:00:00.000Z'), + }); + + const result = await runLocalIngest({ + project, + adapters: [adapter], + adapter: 'historic-sql', + connectionId: 'warehouse', + jobId: 'historic-sql-retrieval-acceptance', + agentRunner, + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1); + expect(result.result.failedWorkUnits).toEqual([]); + expect(result.result.workUnitCount).toBe(3); + expect(agentRunner.runLoop).toHaveBeenCalledTimes(3); + expect(result.report.body.postProcessor).toMatchObject({ + sourceKey: 'historic-sql', + status: 'success', + result: { + tableUsageMerged: 2, + patternPagesWritten: 1, + }, + touchedSources: [ + { connectionId: 'warehouse', sourceName: 'customers' }, + { connectionId: 
'warehouse', sourceName: 'orders' }, + ], + }); + + await expect(readFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves + .toContain('Analysts repeatedly inspect paid order lifecycle by customer segment.'); + await expect(readFile(join(project.projectDir, 'knowledge/global/historic-sql/paid-order-lifecycle.md'), 'utf-8')) + .resolves.toContain('Paid Order Lifecycle'); + + const reloaded = await loadKtxProject({ projectDir: project.projectDir }); + await expect( + searchLocalSlSources(reloaded, { connectionId: 'warehouse', query: 'paid order lifecycle', limit: 5 }), + ).resolves.toEqual([ + expect.objectContaining({ + name: 'orders', + frequencyTier: 'high', + snippet: expect.stringContaining(''), + matchReasons: expect.arrayContaining(['lexical']), + }), + ]); + await expect( + searchLocalKnowledgePages(reloaded, { query: 'paid order lifecycle', userId: 'local', limit: 5 }), + ).resolves.toEqual([ + expect.objectContaining({ + key: 'historic-sql/paid-order-lifecycle', + summary: 'Paid Order Lifecycle', + matchReasons: expect.arrayContaining(['lexical']), + }), + ]); + }); +}); +``` + +- [ ] **Step 3: Run the focused acceptance test after creating the file** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +``` + +Expected: PASS. The output reports one passing test and `sqlAnalysis.analyzeBatch` is called exactly once by the test assertion. 
+ +- [ ] **Step 4: Commit the acceptance test** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +git commit -m "test: cover historic sql retrieval acceptance" +``` + +## Task 2: Run Adjacent Historic-SQL Regression Checks + +**Files:** +- Verify: `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts` +- Verify: `packages/context/src/sl/local-sl.test.ts` +- Verify: `packages/context/src/wiki/local-knowledge.test.ts` + +- [ ] **Step 1: Run the new acceptance test with the adjacent historic-SQL unit tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts \ + src/ingest/adapters/historic-sql/projection.test.ts \ + src/ingest/adapters/historic-sql/stage-unified.test.ts \ + src/ingest/adapters/historic-sql/chunk-unified.test.ts \ + src/sl/local-sl.test.ts \ + src/wiki/local-knowledge.test.ts +``` + +Expected: PASS. These suites cover the new acceptance chain plus the deterministic projection, stager, chunker, SL search, and wiki search layers it depends on. + +- [ ] **Step 2: Run pre-commit for the new test file** + +Run: + +```bash +uv run pre-commit run --files packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +``` + +Expected: PASS. 
If `uv` refuses to run because the local binary does not satisfy the repo pin, activate `.venv` and run the closest TypeScript checks instead: + +```bash +pnpm --filter @ktx/context run type-check +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +``` + +- [ ] **Step 3: Confirm no unrelated files are included** + +Run: + +```bash +git status --short +``` + +Expected: either an empty status after the Task 1 commit, or only intentionally changed plan/test files if the worker is preserving an uncommitted plan handoff. + +## Self-Review + +Spec coverage: + +- Spec §4 hot path is covered because the test uses `HistoricSqlSourceAdapter.fetch()` with `stageHistoricSqlAggregatedSnapshot()`, a fake `HistoricSqlReader.fetchAggregated()`, and one `SqlAnalysisPort.analyzeBatch()` call. +- Spec §5 cold path is covered because the fake agent emits `table_usage` and `pattern` evidence through `emit_historic_sql_evidence`, and the normal `HistoricSqlProjectionPostProcessor` projects that evidence. +- Spec §6 and §7 retrieval surfaces are covered because the same test verifies `searchLocalSlSources()` returns `frequencyTier` and an FTS snippet and `searchLocalKnowledgePages()` returns `historic-sql/paid-order-lifecycle`. +- Spec §10.4 search retrieval acceptance is covered without requiring a live warehouse or LLM credentials. + +Placeholder scan: + +- The placeholder scan is clean, and the plan contains concrete file paths, code, commands, and expected outputs. +- The only fallback in the plan is the explicit `uv` version-mismatch path required by repository instructions. + +Type consistency: + +- `HistoricSqlReader`, `HistoricSqlUnifiedPullConfig`, `SqlAnalysisPort`, `HistoricSqlSourceAdapter`, `runLocalIngest`, `searchLocalSlSources`, and `searchLocalKnowledgePages` match existing exported APIs. 
+- Evidence payloads match `emit_historic_sql_evidence` input schemas: table evidence omits `connectionId` because the tool injects it; projected persisted evidence includes it. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-foundations.md b/docs/superpowers/plans/2026-05-11-historic-sql-foundations.md new file mode 100644 index 00000000..fdd97d3f --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-foundations.md @@ -0,0 +1,1477 @@ +# Historic SQL Foundations Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build the foundation slice for the historic SQL redesign: shared usage schemas, semantic-layer usage plumbing, scan-safe usage preservation, and batch SQL analysis across the Python daemon and TypeScript port. + +**Architecture:** Keep the existing historic-SQL adapter behavior unchanged in this slice. Add the additive contracts from the redesign first so later adapter, skill, projection, and search work can depend on stable types and daemon APIs. The Python daemon owns SQL parsing through `sqlglot`; TypeScript owns HTTP mapping, semantic-layer schema acceptance, and manifest projection. + +**Tech Stack:** TypeScript ESM/NodeNext, zod 4, Vitest, FastAPI, Pydantic v2, sqlglot, pytest, uv. + +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Existing plans derived from this spec: none found. 
A repo search found only managed-runtime plans that mention historic-SQL smoke commands or `pg_stat_statements`; those plans are not based on the redesign spec and do not implement the redesign architecture. + +Current implementation state: + +- `packages/context/src/sql-analysis/ports.ts` exposes only `analyzeForFingerprint()`. +- `packages/context/src/sql-analysis/http-sql-analysis-port.ts` only calls `/api/sql/analyze-for-fingerprint`. +- `python/ktx-daemon/src/ktx_daemon/app.py` has no `/sql/analyze-batch` endpoint. +- `packages/context/src/sl/types.ts` has no `SemanticLayerSource.usage`. +- `packages/context/src/sl/schemas.ts` is strict and rejects top-level `usage`. +- `packages/context/src/sl/semantic-layer.service.ts` does not project `_schema` manifest `usage`. +- `packages/context/src/ingest/adapters/live-database/manifest.ts` does not preserve usage through live database scan rewrites. +- The old historic-SQL code path is still present (`stage-pgss.ts`, `stagePgStatStatementsTemplates`, `pgss-baseline`, slot classification, per-template wiki page staging). + +This plan implements only the foundation ordering item from spec §10.3: + +- Daemon `analyze-batch` endpoint. +- `SqlAnalysisPort.analyzeBatch()`. +- `SemanticLayerSource.usage`. +- `LiveDatabaseManifestTableEntry.usage`. +- `mergeUsagePreservingExternal()` plus tests. + +The next plan after this one should cover search enrichment from spec §6.2.3-§6.2.5. + +## File Structure + +Create: + +- `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts` + Owns the shared zod schemas for historic-SQL LLM outputs. +- `packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts` + Locks schema acceptance, JSON schema generation, and future-key tolerance. +- `python/ktx-daemon/src/ktx_daemon/sql_analysis.py` + Implements batch sqlglot parsing for table and clause-level column extraction. +- `python/ktx-daemon/tests/test_sql_analysis.py` + Tests batch parser behavior without FastAPI. 
+ +Modify: + +- `packages/context/src/ingest/index.ts` + Exports the new historic-SQL skill schemas. +- `packages/context/src/sl/types.ts` + Adds `usage?: TableUsageOutput` to `SemanticLayerSource`. +- `packages/context/src/sl/schemas.ts` + Accepts `usage` in standalone and overlay semantic-layer source validation. +- `packages/context/src/sl/semantic-layer.service.ts` + Projects manifest `usage` onto `SemanticLayerSource` and composes overlay usage intentionally. +- `packages/context/src/sl/semantic-layer.service.test.ts` + Tests source schema acceptance, manifest projection, and overlay composition. +- `packages/context/src/ingest/adapters/live-database/manifest.ts` + Adds `LiveDatabaseManifestTableEntry.usage`, existing-usage inputs, and `mergeUsagePreservingExternal()`. +- `packages/context/src/ingest/adapters/live-database/manifest.test.ts` + Tests scan-managed usage replacement while preserving external keys. +- `packages/context/src/scan/local-enrichment-artifacts.ts` + Loads existing manifest usage and passes it through scan manifest rebuilds. +- `packages/context/src/scan/local-enrichment-artifacts.test.ts` + Tests that structural scan rewrites preserve existing usage. +- `python/ktx-daemon/src/ktx_daemon/app.py` + Registers `/sql/analyze-batch`. +- `python/ktx-daemon/tests/test_app.py` + Tests the FastAPI endpoint. +- `packages/context/src/sql-analysis/ports.ts` + Adds batch analysis types and `SqlAnalysisPort.analyzeBatch()`. +- `packages/context/src/sql-analysis/index.ts` + Exports the new batch analysis types. +- `packages/context/src/sql-analysis/http-sql-analysis-port.ts` + Maps `/sql/analyze-batch` request and response payloads. +- `packages/context/src/sql-analysis/http-sql-analysis-port.test.ts` + Tests HTTP mapping and malformed response rejection. +- `packages/cli/src/managed-python-http.test.ts` + Verifies the managed daemon wrapper routes `analyzeBatch()`. 
+- Existing test files with `SqlAnalysisPort` object literals + Add a no-op `analyzeBatch: async () => new Map()` while legacy paths still use `analyzeForFingerprint()`. + +## Task 1: Add Historic SQL Skill Schemas + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts` +- Modify: `packages/context/src/ingest/index.ts` + +- [ ] **Step 1: Write the failing schema tests** + +Create `packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts`: + +```typescript +import { describe, expect, it } from 'vitest'; +import { z } from 'zod'; +import { + patternOutputSchema, + patternsArraySchema, + tableUsageOutputSchema, +} from './skill-schemas.js'; + +describe('historic-sql skill schemas', () => { + it('accepts table usage output and preserves future keys', () => { + const parsed = tableUsageOutputSchema.parse({ + narrative: 'Orders are queried for paid/refunded lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + analystNote: 'preserve me', + }); + + expect(parsed).toMatchObject({ + narrative: 'Orders are queried for paid/refunded lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + analystNote: 'preserve me', + }); + }); + + it('rejects invalid frequency tiers', () => { + const result = tableUsageOutputSchema.safeParse({ + narrative: 'Orders are queried often.', + frequencyTier: 'sometimes', + commonFilters: [], + commonJoins: [], + }); + + expect(result.success).toBe(false); + }); + + it('accepts pattern outputs used for wiki projection', () => { + const parsed = patternsArraySchema.parse([ + { + slug: 
'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Teams inspect order status by customer and month.', + definitionSql: 'select status, count(*) from public.orders group by status', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['template_1', 'template_2'], + }, + ]); + + expect(parsed[0]).toEqual({ + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Teams inspect order status by customer and month.', + definitionSql: 'select status, count(*) from public.orders group by status', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['template_1', 'template_2'], + }); + }); + + it('exports zod schemas that can produce JSON schema for prompt prefixes', () => { + const tableUsageJsonSchema = z.toJSONSchema(tableUsageOutputSchema); + const patternJsonSchema = z.toJSONSchema(patternOutputSchema); + + expect(tableUsageJsonSchema).toMatchObject({ type: 'object' }); + expect(patternJsonSchema).toMatchObject({ type: 'object' }); + }); +}); +``` + +- [ ] **Step 2: Run the schema test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/skill-schemas.test.ts +``` + +Expected: FAIL with an import error for `./skill-schemas.js`. 
+ +- [ ] **Step 3: Add the schema implementation** + +Create `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`: + +```typescript +import { z } from 'zod'; + +export const tableUsageOutputSchema = z + .object({ + narrative: z.string(), + frequencyTier: z.enum(['high', 'mid', 'low', 'unused']), + commonFilters: z.array(z.string()), + commonGroupBys: z.array(z.string()).optional(), + commonJoins: z.array( + z.object({ + table: z.string(), + on: z.array(z.string()), + }), + ), + staleSince: z.iso.datetime().nullable().optional(), + }) + .passthrough(); +export type TableUsageOutput = z.infer<typeof tableUsageOutputSchema>; + +export const patternOutputSchema = z.object({ + slug: z.string(), + title: z.string(), + narrative: z.string(), + definitionSql: z.string(), + tablesInvolved: z.array(z.string()), + slRefs: z.array(z.string()), + constituentTemplateIds: z.array(z.string()), +}); +export type PatternOutput = z.infer<typeof patternOutputSchema>; + +export const patternsArraySchema = z.array(patternOutputSchema); +``` + +- [ ] **Step 4: Export the schemas from the ingest barrel** + +Add this export block to `packages/context/src/ingest/index.ts` near the other historic-SQL exports: + +```typescript +export { + patternOutputSchema, + patternsArraySchema, + tableUsageOutputSchema, +} from './adapters/historic-sql/skill-schemas.js'; +export type { + PatternOutput, + TableUsageOutput, +} from './adapters/historic-sql/skill-schemas.js'; +``` + +- [ ] **Step 5: Run the schema test to verify it passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/skill-schemas.test.ts +``` + +Expected: PASS.
+ +- [ ] **Step 6: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts packages/context/src/ingest/index.ts +git commit -m "feat: add historic sql skill schemas" +``` + +## Task 2: Add `usage` to Semantic Layer Sources + +**Files:** +- Modify: `packages/context/src/sl/types.ts` +- Modify: `packages/context/src/sl/schemas.ts` +- Modify: `packages/context/src/sl/semantic-layer.service.ts` +- Test: `packages/context/src/sl/semantic-layer.service.test.ts` + +- [ ] **Step 1: Write failing semantic-layer usage tests** + +In `packages/context/src/sl/semantic-layer.service.test.ts`, extend the import from `./semantic-layer.service.js`: + +```typescript +import { + composeOverlay, + enrichColumnsFromManifest, + findDanglingSegmentRefs, + projectManifestEntry, + SemanticLayerService, +} from './semantic-layer.service.js'; +``` + +Add this test inside `describe('composeOverlay', ...)` after the descriptions test: + +```typescript + it('replaces manifest usage only when an overlay explicitly provides usage', () => { + const baseWithUsage: SemanticLayerSource = { + ...baseTable, + usage: { + narrative: 'Orders are commonly queried by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + }; + + expect(composeOverlay(baseWithUsage, { name: 'fct_labs', measures: [] }).usage).toEqual(baseWithUsage.usage); + + const composed = composeOverlay(baseWithUsage, { + name: 'fct_labs', + usage: { + narrative: 'Overlay-curated usage note.', + frequencyTier: 'mid', + commonFilters: ['created_at'], + commonGroupBys: ['created_at'], + commonJoins: [], + }, + }); + + expect(composed.usage).toEqual({ + narrative: 'Overlay-curated usage note.', + frequencyTier: 'mid', + commonFilters: ['created_at'], + commonGroupBys: ['created_at'], + commonJoins: [], + }); + }); +``` + +Add this test 
inside `describe('sourceDefinitionSchema', ...)`: + +```typescript + it('accepts historic SQL usage on standalone sources', () => { + const result = sourceDefinitionSchema.safeParse({ + name: 'orders', + table: 'public.orders', + grain: ['id'], + columns: [{ name: 'id', type: 'string' }], + joins: [], + measures: [], + usage: { + narrative: 'Orders are queried for fulfillment and revenue analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + externalOwner: 'analytics', + }, + }); + + expect(result.success).toBe(true); + if (!result.success) { + return; + } + expect(result.data.usage).toMatchObject({ + narrative: 'Orders are queried for fulfillment and revenue analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + externalOwner: 'analytics', + }); + }); +``` + +Add a new describe block before `describe('findManifestEntryByTableRef', ...)`: + +```typescript +describe('projectManifestEntry', () => { + it('projects manifest usage onto the semantic-layer source', () => { + const source = projectManifestEntry('orders', { + table: 'public.orders', + usage: { + narrative: 'Orders are frequently filtered by status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + columns: [ + { name: 'id', type: 'string', pk: true }, + { name: 'status', type: 'string' }, + ], + }); + + expect(source.usage).toEqual({ + narrative: 'Orders are frequently filtered by status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }); + }); +}); +``` + +- [ ] **Step 2: Run the semantic-layer tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/semantic-layer.service.test.ts +``` + +Expected: FAIL because `usage` 
is rejected by strict schemas and not projected from manifest entries. + +- [ ] **Step 3: Add `usage` to the TypeScript source type** + +In `packages/context/src/sl/types.ts`, add this import at the top: + +```typescript +import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js'; +``` + +Add this field to `SemanticLayerSource` after `freshness`: + +```typescript + usage?: TableUsageOutput; +``` + +- [ ] **Step 4: Add `usage` to zod validation** + +In `packages/context/src/sl/schemas.ts`, add this import after the existing zod import: + +```typescript +import { tableUsageOutputSchema } from '../ingest/adapters/historic-sql/skill-schemas.js'; +``` + +Add this field to `sourceDefinitionSchema` near `freshness`: + +```typescript + usage: tableUsageOutputSchema.optional(), +``` + +Add this field to `sourceOverlaySchema` near `default_time_dimension`: + +```typescript + usage: tableUsageOutputSchema.optional(), +``` + +- [ ] **Step 5: Project and compose usage intentionally** + +In `packages/context/src/sl/semantic-layer.service.ts`, add this type import: + +```typescript +import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js'; +``` + +Add this field to `ManifestTableEntry`: + +```typescript + usage?: TableUsageOutput; +``` + +In `projectManifestEntry()`, add `usage` to the returned object: + +```typescript + ...(entry.usage ? { usage: entry.usage } : {}), +``` + +Add `'usage'` to `COMPOSE_KNOWN_KEYS`: + +```typescript + 'usage', +``` + +In `composeOverlay()`, add this block after the descriptions merge and before column filtering: + +```typescript + if (normalizedOverlay.usage !== undefined) { + result.usage = normalizedOverlay.usage as SemanticLayerSource['usage']; + } +``` + +- [ ] **Step 6: Run the semantic-layer tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/semantic-layer.service.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit** + +```bash +git add packages/context/src/sl/types.ts packages/context/src/sl/schemas.ts packages/context/src/sl/semantic-layer.service.ts packages/context/src/sl/semantic-layer.service.test.ts +git commit -m "feat: carry historic sql usage in semantic sources" +``` + +## Task 3: Preserve Manifest Usage Through Scan Rewrites + +**Files:** +- Modify: `packages/context/src/ingest/adapters/live-database/manifest.ts` +- Test: `packages/context/src/ingest/adapters/live-database/manifest.test.ts` +- Modify: `packages/context/src/scan/local-enrichment-artifacts.ts` +- Test: `packages/context/src/scan/local-enrichment-artifacts.test.ts` + +- [ ] **Step 1: Write failing manifest-builder test** + +In `packages/context/src/ingest/adapters/live-database/manifest.test.ts`, add this test inside `describe('buildLiveDatabaseManifestShards', ...)`: + +```typescript + it('preserves external usage keys while replacing historic SQL managed keys', () => { + const existingUsage = new Map([ + [ + 'orders', + { + narrative: 'Old generated usage narrative.', + frequencyTier: 'low' as const, + commonFilters: ['old_status'], + commonJoins: [], + ownerNote: 'Pinned analyst note', + }, + ], + ]); + + const result = buildLiveDatabaseManifestShards({ + connectionType: 'POSTGRESQL', + mapColumnType: (nativeType) => nativeType.toLowerCase(), + existingUsage, + tables: [ + { + name: 'orders', + catalog: null, + db: 'public', + usage: { + narrative: 'Fresh generated usage narrative.', + frequencyTier: 'high', + commonFilters: ['status'], + commonGroupBys: ['created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + columns: [{ name: 'id', type: 'INTEGER' }], + }, + ], + joins: [], + }); + + expect(shardObject(result.shards)).toEqual({ + public: { + tables: { + orders: { + table: 'public.orders', + usage: { + ownerNote: 'Pinned analyst note', + narrative: 'Fresh generated usage narrative.', + frequencyTier: 'high', + commonFilters: ['status'], 
+ commonGroupBys: ['created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + columns: [{ name: 'id', type: 'integer' }], + }, + }, + }, + }); + }); +``` + +- [ ] **Step 2: Run the manifest test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/live-database/manifest.test.ts +``` + +Expected: FAIL because `existingUsage` and table input `usage` are not supported. + +- [ ] **Step 3: Add usage types and merge helper** + +In `packages/context/src/ingest/adapters/live-database/manifest.ts`, add this import at the top: + +```typescript +import type { TableUsageOutput } from '../historic-sql/skill-schemas.js'; +``` + +Add this constant after `SCAN_MANAGED_DESCRIPTION_KEYS`: + +```typescript +const HISTORIC_SQL_MANAGED_USAGE_KEYS = new Set([ + 'narrative', + 'frequencyTier', + 'commonFilters', + 'commonGroupBys', + 'commonJoins', + 'staleSince', +]); +``` + +Add `usage` to `LiveDatabaseManifestTableEntry`: + +```typescript + usage?: TableUsageOutput; +``` + +Add `usage` to `LiveDatabaseManifestTableData`: + +```typescript + usage?: TableUsageOutput; +``` + +Add `existingUsage` to `BuildLiveDatabaseManifestShardsInput`: + +```typescript + existingUsage?: Map<string, TableUsageOutput>; +``` + +Add this exported helper after `mergeDescriptionsPreservingExternal()`: + +```typescript +export function mergeUsagePreservingExternal( + existing: TableUsageOutput | undefined, + incoming: TableUsageOutput | undefined, +): TableUsageOutput | undefined { + if (!existing && !incoming) { + return undefined; + } + const result: Record<string, unknown> = {}; + if (existing) { + for (const [key, value] of Object.entries(existing)) { + if (!HISTORIC_SQL_MANAGED_USAGE_KEYS.has(key)) { + result[key] = value; + } + } + } + if (incoming) { + Object.assign(result, incoming); + } + return Object.keys(result).length > 0 ?
(result as TableUsageOutput) : undefined; +} +``` + +In `buildLiveDatabaseManifestShards()`, add this block after table descriptions are set: + +```typescript + const usage = mergeUsagePreservingExternal(input.existingUsage?.get(table.name), table.usage); + if (usage) { + entry.usage = usage; + } +``` + +- [ ] **Step 4: Run the manifest test to verify it passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/live-database/manifest.test.ts +``` + +Expected: PASS. + +- [ ] **Step 5: Write failing scan-preservation test** + +In `packages/context/src/scan/local-enrichment-artifacts.test.ts`, inside the existing structural manifest shard test, extend the seeded YAML under `orders` with this block: + +```yaml + usage: + narrative: Orders are commonly filtered by lifecycle status. + frequencyTier: high + commonFilters: + - status + commonJoins: + - table: public.customers + on: + - customer_id + ownerNote: Preserve analyst note +``` + +Extend the parsed manifest type in that test: + +```typescript + usage?: Record<string, unknown>; +``` + +Add this assertion after the descriptions assertions: + +```typescript + expect(manifest.tables.orders.usage).toEqual({ + narrative: 'Orders are commonly filtered by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + ownerNote: 'Preserve analyst note', + }); +``` + +- [ ] **Step 6: Run the scan-preservation test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/scan/local-enrichment-artifacts.test.ts +``` + +Expected: FAIL because `loadExistingManifestState()` does not capture usage and scan rewrites drop it.
+ +- [ ] **Step 7: Preserve usage in local enrichment artifact writes** + +In `packages/context/src/scan/local-enrichment-artifacts.ts`, add `TableUsageOutput` to the ingest import: + +```typescript + type TableUsageOutput, +``` + +Add `usage` to `ExistingManifestState`: + +```typescript + usage: Map<string, TableUsageOutput>; +``` + +Initialize it in `loadExistingManifestState()`: + +```typescript + const usage = new Map<string, TableUsageOutput>(); +``` + +Update the early catch return: + +```typescript + return { descriptions, preservedJoins, usage }; +``` + +Inside the `for (const [tableName, entry] of Object.entries(shard.tables))` loop, after descriptions are captured, add: + +```typescript + if (entry.usage) { + usage.set(tableName, { ...entry.usage }); + } +``` + +Update the final return: + +```typescript + return { descriptions, preservedJoins, usage }; +``` + +Pass usage into `buildLiveDatabaseManifestShards()`: + +```typescript + existingUsage: existing.usage, +``` + +- [ ] **Step 8: Run scan-preservation test to verify it passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/scan/local-enrichment-artifacts.test.ts +``` + +Expected: PASS.
+ +- [ ] **Step 9: Commit** + +```bash +git add packages/context/src/ingest/adapters/live-database/manifest.ts packages/context/src/ingest/adapters/live-database/manifest.test.ts packages/context/src/scan/local-enrichment-artifacts.ts packages/context/src/scan/local-enrichment-artifacts.test.ts +git commit -m "feat: preserve historic sql usage in manifest shards" +``` + +## Task 4: Add Python Batch SQL Analysis + +**Files:** +- Create: `python/ktx-daemon/src/ktx_daemon/sql_analysis.py` +- Create: `python/ktx-daemon/tests/test_sql_analysis.py` +- Modify: `python/ktx-daemon/src/ktx_daemon/app.py` +- Test: `python/ktx-daemon/tests/test_app.py` + +- [ ] **Step 1: Write failing parser tests** + +Create `python/ktx-daemon/tests/test_sql_analysis.py`: + +```python +from __future__ import annotations + +from ktx_daemon.sql_analysis import ( + AnalyzeSqlBatchItem, + AnalyzeSqlBatchRequest, + analyze_sql_batch_response, +) + + +def test_analyze_sql_batch_extracts_tables_and_clause_columns() -> None: + response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="postgres", + items=[ + AnalyzeSqlBatchItem( + id="orders_by_customer", + sql=( + "select o.status, count(*) " + "from public.orders o " + "join public.customers c on o.customer_id = c.id " + "where o.created_at >= current_date - interval '30 day' " + "group by o.status" + ), + ) + ], + max_workers=1, + ) + ) + + result = response.results["orders_by_customer"] + assert result.error is None + assert result.tables_touched == ["public.orders", "public.customers"] + assert result.columns_by_clause == { + "select": ["status"], + "where": ["created_at"], + "join": ["customer_id", "id"], + "groupBy": ["status"], + } + + +def test_analyze_sql_batch_returns_per_item_parse_errors() -> None: + response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="postgres", + items=[AnalyzeSqlBatchItem(id="broken", sql="select * from where")], + max_workers=1, + ) + ) + + result = response.results["broken"] + 
assert result.tables_touched == [] + assert result.columns_by_clause == {} + assert result.error is not None +``` + +- [ ] **Step 2: Run parser tests to verify they fail** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_sql_analysis.py -q +``` + +Expected: FAIL with `ModuleNotFoundError: No module named 'ktx_daemon.sql_analysis'`. + +- [ ] **Step 3: Add the batch parser module** + +Create `python/ktx-daemon/src/ktx_daemon/sql_analysis.py`: + +```python +from __future__ import annotations + +import os +from concurrent.futures import ProcessPoolExecutor +from typing import Literal + +import sqlglot +from pydantic import BaseModel, ConfigDict, Field +from sqlglot import exp + +SqlAnalysisClause = Literal["select", "where", "join", "groupBy", "having", "orderBy"] + + +class AnalyzeSqlBatchItem(BaseModel): + id: str + sql: str + + +class AnalyzeSqlBatchRequest(BaseModel): + dialect: str + items: list[AnalyzeSqlBatchItem] + max_workers: int | None = Field(default=None, ge=1, le=32) + + +class AnalyzeSqlBatchResult(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + tables_touched: list[str] = Field(default_factory=list) + columns_by_clause: dict[SqlAnalysisClause, list[str]] = Field(default_factory=dict) + error: str | None = None + + +class AnalyzeSqlBatchResponse(BaseModel): + results: dict[str, AnalyzeSqlBatchResult] + + +def _ordered_unique(values: list[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for value in values: + if value and value not in seen: + seen.add(value) + result.append(value) + return result + + +def _table_ref(table: exp.Table) -> str: + parts: list[str] = [] + catalog = table.args.get("catalog") + db = table.args.get("db") + if catalog is not None and getattr(catalog, "name", None): + parts.append(str(catalog.name)) + if db is not None and getattr(db, "name", None): + parts.append(str(db.name)) + if table.name: + parts.append(str(table.name)) + return 
".".join(parts) + + +def _column_name(column: exp.Column) -> str: + return str(column.name) + + +def _columns_from_nodes(nodes: list[exp.Expression | None]) -> list[str]: + names: list[str] = [] + for node in nodes: + if node is None: + continue + names.extend(_column_name(column) for column in node.find_all(exp.Column)) + return _ordered_unique(names) + + +def _columns_by_clause(tree: exp.Expression) -> dict[SqlAnalysisClause, list[str]]: + result: dict[SqlAnalysisClause, list[str]] = {} + + select_columns = _columns_from_nodes(list(tree.expressions)) + if select_columns: + result["select"] = select_columns + + where_columns = _columns_from_nodes([tree.args.get("where")]) + if where_columns: + result["where"] = where_columns + + join_columns = _columns_from_nodes([join.args.get("on") for join in tree.args.get("joins") or []]) + if join_columns: + result["join"] = join_columns + + group = tree.args.get("group") + group_columns = _columns_from_nodes(list(group.expressions) if group is not None else []) + if group_columns: + result["groupBy"] = group_columns + + having_columns = _columns_from_nodes([tree.args.get("having")]) + if having_columns: + result["having"] = having_columns + + order = tree.args.get("order") + order_columns = _columns_from_nodes(list(order.expressions) if order is not None else []) + if order_columns: + result["orderBy"] = order_columns + + return result + + +def _analyze_one(item_id: str, sql: str, dialect: str) -> tuple[str, AnalyzeSqlBatchResult]: + try: + tree = sqlglot.parse_one(sql, read=dialect) + except sqlglot.errors.SQLGlotError as exc: + return item_id, AnalyzeSqlBatchResult(error=str(exc)) + + cte_names = {cte.alias_or_name.lower() for cte in tree.find_all(exp.CTE)} + table_refs = [ + table_ref + for table_ref in (_table_ref(table) for table in tree.find_all(exp.Table)) + if table_ref and table_ref.split(".")[-1].lower() not in cte_names + ] + + return item_id, AnalyzeSqlBatchResult( + tables_touched=_ordered_unique(table_refs), + 
columns_by_clause=_columns_by_clause(tree), + error=None, + ) + + +def _analyze_payload(payload: tuple[str, str, str]) -> tuple[str, AnalyzeSqlBatchResult]: + item_id, sql, dialect = payload + return _analyze_one(item_id, sql, dialect) + + +def _worker_count(request: AnalyzeSqlBatchRequest) -> int: + if len(request.items) <= 1: + return 1 + if request.max_workers is not None: + return min(request.max_workers, len(request.items)) + return min(os.cpu_count() or 1, len(request.items), 8) + + +def analyze_sql_batch_response(request: AnalyzeSqlBatchRequest) -> AnalyzeSqlBatchResponse: + payloads = [(item.id, item.sql, request.dialect) for item in request.items] + if _worker_count(request) == 1: + analyzed = [_analyze_payload(payload) for payload in payloads] + else: + with ProcessPoolExecutor(max_workers=_worker_count(request)) as executor: + analyzed = list(executor.map(_analyze_payload, payloads)) + + return AnalyzeSqlBatchResponse(results={item_id: result for item_id, result in analyzed}) +``` + +- [ ] **Step 4: Run parser tests to verify they pass** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_sql_analysis.py -q +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Write failing FastAPI endpoint test** + +In `python/ktx-daemon/tests/test_app.py`, add this test after `test_sql_parse_table_identifier_endpoint()`: + +```python +def test_sql_analyze_batch_endpoint_returns_per_item_results() -> None: + client = TestClient(create_app()) + + response = client.post( + "/sql/analyze-batch", + json={ + "dialect": "postgres", + "max_workers": 1, + "items": [ + { + "id": "orders", + "sql": "select status from public.orders where created_at is not null", + }, + {"id": "broken", "sql": "select * from where"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["results"]["orders"]["tables_touched"] == ["public.orders"] + assert body["results"]["orders"]["columns_by_clause"] == { + "select": ["status"], + "where": ["created_at"], + } + assert body["results"]["orders"]["error"] is None + assert body["results"]["broken"]["tables_touched"] == [] + assert body["results"]["broken"]["columns_by_clause"] == {} + assert body["results"]["broken"]["error"] is not None +``` + +- [ ] **Step 6: Run the endpoint test to verify it fails** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_app.py::test_sql_analyze_batch_endpoint_returns_per_item_results -q +``` + +Expected: FAIL with HTTP 404. 
+ +- [ ] **Step 7: Register the daemon endpoint** + +In `python/ktx-daemon/src/ktx_daemon/app.py`, add this import block with the other daemon imports: + +```python +from ktx_daemon.sql_analysis import ( + AnalyzeSqlBatchRequest, + AnalyzeSqlBatchResponse, + analyze_sql_batch_response, +) +``` + +Add this route after `/sql/parse-table-identifier`: + +```python + @app.post("/sql/analyze-batch", response_model=AnalyzeSqlBatchResponse) + async def sql_analyze_batch( + request: AnalyzeSqlBatchRequest, + ) -> AnalyzeSqlBatchResponse: + try: + return analyze_sql_batch_response(request) + except Exception as error: + logger.exception("SQL batch analysis failed: %s", error) + raise HTTPException( + status_code=500, + detail=f"SQL batch analysis failed: {error}", + ) from error +``` + +- [ ] **Step 8: Run Python tests to verify the daemon slice passes** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py::test_sql_analyze_batch_endpoint_returns_per_item_results -q +``` + +Expected: PASS. + +- [ ] **Step 9: Check Python formatting/lint hook availability** + +Run: + +```bash +test -f .pre-commit-config.yaml && source .venv/bin/activate && uv run pre-commit run --files python/ktx-daemon/src/ktx_daemon/sql_analysis.py python/ktx-daemon/src/ktx_daemon/app.py python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py || printf 'pre-commit config missing\n' +``` + +Expected in this workspace: prints `pre-commit config missing`. If a `.pre-commit-config.yaml` does exist, the `||` fallback also prints this message (and exits 0) when `pre-commit` itself fails, so read the hook output above the message instead of relying on the final line.
+ +- [ ] **Step 10: Commit** + +```bash +git add python/ktx-daemon/src/ktx_daemon/sql_analysis.py python/ktx-daemon/src/ktx_daemon/app.py python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py +git commit -m "feat: add daemon sql batch analysis" +``` + +## Task 5: Add TypeScript Batch SQL Analysis Port + +**Files:** +- Modify: `packages/context/src/sql-analysis/ports.ts` +- Modify: `packages/context/src/sql-analysis/index.ts` +- Modify: `packages/context/src/sql-analysis/http-sql-analysis-port.ts` +- Test: `packages/context/src/sql-analysis/http-sql-analysis-port.test.ts` +- Test: `packages/cli/src/managed-python-http.test.ts` +- Modify: legacy `SqlAnalysisPort` mocks found by `rg -n "const .*SqlAnalysis|sqlAnalysis: \\{" packages/context packages/cli` + +- [ ] **Step 1: Write failing HTTP port tests** + +In `packages/context/src/sql-analysis/http-sql-analysis-port.test.ts`, add these tests before the malformed daemon response test: + +```typescript + it('calls the SQL batch endpoint and maps snake_case response fields into a Map', async () => { + const requestJson = vi.fn(async () => ({ + results: { + orders: { + tables_touched: ['public.orders', 'public.customers'], + columns_by_clause: { + select: ['status'], + where: ['created_at'], + join: ['customer_id', 'id'], + }, + error: null, + }, + broken: { + tables_touched: [], + columns_by_clause: {}, + error: 'Invalid expression / Unexpected token', + }, + }, + })); + const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson }); + + await expect( + port.analyzeBatch( + [ + { id: 'orders', sql: 'select status from public.orders' }, + { id: 'broken', sql: 'select * from where' }, + ], + 'postgres', + ), + ).resolves.toEqual( + new Map([ + [ + 'orders', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: ['status'], + where: ['created_at'], + join: ['customer_id', 'id'], + }, + error: null, + }, + ], + [ + 'broken', + { + 
tablesTouched: [], + columnsByClause: {}, + error: 'Invalid expression / Unexpected token', + }, + ], + ]), + ); + + expect(requestJson).toHaveBeenCalledWith('/sql/analyze-batch', { + dialect: 'postgres', + items: [ + { id: 'orders', sql: 'select status from public.orders' }, + { id: 'broken', sql: 'select * from where' }, + ], + }); + }); + + it('rejects malformed SQL batch responses instead of inventing defaults', async () => { + const requestJson = vi.fn(async () => ({ + results: { + orders: { + tables_touched: ['public.orders'], + columns_by_clause: { select: ['status'], where: [42] }, + error: null, + }, + }, + })); + const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson }); + + await expect(port.analyzeBatch([{ id: 'orders', sql: 'select status from public.orders' }], 'postgres')).rejects + .toThrow('sql analysis response is missing string[] field columns_by_clause.where'); + }); +``` + +- [ ] **Step 2: Run HTTP port tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sql-analysis/http-sql-analysis-port.test.ts +``` + +Expected: FAIL because `analyzeBatch` is not defined. 
+ +- [ ] **Step 3: Add batch types to the port** + +In `packages/context/src/sql-analysis/ports.ts`, add these types after `SqlAnalysisFingerprintResult`: + +```typescript +export type SqlAnalysisClause = 'select' | 'where' | 'join' | 'groupBy' | 'having' | 'orderBy' | (string & {}); + +export interface SqlAnalysisBatchItem { + id: string; + sql: string; +} + +export interface SqlAnalysisBatchResult { + tablesTouched: string[]; + columnsByClause: Partial<Record<SqlAnalysisClause, string[]>>; + error?: string | null; +} +``` + +Update `SqlAnalysisPort`: + +```typescript +export interface SqlAnalysisPort { + analyzeForFingerprint(sql: string, dialect: SqlAnalysisDialect): Promise<SqlAnalysisFingerprintResult>; + analyzeBatch( + items: SqlAnalysisBatchItem[], + dialect: SqlAnalysisDialect, + ): Promise<Map<string, SqlAnalysisBatchResult>>; +} +``` + +In `packages/context/src/sql-analysis/index.ts`, export the new types: + +```typescript + SqlAnalysisBatchItem, + SqlAnalysisBatchResult, + SqlAnalysisClause, +``` + +- [ ] **Step 4: Map the HTTP batch response** + +In `packages/context/src/sql-analysis/http-sql-analysis-port.ts`, add the new type imports: + +```typescript + SqlAnalysisBatchItem, + SqlAnalysisBatchResult, +``` + +Add this helper after `requiredStringArray()`: + +```typescript +function requiredObject(raw: Record<string, unknown>, field: string): Record<string, unknown> { + const value = raw[field]; + if (!value || typeof value !== 'object' || Array.isArray(value)) { + throw new Error(`sql analysis response is missing object field ${field}`); + } + return value as Record<string, unknown>; +} +``` + +Add this helper after `mapResult()`: + +```typescript +function mapColumnsByClause(raw: Record<string, unknown>): SqlAnalysisBatchResult['columnsByClause'] { + const value = requiredObject(raw, 'columns_by_clause'); + const result: SqlAnalysisBatchResult['columnsByClause'] = {}; + for (const [clause, columns] of Object.entries(value)) { + if (!Array.isArray(columns) || columns.some((item) => typeof item !== 'string')) { + throw new Error(`sql analysis response is missing string[] field columns_by_clause.${clause}`); + } +
result[clause] = columns; + } + return result; +} + +function mapBatchResult(raw: Record<string, unknown>): SqlAnalysisBatchResult { + const error = optionalString(raw, 'error'); + return { + tablesTouched: requiredStringArray(raw, 'tables_touched'), + columnsByClause: mapColumnsByClause(raw), + ...(error !== undefined ? { error } : {}), + }; +} + +function mapBatchResponse(raw: Record<string, unknown>): Map<string, SqlAnalysisBatchResult> { + const results = requiredObject(raw, 'results'); + return new Map( + Object.entries(results).map(([id, value]) => { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + throw new Error(`sql analysis response contains invalid batch result ${id}`); + } + return [id, mapBatchResult(value as Record<string, unknown>)]; + }), + ); +} +``` + +Add `analyzeBatch()` to the object returned by `createHttpSqlAnalysisPort()`: + +```typescript + async analyzeBatch(items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect) { + const raw = await requestJson('/sql/analyze-batch', { + dialect, + items, + }); + return mapBatchResponse(raw); + }, +``` + +- [ ] **Step 5: Run HTTP port tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sql-analysis/http-sql-analysis-port.test.ts +``` + +Expected: PASS.
+ +- [ ] **Step 6: Update managed-daemon wrapper test** + +In `packages/cli/src/managed-python-http.test.ts`, add this test after the existing SQL analysis port test: + +```typescript + it('routes SQL batch analysis through the managed daemon runner', async () => { + const requestJson = vi.fn(async () => ({ + results: { + orders: { + tables_touched: ['public.orders'], + columns_by_clause: { select: ['status'] }, + error: null, + }, + }, + })); + const sqlAnalysis = createManagedDaemonSqlAnalysisPort({ requestJson }); + + await expect(sqlAnalysis.analyzeBatch([{ id: 'orders', sql: 'select status from public.orders' }], 'postgres')) + .resolves.toEqual( + new Map([ + [ + 'orders', + { + tablesTouched: ['public.orders'], + columnsByClause: { select: ['status'] }, + error: null, + }, + ], + ]), + ); + expect(requestJson).toHaveBeenCalledWith('/sql/analyze-batch', { + dialect: 'postgres', + items: [{ id: 'orders', sql: 'select status from public.orders' }], + }); + }); +``` + +- [ ] **Step 7: Update legacy `SqlAnalysisPort` mocks** + +Run: + +```bash +rg -n "SqlAnalysisPort|sqlAnalysis: \\{|analyzeForFingerprint" packages/context/src/ingest packages/cli/src +``` + +For every object literal typed as `SqlAnalysisPort` or passed into a typed `sqlAnalysis` dependency, add: + +```typescript + async analyzeBatch() { + return new Map(); + }, +``` + +Known files from the current workspace: + +- `packages/context/src/ingest/local-adapters.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/stage.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts` + +- [ ] **Step 8: Run CLI wrapper and context type checks** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/managed-python-http.test.ts +pnpm --filter @ktx/context run type-check +pnpm --filter @ktx/cli run type-check 
+``` + +Expected: PASS. If type-check reports a `SqlAnalysisPort` mock missing `analyzeBatch`, add the no-op method from Step 7 and rerun the failing type-check command. + +- [ ] **Step 9: Commit** + +```bash +git add packages/context/src/sql-analysis/ports.ts packages/context/src/sql-analysis/index.ts packages/context/src/sql-analysis/http-sql-analysis-port.ts packages/context/src/sql-analysis/http-sql-analysis-port.test.ts packages/cli/src/managed-python-http.test.ts packages/context/src/ingest/local-adapters.test.ts packages/context/src/ingest/adapters/historic-sql/stage.test.ts packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts +git commit -m "feat: add sql analysis batch port" +``` + +## Task 6: Final Verification + +**Files:** +- Read-only verification across TypeScript and Python. + +- [ ] **Step 1: Run focused TypeScript tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/skill-schemas.test.ts src/sl/semantic-layer.service.test.ts src/ingest/adapters/live-database/manifest.test.ts src/scan/local-enrichment-artifacts.test.ts src/sql-analysis/http-sql-analysis-port.test.ts +pnpm --filter @ktx/cli exec vitest run src/managed-python-http.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run focused Python tests** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py::test_sql_analyze_batch_endpoint_returns_per_item_results -q +``` + +Expected: PASS. + +- [ ] **Step 3: Run package type checks** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +pnpm --filter @ktx/cli run type-check +``` + +Expected: PASS. 
+ +- [ ] **Step 4: Run Python pre-commit check if configured** + +Run: + +```bash +test -f .pre-commit-config.yaml && source .venv/bin/activate && uv run pre-commit run --files python/ktx-daemon/src/ktx_daemon/sql_analysis.py python/ktx-daemon/src/ktx_daemon/app.py python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py || printf 'pre-commit config missing\n' +``` + +Expected in this workspace: prints `pre-commit config missing`. If a `.pre-commit-config.yaml` does exist, the `||` fallback also prints this message (and exits 0) when `pre-commit` itself fails, so read the hook output above the message instead of relying on the final line. + +- [ ] **Step 5: Confirm the old adapter was not cut over in this slice** + +Run: + +```bash +rg -n "stagePgStatStatementsTemplates|expandCategoricalTemplates|classifySlot|pgss-baseline" packages/context/src/ingest/adapters/historic-sql packages/context/src/ingest/index.ts +``` + +Expected: matches still exist. This confirms the foundation slice did not silently perform the hard cutover from spec §10.1. + +- [ ] **Step 6: Commit verification notes if code changed during verification** + +If verification required edits, commit only those files: + +```bash +git status --short +git add packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts packages/context/src/ingest/index.ts packages/context/src/sl/types.ts packages/context/src/sl/schemas.ts packages/context/src/sl/semantic-layer.service.ts packages/context/src/sl/semantic-layer.service.test.ts packages/context/src/ingest/adapters/live-database/manifest.ts packages/context/src/ingest/adapters/live-database/manifest.test.ts packages/context/src/scan/local-enrichment-artifacts.ts packages/context/src/scan/local-enrichment-artifacts.test.ts python/ktx-daemon/src/ktx_daemon/sql_analysis.py python/ktx-daemon/src/ktx_daemon/app.py python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py packages/context/src/sql-analysis/ports.ts packages/context/src/sql-analysis/index.ts packages/context/src/sql-analysis/http-sql-analysis-port.ts
packages/context/src/sql-analysis/http-sql-analysis-port.test.ts packages/cli/src/managed-python-http.test.ts packages/context/src/ingest/local-adapters.test.ts packages/context/src/ingest/adapters/historic-sql/stage.test.ts packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts +git commit -m "test: finish historic sql foundations verification" +``` + +If verification required no edits, do not create an empty commit. + +## Self-Review + +**Spec coverage:** This plan covers the foundation item in spec §10.3. It intentionally does not cover search enrichment (§6.2.3-§6.2.5), the unified reader and staged artifacts (§4), skills and projection (§5), legacy cleanup (§10.2), or setup/doctor docs (§8). Those should be separate plans because each produces a testable subsystem and avoids one oversized cutover plan. + +**Placeholder scan:** The plan contains exact file paths, test code, implementation snippets, commands, expected failures, expected passes, and commit commands. It does not use placeholder markers or deferred implementation text. + +**Type consistency:** `TableUsageOutput` is created in `skill-schemas.ts`, then reused by `SemanticLayerSource`, `ManifestTableEntry`, and `LiveDatabaseManifestTableEntry`. `SqlAnalysisPort.analyzeBatch()` returns `Map<string, SqlAnalysisBatchResult>` consistently across `ports.ts`, `http-sql-analysis-port.ts`, and `managed-python-http.test.ts`. The Python daemon response uses snake_case fields that the TypeScript HTTP port maps to camelCase. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2.
Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-pattern-shard-smoke-docs.md b/docs/superpowers/plans/2026-05-11-historic-sql-pattern-shard-smoke-docs.md new file mode 100644 index 00000000..9e386a16 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-pattern-shard-smoke-docs.md @@ -0,0 +1,407 @@ +# Historic SQL Pattern Shard Smoke Docs Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Align the Postgres historic-SQL smoke and example docs with sharded pattern WorkUnits. + +**Architecture:** The runtime already writes the full `patterns-input.json` audit file and bounded `patterns-input/part-0001.json` style shards. This plan updates the example acceptance assets so they verify the sharded contract instead of the pre-sharding root `historic-sql-patterns` WorkUnit. + +**Tech Stack:** Bash, Node.js built-in test runner, pnpm workspace scripts, KTX local stage-only ingest. + +--- + +## Spec And Existing Plan Status + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans derived from this spec and implemented in this worktree: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` - implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `packages/context/src/sql-analysis/ports.ts`, daemon `/sql/analyze-batch`, `SemanticLayerSource.usage`, and `mergeUsagePreservingExternal()`. +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` - implemented. Evidence: usage-aware SL search text, SQLite FTS snippets, and local/MCP result fields `frequencyTier` plus `snippet`. 
+- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` - implemented. Evidence: `stageHistoricSqlAggregatedSnapshot()`, `chunkHistoricSqlUnifiedStagedDir()`, `PostgresPgssReader`, aggregate BigQuery/Snowflake readers, unified schemas, and package exports. +- `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md` - implemented. Evidence: `HistoricSqlSourceAdapter`, `historic_sql_table_digest`, `historic_sql_patterns`, `emit_historic_sql_evidence`, `HistoricSqlProjectionPostProcessor`, and legacy skill removal from runtime code. +- `docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md` - implemented. Evidence: local adapter registration tests for Postgres, BigQuery, and Snowflake plus PG doctor coverage for informational `pg_stat_statements.max`. +- `docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` - implemented at the time it was written, but its smoke assertions predate pattern shard WorkUnits. +- `docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md` - implemented. Evidence: `isArchivedPatternPage()`, archive exclusion from slug matching, stale table tests, and legacy query-page cleanup coverage. +- `docs/superpowers/plans/2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md` - implemented. Evidence: `local-ingest-acceptance.test.ts` proves production adapter output reaches SL search and wiki search. +- `docs/superpowers/plans/2026-05-11-historic-sql-redaction-hardening.md` - implemented. Evidence: `redaction.ts`, `redaction.test.ts`, and staged artifact redaction coverage in `stage-unified.test.ts`. +- `docs/superpowers/plans/2026-05-11-historic-sql-pattern-workunit-sharding.md` - implemented. 
Evidence: `pattern-inputs.ts`, `pattern-inputs.test.ts`, `stage-unified.ts` writes `patterns-input/part-*.json`, `chunk-unified.ts` emits `historic-sql-patterns-part-*`, `historic_sql_patterns` reads shards, and acceptance tests use `rawPath: 'patterns-input/part-0001.json'`. + +Every existing spec-derived implementation plan is implemented in this worktree. + +Remaining gap this plan fixes: + +- `examples/postgres-historic/scripts/smoke.sh` still asserts a WorkUnit with `unitKey === 'historic-sql-patterns'`. +- Current runtime emits pattern WorkUnits with keys like `historic-sql-patterns-part-0001` and raw files like `patterns-input/part-0001.json`. +- The same smoke only validates the audit file `patterns-input.json`; it does not assert that the bounded shard files exist or contain only cross-table candidates. +- `examples/postgres-historic/README.md` and `examples/README.md` describe unchanged "pattern inputs" but do not explain that `patterns-input.json` is now audit-only and `patterns-input/part-*.json` drives pattern WorkUnits. +- `scripts/examples-docs.test.mjs` does not pin the sharded smoke/doc contract, so the stale root WorkUnit assertion can regress silently. + +## File Structure + +- Modify `scripts/examples-docs.test.mjs` + Pins docs and smoke script to the sharded pattern WorkUnit contract. +- Modify `examples/postgres-historic/scripts/smoke.sh` + Validates `patterns-input/part-*.json` shard files and `historic-sql-patterns-part-*` stage-only WorkUnits. +- Modify `examples/postgres-historic/README.md` + Documents `patterns-input.json` as the full audit artifact and `patterns-input/part-*.json` as bounded pattern WorkUnit input. +- Modify `examples/README.md` + Updates the short example catalog entry with the same audit-vs-shard wording.
+ +### Task 1: Pin Example Tests To Pattern Shards + +**Files:** +- Modify: `scripts/examples-docs.test.mjs` + +- [ ] **Step 1: Add failing assertions for sharded pattern smoke/docs** + +In `scripts/examples-docs.test.mjs`, inside `it('documents the Postgres historic SQL smoke example', ...)`, add these assertions immediately after the existing `assert.match(readme, /patterns-input\.json/);` line: + +```javascript + assert.match(readme, /patterns-input\/part-\*\.json/); + assert.match(readme, /full audit input/); + assert.match(readme, /bounded pattern WorkUnit shards/); +``` + +In the same test, add these assertions immediately after the existing `assert.match(smoke, /assert_stage_record "\$UNCHANGED_RECORD" unchanged zero/);` line: + +```javascript + assert.match(smoke, /assertPatternShards/); + assert.match(smoke, /historic-sql-patterns-part-/); + assert.match(smoke, /patterns-input\/part-/); + assert.doesNotMatch(smoke, /unitKey === 'historic-sql-patterns'/); +``` + +- [ ] **Step 2: Run the example docs test to verify it fails** + +Run: + +```bash +node --test scripts/examples-docs.test.mjs +``` + +Expected: FAIL. The test should report missing `patterns-input/part-*.json`, `full audit input`, `bounded pattern WorkUnit shards`, `assertPatternShards`, or it should fail because `smoke.sh` still contains `unitKey === 'historic-sql-patterns'`. 
+ +- [ ] **Step 3: Commit the failing test** + +Run: + +```bash +git add scripts/examples-docs.test.mjs +git commit -m "test: expect historic sql pattern shard smoke docs" +``` + +### Task 2: Update The Postgres Historic Smoke + +**Files:** +- Modify: `examples/postgres-historic/scripts/smoke.sh` +- Test: `scripts/examples-docs.test.mjs` + +- [ ] **Step 1: Import `existsSync` in the embedded snapshot assertion** + +In `examples/postgres-historic/scripts/smoke.sh`, inside `assert_unified_snapshot()`, replace this line: + +```javascript +const { readFileSync, readdirSync } = require('node:fs'); +``` + +with: + +```javascript +const { existsSync, readFileSync, readdirSync } = require('node:fs'); +``` + +- [ ] **Step 2: Add shard validation to `assert_unified_snapshot()`** + +In `examples/postgres-historic/scripts/smoke.sh`, inside the embedded Node script in `assert_unified_snapshot()`, add this function after the `legacyKeys` loop: + +```javascript +function assertPatternShards(root) { + const shardDir = join(root, 'patterns-input'); + assert(existsSync(shardDir), 'Expected patterns-input shard directory'); + const shardFiles = readdirSync(shardDir) + .filter((file) => /^part-\d{4}\.json$/.test(file)) + .sort() + .map((file) => `patterns-input/${file}`); + assert(shardFiles.length > 0, 'Expected at least one pattern shard file'); + + for (const shardFile of shardFiles) { + const shard = JSON.parse(readFileSync(join(root, shardFile), 'utf8')); + assert(Array.isArray(shard.templates), `${shardFile}: expected templates array`); + assert(shard.templates.length > 0, `${shardFile}: expected at least one template`); + assert( + shard.templates.every((template) => Array.isArray(template.tablesTouched) && template.tablesTouched.length >= 2), + `${shardFile}: expected only cross-table pattern candidates`, + ); + } + + return shardFiles; +} +``` + +- [ ] **Step 3: Assert the full audit input and bounded shards** + +In the same embedded Node script, replace the current 
`patterns` block: + +```javascript +const patterns = JSON.parse(readFileSync(join(root, 'patterns-input.json'), 'utf8')); +assert(Array.isArray(patterns.templates) && patterns.templates.length > 0, 'Expected patterns-input templates'); +assert( + patterns.templates.every((template) => Array.isArray(template.tablesTouched) && template.tablesTouched.length > 0), + 'Expected every pattern template to have touched tables', +); +``` + +with: + +```javascript +const patterns = JSON.parse(readFileSync(join(root, 'patterns-input.json'), 'utf8')); +assert(Array.isArray(patterns.templates) && patterns.templates.length > 0, 'Expected patterns-input audit templates'); +assert( + patterns.templates.every((template) => Array.isArray(template.tablesTouched) && template.tablesTouched.length > 0), + 'Expected every audit pattern template to have touched tables', +); +const shardFiles = assertPatternShards(root); +assert( + shardFiles.length <= patterns.templates.length, + `Expected shard count ${shardFiles.length} to be no greater than audit template count ${patterns.templates.length}`, +); +``` + +- [ ] **Step 4: Update the stage record WorkUnit assertions** + +In `examples/postgres-historic/scripts/smoke.sh`, inside the embedded Node script in `assert_stage_record()`, replace: + +```javascript +assert(record.rawFileCount >= 3, `${label}: expected manifest, patterns input, and at least one table file`); +``` + +with: + +```javascript +assert(record.rawFileCount >= 4, `${label}: expected manifest, audit patterns input, pattern shard, and at least one table file`); +``` + +Then replace this nonzero WorkUnit block: + +```javascript +} else if (expectedWorkUnits === 'nonzero') { + assert(record.workUnitCount > 0, `${label}: expected nonzero WorkUnits`); + assert(record.workUnits.some((unit) => unit.unitKey === 'historic-sql-patterns'), `${label}: expected patterns WorkUnit`); + assert(record.workUnits.some((unit) => unit.unitKey.startsWith('historic-sql-table-')), `${label}: expected 
table WorkUnit`); +} else { +``` + +with: + +```javascript +} else if (expectedWorkUnits === 'nonzero') { + assert(record.workUnitCount > 0, `${label}: expected nonzero WorkUnits`); + const patternUnits = record.workUnits.filter((unit) => /^historic-sql-patterns-part-\d{4}$/.test(unit.unitKey)); + assert(patternUnits.length > 0, `${label}: expected sharded patterns WorkUnit`); + for (const unit of patternUnits) { + assert( + unit.rawFiles.some((rawFile) => /^patterns-input\/part-\d{4}\.json$/.test(rawFile)), + `${label}: expected ${unit.unitKey} to read a pattern shard`, + ); + assert( + !unit.rawFiles.includes('patterns-input.json'), + `${label}: expected ${unit.unitKey} not to schedule the full audit patterns input`, + ); + } + assert(record.workUnits.some((unit) => unit.unitKey.startsWith('historic-sql-table-')), `${label}: expected table WorkUnit`); +} else { +``` + +- [ ] **Step 5: Run shell syntax and the docs test** + +Run: + +```bash +bash -n examples/postgres-historic/scripts/smoke.sh +node --test scripts/examples-docs.test.mjs +``` + +Expected: `bash -n` exits 0. The docs test still fails until the README files are updated in Task 3. 
+ +- [ ] **Step 6: Commit the smoke update** + +Run: + +```bash +git add examples/postgres-historic/scripts/smoke.sh +git commit -m "test: assert historic sql pattern shard smoke" +``` + +### Task 3: Update Example Documentation + +**Files:** +- Modify: `examples/postgres-historic/README.md` +- Modify: `examples/README.md` +- Test: `scripts/examples-docs.test.mjs` + +- [ ] **Step 1: Update the artifact list in the Postgres historic README** + +In `examples/postgres-historic/README.md`, replace this list: + +```markdown +- `manifest.json` +- `tables/*.json` +- `patterns-input.json` +``` + +with: + +```markdown +- `manifest.json` +- `tables/*.json` +- `patterns-input.json` as the full audit input +- `patterns-input/part-*.json` as bounded pattern WorkUnit shards +``` + +- [ ] **Step 2: Update the idempotency wording** + +In `examples/postgres-historic/README.md`, replace this paragraph: + +```markdown +The smoke also runs the same workload twice and verifies the second stage-only +run has `workUnitCount: 0`, which proves unchanged bucketed table and pattern +inputs do not schedule LLM work. +``` + +with: + +```markdown +The smoke also runs the same workload twice and verifies the second stage-only +run has `workUnitCount: 0`, which proves unchanged bucketed table inputs and +unchanged bounded pattern shards do not schedule LLM work. +``` + +- [ ] **Step 3: Update the manifest inspection wording** + +In `examples/postgres-historic/README.md`, replace this paragraph: + +```markdown +The manifest should have `source: "historic-sql"`, `dialect: "postgres"`, +positive `snapshotRowCount`, positive `touchedTableCount`, numeric +`parseFailures`, `warnings`, and `probeWarnings`. The same directory should +contain `patterns-input.json` and one `tables/*.json` file per touched table. 
+``` + +with: + +```markdown +The manifest should have `source: "historic-sql"`, `dialect: "postgres"`, +positive `snapshotRowCount`, positive `touchedTableCount`, numeric +`parseFailures`, `warnings`, and `probeWarnings`. The same directory should +contain `patterns-input.json`, at least one `patterns-input/part-*.json` pattern +shard for cross-table candidates, and one `tables/*.json` file per touched +table. +``` + +- [ ] **Step 4: Update the examples catalog entry** + +In `examples/README.md`, replace this paragraph: + +```markdown +`postgres-historic/` is a manual Docker-backed smoke for Postgres historic-SQL +ingest via `pg_stat_statements`. It verifies setup, unified Historic SQL artifacts, +managed daemon batch SQL analysis, and no-WorkUnit idempotency for unchanged +bucketed table and pattern inputs. +``` + +with: + +```markdown +`postgres-historic/` is a manual Docker-backed smoke for Postgres historic-SQL +ingest via `pg_stat_statements`. It verifies setup, unified Historic SQL artifacts, +managed daemon batch SQL analysis, bounded pattern WorkUnit shards, and +no-WorkUnit idempotency for unchanged bucketed table inputs and pattern shards. +``` + +- [ ] **Step 5: Run the example docs test** + +Run: + +```bash +node --test scripts/examples-docs.test.mjs +``` + +Expected: PASS. + +- [ ] **Step 6: Commit the docs update** + +Run: + +```bash +git add examples/postgres-historic/README.md examples/README.md +git commit -m "docs: explain historic sql pattern shards" +``` + +### Task 4: Verify The Smoke Contract + +**Files:** +- Verify: `scripts/examples-docs.test.mjs` +- Verify: `examples/postgres-historic/scripts/smoke.sh` +- Verify: `examples/postgres-historic/README.md` +- Verify: `examples/README.md` + +- [ ] **Step 1: Run focused local checks** + +Run: + +```bash +bash -n examples/postgres-historic/scripts/smoke.sh +node --test scripts/examples-docs.test.mjs +``` + +Expected: both commands pass. 
+ +- [ ] **Step 2: Run the Docker-backed Postgres historic smoke** + +Run: + +```bash +examples/postgres-historic/scripts/smoke.sh +``` + +Expected: PASS with `Postgres historic SQL smoke passed`. The stage-only records should include pattern WorkUnits with keys like `historic-sql-patterns-part-0001`, each reading `patterns-input/part-0001.json`, and the unchanged run should report `workUnitCount: 0`. + +- [ ] **Step 3: Run the drift grep** + +Run: + +```bash +rg -n "unitKey === 'historic-sql-patterns'|expected patterns WorkUnit|patterns-input\\.json\` and one \`tables|unchanged bucketed table and pattern inputs" examples scripts +``` + +Expected: no matches. + +- [ ] **Step 4: Commit verification metadata if any test-only wording changed** + +Run: + +```bash +git status --short +``` + +Expected: no output, meaning the working tree is clean. If a previous step required a wording fix, commit only the touched files: + +```bash +git add scripts/examples-docs.test.mjs examples/postgres-historic/scripts/smoke.sh examples/postgres-historic/README.md examples/README.md +git commit -m "test: verify historic sql sharded smoke docs" +``` + +## Self-Review + +**Spec coverage:** This plan follows spec section 5.2's deterministic pattern sharding and preserves section 4.6's full `patterns-input.json` audit artifact. It updates the smoke and docs around the already implemented sharded runtime contract. + +**Placeholder scan:** The plan contains exact file paths, exact snippets, commands, expected outcomes, and commit commands. + +**Type consistency:** The plan uses the implemented runtime names consistently: `patterns-input.json` for the audit file, `patterns-input/part-*.json` for bounded shards, and `historic-sql-patterns-part-0001` style WorkUnit keys for pattern curation. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-pattern-shard-smoke-docs.md`. Two execution options: + +**1. 
Subagent-Driven (recommended)** - dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - execute tasks in this session using executing-plans, batch execution with checkpoints diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-pattern-workunit-sharding.md b/docs/superpowers/plans/2026-05-11-historic-sql-pattern-workunit-sharding.md new file mode 100644 index 00000000..ee7604a7 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-pattern-workunit-sharding.md @@ -0,0 +1,943 @@ +# Historic SQL Pattern WorkUnit Sharding Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Keep historic-SQL pattern WorkUnit inputs under the raw-file and prompt-size limits by writing deterministic bounded pattern shards while preserving `patterns-input.json` as the full audit artifact. + +**Architecture:** The stager continues to write full `patterns-input.json` for audit and diff visibility, then writes bounded `patterns-input/part-0001.json` style shards that contain only cross-table pattern candidates. The chunker emits one `historic_sql_patterns` WorkUnit per changed shard and never asks the skill to read the full audit file. Pattern projection is unchanged because emitted evidence already carries a free-form `rawPath`. + +**Tech Stack:** TypeScript, Node.js filesystem APIs, Zod, Vitest, pnpm workspace commands. + +--- + +## Spec And Existing Plan Status + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans derived from this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` - implemented. 
Current evidence includes `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `SqlAnalysisPort.analyzeBatch()`, daemon `/sql/analyze-batch`, `SemanticLayerSource.usage`, and `mergeUsagePreservingExternal()`. +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` - implemented. Current evidence includes usage-aware `buildSemanticLayerSourceSearchText()`, FTS snippets in `sqlite-sl-sources-index.ts`, and list surfaces exposing `frequencyTier` plus `snippet`. +- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` - implemented. Current evidence includes `stageHistoricSqlAggregatedSnapshot()`, `chunkHistoricSqlUnifiedStagedDir()`, `PostgresPgssReader`, aggregate BigQuery/Snowflake readers, unified schemas, and package exports. +- `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md` - implemented. Current evidence includes production adapter cutover, `historic_sql_table_digest`, `historic_sql_patterns`, `emit_historic_sql_evidence`, `HistoricSqlProjectionPostProcessor`, and removal of legacy skill names from runtime code. +- `docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md` - implemented. Current evidence includes local adapter registration tests for Postgres, BigQuery, and Snowflake plus PG doctor coverage for informational `pg_stat_statements.max`. +- `docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` - implemented. Current evidence includes canonical setup config tests, docs using `minExecutions`, and the Postgres historic smoke script asserting unified staged artifacts and unchanged-run idempotency. +- `docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md` - implemented. Current evidence includes `isArchivedPatternPage()`, archive exclusion from active slug matching, stale table tests, and legacy query-page cleanup coverage. 
+- `docs/superpowers/plans/2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md` - implemented. Current evidence includes `local-ingest-acceptance.test.ts` proving production adapter output reaches SL search and wiki search. +- `docs/superpowers/plans/2026-05-11-historic-sql-redaction-hardening.md` - implemented. Current evidence includes `redaction.ts`, `redaction.test.ts`, and `stage-unified.test.ts` proving original SQL is analyzed while staged artifacts contain `[REDACTED]`. + +No existing spec-derived plan is currently unimplemented in this worktree. This plan covers the next uncovered implementation gap from spec section 5.2: `historic_sql_patterns` may need "a small handful" of deterministic chunks when `patterns-input.json` exceeds the LLM context budget. Current code always emits one WorkUnit with raw file `patterns-input.json`; `read_raw_file` rejects files larger than 120,000 bytes and WorkUnit prompt construction rejects prompts larger than 240,000 characters. + +## File Structure + +- Create `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts` + Owns deterministic pattern audit ordering, cross-table candidate filtering, byte-bounded shard creation, shard path constants, and shard path detection. +- Create `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts` + Covers deterministic shard ordering, single-table exclusion from WorkUnit shards, byte limits, and oversize-template manifest warnings. +- Modify `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` + Writes full `patterns-input.json` plus bounded `patterns-input/part-0001.json` shard files, and appends shard warnings to `manifest.json`. +- Modify `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` + Adds a regression for audit file preservation and sharded WorkUnit input creation. 
+- Modify `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts` + Emits one patterns WorkUnit per changed shard path, treats root `patterns-input.json` as audit-only, and includes shard paths in the scope descriptor and eviction calculation. +- Modify `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts` + Updates root-file expectations and adds multi-shard diff behavior. +- Modify `packages/context/skills/historic_sql_patterns/SKILL.md` + Tells the skill to read the exact pattern shard in `rawFiles` and emit evidence with that shard as `rawPath`. +- Modify `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts` + Updates the fake agent to emit pattern evidence for `historic-sql-patterns-part-0001`. +- Modify `packages/context/src/ingest/ingest-runtime-assets.test.ts` + Keeps packaged skill assertions aligned with sharded pattern file guidance. + +## Task 1: Add Pattern Input Sharding Helper + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts` + +- [ ] **Step 1: Write the failing helper tests** + +Create `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts`: + +```typescript +import { describe, expect, it } from 'vitest'; +import { + HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES, + isHistoricSqlPatternInputShardPath, + serializedStagedPatternsInputByteLength, + splitHistoricSqlPatternInputs, +} from './pattern-inputs.js'; +import type { StagedPatternsInput } from './types.js'; + +type PatternTemplate = StagedPatternsInput['templates'][number]; + +function template(id: string, tablesTouched: string[], canonicalSql = 'select 1'): PatternTemplate { + return { + id, + canonicalSql, + tablesTouched, + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }; +} + +describe('historic-SQL pattern input sharding', () => { + it('keeps the audit 
input complete while sharding only cross-table pattern candidates', () => { + const largeSql = `select * from public.orders join public.customers on true where marker = '${'x'.repeat(260)}'`; + const input: StagedPatternsInput = { + templates: [ + template('single-table-orders', ['public.orders']), + template('orders-customers-2', ['public.orders', 'public.customers'], largeSql), + template('orders-customers-1', ['public.customers', 'public.orders'], largeSql), + template('orders-customers-payments', ['public.orders', 'public.customers', 'public.payments'], largeSql), + ], + }; + + const result = splitHistoricSqlPatternInputs(input, { maxBytes: 760 }); + + expect(result.auditInput.templates.map((entry) => entry.id)).toEqual([ + 'orders-customers-1', + 'orders-customers-2', + 'orders-customers-payments', + 'single-table-orders', + ]); + expect(result.shards.length).toBeGreaterThan(1); + expect(result.shards.map((shard) => shard.path)).toEqual(['patterns-input/part-0001.json', 'patterns-input/part-0002.json', 'patterns-input/part-0003.json']); + expect(result.shards.flatMap((shard) => shard.input.templates.map((entry) => entry.id))).toEqual([ + 'orders-customers-payments', + 'orders-customers-1', + 'orders-customers-2', + ]); + expect(result.shards.every((shard) => shard.byteLength <= 760)).toBe(true); + expect(result.shards.flatMap((shard) => shard.input.templates).some((entry) => entry.id === 'single-table-orders')).toBe(false); + expect(result.warnings).toEqual([]); + }); + + it('omits a single oversized template from shards and reports a manifest warning', () => { + const input: StagedPatternsInput = { + templates: [ + template( + 'oversized-cross-table', + ['public.orders', 'public.customers'], + `select * from public.orders join public.customers on true where payload = '${'x'.repeat(500)}'`, + ), + ], + }; + + const result = splitHistoricSqlPatternInputs(input, { maxBytes: 240 }); + + expect(result.auditInput.templates.map((entry) => 
entry.id)).toEqual(['oversized-cross-table']); + expect(result.shards).toEqual([]); + expect(result.warnings).toEqual(['patterns_input_template_too_large:oversized-cross-table']); + }); + + it('recognizes only generated pattern shard paths', () => { + expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0001.json')).toBe(true); + expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0012.json')).toBe(true); + expect(isHistoricSqlPatternInputShardPath('patterns-input.json')).toBe(false); + expect(isHistoricSqlPatternInputShardPath('patterns-input/part-1.json')).toBe(false); + expect(isHistoricSqlPatternInputShardPath('patterns-input/readme.md')).toBe(false); + }); + + it('uses a production byte budget below read_raw_file maximum size', () => { + expect(HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES).toBeLessThan(120_000); + expect(serializedStagedPatternsInputByteLength({ templates: [] })).toBeGreaterThan(0); + }); +}); +``` + +- [ ] **Step 2: Run helper tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/pattern-inputs.test.ts +``` + +Expected: FAIL because `./pattern-inputs.js` does not exist. 
+ +- [ ] **Step 3: Add the sharding helper** + +Create `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts`: + +```typescript +import { Buffer } from 'node:buffer'; +import type { StagedPatternsInput } from './types.js'; + +export const HISTORIC_SQL_PATTERN_WORKUNIT_DIR = 'patterns-input'; +export const HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES = 110_000; +export const HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE = /^patterns-input\/part-\d{4}\.json$/; + +type PatternTemplate = StagedPatternsInput['templates'][number]; + +export interface HistoricSqlPatternInputShard { + path: string; + input: StagedPatternsInput; + byteLength: number; +} + +export interface HistoricSqlPatternInputSplitResult { + auditInput: StagedPatternsInput; + shards: HistoricSqlPatternInputShard[]; + warnings: string[]; +} + +export interface HistoricSqlPatternInputSplitOptions { + maxBytes?: number; +} + +export function isHistoricSqlPatternInputShardPath(path: string): boolean { + return HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE.test(path); +} + +export function serializeStagedPatternsInput(input: StagedPatternsInput): string { + return `${JSON.stringify(input, null, 2)}\n`; +} + +export function serializedStagedPatternsInputByteLength(input: StagedPatternsInput): number { + return Buffer.byteLength(serializeStagedPatternsInput(input), 'utf-8'); +} + +function sortedAuditTemplates(templates: readonly PatternTemplate[]): PatternTemplate[] { + return [...templates].sort((left, right) => left.id.localeCompare(right.id)); +} + +function sortedPatternCandidates(templates: readonly PatternTemplate[]): PatternTemplate[] { + return [...templates] + .filter((template) => template.tablesTouched.length >= 2) + .map((template) => ({ ...template, tablesTouched: [...template.tablesTouched].sort() })) + .sort((left, right) => { + const cardinality = right.tablesTouched.length - left.tablesTouched.length; + if (cardinality !== 0) return cardinality; + const tableSignature = 
left.tablesTouched.join('\0').localeCompare(right.tablesTouched.join('\0')); + if (tableSignature !== 0) return tableSignature; + return left.id.localeCompare(right.id); + }); +} + +function shardPath(index: number): string { + return `${HISTORIC_SQL_PATTERN_WORKUNIT_DIR}/part-${String(index).padStart(4, '0')}.json`; +} + +export function splitHistoricSqlPatternInputs( + input: StagedPatternsInput, + options: HistoricSqlPatternInputSplitOptions = {}, +): HistoricSqlPatternInputSplitResult { + const maxBytes = options.maxBytes ?? HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES; + const auditInput: StagedPatternsInput = { templates: sortedAuditTemplates(input.templates) }; + const warnings: string[] = []; + const shards: HistoricSqlPatternInputShard[] = []; + let current: PatternTemplate[] = []; + + const flush = () => { + if (current.length === 0) { + return; + } + const shardInput: StagedPatternsInput = { templates: current }; + shards.push({ + path: shardPath(shards.length + 1), + input: shardInput, + byteLength: serializedStagedPatternsInputByteLength(shardInput), + }); + current = []; + }; + + for (const template of sortedPatternCandidates(input.templates)) { + const singleInput: StagedPatternsInput = { templates: [template] }; + if (serializedStagedPatternsInputByteLength(singleInput) > maxBytes) { + warnings.push(`patterns_input_template_too_large:${template.id}`); + continue; + } + + const nextInput: StagedPatternsInput = { templates: [...current, template] }; + if (current.length > 0 && serializedStagedPatternsInputByteLength(nextInput) > maxBytes) { + flush(); + } + current.push(template); + } + + flush(); + return { auditInput, shards, warnings }; +} +``` + +- [ ] **Step 4: Run helper tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/pattern-inputs.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit the helper** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts +git commit -m "feat: shard historic sql pattern inputs" +``` + +## Task 2: Write Pattern Shards During Staging + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` +- Test: `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts` + +- [ ] **Step 1: Add the failing stager regression** + +Append this test inside the existing `describe('stageHistoricSqlAggregatedSnapshot', ...)` block in `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts`: + +```typescript + it('preserves full patterns audit input and writes bounded cross-table pattern shards', async () => { + const stagedDir = await tempDir(); + const largeSql = `select * from public.orders o join public.customers c on c.id = o.customer_id where payload = '${'x'.repeat(8000)}'`; + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'orders-customers-a', + canonicalSql: largeSql, + stats: { + executions: 25, + distinctUsers: 4, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 15, + p95RuntimeMs: 90, + errorRate: 0, + rowsProduced: 250, + }, + }); + yield aggregate({ + templateId: 'orders-customers-b', + canonicalSql: largeSql.replace('payload', 'payload_b'), + stats: { + executions: 22, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 20, + p95RuntimeMs: 95, + errorRate: 0, + rowsProduced: 220, + }, + }); + yield aggregate({ + templateId: 'orders-single-table', + canonicalSql: 'select count(*) from public.orders', + stats: { + executions: 
30, + distinctUsers: 2, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 10, + p95RuntimeMs: 20, + errorRate: 0, + rowsProduced: 30, + }, + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + [ + 'orders-customers-a', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: [], + where: ['payload'], + join: ['customer_id', 'id'], + groupBy: [], + }, + }, + ], + [ + 'orders-customers-b', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: [], + where: ['payload_b'], + join: ['customer_id', 'id'], + groupBy: [], + }, + }, + ], + [ + 'orders-single-table', + { + tablesTouched: ['public.orders'], + columnsByClause: { + select: [], + where: [], + join: [], + groupBy: [], + }, + }, + ], + ])), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { dialect: 'postgres' }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + const audit = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json'); + expect(audit.templates.map((entry: any) => entry.id)).toEqual([ + 'orders-customers-a', + 'orders-customers-b', + 'orders-single-table', + ]); + + const firstShard = await readJson<Record<string, any>>(stagedDir, 'patterns-input/part-0001.json'); + expect(firstShard.templates.map((entry: any) => entry.id)).toEqual(['orders-customers-a', 'orders-customers-b']); + expect(firstShard.templates.some((entry: any) => entry.id === 'orders-single-table')).toBe(false); + + const manifest = await readJson<Record<string, any>>(stagedDir, 'manifest.json'); + expect(manifest.warnings).toEqual([]); + }); +``` + +- [ ] **Step 2: Run the stager regression to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: FAIL because 
`patterns-input/part-0001.json` is not written. + +- [ ] **Step 3: Import the sharding helper in the stager** + +In `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts`, add this import below the bucket import block: + +```typescript +import { splitHistoricSqlPatternInputs } from './pattern-inputs.js'; +``` + +- [ ] **Step 4: Write the audit input and shard files** + +In `stageHistoricSqlAggregatedSnapshot()` in `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts`, replace this block: + +```typescript + await writeJson(input.stagedDir, 'patterns-input.json', toPatternsInput(parsedTemplates)); + await writeJson(input.stagedDir, 'manifest.json', { + source: HISTORIC_SQL_SOURCE_KEY, + connectionId: input.connectionId, + dialect: config.dialect, + fetchedAt: now.toISOString(), + windowStart: windowStart.toISOString(), + windowEnd: now.toISOString(), + snapshotRowCount, + touchedTableCount: byTable.size, + parseFailures: warnings.filter((warning) => warning.startsWith('parse_failed:')).length, + warnings, + probeWarnings: probe.warnings, + staleArchiveAfterDays: config.staleArchiveAfterDays, + }); +``` + +with this code: + +```typescript + const patternsInput = toPatternsInput(parsedTemplates); + const patternInputSplit = splitHistoricSqlPatternInputs(patternsInput); + const allWarnings = [...warnings, ...patternInputSplit.warnings]; + await writeJson(input.stagedDir, 'patterns-input.json', patternInputSplit.auditInput); + for (const shard of patternInputSplit.shards) { + await writeJson(input.stagedDir, shard.path, shard.input); + } + await writeJson(input.stagedDir, 'manifest.json', { + source: HISTORIC_SQL_SOURCE_KEY, + connectionId: input.connectionId, + dialect: config.dialect, + fetchedAt: now.toISOString(), + windowStart: windowStart.toISOString(), + windowEnd: now.toISOString(), + snapshotRowCount, + touchedTableCount: byTable.size, + parseFailures: allWarnings.filter((warning) => warning.startsWith('parse_failed:')).length, + 
warnings: allWarnings, + probeWarnings: probe.warnings, + staleArchiveAfterDays: config.staleArchiveAfterDays, + }); +``` + +- [ ] **Step 5: Run helper and stager tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/pattern-inputs.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: PASS. + +- [ ] **Step 6: Commit stager shard writing** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts +git commit -m "feat: write historic sql pattern shards" +``` + +## Task 3: Emit Pattern WorkUnits From Shards + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts` +- Test: `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts` + +- [ ] **Step 1: Update chunk tests for sharded pattern WorkUnits** + +In `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts`, replace the `patterns-input.json` write inside `writeUnifiedStagedDir()` with these writes: + +```typescript + await writeJson(root, 'patterns-input.json', { + templates: [ + { + id: 'orders', + canonicalSql: 'select * from public.orders join public.customers on true', + tablesTouched: ['public.orders', 'public.customers'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }); + await writeJson(root, 'patterns-input/part-0001.json', { + templates: [ + { + id: 'orders', + canonicalSql: 'select * from public.orders join public.customers on true', + tablesTouched: ['public.orders', 'public.customers'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }); +``` 
+ +In the first test, replace the patterns WorkUnit expectation with: + +```typescript + expect.objectContaining({ + unitKey: 'historic-sql-patterns-part-0001', + displayLabel: 'Historic SQL cross-table patterns: part-0001', + rawFiles: ['patterns-input/part-0001.json'], + dependencyPaths: ['manifest.json'], + notes: expect.stringContaining('patterns-input/part-0001.json'), + }), +``` + +In the diff-set test, replace the second expectation with: + +```typescript + await expect( + chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: [], + modified: ['patterns-input/part-0001.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'], + }), + ).resolves.toMatchObject({ + workUnits: [expect.objectContaining({ unitKey: 'historic-sql-patterns-part-0001' })], + }); + + await expect( + chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: [], + modified: ['patterns-input.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input/part-0001.json', 'tables/public.orders.json'], + }), + ).resolves.toMatchObject({ + workUnits: [], + }); +``` + +In the scope test, add these expectations: + +```typescript + expect(scope.isPathInScope('patterns-input/part-0001.json')).toBe(true); + expect(scope.isPathInScope('patterns-input/part-1.json')).toBe(false); +``` + +Append this additional test inside the same `describe` block: + +```typescript + it('emits one patterns WorkUnit per changed shard', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + await writeJson(stagedDir, 'patterns-input/part-0002.json', { + templates: [ + { + id: 'line-items', + canonicalSql: 'select * from public.orders join public.line_items on true', + tablesTouched: ['public.orders', 'public.line_items'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }); + + const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: 
['patterns-input/part-0002.json'], + modified: ['patterns-input/part-0001.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'], + }); + + expect(result.workUnits.map((unit) => unit.unitKey)).toEqual([ + 'historic-sql-patterns-part-0001', + 'historic-sql-patterns-part-0002', + ]); + expect(result.workUnits.map((unit) => unit.rawFiles)).toEqual([ + ['patterns-input/part-0001.json'], + ['patterns-input/part-0002.json'], + ]); + }); +``` + +- [ ] **Step 2: Run chunk tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/chunk-unified.test.ts +``` + +Expected: FAIL because `chunkHistoricSqlUnifiedStagedDir()` still emits `historic-sql-patterns` from root `patterns-input.json`. + +- [ ] **Step 3: Import shard path helpers in the chunker** + +In `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts`, add this import below the existing type imports: + +```typescript +import { isHistoricSqlPatternInputShardPath } from './pattern-inputs.js'; +``` + +- [ ] **Step 4: Emit WorkUnits from shard paths** + +In `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts`, replace the root `patterns-input.json` WorkUnit block: + +```typescript + if (files.includes('patterns-input.json') && touchedPath('patterns-input.json', touched)) { + stagedPatternsInputSchema.parse(await readJson(stagedDir, 'patterns-input.json')); + workUnits.push({ + unitKey: 'historic-sql-patterns', + displayLabel: 'Historic SQL cross-table patterns', + rawFiles: ['patterns-input.json'], + dependencyPaths: ['manifest.json'], + peerFileIndex: files.filter((file) => file !== 'patterns-input.json' && file !== 'manifest.json').sort(), + notes: + 'Use historic_sql_patterns. Read patterns-input.json and emit pattern objects with emit_historic_sql_evidence. 
Do not call wiki_write or sl_write_source.', + }); + } +``` + +with this code: + +```typescript + for (const path of files.filter(isHistoricSqlPatternInputShardPath)) { + if (!touchedPath(path, touched)) { + continue; + } + stagedPatternsInputSchema.parse(await readJson(stagedDir, path)); + const shardLabel = path.replace(/^patterns-input\//, '').replace(/\.json$/, ''); + workUnits.push({ + unitKey: `historic-sql-patterns-${safeUnitKey(shardLabel)}`, + displayLabel: `Historic SQL cross-table patterns: ${shardLabel}`, + rawFiles: [path], + dependencyPaths: ['manifest.json'], + peerFileIndex: files.filter((file) => file !== path && file !== 'manifest.json').sort(), + notes: + `Use historic_sql_patterns. Read ${path} and emit pattern objects with emit_historic_sql_evidence using rawPath "${path}". Do not call wiki_write or sl_write_source.`, + }); + } +``` + +- [ ] **Step 5: Update eviction and scope matching** + +In `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts`, replace the deleted-path filter: + +```typescript + const deleted = diffSet?.deleted.filter((path) => path === 'patterns-input.json' || /^tables\/.+\.json$/.test(path)).sort(); +``` + +with: + +```typescript + const deleted = diffSet?.deleted + .filter((path) => isHistoricSqlPatternInputShardPath(path) || /^tables\/.+\.json$/.test(path)) + .sort(); +``` + +In `describeHistoricSqlUnifiedScope()`, replace the scope predicate: + +```typescript + isPathInScope: (rawPath) => + rawPath === 'manifest.json' || rawPath === 'patterns-input.json' || /^tables\/.+\.json$/.test(rawPath), +``` + +with: + +```typescript + isPathInScope: (rawPath) => + rawPath === 'manifest.json' || + rawPath === 'patterns-input.json' || + isHistoricSqlPatternInputShardPath(rawPath) || + /^tables\/.+\.json$/.test(rawPath), +``` + +- [ ] **Step 6: Run helper, stage, and chunk tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/pattern-inputs.test.ts 
src/ingest/adapters/historic-sql/stage-unified.test.ts src/ingest/adapters/historic-sql/chunk-unified.test.ts +``` + +Expected: PASS. + +- [ ] **Step 7: Commit chunker shard WorkUnits** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts +git commit -m "feat: emit historic sql pattern shard work units" +``` + +## Task 4: Update Skill Guidance And Acceptance Coverage + +**Files:** +- Modify: `packages/context/skills/historic_sql_patterns/SKILL.md` +- Modify: `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts` +- Modify: `packages/context/src/ingest/ingest-runtime-assets.test.ts` + +- [ ] **Step 1: Update the packaged historic SQL patterns skill** + +Replace `packages/context/skills/historic_sql_patterns/SKILL.md` with: + +````markdown +--- +name: historic_sql_patterns +description: Identify recurring cross-table historic-SQL analytical intents from a bounded pattern shard and emit typed pattern evidence for deterministic wiki projection. +callers: [memory_agent] +--- + +# Historic SQL Patterns + +Use this skill when the WorkUnit raw file is a `patterns-input/part-0001.json` style shard from the `historic-sql` adapter. Older staged bundles may still provide root `patterns-input.json`; when that is the WorkUnit raw file, read it the same way. + +## Required Workflow + +1. Read the WorkUnit notes first. +2. Find the single pattern input file listed under the WorkUnit `rawFiles` section. +3. Call `read_raw_file` for that exact raw file path. +4. Identify recurring analytical intents that span at least two tables and have repeated usage signal. +5. 
Emit one `pattern` evidence object per durable cross-table intent by calling `emit_historic_sql_evidence`. +6. Set each evidence object's `rawPath` to the exact raw file path read in step 3. +7. Stop after all pattern evidence has been emitted. + +## Evidence Shape + +Each call to `emit_historic_sql_evidence` must use this shape: + +```json +{ + "kind": "pattern", + "rawPath": "patterns-input/part-0001.json", + "pattern": { + "slug": "order-lifecycle-analysis", + "title": "Order Lifecycle Analysis", + "narrative": "Analysts compare order statuses with customer segments to understand lifecycle movement.", + "definitionSql": "select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status", + "tablesInvolved": ["public.orders", "public.customers"], + "slRefs": ["orders", "customers"], + "constituentTemplateIds": ["pg:1", "pg:2"] + } +} +``` + +The `pattern` object must match `patternOutputSchema`; multiple calls together must form `patternsArraySchema`. + +## Pattern Selection Rules + +- Prefer patterns that involve two or more tables. +- Prefer templates with `executionsBucket` at least `10-100` and `distinctUsersBucket` above solo usage. +- Merge templates into one pattern only when the business intent is the same. +- Use a stable kebab-case slug based on intent, not a template id. +- Set `definitionSql` to the clearest representative SQL from a constituent template. +- Set `slRefs` to source names when the source name is obvious from table names; omit uncertain refs rather than guessing. +- Treat each pattern shard independently; do not read peer shard files from `peerFileIndex`. + +## Boundaries + +- Do not call wiki_write. +- Do not call sl_write_source. +- Do not call sl_edit_source. +- Do not call context_candidate_write. +- Do not create single-table pattern pages. +- Do not copy credentials, tokens, user emails, or unredacted literals into evidence. 
+```` + +- [ ] **Step 2: Update runtime asset assertions** + +In `packages/context/src/ingest/ingest-runtime-assets.test.ts`, replace this assertion: + +```typescript + expect(body).toContain('patterns-input.json'); +``` + +with: + +```typescript + expect(body).toContain('patterns-input/part-0001.json'); +``` + +- [ ] **Step 3: Update the local ingest acceptance fake agent** + +In `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts`, replace this block: + +```typescript + if (params.telemetryTags.unitKey === 'historic-sql-patterns') { + const result = await emitEvidence.execute( + { + kind: 'pattern', + rawPath: 'patterns-input.json', + pattern: { +``` + +with: + +```typescript + if (params.telemetryTags.unitKey === 'historic-sql-patterns-part-0001') { + const result = await emitEvidence.execute( + { + kind: 'pattern', + rawPath: 'patterns-input/part-0001.json', + pattern: { +``` + +The rest of the pattern object stays unchanged. + +- [ ] **Step 4: Run skill and acceptance tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/ingest-runtime-assets.test.ts src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit skill and acceptance updates** + +```bash +git add packages/context/skills/historic_sql_patterns/SKILL.md packages/context/src/ingest/ingest-runtime-assets.test.ts packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +git commit -m "test: align historic sql pattern skill with shards" +``` + +## Task 5: Final Verification + +**Files:** +- Verify: `packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts` +- Verify: `packages/context/skills/historic_sql_patterns/SKILL.md` + +- [ ] **Step 1: Run focused historic SQL tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/adapters/historic-sql/pattern-inputs.test.ts \ + src/ingest/adapters/historic-sql/stage-unified.test.ts \ + src/ingest/adapters/historic-sql/chunk-unified.test.ts \ + src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts \ + src/ingest/ingest-runtime-assets.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run context package type-check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Verify no legacy historic SQL code path returned** + +Run: + +```bash +rg -n "stagePgStatStatementsTemplates|expandCategoricalTemplates|classifySlot|pgss-baseline|historic_sql_ingest|historic_sql_curator|PostgresPgssQueryHistoryReader|historic_sql_template" packages/context packages/cli +``` + +Expected: no matches in runtime or test source. Matches inside `docs/superpowers/plans/` are acceptable when searching docs separately, but this command does not search docs. 
+ +- [ ] **Step 4: Run pre-commit on changed files if configured** + +Run: + +```bash +uv run pre-commit run --files \ + packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts \ + packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts \ + packages/context/src/ingest/adapters/historic-sql/stage-unified.ts \ + packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts \ + packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts \ + packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts \ + packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts \ + packages/context/src/ingest/ingest-runtime-assets.test.ts \ + packages/context/skills/historic_sql_patterns/SKILL.md +``` + +Expected: PASS. If the repository has no pre-commit config or the local `uv` version cannot satisfy the project pin, record the exact error and rely on the focused tests plus type-check above. + +- [ ] **Step 5: Commit verification-only adjustments if any were needed** + +If any test or type-check step required small follow-up edits, commit them: + +```bash +git add packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts packages/context/src/ingest/ingest-runtime-assets.test.ts packages/context/skills/historic_sql_patterns/SKILL.md +git commit -m "test: verify historic sql pattern shard work units" +``` + +If there were no follow-up edits, do not create an empty commit. 
+ +## Self-Review + +**Spec coverage:** This plan covers spec section 5.2's allowance for multiple deterministic pattern WorkUnits when `patterns-input.json` exceeds a context budget. It preserves section 4.6's full `patterns-input.json` audit artifact, keeps section 4.7's changed-file DiffSet behavior, and does not alter deterministic projection from section 5.3. + +**Placeholder scan:** The plan contains concrete files, commands, expected outcomes, code snippets, and commit commands. It has no deferred implementation markers. + +**Type consistency:** `StagedPatternsInput`, `splitHistoricSqlPatternInputs()`, `isHistoricSqlPatternInputShardPath()`, `HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES`, and `serializedStagedPatternsInputByteLength()` are introduced in Task 1 and imported with the same names in later tasks. Pattern shard raw paths use `patterns-input/part-0001.json` consistently in the stager, chunker, skill, and acceptance test. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-pattern-workunit-sharding.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md b/docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md new file mode 100644 index 00000000..655d9568 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md @@ -0,0 +1,444 @@ +# Historic SQL Projection Archive Hardening Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Keep historic-SQL archived pattern pages stable across runs and add projection regression coverage for archive, stale-table, and legacy-page behavior from the redesign spec. + +**Architecture:** The redesigned historic-SQL pipeline is already cut over. This plan only hardens the deterministic projection step by treating `knowledge/global/historic-sql/_archived/*.md` pages as historical records, not active candidates for slug reuse or stale/archive processing. Tests stay in the existing projection unit suite because the behavior is pure filesystem projection. + +**Tech Stack:** TypeScript ESM/NodeNext, Vitest, YAML, local filesystem fixtures. + +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans found that are based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` + +Implemented status verified from this worktree: + +- `2026-05-11-historic-sql-foundations.md` is implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `packages/context/src/sql-analysis/ports.ts` exposes `analyzeBatch()`, `python/ktx-daemon/src/ktx_daemon/app.py` registers `/sql/analyze-batch`, `packages/context/src/sl/types.ts` has `SemanticLayerSource.usage`, and `packages/context/src/ingest/adapters/live-database/manifest.ts` has `mergeUsagePreservingExternal()`. +- `2026-05-11-historic-sql-search-enrichment.md` is implemented. 
Evidence: `packages/context/src/sl/sl-search.service.ts` indexes `source.usage`, `packages/context/src/sl/sqlite-sl-sources-index.ts` selects FTS snippets, and local/MCP list surfaces expose `frequencyTier` and `snippet`. +- `2026-05-11-historic-sql-unified-hot-path.md` is implemented. Evidence: `stageHistoricSqlAggregatedSnapshot()`, `chunkHistoricSqlUnifiedStagedDir()`, `PostgresPgssReader`, aggregate BigQuery/Snowflake `fetchAggregated()` methods, unified schemas, and exports exist. +- `2026-05-11-historic-sql-skills-projection-cutover.md` is implemented. Evidence: `HistoricSqlSourceAdapter` uses the unified stager/chunker, `packages/context/skills/historic_sql_table_digest/` and `packages/context/skills/historic_sql_patterns/` exist, `emit_historic_sql_evidence` exists, `HistoricSqlProjectionPostProcessor` is wired in `packages/context/src/ingest/local-bundle-runtime.ts`, and legacy skill names no longer grep in `packages/context` or `packages/cli`. +- `2026-05-11-historic-sql-cross-dialect-readiness.md` is implemented. Evidence: `packages/cli/src/local-adapters.test.ts` covers Postgres, BigQuery, and Snowflake historic-SQL registration, and `packages/cli/src/historic-sql-doctor.test.ts` covers low `pg_stat_statements.max` as informational output. +- `2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` is implemented. Evidence: `packages/cli/src/setup-databases.test.ts` expects canonical `historicSql.filters.serviceAccounts`, `examples/postgres-historic/scripts/smoke.sh` asserts `manifest.json`, `tables/*.json`, `patterns-input.json`, and zero WorkUnits on the unchanged run, and public docs use `minExecutions`. + +Remaining issue this plan fixes: + +- `packages/context/src/ingest/adapters/historic-sql/projection.ts` recursively loads every markdown page below `knowledge/global/historic-sql`, including pages already under `_archived/`. 
+- Because archived pages still have `source: historic-sql` and tags `['historic-sql', 'pattern', 'archived']`, they are currently active candidates for slug reuse and stale/archive processing. +- A reappearing pattern can be written back to `_archived/<slug>.md` instead of active `historic-sql/<slug>.md`. +- A later no-pattern run can move an already archived page to `_archived/_archived/<slug>.md`. +- `projection.test.ts` does not cover stale table marking, legacy query-page deletion, or the archived-page stability behavior required by spec §5.3 and §10.2. + +## File Structure + +- Modify `packages/context/src/ingest/adapters/historic-sql/projection.ts`: add an archived-page predicate and exclude archived pages from active pattern slug matching and stale/archive loops. +- Modify `packages/context/src/ingest/adapters/historic-sql/projection.test.ts`: add failing tests for archived-page stability, active slug restoration after a pattern reappears, stale table marking, and legacy query-page cleanup. + +### Task 1: Add Archived Pattern Projection Regression Tests + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` + +- [ ] **Step 1: Add failing tests for archived page handling** + +Append these tests inside the existing `describe('projectHistoricSqlEvidence', ...)` block in `packages/context/src/ingest/adapters/historic-sql/projection.test.ts`: + +```typescript + it('writes a reappearing pattern to the active slug instead of reusing an archived page key', async () => { + const workdir = await tempWorkdir(); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 2, + touchedTableCount: 2, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 30, + }); + await 
writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' }); + await writeText( + workdir, + 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md', + [ + '---', + YAML.stringify({ + summary: 'Archived order lifecycle page', + tags: ['historic-sql', 'pattern', 'archived'], + refs: [], + sl_refs: ['orders'], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.orders', 'public.customers'], + fingerprints: ['pg:1'], + stale_since: '2026-01-01T00:00:00.000Z', + }).trimEnd(), + '---', + '', + 'Archived body', + '', + ].join('\n'), + ); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', { + kind: 'pattern', + connectionId: 'warehouse', + rawPath: 'patterns-input.json', + pattern: { + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Analysts compare order status with customer segment again.', + definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:1', 'pg:2'], + }, + }); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.patternPagesWritten).toBe(1); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain( + 'Order Lifecycle Analysis', + ); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain( + 'Archived body', + ); + await expect( + readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/order-lifecycle-analysis.md'), 'utf-8'), + 
).rejects.toMatchObject({ code: 'ENOENT' }); + }); + + it('leaves already archived pattern pages stable when they are still absent', async () => { + const workdir = await tempWorkdir(); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 0, + touchedTableCount: 0, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 30, + }); + await writeText( + workdir, + 'knowledge/global/historic-sql/_archived/retired-pattern.md', + [ + '---', + YAML.stringify({ + summary: 'Retired pattern', + tags: ['historic-sql', 'pattern', 'archived'], + refs: [], + sl_refs: [], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.tickets'], + fingerprints: ['pg:9'], + stale_since: '2026-01-01T00:00:00.000Z', + }).trimEnd(), + '---', + '', + 'Archived retired body', + '', + ].join('\n'), + ); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.archivedPatternPages).toBe(0); + expect(result.stalePatternPagesMarked).toBe(0); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/retired-pattern.md'), 'utf-8')).resolves.toContain( + 'Archived retired body', + ); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/retired-pattern.md'), 'utf-8')).rejects.toMatchObject({ + code: 'ENOENT', + }); + }); +``` + +- [ ] **Step 2: Run projection tests to verify the archived-page tests fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/projection.test.ts +``` + +Expected: FAIL. The first new test should fail because `knowledge/global/historic-sql/order-lifecycle-analysis.md` is not written. 
The second new test should fail because `result.archivedPatternPages` is `1` or `_archived/_archived/retired-pattern.md` exists. + +### Task 2: Exclude Archived Pages From Active Projection Processing + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/projection.ts` +- Test: `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` + +- [ ] **Step 1: Add the archived-page predicate** + +In `packages/context/src/ingest/adapters/historic-sql/projection.ts`, add this function after `isLegacyQueryPage()`: + +```typescript +function isArchivedPatternPage(page: HistoricSqlPatternPage): boolean { + const tags = Array.isArray(page.frontmatter.tags) ? page.frontmatter.tags : []; + return page.key.startsWith('_archived/') || tags.includes('archived'); +} +``` + +- [ ] **Step 2: Use only active pattern pages for slug matching and stale/archive processing** + +In `projectHistoricSqlEvidence()`, replace: + +```typescript + const allPages = await loadPatternPages(wikiRoot); + const patternPages = allPages.filter(isHistoricPatternPage); +``` + +with: + +```typescript + const allPages = await loadPatternPages(wikiRoot); + const activePages = allPages.filter((page) => !isArchivedPatternPage(page)); + const patternPages = activePages.filter(isHistoricPatternPage); +``` + +- [ ] **Step 3: Run projection tests to verify the archived-page fix passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/projection.test.ts +``` + +Expected: PASS. All projection tests pass, including the two archived-page tests from Task 1. 
+ +- [ ] **Step 4: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/projection.ts packages/context/src/ingest/adapters/historic-sql/projection.test.ts +git commit -m "fix: keep historic sql archived patterns stable" +``` + +### Task 3: Add Stale Table And Legacy Page Cleanup Regression Coverage + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` + +- [ ] **Step 1: Add projection coverage for table drift and legacy query-page cleanup** + +Append this test inside the existing `describe('projectHistoricSqlEvidence', ...)` block in `packages/context/src/ingest/adapters/historic-sql/projection.test.ts`: + +```typescript + it('marks missing table usage stale and deletes legacy historic SQL query pages', async () => { + const workdir = await tempWorkdir(); + await writeText( + workdir, + 'semantic-layer/warehouse/_schema/public.yaml', + YAML.stringify({ + tables: { + orders: { + table: 'public.orders', + usage: { + narrative: 'Orders were active before.', + frequencyTier: 'high', + commonFilters: ['status'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + ownerNote: 'keep analyst annotation', + }, + columns: [{ name: 'id', type: 'string' }], + }, + }, + }), + ); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 0, + touchedTableCount: 0, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeText( + workdir, + 'knowledge/global/historic-sql/legacy-template.md', + [ + '---', + YAML.stringify({ + summary: 'Legacy template page', + tags: ['historic-sql', 'query-pattern'], + refs: [], + sl_refs: ['orders'], + usage_mode: 'auto', + source: 
'historic-sql', + tables: ['public.orders'], + fingerprints: ['legacy:1'], + }).trimEnd(), + '---', + '', + 'Legacy body', + '', + ].join('\n'), + ); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.staleTablesMarked).toBe(1); + expect(result.legacyPagesDeleted).toBe(1); + expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]); + const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')); + expect(shard.tables.orders.usage).toEqual({ + ownerNote: 'keep analyst annotation', + narrative: 'No recent historic SQL usage was observed in the latest snapshot.', + frequencyTier: 'unused', + commonFilters: [], + commonGroupBys: [], + commonJoins: [], + staleSince: '2026-05-11T00:00:00.000Z', + }); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/legacy-template.md'), 'utf-8')).rejects.toMatchObject({ + code: 'ENOENT', + }); + }); +``` + +- [ ] **Step 2: Run projection tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/projection.test.ts +``` + +Expected: PASS. The new regression test should pass with the current implementation after Task 2, proving stale table drift and legacy query-page cleanup stay covered. + +- [ ] **Step 3: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/projection.test.ts +git commit -m "test: cover historic sql projection cleanup" +``` + +### Task 4: Final Verification + +**Files:** +- Verify: `packages/context/src/ingest/adapters/historic-sql/projection.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` + +- [ ] **Step 1: Run the focused projection test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/projection.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 2: Run the focused historic-SQL adapter test group** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/adapters/historic-sql/evidence.test.ts \ + src/ingest/adapters/historic-sql/evidence-tool.test.ts \ + src/ingest/adapters/historic-sql/projection.test.ts \ + src/ingest/adapters/historic-sql/post-processor.test.ts \ + src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +``` + +Expected: PASS. + +- [ ] **Step 3: Run context type check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 4: Confirm old historic-SQL code paths remain absent** + +Run: + +```bash +rg -n "stagePgStatStatementsTemplates|expandCategoricalTemplates|classifySlot|historic_sql_ingest|historic_sql_curator|PostgresPgssQueryHistoryReader|historic_sql_template" packages/context packages/cli +``` + +Expected: no output and exit code 1. + +- [ ] **Step 5: Run whitespace check** + +Run: + +```bash +git diff --check +``` + +Expected: no output. + +- [ ] **Step 6: Commit verification fixes only if verification changed files** + +If verification required an edit, commit the exact touched files: + +```bash +git add packages/context/src/ingest/adapters/historic-sql/projection.ts packages/context/src/ingest/adapters/historic-sql/projection.test.ts +git commit -m "test: verify historic sql projection archive hardening" +``` + +If verification made no edits, do not create an empty commit. + +## Self-Review + +Spec coverage: + +- Spec §5.3 stale pattern handling is covered by Task 1 and Task 2: archived pages are historical records and are not repeatedly archived or reused as active slug targets. +- Spec §10.2 legacy wiki page cleanup is covered by Task 3. +- Spec §10.4 drift behavior is covered by Task 3: a table absent from the latest snapshot receives `usage.staleSince` while external usage keys remain intact. +- Spec §10.6 slug churn and user-edited usage risks are covered by Task 1 and Task 3. 
+ +Placeholder scan: + +- The plan contains no unresolved marker text from the forbidden-pattern list. +- Every code-changing step names exact files, exact inserted or replacement code, exact commands, and expected outcomes. + +Type consistency: + +- `staleSince`, `frequencyTier`, `commonFilters`, `commonGroupBys`, and `commonJoins` match `tableUsageOutputSchema`. +- `stale_since`, `tags`, `tables`, and `fingerprints` match the existing wiki frontmatter shape used in `projection.ts`. +- `archivedPatternPages`, `stalePatternPagesMarked`, `staleTablesMarked`, and `legacyPagesDeleted` match `HistoricSqlProjectionResult`. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-redaction-hardening.md b/docs/superpowers/plans/2026-05-11-historic-sql-redaction-hardening.md new file mode 100644 index 00000000..1adcdfd3 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-redaction-hardening.md @@ -0,0 +1,441 @@ +# Historic SQL Redaction Hardening Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make `historicSql.redactionPatterns` actually redact sensitive SQL substrings from historic-SQL staged artifacts and WorkUnit inputs. + +**Architecture:** Keep the unified hot path parseable by sending original SQL to the local deterministic SQL-analysis daemon, then redact only the SQL text that is written to `tables/*.json` and `patterns-input.json`. 
Add a focused redaction helper so regex compatibility and error messages are tested independently from staging, then add a stager regression proving raw sensitive values do not reach files consumed by LLM skills. + +**Tech Stack:** TypeScript ESM/NodeNext, zod 4, Vitest, existing historic-SQL unified stager. + +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans found that are based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-cross-dialect-readiness.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-projection-archive-hardening.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md` + +Implemented status verified from this worktree: + +- `2026-05-11-historic-sql-foundations.md` is implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `packages/context/src/sql-analysis/ports.ts` exposes `analyzeBatch()`, `python/ktx-daemon/src/ktx_daemon/app.py` registers `/sql/analyze-batch`, `packages/context/src/sl/types.ts` has `SemanticLayerSource.usage`, and `packages/context/src/ingest/adapters/live-database/manifest.ts` has `mergeUsagePreservingExternal()`. +- `2026-05-11-historic-sql-search-enrichment.md` is implemented. Evidence: `packages/context/src/sl/sl-search.service.ts` indexes `source.usage`, `packages/context/src/sl/sqlite-sl-sources-index.ts` selects FTS snippets, and local/MCP list surfaces expose `frequencyTier` and `snippet`. +- `2026-05-11-historic-sql-unified-hot-path.md` is implemented. 
Evidence: `stageHistoricSqlAggregatedSnapshot()`, `chunkHistoricSqlUnifiedStagedDir()`, `PostgresPgssReader`, aggregate BigQuery/Snowflake `fetchAggregated()` methods, unified schemas, and package exports exist. +- `2026-05-11-historic-sql-skills-projection-cutover.md` is implemented. Evidence: `HistoricSqlSourceAdapter` uses the unified stager/chunker, `packages/context/skills/historic_sql_table_digest/` and `packages/context/skills/historic_sql_patterns/` exist, `emit_historic_sql_evidence` exists, `HistoricSqlProjectionPostProcessor` is wired in `packages/context/src/ingest/local-bundle-runtime.ts`, and a grep for the legacy skill names finds no matches in `packages/context` or `packages/cli`. +- `2026-05-11-historic-sql-cross-dialect-readiness.md` is implemented. Evidence: `packages/cli/src/local-adapters.test.ts` covers Postgres, BigQuery, and Snowflake historic-SQL registration, and `packages/cli/src/historic-sql-doctor.test.ts` covers low `pg_stat_statements.max` as informational output. +- `2026-05-11-historic-sql-docs-smoke-and-config-cleanup.md` is implemented. Evidence: `packages/cli/src/setup-databases.test.ts` expects canonical `historicSql.filters.serviceAccounts`, `examples/postgres-historic/scripts/smoke.sh` asserts unified `manifest.json`, `tables/*.json`, `patterns-input.json`, and zero WorkUnits on the unchanged run, and public docs use `minExecutions`. +- `2026-05-11-historic-sql-projection-archive-hardening.md` is implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/projection.ts` has `isArchivedPatternPage()`, excludes archived pages from active slug matching, and `projection.test.ts` covers reappearing archived patterns, stable archived pages, stale table marking, and legacy query-page deletion. +- `2026-05-11-historic-sql-end-to-end-retrieval-acceptance.md` is implemented.
Evidence: `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts` exercises the production `HistoricSqlSourceAdapter`, fake `emit_historic_sql_evidence` calls, projection, semantic-layer search, and wiki search. + +Focused verification before writing this plan: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts src/ingest/adapters/historic-sql/projection.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts src/ingest/adapters/historic-sql/types.test.ts +``` + +Observed: 4 files passed, 10 tests passed. + +Remaining spec gap this plan covers: + +- Spec §8 exposes `historicSql.redactionPatterns`, and setup/docs already write that field. +- `packages/context/src/ingest/adapters/historic-sql/types.ts` parses `redactionPatterns`, but `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` never applies them. +- Staged `tables/{schema}.{table}.json` and `patterns-input.json` currently copy `AggregatedTemplate.canonicalSql` unchanged into `topTemplates[].canonicalSql` and `templates[].canonicalSql`. +- Those staged files are WorkUnit inputs for `historic_sql_table_digest` and `historic_sql_patterns`, so sensitive substrings can reach LLM prompts even when the user configured redaction. + +## File Structure + +Create: + +- `packages/context/src/ingest/adapters/historic-sql/redaction.ts` + Owns compilation and application of historic-SQL SQL-text redaction patterns. Supports JavaScript regex strings and the documented `(?i)` case-insensitive prefix used by setup tests/docs. +- `packages/context/src/ingest/adapters/historic-sql/redaction.test.ts` + Tests raw regex replacement, `(?i)` compatibility, empty config behavior, and invalid-pattern diagnostics. + +Modify: + +- `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` + Compiles `config.redactionPatterns` once per fetch. 
Keeps original SQL for filtering and `SqlAnalysisPort.analyzeBatch()`, then stores redacted SQL in `ParsedTemplate.template.canonicalSql` before `toStagedTable()` and `toPatternsInput()` serialize files. +- `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` + Adds a regression proving raw secrets are absent from staged artifacts while `analyzeBatch()` still receives the original SQL. + +## Task 1: Add Historic SQL Redaction Helper + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/redaction.test.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/redaction.ts` + +- [ ] **Step 1: Write the failing redaction helper test** + +Create `packages/context/src/ingest/adapters/historic-sql/redaction.test.ts`: + +```typescript +import { describe, expect, it } from 'vitest'; +import { compileHistoricSqlRedactionPatterns, redactHistoricSqlText } from './redaction.js'; + +describe('historic-SQL redaction', () => { + it('redacts regex matches and supports the (?i) case-insensitive prefix', () => { + const redactors = compileHistoricSqlRedactionPatterns([ + 'sk_live_[A-Za-z0-9]+', + '(?i)secret_token_[a-z0-9]+', + ]); + + const sql = + "select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'"; + + expect(redactHistoricSqlText(sql, redactors)).toBe( + "select * from public.api_events where api_key = '[REDACTED]' and note = '[REDACTED]'", + ); + }); + + it('returns the original SQL text when no redaction patterns are configured', () => { + const sql = "select * from public.orders where status = 'paid'"; + + expect(redactHistoricSqlText(sql, compileHistoricSqlRedactionPatterns([]))).toBe(sql); + }); + + it('throws a config-focused error for invalid redaction regex patterns', () => { + expect(() => compileHistoricSqlRedactionPatterns(['[broken'])).toThrow( + 'Invalid historicSql.redactionPatterns entry "[broken"', + ); + }); + + it('throws a config-focused error for empty redaction 
regex patterns', () => { + expect(() => compileHistoricSqlRedactionPatterns([' '])).toThrow( + 'Invalid historicSql.redactionPatterns entry " "', + ); + }); +}); +``` + +- [ ] **Step 2: Run the redaction helper test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/redaction.test.ts +``` + +Expected: FAIL because `./redaction.js` does not exist. + +- [ ] **Step 3: Add the redaction helper implementation** + +Create `packages/context/src/ingest/adapters/historic-sql/redaction.ts`: + +```typescript +export interface HistoricSqlRedactionPattern { + pattern: string; + expression: RegExp; +} + +const CASE_INSENSITIVE_PREFIX = '(?i)'; +const REDACTION_TOKEN = '[REDACTED]'; + +export function compileHistoricSqlRedactionPatterns(patterns: readonly string[]): HistoricSqlRedactionPattern[] { + return patterns.map((pattern) => { + const trimmed = pattern.trim(); + const caseInsensitive = trimmed.startsWith(CASE_INSENSITIVE_PREFIX); + const source = caseInsensitive ? trimmed.slice(CASE_INSENSITIVE_PREFIX.length) : trimmed; + if (source.length === 0) { + throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": pattern must not be empty`); + } + + try { + return { + pattern, + expression: new RegExp(source, caseInsensitive ? 'gi' : 'g'), + }; + } catch (error) { + const reason = error instanceof Error ? 
error.message : String(error); + throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": ${reason}`); + } + }); +} + +export function redactHistoricSqlText(text: string, redactors: readonly HistoricSqlRedactionPattern[]): string { + let next = text; + for (const redactor of redactors) { + redactor.expression.lastIndex = 0; + next = next.replace(redactor.expression, REDACTION_TOKEN); + } + return next; +} +``` + +- [ ] **Step 4: Run the redaction helper test to verify it passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/redaction.test.ts +``` + +Expected: PASS. The output reports 1 test file passed and 4 tests passed. + +- [ ] **Step 5: Commit the redaction helper** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/redaction.ts packages/context/src/ingest/adapters/historic-sql/redaction.test.ts +git commit -m "feat: add historic sql redaction helper" +``` + +## Task 2: Apply Redaction To Unified Staged Artifacts + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/redaction.ts` + +- [ ] **Step 1: Add the failing staged-artifact redaction test** + +Append this test inside the existing `describe('stageHistoricSqlAggregatedSnapshot', ...)` block in `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts`: + +```typescript + it('redacts configured SQL substrings in staged artifacts while analyzing original SQL', async () => { + const stagedDir = await tempDir(); + const originalSql = + "select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'"; + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'api-events-with-secret', + 
canonicalSql: originalSql, + stats: { + executions: 15, + distinctUsers: 2, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 12, + p95RuntimeMs: 25, + errorRate: 0, + rowsProduced: 15, + }, + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + [ + 'api-events-with-secret', + { + tablesTouched: ['public.api_events'], + columnsByClause: { + select: [], + where: ['api_key', 'note'], + join: [], + groupBy: [], + }, + }, + ], + ])), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'postgres', + redactionPatterns: ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'], + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith( + [{ id: 'api-events-with-secret', sql: originalSql }], + 'postgres', + ); + + const tableJson = await readFile(join(stagedDir, 'tables/public.api_events.json'), 'utf-8'); + const patternsJson = await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8'); + expect(tableJson).not.toContain('sk_live_abc123'); + expect(tableJson).not.toContain('Secret_Token_9f'); + expect(patternsJson).not.toContain('sk_live_abc123'); + expect(patternsJson).not.toContain('Secret_Token_9f'); + expect(tableJson).toContain('[REDACTED]'); + expect(patternsJson).toContain('[REDACTED]'); + }); +``` + +- [ ] **Step 2: Run the staged-artifact test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: FAIL because `tables/public.api_events.json` and `patterns-input.json` still contain `sk_live_abc123` and `Secret_Token_9f`. 
+ +- [ ] **Step 3: Import the redaction helper in the stager** + +In `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts`, add this import below the existing `./buckets.js` import block: + +```typescript +import { + compileHistoricSqlRedactionPatterns, + redactHistoricSqlText, + type HistoricSqlRedactionPattern, +} from './redaction.js'; +``` + +- [ ] **Step 4: Add a small template redaction helper** + +In `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts`, add this helper after `shouldDropTemplate()`: + +```typescript +function redactTemplateSql( + template: AggregatedTemplate, + redactors: readonly HistoricSqlRedactionPattern[], +): AggregatedTemplate { + if (redactors.length === 0) { + return template; + } + return { + ...template, + canonicalSql: redactHistoricSqlText(template.canonicalSql, redactors), + }; +} +``` + +- [ ] **Step 5: Compile redaction patterns once per staged snapshot** + +In `stageHistoricSqlAggregatedSnapshot()` in `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts`, replace this opening block: + +```typescript + const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig); + const now = input.now ?? new Date(); + const windowStart = new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000); +``` + +with: + +```typescript + const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig); + const redactors = compileHistoricSqlRedactionPatterns(config.redactionPatterns); + const now = input.now ?? 
new Date(); + const windowStart = new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000); +``` + +- [ ] **Step 6: Store redacted SQL only after batch analysis has used original SQL** + +In `stageHistoricSqlAggregatedSnapshot()` in `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts`, replace this `parsedTemplates.push()` block: + +```typescript + parsedTemplates.push({ + template, + tablesTouched, + columnsByClause: Object.fromEntries( + Object.entries(parsed.columnsByClause).map(([clause, columns]) => [clause, [...new Set(columns)].sort()]), + ), + }); +``` + +with: + +```typescript + parsedTemplates.push({ + template: redactTemplateSql(template, redactors), + tablesTouched, + columnsByClause: Object.fromEntries( + Object.entries(parsed.columnsByClause).map(([clause, columns]) => [clause, [...new Set(columns)].sort()]), + ), + }); +``` + +- [ ] **Step 7: Run staged-artifact and redaction tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/redaction.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: PASS. The output reports 2 test files passed and the staged-artifact test confirms both raw sensitive substrings are absent. 
+ +- [ ] **Step 8: Commit the stager redaction** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/stage-unified.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts +git commit -m "feat: redact historic sql staged artifacts" +``` + +## Task 3: Run Focused Historic-SQL Regression Checks + +**Files:** +- Verify: `packages/context/src/ingest/adapters/historic-sql/redaction.test.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` +- Verify: `packages/context/src/ingest/adapters/historic-sql/types.test.ts` + +- [ ] **Step 1: Run focused historic-SQL tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/redaction.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts src/ingest/adapters/historic-sql/projection.test.ts src/ingest/adapters/historic-sql/types.test.ts +``` + +Expected: PASS. The output reports 5 test files passed. + +- [ ] **Step 2: Run the context package type-check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS with TypeScript completing without diagnostics. + +- [ ] **Step 3: Confirm the implementation did not reintroduce legacy historic-SQL codepaths** + +Run: + +```bash +rg -n "stagePgStatStatementsTemplates|expandCategoricalTemplates|classifySlot|pgss-baseline|historic_sql_ingest|historic_sql_curator" packages/context/src packages/context/skills packages/cli/src +``` + +Expected: no matches. 
+ +- [ ] **Step 4: Commit verification-only adjustments if any were required** + +If Task 3 required a source or test correction, commit the verified files: + +```bash +git add packages/context/src/ingest/adapters/historic-sql/redaction.ts packages/context/src/ingest/adapters/historic-sql/redaction.test.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts +git commit -m "test: verify historic sql redaction hardening" +``` + +If Task 3 did not require changes, leave the existing commits from Task 1 and Task 2 unchanged. + +## Self-Review + +**Spec coverage:** This plan covers the remaining practical gap in spec §8's `redactionPatterns` config by applying it before SQL text reaches staged artifacts and LLM WorkUnit inputs. It does not alter reader SQL, projection, search enrichment, or setup output because those slices are already implemented. + +**Placeholder scan:** The plan contains no `TBD`, no `TODO`, and no missing code bodies. Every code-writing step includes the exact test or implementation block to add. + +**Type consistency:** `HistoricSqlRedactionPattern`, `compileHistoricSqlRedactionPatterns()`, and `redactHistoricSqlText()` are defined in Task 1 and imported with the same names in Task 2. `redactTemplateSql()` returns `AggregatedTemplate`, preserving the existing `ParsedTemplate.template` type. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-redaction-hardening.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? 
diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-redesign-manual-test-plan.md b/docs/superpowers/plans/2026-05-11-historic-sql-redesign-manual-test-plan.md new file mode 100644 index 00000000..ba539195 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-redesign-manual-test-plan.md @@ -0,0 +1,459 @@ +# External Hosted Postgres Discovery Manual Test Plan + +This plan tests KTX from the point of view of a new external user who discovers +the public CLI and connects the hosted Kaelio demo Postgres database as the +source. It starts with the credential-free seeded demo, then creates a real KTX +project that reads from `start.kaelio.com`. + +The plan avoids writing the database password into this repository. Keep the +password in a local environment variable and configure KTX with +`env:KTX_DEMO_DATABASE_URL`. + +## Scope + +Use this plan when the goal is to test KTX as an external user with the hosted +demo database. The commands use the published package shape through +`npx @kaelio/ktx`. If you are testing from this repository, you can replace +`npx @kaelio/ktx` with the local `ktx` alias. + +The required checks cover: + +- Running the packaged seeded demo without credentials. +- Creating a new project that points to the hosted Postgres demo source. +- Verifying the connection through the public CLI. +- Running public ingest against the hosted database. +- Searching semantic-layer sources through `agent sl list --query`. +- Running the Postgres historic-SQL readiness doctor. +- Running the historic-SQL adapter when the demo database exposes query + history and local LLM configuration is available. +- Searching generated historic-SQL usage and pattern pages when historic-SQL + ingest runs. + +## Prerequisites + +Prepare a clean terminal before starting. The required path needs Node and +network access to `start.kaelio.com:5432`. The optional historic-SQL ingest path +also needs `uv` and an LLM provider configured for KTX. + +1. 
Confirm Node 22 or newer is available: + + ```bash + node --version + ``` + + Expected: the version is `v22` or newer. + +2. Confirm the hosted Postgres endpoint is reachable from your network: + + ```bash + nc -vz start.kaelio.com 5432 + ``` + + Expected: the command reports that the TCP connection succeeds. If `nc` is + unavailable, continue and let `ktx connection test` perform the real check. + +3. Create an isolated test parent: + + ```bash + export KTX_EXTERNAL_PARENT="$(mktemp -d)" + export KTX_SEEDED_PROJECT="$KTX_EXTERNAL_PARENT/seeded-demo" + export KTX_HOSTED_PROJECT="$KTX_EXTERNAL_PARENT/hosted-postgres" + export KTX_RUNTIME_ROOT="$KTX_EXTERNAL_PARENT/managed-runtime" + ``` + + Expected: every file created by this test stays under + `$KTX_EXTERNAL_PARENT`. + +4. Set the hosted database URL without committing the password: + + ```bash + read -rsp "Demo database password: " KTX_DEMO_DB_PASSWORD + printf '\n' + export KTX_DEMO_DATABASE_URL="postgresql://kaelio_demo:${KTX_DEMO_DB_PASSWORD}" + export KTX_DEMO_DATABASE_URL="${KTX_DEMO_DATABASE_URL}@start.kaelio.com:5432/demo?sslmode=prefer" + unset KTX_DEMO_DB_PASSWORD + ``` + + Expected: `KTX_DEMO_DATABASE_URL` is set only in your shell. The project + config will store `env:KTX_DEMO_DATABASE_URL`, not the literal URL. + + The hosted demo endpoint uses libpq-style `sslmode=prefer`, which means + "try SSL, then fall back to non-SSL." KTX handles this mode explicitly for + the Node Postgres connector so the setup check can connect to the hosted + demo database. + +5. 
Verify the required shell variables before running any `ktx` commands: + + ```bash + : "${KTX_EXTERNAL_PARENT:?Run prerequisite step 3 in this shell first}" + : "${KTX_SEEDED_PROJECT:?Run prerequisite step 3 in this shell first}" + : "${KTX_HOSTED_PROJECT:?Run prerequisite step 3 in this shell first}" + : "${KTX_RUNTIME_ROOT:?Run prerequisite step 3 in this shell first}" + : "${KTX_DEMO_DATABASE_URL:?Run prerequisite step 4 in this shell first}" + ``` + + Expected: the command prints nothing and exits zero. If it prints a shell + error, rerun the referenced prerequisite in the same terminal before + continuing. + +## Step 1: Run the packaged seeded demo + +Start with the shortest public path. The seeded demo uses packaged data and +prebuilt context, so it must not ask for an LLM key. + +1. Run the seeded demo: + + ```bash + npx @kaelio/ktx setup demo \ + --project-dir "$KTX_SEEDED_PROJECT" \ + --plain \ + --no-input + ``` + + Expected: output includes `Mode: seeded`, `Source: packaged demo project`, + and `LLM calls: none`. + +2. Inspect the seeded demo: + + ```bash + npx @kaelio/ktx setup demo inspect \ + --project-dir "$KTX_SEEDED_PROJECT" \ + --json > "$KTX_EXTERNAL_PARENT/seeded-inspect.json" + ``` + + Expected: the JSON reports seeded mode, semantic-layer sources, knowledge + pages, and `reports/seeded-demo-report.json`. + +3. Search seeded semantic-layer sources: + + ```bash + npx @kaelio/ktx agent sl list \ + --project-dir "$KTX_SEEDED_PROJECT" \ + --json \ + --query "revenue" \ + > "$KTX_EXTERNAL_PARENT/seeded-sl-search.json" + ``` + + Expected: the command exits zero and returns at least one source with a + numeric `score`. + +## Step 2: Create a hosted Postgres project + +Create a new KTX project that uses the hosted demo database as the warehouse +source. This step enables historic SQL in the config, but it does not require +LLM credentials yet. 
+ +If an earlier setup attempt failed after creating `$KTX_HOSTED_PROJECT/ktx.yaml`, +start a fresh test project before rerunning the `--new` command: + +```bash +export KTX_HOSTED_PROJECT="$KTX_EXTERNAL_PARENT/hosted-postgres-retry" +``` + +1. Create the project and connection: + + ```bash + npx @kaelio/ktx setup \ + --project-dir "${KTX_HOSTED_PROJECT:?Run prerequisite step 3 first}" \ + --new \ + --skip-llm \ + --skip-embeddings \ + --skip-sources \ + --skip-agents \ + --database postgres \ + --new-database-connection-id warehouse \ + --database-url env:KTX_DEMO_DATABASE_URL \ + --database-schema public \ + --enable-historic-sql \ + --historic-sql-min-executions 2 \ + --yes \ + --no-input + ``` + + Expected: `$KTX_HOSTED_PROJECT/ktx.yaml` exists and contains a `warehouse` + Postgres connection whose URL is `env:KTX_DEMO_DATABASE_URL`. + +2. Confirm the literal database URL, which embeds the password, was not + written to disk: + + ```bash + grep -R "start.kaelio.com:5432/demo" "$KTX_HOSTED_PROJECT" || true + ``` + + Expected: no matches are printed, so the project stores only the + `env:KTX_DEMO_DATABASE_URL` reference and never the literal URL. + +3. Inspect the generated connection config: + + ```bash + sed -n '1,120p' "$KTX_HOSTED_PROJECT/ktx.yaml" + ``` + + Expected: the `warehouse` connection has `driver: postgres`, + `url: env:KTX_DEMO_DATABASE_URL` or an equivalent URL reference, and + `historicSql.enabled: true`. + +## Step 3: Test the hosted connection + +Run the public connection check before ingest. This verifies that the external +user can reach and introspect the hosted source. + +1. Test the connection: + + ```bash + npx @kaelio/ktx connection test warehouse \ + --project-dir "$KTX_HOSTED_PROJECT" + ``` + + Expected: output includes `Driver: postgres` and a positive table count. + +2. List configured connections: + + ```bash + npx @kaelio/ktx connection list \ + --project-dir "$KTX_HOSTED_PROJECT" + ``` + + Expected: output includes the `warehouse` connection. + +## Step 4: Run public ingest + +Run the public ingest command.
For warehouse connections, this performs the +database scan path and writes local context files that agent search can use. + +1. Run ingest: + + ```bash + npx @kaelio/ktx ingest warehouse \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --no-input + ``` + + Expected: output reports that ingest finished and that the `scan` step is + `done`. + +2. Inspect the latest public ingest status: + + ```bash + npx @kaelio/ktx ingest status \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --no-input + ``` + + Expected: the status references the hosted `warehouse` source and a + completed scan. + +3. Confirm semantic-layer files exist: + + ```bash + find "$KTX_HOSTED_PROJECT/semantic-layer/warehouse" \ + -name '*.yaml' -print | head + ``` + + Expected: at least one semantic-layer YAML file is printed. + +## Step 5: Search the hosted database context + +Use the agent-facing semantic-layer search command after ingest. This validates +the discovery path that agents use for database analysis. + +1. Run semantic-layer search: + + ```bash + npx @kaelio/ktx agent sl list \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --connection-id warehouse \ + --json \ + --query "orders revenue customers" \ + > "$KTX_EXTERNAL_PARENT/hosted-sl-search.json" + ``` + + Expected: the command exits zero. + +2. Validate search metadata: + + ```bash + node - "$KTX_EXTERNAL_PARENT/hosted-sl-search.json" <<'NODE' + const { readFileSync } = require('node:fs'); + const result = JSON.parse(readFileSync(process.argv[2], 'utf8')); + const assert = (ok, message) => { + if (!ok) throw new Error(message); + }; + assert(Array.isArray(result.sources), 'sources missing'); + assert(result.sources.length > 0, 'no semantic-layer hits'); + assert(Number.isFinite(result.sources[0].score), 'score missing'); + console.log('hosted semantic-layer search ok'); + NODE + ``` + + Expected: the script prints `hosted semantic-layer search ok`. + +3. 
Read the top source: + + ```bash + node - "$KTX_EXTERNAL_PARENT/hosted-sl-search.json" \ + > "$KTX_EXTERNAL_PARENT/hosted-top-source-name.txt" <<'NODE' + const { readFileSync } = require('node:fs'); + const result = JSON.parse(readFileSync(process.argv[2], 'utf8')); + process.stdout.write(result.sources[0].name); + NODE + + npx @kaelio/ktx agent sl read \ + "$(cat "$KTX_EXTERNAL_PARENT/hosted-top-source-name.txt")" \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --connection-id warehouse \ + --json \ + > "$KTX_EXTERNAL_PARENT/hosted-sl-read.json" + ``` + + Expected: the JSON includes the full semantic-layer source. + +## Step 6: Check historic-SQL readiness + +Run the Postgres historic-SQL doctor. This determines whether the hosted demo +database exposes the query-history prerequisites needed for the redesign's +historic-SQL adapter. + +1. Run doctor: + + ```bash + npx @kaelio/ktx dev doctor \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --no-input + ``` + + Expected: output includes a `Postgres Historic SQL (warehouse)` check. + +2. Interpret the result: + + - `PASS` means the hosted source is ready for the optional historic-SQL + ingest path. + - `WARN` or `FAIL` means the external discovery test still covers scan and + semantic-layer search, but historic-SQL query-history ingestion is blocked + by database permissions or configuration. + +## Step 7: Optional historic-SQL ingest + +Run this section only when the doctor passes and the KTX project has an LLM +provider configured. Historic-SQL table and pattern curation uses LLM-backed +skills, so this path is not credential-free. + +1. Configure LLM and embeddings if you skipped them during setup: + + ```bash + npx @kaelio/ktx setup \ + --project-dir "$KTX_HOSTED_PROJECT" + ``` + + Expected: `npx @kaelio/ktx setup status --project-dir "$KTX_HOSTED_PROJECT"` + reports that LLM and embedding setup are ready. + +2. 
Run historic-SQL ingest: + + ```bash + npx @kaelio/ktx dev ingest run \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --connection-id warehouse \ + --adapter historic-sql \ + --plain \ + --yes \ + --no-input + ``` + + Expected: the command exits zero and schedules `historic-sql-table-` and + `historic-sql-patterns-` WorkUnits when the database has qualifying query + history. + +3. Locate the latest historic-SQL manifest: + + ```bash + find "$KTX_HOSTED_PROJECT/raw-sources/warehouse/historic-sql" \ + -name manifest.json -print | sort | tail -n 1 + ``` + + Expected: a manifest path is printed. + +4. Search for generated usage: + + ```bash + npx @kaelio/ktx agent sl list \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --connection-id warehouse \ + --json \ + --query "common filters joins usage" \ + > "$KTX_EXTERNAL_PARENT/historic-sl-search.json" + ``` + + Expected: hits produced from historic-SQL usage include `score`, and hits + with projected usage include `frequencyTier` and `snippet`. + +5. Search for generated pattern pages: + + ```bash + npx @kaelio/ktx agent wiki search "historic sql pattern" \ + --project-dir "$KTX_HOSTED_PROJECT" \ + --json \ + --limit 10 \ + > "$KTX_EXTERNAL_PARENT/historic-wiki-search.json" + ``` + + Expected: results include pages whose keys start with `historic-sql/` when + the run produced cross-table patterns. + +## Step 8: Record results + +Capture the result in a way that separates the external discovery path from the +optional historic-SQL path. + +1. 
Save useful outputs: + + ```bash + mkdir -p "$KTX_EXTERNAL_PARENT/results" + cp "$KTX_EXTERNAL_PARENT/seeded-inspect.json" \ + "$KTX_EXTERNAL_PARENT/results/" 2>/dev/null || true + cp "$KTX_EXTERNAL_PARENT/hosted-sl-search.json" \ + "$KTX_EXTERNAL_PARENT/results/" 2>/dev/null || true + cp "$KTX_EXTERNAL_PARENT/hosted-sl-read.json" \ + "$KTX_EXTERNAL_PARENT/results/" 2>/dev/null || true + cp "$KTX_EXTERNAL_PARENT/historic-sl-search.json" \ + "$KTX_EXTERNAL_PARENT/results/" 2>/dev/null || true + cp "$KTX_EXTERNAL_PARENT/historic-wiki-search.json" \ + "$KTX_EXTERNAL_PARENT/results/" 2>/dev/null || true + ``` + + Expected: the results directory contains the JSON outputs created during the + run. + +2. Mark these areas as pass, fail, or blocked: + + - Public package discovery through `npx @kaelio/ktx`. + - Seeded demo without credentials. + - Hosted Postgres project setup. + - Hosted Postgres connection test. + - Public ingest scan. + - Semantic-layer search and read. + - Historic-SQL doctor. + - Historic-SQL ingest, if doctor and LLM setup allow it. + - Historic-SQL usage search, if ingest ran. + - Historic-SQL wiki pattern search, if ingest ran. + + Expected: every required external discovery area passes. Historic-SQL ingest + is pass, fail, or blocked based on the doctor result and local LLM + configuration. + +## Cleanup + +Remove the disposable project after collecting results. Keep it only when you +need the files for debugging. + +1. Stop the managed runtime: + + ```bash + npx @kaelio/ktx runtime stop || true + ``` + +2. Remove the test parent: + + ```bash + rm -rf "$KTX_EXTERNAL_PARENT" + ``` + + Expected: temporary projects and runtime files are removed. 
diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md b/docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md new file mode 100644 index 00000000..ee960bb8 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md @@ -0,0 +1,778 @@ +# Historic SQL Search Enrichment Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make historic-SQL table usage searchable through semantic-layer search and return lean query-mode context with `frequencyTier` and an FTS snippet. + +**Architecture:** This is the second slice of the historic SQL redesign, covering spec §6.2.3-§6.2.5 and the search-hit tier in §7. It builds on the already implemented foundation slice, in which `SemanticLayerSource.usage` is the source of truth. This slice makes the SL search text builder index usage narrative and structured usage fields, makes SQLite FTS return snippets from indexed search text, and makes local/MCP list responses hydrate `frequencyTier` from the source while keeping the full `usage` block available through `agent sl read`. + +**Tech Stack:** TypeScript ESM/NodeNext, Vitest, better-sqlite3 FTS5, zod-backed TypeScript types. + +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans found that are based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` + +Implemented status: + +- `2026-05-11-historic-sql-foundations.md` is implemented in this worktree.
Evidence in code: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `SemanticLayerSource.usage` in `packages/context/src/sl/types.ts`, `mergeUsagePreservingExternal()` in `packages/context/src/ingest/adapters/live-database/manifest.ts`, `SqlAnalysisPort.analyzeBatch()` in `packages/context/src/sql-analysis/ports.ts`, and `/sql/analyze-batch` in `python/ktx-daemon/src/ktx_daemon/app.py`. +- Focused TypeScript foundation verification passed: `pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/skill-schemas.test.ts src/sl/semantic-layer.service.test.ts src/ingest/adapters/live-database/manifest.test.ts src/scan/local-enrichment-artifacts.test.ts src/sql-analysis/http-sql-analysis-port.test.ts` reported 5 files and 53 tests passed. +- `uv run pytest python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py -q` is blocked by the repo's exact uv pin: required `==0.11.11`, local `0.11.13`. Closest available check after activating `.venv` passed: `source .venv/bin/activate && python -m pytest python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py -q` reported 20 passed. + +Not yet implemented: + +- `buildSemanticLayerSourceSearchText()` in `packages/context/src/sl/sl-search.service.ts` does not include `source.usage`. +- `SqliteSlSourcesIndex` does not select `snippet(local_sl_sources_fts, ...)`. +- `LocalSlSourceSearchResult` and `KtxSemanticLayerSourceSummary` do not expose `frequencyTier` or `snippet`. +- `createLocalProjectMcpContextPorts().semanticLayer.listSources()` drops any future snippet/frequency metadata. + +This plan does not rewrite the historic-SQL adapter, readers, skills, projection, or cleanup path. The next plan after this one should cover the new adapter hot path from spec §4 and §10.3 step 3. 
+ +## File Structure + +Modify: + +- `packages/context/src/sl/sl-search.service.ts` + Adds usage narrative, frequency, filters, group-bys, joins, and stale marker to the canonical SL search text. Preserves snippets returned by repository search for direct `SlSearchService.search()` callers. +- `packages/context/src/sl/sl-search.service.test.ts` + Tests usage search-text content and direct service snippet pass-through. +- `packages/context/src/sl/ports.ts` + Extends `SlSourcesIndexPort.search()` rows with optional `snippet`. +- `packages/context/src/sl/sqlite-sl-sources-index.ts` + Adds FTS5 `snippet()` selection to lexical candidate search and direct index search. +- `packages/context/src/sl/sqlite-sl-sources-index.test.ts` + Locks snippet behavior for both direct search and lexical lane candidates. +- `packages/context/src/sl/local-sl.ts` + Adds `frequencyTier` and `snippet` to query-mode `LocalSlSourceSearchResult`; collects snippets from the lexical lane and hydrates frequency from `SemanticLayerSource.usage`. +- `packages/context/src/sl/local-sl.test.ts` + Tests that usage-only terms can find a source and that results include `frequencyTier` and FTS snippet. +- `packages/context/src/sl/pglite-sl-search-prototype.ts` + Propagates `frequencyTier` for the prototype backend so the shared result type stays truthful. +- `packages/context/src/mcp/types.ts` + Adds `frequencyTier` and `snippet` to `KtxSemanticLayerSourceSummary`. +- `packages/context/src/mcp/local-project-ports.ts` + Includes `frequencyTier` and `snippet` in `semanticLayer.listSources()` output. +- `packages/context/src/mcp/local-project-ports.test.ts` + Tests the agent/MCP-facing list response. 
+ +## Task 1: Index Historic SQL Usage In SL Search Text + +**Files:** +- Modify: `packages/context/src/sl/sl-search.service.test.ts` +- Modify: `packages/context/src/sl/sl-search.service.ts` + +- [ ] **Step 1: Write the failing usage search-text test** + +Add this test at the end of the existing `describe('SlSearchService', ...)` block in `packages/context/src/sl/sl-search.service.test.ts`: + +```typescript + it('includes historic SQL usage in semantic-layer search text', () => { + const source: SemanticLayerSource = { + name: 'orders', + descriptions: { user: 'Customer orders' }, + table: 'public.orders', + grain: ['order_id'], + columns: [{ name: 'order_id', type: 'string' }], + joins: [], + measures: [], + usage: { + narrative: 'Analysts inspect paid and refunded order lifecycle trends by customer segment.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['customer_segment'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: '2026-05-01T00:00:00.000Z', + }, + }; + + const text = buildSemanticLayerSourceSearchText(source); + + expect(text).toContain('usage: Analysts inspect paid and refunded order lifecycle trends by customer segment.'); + expect(text).toContain('frequency: high'); + expect(text).toContain('commonly filtered by: status, created_at'); + expect(text).toContain('commonly grouped by: customer_segment'); + expect(text).toContain('commonly joined to public.customers on customer_id'); + expect(text).toContain('stale since 2026-05-01T00:00:00.000Z'); + }); +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/sl-search.service.test.ts +``` + +Expected: FAIL because the search text does not contain `usage: Analysts inspect paid and refunded order lifecycle trends by customer segment.` + +- [ ] **Step 3: Add usage fields to the canonical search text** + +In `packages/context/src/sl/sl-search.service.ts`, insert 
this block after the existing `freshness` block and before `return parts.join('. ');`: + +```typescript + if (source.usage) { + const usage = source.usage; + parts.push(`usage: ${usage.narrative}`); + parts.push(`frequency: ${usage.frequencyTier}`); + if (usage.commonFilters.length > 0) { + parts.push(`commonly filtered by: ${usage.commonFilters.join(', ')}`); + } + if (usage.commonGroupBys?.length) { + parts.push(`commonly grouped by: ${usage.commonGroupBys.join(', ')}`); + } + for (const join of usage.commonJoins) { + parts.push(`commonly joined to ${join.table} on ${join.on.join(',')}`); + } + if (usage.staleSince) { + parts.push(`stale since ${usage.staleSince}`); + } + } +``` + +- [ ] **Step 4: Run the search-text test to verify it passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/sl-search.service.test.ts +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add packages/context/src/sl/sl-search.service.ts packages/context/src/sl/sl-search.service.test.ts +git commit -m "feat: index historic sql usage in sl search text" +``` + +## Task 2: Return SQLite FTS Snippets From SL Search + +**Files:** +- Modify: `packages/context/src/sl/ports.ts` +- Modify: `packages/context/src/sl/sqlite-sl-sources-index.ts` +- Modify: `packages/context/src/sl/sqlite-sl-sources-index.test.ts` +- Modify: `packages/context/src/sl/sl-search.service.ts` +- Modify: `packages/context/src/sl/sl-search.service.test.ts` + +- [ ] **Step 1: Write failing SQLite snippet assertions** + +Replace the existing `creates SQLite tables and searches indexed source text` test in `packages/context/src/sl/sqlite-sl-sources-index.test.ts` with: + +```typescript + it('creates SQLite tables and searches indexed source text with FTS snippets', async () => { + const index = new SqliteSlSourcesIndex({ dbPath }); + + await index.upsertSources('warehouse', [ + { + sourceName: 'orders', + searchText: 'orders table: public.orders measure: total_revenue sum(revenue) gross 
revenue', + embedding: null, + }, + { + sourceName: 'tickets', + searchText: 'tickets table: public.tickets measure: ticket_count count(*) support queue', + embedding: null, + }, + ]); + + await expect(access(dbPath)).resolves.toBeUndefined(); + + const directResults = await index.search('warehouse', null, 'gross revenue', 10); + expect(directResults).toEqual([ + expect.objectContaining({ + sourceName: 'orders', + rrfScore: expect.any(Number), + snippet: expect.stringContaining(''), + }), + ]); + expect(directResults[0]?.snippet).toContain('revenue'); + + const lexicalCandidates = await index.searchLexicalCandidates({ queryText: 'gross revenue', limit: 10 }); + expect(lexicalCandidates).toEqual([ + expect.objectContaining({ + id: 'warehouse/orders', + connectionId: 'warehouse', + sourceName: 'orders', + snippet: expect.stringContaining(''), + }), + ]); + }); +``` + +- [ ] **Step 2: Write the failing direct service snippet test** + +Add this test at the end of `packages/context/src/sl/sl-search.service.test.ts`: + +```typescript + it('preserves FTS snippets returned by the source index', async () => { + const service = new SlSearchService( + { + maxBatchSize: 16, + computeEmbedding: vi.fn(async () => [1, 0]), + computeEmbeddingsBulk: vi.fn(), + }, + { + upsertSources: vi.fn(), + getExistingSearchTexts: vi.fn(), + deleteStale: vi.fn(), + deleteByConnection: vi.fn(), + deleteByConnectionAndName: vi.fn(), + search: vi.fn(async () => [ + { + sourceName: 'orders', + rrfScore: 0.75, + snippet: 'usage: paid order lifecycle', + }, + ]), + }, + ); + + await expect(service.search('warehouse', 'order lifecycle', 10)).resolves.toEqual([ + { + sourceName: 'orders', + score: 0.75, + snippet: 'usage: paid order lifecycle', + }, + ]); + }); +``` + +- [ ] **Step 3: Run the snippet tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/sqlite-sl-sources-index.test.ts src/sl/sl-search.service.test.ts +``` + +Expected: FAIL because `snippet` 
is missing from SQLite search rows and `SlSearchService.search()` drops repository snippets. + +- [ ] **Step 4: Extend the index port result type** + +In `packages/context/src/sl/ports.ts`, replace the `search()` return type in `SlSourcesIndexPort` with: + +```typescript + search( + connectionId: string, + queryEmbedding: number[] | null, + queryText: string, + limit: number, + minRrfScore?: number, + ): Promise<Array<{ sourceName: string; rrfScore: number; snippet?: string }>>; +``` + +- [ ] **Step 5: Add snippet fields and SQL selection in the SQLite index** + +In `packages/context/src/sl/sqlite-sl-sources-index.ts`, replace the `SearchRow` type with: + +```typescript +type SearchRow = { + connection_id?: string; + source_name: string; + rank: number; + snippet?: string | null; +}; +``` + +In the `SlSqliteLaneCandidate` interface, add the optional snippet property: + +```typescript +export interface SlSqliteLaneCandidate { + id: string; + connectionId: string; + sourceName: string; + rank: number; + rawScore: number; + snippet?: string; +} +``` + +In `searchLexicalCandidates()`, replace the SELECT list with: + +```sql + SELECT + connection_id, + source_name, + bm25(local_sl_sources_fts) AS rank, + snippet(local_sl_sources_fts, 2, '', '', '...', 12) AS snippet + FROM local_sl_sources_fts +``` + +Then replace the returned row mapping in `searchLexicalCandidates()` with: + +```typescript + return rows.map((row, index) => ({ + id: candidateId(row.connection_id, row.source_name), + connectionId: row.connection_id, + sourceName: row.source_name, + rank: index + 1, + rawScore: Number(row.rank), + ...(typeof row.snippet === 'string' && row.snippet.length > 0 ? 
{ snippet: row.snippet } : {}), + })); +``` + +In the direct `search()` method, replace the SELECT list with: + +```sql + SELECT + source_name, + bm25(local_sl_sources_fts) AS rank, + snippet(local_sl_sources_fts, 2, '', '', '...', 12) AS snippet + FROM local_sl_sources_fts +``` + +Then replace the direct `search()` return mapping with: + +```typescript + return rows + .map((row) => ({ + sourceName: row.source_name, + rrfScore: scoreFromRank(row.rank), + ...(typeof row.snippet === 'string' && row.snippet.length > 0 ? { snippet: row.snippet } : {}), + })) + .filter((row) => row.rrfScore >= minRrfScore); +``` + +- [ ] **Step 6: Preserve snippets in direct `SlSearchService.search()` results** + +In `packages/context/src/sl/sl-search.service.ts`, replace the `search()` method signature and final return with: + +```typescript + async search( + connectionId: string, + query: string, + limit = 15, + minRrfScore = 0, + ): Promise<Array<{ sourceName: string; score: number; snippet?: string }>> { + let queryEmbedding: number[] | null = null; + try { + queryEmbedding = await this.embeddingService.computeEmbedding(query); + } catch (error) { + this.logger.warn( + `Failed to compute query embedding, falling back to FTS + trigram: ${error instanceof Error ? error.message : String(error)}`, + ); + } + + const results = await this.slSourcesRepository.search(connectionId, queryEmbedding, query, limit, minRrfScore); + return results.map((result) => ({ + sourceName: result.sourceName, + score: result.rrfScore, + ...(result.snippet ? { snippet: result.snippet } : {}), + })); + } +``` + +- [ ] **Step 7: Run the snippet tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/sqlite-sl-sources-index.test.ts src/sl/sl-search.service.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 8: Commit** + +```bash +git add packages/context/src/sl/ports.ts packages/context/src/sl/sqlite-sl-sources-index.ts packages/context/src/sl/sqlite-sl-sources-index.test.ts packages/context/src/sl/sl-search.service.ts packages/context/src/sl/sl-search.service.test.ts +git commit -m "feat: return sl search snippets" +``` + +## Task 3: Hydrate Query-Mode SL Results With Frequency And Snippet + +**Files:** +- Modify: `packages/context/src/sl/local-sl.ts` +- Modify: `packages/context/src/sl/local-sl.test.ts` +- Modify: `packages/context/src/sl/pglite-sl-search-prototype.ts` + +- [ ] **Step 1: Write the failing local search hydration test** + +Add this test after `searches local semantic-layer source text through SQLite FTS` in `packages/context/src/sl/local-sl.test.ts`: + +```typescript + it('searches historic SQL usage and returns frequency tier plus FTS snippet', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml', + `tables: + orders: + table: public.orders + usage: + narrative: Analysts inspect paid order lifecycle by customer segment. 
+ frequencyTier: high + commonFilters: + - status + - created_at + commonGroupBys: + - customer_segment + commonJoins: + - table: public.customers + on: + - customer_id + columns: + - name: order_id + type: string + - name: status + type: string +`, + 'ktx', + 'ktx@example.com', + 'Add usage-backed manifest shard', + ); + + const results = await searchLocalSlSources(project, { + connectionId: 'warehouse', + query: 'paid lifecycle customer segment', + }); + + expect(results).toEqual([ + expect.objectContaining({ + connectionId: 'warehouse', + name: 'orders', + path: 'semantic-layer/warehouse/_schema/public.yaml#orders', + frequencyTier: 'high', + snippet: expect.stringContaining(''), + matchReasons: expect.arrayContaining(['lexical']), + }), + ]); + expect(results[0]?.snippet).toContain('lifecycle'); + }); +``` + +- [ ] **Step 2: Run the local search test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/local-sl.test.ts +``` + +Expected: FAIL because the query cannot match usage text yet if Task 1 is not present, and because `frequencyTier` and `snippet` are not hydrated into `LocalSlSourceSearchResult`. + +- [ ] **Step 3: Extend the local search result type** + +In `packages/context/src/sl/local-sl.ts`, replace the `LocalSlSourceSearchResult` interface with: + +```typescript +export interface LocalSlSourceSearchResult extends LocalSlSourceSummary { + score: number; + frequencyTier?: NonNullable<SemanticLayerSource['usage']>['frequencyTier']; + snippet?: string; + matchReasons?: SlSearchMatchReason[]; + dictionaryMatches?: SlDictionaryMatch[]; + lanes?: SlSearchLaneSummary[]; +} +``` + +Then add this helper after `candidateKey()`: + +```typescript +function searchResultUsageFields(source: SemanticLayerSource): Pick<LocalSlSourceSearchResult, 'frequencyTier'> { + return source.usage?.frequencyTier ? 
{ frequencyTier: source.usage.frequencyTier } : {}; +} +``` + +- [ ] **Step 4: Include frequency tier in the non-SQLite token fallback** + +In `searchLocalSlSources()`, inside the `project.config.storage.search !== 'sqlite-fts5'` branch, replace the final mapped object with: + +```typescript + .map((result) => ({ + ...result.candidate.summary, + score: result.score, + matchReasons: ['token'], + ...searchResultUsageFields(result.candidate.source), + })) +``` + +- [ ] **Step 5: Collect lexical snippets during hybrid search** + +In `searchLocalSlSources()`, after `const dictionaryEvidence = new Map();`, add: + +```typescript + const lexicalSnippets = new Map(); +``` + +Inside the lexical generator, immediately after `const rows = await index.searchLexicalCandidates({ ... });`, add: + +```typescript + for (const row of rows) { + if (row.snippet) { + lexicalSnippets.set(row.id, row.snippet); + } + } +``` + +- [ ] **Step 6: Hydrate frequency tier and snippet in SQLite hybrid results** + +In the final hydration loop in `searchLocalSlSources()`, replace the `hydrated.push({ ... })` block with: + +```typescript + const dictionaryMatches = dictionaryEvidence.get(fused.id); + const snippet = lexicalSnippets.get(fused.id); + hydrated.push({ + ...candidate.summary, + score: fused.score, + ...searchResultUsageFields(candidate.source), + ...(snippet ? { snippet } : {}), + matchReasons: fused.matchReasons as SlSearchMatchReason[], + ...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}), + lanes: result.lanes, + }); +``` + +- [ ] **Step 7: Propagate frequency tier in the PGlite prototype backend** + +In `packages/context/src/sl/pglite-sl-search-prototype.ts`, inside the final hydration loop, replace the `hydrated.push({ ... 
})` block with: + +```typescript + const dictionaryMatches = dictionaryEvidence.get(result.id); + const frequencyTier = candidate.source.usage?.frequencyTier; + hydrated.push({ + ...candidate.summary, + score: result.score, + ...(frequencyTier ? { frequencyTier } : {}), + matchReasons: result.matchReasons as SlSearchMatchReason[], + ...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}), + lanes: fused.lanes, + }); +``` + +- [ ] **Step 8: Run the local search test to verify it passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/local-sl.test.ts +``` + +Expected: PASS. + +- [ ] **Step 9: Commit** + +```bash +git add packages/context/src/sl/local-sl.ts packages/context/src/sl/local-sl.test.ts packages/context/src/sl/pglite-sl-search-prototype.ts +git commit -m "feat: hydrate sl search usage metadata" +``` + +## Task 4: Expose Frequency And Snippet Through Agent/MCP SL List + +**Files:** +- Modify: `packages/context/src/mcp/types.ts` +- Modify: `packages/context/src/mcp/local-project-ports.ts` +- Modify: `packages/context/src/mcp/local-project-ports.test.ts` + +- [ ] **Step 1: Write the failing agent-facing list test** + +Add this test after `returns semantic-layer hybrid search metadata through local project ports` in `packages/context/src/mcp/local-project-ports.test.ts`: + +```typescript + it('returns historic SQL usage frequency and snippet through semantic-layer list search', async () => { + const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml', + `tables: + orders: + table: public.orders + usage: + narrative: Analysts inspect paid order lifecycle by customer segment. 
+ frequencyTier: high + commonFilters: + - status + commonGroupBys: + - customer_segment + commonJoins: + - table: public.customers + on: + - customer_id + columns: + - name: order_id + type: string + - name: status + type: string +`, + 'ktx', + 'ktx@example.com', + 'Seed usage-backed manifest shard', + ); + + const ports = createLocalProjectMcpContextPorts(project); + await expect( + ports.semanticLayer?.listSources({ connectionId: 'warehouse', query: 'paid order lifecycle' }), + ).resolves.toEqual({ + sources: [ + expect.objectContaining({ + connectionId: 'warehouse', + connectionName: 'warehouse', + name: 'orders', + frequencyTier: 'high', + snippet: expect.stringContaining(''), + score: expect.any(Number), + matchReasons: expect.arrayContaining(['lexical']), + }), + ], + totalSources: 1, + }); + }); +``` + +- [ ] **Step 2: Run the local project ports test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts +``` + +Expected: FAIL because `frequencyTier` and `snippet` are missing from `semanticLayer.listSources()` responses. + +- [ ] **Step 3: Add fields to the MCP summary type** + +In `packages/context/src/mcp/types.ts`, replace the ingest import with: + +```typescript +import type { IngestReportSnapshot, MemoryFlowReplayInput, TableUsageOutput } from '../ingest/index.js'; +``` + +Then add these optional fields to `KtxSemanticLayerSourceSummary` after `joinCount`: + +```typescript + frequencyTier?: TableUsageOutput['frequencyTier']; + snippet?: string; +``` + +- [ ] **Step 4: Pass fields through local project ports** + +In `packages/context/src/mcp/local-project-ports.ts`, inside the object built in `semanticLayer.listSources()`, add these two spread lines after `joinCount: source.joinCount,`: + +```typescript + ...(hasSlSearchMetadata(source) && source.frequencyTier ? { frequencyTier: source.frequencyTier } : {}), + ...(hasSlSearchMetadata(source) && source.snippet ? 
{ snippet: source.snippet } : {}), +``` + +- [ ] **Step 5: Run the agent-facing list test to verify it passes** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts +``` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +git add packages/context/src/mcp/types.ts packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts +git commit -m "feat: expose sl search usage snippets" +``` + +## Task 5: Final Verification + +**Files:** +- Verify: `packages/context/src/sl/sl-search.service.ts` +- Verify: `packages/context/src/sl/sqlite-sl-sources-index.ts` +- Verify: `packages/context/src/sl/local-sl.ts` +- Verify: `packages/context/src/mcp/local-project-ports.ts` + +- [ ] **Step 1: Run all focused tests from this plan** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/sl-search.service.test.ts src/sl/sqlite-sl-sources-index.test.ts src/sl/local-sl.test.ts src/mcp/local-project-ports.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run the context type check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Confirm the adapter rewrite is still untouched** + +Run: + +```bash +git diff -- packages/context/src/ingest/adapters/historic-sql/stage.ts packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts +``` + +Expected: no diff output. 
+ +- [ ] **Step 4: Confirm no placeholder text remains in the plan** + +Run: + +```bash +node - <<'NODE' +import { readFileSync } from 'node:fs'; + +const path = 'docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md'; +const text = readFileSync(path, 'utf8'); +const redFlags = [ + 'T' + 'BD', + 'TO' + 'DO', + 'implement ' + 'later', + 'fill in ' + 'details', + 'Add appropriate ' + 'error handling', + 'add ' + 'validation', + 'handle edge ' + 'cases', + 'Write tests for ' + 'the above', + 'Similar to ' + 'Task', +]; + +let failed = false; +for (const flag of redFlags) { + if (text.includes(flag)) { + console.error(`${path}: contains red-flag placeholder text: ${flag}`); + failed = true; + } +} +process.exit(failed ? 1 : 0); +NODE +``` + +Expected: exits 0 with no output. + +- [ ] **Step 5: Commit verification notes if a verification-only edit was needed** + +If Step 1 or Step 2 required a code correction, commit only those corrected files: + +```bash +git status --short +git add packages/context/src/sl/sl-search.service.ts packages/context/src/sl/sl-search.service.test.ts packages/context/src/sl/ports.ts packages/context/src/sl/sqlite-sl-sources-index.ts packages/context/src/sl/sqlite-sl-sources-index.test.ts packages/context/src/sl/local-sl.ts packages/context/src/sl/local-sl.test.ts packages/context/src/sl/pglite-sl-search-prototype.ts packages/context/src/mcp/types.ts packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts +git commit -m "test: verify historic sql search enrichment" +``` + +If Step 1 and Step 2 pass without changes, skip this commit. + +## Self-Review + +Spec coverage: + +- Spec §6.2.3 is covered by Task 1: usage fields are included in `buildSemanticLayerSourceSearchText()`. +- Spec §6.2.4 is already covered by the foundation behavior in `SlSearchService.indexSources()`, which compares search text before re-embedding; Task 1 makes usage changes part of that search-text drift. 
+- Spec §6.2.5 is covered by Tasks 2-4: SQLite FTS snippets are selected and exposed through query-mode list results, and `frequencyTier` is hydrated from the source. +- Spec §7 search-hit tier is covered by Tasks 3-4: query-mode results carry name, table summary counts, description, score, frequency tier, and snippet. Full `usage` remains available through source read because the foundation plan added `SemanticLayerSource.usage`. + +Placeholder scan: + +- This plan contains no deferred implementation markers or unspecified code steps. + +Type consistency: + +- `frequencyTier` uses `TableUsageOutput['frequencyTier']` at the MCP boundary and `NonNullable<SemanticLayerSource['usage']>['frequencyTier']` in local SL search results. +- `snippet` is consistently optional because lexical FTS may not contribute to every hybrid result. diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md b/docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md new file mode 100644 index 00000000..a892542e --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md @@ -0,0 +1,1890 @@ +# Historic SQL Skills Projection Cutover Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Cut the production `historic-sql` adapter over to the unified staged shape, add the two replacement LLM skills, project their evidence into `_schema` usage and pattern wiki pages, and delete the legacy per-template code path. + +**Architecture:** The deterministic fetch/chunk hot path is already present and remains LLM-free. 
WorkUnit skills emit typed evidence through a source-specific tool into ignored run-local files; a deterministic ingest post-processor reads those evidence files before the squash commit and writes `_schema` usage plus `knowledge/global/historic-sql/*.md` pattern pages. The existing `onPullSucceeded()` hook runs after the squash commit in this repo, so projection uses `IngestBundlePostProcessorPort`, which is the current pre-squash deterministic import hook. + +**Tech Stack:** TypeScript ESM/NodeNext, zod 4, Vitest, YAML, existing ingest WorkUnit runner, existing semantic-layer and wiki file layouts. + +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans found that are based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md` + +Implemented status verified in this worktree: + +- `2026-05-11-historic-sql-foundations.md` is implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `SqlAnalysisPort.analyzeBatch()` in `packages/context/src/sql-analysis/ports.ts`, `/sql/analyze-batch` in `python/ktx-daemon/src/ktx_daemon/app.py`, `SemanticLayerSource.usage` in `packages/context/src/sl/types.ts`, and `mergeUsagePreservingExternal()` in `packages/context/src/ingest/adapters/live-database/manifest.ts`. +- `2026-05-11-historic-sql-search-enrichment.md` is implemented. Evidence: `packages/context/src/sl/sl-search.service.ts` indexes `source.usage`, `packages/context/src/sl/sqlite-sl-sources-index.ts` selects FTS snippets, and local/MCP list surfaces expose `frequencyTier` and `snippet`. +- `2026-05-11-historic-sql-unified-hot-path.md` is implemented as helper code. 
Evidence: `stageHistoricSqlAggregatedSnapshot()`, `chunkHistoricSqlUnifiedStagedDir()`, `PostgresPgssReader`, aggregate BigQuery/Snowflake reader methods, unified schemas, and exports exist. + +Verification already run before writing this plan: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/skill-schemas.test.ts src/sl/semantic-layer.service.test.ts src/ingest/adapters/live-database/manifest.test.ts src/scan/local-enrichment-artifacts.test.ts src/sql-analysis/http-sql-analysis-port.test.ts src/sl/sl-search.service.test.ts src/sl/sqlite-sl-sources-index.test.ts src/sl/local-sl.test.ts src/mcp/local-project-ports.test.ts src/ingest/adapters/historic-sql/types.test.ts src/ingest/adapters/historic-sql/buckets.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts src/ingest/adapters/historic-sql/chunk-unified.test.ts src/package-exports.test.ts +``` + +Expected and observed: 17 files passed, 119 tests passed. + +```bash +source .venv/bin/activate && python -m pytest python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py -q +``` + +Expected and observed: 20 passed. + +Still not implemented: + +- `HistoricSqlSourceAdapter` still calls `stagePgStatStatementsTemplates()` or `stageHistoricSqlTemplates()` and advertises `historic_sql_ingest` / `historic_sql_curator`. +- Old skills still exist: `packages/context/skills/historic_sql_ingest/SKILL.md` and `packages/context/skills/historic_sql_curator/SKILL.md`. +- Old template staging and PGSS baseline files still exist: `stage.ts`, `stage-pgss.ts`, `chunk.ts`, `postgres-pgss-query-history-reader.ts`, related tests/fixtures. +- CLI doctor/setup code still imports `PostgresPgssQueryHistoryReader`. 
+- Runtime asset tests and page-triage prompts still mention `historic_sql_template`, `historic_sql_ingest`, and `historic_sql_curator`. + +## File Structure + +Create: + +- `packages/context/src/ingest/adapters/historic-sql/evidence.ts` + Owns typed evidence envelopes, ignored evidence path helpers, and load/write helpers for table usage and pattern evidence. +- `packages/context/src/ingest/adapters/historic-sql/evidence.test.ts` + Tests evidence schema validation, path normalization, and loader rejection of malformed evidence. +- `packages/context/src/ingest/adapters/historic-sql/evidence-tool.ts` + Adds `emit_historic_sql_evidence`, the only write tool the two new historic-SQL skills use. +- `packages/context/src/ingest/adapters/historic-sql/evidence-tool.test.ts` + Tests the tool writes ignored run-local JSON with `skipLock: true` and rejects non-historic ingest sessions. +- `packages/context/src/ingest/adapters/historic-sql/projection.ts` + Projects table usage evidence into manifest shards, writes pattern wiki pages, marks stale usage/pages, and deletes legacy query pages. +- `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` + Tests `_schema` merge, stale usage, pattern slug reuse, stale page tagging, archive movement, and legacy page cleanup. +- `packages/context/src/ingest/adapters/historic-sql/post-processor.ts` + Implements `IngestBundlePostProcessorPort` for the deterministic projection phase. +- `packages/context/src/ingest/adapters/historic-sql/post-processor.test.ts` + Tests post-processor path resolution from `workdir`, `connectionId`, `sourceKey`, and `syncId`. +- `packages/context/skills/historic_sql_table_digest/SKILL.md` + Skill for one changed `tables/*.json` WorkUnit; emits one table usage evidence object. +- `packages/context/skills/historic_sql_patterns/SKILL.md` + Skill for `patterns-input.json`; emits one pattern evidence object per recurring cross-table intent. 
+ +Modify: + +- `packages/context/src/ingest/adapters/historic-sql/types.ts` + Keep only unified config/staged schemas and reader contracts; extend config preprocessing for existing `serviceAccountUserPatterns` and `minCalls` aliases. +- `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` + Add `staleArchiveAfterDays` to `manifest.json` so projection can archive stale pattern pages deterministically. +- `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts` + Keep the same WorkUnits, but mention `emit_historic_sql_evidence` in `notes`. +- `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts` + Switch production fetch/chunk/scope to the unified hot path, replace skills, remove legacy triage support, and run legacy PGSS baseline cache cleanup. +- `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts` + Rewrite around unified staging and new skills. +- `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts` + Inline the PGSS probe logic so `postgres-pgss-query-history-reader.ts` can be deleted. +- `packages/context/src/ingest/local-adapters.ts` + Use `PostgresPgssReader` for local Postgres historic SQL and return unified pull config. +- `packages/context/src/ingest/local-bundle-runtime.ts` + Add the source-specific evidence tool to historic-SQL WorkUnits and register the historic-SQL post-processor. +- `packages/context/src/ingest/ingest-runtime-assets.test.ts` + Replace old skill asset assertions with the two new skills. +- `packages/context/src/memory/memory-runtime-assets.test.ts` + Replace old historic-SQL skill heading with the two new skill headings. +- `packages/context/src/package-exports.test.ts` + Remove legacy export assertions and add evidence/projection export assertions. +- `packages/context/src/ingest/index.ts` + Export new evidence/projection/post-processor helpers and remove legacy historic-SQL exports. 
+- `packages/cli/src/setup-databases.ts` and `packages/cli/src/historic-sql-doctor.ts` + Import `PostgresPgssReader` instead of `PostgresPgssQueryHistoryReader`. +- `packages/cli/src/commands/setup-commands.ts`, `packages/cli/src/index.test.ts`, `packages/cli/src/setup-databases.test.ts` + Rename generated config to `minExecutions` while accepting the old `--historic-sql-min-calls` flag for one release. +- `packages/context/prompts/skills/page_triage_classifier.md`, `packages/context/src/ingest/page-triage/page-triage.service.test.ts`, `packages/context/src/ingest/ingest-prompts.test.ts` + Remove historic-SQL template triage examples because the new adapter no longer uses page triage. + +Delete: + +- `packages/context/src/ingest/adapters/historic-sql/stage.ts` +- `packages/context/src/ingest/adapters/historic-sql/stage.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts` +- `packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/` +- `packages/context/src/ingest/adapters/historic-sql/chunk.ts` +- `packages/context/src/ingest/adapters/historic-sql/chunk.test.ts` +- `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.ts` +- `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.test.ts` +- `packages/context/skills/historic_sql_ingest/SKILL.md` +- `packages/context/skills/historic_sql_curator/SKILL.md` + +## Task 1: Add Typed Historic-SQL Evidence Emission + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/evidence.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/evidence.test.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/evidence-tool.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/evidence-tool.test.ts` +- Modify: 
`packages/context/src/ingest/index.ts` +- Modify: `packages/context/src/package-exports.test.ts` + +- [ ] **Step 1: Write failing evidence schema tests** + +Create `packages/context/src/ingest/adapters/historic-sql/evidence.test.ts`: + +```typescript +import { describe, expect, it } from 'vitest'; +import { + historicSqlEvidenceEnvelopeSchema, + historicSqlEvidencePath, + historicSqlTableUsageEvidenceSchema, +} from './evidence.js'; + +describe('historic-sql evidence contracts', () => { + it('validates table usage evidence emitted by table digest WorkUnits', () => { + const parsed = historicSqlTableUsageEvidenceSchema.parse({ + kind: 'table_usage', + connectionId: 'warehouse', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried for paid/refunded lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + }, + }); + + expect(parsed.table).toBe('public.orders'); + expect(parsed.usage.frequencyTier).toBe('high'); + }); + + it('validates pattern evidence emitted by the patterns WorkUnit', () => { + const parsed = historicSqlEvidenceEnvelopeSchema.parse({ + kind: 'pattern', + connectionId: 'warehouse', + rawPath: 'patterns-input.json', + pattern: { + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Analysts compare order status changes by customer segment.', + definitionSql: 'select status, count(*) from public.orders group by status', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:1', 'pg:2'], + }, + }); + + expect(parsed.kind).toBe('pattern'); + expect(parsed.pattern.slug).toBe('order-lifecycle-analysis'); + }); + + it('builds a stable ignored evidence path from run and WorkUnit identity', () => { + expect(historicSqlEvidencePath('run-1', 
'historic-sql-table-public-orders')).toBe( + '.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json', + ); + }); +}); +``` + +- [ ] **Step 2: Run the schema tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/evidence.test.ts +``` + +Expected: FAIL with an import error for `./evidence.js`. + +- [ ] **Step 3: Add evidence schemas and path helpers** + +Create `packages/context/src/ingest/adapters/historic-sql/evidence.ts`: + +```typescript +import { z } from 'zod'; +import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js'; + +function safeEvidenceSegment(value: string): string { + const segment = value.replace(/[^a-zA-Z0-9._-]+/g, '-').replace(/^-+|-+$/g, ''); + if (!segment) { + throw new Error(`Invalid historic-SQL evidence path segment: ${value}`); + } + return segment; +} + +export const historicSqlTableUsageEvidenceSchema = z.object({ + kind: z.literal('table_usage'), + connectionId: z.string().min(1), + table: z.string().min(1), + rawPath: z.string().min(1), + usage: tableUsageOutputSchema, +}); +export type HistoricSqlTableUsageEvidence = z.infer<typeof historicSqlTableUsageEvidenceSchema>; + +export const historicSqlPatternEvidenceSchema = z.object({ + kind: z.literal('pattern'), + connectionId: z.string().min(1), + rawPath: z.string().min(1), + pattern: patternOutputSchema, +}); +export type HistoricSqlPatternEvidence = z.infer<typeof historicSqlPatternEvidenceSchema>; + +export const historicSqlEvidenceEnvelopeSchema = z.discriminatedUnion('kind', [ + historicSqlTableUsageEvidenceSchema, + historicSqlPatternEvidenceSchema, +]); +export type HistoricSqlEvidenceEnvelope = z.infer<typeof historicSqlEvidenceEnvelopeSchema>; + +export function historicSqlEvidencePath(runId: string, unitKey: string): string { + return `.ktx/ingest-evidence/historic-sql/${safeEvidenceSegment(runId)}/${safeEvidenceSegment(unitKey)}.json`; +} + +export function serializeHistoricSqlEvidence(evidence: HistoricSqlEvidenceEnvelope): string { + return 
`${JSON.stringify(historicSqlEvidenceEnvelopeSchema.parse(evidence), null, 2)}\n`; +} +``` + +- [ ] **Step 4: Write failing tool tests** + +Create `packages/context/src/ingest/adapters/historic-sql/evidence-tool.test.ts`: + +```typescript +import { describe, expect, it, vi } from 'vitest'; +import { createEmitHistoricSqlEvidenceTool } from './evidence-tool.js'; + +describe('emit_historic_sql_evidence tool', () => { + it('writes table usage evidence to the ignored run evidence directory', async () => { + const writeFile = vi.fn(async () => ({ success: true, commitHash: null })); + const tool = createEmitHistoricSqlEvidenceTool(); + + const result = await tool.execute!( + { + kind: 'table_usage', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried by paid status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [], + staleSince: null, + }, + }, + { + toolCallId: 'call-1', + messages: [], + abortSignal: new AbortController().signal, + experimental_context: { + connectionId: 'warehouse', + session: { + ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'historic-sql' }, + configService: { writeFile }, + }, + }, + } as never, + ); + + expect(result).toBe('Recorded historic-SQL table_usage evidence for public.orders.'); + expect(writeFile).toHaveBeenCalledWith( + '.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json', + expect.stringContaining('"kind": "table_usage"'), + 'System User', + 'system@example.com', + 'Record historic-SQL evidence: historic-sql-table-public-orders', + { skipLock: true }, + ); + }); + + it('rejects non-historic ingest sessions', async () => { + const tool = createEmitHistoricSqlEvidenceTool(); + + await expect( + tool.execute!( + { + kind: 'pattern', + rawPath: 'patterns-input.json', + pattern: { + slug: 'orders', + title: 'Orders', + narrative: 'Orders pattern.', + definitionSql: 'select * from public.orders', 
+ tablesInvolved: ['public.orders'], + slRefs: ['orders'], + constituentTemplateIds: ['pg:1'], + }, + }, + { + toolCallId: 'call-1', + messages: [], + abortSignal: new AbortController().signal, + experimental_context: { + connectionId: 'warehouse', + session: { + ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'notion' }, + configService: { writeFile: vi.fn() }, + }, + }, + } as never, + ), + ).resolves.toContain('Error: emit_historic_sql_evidence is only available during historic-sql ingest'); + }); +}); +``` + +- [ ] **Step 5: Run the tool tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/evidence-tool.test.ts +``` + +Expected: FAIL with an import error for `./evidence-tool.js`. + +- [ ] **Step 6: Add the evidence tool** + +Create `packages/context/src/ingest/adapters/historic-sql/evidence-tool.ts`: + +```typescript +import { tool } from 'ai'; +import { z } from 'zod'; +import { historicSqlEvidencePath, serializeHistoricSqlEvidence } from './evidence.js'; +import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js'; + +const SYSTEM_AUTHOR = 'System User'; +const SYSTEM_EMAIL = 'system@example.com'; + +function unitKeyForEvidence(input: { kind: string; table?: string; pattern?: { slug: string } }): string { + if (input.kind === 'table_usage') { + return `historic-sql-table-${String(input.table).replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '')}`; + } + return `historic-sql-pattern-${String(input.pattern?.slug).replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '')}`; +} + +export function createEmitHistoricSqlEvidenceTool() { + return tool({ + description: + 'Record typed historic-SQL evidence for deterministic projection. 
Use this instead of wiki_write, sl_write_source, sl_edit_source, or context_candidate_write during historic-SQL WorkUnits.', + inputSchema: z.discriminatedUnion('kind', [ + z.object({ + kind: z.literal('table_usage'), + table: z.string().min(1), + rawPath: z.string().min(1), + usage: tableUsageOutputSchema, + }), + z.object({ + kind: z.literal('pattern'), + rawPath: z.string().min(1), + pattern: patternOutputSchema, + }), + ]), + execute: async (input, options): Promise<string> => { + const context = options.experimental_context as + | { + connectionId?: string | null; + session?: { + ingest?: { runId: string; sourceKey: string }; + configService?: { + writeFile( + path: string, + content: string, + author: string, + authorEmail: string, + commitMessage: string, + options?: { skipLock?: boolean }, + ): Promise<unknown>; + }; + }; + } + | undefined; + const ingest = context?.session?.ingest; + const configService = context?.session?.configService; + if (!ingest || ingest.sourceKey !== 'historic-sql' || !configService || !context?.connectionId) { + return 'Error: emit_historic_sql_evidence is only available during historic-sql ingest.'; + } + + const unitKey = unitKeyForEvidence(input); + const content = serializeHistoricSqlEvidence({ ...input, connectionId: context.connectionId }); + await configService.writeFile( + historicSqlEvidencePath(ingest.runId, unitKey), + content, + SYSTEM_AUTHOR, + SYSTEM_EMAIL, + `Record historic-SQL evidence: ${unitKey}`, + { skipLock: true }, + ); + const label = input.kind === 'table_usage' ? 
input.table : input.pattern.slug; + return `Recorded historic-SQL ${input.kind} evidence for ${label}.`; + }, + }); +} +``` + +- [ ] **Step 7: Export evidence helpers and verify tests pass** + +Add these exports to `packages/context/src/ingest/index.ts`: + +```typescript +export { + historicSqlEvidenceEnvelopeSchema, + historicSqlEvidencePath, + historicSqlPatternEvidenceSchema, + historicSqlTableUsageEvidenceSchema, + serializeHistoricSqlEvidence, +} from './adapters/historic-sql/evidence.js'; +export type { + HistoricSqlEvidenceEnvelope, + HistoricSqlPatternEvidence, + HistoricSqlTableUsageEvidence, +} from './adapters/historic-sql/evidence.js'; +export { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js'; +``` + +Add these assertions to the historic-SQL block in `packages/context/src/package-exports.test.ts`: + +```typescript + expect(ingest.historicSqlEvidenceEnvelopeSchema).toBeDefined(); + expect(ingest.historicSqlEvidencePath).toBeTypeOf('function'); + expect(ingest.createEmitHistoricSqlEvidenceTool).toBeTypeOf('function'); +``` + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/evidence.test.ts src/ingest/adapters/historic-sql/evidence-tool.test.ts src/package-exports.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 8: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/evidence.ts packages/context/src/ingest/adapters/historic-sql/evidence.test.ts packages/context/src/ingest/adapters/historic-sql/evidence-tool.ts packages/context/src/ingest/adapters/historic-sql/evidence-tool.test.ts packages/context/src/ingest/index.ts packages/context/src/package-exports.test.ts +git commit -m "feat: add historic sql evidence emission" +``` + +## Task 2: Add Replacement Historic-SQL Skills + +**Files:** +- Create: `packages/context/skills/historic_sql_table_digest/SKILL.md` +- Create: `packages/context/skills/historic_sql_patterns/SKILL.md` +- Modify: `packages/context/src/ingest/ingest-runtime-assets.test.ts` +- Modify: `packages/context/src/memory/memory-runtime-assets.test.ts` + +- [ ] **Step 1: Write failing runtime asset tests for the new skills** + +In `packages/context/src/ingest/ingest-runtime-assets.test.ts`, replace `historic_sql_ingest` with `historic_sql_table_digest` and `historic_sql_patterns` in `adapterSkillNames`, and remove `historic_sql_curator` from `adapterReconcileSkillNames`. 
+ +Replace the two historic-SQL skill tests with: + +```typescript + it('packages historic-SQL table digest guidance from KTX assets', async () => { + const registry = new SkillsRegistryService({ skillsDir }); + const skills = await registry.listSkills(['historic_sql_table_digest'], 'memory_agent'); + + expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_table_digest']); + + const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8'); + expect(body).toContain('# Historic SQL Table Digest'); + expect(body).toContain('tables/<schema>.<table>.json'); + expect(body).toContain('tableUsageOutputSchema'); + expect(body).toContain('emit_historic_sql_evidence'); + expect(body).toContain('Do not call `wiki_write`'); + expect(body).toContain('Do not call `sl_write_source`'); + expect(body).not.toMatch(forbiddenProductPattern()); + }); + + it('packages historic-SQL patterns guidance from KTX assets', async () => { + const registry = new SkillsRegistryService({ skillsDir }); + const skills = await registry.listSkills(['historic_sql_patterns'], 'memory_agent'); + + expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_patterns']); + + const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8'); + expect(body).toContain('# Historic SQL Patterns'); + expect(body).toContain('patterns-input.json'); + expect(body).toContain('patternsArraySchema'); + expect(body).toContain('emit_historic_sql_evidence'); + expect(body).toContain('cross-table'); + expect(body).not.toMatch(forbiddenProductPattern()); + }); +``` + +In `packages/context/src/memory/memory-runtime-assets.test.ts`, change `expectedAdapterSkillHeadings` to include: + +```typescript + historic_sql_patterns: '# Historic SQL Patterns', + historic_sql_table_digest: '# Historic SQL Table Digest', +``` + +and remove: + +```typescript + historic_sql_ingest: '# Historic SQL Ingest', +``` + +- [ ] **Step 2: Run runtime asset tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec 
vitest run src/ingest/ingest-runtime-assets.test.ts src/memory/memory-runtime-assets.test.ts +``` + +Expected: FAIL because `historic_sql_table_digest` and `historic_sql_patterns` skill directories do not exist yet. + +- [ ] **Step 3: Add the table digest skill** + +Create `packages/context/skills/historic_sql_table_digest/SKILL.md`: + +```markdown +--- +name: historic_sql_table_digest +description: Convert one changed historic-SQL table usage bucket into typed table usage evidence for deterministic _schema projection. +callers: [memory_agent] +--- + +# Historic SQL Table Digest + +Use this skill when the WorkUnit raw file is one `tables/<schema>.<table>.json` file from the `historic-sql` adapter. + +## Required Workflow + +1. Read the WorkUnit notes first. +2. Call `read_raw_file` for the single `tables/<schema>.<table>.json` raw file. +3. Read `manifest.json` only if the table JSON omits the dialect or the WorkUnit notes are unclear. +4. Produce one concise usage narrative for this table from the staged table JSON. +5. Call `emit_historic_sql_evidence` exactly once with `kind: "table_usage"`. +6. Stop after the evidence tool succeeds. + +## Evidence Shape + +Call `emit_historic_sql_evidence` with this shape: + +```json +{ + "kind": "table_usage", + "table": "public.orders", + "rawPath": "tables/public.orders.json", + "usage": { + "narrative": "Orders are repeatedly queried for paid/refunded lifecycle analysis and customer-level rollups.", + "frequencyTier": "high", + "commonFilters": ["status", "created_at"], + "commonGroupBys": ["status"], + "commonJoins": [{ "table": "public.customers", "on": ["customer_id"] }], + "staleSince": null + } +} +``` + +The `usage` object must match `tableUsageOutputSchema`. + +## Interpretation Rules + +- Treat `columnsByClause.where` as common filters. +- Treat `columnsByClause.groupBy` as common group-bys. +- Treat `observedJoins` as common joins. +- Use `stats.executionsBucket`, `stats.distinctUsersBucket`, and `stats.recencyBucket` to choose `frequencyTier`. 
+- Use `frequencyTier: "high"` only when executions and distinct users are both broad. +- Use `frequencyTier: "mid"` for repeated team usage that is not broad enough for high. +- Use `frequencyTier: "low"` for low-volume but present usage. +- Use `frequencyTier: "unused"` only when the table input explicitly says the table is stale or has no recent templates. +- Keep `narrative` short and concrete. + +## Boundaries + +- Do not call `wiki_write`. +- Do not call `sl_write_source`. +- Do not call `sl_edit_source`. +- Do not call `context_candidate_write`. +- Do not emit more than one table usage evidence object. +- Do not invent columns, joins, or tables that are absent from the staged JSON. +``` + +- [ ] **Step 4: Add the patterns skill** + +Create `packages/context/skills/historic_sql_patterns/SKILL.md`: + +```markdown +--- +name: historic_sql_patterns +description: Identify recurring cross-table historic-SQL analytical intents and emit typed pattern evidence for deterministic wiki projection. +callers: [memory_agent] +--- + +# Historic SQL Patterns + +Use this skill when the WorkUnit raw file is `patterns-input.json` from the `historic-sql` adapter. + +## Required Workflow + +1. Read the WorkUnit notes first. +2. Call `read_raw_file` for `patterns-input.json`. +3. Identify recurring analytical intents that span at least two tables and have repeated usage signal. +4. Emit one `pattern` evidence object per durable cross-table intent by calling `emit_historic_sql_evidence`. +5. Stop after all pattern evidence has been emitted. 
+ +## Evidence Shape + +Each call to `emit_historic_sql_evidence` must use this shape: + +```json +{ + "kind": "pattern", + "rawPath": "patterns-input.json", + "pattern": { + "slug": "order-lifecycle-analysis", + "title": "Order Lifecycle Analysis", + "narrative": "Analysts compare order statuses with customer segments to understand lifecycle movement.", + "definitionSql": "select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status", + "tablesInvolved": ["public.orders", "public.customers"], + "slRefs": ["orders", "customers"], + "constituentTemplateIds": ["pg:1", "pg:2"] + } +} +``` + +The `pattern` object must match `patternOutputSchema`; multiple calls together must form `patternsArraySchema`. + +## Pattern Selection Rules + +- Prefer patterns that involve two or more tables. +- Prefer templates with `executionsBucket` at least `10-100` and `distinctUsersBucket` above solo usage. +- Merge templates into one pattern only when the business intent is the same. +- Use a stable kebab-case slug based on intent, not a template id. +- Set `definitionSql` to the clearest representative SQL from a constituent template. +- Set `slRefs` to source names when the source name is obvious from table names; omit uncertain refs rather than guessing. + +## Boundaries + +- Do not call `wiki_write`. +- Do not call `sl_write_source`. +- Do not call `sl_edit_source`. +- Do not call `context_candidate_write`. +- Do not create single-table pattern pages. +- Do not copy credentials, tokens, user emails, or unredacted literals into evidence. +``` + +- [ ] **Step 5: Run runtime asset tests to verify they pass** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/ingest-runtime-assets.test.ts src/memory/memory-runtime-assets.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit** + +```bash +git add packages/context/skills/historic_sql_table_digest/SKILL.md packages/context/skills/historic_sql_patterns/SKILL.md packages/context/src/ingest/ingest-runtime-assets.test.ts packages/context/src/memory/memory-runtime-assets.test.ts +git commit -m "feat: add historic sql evidence skills" +``` + +## Task 3: Project Evidence Into _schema Usage And Pattern Wiki Pages + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/projection.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/projection.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/types.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` +- Modify: `packages/context/src/wiki/types.ts` +- Modify: `packages/context/src/ingest/index.ts` + +- [ ] **Step 1: Extend staged manifest with stale archive policy** + +In `packages/context/src/ingest/adapters/historic-sql/types.test.ts`, add `staleArchiveAfterDays: 90` to the manifest fixture and assert: + +```typescript + expect( + stagedManifestSchema.parse({ + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 2, + touchedTableCount: 1, + parseFailures: 1, + warnings: ['parse_failed:bad'], + probeWarnings: [], + staleArchiveAfterDays: 90, + }).staleArchiveAfterDays, + ).toBe(90); +``` + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/types.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: FAIL because `staleArchiveAfterDays` is not in `stagedManifestSchema` or written by staging. 
+ +- [ ] **Step 2: Implement staged manifest policy field** + +Add this field to `stagedManifestSchema` in `packages/context/src/ingest/adapters/historic-sql/types.ts`: + +```typescript + staleArchiveAfterDays: z.number().int().positive().default(90), +``` + +Add this property to the manifest object written by `stageHistoricSqlAggregatedSnapshot()` in `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts`: + +```typescript + staleArchiveAfterDays: config.staleArchiveAfterDays, +``` + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/types.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: PASS. + +- [ ] **Step 3: Write failing projection tests** + +Create `packages/context/src/ingest/adapters/historic-sql/projection.test.ts`: + +```typescript +import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import YAML from 'yaml'; +import { describe, expect, it } from 'vitest'; +import { projectHistoricSqlEvidence } from './projection.js'; + +async function tempWorkdir(): Promise<string> { + return mkdtemp(join(tmpdir(), 'historic-sql-projection-')); +} + +async function writeText(root: string, relPath: string, content: string): Promise<void> { + const target = join(root, relPath); + await mkdir(join(target, '..'), { recursive: true }); + await writeFile(target, content, 'utf-8'); +} + +async function writeJson(root: string, relPath: string, value: unknown): Promise<void> { + await writeText(root, relPath, `${JSON.stringify(value, null, 2)}\n`); +} + +describe('projectHistoricSqlEvidence', () => { + it('merges table usage into matching _schema shards and preserves external usage keys', async () => { + const workdir = await tempWorkdir(); + await writeText( + workdir, + 'semantic-layer/warehouse/_schema/public.yaml', + YAML.stringify({ + tables: { + orders: { + table: 'public.orders', + usage: { + narrative: 'Old 
generated usage.', + frequencyTier: 'low', + commonFilters: ['old_status'], + commonJoins: [], + ownerNote: 'keep me', + }, + columns: [{ name: 'id', type: 'string' }], + }, + }, + }), + ); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 1, + touchedTableCount: 1, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', { + kind: 'table_usage', + connectionId: 'warehouse', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried for lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + }, + }); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]); + const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')); + expect(shard.tables.orders.usage).toEqual({ + ownerNote: 'keep me', + narrative: 'Orders are repeatedly queried for lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + }); + }); + + it('writes pattern pages, reuses similar slugs, and marks missing old pattern pages 
stale', async () => { + const workdir = await tempWorkdir(); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 2, + touchedTableCount: 2, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' }); + await writeText( + workdir, + 'knowledge/global/historic-sql/old-order-lifecycle.md', + [ + '---', + YAML.stringify({ + summary: 'Old order lifecycle page', + tags: ['historic-sql', 'pattern'], + refs: [], + sl_refs: ['orders'], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.orders', 'public.customers'], + fingerprints: ['pg:1'], + }).trimEnd(), + '---', + '', + 'Old body', + '', + ].join('\n'), + ); + await writeText( + workdir, + 'knowledge/global/historic-sql/retired-pattern.md', + [ + '---', + YAML.stringify({ + summary: 'Retired pattern', + tags: ['historic-sql', 'pattern'], + refs: [], + sl_refs: [], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.tickets'], + fingerprints: ['pg:9'], + }).trimEnd(), + '---', + '', + 'Retired body', + '', + ].join('\n'), + ); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', { + kind: 'pattern', + connectionId: 'warehouse', + rawPath: 'patterns-input.json', + pattern: { + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Analysts compare order status with customer segment.', + definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id', + 
tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:1', 'pg:2'], + }, + }); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.patternPagesWritten).toBe(1); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/old-order-lifecycle.md'), 'utf-8')).resolves.toContain( + 'Order Lifecycle Analysis', + ); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/retired-pattern.md'), 'utf-8')).resolves.toContain( + 'stale_since: "2026-05-11T00:00:00.000Z"', + ); + }); +}); +``` + +- [ ] **Step 4: Run projection tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/projection.test.ts +``` + +Expected: FAIL with an import error for `./projection.js`. + +- [ ] **Step 5: Implement projection helpers** + +Create `packages/context/src/ingest/adapters/historic-sql/projection.ts` with these exported shapes and functions: + +```typescript +import { access, mkdir, readdir, readFile, rename, rm, writeFile } from 'node:fs/promises'; +import { dirname, join, relative } from 'node:path'; +import YAML from 'yaml'; +import { rawSourcesDirForSync } from '../../raw-sources-paths.js'; +import { mergeUsagePreservingExternal } from '../live-database/manifest.js'; +import { historicSqlEvidenceEnvelopeSchema, type HistoricSqlEvidenceEnvelope } from './evidence.js'; +import { stagedManifestSchema } from './types.js'; + +export interface HistoricSqlProjectionInput { + workdir: string; + connectionId: string; + syncId: string; + runId: string; +} + +export interface HistoricSqlProjectionResult { + tableUsageMerged: number; + staleTablesMarked: number; + patternPagesWritten: number; + stalePatternPagesMarked: number; + archivedPatternPages: number; + legacyPagesDeleted: number; + touchedSources: Array<{ connectionId: string; sourceName: 
string }>; + warnings: string[]; +} + +interface ManifestShard { + tables?: Record; columns?: unknown[]; [key: string]: unknown }>; +} + +function safeKnowledgeSlug(value: string): string { + return value.toLowerCase().replace(/[^a-z0-9/-]+/g, '-').replace(/^-+|-+$/g, ''); +} + +async function pathExists(path: string): Promise { + try { + await access(path); + return true; + } catch { + return false; + } +} + +async function walkFiles(root: string): Promise { + if (!(await pathExists(root))) return []; + const entries = await readdir(root, { withFileTypes: true, recursive: true }); + return entries + .filter((entry) => entry.isFile()) + .map((entry) => relative(root, join(entry.parentPath, entry.name)).replace(/\\/g, '/')) + .sort(); +} + +async function readJson(path: string): Promise { + return JSON.parse(await readFile(path, 'utf-8')) as unknown; +} + +async function writeYamlAtomic(path: string, value: unknown): Promise { + await mkdir(dirname(path), { recursive: true }); + const tmp = `${path}.tmp`; + await writeFile(tmp, YAML.stringify(value, { indent: 2, lineWidth: 0 }), 'utf-8'); + await rename(tmp, path); +} + +function tableSourceName(tableRef: string): string { + return tableRef.split('.').filter(Boolean).at(-1) ?? 
tableRef; +} + +function staleUsage(fetchedAt: string) { + return { + narrative: 'No recent historic SQL usage was observed in the latest snapshot.', + frequencyTier: 'unused' as const, + commonFilters: [], + commonGroupBys: [], + commonJoins: [], + staleSince: fetchedAt, + }; +} + +async function loadEvidence(workdir: string, runId: string): Promise { + const root = join(workdir, '.ktx/ingest-evidence/historic-sql', runId); + const files = await walkFiles(root); + const evidence: HistoricSqlEvidenceEnvelope[] = []; + for (const file of files.filter((candidate) => candidate.endsWith('.json'))) { + evidence.push(historicSqlEvidenceEnvelopeSchema.parse(await readJson(join(root, file)))); + } + return evidence; +} + +function renderPatternMarkdown(pattern: HistoricSqlEvidenceEnvelope & { kind: 'pattern' }): string { + return [ + `# ${pattern.pattern.title}`, + '', + pattern.pattern.narrative, + '', + '## Representative SQL', + '', + '```sql', + pattern.pattern.definitionSql, + '```', + '', + '## Tables', + '', + ...pattern.pattern.tablesInvolved.map((table) => `- ${table}`), + '', + '## Constituent Templates', + '', + ...pattern.pattern.constituentTemplateIds.map((id) => `- ${id}`), + '', + ].join('\n'); +} + +function overlapRatio(left: string[], right: string[]): number { + const rightSet = new Set(right); + const intersection = left.filter((value) => rightSet.has(value)).length; + return left.length === 0 ? 0 : intersection / left.length; +} +``` + +In the same file, implement `projectHistoricSqlEvidence()` with this behavior: + +- Read `manifest.json` from `join(workdir, rawSourcesDirForSync(connectionId, 'historic-sql', syncId), 'manifest.json')` and parse with `stagedManifestSchema`. +- Read every current table file under `raw-sources//historic-sql//tables/*.json` and build a `Set` of current staged table refs. +- Load every evidence JSON file from `.ktx/ingest-evidence/historic-sql/`. 
+- For each `_schema/*.yaml` shard in `semantic-layer/<connectionId>/_schema`: + - Parse the shard as YAML. + - For each table entry, match table evidence where `evidence.table === entry.table` or `tableSourceName(evidence.table) === tableName`. + - Merge evidence usage with `mergeUsagePreservingExternal(entry.usage, evidence.usage)`. + - If an entry has `usage` and its table ref is absent from the current staged table set, replace historic-SQL managed usage with `staleUsage(manifest.fetchedAt)` while preserving external keys through `mergeUsagePreservingExternal`. + - Write the shard atomically only when serialized YAML changes. +- For patterns: + - Read current pages under `knowledge/global/historic-sql/*.md`. + - Treat pages with frontmatter `tags` containing both `historic-sql` and `pattern` as historic-SQL pattern pages. + - For each pattern evidence, reuse an existing page key when overlap of `tables + constituentTemplateIds` against existing `tables + fingerprints` is at least `0.6`; otherwise write `historic-sql/<slug>`. + - Write frontmatter with `summary`, `tags: ['historic-sql', 'pattern']`, `refs`, `sl_refs`, `usage_mode: 'auto'`, `source: 'historic-sql'`, `tables`, `representative_sql`, and `fingerprints`. + - For existing pattern pages not written this run, add tag `stale` and `stale_since: manifest.fetchedAt`. + - If an existing stale page has a `stale_since` more than `manifest.staleArchiveAfterDays` days old, move it under `knowledge/global/historic-sql/_archived/<slug>.md` and add tag `archived`. +- Delete legacy per-template pages whose frontmatter has `source: historic-sql`, tag `query-pattern`, and lacks tag `pattern`. +- Return counts and touched source names for every `_schema` entry whose usage changed. + +- [ ] **Step 6: Extend wiki frontmatter type for stale pattern metadata** + +In `packages/context/src/wiki/types.ts`, add: + +```typescript + stale_since?: string; +``` + +to `WikiFrontmatter`. 
+ +- [ ] **Step 7: Export projection and run tests** + +Add this export to `packages/context/src/ingest/index.ts`: + +```typescript +export { projectHistoricSqlEvidence } from './adapters/historic-sql/projection.js'; +export type { HistoricSqlProjectionInput, HistoricSqlProjectionResult } from './adapters/historic-sql/projection.js'; +``` + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/projection.test.ts src/ingest/adapters/historic-sql/types.test.ts src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/projection.ts packages/context/src/ingest/adapters/historic-sql/projection.test.ts packages/context/src/ingest/adapters/historic-sql/types.ts packages/context/src/ingest/adapters/historic-sql/types.test.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts packages/context/src/wiki/types.ts packages/context/src/ingest/index.ts +git commit -m "feat: project historic sql evidence" +``` + +## Task 4: Wire The Projection Post-Processor And Evidence Tool Runtime + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/post-processor.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/post-processor.test.ts` +- Modify: `packages/context/src/ingest/local-bundle-runtime.ts` +- Modify: `packages/context/src/ingest/local-bundle-ingest.test.ts` +- Modify: `packages/context/src/ingest/index.ts` + +- [ ] **Step 1: Write failing post-processor tests** + +Create `packages/context/src/ingest/adapters/historic-sql/post-processor.test.ts`: + +```typescript +import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import YAML from 'yaml'; +import { describe, expect, it } from 'vitest'; +import { 
HistoricSqlProjectionPostProcessor } from './post-processor.js'; + +async function tempWorkdir(): Promise { + return mkdtemp(join(tmpdir(), 'historic-sql-post-processor-')); +} + +async function writeJson(root: string, relPath: string, value: unknown): Promise { + const target = join(root, relPath); + await mkdir(join(target, '..'), { recursive: true }); + await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); +} + +describe('HistoricSqlProjectionPostProcessor', () => { + it('projects current run evidence before the ingest squash commit', async () => { + const workdir = await tempWorkdir(); + await mkdir(join(workdir, 'semantic-layer/warehouse/_schema'), { recursive: true }); + await writeFile( + join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), + YAML.stringify({ tables: { orders: { table: 'public.orders', columns: [{ name: 'id', type: 'string' }] } } }), + 'utf-8', + ); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 1, + touchedTableCount: 1, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', { + kind: 'table_usage', + connectionId: 'warehouse', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [], + staleSince: null, + }, + }); + + const result = await new HistoricSqlProjectionPostProcessor().run({ + connectionId: 'warehouse', + sourceKey: 'historic-sql', + syncId: 
'sync-1', + jobId: 'job-1', + runId: 'run-1', + workdir, + parseArtifacts: null, + }); + + expect(result.errors).toEqual([]); + expect(result.warnings).toEqual([]); + expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]); + expect(result.result).toMatchObject({ tableUsageMerged: 1 }); + await expect(readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves.toContain( + 'Orders are repeatedly queried by lifecycle status.', + ); + }); +}); +``` + +- [ ] **Step 2: Run the post-processor test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/post-processor.test.ts +``` + +Expected: FAIL with an import error for `./post-processor.js`. + +- [ ] **Step 3: Implement the post-processor** + +Create `packages/context/src/ingest/adapters/historic-sql/post-processor.ts`: + +```typescript +import type { IngestBundlePostProcessorInput, IngestBundlePostProcessorPort, IngestBundlePostProcessorResult } from '../../ports.js'; +import { projectHistoricSqlEvidence } from './projection.js'; + +export class HistoricSqlProjectionPostProcessor implements IngestBundlePostProcessorPort { + async run(input: IngestBundlePostProcessorInput): Promise { + const projection = await projectHistoricSqlEvidence({ + workdir: input.workdir, + connectionId: input.connectionId, + syncId: input.syncId, + runId: input.runId, + }); + return { + result: projection, + warnings: projection.warnings, + errors: [], + touchedSources: projection.touchedSources, + }; + } +} +``` + +- [ ] **Step 4: Add the evidence tool and post-processor to local ingest runtime** + +In `packages/context/src/ingest/local-bundle-runtime.ts`, import: + +```typescript +import { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js'; +import { HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js'; +``` + +In 
`LocalIngestToolsetFactory.createIngestWuToolset()`, return the historic-SQL evidence tool only for historic-SQL ingest sessions: + +```typescript + createIngestWuToolset(session: ToolSession, options?: { includeContextEvidenceTools?: boolean }): IngestToolsetLike { + const sourceTools = session.ingest?.sourceKey === 'historic-sql' ? [createEmitHistoricSqlEvidenceTool()] : []; + return new LocalIngestToolSet( + options?.includeContextEvidenceTools + ? [...this.baseTools, ...this.contextTools, ...sourceTools] + : [...this.baseTools, ...sourceTools], + ); + } +``` + +In the `deps` object passed to `new IngestBundleRunner(deps)`, add: + +```typescript + postProcessors: { + 'historic-sql': new HistoricSqlProjectionPostProcessor(), + }, +``` + +- [ ] **Step 5: Add runtime integration assertions** + +In `packages/context/src/ingest/local-bundle-ingest.test.ts`, add a test using an injected `agentRunner` that calls `emit_historic_sql_evidence` for a planned historic-SQL WorkUnit and asserts the report `postProcessor` result contains `tableUsageMerged: 1`. Use the existing local-bundle ingest test patterns for injected tool execution; the key assertion is: + +```typescript +await expect(readFile(join(projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves.toContain( + 'Orders are repeatedly queried by lifecycle status.', +); +``` + +- [ ] **Step 6: Export post-processor and verify tests pass** + +Add this export to `packages/context/src/ingest/index.ts`: + +```typescript +export { HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js'; +``` + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/post-processor.test.ts src/ingest/local-bundle-ingest.test.ts src/package-exports.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/post-processor.ts packages/context/src/ingest/adapters/historic-sql/post-processor.test.ts packages/context/src/ingest/local-bundle-runtime.ts packages/context/src/ingest/local-bundle-ingest.test.ts packages/context/src/ingest/index.ts packages/context/src/package-exports.test.ts +git commit -m "feat: run historic sql deterministic projection" +``` + +## Task 5: Switch Production Adapter To Unified Hot Path + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/types.ts` +- Modify: `packages/context/src/ingest/local-adapters.ts` +- Modify: `packages/context/src/ingest/local-adapters.test.ts` + +- [ ] **Step 1: Write failing adapter metadata and fetch tests** + +In `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts`, replace the metadata test expectations with: + +```typescript + expect(adapter.skillNames).toEqual(['historic_sql_table_digest', 'historic_sql_patterns']); + expect(adapter.reconcileSkillNames).toEqual([]); + expect(adapter.evidenceIndexing).toBeUndefined(); + expect(adapter.triageSupported).toBe(false); +``` + +Replace the legacy fetch tests with a unified fetch test: + +```typescript + it('fetches a unified aggregate snapshot and emits unified WorkUnits', async () => { + const stagedDir = await tempDir(); + const reader = { + async probe() { + return { warnings: [] }; + }, + async *fetchAggregated() { + yield { + templateId: 'pg:1', + canonicalSql: 'select status, count(*) from public.orders group by status', + dialect: 'postgres', + stats: { + executions: 25, + 
distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 10, + p95RuntimeMs: 20, + errorRate: 0, + rowsProduced: 10, + }, + topUsers: [{ user: 'analyst', executions: 25 }], + }; + }, + }; + const sqlAnalysis = { + async analyzeForFingerprint() { + throw new Error('legacy analyzeForFingerprint must not be used'); + }, + async analyzeBatch() { + return new Map([ + [ + 'pg:1', + { + tablesTouched: ['public.orders'], + columnsByClause: { select: ['status'], groupBy: ['status'] }, + }, + ], + ]); + }, + }; + const adapter = new HistoricSqlSourceAdapter({ + sqlAnalysis, + reader, + queryClient: {}, + now: () => new Date('2026-05-11T00:00:00.000Z'), + }); + + await adapter.fetch({ dialect: 'postgres', minExecutions: 5 }, stagedDir, { + connectionId: 'warehouse', + sourceKey: 'historic-sql', + }); + + await expect(adapter.detect(stagedDir)).resolves.toBe(true); + await expect(adapter.chunk(stagedDir)).resolves.toMatchObject({ + workUnits: [ + { unitKey: 'historic-sql-table-public-orders' }, + { unitKey: 'historic-sql-patterns' }, + ], + }); + }); +``` + +- [ ] **Step 2: Run adapter tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +``` + +Expected: FAIL because the adapter still advertises legacy skills and calls legacy staging. 
+ +- [ ] **Step 3: Update adapter dependency types** + +In `packages/context/src/ingest/adapters/historic-sql/types.ts`, change `HistoricSqlSourceAdapterDeps` to: + +```typescript +export interface HistoricSqlSourceAdapterDeps { + sqlAnalysis: SqlAnalysisPort; + reader: HistoricSqlReader; + queryClient: unknown; + legacyPostgresBaselineRootDir?: string; + now?: () => Date; +} +``` + +Extend `historicSqlUnifiedPullConfigSchema` preprocessing to map existing local config keys: + +```typescript + const next: Record = { ...value }; + if (next.minExecutions === undefined && typeof next.minCalls === 'number') { + next.minExecutions = next.minCalls; + } + if (!next.filters && Array.isArray(next.serviceAccountUserPatterns)) { + next.filters = { + serviceAccounts: { patterns: next.serviceAccountUserPatterns, mode: 'exclude' }, + dropTrivialProbes: true, + }; + } + return next; +``` + +- [ ] **Step 4: Replace adapter implementation** + +In `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts`, remove legacy imports and use: + +```typescript +import { rm } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { ChunkResult, DiffSet, FetchContext, ScopeDescriptor, SourceAdapter } from '../../types.js'; +import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js'; +import { detectHistoricSqlStagedDir } from './detect.js'; +import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js'; +import { type HistoricSqlSourceAdapterDeps } from './types.js'; + +export class HistoricSqlSourceAdapter implements SourceAdapter { + readonly source = 'historic-sql'; + readonly skillNames = ['historic_sql_table_digest', 'historic_sql_patterns']; + readonly reconcileSkillNames: string[] = []; + readonly triageSupported = false; + + constructor(private readonly deps: HistoricSqlSourceAdapterDeps) {} + + detect(stagedDir: string): Promise { + return detectHistoricSqlStagedDir(stagedDir); + } + + async 
fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise { + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: ctx.connectionId, + queryClient: this.deps.queryClient, + reader: this.deps.reader, + sqlAnalysis: this.deps.sqlAnalysis, + pullConfig, + now: this.deps.now?.(), + }); + if (this.deps.legacyPostgresBaselineRootDir) { + await rm(join(this.deps.legacyPostgresBaselineRootDir, ctx.connectionId, 'pgss-baseline.json'), { + force: true, + }); + } + } + + chunk(stagedDir: string, diffSet?: DiffSet): Promise { + return chunkHistoricSqlUnifiedStagedDir(stagedDir, diffSet); + } + + describeScope(stagedDir: string): Promise { + return describeHistoricSqlUnifiedScope(stagedDir); + } +} +``` + +- [ ] **Step 5: Update WorkUnit notes to mention the evidence tool** + +In `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts`, update notes to contain: + +```typescript +'Use historic_sql_table_digest. Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence. Do not call wiki_write or sl_write_source.' +``` + +and: + +```typescript +'Use historic_sql_patterns. Read patterns-input.json and emit pattern objects with emit_historic_sql_evidence. Do not call wiki_write or sl_write_source.' +``` + +Update `chunk-unified.test.ts` assertions to check `emit_historic_sql_evidence`. + +- [ ] **Step 6: Update local adapter wiring** + +In `packages/context/src/ingest/local-adapters.ts`, import: + +```typescript +import { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js'; +``` + +Remove the `PostgresPgssQueryHistoryReader` import. 
Construct the local historic-SQL adapter as: + +```typescript + adapters.push( + new HistoricSqlSourceAdapter({ + sqlAnalysis: options.historicSql.sqlAnalysis, + reader: new PostgresPgssReader(), + queryClient: options.historicSql.postgresQueryClient, + legacyPostgresBaselineRootDir: options.historicSql.postgresBaselineRootDir, + now: options.historicSql.now, + }), + ); +``` + +In `localPullConfigForAdapter()`, parse with `historicSqlUnifiedPullConfigSchema` instead of `historicSqlPullConfigSchema`. + +- [ ] **Step 7: Run adapter/local tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts src/ingest/adapters/historic-sql/chunk-unified.test.ts src/ingest/local-adapters.test.ts +``` + +Expected: PASS. + +- [ ] **Step 8: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts packages/context/src/ingest/adapters/historic-sql/types.ts packages/context/src/ingest/local-adapters.ts packages/context/src/ingest/local-adapters.test.ts +git commit -m "feat: cut over historic sql adapter" +``` + +## Task 6: Delete Legacy Historic-SQL Code Path + +**Files:** +- Modify: `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/detect.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/detect.test.ts` +- Modify: `packages/context/src/ingest/index.ts` +- Modify: `packages/context/src/package-exports.test.ts` +- Modify: `packages/cli/src/setup-databases.ts` +- Modify: `packages/cli/src/historic-sql-doctor.ts` +- Delete the legacy files listed in the File 
Structure section. + +- [ ] **Step 1: Move PGSS probe behavior into `PostgresPgssReader`** + +Update `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts` so the existing probe tests import `PostgresPgssReader` from `./postgres-pgss-reader.js` and assert the same probe warnings/errors now covered by `postgres-pgss-query-history-reader.test.ts`. + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts +``` + +Expected: FAIL until probe SQL and error mapping are inlined. + +- [ ] **Step 2: Inline the probe logic** + +In `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts`, remove: + +```typescript +import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js'; +``` + +Remove: + +```typescript + private readonly legacyReader = new PostgresPgssQueryHistoryReader(); + + probe(client: unknown): Promise { + return this.legacyReader.probe(client); + } +``` + +Add the probe SQL and mapping currently used by `PostgresPgssQueryHistoryReader` into this file, and make `probe(client)` return `PostgresPgssProbeResult` directly. Preserve the existing doctor-facing checks for extension presence, grants, server version, `pg_stat_statements.track`, and informational `pg_stat_statements.max`. 
+ +- [ ] **Step 3: Update CLI doctor/setup imports** + +In `packages/cli/src/setup-databases.ts` and `packages/cli/src/historic-sql-doctor.ts`, replace dynamic imports of `PostgresPgssQueryHistoryReader` with `PostgresPgssReader`: + +```typescript +const [{ PostgresPgssReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] = + await Promise.all([import('@ktx/context/ingest'), import('./postgres-query-client.js')]); +``` + +Replace `new PostgresPgssQueryHistoryReader().probe(client)` with: + +```typescript +new PostgresPgssReader().probe(client) +``` + +- [ ] **Step 4: Simplify detection to the unified manifest shape** + +In `packages/context/src/ingest/adapters/historic-sql/detect.ts`, keep manifest-source detection and replace the old `templates/*/{metadata.json,page.md}` fallback with unified structural detection: + +```typescript + try { + await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8'); + const entries = await readdir(join(stagedDir, 'tables'), { withFileTypes: true }); + return entries.some((entry) => entry.isFile() && entry.name.endsWith('.json')); + } catch { + return false; + } +``` + +Update `detect.test.ts` to use `stagedManifestSchema` and remove tests for legacy `historicSqlManifestSchema`, `historicSqlMetadataSchema`, and `historicSqlUsageSchema`. + +- [ ] **Step 5: Remove legacy exports and assertions** + +In `packages/context/src/ingest/index.ts`, delete exports for: + +```typescript +chunkHistoricSqlStagedDir +describeHistoricSqlScope +PostgresPgssQueryHistoryReader +stageHistoricSqlTemplates +stagePgStatStatementsTemplates +PgssBaseline +StagePgStatStatementsTemplatesResult +historicSqlManifestSchema +historicSqlMetadataSchema +historicSqlPullConfigSchema +historicSqlUsageSchema +``` + +In `packages/context/src/package-exports.test.ts`, remove assertions for those exports. 
+ +- [ ] **Step 6: Delete legacy files and old skills** + +Run: + +```bash +rm -rf packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres +rm packages/context/src/ingest/adapters/historic-sql/stage.ts +rm packages/context/src/ingest/adapters/historic-sql/stage.test.ts +rm packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts +rm packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts +rm packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts +rm packages/context/src/ingest/adapters/historic-sql/chunk.ts +rm packages/context/src/ingest/adapters/historic-sql/chunk.test.ts +rm packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.ts +rm packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.test.ts +rm -rf packages/context/skills/historic_sql_ingest +rm -rf packages/context/skills/historic_sql_curator +``` + +Expected: files are removed from the worktree. Do not delete unified files: `stage-unified.ts`, `chunk-unified.ts`, `postgres-pgss-reader.ts`, `bigquery-query-history-reader.ts`, `snowflake-query-history-reader.ts`, `types.ts`, `skill-schemas.ts`, `evidence.ts`, `projection.ts`, and `post-processor.ts`. + +- [ ] **Step 7: Remove page-triage historic-SQL prompt references** + +In `packages/context/prompts/skills/page_triage_classifier.md`, remove the historic-SQL-specific block for `signals.objectType === "historic_sql_template"`. 
Update these tests to stop asserting that prompt text: + +- `packages/context/src/ingest/page-triage/page-triage.service.test.ts` +- `packages/context/src/ingest/ingest-prompts.test.ts` +- `packages/context/src/ingest/ingest-runtime-assets.test.ts` + +- [ ] **Step 8: Run no-old-code grep** + +Run: + +```bash +rg -n "stagePgStatStatementsTemplates|expandCategoricalTemplates|classifySlot|pgss-baseline|historic_sql_ingest|historic_sql_curator|PostgresPgssQueryHistoryReader|historic_sql_template" packages/context packages/cli +``` + +Expected: no matches in `packages/context` or `packages/cli`. + +- [ ] **Step 9: Run focused deletion tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql src/ingest/ingest-runtime-assets.test.ts src/memory/memory-runtime-assets.test.ts src/package-exports.test.ts +pnpm --filter @ktx/cli exec vitest run src/historic-sql-doctor.test.ts src/setup-databases.test.ts +``` + +Expected: PASS. + +- [ ] **Step 10: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql packages/context/skills packages/context/src/ingest/index.ts packages/context/src/package-exports.test.ts packages/context/src/ingest/ingest-runtime-assets.test.ts packages/context/src/memory/memory-runtime-assets.test.ts packages/context/prompts/skills/page_triage_classifier.md packages/context/src/ingest/page-triage/page-triage.service.test.ts packages/context/src/ingest/ingest-prompts.test.ts packages/cli/src/setup-databases.ts packages/cli/src/historic-sql-doctor.ts packages/cli/src/historic-sql-doctor.test.ts packages/cli/src/setup-databases.test.ts +git commit -m "refactor: remove legacy historic sql pipeline" +``` + +## Task 7: Rename Setup Config To minExecutions + +**Files:** +- Modify: `packages/cli/src/commands/setup-commands.ts` +- Modify: `packages/cli/src/index.test.ts` +- Modify: `packages/cli/src/setup-databases.ts` +- Modify: `packages/cli/src/setup-databases.test.ts` + +- [ ] **Step 1: Write 
failing setup CLI assertions** + +In `packages/cli/src/index.test.ts`, update setup help assertions so both flags are accepted: + +```typescript +expect(output).toContain('--historic-sql-min-executions'); +expect(output).toContain('--historic-sql-min-calls'); +``` + +In setup output/config tests, assert generated YAML uses: + +```yaml +historicSql: + enabled: true + dialect: postgres + minExecutions: 7 +``` + +and does not write `minCalls`. + +- [ ] **Step 2: Run setup tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/index.test.ts src/setup-databases.test.ts +``` + +Expected: FAIL because the CLI still writes `minCalls`. + +- [ ] **Step 3: Add the new flag and preserve the old alias** + +In `packages/cli/src/commands/setup-commands.ts`, add: + +```typescript + .option('--historic-sql-min-executions ', 'Minimum Historic SQL executions for a template', positiveInteger) +``` + +Keep `--historic-sql-min-calls` with help text: + +```typescript + .option('--historic-sql-min-calls ', 'Alias for --historic-sql-min-executions', positiveInteger) +``` + +When building setup options, resolve: + +```typescript +const historicSqlMinExecutions = opts.historicSqlMinExecutions ?? opts.historicSqlMinCalls; +``` + +In `packages/cli/src/setup-databases.ts`, write `minExecutions` to config. Do not write `minCalls`. + +- [ ] **Step 4: Run setup tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/index.test.ts src/setup-databases.test.ts +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add packages/cli/src/commands/setup-commands.ts packages/cli/src/index.test.ts packages/cli/src/setup-databases.ts packages/cli/src/setup-databases.test.ts +git commit -m "feat: rename historic sql setup threshold" +``` + +## Task 8: Final Verification + +**Files:** +- Verify: historic-SQL adapter, CLI setup/doctor, runtime assets, exports, Python daemon batch endpoint. 
+ +- [ ] **Step 1: Run all historic-SQL context tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql src/ingest/local-adapters.test.ts src/ingest/local-bundle-ingest.test.ts src/ingest/ingest-runtime-assets.test.ts src/memory/memory-runtime-assets.test.ts src/package-exports.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run CLI setup and doctor tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/historic-sql-doctor.test.ts src/setup-databases.test.ts src/index.test.ts src/ingest.test.ts +``` + +Expected: PASS. + +- [ ] **Step 3: Run Python daemon SQL analysis tests** + +Run: + +```bash +source .venv/bin/activate && python -m pytest python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py -q +``` + +Expected: PASS. + +- [ ] **Step 4: Run package type checks** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +pnpm --filter @ktx/cli run type-check +``` + +Expected: PASS. + +- [ ] **Step 5: Run no-old-code grep** + +Run: + +```bash +rg -n "stagePgStatStatementsTemplates|expandCategoricalTemplates|classifySlot|pgss-baseline|historic_sql_ingest|historic_sql_curator|PostgresPgssQueryHistoryReader|historic_sql_template" packages/context packages/cli +``` + +Expected: no matches. + +- [ ] **Step 6: Run pre-commit for touched files** + +Run with the actual touched file list from `git diff --name-only`: + +```bash +uv run pre-commit run --files $(git diff --name-only) +``` + +Expected: PASS. If local `uv` refuses due to the repo's exact uv pin, activate `.venv` for Python checks and report the uv version mismatch exactly. 
+ +- [ ] **Step 7: Commit final verification notes if test snapshots changed** + +If verification updated tracked snapshots or generated checked-in fixtures, commit only those intended files: + +```bash +git add <intended-snapshot-or-fixture-paths> +git commit -m "test: verify historic sql cutover" +``` + +Expected: either a small verification commit is created, or no commit is needed because `git status --short` is clean. + +## Self-Review + +Spec coverage: + +- New skills `historic_sql_table_digest` and `historic_sql_patterns`: Task 2. +- LLM skills emit evidence instead of direct writes: Task 1 and Task 2. +- Deterministic projection of table usage into `_schema` shards: Task 3 and Task 4. +- Pattern wiki pages under `knowledge/global/historic-sql/{slug}.md`: Task 3 and Task 4. +- Slug stability and stale/archive handling: Task 3. +- Production adapter cutover to unified reader/stager/chunker: Task 5. +- Old skill and legacy code deletion: Task 6. +- PGSS baseline cleanup: Task 5 via `legacyPostgresBaselineRootDir` removal. +- CLI setup `minCalls` to `minExecutions` alias: Task 7. +- Search surfaces: already implemented by `2026-05-11-historic-sql-search-enrichment.md`; final verification keeps them covered. + +Placeholder scan: + +- No unresolved placeholder markers are present. +- Every code-changing task includes exact paths, test commands, and expected pass/fail outcomes. +- Complex projection internals are described as concrete behavior with named fields and deterministic matching rules. + +Type consistency: + +- `HistoricSqlEvidenceEnvelope`, `HistoricSqlProjectionResult`, `HistoricSqlProjectionPostProcessor`, and `HistoricSqlSourceAdapterDeps` names are introduced before use. +- Skill names match the new adapter metadata and runtime asset tests: `historic_sql_table_digest`, `historic_sql_patterns`. +- `PostgresPgssReader` remains the single public PGSS reader after legacy deletion. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-skills-projection-cutover.md`. 
Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md b/docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md new file mode 100644 index 00000000..c3228e8a --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md @@ -0,0 +1,1698 @@ +# Historic SQL Unified Hot Path Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build the deterministic historic-SQL hot path that reads warehouse-aggregated query templates, batch-parses them once, and writes stable table-bucket and pattern-input staged artifacts. + +**Architecture:** This slice adds the unified reader/stager contracts from the historic-SQL redesign without doing the LLM cold path or projection work. Dialect-specific SQL lives in reader classes; shared TypeScript code filters, batch-parses, bucketizes, and writes `manifest.json`, `tables/*.json`, and `patterns-input.json`. The existing production adapter remains on the legacy path until the follow-up skills/projection cutover can switch it without loading missing skills. + +**Tech Stack:** TypeScript ESM/NodeNext, zod 4, Vitest, `SqlAnalysisPort.analyzeBatch()`, warehouse query clients. 
+ +--- + +## Starting Point + +Spec: `docs/superpowers/specs/2026-05-11-historic-sql-redesign-design.md` + +Plans found that are based on this spec: + +- `docs/superpowers/plans/2026-05-11-historic-sql-foundations.md` +- `docs/superpowers/plans/2026-05-11-historic-sql-search-enrichment.md` + +Implemented status from this worktree: + +- `2026-05-11-historic-sql-foundations.md` is implemented. Evidence: `packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts`, `SemanticLayerSource.usage` in `packages/context/src/sl/types.ts`, `mergeUsagePreservingExternal()` in `packages/context/src/ingest/adapters/live-database/manifest.ts`, `SqlAnalysisPort.analyzeBatch()` in `packages/context/src/sql-analysis/ports.ts`, and `/sql/analyze-batch` in `python/ktx-daemon/src/ktx_daemon/app.py`. +- `2026-05-11-historic-sql-search-enrichment.md` is implemented. Evidence: `buildSemanticLayerSourceSearchText()` indexes `source.usage` in `packages/context/src/sl/sl-search.service.ts`, SQLite FTS returns `snippet()` in `packages/context/src/sl/sqlite-sl-sources-index.ts`, and local/MCP list results expose `frequencyTier` and `snippet` in `packages/context/src/sl/local-sl.ts` and `packages/context/src/mcp/local-project-ports.ts`. + +Still not implemented: + +- `packages/context/src/ingest/adapters/historic-sql/stage.ts` still calls `SqlAnalysisPort.analyzeForFingerprint()` per raw query and emits `templates/*/{metadata.json,page.md,usage.json}`. +- `packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts` still owns Postgres baseline-diff state and writes `.ktx/cache/historic-sql/*/pgss-baseline.json`. +- `packages/context/src/ingest/adapters/historic-sql/chunk.ts` still emits one WorkUnit per template page for `historic_sql_ingest`. +- `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts` still advertises `historic_sql_ingest` and `historic_sql_curator`. 
+- Old code strings still exist: `stagePgStatStatementsTemplates`, `expandCategoricalTemplates`, `classifySlot`, and `pgss-baseline`. + +This plan covers the deterministic hot path from the spec: unified aggregate contracts, aggregate readers, batch parsing, table bucketing, pattern input staging, and a new chunker for the new staged shape. It does not switch `HistoricSqlSourceAdapter` to the new WorkUnits; the cutover plan must create `historic_sql_table_digest`, `historic_sql_patterns`, and projection before changing production `skillNames`. + +## File Structure + +Create: + +- `packages/context/src/ingest/adapters/historic-sql/types.test.ts` + Locks the new public zod contracts and the one-release `minCalls` to `minExecutions` config alias. +- `packages/context/src/ingest/adapters/historic-sql/buckets.ts` + Owns deterministic bucket labels and frequency-tier helpers used by staging. +- `packages/context/src/ingest/adapters/historic-sql/buckets.test.ts` + Locks stable bucket boundaries so small numeric drift does not churn staged files. +- `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` + Implements the new deterministic stager behind `stageHistoricSqlAggregatedSnapshot()`. +- `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` + Tests batch parsing, parse failures, service-account filtering, per-table bucketing, and `patterns-input.json`. +- `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts` + Implements the new Postgres aggregate reader over `pg_stat_statements`. +- `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts` + Tests the aggregate PGSS query shape, probe warnings, and row mapping. +- `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts` + Implements the new chunker for `tables/*.json` plus `patterns-input.json`. 
+- `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts` + Tests table WorkUnits, the patterns WorkUnit, diff filtering, eviction, and scope detection. + +Modify: + +- `packages/context/src/ingest/adapters/historic-sql/types.ts` + Adds aggregate input, staged artifact, reader, and manifest schemas. Keeps legacy exported types until adapter cutover, but marks the new contracts as the target API for the next slice. +- `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts` + Adds `fetchAggregated()` while retaining the existing `fetch()` until the adapter cutover. +- `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts` + Adds aggregate-query tests. +- `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts` + Adds `fetchAggregated()` while retaining the existing `fetch()` until the adapter cutover. +- `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts` + Adds aggregate-query tests. +- `packages/context/src/ingest/index.ts` + Exports the new hot-path contracts and helpers. +- `packages/context/src/package-exports.test.ts` + Asserts the new exports exist without removing old exports in this slice. + +Do not modify in this plan: + +- `packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts` +- `packages/context/skills/historic_sql_ingest/SKILL.md` +- `packages/context/skills/historic_sql_curator/SKILL.md` +- `packages/context/src/ingest/ingest-runtime-assets.test.ts` + +Those files change in the cutover/projection plan after the replacement skills exist. 
+ +## Task 1: Add Unified Contracts + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/types.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/types.ts` +- Modify: `packages/context/src/ingest/index.ts` +- Modify: `packages/context/src/package-exports.test.ts` + +- [ ] **Step 1: Write failing contract tests** + +Create `packages/context/src/ingest/adapters/historic-sql/types.test.ts`: + +```typescript +import { describe, expect, it } from 'vitest'; +import { + aggregatedTemplateSchema, + historicSqlUnifiedPullConfigSchema, + stagedManifestSchema, + stagedPatternsInputSchema, + stagedTableInputSchema, +} from './types.js'; + +describe('historic-sql unified contracts', () => { + it('parses minExecutions and accepts minCalls as a one-release alias', () => { + expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minExecutions: 9 })).toMatchObject({ + dialect: 'postgres', + minExecutions: 9, + windowDays: 90, + concurrency: 12, + redactionPatterns: [], + staleArchiveAfterDays: 90, + }); + + expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minCalls: 7 }).minExecutions).toBe(7); + }); + + it('validates aggregate templates from warehouse readers', () => { + const parsed = aggregatedTemplateSchema.parse({ + templateId: 'pg:123', + canonicalSql: 'select status, count(*) from public.orders group by status', + dialect: 'postgres', + stats: { + executions: 42, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 12.5, + p95RuntimeMs: 40, + errorRate: 0, + rowsProduced: 100, + }, + topUsers: [{ user: 'analyst', executions: 40 }], + }); + + expect(parsed.templateId).toBe('pg:123'); + expect(parsed.topUsers).toEqual([{ user: 'analyst', executions: 40 }]); + }); + + it('validates staged table, patterns, and manifest artifacts', () => { + expect( + stagedTableInputSchema.parse({ + table: 'public.orders', + stats: { + executionsBucket: 
'10-100', + distinctUsersBucket: '2-5', + errorRateBucket: 'none', + p95RuntimeBucket: '<100ms', + recencyBucket: 'current', + }, + columnsByClause: { + select: [['status', 'high']], + where: [['created_at', 'mid']], + }, + observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }], + topTemplates: [{ id: 'pg:123', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }], + }).table, + ).toBe('public.orders'); + + expect( + stagedPatternsInputSchema.parse({ + templates: [ + { + id: 'pg:123', + canonicalSql: 'select * from public.orders', + tablesTouched: ['public.orders'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }).templates, + ).toHaveLength(1); + + expect( + stagedManifestSchema.parse({ + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 2, + touchedTableCount: 1, + parseFailures: 1, + warnings: ['parse_failed:bad'], + probeWarnings: [], + }).parseFailures, + ).toBe(1); + }); +}); +``` + +Add these assertions near the historic-SQL export assertions in `packages/context/src/package-exports.test.ts`: + +```typescript + expect(ingest.historicSqlUnifiedPullConfigSchema).toBeDefined(); + expect(ingest.aggregatedTemplateSchema).toBeDefined(); + expect(ingest.stagedTableInputSchema).toBeDefined(); +``` + +- [ ] **Step 2: Run the contract tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/types.test.ts src/package-exports.test.ts +``` + +Expected: FAIL with missing exports for `historicSqlUnifiedPullConfigSchema`, `aggregatedTemplateSchema`, and `stagedTableInputSchema`. 
+ +- [ ] **Step 3: Add the new schemas and reader contracts** + +Insert this block immediately after the existing `historicSqlPullConfigSchema` definition in `packages/context/src/ingest/adapters/historic-sql/types.ts`. Keep `historicSqlPullConfigSchema` and `HistoricSqlPullConfig` unchanged in this plan because the current production adapter still reads `lastSuccessfulCursor`, `maxTemplatesPerRun`, and `minCalls`. + +```typescript +const filterModeSchema = z.enum(['exclude', 'include', 'mark-only']); + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +export const historicSqlUnifiedPullConfigSchema = z.preprocess((value) => { + if (!isRecord(value)) { + return value; + } + if (value.minExecutions === undefined && typeof value.minCalls === 'number') { + return { ...value, minExecutions: value.minCalls }; + } + return value; +}, z.object({ + dialect: historicSqlDialectSchema, + windowDays: z.number().int().positive().default(90), + minExecutions: z.number().int().nonnegative().default(5), + concurrency: z.number().int().positive().default(12), + filters: z.object({ + serviceAccounts: z.object({ + patterns: z.array(z.string()).default([]), + mode: filterModeSchema.default('exclude'), + }).optional(), + orchestrators: z.object({ + mode: filterModeSchema.default('mark-only'), + }).optional(), + dropTrivialProbes: z.boolean().default(true), + dropFailedBelow: z.object({ + errorRate: z.number().min(0).max(1), + executions: z.number().int().nonnegative(), + }).optional(), + }).default({}), + redactionPatterns: z.array(z.string()).default([]), + staleArchiveAfterDays: z.number().int().positive().default(90), +})); + +export type HistoricSqlUnifiedPullConfig = z.infer; + +export const aggregatedTemplateSchema = z.object({ + templateId: z.string().min(1), + canonicalSql: z.string().min(1), + dialect: historicSqlDialectSchema, + stats: z.object({ + executions: 
z.number().int().nonnegative(), + distinctUsers: z.number().int().nonnegative(), + firstSeen: z.iso.datetime(), + lastSeen: z.iso.datetime(), + p50RuntimeMs: z.number().nonnegative().nullable(), + p95RuntimeMs: z.number().nonnegative().nullable(), + errorRate: z.number().min(0).max(1), + rowsProduced: z.number().int().nonnegative().nullable(), + }), + topUsers: z.array(z.object({ + user: z.string().nullable(), + executions: z.number().int().nonnegative(), + })).default([]), +}); +export type AggregatedTemplate = z.infer; + +export const stagedTableInputSchema = z.object({ + table: z.string().min(1), + stats: z.object({ + executionsBucket: z.string(), + distinctUsersBucket: z.string(), + errorRateBucket: z.string(), + p95RuntimeBucket: z.string(), + recencyBucket: z.string(), + }), + columnsByClause: z.record(z.string(), z.array(z.tuple([z.string(), z.string()]))), + observedJoins: z.array(z.object({ + withTable: z.string(), + on: z.array(z.string()), + freq: z.string(), + })), + topTemplates: z.array(z.object({ + id: z.string(), + canonicalSql: z.string(), + topUsers: z.array(z.object({ user: z.string().nullable() })), + })), +}); +export type StagedTableInput = z.infer; + +export const stagedPatternsInputSchema = z.object({ + templates: z.array(z.object({ + id: z.string(), + canonicalSql: z.string(), + tablesTouched: z.array(z.string()), + executionsBucket: z.string(), + distinctUsersBucket: z.string(), + dialect: historicSqlDialectSchema, + })), +}); +export type StagedPatternsInput = z.infer; + +export const stagedManifestSchema = z.object({ + source: z.literal(HISTORIC_SQL_SOURCE_KEY), + connectionId: z.string().min(1), + dialect: historicSqlDialectSchema, + fetchedAt: z.iso.datetime(), + windowStart: z.iso.datetime(), + windowEnd: z.iso.datetime(), + snapshotRowCount: z.number().int().nonnegative(), + touchedTableCount: z.number().int().nonnegative(), + parseFailures: z.number().int().nonnegative(), + warnings: z.array(z.string()), + probeWarnings: 
z.array(z.string()), +}); +export type StagedManifest = z.infer; + +export interface HistoricSqlProbeResult { + warnings: string[]; +} + +export interface HistoricSqlReader { + probe(client: unknown): Promise; + fetchAggregated( + client: unknown, + window: HistoricSqlTimeWindow, + config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable; +} +``` + +- [ ] **Step 4: Export the new contracts** + +In `packages/context/src/ingest/index.ts`, add exports for the new types and schemas: + +```typescript +export type { + AggregatedTemplate, + HistoricSqlProbeResult, + HistoricSqlReader, + HistoricSqlUnifiedPullConfig, + StagedManifest, + StagedPatternsInput, + StagedTableInput, +} from './adapters/historic-sql/types.js'; +export { + aggregatedTemplateSchema, + historicSqlUnifiedPullConfigSchema, + stagedManifestSchema, + stagedPatternsInputSchema, + stagedTableInputSchema, +} from './adapters/historic-sql/types.js'; +``` + +- [ ] **Step 5: Run the contract tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/types.test.ts src/package-exports.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/types.ts packages/context/src/ingest/adapters/historic-sql/types.test.ts packages/context/src/ingest/index.ts packages/context/src/package-exports.test.ts +git commit -m "feat: add historic sql unified contracts" +``` + +## Task 2: Add Stable Bucket Helpers + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/buckets.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/buckets.test.ts` +- Modify: `packages/context/src/ingest/index.ts` + +- [ ] **Step 1: Write failing bucket tests** + +Create `packages/context/src/ingest/adapters/historic-sql/buckets.test.ts`: + +```typescript +import { describe, expect, it } from 'vitest'; +import { + bucketDistinctUsers, + bucketErrorRate, + bucketExecutions, + bucketFrequency, + bucketP95Runtime, + bucketRecency, +} from './buckets.js'; + +describe('historic-sql bucket helpers', () => { + it('uses stable execution buckets', () => { + expect([0, 9, 10, 99, 100, 999, 1000, 4999, 5000, 49999, 50000].map(bucketExecutions)).toEqual([ + '<10', + '<10', + '10-100', + '10-100', + '100-1k', + '100-1k', + '1k-5k', + '1k-5k', + '5k-50k', + '5k-50k', + '>50k', + ]); + }); + + it('uses stable distinct-user, error-rate, runtime, and recency buckets', () => { + expect([0, 1, 2, 5, 6, 10, 11].map(bucketDistinctUsers)).toEqual([ + '0', + '1', + '2-5', + '2-5', + '5-10', + '5-10', + '>10', + ]); + expect([0, 0.01, 0.05, 0.2].map(bucketErrorRate)).toEqual(['none', 'low', 'low', 'high']); + expect([null, 99, 100, 999, 1000, 9999, 10000].map(bucketP95Runtime)).toEqual([ + 'unknown', + '<100ms', + '100ms-1s', + '100ms-1s', + '1s-10s', + '1s-10s', + '>10s', + ]); + expect(bucketRecency('2026-05-11T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('current'); + expect(bucketRecency('2026-04-20T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('recent'); + 
expect(bucketRecency('2026-01-01T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('stale'); + }); + + it('maps frequency counts to high, mid, and low labels', () => { + expect(bucketFrequency(80, 100)).toBe('high'); + expect(bucketFrequency(20, 100)).toBe('mid'); + expect(bucketFrequency(1, 100)).toBe('low'); + expect(bucketFrequency(0, 0)).toBe('low'); + }); +}); +``` + +- [ ] **Step 2: Run the bucket test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/buckets.test.ts +``` + +Expected: FAIL because `buckets.js` does not exist. + +- [ ] **Step 3: Add the bucket helper implementation** + +Create `packages/context/src/ingest/adapters/historic-sql/buckets.ts`: + +```typescript +export function bucketExecutions(value: number): string { + if (value < 10) return '<10'; + if (value < 100) return '10-100'; + if (value < 1000) return '100-1k'; + if (value < 5000) return '1k-5k'; + if (value < 50000) return '5k-50k'; + return '>50k'; +} + +export function bucketDistinctUsers(value: number): string { + if (value <= 0) return '0'; + if (value === 1) return '1'; + if (value <= 5) return '2-5'; + if (value <= 10) return '5-10'; + return '>10'; +} + +export function bucketErrorRate(value: number): string { + if (value <= 0) return 'none'; + if (value < 0.1) return 'low'; + return 'high'; +} + +export function bucketP95Runtime(value: number | null): string { + if (value === null) return 'unknown'; + if (value < 100) return '<100ms'; + if (value < 1000) return '100ms-1s'; + if (value < 10000) return '1s-10s'; + return '>10s'; +} + +export function bucketRecency(lastSeen: string, now: Date): string { + const parsed = new Date(lastSeen); + if (Number.isNaN(parsed.getTime())) { + return 'unknown'; + } + const ageDays = (now.getTime() - parsed.getTime()) / (24 * 60 * 60 * 1000); + if (ageDays <= 7) return 'current'; + if (ageDays <= 45) return 'recent'; + return 'stale'; +} + +export function 
bucketFrequency(count: number, total: number): 'high' | 'mid' | 'low' { + if (total <= 0 || count <= 0) return 'low'; + const ratio = count / total; + if (ratio >= 0.5) return 'high'; + if (ratio >= 0.1) return 'mid'; + return 'low'; +} +``` + +- [ ] **Step 4: Run the bucket test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/buckets.test.ts +``` + +Expected: PASS. + +- [ ] **Step 5: Export bucket helpers** + +In `packages/context/src/ingest/index.ts`, add: + +```typescript +export { bucketDistinctUsers, bucketErrorRate, bucketExecutions, bucketP95Runtime, bucketRecency } from './adapters/historic-sql/buckets.js'; +``` + +- [ ] **Step 6: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/buckets.ts packages/context/src/ingest/adapters/historic-sql/buckets.test.ts packages/context/src/ingest/index.ts +git commit -m "feat: add historic sql bucket helpers" +``` + +## Task 3: Stage Aggregated Snapshots + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts` +- Modify: `packages/context/src/ingest/index.ts` + +- [ ] **Step 1: Write failing staged-artifact tests** + +Create `packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts`: + +```typescript +import { mkdtemp, readFile, readdir } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { describe, expect, it, vi } from 'vitest'; +import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; +import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js'; +import type { AggregatedTemplate, HistoricSqlReader } from './types.js'; + +async function tempDir(): Promise { + return mkdtemp(join(tmpdir(), 'historic-sql-unified-stage-')); +} + +async function readJson(root: string, relPath: string): Promise { + return JSON.parse(await 
readFile(join(root, relPath), 'utf-8')) as T; +} + +function aggregate(overrides: Partial & { templateId: string; canonicalSql: string }): AggregatedTemplate { + return { + templateId: overrides.templateId, + canonicalSql: overrides.canonicalSql, + dialect: overrides.dialect ?? 'postgres', + stats: overrides.stats ?? { + executions: 42, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 20, + p95RuntimeMs: 80, + errorRate: 0, + rowsProduced: 100, + }, + topUsers: overrides.topUsers ?? [{ user: 'analyst', executions: 40 }], + }; +} + +describe('stageHistoricSqlAggregatedSnapshot', () => { + it('batch parses templates and writes stable table and patterns artifacts', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: ['pg_stat_statements.max is low; aggregation still proceeds'] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'orders-by-status', + canonicalSql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status', + }); + yield aggregate({ + templateId: 'service-account-only', + canonicalSql: 'select * from public.orders where id = $1', + stats: { + executions: 20, + distinctUsers: 1, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 5, + p95RuntimeMs: 10, + errorRate: 0, + rowsProduced: 1, + }, + topUsers: [{ user: 'svc_loader', executions: 20 }], + }); + yield aggregate({ + templateId: 'bad-parse', + canonicalSql: 'select broken from', + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + [ + 'orders-by-status', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: ['status'], + where: ['created_at'], + join: ['customer_id'], + groupBy: ['status'], + }, + 
}, + ], + ['bad-parse', { tablesTouched: [], columnsByClause: {}, error: 'parse failed' }], + ])), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'postgres', + filters: { + serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' }, + }, + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1); + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith( + [ + { + id: 'orders-by-status', + sql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status', + }, + { id: 'bad-parse', sql: 'select broken from' }, + ], + 'postgres', + ); + + expect(await readdir(join(stagedDir, 'tables'))).toEqual(['public.customers.json', 'public.orders.json']); + + const manifest = await readJson>(stagedDir, 'manifest.json'); + expect(manifest).toMatchObject({ + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + snapshotRowCount: 3, + touchedTableCount: 2, + parseFailures: 1, + warnings: ['parse_failed:bad-parse'], + probeWarnings: ['pg_stat_statements.max is low; aggregation still proceeds'], + }); + + const orders = await readJson>(stagedDir, 'tables/public.orders.json'); + expect(orders).toMatchObject({ + table: 'public.orders', + stats: { + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + errorRateBucket: 'none', + p95RuntimeBucket: '<100ms', + recencyBucket: 'current', + }, + columnsByClause: { + select: [['status', 'high']], + where: [['created_at', 'high']], + join: [['customer_id', 'high']], + groupBy: [['status', 'high']], + }, + observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }], + topTemplates: [ + { + id: 'orders-by-status', + topUsers: [{ user: 'analyst' }], + }, + ], + }); + expect(orders.topTemplates[0].canonicalSql).toContain('group by o.status'); + 
+ const patterns = await readJson>(stagedDir, 'patterns-input.json'); + expect(patterns.templates).toEqual([ + { + id: 'orders-by-status', + canonicalSql: expect.stringContaining('public.orders'), + tablesTouched: ['public.customers', 'public.orders'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ]); + }); +}); +``` + +- [ ] **Step 2: Run the stage test to verify it fails** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: FAIL because `stage-unified.js` does not exist. + +- [ ] **Step 3: Add the unified stager** + +Create `packages/context/src/ingest/adapters/historic-sql/stage-unified.ts` with these exported shapes and helpers: + +```typescript +import { mkdir, writeFile } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; +import { + bucketDistinctUsers, + bucketErrorRate, + bucketExecutions, + bucketFrequency, + bucketP95Runtime, + bucketRecency, +} from './buckets.js'; +import { + HISTORIC_SQL_SOURCE_KEY, + aggregatedTemplateSchema, + historicSqlUnifiedPullConfigSchema, + type AggregatedTemplate, + type HistoricSqlReader, + type HistoricSqlUnifiedPullConfig, + type StagedPatternsInput, + type StagedTableInput, +} from './types.js'; + +interface StageHistoricSqlAggregatedSnapshotInput { + stagedDir: string; + connectionId: string; + queryClient: unknown; + reader: HistoricSqlReader; + sqlAnalysis: SqlAnalysisPort; + pullConfig: unknown; + now?: Date; +} + +interface ParsedTemplate { + template: AggregatedTemplate; + tablesTouched: string[]; + columnsByClause: Record; +} + +interface TableAccumulator { + table: string; + executions: number; + distinctUsers: number; + errorRateNumerator: number; + p95RuntimeMs: number | null; + lastSeen: string; + columnsByClause: Map>; + observedJoins: Map>; + topTemplates: AggregatedTemplate[]; +} + +const 
TRIVIAL_SQL_RE = /^\s*SELECT\s+(1|NOW\(\)|CURRENT_TIMESTAMP|VERSION\(\))\s*;?\s*$/i; +const NOISE_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i; +const SYSTEM_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i; +const ORCHESTRATOR_RE = /\b(dbt|looker|metabase)\b/i; + +function writeJson(root: string, relPath: string, value: unknown): Promise { + const target = join(root, relPath); + return mkdir(dirname(target), { recursive: true }).then(() => + writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'), + ); +} + +function compilePatterns(patterns: string[]): RegExp[] { + return patterns.map((pattern) => new RegExp(pattern)); +} + +function matchesAny(value: string | null, patterns: RegExp[]): boolean { + return !!value && patterns.some((pattern) => pattern.test(value)); +} + +function shouldDropBySql(sql: string, config: HistoricSqlUnifiedPullConfig): boolean { + if (NOISE_PREFIX_RE.test(sql) || SYSTEM_TABLE_RE.test(sql)) return true; + if (config.filters.dropTrivialProbes !== false && TRIVIAL_SQL_RE.test(sql)) return true; + return false; +} + +function shouldDropByUsers(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean { + const service = config.filters.serviceAccounts; + if (!service || service.mode === 'mark-only' || service.patterns.length === 0) return false; + const patterns = compilePatterns(service.patterns); + const matchingExecutions = template.topUsers + .filter((entry) => matchesAny(entry.user, patterns)) + .reduce((sum, entry) => sum + entry.executions, 0); + const allExecutions = template.topUsers.reduce((sum, entry) => sum + entry.executions, 0); + const serviceOnly = allExecutions > 0 && matchingExecutions >= allExecutions; + return service.mode === 'exclude' ? 
serviceOnly : !serviceOnly; +} + +function shouldDropByFailure(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean { + const failed = config.filters.dropFailedBelow; + return !!failed && template.stats.errorRate > failed.errorRate && template.stats.executions < failed.executions; +} + +function shouldDropTemplate(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean { + if (shouldDropBySql(template.canonicalSql, config)) return true; + if (shouldDropByUsers(template, config)) return true; + if (shouldDropByFailure(template, config)) return true; + return false; +} + +function recordColumn(acc: TableAccumulator, clause: string, column: string, executions: number): void { + const byColumn = acc.columnsByClause.get(clause) ?? new Map(); + byColumn.set(column, (byColumn.get(column) ?? 0) + executions); + acc.columnsByClause.set(clause, byColumn); +} + +function recordJoin(acc: TableAccumulator, otherTable: string, columns: string[], executions: number): void { + const byColumns = acc.observedJoins.get(otherTable) ?? new Map(); + const key = [...new Set(columns)].sort().join(','); + if (key.length > 0) { + byColumns.set(key, (byColumns.get(key) ?? 0) + executions); + acc.observedJoins.set(otherTable, byColumns); + } +} + +function accumulatorFor(table: string): TableAccumulator { + return { + table, + executions: 0, + distinctUsers: 0, + errorRateNumerator: 0, + p95RuntimeMs: null, + lastSeen: '1970-01-01T00:00:00.000Z', + columnsByClause: new Map(), + observedJoins: new Map(), + topTemplates: [], + }; +} + +function addTemplate(acc: TableAccumulator, parsed: ParsedTemplate): void { + const executions = parsed.template.stats.executions; + acc.executions += executions; + acc.distinctUsers = Math.max(acc.distinctUsers, parsed.template.stats.distinctUsers); + acc.errorRateNumerator += parsed.template.stats.errorRate * executions; + acc.p95RuntimeMs = + acc.p95RuntimeMs === null + ? 
parsed.template.stats.p95RuntimeMs + : parsed.template.stats.p95RuntimeMs === null + ? acc.p95RuntimeMs + : Math.max(acc.p95RuntimeMs, parsed.template.stats.p95RuntimeMs); + acc.lastSeen = parsed.template.stats.lastSeen > acc.lastSeen ? parsed.template.stats.lastSeen : acc.lastSeen; + for (const [clause, columns] of Object.entries(parsed.columnsByClause)) { + for (const column of columns) { + recordColumn(acc, clause, column, executions); + } + } + const joinColumns = parsed.columnsByClause.join ?? []; + for (const otherTable of parsed.tablesTouched.filter((table) => table !== acc.table)) { + recordJoin(acc, otherTable, joinColumns, executions); + } + acc.topTemplates.push(parsed.template); +} +``` + +In the same file, add the staging function: + +```typescript +function toStagedTable(acc: TableAccumulator, now: Date): StagedTableInput { + const errorRate = acc.executions > 0 ? acc.errorRateNumerator / acc.executions : 0; + const columnsByClause = Object.fromEntries( + [...acc.columnsByClause.entries()] + .sort(([left], [right]) => left.localeCompare(right)) + .map(([clause, counts]) => [ + clause, + [...counts.entries()] + .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0])) + .map(([column, count]) => [column, bucketFrequency(count, acc.executions)]), + ]), + ); + const observedJoins = [...acc.observedJoins.entries()] + .flatMap(([withTable, byColumns]) => + [...byColumns.entries()].map(([columns, count]) => ({ + withTable, + on: columns.split(',').filter(Boolean), + freq: bucketFrequency(count, acc.executions), + })), + ) + .sort((left, right) => left.withTable.localeCompare(right.withTable) || left.on.join(',').localeCompare(right.on.join(','))); + const topTemplates = [...acc.topTemplates] + .sort((left, right) => right.stats.executions - left.stats.executions || left.templateId.localeCompare(right.templateId)) + .slice(0, 5) + .map((template) => ({ + id: template.templateId, + canonicalSql: template.canonicalSql, + topUsers: 
template.topUsers.slice(0, 5).map((entry) => ({ user: entry.user })), + })); + + return { + table: acc.table, + stats: { + executionsBucket: bucketExecutions(acc.executions), + distinctUsersBucket: bucketDistinctUsers(acc.distinctUsers), + errorRateBucket: bucketErrorRate(errorRate), + p95RuntimeBucket: bucketP95Runtime(acc.p95RuntimeMs), + recencyBucket: bucketRecency(acc.lastSeen, now), + }, + columnsByClause, + observedJoins, + topTemplates, + }; +} + +function toPatternsInput(parsedTemplates: ParsedTemplate[]): StagedPatternsInput { + return { + templates: parsedTemplates + .map(({ template, tablesTouched }) => ({ + id: template.templateId, + canonicalSql: template.canonicalSql, + tablesTouched: [...tablesTouched].sort(), + executionsBucket: bucketExecutions(template.stats.executions), + distinctUsersBucket: bucketDistinctUsers(template.stats.distinctUsers), + dialect: template.dialect, + })) + .sort((left, right) => left.id.localeCompare(right.id)), + }; +} + +export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSqlAggregatedSnapshotInput): Promise { + const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig); + const now = input.now ?? 
new Date(); + const windowStart = new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000); + const probe = await input.reader.probe(input.queryClient); + const snapshot: AggregatedTemplate[] = []; + + for await (const row of input.reader.fetchAggregated(input.queryClient, { start: windowStart, end: now }, config)) { + const parsed = aggregatedTemplateSchema.parse(row); + if (!shouldDropTemplate(parsed, config)) { + snapshot.push(parsed); + } + } + + const analysis = await input.sqlAnalysis.analyzeBatch( + snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql })), + config.dialect, + ); + const warnings: string[] = []; + const parsedTemplates: ParsedTemplate[] = []; + for (const template of snapshot) { + const parsed = analysis.get(template.templateId); + if (!parsed || parsed.error) { + warnings.push(`parse_failed:${template.templateId}`); + continue; + } + const tablesTouched = [...new Set(parsed.tablesTouched)].filter((table) => table.length > 0).sort(); + if (tablesTouched.length === 0) { + continue; + } + parsedTemplates.push({ + template, + tablesTouched, + columnsByClause: Object.fromEntries( + Object.entries(parsed.columnsByClause).map(([clause, columns]) => [clause, [...new Set(columns)].sort()]), + ), + }); + } + + const byTable = new Map(); + for (const parsed of parsedTemplates) { + for (const table of parsed.tablesTouched) { + const acc = byTable.get(table) ?? 
accumulatorFor(table); + addTemplate(acc, parsed); + byTable.set(table, acc); + } + } + + await mkdir(input.stagedDir, { recursive: true }); + for (const [table, acc] of [...byTable.entries()].sort(([left], [right]) => left.localeCompare(right))) { + await writeJson(input.stagedDir, `tables/${table}.json`, toStagedTable(acc, now)); + } + await writeJson(input.stagedDir, 'patterns-input.json', toPatternsInput(parsedTemplates)); + await writeJson(input.stagedDir, 'manifest.json', { + source: HISTORIC_SQL_SOURCE_KEY, + connectionId: input.connectionId, + dialect: config.dialect, + fetchedAt: now.toISOString(), + windowStart: windowStart.toISOString(), + windowEnd: now.toISOString(), + snapshotRowCount: snapshot.length, + touchedTableCount: byTable.size, + parseFailures: warnings.filter((warning) => warning.startsWith('parse_failed:')).length, + warnings, + probeWarnings: probe.warnings, + }); +} +``` + +- [ ] **Step 4: Run the staged-artifact test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/stage-unified.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Export the unified stager** + +In `packages/context/src/ingest/index.ts`, add: + +```typescript +export { stageHistoricSqlAggregatedSnapshot } from './adapters/historic-sql/stage-unified.js'; +``` + +- [ ] **Step 6: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/stage-unified.ts packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts packages/context/src/ingest/index.ts +git commit -m "feat: stage historic sql aggregate snapshots" +``` + +## Task 4: Add Aggregate Readers + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts` +- Modify: `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts` +- Modify: `packages/context/src/ingest/index.ts` + +- [ ] **Step 1: Write failing Postgres aggregate reader tests** + +Create `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts`: + +```typescript +import { describe, expect, it, vi } from 'vitest'; +import { PostgresPgssReader } from './postgres-pgss-reader.js'; + +describe('PostgresPgssReader aggregate path', () => { + it('aggregates pg_stat_statements rows by queryid and query', async () => { + const executeQuery = vi.fn(async (sql: string, params?: unknown[]) => { + if (sql.includes('pg_stat_statements_info')) { + return { headers: ['stats_reset', 'dealloc'], rows: [['2026-05-01T00:00:00.000Z', 1]] }; + } + expect(sql).toContain('GROUP BY queryid, query'); + expect(sql).toContain('HAVING SUM(calls) >= $1'); + expect(params).toEqual([5]); + return { + headers: 
['template_id', 'canonical_sql', 'executions', 'distinct_users', 'mean_ms', 'rows_produced', 'top_users'], + rows: [ + [ + '123', + 'select status from public.orders', + '42', + '3', + '11.5', + '100', + JSON.stringify([{ user: 'analyst', executions: 40 }]), + ], + ], + }; + }); + + const reader = new PostgresPgssReader(); + const rows = []; + for await (const row of reader.fetchAggregated( + { executeQuery }, + { start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') }, + { dialect: 'postgres', minExecutions: 5, windowDays: 90, concurrency: 12, filters: {}, redactionPatterns: [], staleArchiveAfterDays: 90 }, + )) { + rows.push(row); + } + + expect(rows).toEqual([ + { + templateId: '123', + canonicalSql: 'select status from public.orders', + dialect: 'postgres', + stats: { + executions: 42, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 11.5, + p95RuntimeMs: 11.5, + errorRate: 0, + rowsProduced: 100, + }, + topUsers: [{ user: 'analyst', executions: 40 }], + }, + ]); + }); +}); +``` + +- [ ] **Step 2: Add failing BigQuery and Snowflake aggregate assertions** + +In `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts`, add a test that constructs `new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'demo', region: 'us' })`, calls `fetchAggregated()`, and asserts the SQL contains: + +```typescript +expect(sql).toContain('COUNT(*) AS executions'); +expect(sql).toContain('COUNT(DISTINCT user_email) AS distinct_users'); +expect(sql).toContain('GROUP BY query_hash'); +expect(sql).toContain('HAVING COUNT(*) >= 5'); +``` + +Map one returned row with headers: + +```typescript +[ + 'template_id', + 'canonical_sql', + 'executions', + 'distinct_users', + 'first_seen', + 'last_seen', + 'p50_ms', + 'p95_ms', + 'error_rate', + 'rows_produced', + 'top_users', +] +``` + +and assert `templateId`, `stats.executions`, `stats.errorRate`, and 
`topUsers` match the row. + +In `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts`, add the same shape but assert the SQL contains: + +```typescript +expect(sql).toContain('SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY'); +expect(sql).toContain('COUNT(*) AS executions'); +expect(sql).toContain('GROUP BY query_hash'); +expect(sql).toContain('HAVING COUNT(*) >= 5'); +``` + +- [ ] **Step 3: Run aggregate reader tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts +``` + +Expected: FAIL because `fetchAggregated()` and `postgres-pgss-reader.js` do not exist. + +- [ ] **Step 4: Implement the aggregate reader methods** + +Create `packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts` with the same probe behavior currently implemented in `postgres-pgss-query-history-reader.ts`: `queryClient`, `execute`, `indexByHeader`, `value`, `nullableString`, `requiredString`, `requiredFiniteNumber`, `nullableInteger`, `nullableIsoTimestamp`, `firstRow`, `extensionMissingError`, and `grantsMissingError` keep their current behavior. The `fetchAggregated()` mapper in this step also calls `requiredInteger`, `nullableNumber`, and `parseTopUsers`, which are not in that list; define them in the new file alongside the carried-over helpers.
Add this aggregate query and row mapper: + +```typescript +const AGGREGATE_SQL = ` +SELECT queryid::text AS template_id, + query AS canonical_sql, + SUM(calls)::bigint AS executions, + COUNT(DISTINCT userid) AS distinct_users, + SUM(total_exec_time) / NULLIF(SUM(calls), 0) AS mean_ms, + SUM(rows)::bigint AS rows_produced, + COALESCE( + json_agg(json_build_object('user', rolname, 'executions', calls) ORDER BY calls DESC) + FILTER (WHERE userid IS NOT NULL), + '[]'::json + )::text AS top_users +FROM pg_stat_statements +LEFT JOIN pg_roles ON pg_roles.oid = pg_stat_statements.userid +WHERE toplevel = true +GROUP BY queryid, query +HAVING SUM(calls) >= $1 +ORDER BY SUM(total_exec_time) DESC +`.trim(); +``` + +The `fetchAggregated()` method must: + +```typescript + async *fetchAggregated( + client: unknown, + window: HistoricSqlTimeWindow, + config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable { + const pgClient = queryClient(client); + const statsResult = await execute(pgClient, STATS_INFO_SQL); + const { row: statsRow, headers: statsHeaders } = firstRow(statsResult, 'stats-info'); + const firstSeen = nullableIsoTimestamp(value(statsRow, statsHeaders, 'stats_reset')) ?? 
window.start.toISOString(); + const result = await execute(pgClient, AGGREGATE_SQL, [config.minExecutions]); + const indexes = indexByHeader(result.headers); + for (const row of result.rows) { + yield aggregatedTemplateSchema.parse({ + templateId: requiredString(value(row, indexes, 'template_id'), 'template_id'), + canonicalSql: requiredString(value(row, indexes, 'canonical_sql'), 'canonical_sql'), + dialect: 'postgres', + stats: { + executions: requiredInteger(value(row, indexes, 'executions'), 'executions'), + distinctUsers: requiredInteger(value(row, indexes, 'distinct_users'), 'distinct_users'), + firstSeen, + lastSeen: window.end.toISOString(), + p50RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')), + p95RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')), + errorRate: 0, + rowsProduced: nullableInteger(value(row, indexes, 'rows_produced')), + }, + topUsers: parseTopUsers(value(row, indexes, 'top_users')), + }); + } + } +``` + +In `packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts`, add this aggregate query inside `fetchAggregated()`: + +```typescript +const sql = ` +SELECT + query_hash AS template_id, + MIN(query) AS canonical_sql, + COUNT(*) AS executions, + COUNT(DISTINCT user_email) AS distinct_users, + MIN(creation_time) AS first_seen, + MAX(creation_time) AS last_seen, + APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(50)] AS p50_ms, + APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(95)] AS p95_ms, + SAFE_DIVIDE(COUNTIF(error_result IS NOT NULL), COUNT(*)) AS error_rate, + CAST(NULL AS INT64) AS rows_produced, + TO_JSON_STRING(ARRAY_AGG(STRUCT(user_email AS user, 1 AS executions) ORDER BY creation_time DESC LIMIT 5)) AS top_users +FROM ${this.viewPath} +WHERE job_type = 'QUERY' + AND statement_type IN ('SELECT', 'MERGE') + AND creation_time >= ${timestampExpression(window.start)} + AND creation_time < ${timestampExpression(window.end)} + 
AND query IS NOT NULL +GROUP BY query_hash +HAVING COUNT(*) >= ${config.minExecutions} +ORDER BY executions DESC`.trim(); +``` + +Map each result row into `aggregatedTemplateSchema.parse({ templateId, canonicalSql, dialect: 'bigquery', stats: { executions, distinctUsers, firstSeen, lastSeen, p50RuntimeMs, p95RuntimeMs, errorRate, rowsProduced }, topUsers })`, where `topUsers` is parsed from the `top_users` JSON string and invalid JSON becomes `[]`. + +In `packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts`, add this aggregate query inside `fetchAggregated()`: + +```typescript +const sql = ` +SELECT + query_hash AS template_id, + MIN(query_text) AS canonical_sql, + COUNT(*) AS executions, + COUNT(DISTINCT user_name) AS distinct_users, + MIN(start_time) AS first_seen, + MAX(start_time) AS last_seen, + APPROX_PERCENTILE(total_elapsed_time, 0.50) AS p50_ms, + APPROX_PERCENTILE(total_elapsed_time, 0.95) AS p95_ms, + DIV0(COUNT_IF(execution_status != 'SUCCESS'), COUNT(*)) AS error_rate, + SUM(rows_produced) AS rows_produced, + ARRAY_AGG(OBJECT_CONSTRUCT('user', user_name, 'executions', 1)) WITHIN GROUP (ORDER BY start_time DESC)::string AS top_users +FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY +WHERE query_text IS NOT NULL + AND query_type IN ('SELECT', 'MERGE') + AND start_time >= ${timestampLiteral(window.start)} + AND start_time < ${timestampLiteral(window.end)} +GROUP BY query_hash +HAVING COUNT(*) >= ${config.minExecutions} +ORDER BY executions DESC`.trim(); +``` + +Map each result row into `aggregatedTemplateSchema.parse({ templateId, canonicalSql, dialect: 'snowflake', stats: { executions, distinctUsers, firstSeen, lastSeen, p50RuntimeMs, p95RuntimeMs, errorRate, rowsProduced }, topUsers })`, where `topUsers` is parsed from the `top_users` JSON string and invalid JSON becomes `[]`. Keep the existing `fetch()` methods unchanged in this plan so current adapter behavior does not move before the skill/projection cutover. 
+ +- [ ] **Step 5: Export the new Postgres reader** + +In `packages/context/src/ingest/index.ts`, add: + +```typescript +export { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js'; +``` + +- [ ] **Step 6: Run aggregate reader tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts +``` + +Expected: PASS. + +- [ ] **Step 7: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts packages/context/src/ingest/index.ts +git commit -m "feat: add historic sql aggregate readers" +``` + +## Task 5: Add Unified Chunking + +**Files:** +- Create: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts` +- Create: `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts` +- Modify: `packages/context/src/ingest/index.ts` + +- [ ] **Step 1: Write failing unified chunk tests** + +Create `packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts`: + +```typescript +import { mkdir, mkdtemp, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js'; + +async function tempDir(): Promise { + return mkdtemp(join(tmpdir(), 
'historic-sql-unified-chunk-')); +} + +async function writeJson(root: string, relPath: string, value: unknown): Promise { + const target = join(root, relPath); + await mkdir(join(target, '..'), { recursive: true }); + await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); +} + +async function writeUnifiedStagedDir(root: string): Promise { + await writeJson(root, 'manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 1, + touchedTableCount: 1, + parseFailures: 0, + warnings: [], + probeWarnings: [], + }); + await writeJson(root, 'tables/public.orders.json', { + table: 'public.orders', + stats: { + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + errorRateBucket: 'none', + p95RuntimeBucket: '<100ms', + recencyBucket: 'current', + }, + columnsByClause: { select: [['status', 'high']] }, + observedJoins: [], + topTemplates: [{ id: 'orders', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }], + }); + await writeJson(root, 'patterns-input.json', { + templates: [ + { + id: 'orders', + canonicalSql: 'select * from public.orders', + tablesTouched: ['public.orders'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }); +} + +describe('chunkHistoricSqlUnifiedStagedDir', () => { + it('emits one table WorkUnit plus one patterns WorkUnit', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + + const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir); + + expect(result.workUnits).toEqual([ + expect.objectContaining({ + unitKey: 'historic-sql-table-public-orders', + displayLabel: 'Historic SQL usage: public.orders', + rawFiles: ['tables/public.orders.json'], + dependencyPaths: ['manifest.json'], + notes: 
expect.stringContaining('historic_sql_table_digest'), + }), + expect.objectContaining({ + unitKey: 'historic-sql-patterns', + displayLabel: 'Historic SQL cross-table patterns', + rawFiles: ['patterns-input.json'], + dependencyPaths: ['manifest.json'], + notes: expect.stringContaining('historic_sql_patterns'), + }), + ]); + expect(result.reconcileNotes).toEqual(['Historic-SQL touched tables=1 parseFailures=0']); + }); + + it('respects diff sets for unchanged table and patterns files', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + + await expect( + chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: [], + modified: ['tables/public.orders.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input.json'], + }), + ).resolves.toMatchObject({ + workUnits: [expect.objectContaining({ unitKey: 'historic-sql-table-public-orders' })], + }); + + await expect( + chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: [], + modified: ['patterns-input.json'], + deleted: [], + unchanged: ['manifest.json', 'tables/public.orders.json'], + }), + ).resolves.toMatchObject({ + workUnits: [expect.objectContaining({ unitKey: 'historic-sql-patterns' })], + }); + }); + + it('describes unified staged scope', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + + const scope = await describeHistoricSqlUnifiedScope(stagedDir); + + expect(scope.isPathInScope('manifest.json')).toBe(true); + expect(scope.isPathInScope('patterns-input.json')).toBe(true); + expect(scope.isPathInScope('tables/public.orders.json')).toBe(true); + expect(scope.isPathInScope('templates/old/page.md')).toBe(false); + }); +}); +``` + +- [ ] **Step 2: Run the unified chunk tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/chunk-unified.test.ts +``` + +Expected: FAIL because `chunk-unified.js` does not exist. 
+ +- [ ] **Step 3: Add the unified chunker** + +Create `packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts`: + +```typescript +import { createHash } from 'node:crypto'; +import { readFile, readdir } from 'node:fs/promises'; +import { join, relative } from 'node:path'; +import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js'; +import { stagedManifestSchema, stagedPatternsInputSchema, stagedTableInputSchema } from './types.js'; + +async function walk(root: string): Promise { + const entries = await readdir(root, { withFileTypes: true, recursive: true }); + return entries + .filter((entry) => entry.isFile()) + .map((entry) => relative(root, join(entry.parentPath, entry.name)).replace(/\\/g, '/')) + .sort(); +} + +async function readJson(stagedDir: string, relPath: string): Promise { + return JSON.parse(await readFile(join(stagedDir, relPath), 'utf-8')) as T; +} + +function safeUnitKey(value: string): string { + return value.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, ''); +} + +function touchedPath(path: string, touched: Set | null): boolean { + return !touched || touched.has(path); +} + +export async function chunkHistoricSqlUnifiedStagedDir(stagedDir: string, diffSet?: DiffSet): Promise { + const files = await walk(stagedDir); + const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); + const touched = diffSet ? 
new Set([...diffSet.added, ...diffSet.modified]) : null; + const workUnits: WorkUnit[] = []; + + for (const path of files.filter((file) => /^tables\/.+\.json$/.test(file))) { + if (!touchedPath(path, touched)) { + continue; + } + const table = stagedTableInputSchema.parse(await readJson(stagedDir, path)); + workUnits.push({ + unitKey: `historic-sql-table-${safeUnitKey(table.table)}`, + displayLabel: `Historic SQL usage: ${table.table}`, + rawFiles: [path], + dependencyPaths: ['manifest.json'], + peerFileIndex: files.filter((file) => file !== path && file !== 'manifest.json').sort(), + notes: + 'Use historic_sql_table_digest. Read this table usage JSON and the existing semantic-layer source for the table; output only table usage evidence shaped like tableUsageOutputSchema.', + }); + } + + if (files.includes('patterns-input.json') && touchedPath('patterns-input.json', touched)) { + stagedPatternsInputSchema.parse(await readJson(stagedDir, 'patterns-input.json')); + workUnits.push({ + unitKey: 'historic-sql-patterns', + displayLabel: 'Historic SQL cross-table patterns', + rawFiles: ['patterns-input.json'], + dependencyPaths: ['manifest.json'], + peerFileIndex: files.filter((file) => file !== 'patterns-input.json' && file !== 'manifest.json').sort(), + notes: + 'Use historic_sql_patterns. Read patterns-input.json and produce cross-table pattern evidence shaped like patternsArraySchema.', + }); + } + + const deleted = diffSet?.deleted.filter((path) => path === 'patterns-input.json' || /^tables\/.+\.json$/.test(path)).sort(); + return { + workUnits, + eviction: deleted && deleted.length > 0 ? 
{ deletedRawPaths: deleted } : undefined, + reconcileNotes: [`Historic-SQL touched tables=${manifest.touchedTableCount} parseFailures=${manifest.parseFailures}`], + contextReport: { + capped: false, + warnings: [...manifest.probeWarnings, ...manifest.warnings], + }, + }; +} + +export async function describeHistoricSqlUnifiedScope(stagedDir: string): Promise { + const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); + const fingerprint = createHash('sha256') + .update(JSON.stringify({ + connectionId: manifest.connectionId, + dialect: manifest.dialect, + windowStart: manifest.windowStart, + windowEnd: manifest.windowEnd, + })) + .digest('hex'); + return { + fingerprint, + isPathInScope: (rawPath) => + rawPath === 'manifest.json' || rawPath === 'patterns-input.json' || /^tables\/.+\.json$/.test(rawPath), + }; +} +``` + +- [ ] **Step 4: Run the unified chunk tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/adapters/historic-sql/chunk-unified.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Export the unified chunker** + +In `packages/context/src/ingest/index.ts`, add: + +```typescript +export { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './adapters/historic-sql/chunk-unified.js'; +``` + +- [ ] **Step 6: Commit** + +```bash +git add packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts packages/context/src/ingest/index.ts +git commit -m "feat: chunk historic sql unified staging" +``` + +## Task 6: Verify the Hot Path Slice + +**Files:** +- Modify: files changed in Tasks 1-5 + +- [ ] **Step 1: Run focused historic-SQL tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/adapters/historic-sql/types.test.ts \ + src/ingest/adapters/historic-sql/buckets.test.ts \ + src/ingest/adapters/historic-sql/stage-unified.test.ts \ + src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts \ + src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts \ + src/ingest/adapters/historic-sql/chunk-unified.test.ts \ + src/package-exports.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run type-check for the context package** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Confirm legacy production adapter was not switched** + +Run: + +```bash +rg -n "historic_sql_ingest|historic_sql_curator|stagePgStatStatementsTemplates" packages/context/src/ingest/adapters/historic-sql packages/context/skills packages/context/src/ingest/ingest-runtime-assets.test.ts +``` + +Expected: Results still include `historic-sql.adapter.ts`, the old skill files, and runtime-asset tests. This is correct for this plan because the replacement skills and projection are not present yet. 
+ +- [ ] **Step 4: Confirm new hot-path exports exist** + +Run: + +```bash +rg -n "stageHistoricSqlAggregatedSnapshot|chunkHistoricSqlUnifiedStagedDir|PostgresPgssReader|aggregatedTemplateSchema" packages/context/src/ingest/index.ts packages/context/src/ingest/adapters/historic-sql +``` + +Expected: Results include the new stager, chunker, reader, and schemas. + +- [ ] **Step 5: Commit verification fixes only when verification changed files** + +```bash +git status --short +``` + +Expected: no output. If verification forced a fix, run: + +```bash +git add packages/context/src/ingest/adapters/historic-sql packages/context/src/ingest/index.ts packages/context/src/package-exports.test.ts +git commit -m "test: verify historic sql unified hot path" +``` + +## Follow-Up Plan Boundary + +The next plan after this one should switch the production adapter only after it also creates the cold-path pieces: + +- `packages/context/skills/historic_sql_table_digest/SKILL.md` +- `packages/context/skills/historic_sql_patterns/SKILL.md` +- adapter `skillNames` change from `historic_sql_ingest` to the two new skills +- `onPullSucceeded()` projection of table usage into `_schema/{shard}.yaml` +- pattern wiki page projection and slug stability +- one-time cleanup of legacy template wiki pages and PGSS baselines +- deletion of `stage-pgss.ts`, old template staging exports, and old historic-SQL skill assets + +## Self-Review + +Spec coverage: + +- Unified aggregate reader contracts: Task 1 and Task 4. +- Trailing-window aggregate fetch shape: Task 4. +- Batch SQL parse through `SqlAnalysisPort.analyzeBatch()`: Task 3. +- Service-account, trivial query, failed-template, parse-failure, and zero-table filtering: Task 3. +- Bucketed `tables/*.json`, `patterns-input.json`, and `manifest.json`: Task 2 and Task 3. +- WorkUnits for one table file plus patterns input: Task 5. 
+- Hard production cutover, LLM skills, projection, wiki pages, stale handling, and legacy deletion: explicitly excluded from this plan and listed as the next plan boundary. + +Placeholder scan: + +- No unresolved placeholders are left in task steps. +- Every code-changing task includes concrete test code, implementation code, commands, and expected results. + +Type consistency: + +- `HistoricSqlUnifiedPullConfig`, `AggregatedTemplate`, `StagedTableInput`, `StagedPatternsInput`, and `StagedManifest` are defined in Task 1 and reused consistently by Tasks 3-5. +- `PostgresPgssReader`, `fetchAggregated()`, `stageHistoricSqlAggregatedSnapshot()`, and `chunkHistoricSqlUnifiedStagedDir()` names match exports and test imports. + +Plan complete and saved to `docs/superpowers/plans/2026-05-11-historic-sql-unified-hot-path.md`. Two execution options: + +**1. Subagent-Driven (recommended)** - I dispatch a fresh subagent per task, review between tasks, fast iteration + +**2. Inline Execution** - Execute tasks in this session using executing-plans, batch execution with checkpoints + +Which approach? diff --git a/examples/README.md b/examples/README.md index 134b4c24..2a3ed818 100644 --- a/examples/README.md +++ b/examples/README.md @@ -29,10 +29,10 @@ warehouse credential. ## postgres-historic -`postgres-historic/` is a manual Docker-backed smoke for Postgres -historic-SQL ingest via `pg_stat_statements`. It verifies setup, first-run -baseline creation, delta-only follow-up ingest, and reset handling without -requiring a managed Postgres service. +`postgres-historic/` is a manual Docker-backed smoke for Postgres historic-SQL +ingest via `pg_stat_statements`. It verifies setup, unified Historic SQL artifacts, +managed daemon batch SQL analysis, bounded pattern WorkUnit shards, and +no-WorkUnit idempotency for unchanged bucketed table inputs and pattern shards. 
## package-artifacts diff --git a/examples/postgres-historic/README.md b/examples/postgres-historic/README.md index f97d4b9b..b235c93f 100644 --- a/examples/postgres-historic/README.md +++ b/examples/postgres-historic/README.md @@ -1,13 +1,18 @@ # Postgres Historic SQL Example -This example is a manual smoke for Postgres historic-SQL ingest through -`pg_stat_statements`. It starts Postgres 14 with the extension preloaded, -generates query workload under separate users, runs `ktx setup` with -`--enable-historic-sql`, and verifies three local ingest runs: +This example is a manual smoke for the redesigned Postgres historic-SQL ingest +path through `pg_stat_statements`. It starts Postgres 14 with the extension +preloaded, generates query workload under separate users, runs `ktx setup` with +`--enable-historic-sql`, and verifies the unified staged artifacts: -- first run creates a fresh PGSS baseline -- second run emits only positive deltas -- reset run treats `pg_stat_statements_reset()` as a fresh baseline +- `manifest.json` +- `tables/*.json` +- `patterns-input.json` as the full audit input +- `patterns-input/part-*.json` as bounded pattern WorkUnit shards + +The smoke also runs the same workload twice and verifies the second stage-only +run has `workUnitCount: 0`, which proves unchanged bucketed table inputs and +unchanged bounded pattern shards do not schedule LLM work. ## Prerequisites @@ -36,8 +41,9 @@ Set `KTX_POSTGRES_HISTORIC_KEEP_DOCKER=1` to leave the container running after the script exits. The smoke validates the historic-SQL raw snapshot path without requiring LLM -credentials. It uses KTX's local stage-only ingest API after `ktx setup` so the -PGSS baseline and delta behavior can be checked independently from curation. +credentials. It uses KTX's local stage-only ingest API after `ktx setup`, so the +deterministic reader, batch SQL parser, stable artifact writer, and diff-based +WorkUnit planning are checked independently from curation. 
## Manual Commands @@ -64,7 +70,7 @@ node packages/cli/dist/bin.js --project-dir /tmp/ktx-postgres-historic setup \ --database-url env:WAREHOUSE_DATABASE_URL \ --database-schema public \ --enable-historic-sql \ - --historic-sql-min-calls 2 \ + --historic-sql-min-executions 2 \ --yes \ --no-input ``` @@ -75,11 +81,16 @@ node packages/cli/dist/bin.js --project-dir /tmp/ktx-postgres-historic setup \ pnpm run ktx -- dev doctor --project-dir /tmp/ktx-postgres-historic --no-input ``` -The installed CLI form is `ktx dev doctor --project-dir -/tmp/ktx-postgres-historic --no-input`. Expected output includes `PASS Postgres -Historic SQL (warehouse)` when `pg_stat_statements` is installed, -`pg_read_all_stats` is granted, tracking is enabled, and -`pg_stat_statements.max` is at least 5000. +The installed CLI form is: + +```bash +ktx dev doctor --project-dir /tmp/ktx-postgres-historic --no-input +``` + +Expected output includes `PASS Postgres Historic SQL (warehouse)` when +`pg_stat_statements` is installed, `pg_read_all_stats` is granted, and tracking +is enabled. A low `pg_stat_statements.max` value is reported as an informational +note, not a warning. Run local historic-SQL ingest: @@ -92,7 +103,7 @@ pnpm run ktx -- dev ingest run --project-dir /tmp/ktx-postgres-historic \ --no-input ``` -The full `dev ingest run` path also runs curation work units, so it requires a +The full `dev ingest run` path also runs curation WorkUnits, so it requires a configured LLM provider. Inspect the latest manifest: @@ -101,9 +112,12 @@ Inspect the latest manifest: find /tmp/ktx-postgres-historic/raw-sources/warehouse/historic-sql -name manifest.json | sort | tail -n 1 ``` -The manifest should have `dialect: "postgres"`, `degraded: true`, -`baselineFirstRun: true` on the first run, and populated `pgServerVersion` and -`statsResetAt`. 
+The manifest should have `source: "historic-sql"`, `dialect: "postgres"`, +positive `snapshotRowCount`, positive `touchedTableCount`, numeric +`parseFailures`, `warnings`, and `probeWarnings`. The same directory should +contain `patterns-input.json`, at least one `patterns-input/part-*.json` pattern +shard for cross-table candidates, and one `tables/*.json` file per touched +table. ## Troubleshooting @@ -111,8 +125,8 @@ The manifest should have `dialect: "postgres"`, `degraded: true`, `CREATE EXTENSION pg_stat_statements;` both happened in the `analytics` database. - Missing grants: confirm `GRANT pg_read_all_stats TO ktx_reader;`. -- Empty templates: rerun `scripts/generate-workload.sh base` and keep - `--historic-sql-min-calls 2` for the smoke. +- Empty snapshot: rerun `scripts/generate-workload.sh base` and keep + `--historic-sql-min-executions 2` for the smoke. - SQL-analysis failures: run `pnpm run ktx -- runtime doctor` from the KTX repository root and confirm `uv`, the bundled Python wheel, and the managed runtime all pass. 
diff --git a/examples/postgres-historic/scripts/smoke.sh b/examples/postgres-historic/scripts/smoke.sh index 488535a4..07656d37 100755 --- a/examples/postgres-historic/scripts/smoke.sh +++ b/examples/postgres-historic/scripts/smoke.sh @@ -8,6 +8,7 @@ COMPOSE_FILE="$EXAMPLE_DIR/docker-compose.yml" PROJECT_PARENT="${KTX_POSTGRES_HISTORIC_PROJECT_PARENT:-$(mktemp -d)}" PROJECT_DIR="$PROJECT_PARENT/postgres-historic-ktx" KTX_BIN="$KTX_ROOT/packages/cli/dist/bin.js" +MAX_STAGE_SECONDS="${KTX_POSTGRES_HISTORIC_MAX_STAGE_SECONDS:-60}" export KTX_RUNTIME_ROOT="$PROJECT_PARENT/managed-runtime" unset KTX_DAEMON_URL unset KTX_SQL_ANALYSIS_URL @@ -26,35 +27,145 @@ latest_manifest() { find "$PROJECT_DIR/raw-sources/warehouse/historic-sql" -name manifest.json | sort | tail -n 1 } -assert_manifest() { +assert_unified_snapshot() { local manifest_path="$1" - local expected_first_run="$2" - node - "$manifest_path" "$expected_first_run" <<'NODE' -const { readFileSync } = require('node:fs'); + node - "$manifest_path" <<'NODE' +const { dirname, join } = require('node:path'); +const { existsSync, readFileSync, readdirSync } = require('node:fs'); + const manifestPath = process.argv[2]; -const expectedFirstRun = process.argv[3] === 'true'; const manifest = JSON.parse(readFileSync(manifestPath, 'utf8')); -if (manifest.dialect !== 'postgres') throw new Error(`Expected dialect postgres, got ${manifest.dialect}`); -if (manifest.degraded !== true) throw new Error('Expected degraded:true for Postgres PGSS v1'); -if (manifest.baselineFirstRun !== expectedFirstRun) { - throw new Error(`Expected baselineFirstRun:${expectedFirstRun}, got ${manifest.baselineFirstRun}`); +function assert(condition, message) { + if (!condition) throw new Error(message); } -if (!manifest.pgServerVersion) throw new Error('Expected pgServerVersion'); -if (!manifest.statsResetAt) throw new Error('Expected statsResetAt'); -if (!Array.isArray(manifest.templates) || manifest.templates.length === 0) { - throw new 
Error('Expected at least one staged historic-SQL template'); + +assert(manifest.source === 'historic-sql', `Expected source historic-sql, got ${manifest.source}`); +assert(manifest.dialect === 'postgres', `Expected dialect postgres, got ${manifest.dialect}`); +assert(Number.isInteger(manifest.snapshotRowCount) && manifest.snapshotRowCount > 0, 'Expected snapshotRowCount > 0'); +assert(Number.isInteger(manifest.touchedTableCount) && manifest.touchedTableCount > 0, 'Expected touchedTableCount > 0'); +assert(Number.isInteger(manifest.parseFailures), 'Expected numeric parseFailures'); +assert(Array.isArray(manifest.warnings), 'Expected warnings array'); +assert(Array.isArray(manifest.probeWarnings), 'Expected probeWarnings array'); +const legacyKeys = [ + ['de', 'graded'], + ['baseline', 'FirstRun'], + ['pgServer', 'Version'], + ['stats', 'ResetAt'], + ['templates'], +].map((parts) => parts.join('')); +for (const legacyKey of legacyKeys) { + assert(!(legacyKey in manifest), `Legacy manifest key is still present: ${legacyKey}`); } + +function assertPatternShards(root) { + const shardDir = join(root, 'patterns-input'); + assert(existsSync(shardDir), 'Expected patterns-input shard directory'); + const shardFiles = readdirSync(shardDir) + .filter((file) => /^part-\d{4}\.json$/.test(file)) + .sort() + .map((file) => `patterns-input/${file}`); + assert(shardFiles.length > 0, 'Expected at least one pattern shard file'); + + for (const shardFile of shardFiles) { + const shard = JSON.parse(readFileSync(join(root, shardFile), 'utf8')); + assert(Array.isArray(shard.templates), `${shardFile}: expected templates array`); + assert(shard.templates.length > 0, `${shardFile}: expected at least one template`); + assert( + shard.templates.every((template) => Array.isArray(template.tablesTouched) && template.tablesTouched.length >= 2), + `${shardFile}: expected only cross-table pattern candidates`, + ); + } + + return shardFiles; +} + +const root = dirname(manifestPath); +const tableDir = 
join(root, 'tables'); +const tableFiles = readdirSync(tableDir).filter((file) => file.endsWith('.json')).sort(); +assert(tableFiles.length === manifest.touchedTableCount, `Expected ${manifest.touchedTableCount} table files, got ${tableFiles.length}`); + +const firstTable = JSON.parse(readFileSync(join(tableDir, tableFiles[0]), 'utf8')); +assert(typeof firstTable.table === 'string' && firstTable.table.length > 0, 'Expected staged table name'); +assert(firstTable.stats && typeof firstTable.stats.executionsBucket === 'string', 'Expected bucketed table stats'); +assert(firstTable.columnsByClause && typeof firstTable.columnsByClause === 'object', 'Expected columnsByClause object'); +assert(Array.isArray(firstTable.observedJoins), 'Expected observedJoins array'); +assert(Array.isArray(firstTable.topTemplates) && firstTable.topTemplates.length > 0, 'Expected topTemplates'); + +const patterns = JSON.parse(readFileSync(join(root, 'patterns-input.json'), 'utf8')); +assert(Array.isArray(patterns.templates) && patterns.templates.length > 0, 'Expected patterns-input audit templates'); +assert( + patterns.templates.every((template) => Array.isArray(template.tablesTouched) && template.tablesTouched.length > 0), + 'Expected every audit pattern template to have touched tables', +); +const shardFiles = assertPatternShards(root); +assert( + shardFiles.length <= patterns.templates.length, + `Expected shard count ${shardFiles.length} to be no greater than audit template count ${patterns.templates.length}`, +); +NODE +} + +assert_stage_record() { + local record_path="$1" + local label="$2" + local expected_work_units="$3" + node - "$record_path" "$label" "$expected_work_units" "$MAX_STAGE_SECONDS" <<'NODE' +const { readFileSync } = require('node:fs'); + +const record = JSON.parse(readFileSync(process.argv[2], 'utf8')); +const label = process.argv[3]; +const expectedWorkUnits = process.argv[4]; +const maxSeconds = Number(process.argv[5]); +function assert(condition, message) { + if 
(!condition) throw new Error(message); +} + +assert(record.status === 'done', `${label}: expected status done, got ${record.status}`); +assert(record.adapter === 'historic-sql', `${label}: expected historic-sql adapter`); +assert(record.connectionId === 'warehouse', `${label}: expected warehouse connection`); +assert(record.rawFileCount >= 4, `${label}: expected manifest, audit patterns input, pattern shard, and at least one table file`); +assert(Array.isArray(record.errors) && record.errors.length === 0, `${label}: expected no errors`); + +if (expectedWorkUnits === 'zero') { + assert(record.workUnitCount === 0, `${label}: expected zero WorkUnits, got ${record.workUnitCount}`); + assert(Array.isArray(record.workUnits) && record.workUnits.length === 0, `${label}: expected empty workUnits`); +} else if (expectedWorkUnits === 'nonzero') { + assert(record.workUnitCount > 0, `${label}: expected nonzero WorkUnits`); + const patternUnits = record.workUnits.filter((unit) => /^historic-sql-patterns-part-\d{4}$/.test(unit.unitKey)); + const patternShardRawFilePattern = new RegExp('^patterns-input/part-\\d{4}\\.json$'); + assert(patternUnits.length > 0, `${label}: expected sharded patterns WorkUnit`); + for (const unit of patternUnits) { + assert( + unit.rawFiles.some((rawFile) => patternShardRawFilePattern.test(rawFile)), + `${label}: expected ${unit.unitKey} to read a pattern shard`, + ); + assert( + !unit.rawFiles.includes('patterns-input.json'), + `${label}: expected ${unit.unitKey} not to schedule the full audit patterns input`, + ); + } + assert(record.workUnits.some((unit) => unit.unitKey.startsWith('historic-sql-table-')), `${label}: expected table WorkUnit`); +} else { + throw new Error(`${label}: unknown expected work unit mode ${expectedWorkUnits}`); +} + +const elapsedMs = Date.parse(record.completedAt) - Date.parse(record.startedAt); +assert(Number.isFinite(elapsedMs) && elapsedMs >= 0, `${label}: invalid elapsed time`); +assert(elapsedMs <= maxSeconds * 1000, 
`${label}: stage-only ingest took ${elapsedMs}ms, over ${maxSeconds}s`); NODE } run_historic_stage_only() { local job_id="$1" - node - "$KTX_ROOT" "$PROJECT_DIR" "$job_id" <<'NODE' + local record_path="$2" + node - "$KTX_ROOT" "$PROJECT_DIR" "$job_id" "$record_path" <<'NODE' +const { writeFile } = await import('node:fs/promises'); const { join } = await import('node:path'); const ktxRoot = process.argv[2]; const projectDir = process.argv[3]; const jobId = process.argv[4]; +const recordPath = process.argv[5]; const { loadKtxProject } = await import(join(ktxRoot, 'packages/context/dist/project/index.js')); const { runLocalStageOnlyIngest } = await import(join(ktxRoot, 'packages/context/dist/ingest/index.js')); const { createKtxCliLocalIngestAdapters } = await import(join(ktxRoot, 'packages/cli/dist/local-adapters.js')); @@ -81,15 +192,8 @@ const record = await runLocalStageOnlyIngest({ trigger: 'manual_resync', jobId, }); -await adapter.onPullSucceeded?.({ - connectionId: 'warehouse', - sourceKey: 'historic-sql', - syncId: record.syncId, - trigger: 'manual_resync', - completedAt: new Date(record.completedAt), - stagedDir: join(project.projectDir, '.ktx/cache/local-ingest', jobId, 'staged'), -}); -console.log(record.syncId); +await writeFile(recordPath, `${JSON.stringify(record, null, 2)}\n`, 'utf8'); +console.log(`${record.syncId} workUnits=${record.workUnitCount}`); NODE } @@ -112,25 +216,31 @@ node "$KTX_BIN" --project-dir "$PROJECT_DIR" setup \ --database-url env:WAREHOUSE_DATABASE_URL \ --database-schema public \ --enable-historic-sql \ - --historic-sql-min-calls 2 \ + --historic-sql-min-executions 2 \ --yes \ --no-input -run_historic_stage_only "historic-first-$$" +node "$KTX_BIN" runtime install --yes +node "$KTX_BIN" runtime start + +FIRST_RECORD="$PROJECT_PARENT/first-record.json" +run_historic_stage_only "historic-first-$$" "$FIRST_RECORD" FIRST_MANIFEST="$(latest_manifest)" -assert_manifest "$FIRST_MANIFEST" true +assert_unified_snapshot "$FIRST_MANIFEST" 
+assert_stage_record "$FIRST_RECORD" first nonzero + +UNCHANGED_RECORD="$PROJECT_PARENT/unchanged-record.json" +run_historic_stage_only "historic-unchanged-$$" "$UNCHANGED_RECORD" +UNCHANGED_MANIFEST="$(latest_manifest)" +assert_unified_snapshot "$UNCHANGED_MANIFEST" +assert_stage_record "$UNCHANGED_RECORD" unchanged zero "$EXAMPLE_DIR/scripts/generate-workload.sh" extra -run_historic_stage_only "historic-second-$$" -SECOND_MANIFEST="$(latest_manifest)" -assert_manifest "$SECOND_MANIFEST" false - -docker compose -f "$COMPOSE_FILE" exec -T postgres \ - psql -U postgres -d analytics -v ON_ERROR_STOP=1 -c "SELECT pg_stat_statements_reset();" >/dev/null -"$EXAMPLE_DIR/scripts/generate-workload.sh" extra -run_historic_stage_only "historic-reset-$$" -RESET_MANIFEST="$(latest_manifest)" -assert_manifest "$RESET_MANIFEST" true +CHANGED_RECORD="$PROJECT_PARENT/changed-record.json" +run_historic_stage_only "historic-changed-$$" "$CHANGED_RECORD" +CHANGED_MANIFEST="$(latest_manifest)" +assert_unified_snapshot "$CHANGED_MANIFEST" +assert_stage_record "$CHANGED_RECORD" changed nonzero echo "Postgres historic SQL smoke passed" echo "Project dir: $PROJECT_DIR" diff --git a/packages/cli/src/commands/ingest-commands.ts b/packages/cli/src/commands/ingest-commands.ts index 772c107d..5ad357e1 100644 --- a/packages/cli/src/commands/ingest-commands.ts +++ b/packages/cli/src/commands/ingest-commands.ts @@ -92,7 +92,7 @@ export function registerIngestCommands( sourceDir: options.sourceDir ? resolve(options.sourceDir) : undefined, databaseIntrospectionUrl: options.databaseIntrospectionUrl || undefined, cliVersion: context.packageInfo.version, - runtimeInstallPolicy: runtimeInstallPolicyFromFlags(options), + runtimeInstallPolicy: runtimeInstallPolicyFromFlags({ yes: options.yes }), ...(options.debugLlmRequestFile ? 
{ debugLlmRequestFile: resolve(options.debugLlmRequestFile) } : {}), outputMode: outputMode(options), ...inputMode(options), diff --git a/packages/cli/src/commands/setup-commands.ts b/packages/cli/src/commands/setup-commands.ts index 16e3cc28..f15b6680 100644 --- a/packages/cli/src/commands/setup-commands.ts +++ b/packages/cli/src/commands/setup-commands.ts @@ -117,6 +117,7 @@ function shouldShowSetupEntryMenu( enableHistoricSql?: boolean; disableHistoricSql?: boolean; historicSqlWindowDays?: number; + historicSqlMinExecutions?: number; historicSqlMinCalls?: number; historicSqlServiceAccountPattern?: string[]; historicSqlRedactionPattern?: string[]; @@ -186,6 +187,7 @@ function shouldShowSetupEntryMenu( 'enableHistoricSql', 'disableHistoricSql', 'historicSqlWindowDays', + 'historicSqlMinExecutions', 'historicSqlMinCalls', 'skipDatabases', 'source', @@ -274,9 +276,10 @@ export function registerSetupCommands(program: Command, context: KtxCliCommandCo .option('--enable-historic-sql', 'Enable Historic SQL when the selected database supports it', false) .option('--disable-historic-sql', 'Disable Historic SQL for the selected database', false) .option('--historic-sql-window-days ', 'Historic SQL query-history window', positiveInteger) + .option('--historic-sql-min-executions ', 'Minimum Historic SQL executions for a template', positiveInteger) .option( '--historic-sql-min-calls ', - 'Postgres Historic SQL pg_stat_statements minimum calls floor', + 'Alias for --historic-sql-min-executions', positiveInteger, ) .option( @@ -360,6 +363,7 @@ export function registerSetupCommands(program: Command, context: KtxCliCommandCo const mode = options.new ? 'new' : options.existing ? 'existing' : 'auto'; const resolvedAgentScope = options.global ? 'global' : options.agentScope; + const historicSqlMinExecutions = options.historicSqlMinExecutions ?? 
options.historicSqlMinCalls; await runSetupArgs(context, { command: 'run', projectDir: resolveCommandProjectDir(command), @@ -388,7 +392,7 @@ export function registerSetupCommands(program: Command, context: KtxCliCommandCo ...(options.enableHistoricSql ? { enableHistoricSql: true } : {}), ...(options.disableHistoricSql ? { disableHistoricSql: true } : {}), ...(options.historicSqlWindowDays !== undefined ? { historicSqlWindowDays: options.historicSqlWindowDays } : {}), - ...(options.historicSqlMinCalls !== undefined ? { historicSqlMinCalls: options.historicSqlMinCalls } : {}), + ...(historicSqlMinExecutions !== undefined ? { historicSqlMinExecutions } : {}), ...(options.historicSqlServiceAccountPattern.length > 0 ? { historicSqlServiceAccountPatterns: options.historicSqlServiceAccountPattern } : {}), diff --git a/packages/cli/src/doctor.test.ts b/packages/cli/src/doctor.test.ts index b40af1bd..d0ebdb95 100644 --- a/packages/cli/src/doctor.test.ts +++ b/packages/cli/src/doctor.test.ts @@ -292,10 +292,9 @@ describe('runKtxDoctor', () => { { id: 'historic-sql-postgres-warehouse', label: 'Postgres Historic SQL (warehouse)', - status: 'warn' as const, + status: 'pass' as const, detail: - 'pg_stat_statements ready (PostgreSQL 16.4) with warnings: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', - fix: `Update the Postgres parameter group or config, then rerun \`ktx dev doctor --project-dir ${tempDir}\``, + 'pg_stat_statements ready (PostgreSQL 16.4); info: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', }, ]); @@ -313,8 +312,9 @@ describe('runKtxDoctor', () => { ).resolves.toBe(0); expect(runHistoricSqlDoctorChecks).toHaveBeenCalledTimes(1); - expect(testIo.stdout()).toContain('WARN Postgres Historic SQL (warehouse): pg_stat_statements ready'); - expect(testIo.stdout()).toContain('Fix: Update the Postgres parameter group or config'); + expect(testIo.stdout()).toContain('PASS 
Postgres Historic SQL (warehouse): pg_stat_statements ready'); + expect(testIo.stdout()).toContain('info: pg_stat_statements.max is 1000'); + expect(testIo.stdout()).not.toContain('Fix: Update the Postgres parameter group or config'); }); it('warns when semantic-search embeddings are not configured', async () => { diff --git a/packages/cli/src/historic-sql-doctor.test.ts b/packages/cli/src/historic-sql-doctor.test.ts index f4e0ee7f..1c08b6e3 100644 --- a/packages/cli/src/historic-sql-doctor.test.ts +++ b/packages/cli/src/historic-sql-doctor.test.ts @@ -81,7 +81,39 @@ describe('runPostgresHistoricSqlDoctorChecks', () => { ]); }); - it('warns when the PGSS probe succeeds with operational warnings', async () => { + it('passes with an informational note when only pg_stat_statements.max is below the recommended floor', async () => { + const checks = await runPostgresHistoricSqlDoctorChecks( + projectWithConnections({ + warehouse: { + driver: 'postgres', + url: 'env:WAREHOUSE_DATABASE_URL', + readonly: true, + historicSql: { enabled: true, dialect: 'postgres' }, + }, + }), + { + postgresHistoricSqlProbe: async () => ({ + pgServerVersion: 'PostgreSQL 16.4', + warnings: [], + info: [ + 'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + ], + }), + }, + ); + + expect(checks).toEqual([ + { + id: 'historic-sql-postgres-warehouse', + label: 'Postgres Historic SQL (warehouse)', + status: 'pass', + detail: + 'pg_stat_statements ready (PostgreSQL 16.4); info: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + }, + ]); + }); + + it('warns when pg_stat_statements tracking is disabled', async () => { const checks = await runPostgresHistoricSqlDoctorChecks( projectWithConnections({ warehouse: { @@ -95,6 +127,9 @@ describe('runPostgresHistoricSqlDoctorChecks', () => { postgresHistoricSqlProbe: async () => ({ pgServerVersion: 'PostgreSQL 16.4', warnings: [ + 'pg_stat_statements.track is 
none; set it to top or all in the Postgres parameter group or config', + ], + info: [ 'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', ], }), @@ -107,7 +142,7 @@ describe('runPostgresHistoricSqlDoctorChecks', () => { label: 'Postgres Historic SQL (warehouse)', status: 'warn', detail: - 'pg_stat_statements ready (PostgreSQL 16.4) with warnings: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', + 'pg_stat_statements ready (PostgreSQL 16.4) with warnings: pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config; info: pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', fix: 'Update the Postgres parameter group or config, then rerun `ktx dev doctor --project-dir /tmp/ktx-project`', }, ]); diff --git a/packages/cli/src/historic-sql-doctor.ts b/packages/cli/src/historic-sql-doctor.ts index bb36648e..62db386b 100644 --- a/packages/cli/src/historic-sql-doctor.ts +++ b/packages/cli/src/historic-sql-doctor.ts @@ -16,6 +16,7 @@ export interface PostgresHistoricSqlDoctorProbeInput { export interface PostgresHistoricSqlDoctorProbeResult { pgServerVersion: string; warnings: string[]; + info?: string[]; } export type PostgresHistoricSqlDoctorProbe = ( @@ -72,10 +73,17 @@ function failureDetail(error: unknown): string { return String(error); } +function readinessDetail(result: PostgresHistoricSqlDoctorProbeResult): string { + const warningText = result.warnings.length > 0 ? ` with warnings: ${result.warnings.join('; ')}` : ''; + const info = result.info ?? []; + const infoText = info.length > 0 ? 
`; info: ${info.join('; ')}` : ''; + return `pg_stat_statements ready (${result.pgServerVersion})${warningText}${infoText}`; +} + async function defaultPostgresHistoricSqlProbe( input: PostgresHistoricSqlDoctorProbeInput, ): Promise { - const [{ PostgresPgssQueryHistoryReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] = + const [{ PostgresPgssReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] = await Promise.all([import('@ktx/context/ingest'), import('@ktx/connector-postgres')]); if (!isKtxPostgresConnectionConfig(input.connection)) { @@ -88,7 +96,7 @@ async function defaultPostgresHistoricSqlProbe( env: input.env, }); try { - return await new PostgresPgssQueryHistoryReader().probe(client); + return await new PostgresPgssReader().probe(client); } finally { await client.cleanup(); } @@ -134,14 +142,12 @@ export async function runPostgresHistoricSqlDoctorChecks( 'warn', checkId(connectionId), label, - `pg_stat_statements ready (${result.pgServerVersion}) with warnings: ${result.warnings.join('; ')}`, + readinessDetail(result), `Update the Postgres parameter group or config, then rerun \`ktx dev doctor --project-dir ${project.projectDir}\``, ), ); } else { - checks.push( - check('pass', checkId(connectionId), label, `pg_stat_statements ready (${result.pgServerVersion})`), - ); + checks.push(check('pass', checkId(connectionId), label, readinessDetail(result))); } } catch (error) { checks.push( diff --git a/packages/cli/src/index.test.ts b/packages/cli/src/index.test.ts index f26215d1..a575eeed 100644 --- a/packages/cli/src/index.test.ts +++ b/packages/cli/src/index.test.ts @@ -920,7 +920,7 @@ describe('runKtxCli', () => { sourceDir: tempDir, databaseIntrospectionUrl: undefined, cliVersion: '0.0.0-private', - runtimeInstallPolicy: 'never', + runtimeInstallPolicy: 'prompt', debugLlmRequestFile: `${tempDir}/debug.jsonl`, outputMode: 'json', inputMode: 'disabled', @@ -934,9 +934,9 @@ describe('runKtxCli', () => { 
expect(ingestReplayHelpIo.stderr()).toBe(''); }); - it('routes ingest managed runtime install policies', async () => { + it('routes ingest managed runtime install policy separately from visualization input mode', async () => { const autoIo = makeIo(); - const conflictIo = makeIo(); + const nonInteractiveIo = makeIo(); const ingest = vi.fn(async () => 0); await expect( @@ -972,10 +972,10 @@ describe('runKtxCli', () => { '--yes', '--no-input', ], - conflictIo.io, + nonInteractiveIo.io, { ingest }, ), - ).resolves.toBe(1); + ).resolves.toBe(0); expect(ingest).toHaveBeenCalledWith( expect.objectContaining({ @@ -985,7 +985,16 @@ describe('runKtxCli', () => { }), autoIo.io, ); - expect(conflictIo.stderr()).toContain('Choose only one runtime install mode: --yes or --no-input'); + expect(ingest).toHaveBeenCalledWith( + expect.objectContaining({ + command: 'run', + cliVersion: '0.0.0-private', + runtimeInstallPolicy: 'auto', + inputMode: 'disabled', + }), + nonInteractiveIo.io, + ); + expect(nonInteractiveIo.stderr()).toBe(''); }); it('dispatches public connection through the existing connection implementation', async () => { @@ -1182,7 +1191,7 @@ describe('runKtxCli', () => { '--enable-historic-sql', '--historic-sql-window-days', '30', - '--historic-sql-min-calls', + '--historic-sql-min-executions', '12', ], setupIo.io, @@ -1205,7 +1214,7 @@ describe('runKtxCli', () => { databaseSchemas: ['public'], enableHistoricSql: true, historicSqlWindowDays: 30, - historicSqlMinCalls: 12, + historicSqlMinExecutions: 12, skipDatabases: false, }), setupIo.io, diff --git a/packages/cli/src/ingest-viz.test.ts b/packages/cli/src/ingest-viz.test.ts index 936490d7..1347b3a8 100644 --- a/packages/cli/src/ingest-viz.test.ts +++ b/packages/cli/src/ingest-viz.test.ts @@ -22,6 +22,7 @@ import { resetVizFallbackWarningsForTest } from './viz-fallback.js'; describe('runKtxIngest viz and replay', () => { let tempDir: string; let originalTerm: string | undefined; + const interactiveEnv = (): 
NodeJS.ProcessEnv => ({ ...process.env, CI: 'false' }); beforeEach(async () => { resetVizFallbackWarningsForTest(); @@ -304,7 +305,7 @@ describe('runKtxIngest viz and replay', () => { expect(io.stdout()).toContain('KTX memory flow warehouse/fake done'); }); - it('does not attach a live memory-flow sink for plain run output', async () => { + it('attaches a plain progress memory-flow sink for interactive plain run output', async () => { const projectDir = join(tempDir, 'project'); await writeWarehouseConfig(projectDir); const sourceDir = join(tempDir, 'source'); @@ -325,11 +326,12 @@ describe('runKtxIngest viz and replay', () => { outputMode: 'plain', }, io.io, - { runLocalIngest: runLocal }, + { env: interactiveEnv(), runLocalIngest: runLocal }, ), ).resolves.toBe(0); - expect(runLocal).toHaveBeenCalledWith(expect.not.objectContaining({ memoryFlow: expect.anything() })); + expect(runLocal).toHaveBeenCalledWith(expect.objectContaining({ memoryFlow: expect.anything() })); + expect(io.stdout()).toContain('[5%] Fetching source files for warehouse/fake'); expect(io.stdout()).toContain('Job: plain-run'); expect(io.stdout()).not.toContain('KTX memory flow'); }); @@ -395,6 +397,7 @@ describe('runKtxIngest viz and replay', () => { }, io.io, { + env: interactiveEnv(), runLocalIngest: runLocal, startLiveMemoryFlow, jobIdFactory: () => 'raw-missing-viz-run', @@ -403,7 +406,8 @@ describe('runKtxIngest viz and replay', () => { ).resolves.toBe(0); expect(startLiveMemoryFlow).not.toHaveBeenCalled(); - expect(runLocal).toHaveBeenCalledWith(expect.not.objectContaining({ memoryFlow: expect.anything() })); + expect(runLocal).toHaveBeenCalledWith(expect.objectContaining({ memoryFlow: expect.anything() })); + expect(io.stdout()).toContain('[5%] Fetching source files for warehouse/fake'); expect(io.stdout()).toContain('Job: raw-missing-viz-run'); expect(io.stdout()).not.toContain('KTX memory flow'); expect(io.stderr()).toContain( diff --git a/packages/cli/src/ingest.test.ts 
b/packages/cli/src/ingest.test.ts index a2784266..9fc4dc82 100644 --- a/packages/cli/src/ingest.test.ts +++ b/packages/cli/src/ingest.test.ts @@ -36,6 +36,7 @@ import { resetVizFallbackWarningsForTest } from './viz-fallback.js'; describe('runKtxIngest', () => { let tempDir: string; let originalTerm: string | undefined; + const interactiveEnv = (): NodeJS.ProcessEnv => ({ ...process.env, CI: 'false' }); beforeEach(async () => { resetVizFallbackWarningsForTest(); @@ -544,6 +545,63 @@ describe('runKtxIngest', () => { expect(io.stdout()).toContain('Diff: +2/~0/-0/=0\n'); }); + it('includes historic-sql projection output in saved memory counts', async () => { + const projectDir = join(tempDir, 'project'); + await writeWarehouseConfig(projectDir); + const runLocal = vi.fn(async (input: RunLocalIngestOptions) => { + const result = completedLocalBundleRun(input, 'historic-sql-projection'); + return { + ...result, + report: localFakeBundleReport('historic-sql-projection', { + sourceKey: 'historic-sql', + body: { + workUnits: [], + postProcessor: { + sourceKey: 'historic-sql', + status: 'success', + result: { + tableUsageMerged: 56, + staleTablesMarked: 1, + patternPagesWritten: 30, + stalePatternPagesMarked: 2, + archivedPatternPages: 3, + legacyPagesDeleted: 4, + }, + errors: [], + warnings: [], + touchedSources: [], + }, + }, + }), + }; + }); + + const io = makeIo(); + await expect( + runKtxIngest( + { + command: 'run', + projectDir, + connectionId: 'warehouse', + adapter: 'historic-sql', + outputMode: 'plain', + }, + io.io, + { + runLocalIngest: runLocal, + createAdapters: vi.fn(() => [ + { source: 'historic-sql', skillNames: [], detect: async () => true, chunk: async () => ({ workUnits: [] }) }, + ]), + jobIdFactory: () => 'historic-sql-projection', + }, + ), + ).resolves.toBe(0); + + expect(io.stderr()).toBe(''); + expect(io.stdout()).toContain('Adapter: historic-sql\n'); + expect(io.stdout()).toContain('Saved memory: 39 wiki, 57 SL\n'); + }); + it('returns a non-zero 
code when local ingest reports failed work units', async () => { const projectDir = join(tempDir, 'project'); await writeWarehouseConfig(projectDir); @@ -715,7 +773,7 @@ describe('runKtxIngest', () => { ' historicSql:', ' enabled: true', ' dialect: postgres', - ' minCalls: 2', + ' minExecutions: 2', 'ingest:', ' adapters:', ' - historic-sql', @@ -762,6 +820,104 @@ describe('runKtxIngest', () => { ); }); + it('prints live progress for plain local ingest in interactive terminals', async () => { + const projectDir = join(tempDir, 'historic-sql-progress-project'); + await mkdir(projectDir, { recursive: true }); + await writeFile( + join(projectDir, 'ktx.yaml'), + [ + 'project: historic-sql-progress-project', + 'connections:', + ' warehouse:', + ' driver: postgres', + ' url: env:WAREHOUSE_DATABASE_URL', + ' historicSql:', + ' enabled: true', + ' dialect: postgres', + ' minExecutions: 2', + 'ingest:', + ' adapters:', + ' - historic-sql', + '', + ].join('\n'), + 'utf-8', + ); + const createdAdapters: SourceAdapter[] = [ + { source: 'historic-sql', skillNames: [], detect: async () => true, chunk: async () => ({ workUnits: [] }) }, + ]; + const createAdapters = vi.fn(() => createdAdapters as never); + const runLocal = vi.fn(async (input: RunLocalIngestOptions) => { + expect(input.memoryFlow).toBeDefined(); + input.memoryFlow?.emit({ + type: 'source_acquired', + adapter: 'historic-sql', + trigger: 'manual_resync', + fileCount: 3, + }); + input.memoryFlow?.update({ syncId: 'sync-progress-1' }); + input.memoryFlow?.emit({ type: 'raw_snapshot_written', syncId: 'sync-progress-1', rawFileCount: 3 }); + input.memoryFlow?.emit({ type: 'diff_computed', added: 2, modified: 0, deleted: 0, unchanged: 1 }); + input.memoryFlow?.update({ + plannedWorkUnits: [ + { + unitKey: 'historic-sql-table-public-orders', + rawFiles: ['tables/public/orders.json'], + peerFileCount: 0, + dependencyCount: 0, + }, + ], + }); + input.memoryFlow?.emit({ type: 'chunks_planned', chunkCount: 1, workUnitCount: 
1, evictionCount: 0 }); + input.memoryFlow?.emit({ + type: 'work_unit_started', + unitKey: 'historic-sql-table-public-orders', + skills: ['historic_sql_table_digest'], + stepBudget: 40, + }); + input.memoryFlow?.emit({ + type: 'work_unit_finished', + unitKey: 'historic-sql-table-public-orders', + status: 'success', + }); + input.memoryFlow?.emit({ type: 'saved', commitSha: null, wikiCount: 0, slCount: 1 }); + input.memoryFlow?.emit({ type: 'provenance_recorded', rowCount: 3 }); + input.memoryFlow?.emit({ type: 'report_created', runId: 'run-live-1', reportPath: 'report-live-1' }); + input.memoryFlow?.finish('done'); + return completedLocalBundleRun(input, input.jobId ?? 'historic-progress-job'); + }); + const io = makeIo({ isTTY: true }); + + await expect( + runKtxIngest( + { + command: 'run', + projectDir, + connectionId: 'warehouse', + adapter: 'historic-sql', + outputMode: 'plain', + }, + io.io, + { + env: interactiveEnv(), + createAdapters, + runLocalIngest: runLocal, + jobIdFactory: () => 'historic-progress-job', + }, + ), + ).resolves.toBe(0); + + const stdout = io.stdout(); + expect(stdout).toContain('[5%] Fetching source files for warehouse/historic-sql'); + expect(stdout).toContain('[15%] Fetched 3 source files from historic-sql'); + expect(stdout).toContain('[45%] Planned 1 work unit'); + expect(stdout).toContain('[80%] Processed 1/1 work units'); + expect(stdout).toContain('[100%] Ingest completed'); + expect(stdout.indexOf('[5%] Fetching source files for warehouse/historic-sql')).toBeLessThan( + stdout.indexOf('Report: report-live-1'), + ); + expect(io.stderr()).toBe(''); + }); + it('passes local Looker pull-config options and agent runner into scheduled ingest for Looker scheduled ingest', async () => { const projectDir = join(tempDir, 'project'); await writeWarehouseConfig(projectDir); diff --git a/packages/cli/src/ingest.ts b/packages/cli/src/ingest.ts index d9f4d434..a580b3d5 100644 --- a/packages/cli/src/ingest.ts +++ b/packages/cli/src/ingest.ts @@ 
-8,11 +8,13 @@ import { ingestReportToMemoryFlowReplay, type LocalMetabaseFanoutResult, type LocalMetabaseFanoutProgress, + type MemoryFlowEvent, type MemoryFlowReplayInput, type RunLocalIngestOptions, renderMemoryFlowReplay, runLocalIngest, runLocalMetabaseIngest, + savedMemoryCountsForReport, } from '@ktx/context/ingest'; import { loadKtxProject } from '@ktx/context/project'; import { readIngestReportSnapshotFile } from './ingest-report-file.js'; @@ -88,16 +90,8 @@ function reportStatus(report: IngestReportSnapshot): 'done' | 'error' { return report.body.failedWorkUnits.length > 0 ? 'error' : 'done'; } -function reportActionCounts(report: IngestReportSnapshot): { wikiCount: number; slCount: number } { - const actions = report.body.workUnits.flatMap((workUnit) => workUnit.actions); - return { - wikiCount: actions.filter((action) => action.target === 'wiki').length, - slCount: actions.filter((action) => action.target === 'sl').length, - }; -} - function writeReportStatus(report: IngestReportSnapshot, io: KtxIngestIo): void { - const counts = reportActionCounts(report); + const counts = savedMemoryCountsForReport(report); io.stdout.write(`Report: ${report.id}\n`); io.stdout.write(`Run: ${report.runId}\n`); io.stdout.write(`Job: ${report.jobId}\n`); @@ -116,7 +110,7 @@ function writeReportStatus(report: IngestReportSnapshot, io: KtxIngestIo): void function writeMetabaseFanoutStatus(result: LocalMetabaseFanoutResult, io: KtxIngestIo): void { const counts = result.children.reduce( (acc, child) => { - const childCounts = reportActionCounts(child.report); + const childCounts = savedMemoryCountsForReport(child.report); return { wikiCount: acc.wikiCount + childCounts.wikiCount, slCount: acc.slCount + childCounts.slCount, @@ -170,6 +164,118 @@ function createMetabaseFanoutProgress( }; } +function formatDiffProgress(event: Extract): string { + return `+${event.added}/~${event.modified}/-${event.deleted}/=${event.unchanged}`; +} + +function completedWorkUnitCount(snapshot: 
MemoryFlowReplayInput): number { + return snapshot.events.filter((event) => event.type === 'work_unit_finished').length; +} + +function plainIngestEventProgress( + event: MemoryFlowEvent, + snapshot: MemoryFlowReplayInput, +): { percent: number; message: string } | null { + switch (event.type) { + case 'source_acquired': + return { + percent: 15, + message: `Fetched ${pluralize(event.fileCount, 'source file')} from ${event.adapter}`, + }; + case 'raw_snapshot_written': + return { + percent: 25, + message: `Wrote raw snapshot ${event.syncId} with ${pluralize(event.rawFileCount, 'file')}`, + }; + case 'diff_computed': + return { percent: 35, message: `Computed source diff ${formatDiffProgress(event)}` }; + case 'chunks_planned': + return { + percent: 45, + message: `Planned ${pluralize(event.workUnitCount, 'work unit')}`, + }; + case 'stage_skipped': + return { percent: 45, message: `Skipped ${event.stage}: ${event.reason}` }; + case 'work_unit_started': + return { percent: 55, message: `Processing ${event.unitKey}` }; + case 'work_unit_finished': { + const total = snapshot.plannedWorkUnits.length || completedWorkUnitCount(snapshot); + const completed = completedWorkUnitCount(snapshot); + const percent = total > 0 ? 55 + Math.round((completed / total) * 25) : 80; + return { + percent, + message: `Processed ${completed}/${total} work units`, + }; + } + case 'reconciliation_finished': + return { + percent: 85, + message: `Reconciled results with ${pluralize(event.conflictCount, 'conflict')} and ${pluralize( + event.fallbackCount, + 'fallback', + )}`, + }; + case 'saved': + return { + percent: 90, + message: `Saved memory updates (${event.wikiCount} wiki, ${event.slCount} SL)`, + }; + case 'provenance_recorded': + return { percent: 95, message: `Recorded ${pluralize(event.rowCount, 'provenance row')}` }; + case 'report_created': + return { percent: 98, message: `Created ingest report ${event.reportPath ?? 
event.runId}` }; + case 'scope_detected': + case 'work_unit_step': + case 'candidate_action': + return null; + } +} + +function shouldWritePlainIngestProgress( + outputMode: KtxIngestOutputMode, + io: KtxIngestIo, + env: NodeJS.ProcessEnv, +): boolean { + return outputMode === 'plain' && io.stdout.isTTY === true && env.CI !== 'true'; +} + +function createPlainIngestProgressRenderer( + args: Extract, + io: KtxIngestIo, +): { start(): void; update(snapshot: MemoryFlowReplayInput): void } { + let printedEvents = 0; + let lastPercent = 0; + let printedCompletion = false; + + const write = (percent: number, message: string) => { + const nextPercent = Math.max(lastPercent, Math.max(0, Math.min(100, percent))); + lastPercent = nextPercent; + io.stdout.write(`[${nextPercent}%] ${message}\n`); + }; + + return { + start() { + write(5, `Fetching source files for ${args.connectionId}/${args.adapter}`); + }, + update(snapshot) { + while (printedEvents < snapshot.events.length) { + const event = snapshot.events[printedEvents++]; + if (!event) { + continue; + } + const progress = plainIngestEventProgress(event, snapshot); + if (progress) { + write(progress.percent, progress.message); + } + } + if (!printedCompletion && snapshot.status !== 'running') { + printedCompletion = true; + write(100, snapshot.status === 'done' ? 'Ingest completed' : 'Ingest failed'); + } + }, + }; +} + function writeReportJson(report: IngestReportSnapshot, io: KtxIngestIo): void { io.stdout.write(`${JSON.stringify(report, null, 2)}\n`); } @@ -366,10 +472,14 @@ export async function runKtxIngest( }); const shouldUseLiveViz = runOutputMode === 'viz' && (args.inputMode ?? 'auto') === 'auto' && isInteractiveTerminal(io); - const initialMemoryFlow = shouldUseLiveViz ? initialRunMemoryFlowInput(args, jobId ?? 'pending') : undefined; + const plainProgress = shouldWritePlainIngestProgress(runOutputMode, io, env) + ? 
createPlainIngestProgressRenderer(args, io) + : null; + const initialMemoryFlow = + shouldUseLiveViz || plainProgress ? initialRunMemoryFlowInput(args, jobId ?? 'pending') : undefined; let latestMemoryFlowSnapshot: MemoryFlowReplayInput | null = initialMemoryFlow ?? null; - if (initialMemoryFlow && isTuiCapableIo(io)) { + if (shouldUseLiveViz && initialMemoryFlow && isTuiCapableIo(io)) { const startLiveMemoryFlow = deps.startLiveMemoryFlow ?? startLiveMemoryFlowTui; liveTui = await startLiveMemoryFlow(initialMemoryFlow, io); } @@ -382,13 +492,17 @@ export async function runKtxIngest( liveTui.update(snapshot); return; } - if (!liveTui) { + if (shouldUseLiveViz && !liveTui) { writeMemoryFlowInput(snapshot, io, { clear: true }); + return; } + plainProgress?.update(snapshot); }, }) : undefined; + plainProgress?.start(); + try { const result = await executeLocalIngest({ project, @@ -403,7 +517,7 @@ export async function runKtxIngest( ...(args.debugLlmRequestFile ? { llmDebugRequestFile: args.debugLlmRequestFile } : {}), ...(memoryFlow ? 
{ memoryFlow } : {}), }); - if (memoryFlow) { + if (shouldUseLiveViz && memoryFlow) { latestMemoryFlowSnapshot = memoryFlow.snapshot(); liveTui?.close(); liveTui = null; diff --git a/packages/cli/src/knowledge.test.ts b/packages/cli/src/knowledge.test.ts index d3db8465..0e9ed1d5 100644 --- a/packages/cli/src/knowledge.test.ts +++ b/packages/cli/src/knowledge.test.ts @@ -2,6 +2,7 @@ import { mkdtemp, rm } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { initKtxProject } from '@ktx/context/project'; +import type { KtxEmbeddingPort } from '@ktx/context'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { runKtxKnowledge } from './knowledge.js'; @@ -26,6 +27,19 @@ function makeIo() { }; } +class FakeEmbeddingPort implements KtxEmbeddingPort { + readonly maxBatchSize = 16; + + async computeEmbedding(text: string): Promise { + const lower = text.toLowerCase(); + return lower.includes('revenue') || lower.includes('arr') ? 
[1, 0] : [0, 1]; + } + + async computeEmbeddingsBulk(texts: string[]): Promise { + return Promise.all(texts.map((text) => this.computeEmbedding(text))); + } +} + describe('runKtxKnowledge', () => { let tempDir: string; @@ -92,4 +106,39 @@ describe('runKtxKnowledge', () => { expect(searchIo.stderr()).toContain('No local wiki pages found'); expect(searchIo.stderr()).toContain('ktx wiki write'); }); + + it('uses configured embeddings for semantic wiki search', async () => { + const projectDir = join(tempDir, 'semantic-project'); + await initKtxProject({ projectDir, projectName: 'warehouse' }); + + await expect( + runKtxKnowledge( + { + command: 'write', + projectDir, + key: 'historic-sql/active-contract-arr-open-tickets', + scope: 'GLOBAL', + userId: 'local', + summary: 'Active Contract ARR Ranked by Open Support Ticket Count', + content: 'Accounts ranked by annual recurring contract value and support ticket load.', + tags: ['historic-sql'], + refs: [], + slRefs: [], + }, + makeIo().io, + ), + ).resolves.toBe(0); + + const searchIo = makeIo(); + await expect( + runKtxKnowledge( + { command: 'search', projectDir, query: 'revenue', userId: 'local' }, + searchIo.io, + { embeddingService: new FakeEmbeddingPort() }, + ), + ).resolves.toBe(0); + + expect(searchIo.stdout()).toContain('historic-sql/active-contract-arr-open-tickets'); + expect(searchIo.stderr()).toBe(''); + }); }); diff --git a/packages/cli/src/knowledge.ts b/packages/cli/src/knowledge.ts index 89afda8e..40cc5372 100644 --- a/packages/cli/src/knowledge.ts +++ b/packages/cli/src/knowledge.ts @@ -1,3 +1,8 @@ +import { + createLocalKtxEmbeddingProviderFromConfig, + KtxIngestEmbeddingPortAdapter, + type KtxEmbeddingPort, +} from '@ktx/context'; import { loadKtxProject } from '@ktx/context/project'; import { type LocalKnowledgeScope, @@ -29,7 +34,29 @@ interface KtxKnowledgeIo { stderr: { write(chunk: string): void }; } -export async function runKtxKnowledge(args: KtxKnowledgeArgs, io: KtxKnowledgeIo = process): 
Promise { +interface KtxKnowledgeDeps { + embeddingService?: KtxEmbeddingPort | null; + createEmbeddingProvider?: typeof createLocalKtxEmbeddingProviderFromConfig; +} + +function wikiSearchEmbeddingService( + project: Awaited>, + deps: KtxKnowledgeDeps, +): KtxEmbeddingPort | null { + if ('embeddingService' in deps) { + return deps.embeddingService ?? null; + } + const provider = (deps.createEmbeddingProvider ?? createLocalKtxEmbeddingProviderFromConfig)( + project.config.ingest.embeddings, + ); + return provider ? new KtxIngestEmbeddingPortAdapter(provider) : null; +} + +export async function runKtxKnowledge( + args: KtxKnowledgeArgs, + io: KtxKnowledgeIo = process, + deps: KtxKnowledgeDeps = {}, +): Promise { try { const project = await loadKtxProject({ projectDir: args.projectDir }); if (args.command === 'list') { @@ -51,7 +78,11 @@ export async function runKtxKnowledge(args: KtxKnowledgeArgs, io: KtxKnowledgeIo return 0; } if (args.command === 'search') { - const results = await searchLocalKnowledgePages(project, { query: args.query, userId: args.userId }); + const results = await searchLocalKnowledgePages(project, { + query: args.query, + userId: args.userId, + embeddingService: wikiSearchEmbeddingService(project, deps), + }); if (results.length === 0) { const pages = await listLocalKnowledgePages(project, { userId: args.userId }); if (pages.length === 0) { diff --git a/packages/cli/src/local-adapters.test.ts b/packages/cli/src/local-adapters.test.ts new file mode 100644 index 00000000..517c0588 --- /dev/null +++ b/packages/cli/src/local-adapters.test.ts @@ -0,0 +1,141 @@ +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { loadKtxProject } from '@ktx/context/project'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { createKtxCliLocalIngestAdapters } from './local-adapters.js'; + +function sqlAnalysisStub() { + return { + async 
analyzeForFingerprint(sql: string) { + return { + fingerprint: 'fp', + normalizedSql: sql, + tablesTouched: [], + literalSlots: [], + }; + }, + async analyzeBatch() { + return new Map(); + }, + }; +} + +async function writeProject(projectDir: string, body: string): Promise { + await writeFile(join(projectDir, 'ktx.yaml'), body, 'utf-8'); +} + +describe('CLI local ingest adapters', () => { + let tempDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-cli-local-adapters-')); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('registers Postgres historic SQL from the requested connection', async () => { + await writeProject( + tempDir, + [ + 'project: warehouse', + 'connections:', + ' warehouse:', + ' driver: postgres', + ' url: env:WAREHOUSE_DATABASE_URL', + ' readonly: true', + ' historicSql:', + ' enabled: true', + ' dialect: postgres', + 'ingest:', + ' adapters:', + ' - historic-sql', + '', + ].join('\n'), + ); + const project = await loadKtxProject({ projectDir: tempDir }); + + const adapters = createKtxCliLocalIngestAdapters(project, { + historicSqlConnectionId: 'warehouse', + sqlAnalysis: sqlAnalysisStub(), + }); + + expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', + ]); + }); + + it('registers BigQuery historic SQL from the requested connection', async () => { + await writeProject( + tempDir, + [ + 'project: warehouse', + 'connections:', + ' bq:', + ' driver: bigquery', + ' readonly: true', + ' dataset_id: analytics', + ' location: us', + ' credentials_json: \'{"project_id":"demo-project"}\'', + ' historicSql:', + ' enabled: true', + ' dialect: bigquery', + 'ingest:', + ' adapters:', + ' - historic-sql', + '', + ].join('\n'), + ); + const project = await loadKtxProject({ projectDir: tempDir }); + + const adapters = createKtxCliLocalIngestAdapters(project, { + 
historicSqlConnectionId: 'bq', + sqlAnalysis: sqlAnalysisStub(), + }); + + expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', + ]); + }); + + it('registers Snowflake historic SQL from the requested connection', async () => { + await writeProject( + tempDir, + [ + 'project: warehouse', + 'connections:', + ' sf:', + ' driver: snowflake', + ' readonly: true', + ' account: acct', + ' warehouse: wh', + ' database: ANALYTICS', + ' schema_name: PUBLIC', + ' username: reader', + ' password: env:SNOWFLAKE_PASSWORD', + ' historicSql:', + ' enabled: true', + ' dialect: snowflake', + 'ingest:', + ' adapters:', + ' - historic-sql', + '', + ].join('\n'), + ); + const project = await loadKtxProject({ projectDir: tempDir }); + + const adapters = createKtxCliLocalIngestAdapters(project, { + historicSqlConnectionId: 'sf', + sqlAnalysis: sqlAnalysisStub(), + }); + + expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', + ]); + }); +}); diff --git a/packages/cli/src/local-adapters.ts b/packages/cli/src/local-adapters.ts index 90d17306..d0a5f571 100644 --- a/packages/cli/src/local-adapters.ts +++ b/packages/cli/src/local-adapters.ts @@ -1,5 +1,10 @@ import { join } from 'node:path'; -import { createBigQueryLiveDatabaseIntrospection, isKtxBigQueryConnectionConfig } from '@ktx/connector-bigquery'; +import { + createBigQueryLiveDatabaseIntrospection, + isKtxBigQueryConnectionConfig, + KtxBigQueryScanConnector, + type KtxBigQueryConnectionConfig, +} from '@ktx/connector-bigquery'; import { createClickHouseLiveDatabaseIntrospection, isKtxClickHouseConnectionConfig } from '@ktx/connector-clickhouse'; import { createMysqlLiveDatabaseIntrospection, isKtxMysqlConnectionConfig } from '@ktx/connector-mysql'; import { @@ -11,15 +16,19 @@ import { import { createSqliteLiveDatabaseIntrospection, 
isKtxSqliteConnectionConfig } from '@ktx/connector-sqlite'; import { createSqlServerLiveDatabaseIntrospection, isKtxSqlServerConnectionConfig } from '@ktx/connector-sqlserver'; import { + BigQueryHistoricSqlQueryHistoryReader, createDaemonLiveDatabaseIntrospection, createDefaultLocalIngestAdapters, type DefaultLocalIngestAdaptersOptions, + type HistoricSqlReader, type LiveDatabaseIntrospectionPort, LiveDatabaseSourceAdapter, + PostgresPgssReader, + SnowflakeHistoricSqlQueryHistoryReader, type SourceAdapter, } from '@ktx/context/ingest'; import type { KtxLocalProject } from '@ktx/context/project'; -import { createHttpSqlAnalysisPort } from '@ktx/context/sql-analysis'; +import { createHttpSqlAnalysisPort, type SqlAnalysisPort } from '@ktx/context/sql-analysis'; import { createManagedDaemonLookerTableIdentifierParser, createManagedDaemonSqlAnalysisPort, @@ -35,6 +44,8 @@ function hasSnowflakeDriver(connection: unknown): boolean { ); } +type SnowflakeConnectorModule = typeof import('@ktx/connector-snowflake'); + function ktxCliDaemonDatabaseIntrospectionOptions( options: KtxCliLocalIngestAdaptersOptions, ): DefaultLocalIngestAdaptersOptions['databaseIntrospection'] { @@ -61,6 +72,9 @@ function ktxCliLookerOptions( } function ktxCliHistoricSqlAnalysis(options: KtxCliLocalIngestAdaptersOptions) { + if (options.sqlAnalysis) { + return options.sqlAnalysis; + } if (options.sqlAnalysisUrl) { return createHttpSqlAnalysisPort({ baseUrl: options.sqlAnalysisUrl }); } @@ -145,21 +159,32 @@ function createKtxCliLiveDatabaseIntrospection( export interface KtxCliLocalIngestAdaptersOptions extends DefaultLocalIngestAdaptersOptions { historicSqlConnectionId?: string; + sqlAnalysis?: SqlAnalysisPort; sqlAnalysisUrl?: string; managedDaemon?: ManagedPythonCoreDaemonOptions; } -function isEnabledPostgresHistoricSqlConnection(connection: KtxPostgresConnectionConfig | undefined): boolean { - if (!connection || !isKtxPostgresConnectionConfig(connection)) { - return false; +function 
historicSqlRecord(connection: unknown): Record | null { + if ( + connection && + typeof connection === 'object' && + 'historicSql' in connection && + typeof (connection as { historicSql?: unknown }).historicSql === 'object' && + (connection as { historicSql?: unknown }).historicSql !== null && + !Array.isArray((connection as { historicSql?: unknown }).historicSql) + ) { + return (connection as { historicSql: Record }).historicSql; } - const historicSql = - typeof connection.historicSql === 'object' && - connection.historicSql !== null && - !Array.isArray(connection.historicSql) - ? (connection.historicSql as Record) - : null; - return historicSql?.enabled === true && historicSql.dialect === 'postgres'; + return null; +} + +function enabledHistoricSqlDialect(connection: unknown): 'postgres' | 'bigquery' | 'snowflake' | null { + const historicSql = historicSqlRecord(connection); + if (historicSql?.enabled !== true) { + return null; + } + const dialect = String(historicSql.dialect ?? '').toLowerCase(); + return dialect === 'postgres' || dialect === 'bigquery' || dialect === 'snowflake' ? dialect : null; } function createEphemeralPostgresHistoricSqlClient(project: KtxLocalProject, connectionId: string) { @@ -184,20 +209,131 @@ function createEphemeralPostgresHistoricSqlClient(project: KtxLocalProject, conn }; } +function createEphemeralBigQueryHistoricSqlClient(project: KtxLocalProject, connectionId: string) { + const connection = project.config.connections[connectionId] as KtxBigQueryConnectionConfig | undefined; + if (!isKtxBigQueryConnectionConfig(connection)) { + throw new Error( + `Historic SQL local ingest requires a BigQuery connection, got ${String(connection?.driver ?? 
'unknown')}`, + ); + } + return { + async executeQuery(query: string) { + const connector = new KtxBigQueryScanConnector({ + connectionId, + connection, + }); + try { + const result = await connector.executeReadOnly({ connectionId, sql: query }, {} as never); + return { + headers: result.headers, + rows: result.rows, + totalRows: result.totalRows, + }; + } finally { + await connector.cleanup(); + } + }, + }; +} + +async function createEphemeralSnowflakeHistoricSqlClient( + project: KtxLocalProject, + connectionId: string, + connectorModule: SnowflakeConnectorModule, +) { + const connection = project.config.connections[connectionId]; + if (!connectorModule.isKtxSnowflakeConnectionConfig(connection)) { + throw new Error( + `Historic SQL local ingest requires a Snowflake connection, got ${String(connection?.driver ?? 'unknown')}`, + ); + } + return { + async executeQuery(query: string) { + const connector = new connectorModule.KtxSnowflakeScanConnector({ + connectionId, + connection, + }); + try { + const result = await connector.executeReadOnly({ connectionId, sql: query }, {} as never); + return { + headers: result.headers, + rows: result.rows, + totalRows: result.totalRows, + }; + } finally { + await connector.cleanup(); + } + }, + }; +} + +function bigQueryProjectId(connection: KtxBigQueryConnectionConfig, env: NodeJS.ProcessEnv): string { + const raw = typeof connection.credentials_json === 'string' ? connection.credentials_json : ''; + const resolved = raw.startsWith('env:') ? env[raw.slice('env:'.length)] ?? 
'' : raw; + const parsed = JSON.parse(resolved) as { project_id?: unknown }; + if (typeof parsed.project_id !== 'string' || parsed.project_id.trim().length === 0) { + throw new Error('Historic SQL BigQuery connection requires credentials_json.project_id'); + } + return parsed.project_id; +} + +function bigQueryRegion(connection: KtxBigQueryConnectionConfig): string { + return typeof connection.location === 'string' && connection.location.trim().length > 0 + ? connection.location.trim() + : 'us'; +} + function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCliLocalIngestAdaptersOptions) { const connectionId = options.historicSqlConnectionId; if (!connectionId) { return undefined; } - const connection = project.config.connections[connectionId] as KtxPostgresConnectionConfig | undefined; - if (!isEnabledPostgresHistoricSqlConnection(connection)) { + const connection = project.config.connections[connectionId]; + const dialect = enabledHistoricSqlDialect(connection); + if (!dialect) { return undefined; } - return { + + const base = { sqlAnalysis: ktxCliHistoricSqlAnalysis(options), - postgresQueryClient: createEphemeralPostgresHistoricSqlClient(project, connectionId), postgresBaselineRootDir: join(project.projectDir, '.ktx/cache/historic-sql'), }; + + if (dialect === 'postgres') { + return { + ...base, + reader: new PostgresPgssReader() satisfies HistoricSqlReader, + queryClient: createEphemeralPostgresHistoricSqlClient(project, connectionId), + }; + } + + if (dialect === 'bigquery') { + if (!isKtxBigQueryConnectionConfig(connection)) { + throw new Error( + `Historic SQL local ingest requires a BigQuery connection, got ${String(connection?.driver ?? 
'unknown')}`, + ); + } + return { + ...base, + reader: new BigQueryHistoricSqlQueryHistoryReader({ + projectId: bigQueryProjectId(connection, process.env), + region: bigQueryRegion(connection), + }) satisfies HistoricSqlReader, + queryClient: createEphemeralBigQueryHistoricSqlClient(project, connectionId), + }; + } + + return { + ...base, + reader: new SnowflakeHistoricSqlQueryHistoryReader() satisfies HistoricSqlReader, + queryClient: { + async executeQuery(query: string) { + const connectorModule = await import('@ktx/connector-snowflake'); + const client = await createEphemeralSnowflakeHistoricSqlClient(project, connectionId, connectorModule); + return client.executeQuery(query); + }, + }, + }; } export function createKtxCliLocalIngestAdapters( diff --git a/packages/cli/src/managed-python-http.test.ts b/packages/cli/src/managed-python-http.test.ts index c0153c45..7bab7ea5 100644 --- a/packages/cli/src/managed-python-http.test.ts +++ b/packages/cli/src/managed-python-http.test.ts @@ -154,6 +154,37 @@ describe('managed daemon ingest ports', () => { }); }); + it('routes SQL batch analysis through the managed daemon runner', async () => { + const requestJson = vi.fn(async () => ({ + results: { + orders: { + tables_touched: ['public.orders'], + columns_by_clause: { select: ['status'] }, + error: null, + }, + }, + })); + const sqlAnalysis = createManagedDaemonSqlAnalysisPort({ requestJson }); + + await expect(sqlAnalysis.analyzeBatch([{ id: 'orders', sql: 'select status from public.orders' }], 'postgres')) + .resolves.toEqual( + new Map([ + [ + 'orders', + { + tablesTouched: ['public.orders'], + columnsByClause: { select: ['status'] }, + error: null, + }, + ], + ]), + ); + expect(requestJson).toHaveBeenCalledWith('/sql/analyze-batch', { + dialect: 'postgres', + items: [{ id: 'orders', sql: 'select status from public.orders' }], + }); + }); + it('returns live-database daemon request options backed by the managed runner', async () => { const requestJson = vi.fn(async () 
=> ({ connection_id: 'warehouse', diff --git a/packages/cli/src/setup-context.ts b/packages/cli/src/setup-context.ts index f88635f4..fc7a1aef 100644 --- a/packages/cli/src/setup-context.ts +++ b/packages/cli/src/setup-context.ts @@ -767,6 +767,9 @@ export async function runKtxSetupContextStep( const missing = missingCapabilities(project); if (missing.length > 0) { + if (args.allowEmpty === true) { + return { status: 'skipped', projectDir: args.projectDir }; + } writeMissingCapabilities(missing, io); return { status: 'missing-input', projectDir: args.projectDir }; } diff --git a/packages/cli/src/setup-databases.test.ts b/packages/cli/src/setup-databases.test.ts index 3f268ce8..09b9d29f 100644 --- a/packages/cli/src/setup-databases.test.ts +++ b/packages/cli/src/setup-databases.test.ts @@ -64,6 +64,8 @@ function textInputPrompt(message: string): string { return `${title}\n\n${bodyLines.join('\n')}\nPress Escape to go back.\n`; } +const legacyHistoricSqlServiceAccountPatternsKey = ['serviceAccount', 'UserPatterns'].join(''); + describe('setup databases step', () => { let tempDir: string; @@ -1230,14 +1232,21 @@ describe('setup databases step', () => { enabled: true, dialect: 'snowflake', windowDays: 30, - serviceAccountUserPatterns: ['^svc_'], + filters: { + dropTrivialProbes: true, + serviceAccounts: { + patterns: ['^svc_'], + mode: 'exclude', + }, + }, redactionPatterns: ['(?i)secret'], }, }); + expect(config.connections.snowflake.historicSql).not.toHaveProperty(legacyHistoricSqlServiceAccountPatternsKey); expect(config.ingest.adapters).toContain('historic-sql'); }); - it('writes Postgres Historic SQL config with minCalls and ignores window/redaction output', async () => { + it('writes Postgres Historic SQL config with minExecutions and ignores window/redaction output', async () => { const io = makeIo(); const result = await runKtxSetupDatabasesStep( { @@ -1249,7 +1258,7 @@ describe('setup databases step', () => { databaseSchemas: ['public'], enableHistoricSql: 
true, historicSqlWindowDays: 30, - historicSqlMinCalls: 12, + historicSqlMinExecutions: 12, historicSqlServiceAccountPatterns: ['^svc_'], historicSqlRedactionPatterns: ['(?i)secret'], skipDatabases: false, @@ -1271,13 +1280,20 @@ describe('setup databases step', () => { historicSql: { enabled: true, dialect: 'postgres', - minCalls: 12, - maxTemplatesPerRun: 5000, - serviceAccountUserPatterns: ['^svc_'], + minExecutions: 12, + filters: { + dropTrivialProbes: true, + serviceAccounts: { + patterns: ['^svc_'], + mode: 'exclude', + }, + }, }, }); + expect(config.connections.warehouse.historicSql).not.toHaveProperty('minCalls'); expect(config.connections.warehouse.historicSql).not.toHaveProperty('windowDays'); expect(config.connections.warehouse.historicSql).not.toHaveProperty('redactionPatterns'); + expect(config.connections.warehouse.historicSql).not.toHaveProperty(legacyHistoricSqlServiceAccountPatternsKey); expect(config.ingest.adapters).toContain('historic-sql'); expect(io.stdout()).toContain('Historic SQL probe...'); expect(io.stdout()).toContain('pg_stat_statements ready'); @@ -1324,10 +1340,13 @@ describe('setup databases step', () => { enabled: true, dialect: 'bigquery', windowDays: 45, - serviceAccountUserPatterns: [], + filters: { + dropTrivialProbes: true, + }, redactionPatterns: [], }, }); + expect(config.connections.analytics.historicSql).not.toHaveProperty(legacyHistoricSqlServiceAccountPatternsKey); expect(config.ingest.adapters).toContain('historic-sql'); }); @@ -1354,7 +1373,7 @@ describe('setup databases step', () => { databaseConnectionIds: ['warehouse'], databaseSchemas: [], enableHistoricSql: true, - historicSqlMinCalls: 8, + historicSqlMinExecutions: 8, skipDatabases: false, }, io.io, @@ -1371,11 +1390,13 @@ describe('setup databases step', () => { historicSql: { enabled: true, dialect: 'postgres', - minCalls: 8, - maxTemplatesPerRun: 5000, - serviceAccountUserPatterns: [], + minExecutions: 8, + filters: { + dropTrivialProbes: true, + }, }, }); + 
expect(config.connections.warehouse.historicSql).not.toHaveProperty(legacyHistoricSqlServiceAccountPatternsKey); }); it('prints a non-blocking Postgres Historic SQL probe failure after connection test succeeds', async () => { diff --git a/packages/cli/src/setup-databases.ts b/packages/cli/src/setup-databases.ts index 1838725d..bd554590 100644 --- a/packages/cli/src/setup-databases.ts +++ b/packages/cli/src/setup-databases.ts @@ -34,6 +34,7 @@ export interface KtxSetupDatabasesArgs { enableHistoricSql?: boolean; disableHistoricSql?: boolean; historicSqlWindowDays?: number; + historicSqlMinExecutions?: number; historicSqlMinCalls?: number; historicSqlServiceAccountPatterns?: string[]; historicSqlRedactionPatterns?: string[]; @@ -226,7 +227,7 @@ async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Pr const project = await loadKtxProject({ projectDir: input.projectDir }); const connection = project.config.connections[input.connectionId]; - const [{ PostgresPgssQueryHistoryReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] = + const [{ PostgresPgssReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] = await Promise.all([import('@ktx/context/ingest'), import('@ktx/connector-postgres')]); const postgresConnection = connection as Parameters[0]; @@ -242,7 +243,7 @@ async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Pr connection: postgresConnection, }); try { - const result = await new PostgresPgssQueryHistoryReader().probe(client); + const result = await new PostgresPgssReader().probe(client); return { ok: true, lines: [ @@ -664,20 +665,20 @@ async function maybeApplyHistoricSqlConfig(input: { return { ...input.connection, historicSql: { ...existing, enabled: false, dialect } }; } - const common = { + const common: Record = { ...existing, enabled: true, dialect, - serviceAccountUserPatterns: input.args.historicSqlServiceAccountPatterns ?? 
[], + filters: historicSqlFiltersForSetup(input.args.historicSqlServiceAccountPatterns), }; + delete common[['serviceAccount', 'UserPatterns'].join('')]; if (dialect === 'postgres') { return { ...input.connection, historicSql: { ...common, - minCalls: input.args.historicSqlMinCalls ?? 5, - maxTemplatesPerRun: 5000, + minExecutions: input.args.historicSqlMinExecutions ?? input.args.historicSqlMinCalls ?? 5, }, }; } @@ -692,6 +693,21 @@ async function maybeApplyHistoricSqlConfig(input: { }; } +function historicSqlFiltersForSetup(patterns: string[] | undefined) { + const serviceAccountPatterns = patterns ?? []; + return { + dropTrivialProbes: true, + ...(serviceAccountPatterns.length > 0 + ? { + serviceAccounts: { + patterns: serviceAccountPatterns, + mode: 'exclude' as const, + }, + } + : {}), + }; +} + async function defaultTestConnection(projectDir: string, connectionId: string, io: KtxCliIo): Promise { return await runKtxConnection({ command: 'test', projectDir, connectionId }, io); } diff --git a/packages/cli/src/setup.test.ts b/packages/cli/src/setup.test.ts index 7cb0d0df..c8961e2a 100644 --- a/packages/cli/src/setup.test.ts +++ b/packages/cli/src/setup.test.ts @@ -1174,6 +1174,66 @@ describe('setup status', () => { expect(calls).toEqual(['model', 'embeddings', 'databases', 'sources']); }); + it('does not fail context build when prerequisites were explicitly skipped and agents are skipped', async () => { + const calls: string[] = []; + const io = makeIo(); + await writeFile( + join(tempDir, 'ktx.yaml'), + [ + 'project: revenue', + 'connections:', + ' warehouse:', + ' driver: postgres', + ' url: env:DEMO_DATABASE_URL', + ' readonly: true', + '', + ].join('\n'), + 'utf-8', + ); + + await expect( + runKtxSetup( + { + command: 'run', + projectDir: tempDir, + mode: 'existing', + agents: false, + skipAgents: true, + inputMode: 'disabled', + yes: true, + cliVersion: '0.2.0', + skipLlm: true, + skipEmbeddings: true, + skipDatabases: true, + skipSources: true, + 
databaseSchemas: [], + }, + io.io, + { + model: async () => { + calls.push('model'); + return { status: 'skipped', projectDir: tempDir }; + }, + embeddings: async () => { + calls.push('embeddings'); + return { status: 'skipped', projectDir: tempDir }; + }, + databases: async () => { + calls.push('databases'); + return { status: 'skipped', projectDir: tempDir }; + }, + sources: async () => { + calls.push('sources'); + return { status: 'skipped', projectDir: tempDir }; + }, + }, + ), + ).resolves.toBe(0); + + expect(calls).toEqual(['model', 'embeddings', 'databases', 'sources']); + expect(io.stderr()).not.toContain('KTX cannot build agent-ready context yet.'); + }); + it('runs context after sources and before agents in full setup', async () => { const calls: string[] = []; const io = makeIo(); diff --git a/packages/cli/src/setup.ts b/packages/cli/src/setup.ts index 5eac2e27..89c5dcdc 100644 --- a/packages/cli/src/setup.ts +++ b/packages/cli/src/setup.ts @@ -82,6 +82,7 @@ export type KtxSetupArgs = enableHistoricSql?: boolean; disableHistoricSql?: boolean; historicSqlWindowDays?: number; + historicSqlMinExecutions?: number; historicSqlMinCalls?: number; historicSqlServiceAccountPatterns?: string[]; historicSqlRedactionPatterns?: string[]; @@ -644,6 +645,9 @@ async function runKtxSetupInner(args: KtxSetupArgs, io: KtxCliIo, deps: KtxSetup ...(args.enableHistoricSql !== undefined ? { enableHistoricSql: args.enableHistoricSql } : {}), ...(args.disableHistoricSql !== undefined ? { disableHistoricSql: args.disableHistoricSql } : {}), ...(args.historicSqlWindowDays !== undefined ? { historicSqlWindowDays: args.historicSqlWindowDays } : {}), + ...(args.historicSqlMinExecutions !== undefined + ? { historicSqlMinExecutions: args.historicSqlMinExecutions } + : {}), ...(args.historicSqlMinCalls !== undefined ? { historicSqlMinCalls: args.historicSqlMinCalls } : {}), ...(args.historicSqlServiceAccountPatterns ? 
{ historicSqlServiceAccountPatterns: args.historicSqlServiceAccountPatterns } diff --git a/packages/connector-postgres/src/connector.test.ts b/packages/connector-postgres/src/connector.test.ts index 3bdfc109..9e6e1db8 100644 --- a/packages/connector-postgres/src/connector.test.ts +++ b/packages/connector-postgres/src/connector.test.ts @@ -129,6 +129,25 @@ describe('KtxPostgresScanConnector', () => { options: '-c search_path=analytics,public', ssl: { rejectUnauthorized: false }, }); + const libpqPreferConfig = postgresPoolConfigFromConfig({ + connectionId: 'warehouse', + connection: { + driver: 'postgres', + url: 'env:DEMO_DATABASE_URL', + readonly: true, + }, + env: { + DEMO_DATABASE_URL: 'postgresql://reader@demo.example.test:5432/demo?sslmode=prefer', + }, + }); + expect(libpqPreferConfig).toMatchObject({ + host: 'demo.example.test', + port: 5432, + database: 'demo', + user: 'reader', + }); + expect(libpqPreferConfig).not.toHaveProperty('connectionString'); + expect(libpqPreferConfig).not.toHaveProperty('ssl'); expect(() => postgresPoolConfigFromConfig({ connectionId: 'warehouse', diff --git a/packages/connector-postgres/src/connector.ts b/packages/connector-postgres/src/connector.ts index a780663f..288ef25c 100644 --- a/packages/connector-postgres/src/connector.ts +++ b/packages/connector-postgres/src/connector.ts @@ -57,6 +57,8 @@ export interface KtxPostgresConnectionConfig { schema?: string; schemas?: string[]; ssl?: boolean; + sslmode?: string; + sslMode?: string; rejectUnauthorized?: boolean; readonly?: boolean; [key: string]: unknown; @@ -253,15 +255,22 @@ function numberValue(value: unknown): number | undefined { function parsePostgresUrl(url: string): Partial { const parsed = new URL(url); + const sslmode = parsed.searchParams.get('sslmode') ?? undefined; return { host: parsed.hostname, port: parsed.port ? Number(parsed.port) : undefined, database: parsed.pathname.replace(/^\/+/, '') || undefined, username: parsed.username ? 
decodeURIComponent(parsed.username) : undefined, password: parsed.password ? decodeURIComponent(parsed.password) : undefined, + ...(sslmode ? { sslmode } : {}), }; } +function normalizedSslMode(connection: KtxPostgresConnectionConfig): string | undefined { + const value = connection.sslmode ?? connection.sslMode; + return typeof value === 'string' && value.trim().length > 0 ? value.trim().toLowerCase() : undefined; +} + function schemasFromConnection(connection: KtxPostgresConnectionConfig): string[] { if (Array.isArray(connection.schemas) && connection.schemas.length > 0) { return connection.schemas.filter((schema): schema is string => typeof schema === 'string' && schema.length > 0); @@ -299,6 +308,7 @@ export function postgresPoolConfigFromConfig(input: { const database = stringConfigValue(merged, 'database', env); const user = stringConfigValue(merged, 'username', env) ?? stringConfigValue(merged, 'user', env); const password = stringConfigValue(merged, 'password', env); + const sslmode = normalizedSslMode(merged); if (!referencedUrl && !host) { throw new Error(`Native PostgreSQL connector requires connections.${input.connectionId}.host or url`); @@ -314,7 +324,7 @@ export function postgresPoolConfigFromConfig(input: { max: 10, idleTimeoutMillis: 30_000, connectionTimeoutMillis: 10_000, - ...(referencedUrl + ...(referencedUrl && sslmode !== 'prefer' && sslmode !== 'disable' ? { connectionString: referencedUrl } : { host, port: numberValue(merged.port) ?? 5432, database, user, password }), }; @@ -322,7 +332,7 @@ export function postgresPoolConfigFromConfig(input: { if (searchPathSchemas.length > 0) { config.options = `-c search_path=${searchPathSchemas.join(',')}`; } - if (merged.ssl) { + if (merged.ssl && sslmode !== 'prefer' && sslmode !== 'disable') { config.ssl = { rejectUnauthorized: merged.rejectUnauthorized ?? 
true }; } return config; diff --git a/packages/context/prompts/skills/page_triage_classifier.md b/packages/context/prompts/skills/page_triage_classifier.md index c449b312..5a6d7e23 100644 --- a/packages/context/prompts/skills/page_triage_classifier.md +++ b/packages/context/prompts/skills/page_triage_classifier.md @@ -18,68 +18,6 @@ Analytics evidence (BI tools like Looker, Metabase, Tableau) is durable knowledg Treat dashboard/Look filter values, saved aggregations, calculated fields, and named tiles as candidate metric/segment definitions — they are durable. Do **not** mark BI evidence as `skip` solely because it is "configuration" or "tied to a data model"; that is exactly the durable knowledge we want to capture. -Historic SQL query-history evidence is durable when usage signals show a repeated pattern worth memory work. For `signals.objectType === "historic_sql_template"`: - -- If `propertyHints.executions_bucket=low AND distinct_users_bucket=solo`, return `skip`. A one-off query by one user is indexed evidence, but it is too weak to produce durable knowledge candidates. -- Else if `propertyHints.service_account_only=true AND below the frequency floor`, return `light`. Treat `executions_bucket=low` or `distinct_users_bucket=solo` as below the frequency floor for this rule. Service-account-only templates can preserve useful SQL evidence, but should not occupy a full WorkUnit unless other signals show shared human usage. -- Otherwise apply the standard full/light/skip logic to the page excerpt. Favor `full` for shared human usage with mid or high execution volume, especially when `tables_touched`, normalized SQL, and slot classifications define a reusable metric, segment, threshold, or operational query pattern. 
- -Historic-SQL synthetic signal examples: - -- skip low solo template: - -```json -{ - "objectType": "historic_sql_template", - "propertyHints": { - "executions_bucket": "low", - "distinct_users_bucket": "solo", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "slot_summary": "1 constant, 1 runtime" - } -} -``` - --> `skip` - -- light service-account-only template: - -```json -{ - "objectType": "historic_sql_template", - "propertyHints": { - "executions_bucket": "high", - "distinct_users_bucket": "solo", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "true", - "slot_summary": "1 constant, 0 runtime" - } -} -``` - --> `light` - -- full shared human template: - -```json -{ - "objectType": "historic_sql_template", - "propertyHints": { - "executions_bucket": "high", - "distinct_users_bucket": "team", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "slot_summary": "2 constant, 1 runtime" - } -} -``` - --> `full` - Examples: - `Cold Call Script` with reusable call flow, objection handling, or positioning language -> `light` when short, `full` when multi-section or ambiguous. diff --git a/packages/context/skills/historic_sql_curator/SKILL.md b/packages/context/skills/historic_sql_curator/SKILL.md deleted file mode 100644 index 615bf2ea..00000000 --- a/packages/context/skills/historic_sql_curator/SKILL.md +++ /dev/null @@ -1,153 +0,0 @@ ---- -name: historic_sql_curator -description: Reconcile historic-SQL query knowledge pages by deduping collapsed intents, cross-linking categorical sub-clusters, and demoting stale low-signal pages. -callers: [memory_agent] ---- - -# Historic SQL Curator - -Use this skill during Stage 4 reconciliation for the `historic-sql` source. It runs after `historic_sql_ingest` has written query knowledge pages from full-tier template WorkUnits. 
The Stage 4 runner may use curator pagination, so treat the current prompt as one bounded page of work and finish every listed item you inspect. - -## Input Shape - -The reconciliation prompt normally exposes: - -- `# Stage Index` with WorkUnit keys, raw paths, and wiki or SL actions from Stage 3. -- `# Eviction Set` with deleted raw paths from retired templates. -- `# Curator Pass State` when curator pagination splits reconciliation into multiple passes. -- `# Source Reconciliation Notes` with run-level notes such as staged template count. - -Use tools instead of guessing: - -- `stage_list` shows every WorkUnit raw path and action. -- `stage_diff` compares two WorkUnits by written artifact overlap. -- `read_raw_span` reads staged `metadata.json`, `page.md`, `usage.json`, and `manifest.json` snippets when page content is not enough. -- `wiki_search`, `wiki_read`, and `wiki_write` inspect and update query knowledge pages. -- `emit_artifact_resolution` records merged or subsumed wiki pages for provenance. -- `eviction_list` and `emit_eviction_decision` handle deleted raw paths. - -## Required Workflow - -1. Read the `# Stage Index`, `# Eviction Set`, `# Curator Pass State`, and `# Source Reconciliation Notes` sections first. -2. Call `stage_list` when the prompt omits raw paths or when more than one WorkUnit wrote a `queries/...` page. -3. For each successful historic-SQL WorkUnit that wrote a wiki page, call `wiki_read` on that page before deciding whether to merge, cross-link, or demote it. -4. If the page body does not show fingerprint, sub-cluster, tables, or usage clearly enough, call `read_raw_span` on that WorkUnit's `metadata.json` and `usage.json` raw paths. -5. Build intent clusters using table overlap, representative SQL shape, page summaries, fingerprints, sub-cluster IDs, and usage. Same table is not enough to merge; the business intent must collapse. -6. 
Deduplicate collapsed intents by electing one canonical page, merging useful variant details into it with `wiki_write`, and recording each merged loser with `emit_artifact_resolution`. -7. Cross-link categorical sub-cluster pages that share the same base fingerprint but differ by `__cat_...` sub-cluster ID. -8. Demote pages whose underlying cluster has decayed below the floor in the most recent 3 windows, or in the current window plus eviction evidence showing the template retired. -9. For every deleted raw path in the Eviction Set that you inspect, call `eviction_list` and then `emit_eviction_decision`. - -## Canonical Page Election - -When two or more pages describe the same query intent, choose the canonical page with this order: - -1. The clearest human-readable intent summary. -2. The page with broader non-service-account usage. -3. The page covering more fingerprints or categorical variants of the same intent. -4. The page with the most recent successful usage. -5. Lexicographically first page key. - -After electing the canonical page: - -- Read every page that will be merged. -- Update the canonical page so it contains one "Historic SQL Variants" section with fingerprints, sub-cluster IDs, tables, usage summaries, and links to sibling page keys when retained. -- Keep `tags` including `historic-sql` and `query-pattern`. -- Preserve useful `sl_refs`; when replacing refs, include the union of cleanly matched SL refs from merged pages. -- For each merged loser, call `emit_artifact_resolution` with: - -```json -{ - "rawPath": "", - "artifactKind": "wiki", - "artifactKey": "", - "actionType": "merged", - "reason": "Historic-SQL query intent collapsed into ." -} -``` - -Use `actionType: "subsumed"` only when the loser page is a thin duplicate with no unique facts worth retaining in the canonical body. - -## Categorical Sub-Cluster Cross-Links - -A categorical sub-cluster normally has a staged ID like `__cat_` or page content that says `Sub-cluster: `. 
For sibling pages that share the same base fingerprint: - -1. Read all sibling pages visible in the current Stage Index or found through `wiki_search`. -2. Keep one page per meaningful category value. -3. Add or update a "Categorical Variants" section in each sibling page: - -```markdown -### Categorical Variants -- ``: [[queries/]] - -``` - -4. Use `wiki_write` with `refs` containing the sibling page keys so cross-links also live in frontmatter. -5. Do not merge categorical siblings only because they share a fingerprint. Merge them only when the category value no longer changes intent. - -## Demotion - -Demotion preserves history; it is not deletion. A page is demoted when evidence shows its underlying cluster has fallen below the historic-SQL floor: - -- `executions < 3`, or -- `distinct_users < 2`, or -- service-account-only usage below the frequency floor, or -- the template was evicted and no active sibling or replacement page supports the same intent. - -Require the low-signal state across the most recent 3 windows when page history is available. If only the current window is visible, demote only when eviction evidence confirms the raw template retired; otherwise add a caveat and leave the page active. - -Use `wiki_write` to express demotion with the current wiki frontmatter fields: - -- Add the `historic-sql-demoted` tag while preserving `historic-sql` and `query-pattern`. -- Prefix the summary with `Demoted historic-SQL pattern: ` unless it already begins with that phrase. -- Add a `### Demotion` section in the body with the last observed usage window, the floor that failed, and the raw path or fingerprint that supports the decision. - -When demoting because of an eviction, also call `emit_eviction_decision`: - -```json -{ - "rawPath": "", - "artifactKind": "wiki", - "artifactKey": "", - "action": "retained_deprecated", - "reason": "Historic-SQL template retired or decayed below the floor; page retained with historic-sql-demoted frontmatter tag." 
-} -``` - -## What To Write - -Use `wiki_write` for every page update. The tool supports `summary`, `content`, `tags`, `refs`, and `sl_refs` frontmatter fields. - -Canonical pages should keep this body shape: - -```markdown -## -- Source: historic-sql -- Tables: -- Fingerprints: -- Usage: , , first seen , last seen - -### Representative SQL -```sql - -``` - -### Historic SQL Variants -- ``: - -### Categorical Variants -- ``: [[queries/]] - - -### Demotion -- Omit this section unless the page is demoted. -``` - -## Boundaries - -- Do not call `context_candidate_write`; historic-SQL Stage 3 writes query pages directly. -- Do not create new artifact types, stores, ports, or tables. -- Do not group low-tier templates that triage already filtered out. -- Do not merge pages on table overlap alone. -- Do not delete a query page solely because usage is low; demote it unless eviction rules and inbound-reference evidence make removal clearly safer. -- Do not copy unredacted sample `bound_sql`, user emails, account IDs, tokens, or free-text literal values into wiki or SL output. -- Do not edit SL unless the reconciliation prompt shows a concrete same-intent conflict or duplicate that requires an existing SL artifact resolution. -- Do not finish a curator pagination pass while a merged page, demoted page, or inspected eviction lacks the corresponding provenance call. diff --git a/packages/context/skills/historic_sql_ingest/SKILL.md b/packages/context/skills/historic_sql_ingest/SKILL.md deleted file mode 100644 index f8650a99..00000000 --- a/packages/context/skills/historic_sql_ingest/SKILL.md +++ /dev/null @@ -1,170 +0,0 @@ ---- -name: historic_sql_ingest -description: Convert one full-tier historic-SQL template WorkUnit into a canonical query knowledge page, linked SL refs, and optional semantic-layer proposals. -callers: [memory_agent] ---- - -# Historic SQL Ingest - -Use this skill when the WorkUnit contains files under `raw-sources//historic-sql//templates//`. 
- -Read exactly one historic-SQL template WorkUnit. Each WorkUnit represents one staged template or categorical sub-cluster that already survived full-tier page triage. It is not an intent cluster. - -## Input Shape - -The WorkUnit normally exposes: - -- `metadata.json` in `rawFiles`. -- `page.md` in `rawFiles`. -- `usage.json` in `dependencyPaths`. -- `manifest.json` in `dependencyPaths`. -- `peerFileIndex` containing sibling templates that you cannot read. - -`metadata.json` has the stable identity: - -```json -{ - "id": "fp_1", - "title": "snowflake - analytics.orders [fp_1]", - "path": "templates/fp_1/page.md", - "objectType": "historic_sql_template", - "lastEditedAt": null, - "properties": { - "fingerprint": "fp_1", - "sub_cluster_id": null, - "dialect": "snowflake", - "tables_touched": ["analytics.orders"], - "literal_slots": [ - { "position": 1, "type": "string", "classification": "constant" }, - { "position": 2, "type": "date", "classification": "runtime" } - ], - "triage_signals": { - "executions_bucket": "high", - "distinct_users_bucket": "team", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "slot_summary": "1 constant, 1 runtime" - } - } -} -``` - -`page.md` contains mechanically generated normalized SQL and touched tables: - -```text -# fp_1 - -## Normalized SQL -SELECT date_trunc(?, created_at), count(*) FROM analytics.orders WHERE status = ? AND created_at >= ? GROUP BY 1 - -## Tables touched -- analytics.orders -``` - -`usage.json` contains volatile stats, literal top values, and redacted samples. Use it for intent inference and usage summaries. Do not treat usage-only drift as a reason to group this template with siblings. - -## Required Workflow - -1. Read the WorkUnit section in the prompt first. -2. Call `read_raw_file` for `metadata.json`, `page.md`, `usage.json`, and `manifest.json`. -3. Confirm `metadata.objectType === "historic_sql_template"`. 
If it is not, call `emit_unmapped_fallback` with `reason: "parse_error"`, `fallback: "flagged"`, and the `metadata.json` raw path. -4. Extract `fingerprint`, `sub_cluster_id`, `dialect`, `tables_touched`, `literal_slots`, normalized SQL, usage stats, top literal values, and sample timestamps. -5. Infer one canonical query intent from this template only. Use table names, selected expressions, aggregations, joins, grouping, constant literal slots, and repeated successful samples. Runtime literal slots are parameters, not fixed business rules. -6. Build a short intent slug in kebab-case. Use `queries/` as the wiki key. -7. Search existing knowledge with `wiki_search` using the intent phrase and the primary table. Prefer updating an existing `queries/...` page when it is the same intent. -8. Discover touched tables with `sl_discover`. Add cleanly matched source names to `sl_refs`. If a table does not map cleanly, keep it in the page body and do not include it in `sl_refs`. -9. Write or update the query page with `wiki_write`. -10. Apply the SL proposal threshold below. If it passes and a useful generic measure, segment, join, or overlay is clear, update the semantic layer and run `sl_validate`. -11. Exit without reading peer files or grouping sibling templates. - -## Wiki Page Shape - -Use `wiki_write` for pages. Emit the spec frontmatter fields directly on the query page. 
- -Use this shape: - -```json -{ - "key": "queries/", - "summary": "", - "tags": ["historic-sql", "query-pattern"], - "sl_refs": [""], - "source": "historic-sql", - "intent": "", - "tables": [""], - "representative_sql": "", - "usage": { - "executions": 47812, - "distinct_users": 12, - "first_seen": "2026-02-01", - "last_seen": "2026-04-30", - "p50_runtime_ms": 320, - "p95_runtime_ms": 1180, - "error_rate": 0.0007 - }, - "fingerprints": [""], - "content": "## \n\n### Parameters\n- \n\n### When To Use\n- \n\n### Caveats\n- " -} -``` - -For Snowflake templates include `usage.rows_produced` when present in `usage.json`; for BigQuery v1 omit `usage.rows_produced`. - -The `key: "queries/"` value writes to `knowledge/global/queries/.md` during external ingest because bundle ingests write global wiki pages. - -## Representative SQL Rules - -- Start from normalized SQL in `page.md`. -- For constant slots, use the dominant `usage.literal_slots[].top_values[0][0]` when it has definitional meaning. Quote string and date values in the representative SQL. -- For runtime slots, render named parameters such as `:start_date`, `:as_of`, `:status`, or `:threshold`. -- For categorical slots, document the known categories and write this WorkUnit's sub-cluster value when `sub_cluster_id` is present. -- Preserve the warehouse dialect named by `metadata.properties.dialect`. -- Do not copy sample bound_sql into the wiki unless it is visibly redacted and safer than the normalized SQL. Prefer normalized SQL plus parameter notes. - -## SL Proposal Threshold - -Only propose semantic-layer changes when all are true: - -1. This WorkUnit reached Stage 3 full tier. The runner normally guarantees this, but treat `executions_bucket=low` plus `distinct_users_bucket=solo` or `service_account_only=true` as a reason to write wiki only. -2. 
At least one `literal_slots[]` entry has `classification: "constant"` and the value has durable business meaning, such as a status, plan tier, channel, threshold, or fixed category. -3. Every table in `tables_touched` maps cleanly through `sl_discover` to an existing SL source. - -When the threshold passes: - -- Call `sl_read_source` before editing an existing source. -- Prefer adding a measure, segment, computed dimension, join, or manifest-backed overlay over creating a standalone SQL source. -- Use `sl_write_source` for a manifest-backed overlay only with `name:` plus additive fields such as `measures:`, `segments:`, `description:`, or `joins:`. Do not include `sql:`, `table:`, `grain:`, or `columns:` on manifest-backed overlays. -- Use `sl_edit_source` for targeted edits when the source file already exists. -- Run `sl_validate` after every SL write or edit. -- Keep runtime parameters as caller filters. Do not bake dates, user ids, ids, search strings, or other runtime slots into SL measures. - -When the threshold does not pass, write the wiki page and set `sl_refs` for any cleanly discovered touched tables. A wiki-only result is valid. - -## Intent Inference Guidance - -Prefer canonical intent names that describe the business question, not the SQL shape: - -- Good: `queries/monthly-paid-order-count` -- Good: `queries/enterprise-contract-renewal-risk` -- Good: `queries/support-ticket-first-response-time` -- Weak: `queries/fp-1` -- Weak: `queries/count-orders-group-by-date` - -Use the SQL shape to infer intent: - -- `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, `GROUP BY`, and date truncation usually indicate metrics or rollups. -- Constant slots often name segments, statuses, tiers, regions, or thresholds. -- Runtime slots usually represent time windows, selected entities, or caller filters. -- Repeated successful samples from multiple human users make the page more durable. -- High error rates, service-account-only use, or old `last_seen` values belong in caveats. 
- -## Boundaries - -- Do not group sibling templates. Stage 4 `historic_sql_curator` owns cross-template clustering and dedupe. -- Do not read paths listed only in `peerFileIndex`. -- Do not create or update `historic_sql_curator`. -- Do not call `context_candidate_write`; historic-SQL Stage 3 writes final wiki and optional SL artifacts directly. -- Do not invent joins, measures, or definitions that are not supported by the normalized SQL, touched tables, literal slots, or existing SL sources. -- Do not copy unredacted sample `bound_sql`, user emails, account ids, tokens, or free-text literal values into wiki or SL output. -- Do not write SL changes when any touched table lacks a clean SL mapping. -- Do not finish after only an SL write. Always write or update the query knowledge page first so the canonical SQL pattern is searchable. diff --git a/packages/context/skills/historic_sql_patterns/SKILL.md b/packages/context/skills/historic_sql_patterns/SKILL.md new file mode 100644 index 00000000..33eb6fe0 --- /dev/null +++ b/packages/context/skills/historic_sql_patterns/SKILL.md @@ -0,0 +1,60 @@ +--- +name: historic_sql_patterns +description: Identify recurring cross-table historic-SQL analytical intents from a bounded pattern shard and emit typed pattern evidence for deterministic wiki projection. +callers: [memory_agent] +--- + +# Historic SQL Patterns + +Use this skill when the WorkUnit raw file is a `patterns-input/part-0001.json` style shard from the `historic-sql` adapter. Older staged bundles may still provide root `patterns-input.json`; when that is the WorkUnit raw file, read it the same way. + +## Required Workflow + +1. Read the WorkUnit notes first. +2. Find the single pattern input file listed under the WorkUnit `rawFiles` section. +3. Call `read_raw_file` for that exact raw file path. +4. Identify recurring analytical intents that span at least two tables and have repeated usage signal. +5. 
Emit one `pattern` evidence object per durable cross-table intent by calling `emit_historic_sql_evidence`. +6. Set each evidence object's `rawPath` to the exact raw file path read in step 3. +7. Stop after all pattern evidence has been emitted. + +## Evidence Shape + +Each call to `emit_historic_sql_evidence` must use this shape: + +```json +{ + "kind": "pattern", + "rawPath": "patterns-input/part-0001.json", + "pattern": { + "slug": "order-lifecycle-analysis", + "title": "Order Lifecycle Analysis", + "narrative": "Analysts compare order statuses with customer segments to understand lifecycle movement.", + "definitionSql": "select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status", + "tablesInvolved": ["public.orders", "public.customers"], + "slRefs": ["orders", "customers"], + "constituentTemplateIds": ["pg:1", "pg:2"] + } +} +``` + +The `pattern` object must match `patternOutputSchema`; multiple calls together must form `patternsArraySchema`. + +## Pattern Selection Rules + +- Prefer patterns that involve two or more tables. +- Prefer templates with `executionsBucket` at least `10-100` and `distinctUsersBucket` above solo usage. +- Merge templates into one pattern only when the business intent is the same. +- Use a stable kebab-case slug based on intent, not a template id. +- Set `definitionSql` to the clearest representative SQL from a constituent template. +- Set `slRefs` to source names when the source name is obvious from table names; omit uncertain refs rather than guessing. +- Treat each pattern shard independently; do not read peer shard files from `peerFileIndex`. + +## Boundaries + +- Do not call wiki_write. +- Do not call sl_write_source. +- Do not call sl_edit_source. +- Do not call context_candidate_write. +- Do not create single-table pattern pages. +- Do not copy credentials, tokens, user emails, or unredacted literals into evidence. 
diff --git a/packages/context/skills/historic_sql_table_digest/SKILL.md b/packages/context/skills/historic_sql_table_digest/SKILL.md new file mode 100644 index 00000000..34e49d27 --- /dev/null +++ b/packages/context/skills/historic_sql_table_digest/SKILL.md @@ -0,0 +1,61 @@ +--- +name: historic_sql_table_digest +description: Convert one changed historic-SQL table usage bucket into typed table usage evidence for deterministic _schema projection. +callers: [memory_agent] +--- + +# Historic SQL Table Digest + +Use this skill when the WorkUnit raw file is one `tables/..json` file from the `historic-sql` adapter. + +## Required Workflow + +1. Read the WorkUnit notes first. +2. Call `read_raw_file` for the single `tables/..json` raw file. +3. Read `manifest.json` only if the table JSON omits the dialect or the WorkUnit notes are unclear. +4. Produce one concise usage narrative for this table from the staged table JSON. +5. Call `emit_historic_sql_evidence` exactly once with `kind: "table_usage"`. +6. Stop after the evidence tool succeeds. + +## Evidence Shape + +Call `emit_historic_sql_evidence` with this shape: + +```json +{ + "kind": "table_usage", + "table": "public.orders", + "rawPath": "tables/public.orders.json", + "usage": { + "narrative": "Orders are repeatedly queried for paid/refunded lifecycle analysis and customer-level rollups.", + "frequencyTier": "high", + "commonFilters": ["status", "created_at"], + "commonGroupBys": ["status"], + "commonJoins": [{ "table": "public.customers", "on": ["customer_id"] }], + "staleSince": null + } +} +``` + +The `usage` object must match `tableUsageOutputSchema`. + +## Interpretation Rules + +- Treat `columnsByClause.where` as common filters. +- Treat `columnsByClause.groupBy` as common group-bys. +- Treat `observedJoins` as common joins. +- Use `stats.executionsBucket`, `stats.distinctUsersBucket`, and `stats.recencyBucket` to choose `frequencyTier`. 
+- Use `frequencyTier: "high"` only when executions and distinct users are both broad. +- Use `frequencyTier: "mid"` for repeated team usage that is not broad enough for high. +- Use `frequencyTier: "low"` for low-volume but present usage. +- Use `frequencyTier: "unused"` only when the table input explicitly says the table is stale or has no recent templates. +- Keep `narrative` short and concrete. + +## Boundaries + +- Do not call wiki_write. +- Do not call sl_write_source. +- Do not call sl_edit_source. +- Do not call context_candidate_write. +- Do not emit more than one table usage evidence object. +- Do not invent columns, joins, or tables that are absent from the staged JSON. diff --git a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/eviction-churn/input.json b/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/eviction-churn/input.json deleted file mode 100644 index 6be4e518..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/eviction-churn/input.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "name": "eviction-churn", - "now": "2026-05-08T12:00:00.000Z", - "connectionId": "warehouse", - "probe": { - "pgServerVersion": "PostgreSQL 16.4", - "warnings": [ - "pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn" - ] - }, - "snapshot": { - "statsResetAt": "2026-05-08T08:00:00.000Z", - "deallocCount": 3, - "rows": [ - { - "queryid": "501", - "userid": "11", - "username": "analyst", - "dbid": "5", - "database": "analytics", - "query": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "calls": 20, - "totalExecTime": 500, - "meanExecTime": 25, - "totalRows": 40 - } - ] - }, - "pullConfig": { - "dialect": "postgres", - "windowDays": 90, - "lastSuccessfulCursor": null, - "serviceAccountUserPatterns": [], - "redactionPatterns": [], - "maxTemplatesPerRun": 5000, - "minCalls": 5 - }, - "analysisBySql": { - "SELECT count(*) FROM 
analytics.orders WHERE status = $1": { - "fingerprint": "fp_orders_status", - "normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "tablesTouched": [ - "analytics.orders" - ], - "literalSlots": [] - } - }, - "baseline": null, - "expectedBaseline": { - "version": 1, - "fetchedAt": "2026-05-08T12:00:00.000Z", - "statsResetAt": "2026-05-08T08:00:00.000Z", - "pgServerVersion": "PostgreSQL 16.4", - "templates": { - "db5_q501": { - "firstObservedAt": "2026-05-08T12:00:00.000Z", - "perUser": { - "11": { - "calls": 20, - "totalExecTime": 500, - "totalRows": 40 - } - } - } - } - }, - "expectedFiles": { - "manifest.json": { - "json": { - "source": "historic-sql", - "connectionId": "warehouse", - "dialect": "postgres", - "fetchedAt": "2026-05-08T12:00:00.000Z", - "windowStart": "2026-05-08T08:00:00.000Z", - "windowEnd": "2026-05-08T12:00:00.000Z", - "nextSuccessfulCursor": "2026-05-08T12:00:00.000Z", - "templateCount": 1, - "capped": false, - "warnings": [ - "pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn", - "pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn", - "baseline_first_run:no_previous_pgss_baseline" - ], - "degraded": true, - "statsResetAt": "2026-05-08T08:00:00.000Z", - "baselineFirstRun": true, - "pgServerVersion": "PostgreSQL 16.4", - "deallocCount": 3, - "templates": [ - { - "id": "db5_q501", - "fingerprint": "fp_orders_status", - "subClusterId": null, - "path": "templates/db5_q501/page.md" - } - ] - } - }, - "templates/db5_q501/metadata.json": { - "json": { - "id": "db5_q501", - "title": "postgres · analytics.orders [db5_q501]", - "path": "templates/db5_q501/page.md", - "objectType": "historic_sql_template", - "lastEditedAt": null, - "properties": { - "fingerprint": "fp_orders_status", - "sub_cluster_id": null, - "dialect": "postgres", - "tables_touched": [ - "analytics.orders" - ], - "literal_slots": [], - "triage_signals": { - 
"executions_bucket": "mid", - "distinct_users_bucket": "solo", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "runtime_bucket": "fast" - } - } - } - }, - "templates/db5_q501/page.md": { - "text": "# db5_q501\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n" - }, - "templates/db5_q501/usage.json": { - "json": { - "stats": { - "executions": 20, - "distinct_users": 1, - "first_seen": "2026-05-08T12:00:00.000Z", - "last_seen": "2026-05-08T12:00:00.000Z", - "p50_runtime_ms": null, - "p95_runtime_ms": null, - "mean_runtime_ms": 25, - "error_rate": 0, - "rows_produced": 40 - }, - "literal_slots": [], - "samples": [] - } - } - } -} diff --git a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/first-run/input.json b/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/first-run/input.json deleted file mode 100644 index 5835ab3a..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/first-run/input.json +++ /dev/null @@ -1,144 +0,0 @@ -{ - "name": "first-run", - "now": "2026-05-08T12:00:00.000Z", - "connectionId": "warehouse", - "probe": { - "pgServerVersion": "PostgreSQL 16.4", - "warnings": [] - }, - "snapshot": { - "statsResetAt": "2026-05-08T08:00:00.000Z", - "deallocCount": 0, - "rows": [ - { - "queryid": "101", - "userid": "11", - "username": "analyst", - "dbid": "5", - "database": "analytics", - "query": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "calls": 10, - "totalExecTime": 250, - "meanExecTime": 25, - "totalRows": 20 - } - ] - }, - "pullConfig": { - "dialect": "postgres", - "windowDays": 90, - "lastSuccessfulCursor": null, - "serviceAccountUserPatterns": [ - "^svc_" - ], - "redactionPatterns": [], - "maxTemplatesPerRun": 5000, - "minCalls": 5 - }, - "analysisBySql": { - "SELECT count(*) FROM analytics.orders WHERE status = $1": { - 
"fingerprint": "fp_orders_status", - "normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "tablesTouched": [ - "analytics.orders" - ], - "literalSlots": [] - } - }, - "baseline": null, - "expectedBaseline": { - "version": 1, - "fetchedAt": "2026-05-08T12:00:00.000Z", - "statsResetAt": "2026-05-08T08:00:00.000Z", - "pgServerVersion": "PostgreSQL 16.4", - "templates": { - "db5_q101": { - "firstObservedAt": "2026-05-08T12:00:00.000Z", - "perUser": { - "11": { - "calls": 10, - "totalExecTime": 250, - "totalRows": 20 - } - } - } - } - }, - "expectedFiles": { - "manifest.json": { - "json": { - "source": "historic-sql", - "connectionId": "warehouse", - "dialect": "postgres", - "fetchedAt": "2026-05-08T12:00:00.000Z", - "windowStart": "2026-05-08T08:00:00.000Z", - "windowEnd": "2026-05-08T12:00:00.000Z", - "nextSuccessfulCursor": "2026-05-08T12:00:00.000Z", - "templateCount": 1, - "capped": false, - "warnings": [ - "baseline_first_run:no_previous_pgss_baseline" - ], - "degraded": true, - "statsResetAt": "2026-05-08T08:00:00.000Z", - "baselineFirstRun": true, - "pgServerVersion": "PostgreSQL 16.4", - "deallocCount": 0, - "templates": [ - { - "id": "db5_q101", - "fingerprint": "fp_orders_status", - "subClusterId": null, - "path": "templates/db5_q101/page.md" - } - ] - } - }, - "templates/db5_q101/metadata.json": { - "json": { - "id": "db5_q101", - "title": "postgres · analytics.orders [db5_q101]", - "path": "templates/db5_q101/page.md", - "objectType": "historic_sql_template", - "lastEditedAt": null, - "properties": { - "fingerprint": "fp_orders_status", - "sub_cluster_id": null, - "dialect": "postgres", - "tables_touched": [ - "analytics.orders" - ], - "literal_slots": [], - "triage_signals": { - "executions_bucket": "mid", - "distinct_users_bucket": "solo", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "runtime_bucket": "fast" - } - } - } - }, - "templates/db5_q101/page.md": { - "text": "# 
db5_q101\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n" - }, - "templates/db5_q101/usage.json": { - "json": { - "stats": { - "executions": 10, - "distinct_users": 1, - "first_seen": "2026-05-08T12:00:00.000Z", - "last_seen": "2026-05-08T12:00:00.000Z", - "p50_runtime_ms": null, - "p95_runtime_ms": null, - "mean_runtime_ms": 25, - "error_rate": 0, - "rows_produced": 20 - }, - "literal_slots": [], - "samples": [] - } - } - } -} diff --git a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/normal-delta/input.json b/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/normal-delta/input.json deleted file mode 100644 index 2cc386da..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/normal-delta/input.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "name": "normal-delta", - "now": "2026-05-08T12:00:00.000Z", - "connectionId": "warehouse", - "probe": { - "pgServerVersion": "PostgreSQL 16.4", - "warnings": [] - }, - "snapshot": { - "statsResetAt": "2026-05-08T08:00:00.000Z", - "deallocCount": 0, - "rows": [ - { - "queryid": "201", - "userid": "11", - "username": "analyst", - "dbid": "5", - "database": "analytics", - "query": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "calls": 12, - "totalExecTime": 160, - "meanExecTime": 13.333333333333334, - "totalRows": 58 - }, - { - "queryid": "201", - "userid": "12", - "username": "svc_loader", - "dbid": "5", - "database": "analytics", - "query": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "calls": 5, - "totalExecTime": 50, - "meanExecTime": 10, - "totalRows": 25 - } - ] - }, - "pullConfig": { - "dialect": "postgres", - "windowDays": 90, - "lastSuccessfulCursor": null, - "serviceAccountUserPatterns": [ - "^svc_" - ], - "redactionPatterns": [], - "maxTemplatesPerRun": 5000, - "minCalls": 5 - }, - "analysisBySql": { - "SELECT count(*) FROM 
analytics.orders WHERE status = $1": { - "fingerprint": "fp_orders_status", - "normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "tablesTouched": [ - "analytics.orders" - ], - "literalSlots": [] - } - }, - "baseline": { - "version": 1, - "fetchedAt": "2026-05-08T10:00:00.000Z", - "statsResetAt": "2026-05-08T08:00:00.000Z", - "pgServerVersion": "PostgreSQL 16.4", - "templates": { - "db5_q201": { - "firstObservedAt": "2026-05-08T09:00:00.000Z", - "perUser": { - "11": { - "calls": 10, - "totalExecTime": 100, - "totalRows": 50 - }, - "12": { - "calls": 5, - "totalExecTime": 50, - "totalRows": 25 - } - } - } - } - }, - "expectedBaseline": { - "version": 1, - "fetchedAt": "2026-05-08T12:00:00.000Z", - "statsResetAt": "2026-05-08T08:00:00.000Z", - "pgServerVersion": "PostgreSQL 16.4", - "templates": { - "db5_q201": { - "firstObservedAt": "2026-05-08T09:00:00.000Z", - "perUser": { - "11": { - "calls": 12, - "totalExecTime": 160, - "totalRows": 58 - }, - "12": { - "calls": 5, - "totalExecTime": 50, - "totalRows": 25 - } - } - } - } - }, - "expectedFiles": { - "manifest.json": { - "json": { - "source": "historic-sql", - "connectionId": "warehouse", - "dialect": "postgres", - "fetchedAt": "2026-05-08T12:00:00.000Z", - "windowStart": "2026-05-08T10:00:00.000Z", - "windowEnd": "2026-05-08T12:00:00.000Z", - "nextSuccessfulCursor": "2026-05-08T12:00:00.000Z", - "templateCount": 1, - "capped": false, - "warnings": [], - "degraded": true, - "statsResetAt": "2026-05-08T08:00:00.000Z", - "baselineFirstRun": false, - "pgServerVersion": "PostgreSQL 16.4", - "deallocCount": 0, - "templates": [ - { - "id": "db5_q201", - "fingerprint": "fp_orders_status", - "subClusterId": null, - "path": "templates/db5_q201/page.md" - } - ] - } - }, - "templates/db5_q201/metadata.json": { - "json": { - "id": "db5_q201", - "title": "postgres · analytics.orders [db5_q201]", - "path": "templates/db5_q201/page.md", - "objectType": "historic_sql_template", - "lastEditedAt": null, - 
"properties": { - "fingerprint": "fp_orders_status", - "sub_cluster_id": null, - "dialect": "postgres", - "tables_touched": [ - "analytics.orders" - ], - "literal_slots": [], - "triage_signals": { - "executions_bucket": "low", - "distinct_users_bucket": "solo", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "runtime_bucket": "fast" - } - } - } - }, - "templates/db5_q201/page.md": { - "text": "# db5_q201\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n" - }, - "templates/db5_q201/usage.json": { - "json": { - "stats": { - "executions": 2, - "distinct_users": 1, - "first_seen": "2026-05-08T09:00:00.000Z", - "last_seen": "2026-05-08T12:00:00.000Z", - "p50_runtime_ms": null, - "p95_runtime_ms": null, - "mean_runtime_ms": 30, - "error_rate": 0, - "rows_produced": 8 - }, - "literal_slots": [], - "samples": [] - } - } - } -} diff --git a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/reset-detected/input.json b/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/reset-detected/input.json deleted file mode 100644 index f2e0b16f..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/reset-detected/input.json +++ /dev/null @@ -1,159 +0,0 @@ -{ - "name": "reset-detected", - "now": "2026-05-08T12:00:00.000Z", - "connectionId": "warehouse", - "probe": { - "pgServerVersion": "PostgreSQL 16.4", - "warnings": [] - }, - "snapshot": { - "statsResetAt": "2026-05-08T11:00:00.000Z", - "deallocCount": 0, - "rows": [ - { - "queryid": "301", - "userid": "11", - "username": "analyst", - "dbid": "5", - "database": "analytics", - "query": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "calls": 3, - "totalExecTime": 90, - "meanExecTime": 30, - "totalRows": 9 - } - ] - }, - "pullConfig": { - "dialect": "postgres", - "windowDays": 90, - "lastSuccessfulCursor": null, - 
"serviceAccountUserPatterns": [], - "redactionPatterns": [], - "maxTemplatesPerRun": 5000, - "minCalls": 5 - }, - "analysisBySql": { - "SELECT count(*) FROM analytics.orders WHERE status = $1": { - "fingerprint": "fp_orders_status", - "normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "tablesTouched": [ - "analytics.orders" - ], - "literalSlots": [] - } - }, - "baseline": { - "version": 1, - "fetchedAt": "2026-05-08T10:00:00.000Z", - "statsResetAt": "2026-05-08T08:00:00.000Z", - "pgServerVersion": "PostgreSQL 16.4", - "templates": { - "db5_q301": { - "firstObservedAt": "2026-05-08T09:00:00.000Z", - "perUser": { - "11": { - "calls": 100, - "totalExecTime": 1000, - "totalRows": 500 - } - } - } - } - }, - "expectedBaseline": { - "version": 1, - "fetchedAt": "2026-05-08T12:00:00.000Z", - "statsResetAt": "2026-05-08T11:00:00.000Z", - "pgServerVersion": "PostgreSQL 16.4", - "templates": { - "db5_q301": { - "firstObservedAt": "2026-05-08T12:00:00.000Z", - "perUser": { - "11": { - "calls": 3, - "totalExecTime": 90, - "totalRows": 9 - } - } - } - } - }, - "expectedFiles": { - "manifest.json": { - "json": { - "source": "historic-sql", - "connectionId": "warehouse", - "dialect": "postgres", - "fetchedAt": "2026-05-08T12:00:00.000Z", - "windowStart": "2026-05-08T10:00:00.000Z", - "windowEnd": "2026-05-08T12:00:00.000Z", - "nextSuccessfulCursor": "2026-05-08T12:00:00.000Z", - "templateCount": 1, - "capped": false, - "warnings": [ - "baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z" - ], - "degraded": true, - "statsResetAt": "2026-05-08T11:00:00.000Z", - "baselineFirstRun": true, - "pgServerVersion": "PostgreSQL 16.4", - "deallocCount": 0, - "templates": [ - { - "id": "db5_q301", - "fingerprint": "fp_orders_status", - "subClusterId": null, - "path": "templates/db5_q301/page.md" - } - ] - } - }, - "templates/db5_q301/metadata.json": { - "json": { - "id": "db5_q301", - "title": "postgres · analytics.orders 
[db5_q301]", - "path": "templates/db5_q301/page.md", - "objectType": "historic_sql_template", - "lastEditedAt": null, - "properties": { - "fingerprint": "fp_orders_status", - "sub_cluster_id": null, - "dialect": "postgres", - "tables_touched": [ - "analytics.orders" - ], - "literal_slots": [], - "triage_signals": { - "executions_bucket": "mid", - "distinct_users_bucket": "solo", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "runtime_bucket": "fast" - } - } - } - }, - "templates/db5_q301/page.md": { - "text": "# db5_q301\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n" - }, - "templates/db5_q301/usage.json": { - "json": { - "stats": { - "executions": 3, - "distinct_users": 1, - "first_seen": "2026-05-08T12:00:00.000Z", - "last_seen": "2026-05-08T12:00:00.000Z", - "p50_runtime_ms": null, - "p95_runtime_ms": null, - "mean_runtime_ms": 30, - "error_rate": 0, - "rows_produced": 9 - }, - "literal_slots": [], - "samples": [] - } - } - } -} diff --git a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/version-change/input.json b/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/version-change/input.json deleted file mode 100644 index 1618e3ca..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/__fixtures__/postgres/version-change/input.json +++ /dev/null @@ -1,159 +0,0 @@ -{ - "name": "version-change", - "now": "2026-05-08T12:00:00.000Z", - "connectionId": "warehouse", - "probe": { - "pgServerVersion": "PostgreSQL 16.4", - "warnings": [] - }, - "snapshot": { - "statsResetAt": "2026-05-08T08:00:00.000Z", - "deallocCount": 0, - "rows": [ - { - "queryid": "401", - "userid": "11", - "username": "analyst", - "dbid": "5", - "database": "analytics", - "query": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "calls": 4, - "totalExecTime": 80, - "meanExecTime": 20, - 
"totalRows": 8 - } - ] - }, - "pullConfig": { - "dialect": "postgres", - "windowDays": 90, - "lastSuccessfulCursor": null, - "serviceAccountUserPatterns": [], - "redactionPatterns": [], - "maxTemplatesPerRun": 5000, - "minCalls": 5 - }, - "analysisBySql": { - "SELECT count(*) FROM analytics.orders WHERE status = $1": { - "fingerprint": "fp_orders_status", - "normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1", - "tablesTouched": [ - "analytics.orders" - ], - "literalSlots": [] - } - }, - "baseline": { - "version": 1, - "fetchedAt": "2026-05-08T10:00:00.000Z", - "statsResetAt": "2026-05-08T08:00:00.000Z", - "pgServerVersion": "PostgreSQL 15.7", - "templates": { - "db5_q401": { - "firstObservedAt": "2026-05-08T09:00:00.000Z", - "perUser": { - "11": { - "calls": 100, - "totalExecTime": 1000, - "totalRows": 500 - } - } - } - } - }, - "expectedBaseline": { - "version": 1, - "fetchedAt": "2026-05-08T12:00:00.000Z", - "statsResetAt": "2026-05-08T08:00:00.000Z", - "pgServerVersion": "PostgreSQL 16.4", - "templates": { - "db5_q401": { - "firstObservedAt": "2026-05-08T12:00:00.000Z", - "perUser": { - "11": { - "calls": 4, - "totalExecTime": 80, - "totalRows": 8 - } - } - } - } - }, - "expectedFiles": { - "manifest.json": { - "json": { - "source": "historic-sql", - "connectionId": "warehouse", - "dialect": "postgres", - "fetchedAt": "2026-05-08T12:00:00.000Z", - "windowStart": "2026-05-08T10:00:00.000Z", - "windowEnd": "2026-05-08T12:00:00.000Z", - "nextSuccessfulCursor": "2026-05-08T12:00:00.000Z", - "templateCount": 1, - "capped": false, - "warnings": [ - "baseline_reset:pg_server_major changed from 15 to 16" - ], - "degraded": true, - "statsResetAt": "2026-05-08T08:00:00.000Z", - "baselineFirstRun": true, - "pgServerVersion": "PostgreSQL 16.4", - "deallocCount": 0, - "templates": [ - { - "id": "db5_q401", - "fingerprint": "fp_orders_status", - "subClusterId": null, - "path": "templates/db5_q401/page.md" - } - ] - } - }, - 
"templates/db5_q401/metadata.json": { - "json": { - "id": "db5_q401", - "title": "postgres · analytics.orders [db5_q401]", - "path": "templates/db5_q401/page.md", - "objectType": "historic_sql_template", - "lastEditedAt": null, - "properties": { - "fingerprint": "fp_orders_status", - "sub_cluster_id": null, - "dialect": "postgres", - "tables_touched": [ - "analytics.orders" - ], - "literal_slots": [], - "triage_signals": { - "executions_bucket": "mid", - "distinct_users_bucket": "solo", - "error_rate_bucket": "ok", - "recency_bucket": "active", - "service_account_only": "false", - "runtime_bucket": "fast" - } - } - } - }, - "templates/db5_q401/page.md": { - "text": "# db5_q401\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n" - }, - "templates/db5_q401/usage.json": { - "json": { - "stats": { - "executions": 4, - "distinct_users": 1, - "first_seen": "2026-05-08T12:00:00.000Z", - "last_seen": "2026-05-08T12:00:00.000Z", - "p50_runtime_ms": null, - "p95_runtime_ms": null, - "mean_runtime_ms": 20, - "error_rate": 0, - "rows_produced": 8 - }, - "literal_slots": [], - "samples": [] - } - } - } -} diff --git a/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts b/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts index e0a5e07d..9d5785cb 100644 --- a/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts @@ -33,7 +33,7 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => { const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]); const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' }); - await expect(reader.probe(client)).resolves.toBeUndefined(); + await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] 
}); expect(client.executeQuery).toHaveBeenCalledWith( 'SELECT 1 FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` LIMIT 1', @@ -63,127 +63,85 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => { await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError); }); - it('fetches BigQuery jobs with cursor and maps them into RawQueryRow shape without rowsProduced', async () => { + it('fetches aggregated BigQuery query templates', async () => { const client = queryClient([ { headers: [ - 'job_id', - 'query', - 'user_email', - 'creation_time', - 'end_time', - 'runtime_ms', - 'total_slot_ms', - 'total_bytes_processed', - 'state', - 'error_reason', - 'error_message', - 'statement_type', + 'template_id', + 'canonical_sql', + 'executions', + 'distinct_users', + 'first_seen', + 'last_seen', + 'p50_ms', + 'p95_ms', + 'error_rate', + 'rows_produced', + 'top_users', ], rows: [ [ - 'bquxjob_1', - "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'", - 'analyst-a@example.test', - '2026-05-04T10:00:00.000Z', - '2026-05-04T10:00:01.250Z', - 1250, - 3106, - 161164718, - 'DONE', + 'hash-1', + 'select status from orders', + 42, + 3, + '2026-05-01T00:00:00.000Z', + '2026-05-11T00:00:00.000Z', + 12, + 40, + 0.05, null, - null, - 'SELECT', - ], - [ - 'bquxjob_2', - 'SELECT * FROM `project-1.analytics.missing_table`', - 'analyst-b@example.test', - new Date('2026-05-04T10:05:00.000Z'), - null, - null, - 0, - 0, - 'DONE', - 'notFound', - 'Not found: Table project-1.analytics.missing_table', - 'SELECT', + JSON.stringify([{ user: 'analyst@example.test', executions: 1 }]), ], ], - totalRows: 2, + totalRows: 1, }, ]); - const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' }); + const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'demo', region: 'us' }); const rows = []; - for await (const row of reader.fetch( + for await (const row of reader.fetchAggregated( client, - { 
- start: new Date('2026-05-01T00:00:00.000Z'), - end: new Date('2026-05-04T12:00:00.000Z'), - }, - '2026-05-03T00:00:00.000Z', + { start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') }, + { dialect: 'bigquery', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, )) { rows.push(row); } - expect(client.executeQuery).toHaveBeenCalledTimes(1); const sql = firstQuery(client); - expect(sql).toContain('FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`'); - expect(sql).toContain("creation_time >= TIMESTAMP('2026-05-03T00:00:00.000Z')"); - expect(sql).toContain("creation_time < TIMESTAMP('2026-05-04T12:00:00.000Z')"); - expect(sql).toContain("job_type = 'QUERY'"); - expect(sql).toContain("(statement_type IS NULL OR statement_type != 'SCRIPT')"); - expect(sql).toContain('ORDER BY creation_time ASC, job_id ASC'); - expect(sql).toContain('total_slot_ms'); - expect(sql).toContain('total_bytes_processed'); - expect(sql).not.toMatch(/total_rows/i); - - expect(rows).toEqual([ + expect(sql).toContain('COUNT(*) AS executions'); + expect(sql).toContain('COUNT(DISTINCT user_email) AS distinct_users'); + expect(sql).toContain('GROUP BY query_hash'); + expect(sql).toContain('HAVING COUNT(*) >= 5'); + expect(rows).toMatchObject([ { - id: 'bquxjob_1', - sql: "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'", - user: 'analyst-a@example.test', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: '2026-05-04T10:00:01.250Z', - runtimeMs: 1250, - success: true, - errorMessage: null, - }, - { - id: 'bquxjob_2', - sql: 'SELECT * FROM `project-1.analytics.missing_table`', - user: 'analyst-b@example.test', - startedAt: '2026-05-04T10:05:00.000Z', - endedAt: null, - runtimeMs: null, - success: false, - errorMessage: 'notFound: Not found: Table project-1.analytics.missing_table', + templateId: 'hash-1', + stats: { + executions: 42, + 
errorRate: 0.05, + }, + topUsers: [{ user: 'analyst@example.test', executions: 1 }], }, ]); }); - it('uses the window start when no cursor is available', async () => { - const client = queryClient([{ headers: ['job_id'], rows: [], totalRows: 0 }]); - const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'EU' }); - - for await (const _row of reader.fetch(client, { - start: new Date('2026-02-03T12:00:00.000Z'), - end: new Date('2026-05-04T12:00:00.000Z'), - })) { - throw new Error('empty result should not yield rows'); - } - - const sql = firstQuery(client); - expect(sql).toContain('FROM `project-1.region-eu.INFORMATION_SCHEMA.JOBS_BY_PROJECT`'); - expect(sql).toContain("creation_time >= TIMESTAMP('2026-02-03T12:00:00.000Z')"); - }); - it('throws a clear error when the query client cannot execute SQL', async () => { const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' }); await expect(async () => { - for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) { + for await (const _row of reader.fetchAggregated( + {}, + { start: new Date(), end: new Date() }, + { + dialect: 'bigquery', + minExecutions: 5, + windowDays: 90, + concurrency: 12, + filters: { dropTrivialProbes: true }, + redactionPatterns: [], + staleArchiveAfterDays: 90, + }, + )) { throw new Error('unreachable'); } }).rejects.toThrow('Historic SQL BigQuery reader requires a query client with executeQuery(query)'); diff --git a/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts b/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts index ea8fb00e..e24c50cf 100644 --- a/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts +++ b/packages/context/src/ingest/adapters/historic-sql/bigquery-query-history-reader.ts @@ -1,5 +1,10 @@ import { HistoricSqlGrantsMissingError } from './errors.js'; -import type { 
HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js'; +import { + aggregatedTemplateSchema, + type AggregatedTemplate, + type HistoricSqlTimeWindow, + type HistoricSqlUnifiedPullConfig, +} from './types.js'; interface QueryResultLike { headers: string[]; @@ -110,6 +115,23 @@ function nullableNumber(raw: unknown): number | null { return Math.max(0, number); } +function requiredNumber(raw: unknown, field: string): number { + const number = nullableNumber(raw); + if (number === null) { + throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${String(raw)}`); + } + return number; +} + +function requiredInteger(raw: unknown, field: string): number { + return Math.trunc(requiredNumber(raw, field)); +} + +function nullableInteger(raw: unknown): number | null { + const number = nullableNumber(raw); + return number === null ? null : Math.trunc(number); +} + function isoTimestamp(raw: unknown, field: string): string { if (raw instanceof Date) { return raw.toISOString(); @@ -122,43 +144,49 @@ function isoTimestamp(raw: unknown, field: string): string { return date.toISOString(); } -function nullableIsoTimestamp(raw: unknown): string | null { - if (raw === null || raw === undefined || raw === '') { - return null; +function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> { + const text = nullableString(raw); + if (!text) { + return []; } - return isoTimestamp(raw, 'end_time'); -} - -function executionSucceeded(state: string | null, errorReason: string | null, errorMessage: string | null): boolean { - if (errorReason || errorMessage) { - return false; + try { + const parsed = JSON.parse(text) as unknown; + if (!Array.isArray(parsed)) { + return []; + } + return parsed.flatMap((entry) => { + if (!entry || typeof entry !== 'object') { + return []; + } + const user = nullableString((entry as { user?: unknown }).user); + const executions = nullableInteger((entry as { executions?: unknown 
}).executions); + return executions === null ? [] : [{ user, executions }]; + }); + } catch { + return []; } - return state === null || state.toUpperCase() === 'DONE'; } -function combinedErrorMessage(errorReason: string | null, errorMessage: string | null): string | null { - if (errorReason && errorMessage) { - return `${errorReason}: ${errorMessage}`; - } - return errorMessage ?? errorReason; +function mapAggregatedRow(row: unknown[], indexes: Map): AggregatedTemplate { + return aggregatedTemplateSchema.parse({ + templateId: requiredString(value(row, indexes, 'template_id'), 'template_id'), + canonicalSql: requiredString(value(row, indexes, 'canonical_sql'), 'canonical_sql'), + dialect: 'bigquery', + stats: { + executions: requiredInteger(value(row, indexes, 'executions'), 'executions'), + distinctUsers: requiredInteger(value(row, indexes, 'distinct_users'), 'distinct_users'), + firstSeen: isoTimestamp(value(row, indexes, 'first_seen'), 'first_seen'), + lastSeen: isoTimestamp(value(row, indexes, 'last_seen'), 'last_seen'), + p50RuntimeMs: nullableNumber(value(row, indexes, 'p50_ms')), + p95RuntimeMs: nullableNumber(value(row, indexes, 'p95_ms')), + errorRate: requiredNumber(value(row, indexes, 'error_rate'), 'error_rate'), + rowsProduced: nullableInteger(value(row, indexes, 'rows_produced')), + }, + topUsers: parseTopUsers(value(row, indexes, 'top_users')), + }); } -function mapRow(row: unknown[], indexes: Map): HistoricSqlRawQueryRow { - const errorReason = nullableString(value(row, indexes, 'error_reason')); - const errorMessage = nullableString(value(row, indexes, 'error_message')); - return { - id: requiredString(value(row, indexes, 'job_id'), 'job_id'), - sql: requiredString(value(row, indexes, 'query'), 'query'), - user: nullableString(value(row, indexes, 'user_email')), - startedAt: isoTimestamp(value(row, indexes, 'creation_time'), 'creation_time'), - endedAt: nullableIsoTimestamp(value(row, indexes, 'end_time')), - runtimeMs: nullableNumber(value(row, 
indexes, 'runtime_ms')), - success: executionSucceeded(nullableString(value(row, indexes, 'state')), errorReason, errorMessage), - errorMessage: combinedErrorMessage(errorReason, errorMessage), - }; -} - -export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader { +export class BigQueryHistoricSqlQueryHistoryReader { private readonly viewPath: string; constructor(options: BigQueryHistoricSqlQueryHistoryReaderOptions) { @@ -167,7 +195,7 @@ export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHi this.viewPath = `\`${projectId}.region-${region}.INFORMATION_SCHEMA.JOBS_BY_PROJECT\``; } - async probe(client: unknown): Promise { + async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> { let result: QueryResultLike; try { result = await queryClient(client).executeQuery(`SELECT 1 FROM ${this.viewPath} LIMIT 1`); @@ -177,43 +205,43 @@ export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHi if (result.error) { throw grantsError(result.error); } + return { warnings: [], info: [] }; } - async *fetch( + async *fetchAggregated( client: unknown, window: HistoricSqlTimeWindow, - cursor?: string | null, - ): AsyncIterable { - const start = timestampExpression(cursor ?? 
window.start); - const end = timestampExpression(window.end); + config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable { const sql = ` SELECT - job_id, - query, - user_email, - creation_time, - end_time, - TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND) AS runtime_ms, - total_slot_ms, - total_bytes_processed, - state, - error_result.reason AS error_reason, - error_result.message AS error_message, - statement_type + query_hash AS template_id, + MIN(query) AS canonical_sql, + COUNT(*) AS executions, + COUNT(DISTINCT user_email) AS distinct_users, + MIN(creation_time) AS first_seen, + MAX(creation_time) AS last_seen, + APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(50)] AS p50_ms, + APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(95)] AS p95_ms, + SAFE_DIVIDE(COUNTIF(error_result IS NOT NULL), COUNT(*)) AS error_rate, + CAST(NULL AS INT64) AS rows_produced, + TO_JSON_STRING(ARRAY_AGG(STRUCT(user_email AS user, 1 AS executions) ORDER BY creation_time DESC LIMIT 5)) AS top_users FROM ${this.viewPath} -WHERE creation_time >= ${start} - AND creation_time < ${end} - AND job_type = 'QUERY' +WHERE job_type = 'QUERY' + AND statement_type IN ('SELECT', 'MERGE') + AND creation_time >= ${timestampExpression(window.start)} + AND creation_time < ${timestampExpression(window.end)} AND query IS NOT NULL - AND (statement_type IS NULL OR statement_type != 'SCRIPT') -ORDER BY creation_time ASC, job_id ASC`.trim(); +GROUP BY query_hash +HAVING COUNT(*) >= ${config.minExecutions} +ORDER BY executions DESC`.trim(); const result = await queryClient(client).executeQuery(sql); if (result.error) { throw grantsError(result.error); } const indexes = indexByHeader(result.headers); for (const row of result.rows) { - yield mapRow(row, indexes); + yield mapAggregatedRow(row, indexes); } } } diff --git a/packages/context/src/ingest/adapters/historic-sql/buckets.test.ts 
b/packages/context/src/ingest/adapters/historic-sql/buckets.test.ts new file mode 100644 index 00000000..78dc2859 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/buckets.test.ts @@ -0,0 +1,59 @@ +import { describe, expect, it } from 'vitest'; +import { + bucketDistinctUsers, + bucketErrorRate, + bucketExecutions, + bucketFrequency, + bucketP95Runtime, + bucketRecency, +} from './buckets.js'; + +describe('historic-sql bucket helpers', () => { + it('uses stable execution buckets', () => { + expect([0, 9, 10, 99, 100, 999, 1000, 4999, 5000, 49999, 50000].map(bucketExecutions)).toEqual([ + '<10', + '<10', + '10-100', + '10-100', + '100-1k', + '100-1k', + '1k-5k', + '1k-5k', + '5k-50k', + '5k-50k', + '>50k', + ]); + }); + + it('uses stable distinct-user, error-rate, runtime, and recency buckets', () => { + expect([0, 1, 2, 5, 6, 10, 11].map(bucketDistinctUsers)).toEqual([ + '0', + '1', + '2-5', + '2-5', + '5-10', + '5-10', + '>10', + ]); + expect([0, 0.01, 0.05, 0.2].map(bucketErrorRate)).toEqual(['none', 'low', 'low', 'high']); + expect([null, 99, 100, 999, 1000, 9999, 10000].map(bucketP95Runtime)).toEqual([ + 'unknown', + '<100ms', + '100ms-1s', + '100ms-1s', + '1s-10s', + '1s-10s', + '>10s', + ]); + expect(bucketRecency('2026-05-11T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('current'); + expect(bucketRecency('2026-04-20T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('recent'); + expect(bucketRecency('2026-01-01T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('stale'); + }); + + it('maps frequency counts to high, mid, and low labels', () => { + expect(bucketFrequency(80, 100)).toBe('high'); + expect(bucketFrequency(20, 100)).toBe('mid'); + expect(bucketFrequency(1, 100)).toBe('low'); + expect(bucketFrequency(0, 0)).toBe('low'); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/buckets.ts b/packages/context/src/ingest/adapters/historic-sql/buckets.ts new file mode 100644 index 
00000000..8777f826 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/buckets.ts @@ -0,0 +1,49 @@ +export function bucketExecutions(value: number): string { + if (value < 10) return '<10'; + if (value < 100) return '10-100'; + if (value < 1000) return '100-1k'; + if (value < 5000) return '1k-5k'; + if (value < 50000) return '5k-50k'; + return '>50k'; +} + +export function bucketDistinctUsers(value: number): string { + if (value <= 0) return '0'; + if (value === 1) return '1'; + if (value <= 5) return '2-5'; + if (value <= 10) return '5-10'; + return '>10'; +} + +export function bucketErrorRate(value: number): string { + if (value <= 0) return 'none'; + if (value < 0.1) return 'low'; + return 'high'; +} + +export function bucketP95Runtime(value: number | null): string { + if (value === null) return 'unknown'; + if (value < 100) return '<100ms'; + if (value < 1000) return '100ms-1s'; + if (value < 10000) return '1s-10s'; + return '>10s'; +} + +export function bucketRecency(lastSeen: string, now: Date): string { + const parsed = new Date(lastSeen); + if (Number.isNaN(parsed.getTime())) { + return 'unknown'; + } + const ageDays = (now.getTime() - parsed.getTime()) / (24 * 60 * 60 * 1000); + if (ageDays <= 7) return 'current'; + if (ageDays <= 45) return 'recent'; + return 'stale'; +} + +export function bucketFrequency(count: number, total: number): 'high' | 'mid' | 'low' { + if (total <= 0 || count <= 0) return 'low'; + const ratio = count / total; + if (ratio >= 0.5) return 'high'; + if (ratio >= 0.1) return 'mid'; + return 'low'; +} diff --git a/packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts b/packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts new file mode 100644 index 00000000..d8c0187f --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/chunk-unified.test.ts @@ -0,0 +1,182 @@ +import { mkdir, mkdtemp, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { 
join } from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js'; + +async function tempDir(): Promise { + return mkdtemp(join(tmpdir(), 'historic-sql-unified-chunk-')); +} + +async function writeJson(root: string, relPath: string, value: unknown): Promise { + const target = join(root, relPath); + await mkdir(join(target, '..'), { recursive: true }); + await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); +} + +async function writeUnifiedStagedDir(root: string): Promise { + await writeJson(root, 'manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 1, + touchedTableCount: 1, + parseFailures: 0, + warnings: [], + probeWarnings: [], + }); + await writeJson(root, 'tables/public.orders.json', { + table: 'public.orders', + stats: { + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + errorRateBucket: 'none', + p95RuntimeBucket: '<100ms', + recencyBucket: 'current', + }, + columnsByClause: { select: [['status', 'high']] }, + observedJoins: [], + topTemplates: [{ id: 'orders', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }], + }); + await writeJson(root, 'patterns-input.json', { + templates: [ + { + id: 'orders', + canonicalSql: 'select * from public.orders join public.customers on true', + tablesTouched: ['public.orders', 'public.customers'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }); + await writeJson(root, 'patterns-input/part-0001.json', { + templates: [ + { + id: 'orders', + canonicalSql: 'select * from public.orders join public.customers on true', + tablesTouched: ['public.orders', 'public.customers'], + executionsBucket: '10-100', + distinctUsersBucket: 
'2-5', + dialect: 'postgres', + }, + ], + }); +} + +describe('chunkHistoricSqlUnifiedStagedDir', () => { + it('emits one table WorkUnit plus one patterns WorkUnit', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + + const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir); + + expect(result.workUnits).toEqual([ + expect.objectContaining({ + unitKey: 'historic-sql-table-public-orders', + displayLabel: 'Historic SQL usage: public.orders', + rawFiles: ['tables/public.orders.json'], + dependencyPaths: ['manifest.json'], + notes: expect.stringContaining('historic_sql_table_digest'), + }), + expect.objectContaining({ + unitKey: 'historic-sql-patterns-part-0001', + displayLabel: 'Historic SQL cross-table patterns: part-0001', + rawFiles: ['patterns-input/part-0001.json'], + dependencyPaths: ['manifest.json'], + notes: expect.stringContaining('patterns-input/part-0001.json'), + }), + ]); + expect(result.workUnits[0]?.notes).toContain('emit_historic_sql_evidence'); + expect(result.workUnits[1]?.notes).toContain('emit_historic_sql_evidence'); + expect(result.reconcileNotes).toEqual(['Historic-SQL touched tables=1 parseFailures=0']); + }); + + it('respects diff sets for unchanged table and patterns files', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + + await expect( + chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: [], + modified: ['tables/public.orders.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input.json', 'patterns-input/part-0001.json'], + }), + ).resolves.toMatchObject({ + workUnits: [expect.objectContaining({ unitKey: 'historic-sql-table-public-orders' })], + }); + + await expect( + chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: [], + modified: ['patterns-input/part-0001.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'], + }), + ).resolves.toMatchObject({ + workUnits: 
[expect.objectContaining({ unitKey: 'historic-sql-patterns-part-0001' })], + }); + + await expect( + chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: [], + modified: ['patterns-input.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input/part-0001.json', 'tables/public.orders.json'], + }), + ).resolves.toMatchObject({ + workUnits: [], + }); + }); + + it('describes unified staged scope', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + + const scope = await describeHistoricSqlUnifiedScope(stagedDir); + + expect(scope.isPathInScope('manifest.json')).toBe(true); + expect(scope.isPathInScope('patterns-input.json')).toBe(true); + expect(scope.isPathInScope('patterns-input/part-0001.json')).toBe(true); + expect(scope.isPathInScope('patterns-input/part-1.json')).toBe(false); + expect(scope.isPathInScope('tables/public.orders.json')).toBe(true); + expect(scope.isPathInScope('templates/old/page.md')).toBe(false); + }); + + it('emits one patterns WorkUnit per changed shard', async () => { + const stagedDir = await tempDir(); + await writeUnifiedStagedDir(stagedDir); + await writeJson(stagedDir, 'patterns-input/part-0002.json', { + templates: [ + { + id: 'line-items', + canonicalSql: 'select * from public.orders join public.line_items on true', + tablesTouched: ['public.orders', 'public.line_items'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }); + + const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir, { + added: ['patterns-input/part-0002.json'], + modified: ['patterns-input/part-0001.json'], + deleted: [], + unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'], + }); + + expect(result.workUnits.map((unit) => unit.unitKey)).toEqual([ + 'historic-sql-patterns-part-0001', + 'historic-sql-patterns-part-0002', + ]); + expect(result.workUnits.map((unit) => unit.rawFiles)).toEqual([ + ['patterns-input/part-0001.json'], + 
['patterns-input/part-0002.json'], + ]); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts b/packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts new file mode 100644 index 00000000..4e6dfeda --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/chunk-unified.ts @@ -0,0 +1,99 @@ +import { createHash } from 'node:crypto'; +import { readFile, readdir } from 'node:fs/promises'; +import { join, relative } from 'node:path'; +import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js'; +import { isHistoricSqlPatternInputShardPath } from './pattern-inputs.js'; +import { stagedManifestSchema, stagedPatternsInputSchema, stagedTableInputSchema } from './types.js'; + +async function walk(root: string): Promise { + const entries = await readdir(root, { withFileTypes: true, recursive: true }); + return entries + .filter((entry) => entry.isFile()) + .map((entry) => relative(root, join(entry.parentPath, entry.name)).replace(/\\/g, '/')) + .sort(); +} + +async function readJson(stagedDir: string, relPath: string): Promise { + return JSON.parse(await readFile(join(stagedDir, relPath), 'utf-8')) as T; +} + +function safeUnitKey(value: string): string { + return value.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, ''); +} + +function touchedPath(path: string, touched: Set | null): boolean { + return !touched || touched.has(path); +} + +export async function chunkHistoricSqlUnifiedStagedDir(stagedDir: string, diffSet?: DiffSet): Promise { + const files = await walk(stagedDir); + const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); + const touched = diffSet ? 
new Set([...diffSet.added, ...diffSet.modified]) : null; + const workUnits: WorkUnit[] = []; + + for (const path of files.filter((file) => /^tables\/.+\.json$/.test(file))) { + if (!touchedPath(path, touched)) { + continue; + } + const table = stagedTableInputSchema.parse(await readJson(stagedDir, path)); + workUnits.push({ + unitKey: `historic-sql-table-${safeUnitKey(table.table)}`, + displayLabel: `Historic SQL usage: ${table.table}`, + rawFiles: [path], + dependencyPaths: ['manifest.json'], + peerFileIndex: files.filter((file) => file !== path && file !== 'manifest.json').sort(), + notes: + 'Use historic_sql_table_digest. Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence. Do not call wiki_write or sl_write_source.', + }); + } + + for (const path of files.filter(isHistoricSqlPatternInputShardPath)) { + if (!touchedPath(path, touched)) { + continue; + } + stagedPatternsInputSchema.parse(await readJson(stagedDir, path)); + const shardLabel = path.replace(/^patterns-input\//, '').replace(/\.json$/, ''); + workUnits.push({ + unitKey: `historic-sql-patterns-${safeUnitKey(shardLabel)}`, + displayLabel: `Historic SQL cross-table patterns: ${shardLabel}`, + rawFiles: [path], + dependencyPaths: ['manifest.json'], + peerFileIndex: files.filter((file) => file !== path && file !== 'manifest.json').sort(), + notes: + `Use historic_sql_patterns. Read ${path} and emit pattern objects with emit_historic_sql_evidence using rawPath "${path}". Do not call wiki_write or sl_write_source.`, + }); + } + + const deleted = diffSet?.deleted + .filter((path) => isHistoricSqlPatternInputShardPath(path) || /^tables\/.+\.json$/.test(path)) + .sort(); + return { + workUnits, + eviction: deleted && deleted.length > 0 ? 
{ deletedRawPaths: deleted } : undefined, + reconcileNotes: [`Historic-SQL touched tables=${manifest.touchedTableCount} parseFailures=${manifest.parseFailures}`], + contextReport: { + capped: false, + warnings: [...manifest.probeWarnings, ...manifest.warnings], + }, + }; +} + +export async function describeHistoricSqlUnifiedScope(stagedDir: string): Promise { + const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); + const fingerprint = createHash('sha256') + .update(JSON.stringify({ + connectionId: manifest.connectionId, + dialect: manifest.dialect, + windowStart: manifest.windowStart, + windowEnd: manifest.windowEnd, + })) + .digest('hex'); + return { + fingerprint, + isPathInScope: (rawPath) => + rawPath === 'manifest.json' || + rawPath === 'patterns-input.json' || + isHistoricSqlPatternInputShardPath(rawPath) || + /^tables\/.+\.json$/.test(rawPath), + }; +} diff --git a/packages/context/src/ingest/adapters/historic-sql/chunk.test.ts b/packages/context/src/ingest/adapters/historic-sql/chunk.test.ts deleted file mode 100644 index a7941c65..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/chunk.test.ts +++ /dev/null @@ -1,251 +0,0 @@ -import { mkdir, mkdtemp, writeFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; -import { join } from 'node:path'; -import { describe, expect, it } from 'vitest'; -import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js'; - -async function tempDir(): Promise { - return mkdtemp(join(tmpdir(), 'historic-sql-chunk-')); -} - -async function writeJson(root: string, relPath: string, value: unknown): Promise { - const target = join(root, relPath); - await mkdir(join(target, '..'), { recursive: true }); - await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); -} - -async function writeTemplate(root: string): Promise { - await writeJson(root, 'manifest.json', { - source: 'historic-sql', - connectionId: 'conn_1', - dialect: 'snowflake', 
- fetchedAt: '2026-05-04T12:00:00.000Z', - windowStart: '2026-02-03T12:00:00.000Z', - windowEnd: '2026-05-04T12:00:00.000Z', - nextSuccessfulCursor: '2026-05-04T11:55:00.000Z', - templateCount: 1, - capped: false, - warnings: ['source warning'], - templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }], - }); - await writeJson(root, 'templates/fp_1/metadata.json', { - id: 'fp_1', - title: 'snowflake · analytics.orders [fp_1]', - path: 'templates/fp_1/page.md', - objectType: 'historic_sql_template', - lastEditedAt: null, - properties: { - fingerprint: 'fp_1', - sub_cluster_id: null, - dialect: 'snowflake', - tables_touched: ['analytics.orders'], - literal_slots: [{ position: 1, type: 'string', classification: 'constant' }], - triage_signals: { - executions_bucket: 'high', - distinct_users_bucket: 'team', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '1 constant, 0 runtime', - }, - }, - }); - await writeFile(join(root, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8'); - await writeJson(root, 'templates/fp_1/usage.json', { - stats: { - executions: 20, - distinct_users: 3, - first_seen: '2026-05-01T00:00:00.000Z', - last_seen: '2026-05-04T11:55:00.000Z', - p50_runtime_ms: 100, - p95_runtime_ms: 200, - error_rate: 0, - rows_produced: 20, - }, - literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }], - samples: [], - }); -} - -async function writeSubclusterTemplates(root: string): Promise { - await writeJson(root, 'manifest.json', { - source: 'historic-sql', - connectionId: 'conn_1', - dialect: 'snowflake', - fetchedAt: '2026-05-04T12:00:00.000Z', - windowStart: '2026-02-03T12:00:00.000Z', - windowEnd: '2026-05-04T12:00:00.000Z', - nextSuccessfulCursor: '2026-05-04T11:55:00.000Z', - templateCount: 2, - capped: false, - warnings: [], - templates: [ - { - id: 'fp_order_status__cat_2b2ff2318877', - fingerprint: 'fp_order_status', - subClusterId: 
'cat_2b2ff2318877', - path: 'templates/fp_order_status__cat_2b2ff2318877/page.md', - }, - { - id: 'fp_order_status__cat_34f037ddcbfa', - fingerprint: 'fp_order_status', - subClusterId: 'cat_34f037ddcbfa', - path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md', - }, - ], - }); - - for (const template of [ - { id: 'fp_order_status__cat_2b2ff2318877', subClusterId: 'cat_2b2ff2318877' }, - { id: 'fp_order_status__cat_34f037ddcbfa', subClusterId: 'cat_34f037ddcbfa' }, - ]) { - await writeJson(root, `templates/${template.id}/metadata.json`, { - id: template.id, - title: `snowflake · analytics.orders [fp_ord:${template.subClusterId.slice(-6)}]`, - path: `templates/${template.id}/page.md`, - objectType: 'historic_sql_template', - lastEditedAt: null, - properties: { - fingerprint: 'fp_order_status', - sub_cluster_id: template.subClusterId, - dialect: 'snowflake', - tables_touched: ['analytics.orders'], - literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }], - triage_signals: { - executions_bucket: 'mid', - distinct_users_bucket: 'team', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '0 constant, 0 runtime', - }, - }, - }); - await writeFile(join(root, `templates/${template.id}/page.md`), `# ${template.id}\n`, 'utf-8'); - await writeJson(root, `templates/${template.id}/usage.json`, { - stats: { - executions: 3, - distinct_users: 3, - first_seen: '2026-05-04T10:00:00.000Z', - last_seen: '2026-05-04T10:05:00.000Z', - p50_runtime_ms: 120, - p95_runtime_ms: 150, - error_rate: 0, - rows_produced: 36, - }, - literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }], - samples: [], - }); - } -} - -describe('chunkHistoricSqlStagedDir', () => { - it('emits one WorkUnit per changed template and keeps usage as dependency', async () => { - const stagedDir = await tempDir(); - await writeTemplate(stagedDir); - - const result = await chunkHistoricSqlStagedDir(stagedDir, { - 
added: ['templates/fp_1/metadata.json'], - modified: [], - deleted: [], - unchanged: ['templates/fp_1/page.md', 'templates/fp_1/usage.json', 'manifest.json'], - }); - - expect(result.workUnits).toEqual([ - { - unitKey: 'historic-sql-fp-1', - displayLabel: 'snowflake · analytics.orders [fp_1]', - rawFiles: ['templates/fp_1/metadata.json'], - dependencyPaths: ['manifest.json', 'templates/fp_1/usage.json'], - peerFileIndex: ['templates/fp_1/page.md'], - notes: - 'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.', - }, - ]); - expect(result.contextReport).toEqual({ capped: false, warnings: ['source warning'] }); - }); - - it('emits one WorkUnit per changed categorical sub-cluster', async () => { - const stagedDir = await tempDir(); - await writeSubclusterTemplates(stagedDir); - - const result = await chunkHistoricSqlStagedDir(stagedDir, { - added: [ - 'templates/fp_order_status__cat_2b2ff2318877/metadata.json', - 'templates/fp_order_status__cat_34f037ddcbfa/metadata.json', - ], - modified: [], - deleted: [], - unchanged: [ - 'manifest.json', - 'templates/fp_order_status__cat_2b2ff2318877/page.md', - 'templates/fp_order_status__cat_2b2ff2318877/usage.json', - 'templates/fp_order_status__cat_34f037ddcbfa/page.md', - 'templates/fp_order_status__cat_34f037ddcbfa/usage.json', - ], - }); - - expect( - result.workUnits.map((unit) => ({ - unitKey: unit.unitKey, - displayLabel: unit.displayLabel, - rawFiles: unit.rawFiles, - dependencyPaths: unit.dependencyPaths, - })), - ).toEqual([ - { - unitKey: 'historic-sql-fp-order-status-cat-2b2ff2318877', - displayLabel: 'snowflake · analytics.orders [fp_ord:318877]', - rawFiles: ['templates/fp_order_status__cat_2b2ff2318877/metadata.json'], - dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_2b2ff2318877/usage.json'], - }, - { - unitKey: 'historic-sql-fp-order-status-cat-34f037ddcbfa', 
- displayLabel: 'snowflake · analytics.orders [fp_ord:ddcbfa]', - rawFiles: ['templates/fp_order_status__cat_34f037ddcbfa/metadata.json'], - dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'], - }, - ]); - }); - - it('emits zero WorkUnits for usage-only diffs', async () => { - const stagedDir = await tempDir(); - await writeTemplate(stagedDir); - - const result = await chunkHistoricSqlStagedDir(stagedDir, { - added: [], - modified: ['templates/fp_1/usage.json'], - deleted: [], - unchanged: ['templates/fp_1/metadata.json', 'templates/fp_1/page.md', 'manifest.json'], - }); - - expect(result.workUnits).toEqual([]); - expect(result.eviction).toBeUndefined(); - }); - - it('emits eviction only for deleted metadata or page files', async () => { - const stagedDir = await tempDir(); - await writeTemplate(stagedDir); - - const result = await chunkHistoricSqlStagedDir(stagedDir, { - added: [], - modified: [], - deleted: ['templates/fp_1/usage.json', 'templates/fp_2/page.md'], - unchanged: [], - }); - - expect(result.eviction).toEqual({ deletedRawPaths: ['templates/fp_2/page.md'] }); - }); - - it('describes historic-sql scope without including unrelated paths', async () => { - const stagedDir = await tempDir(); - await writeTemplate(stagedDir); - - const scope = await describeHistoricSqlScope(stagedDir); - - expect(scope.fingerprint).toHaveLength(64); - expect(scope.isPathInScope('manifest.json')).toBe(true); - expect(scope.isPathInScope('templates/fp_1/usage.json')).toBe(true); - expect(scope.isPathInScope('pages/notion/page.md')).toBe(false); - }); -}); diff --git a/packages/context/src/ingest/adapters/historic-sql/chunk.ts b/packages/context/src/ingest/adapters/historic-sql/chunk.ts deleted file mode 100644 index 5d959bc0..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/chunk.ts +++ /dev/null @@ -1,86 +0,0 @@ -import { createHash } from 'node:crypto'; -import { readFile, readdir } from 'node:fs/promises'; -import { 
join, relative } from 'node:path'; -import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js'; -import { historicSqlManifestSchema, historicSqlMetadataSchema } from './types.js'; - -async function walk(root: string): Promise { - const entries = await readdir(root, { withFileTypes: true, recursive: true }); - return entries - .filter((entry) => entry.isFile()) - .map((entry) => relative(root, join(entry.parentPath, entry.name)).replace(/\\/g, '/')) - .sort(); -} - -function safeUnitKey(id: string): string { - return `historic-sql-${id.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '')}`; -} - -async function readManifest(stagedDir: string) { - try { - return historicSqlManifestSchema.parse(JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8'))); - } catch (error) { - throw new Error(`Invalid historic-SQL manifest: ${error instanceof Error ? error.message : String(error)}`); - } -} - -export async function chunkHistoricSqlStagedDir(stagedDir: string, diffSet?: DiffSet): Promise { - const files = await walk(stagedDir); - const manifest = await readManifest(stagedDir); - const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null; - const workUnits: WorkUnit[] = []; - - for (const pagePath of files.filter((path) => /^templates\/[^/]+\/page\.md$/.test(path))) { - const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json'); - const usagePath = pagePath.replace(/\/page\.md$/, '/usage.json'); - const primary = [metadataPath, pagePath].filter((path) => files.includes(path)); - if (touched && !primary.some((path) => touched.has(path))) { - continue; - } - - const metadata = historicSqlMetadataSchema.parse(JSON.parse(await readFile(join(stagedDir, metadataPath), 'utf-8'))); - const rawFiles = touched ? primary.filter((path) => touched.has(path)).sort() : primary.sort(); - const dependencyPaths = ['manifest.json', files.includes(usagePath) ? 
usagePath : null] - .filter((path): path is string => typeof path === 'string' && !rawFiles.includes(path)) - .sort(); - const excluded = new Set([...rawFiles, ...dependencyPaths]); - const peerFileIndex = files.filter((path) => !excluded.has(path)).sort(); - - workUnits.push({ - unitKey: safeUnitKey(metadata.id), - displayLabel: metadata.title, - rawFiles, - dependencyPaths, - peerFileIndex, - notes: - 'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.', - }); - } - - const deletedPrimary = diffSet?.deleted.filter((path) => /^templates\/[^/]+\/(metadata\.json|page\.md)$/.test(path)); - - return { - workUnits, - eviction: deletedPrimary && deletedPrimary.length > 0 ? { deletedRawPaths: deletedPrimary.sort() } : undefined, - reconcileNotes: [`Historic-SQL staged templates=${manifest.templateCount}`], - contextReport: { - capped: manifest.capped, - warnings: manifest.warnings, - }, - }; -} - -export async function describeHistoricSqlScope(stagedDir: string): Promise { - const manifest = await readManifest(stagedDir); - const scopeKey = JSON.stringify({ - connectionId: manifest.connectionId, - dialect: manifest.dialect, - windowStart: manifest.windowStart, - windowEnd: manifest.windowEnd, - }); - const fingerprint = createHash('sha256').update(scopeKey).digest('hex'); - return { - fingerprint, - isPathInScope: (rawPath) => rawPath === 'manifest.json' || rawPath.startsWith('templates/'), - }; -} diff --git a/packages/context/src/ingest/adapters/historic-sql/detect.test.ts b/packages/context/src/ingest/adapters/historic-sql/detect.test.ts index c4240192..9ad3cf39 100644 --- a/packages/context/src/ingest/adapters/historic-sql/detect.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/detect.test.ts @@ -3,13 +3,7 @@ import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { describe, expect, it } from 
'vitest'; import { detectHistoricSqlStagedDir } from './detect.js'; -import { - HISTORIC_SQL_SOURCE_KEY, - historicSqlManifestSchema, - historicSqlMetadataSchema, - historicSqlPullConfigSchema, - historicSqlUsageSchema, -} from './types.js'; +import { HISTORIC_SQL_SOURCE_KEY, stagedManifestSchema } from './types.js'; async function tempDir(): Promise { return mkdtemp(join(tmpdir(), 'historic-sql-detect-')); @@ -21,32 +15,35 @@ async function writeJson(root: string, relPath: string, value: unknown): Promise await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); } +function manifest() { + return stagedManifestSchema.parse({ + source: HISTORIC_SQL_SOURCE_KEY, + connectionId: 'conn_1', + dialect: 'postgres', + fetchedAt: '2026-05-04T12:00:00.000Z', + windowStart: '2026-02-03T12:00:00.000Z', + windowEnd: '2026-05-04T12:00:00.000Z', + snapshotRowCount: 0, + touchedTableCount: 0, + parseFailures: 0, + warnings: [], + probeWarnings: [], + }); +} + describe('historic-sql staged dir detection', () => { it('detects manifest source', async () => { const stagedDir = await tempDir(); - await writeJson(stagedDir, 'manifest.json', { - source: HISTORIC_SQL_SOURCE_KEY, - connectionId: 'conn_1', - dialect: 'snowflake', - fetchedAt: '2026-05-04T12:00:00.000Z', - windowStart: '2026-02-03T12:00:00.000Z', - windowEnd: '2026-05-04T12:00:00.000Z', - nextSuccessfulCursor: '2026-05-04T11:55:00.000Z', - templateCount: 0, - capped: false, - warnings: [], - templates: [], - }); + await writeJson(stagedDir, 'manifest.json', manifest()); await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true); }); - it('detects document-shaped template structure without manifest', async () => { + it('detects unified table and patterns structure without manifest', async () => { const stagedDir = await tempDir(); await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8'); - await mkdir(join(stagedDir, 'templates', 'fp_1'), { recursive: true }); - await 
writeFile(join(stagedDir, 'templates', 'fp_1', 'metadata.json'), '{}', 'utf-8'); - await writeFile(join(stagedDir, 'templates', 'fp_1', 'page.md'), '# fp_1\n', 'utf-8'); + await writeJson(stagedDir, 'patterns-input.json', { templates: [] }); + await writeJson(stagedDir, 'tables/public.orders.json', { table: 'public.orders' }); await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true); }); @@ -58,140 +55,3 @@ describe('historic-sql staged dir detection', () => { await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false); }); }); - -describe('historic-sql schemas', () => { - it('defaults disabled optional pull-config fields through the parser', () => { - expect( - historicSqlPullConfigSchema.parse({ - dialect: 'bigquery', - }), - ).toEqual({ - dialect: 'bigquery', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 5, - }); - }); - - it('accepts postgres pull config with a minCalls floor', () => { - expect( - historicSqlPullConfigSchema.parse({ - dialect: 'postgres', - minCalls: 12, - }), - ).toEqual({ - dialect: 'postgres', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 12, - }); - }); - - it('accepts postgres manifest fields with defaults for older dialects', () => { - expect( - historicSqlManifestSchema.parse({ - source: HISTORIC_SQL_SOURCE_KEY, - connectionId: 'conn_pg', - dialect: 'postgres', - fetchedAt: '2026-05-08T12:00:00.000Z', - windowStart: '2026-05-08T11:00:00.000Z', - windowEnd: '2026-05-08T12:00:00.000Z', - nextSuccessfulCursor: '2026-05-08T12:00:00.000Z', - templateCount: 0, - capped: false, - warnings: [], - templates: [], - degraded: true, - statsResetAt: '2026-05-01T00:00:00.000Z', - baselineFirstRun: true, - pgServerVersion: 'PostgreSQL 16.4', - deallocCount: 3, - }), - ).toMatchObject({ - dialect: 'postgres', - degraded: true, 
- statsResetAt: '2026-05-01T00:00:00.000Z', - baselineFirstRun: true, - pgServerVersion: 'PostgreSQL 16.4', - deallocCount: 3, - }); - - expect( - historicSqlManifestSchema.parse({ - source: HISTORIC_SQL_SOURCE_KEY, - connectionId: 'conn_sf', - dialect: 'snowflake', - fetchedAt: '2026-05-08T12:00:00.000Z', - windowStart: '2026-05-01T12:00:00.000Z', - windowEnd: '2026-05-08T12:00:00.000Z', - nextSuccessfulCursor: null, - templateCount: 0, - capped: false, - warnings: [], - templates: [], - }), - ).toMatchObject({ - degraded: false, - statsResetAt: null, - baselineFirstRun: false, - pgServerVersion: null, - deallocCount: null, - }); - }); - - it('accepts postgres usage stats with mean_runtime_ms and empty samples', () => { - const parsed = historicSqlUsageSchema.parse({ - stats: { - executions: 25, - distinct_users: 2, - first_seen: '2026-05-08T10:00:00.000Z', - last_seen: '2026-05-08T12:00:00.000Z', - p50_runtime_ms: null, - p95_runtime_ms: null, - mean_runtime_ms: 32.5, - error_rate: 0, - rows_produced: 1042, - }, - literal_slots: [], - samples: [], - }); - - expect(parsed.stats.mean_runtime_ms).toBe(32.5); - expect(parsed.samples).toEqual([]); - }); - - it('pins the Notion-compatible metadata envelope', () => { - const parsed = historicSqlMetadataSchema.parse({ - id: 'fp_1', - title: 'snowflake · analytics.orders [fp_1]', - path: 'templates/fp_1/page.md', - objectType: 'historic_sql_template', - lastEditedAt: null, - properties: { - fingerprint: 'fp_1', - sub_cluster_id: null, - dialect: 'snowflake', - tables_touched: ['analytics.orders'], - literal_slots: [{ position: 1, type: 'string', classification: 'constant' }], - triage_signals: { - executions_bucket: 'high', - distinct_users_bucket: 'team', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '1 constant, 0 runtime', - }, - }, - }); - - expect(parsed.objectType).toBe('historic_sql_template'); - expect(parsed.lastEditedAt).toBeNull(); - 
expect(parsed.properties.triage_signals.service_account_only).toBe('false'); - }); -}); diff --git a/packages/context/src/ingest/adapters/historic-sql/detect.ts b/packages/context/src/ingest/adapters/historic-sql/detect.ts index d0a1652b..103a0d6a 100644 --- a/packages/context/src/ingest/adapters/historic-sql/detect.ts +++ b/packages/context/src/ingest/adapters/historic-sql/detect.ts @@ -16,21 +16,9 @@ export async function detectHistoricSqlStagedDir(stagedDir: string): Promise(); - const pageDirs = new Set(); - for (const entry of entries) { - if (!entry.isFile()) { - continue; - } - if (entry.name === 'metadata.json') { - metadataDirs.add(entry.parentPath); - } - if (entry.name === 'page.md') { - pageDirs.add(entry.parentPath); - } - } - return [...metadataDirs].some((dir) => pageDirs.has(dir)); + await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8'); + const entries = await readdir(join(stagedDir, 'tables'), { withFileTypes: true }); + return entries.some((entry) => entry.isFile() && entry.name.endsWith('.json')); } catch { return false; } diff --git a/packages/context/src/ingest/adapters/historic-sql/evidence-tool.test.ts b/packages/context/src/ingest/adapters/historic-sql/evidence-tool.test.ts new file mode 100644 index 00000000..ae16d105 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/evidence-tool.test.ts @@ -0,0 +1,89 @@ +import { describe, expect, it, vi } from 'vitest'; +import { asSchema } from 'ai'; +import { createEmitHistoricSqlEvidenceTool } from './evidence-tool.js'; + +describe('emit_historic_sql_evidence tool', () => { + it('exposes an AI SDK v6 tool input schema with top-level object type', async () => { + const tool = createEmitHistoricSqlEvidenceTool(); + + expect(await asSchema(tool.inputSchema).jsonSchema).toMatchObject({ + type: 'object', + }); + }); + + it('writes table usage evidence to the ignored run evidence directory', async () => { + const writeFile = vi.fn(async () => ({ success: true, commitHash: null 
})); + const tool = createEmitHistoricSqlEvidenceTool(); + + const result = await tool.execute!( + { + kind: 'table_usage', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried by paid status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [], + staleSince: null, + }, + }, + { + toolCallId: 'call-1', + messages: [], + abortSignal: new AbortController().signal, + experimental_context: { + connectionId: 'warehouse', + session: { + ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'historic-sql' }, + configService: { writeFile }, + }, + }, + } as never, + ); + + expect(result).toBe('Recorded historic-SQL table_usage evidence for public.orders.'); + expect(writeFile).toHaveBeenCalledWith( + '.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json', + expect.stringContaining('"kind": "table_usage"'), + 'System User', + 'system@example.com', + 'Record historic-SQL evidence: historic-sql-table-public-orders', + { skipLock: true }, + ); + }); + + it('rejects non-historic ingest sessions', async () => { + const tool = createEmitHistoricSqlEvidenceTool(); + + await expect( + tool.execute!( + { + kind: 'pattern', + rawPath: 'patterns-input.json', + pattern: { + slug: 'orders', + title: 'Orders', + narrative: 'Orders pattern.', + definitionSql: 'select * from public.orders', + tablesInvolved: ['public.orders'], + slRefs: ['orders'], + constituentTemplateIds: ['pg:1'], + }, + }, + { + toolCallId: 'call-1', + messages: [], + abortSignal: new AbortController().signal, + experimental_context: { + connectionId: 'warehouse', + session: { + ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'notion' }, + configService: { writeFile: vi.fn() }, + }, + }, + } as never, + ), + ).resolves.toContain('Error: emit_historic_sql_evidence is only available during historic-sql ingest'); + }); +}); diff --git 
a/packages/context/src/ingest/adapters/historic-sql/evidence-tool.ts b/packages/context/src/ingest/adapters/historic-sql/evidence-tool.ts new file mode 100644 index 00000000..29d66cb2 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/evidence-tool.ts @@ -0,0 +1,121 @@ +import { tool } from 'ai'; +import { z } from 'zod'; +import { historicSqlEvidencePath, serializeHistoricSqlEvidence } from './evidence.js'; +import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js'; + +const SYSTEM_AUTHOR = 'System User'; +const SYSTEM_EMAIL = 'system@example.com'; + +const emitHistoricSqlEvidenceInputSchema = z + .object({ + kind: z.enum(['table_usage', 'pattern']), + table: z.string().min(1).optional(), + rawPath: z.string().min(1), + usage: tableUsageOutputSchema.optional(), + pattern: patternOutputSchema.optional(), + }) + .superRefine((input, ctx) => { + if (input.kind === 'table_usage') { + if (!input.table) { + ctx.addIssue({ + code: 'custom', + path: ['table'], + message: 'table is required when kind is table_usage', + }); + } + if (!input.usage) { + ctx.addIssue({ + code: 'custom', + path: ['usage'], + message: 'usage is required when kind is table_usage', + }); + } + } + if (input.kind === 'pattern' && !input.pattern) { + ctx.addIssue({ + code: 'custom', + path: ['pattern'], + message: 'pattern is required when kind is pattern', + }); + } + }); + +type EmitHistoricSqlEvidenceInput = z.infer; + +interface EmitHistoricSqlEvidenceToolContext { + connectionId?: string | null; + session?: { + ingest?: { runId: string; sourceKey: string }; + configService?: { + writeFile( + path: string, + content: string, + author: string, + authorEmail: string, + commitMessage: string, + options?: { skipLock?: boolean }, + ): Promise; + }; + }; +} + +function unitKeyForEvidence(input: EmitHistoricSqlEvidenceInput): string { + if (input.kind === 'table_usage') { + return `historic-sql-table-${String(input.table).replace(/[^a-zA-Z0-9]+/g, 
'-').replace(/^-+|-+$/g, '')}`; + } + return `historic-sql-pattern-${String(input.pattern?.slug).replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '')}`; +} + +function evidenceEnvelope(input: EmitHistoricSqlEvidenceInput, connectionId: string) { + if (input.kind === 'table_usage') { + if (!input.table || !input.usage) { + throw new Error('Invalid historic-SQL table usage evidence input.'); + } + return { + kind: 'table_usage' as const, + connectionId, + table: input.table, + rawPath: input.rawPath, + usage: input.usage, + }; + } + if (!input.pattern) { + throw new Error('Invalid historic-SQL pattern evidence input.'); + } + return { + kind: 'pattern' as const, + connectionId, + rawPath: input.rawPath, + pattern: input.pattern, + }; +} + +export function createEmitHistoricSqlEvidenceTool(defaultContext?: EmitHistoricSqlEvidenceToolContext) { + return tool({ + description: + 'Record typed historic-SQL evidence for deterministic projection. Use this instead of wiki_write, sl_write_source, sl_edit_source, or context_candidate_write during historic-SQL WorkUnits.', + inputSchema: emitHistoricSqlEvidenceInputSchema, + execute: async (input, options): Promise => { + const context = (options.experimental_context as EmitHistoricSqlEvidenceToolContext | undefined) ?? 
defaultContext; + const ingest = context?.session?.ingest; + const configService = context?.session?.configService; + if (!ingest || ingest.sourceKey !== 'historic-sql' || !configService || !context?.connectionId) { + return 'Error: emit_historic_sql_evidence is only available during historic-sql ingest.'; + } + + const unitKey = unitKeyForEvidence(input); + const evidence = evidenceEnvelope(input, context.connectionId); + const content = serializeHistoricSqlEvidence(evidence); + await configService.writeFile( + historicSqlEvidencePath(ingest.runId, unitKey), + content, + SYSTEM_AUTHOR, + SYSTEM_EMAIL, + `Record historic-SQL evidence: ${unitKey}`, + { skipLock: true }, + ); + const label = evidence.kind === 'table_usage' ? evidence.table : evidence.pattern.slug; + return `Recorded historic-SQL ${input.kind} evidence for ${label}.`; + }, + }); +} diff --git a/packages/context/src/ingest/adapters/historic-sql/evidence.test.ts b/packages/context/src/ingest/adapters/historic-sql/evidence.test.ts new file mode 100644 index 00000000..8858ed37 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/evidence.test.ts @@ -0,0 +1,57 @@ +import { describe, expect, it } from 'vitest'; +import { + historicSqlEvidenceEnvelopeSchema, + historicSqlEvidencePath, + historicSqlPatternEvidenceSchema, + historicSqlTableUsageEvidenceSchema, +} from './evidence.js'; + +describe('historic-sql evidence contracts', () => { + it('validates table usage evidence emitted by table digest WorkUnits', () => { + const parsed = historicSqlTableUsageEvidenceSchema.parse({ + kind: 'table_usage', + connectionId: 'warehouse', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried for paid/refunded lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + }, + }); + + 
expect(parsed.table).toBe('public.orders'); + expect(parsed.usage.frequencyTier).toBe('high'); + }); + + it('validates pattern evidence emitted by the patterns WorkUnit', () => { + const parsed = historicSqlPatternEvidenceSchema.parse( + historicSqlEvidenceEnvelopeSchema.parse({ + kind: 'pattern', + connectionId: 'warehouse', + rawPath: 'patterns-input.json', + pattern: { + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Analysts compare order status changes by customer segment.', + definitionSql: 'select status, count(*) from public.orders group by status', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:1', 'pg:2'], + }, + }), + ); + + expect(parsed.kind).toBe('pattern'); + expect(parsed.pattern.slug).toBe('order-lifecycle-analysis'); + }); + + it('builds a stable ignored evidence path from run and WorkUnit identity', () => { + expect(historicSqlEvidencePath('run-1', 'historic-sql-table-public-orders')).toBe( + '.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json', + ); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/evidence.ts b/packages/context/src/ingest/adapters/historic-sql/evidence.ts new file mode 100644 index 00000000..18c01a85 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/evidence.ts @@ -0,0 +1,41 @@ +import { z } from 'zod'; +import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js'; + +function safeEvidenceSegment(value: string): string { + const segment = value.replace(/[^a-zA-Z0-9._-]+/g, '-').replace(/^-+|-+$/g, ''); + if (!segment) { + throw new Error(`Invalid historic-SQL evidence path segment: ${value}`); + } + return segment; +} + +export const historicSqlTableUsageEvidenceSchema = z.object({ + kind: z.literal('table_usage'), + connectionId: z.string().min(1), + table: z.string().min(1), + rawPath: z.string().min(1), + usage: 
tableUsageOutputSchema, +}); +export type HistoricSqlTableUsageEvidence = z.infer; + +export const historicSqlPatternEvidenceSchema = z.object({ + kind: z.literal('pattern'), + connectionId: z.string().min(1), + rawPath: z.string().min(1), + pattern: patternOutputSchema, +}); +export type HistoricSqlPatternEvidence = z.infer; + +export const historicSqlEvidenceEnvelopeSchema = z.discriminatedUnion('kind', [ + historicSqlTableUsageEvidenceSchema, + historicSqlPatternEvidenceSchema, +]); +export type HistoricSqlEvidenceEnvelope = z.infer; + +export function historicSqlEvidencePath(runId: string, unitKey: string): string { + return `.ktx/ingest-evidence/historic-sql/${safeEvidenceSegment(runId)}/${safeEvidenceSegment(unitKey)}.json`; +} + +export function serializeHistoricSqlEvidence(evidence: HistoricSqlEvidenceEnvelope): string { + return `${JSON.stringify(historicSqlEvidenceEnvelopeSchema.parse(evidence), null, 2)}\n`; +} diff --git a/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts b/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts index 40926965..c2c679e5 100644 --- a/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts @@ -1,48 +1,30 @@ -import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { mkdtemp } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; -import { describe, expect, it, vi } from 'vitest'; +import { describe, expect, it } from 'vitest'; import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; +import type { SourceAdapter } from '../../types.js'; import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js'; -import { pgssBaselinePath } from './stage-pgss.js'; -import type { HistoricSqlQueryHistoryReader, PostgresPgssReader } from './types.js'; +import type { HistoricSqlReader } from 
'./types.js'; async function tempDir(): Promise { return mkdtemp(join(tmpdir(), 'historic-sql-adapter-')); } -async function writeJson(root: string, relPath: string, value: unknown): Promise { - const target = join(root, relPath); - await mkdir(join(target, '..'), { recursive: true }); - await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); -} - const sqlAnalysis: SqlAnalysisPort = { async analyzeForFingerprint() { - return { - fingerprint: 'fp_1', - normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?', - tablesTouched: ['analytics.orders'], - literalSlots: [{ position: 1, type: 'string', exampleValue: 'paid' }], - }; + throw new Error('legacy analyzeForFingerprint must not be used'); + }, + async analyzeBatch() { + return new Map(); }, }; -const reader: HistoricSqlQueryHistoryReader = { - async probe() {}, - async *fetch() { - yield { - id: 'q1', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'", - user: 'analyst', - startedAt: '2026-05-04T11:00:00.000Z', - endedAt: null, - runtimeMs: 10, - rowsProduced: 1, - success: true, - errorMessage: null, - }; +const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; }, + async *fetchAggregated() {}, }; describe('HistoricSqlSourceAdapter', () => { @@ -50,255 +32,73 @@ describe('HistoricSqlSourceAdapter', () => { const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} }); expect(adapter.source).toBe('historic-sql'); - expect(adapter.skillNames).toEqual(['historic_sql_ingest']); - expect(adapter.reconcileSkillNames).toEqual(['historic_sql_curator']); - expect(adapter.evidenceIndexing).toBe('documents'); - expect(adapter.triageSupported).toBe(true); + expect(adapter.skillNames).toEqual(['historic_sql_table_digest', 'historic_sql_patterns']); + expect(adapter.reconcileSkillNames).toEqual([]); + expect((adapter as SourceAdapter).evidenceIndexing).toBeUndefined(); + expect(adapter.triageSupported).toBe(false); 
}); - it('fetches staged templates through injected reader and SqlAnalysisPort', async () => { + it('fetches a unified aggregate snapshot and emits unified WorkUnits', async () => { const stagedDir = await tempDir(); - const adapter = new HistoricSqlSourceAdapter({ - sqlAnalysis, - reader, - queryClient: {}, - now: () => new Date('2026-05-04T12:00:00.000Z'), - }); - - await adapter.fetch( - { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - }, - stagedDir, - { connectionId: 'conn_1', sourceKey: 'historic-sql' }, - ); - - await expect(adapter.detect(stagedDir)).resolves.toBe(true); - }); - - it('reads triage signals from usage.json and metadata properties', async () => { - const stagedDir = await tempDir(); - await writeJson(stagedDir, 'manifest.json', { - source: 'historic-sql', - connectionId: 'conn_1', - dialect: 'snowflake', - fetchedAt: '2026-05-04T12:00:00.000Z', - windowStart: '2026-02-03T12:00:00.000Z', - windowEnd: '2026-05-04T12:00:00.000Z', - nextSuccessfulCursor: '2026-05-04T11:55:00.000Z', - templateCount: 1, - capped: false, - warnings: [], - templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }], - }); - await writeJson(stagedDir, 'templates/fp_1/metadata.json', { - id: 'fp_1', - title: 'snowflake · analytics.orders [fp_1]', - path: 'templates/fp_1/page.md', - objectType: 'historic_sql_template', - lastEditedAt: null, - properties: { - fingerprint: 'fp_1', - sub_cluster_id: null, - dialect: 'snowflake', - tables_touched: ['analytics.orders'], - literal_slots: [{ position: 1, type: 'string', classification: 'constant' }], - triage_signals: { - executions_bucket: 'high', - distinct_users_bucket: 'team', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '1 constant, 0 runtime', - }, - }, - }); - await writeFile(join(stagedDir, 
'templates/fp_1/page.md'), '# fp_1\n', 'utf-8'); - await writeJson(stagedDir, 'templates/fp_1/usage.json', { - stats: { - executions: 20, - distinct_users: 3, - first_seen: '2026-05-01T00:00:00.000Z', - last_seen: '2026-05-04T11:55:00.000Z', - p50_runtime_ms: 100, - p95_runtime_ms: 200, - error_rate: 0, - }, - literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }], - samples: [], - }); - - const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} }); - - await expect(adapter.getTriageSignals(stagedDir, 'fp_1')).resolves.toEqual({ - objectType: 'historic_sql_template', - lastEditedAt: '2026-05-04T11:55:00.000Z', - propertyHints: { - executions_bucket: 'high', - distinct_users_bucket: 'team', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '1 constant, 0 runtime', - }, - }); - }); - - it('dispatches postgres fetches through PGSS staging and writes the baseline only after pull success', async () => { - const stagedDir = await tempDir(); - const baselineRootDir = await tempDir(); - const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg'); - const unusedPerExecutionReader: HistoricSqlQueryHistoryReader = { + const aggregateReader: HistoricSqlReader = { async probe() { - throw new Error('per-execution reader must not be used for postgres'); + return { warnings: [], info: [] }; }, - async *fetch() { - throw new Error('per-execution reader must not be used for postgres'); - }, - }; - const postgresReader: PostgresPgssReader = { - async probe() { - return { pgServerVersion: 'PostgreSQL 16.4', warnings: [] }; - }, - async readSnapshot() { - return { - statsResetAt: '2026-05-08T08:00:00.000Z', - deallocCount: 0, - rows: [ - { - queryid: '901', - userid: '11', - username: 'analyst', - dbid: '5', - database: 'warehouse', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 9, - totalExecTime: 90, - meanExecTime: 10, - totalRows: 18, - }, - 
], + async *fetchAggregated() { + yield { + templateId: 'pg:1', + canonicalSql: + 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status', + dialect: 'postgres', + stats: { + executions: 25, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 10, + p95RuntimeMs: 20, + errorRate: 0, + rowsProduced: 10, + }, + topUsers: [{ user: 'analyst', executions: 25 }], }; }, }; + const batchSqlAnalysis: SqlAnalysisPort = { + async analyzeForFingerprint() { + throw new Error('legacy analyzeForFingerprint must not be used'); + }, + async analyzeBatch() { + return new Map([ + [ + 'pg:1', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { select: ['status'], join: ['customer_id', 'id'], groupBy: ['status'] }, + }, + ], + ]); + }, + }; const adapter = new HistoricSqlSourceAdapter({ - sqlAnalysis, - reader: unusedPerExecutionReader, + sqlAnalysis: batchSqlAnalysis, + reader: aggregateReader, queryClient: {}, - postgresReader, - postgresQueryClient: { - async executeQuery() { - return { headers: [], rows: [] }; - }, - }, - postgresBaselineRootDir: baselineRootDir, - now: () => new Date('2026-05-08T12:00:00.000Z'), + now: () => new Date('2026-05-11T00:00:00.000Z'), }); - await adapter.fetch( - { - dialect: 'postgres', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - stagedDir, - { connectionId: 'conn_pg', sourceKey: 'historic-sql' }, - ); - - const manifest = JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')) as { - dialect: string; - baselineFirstRun: boolean; - templates: Array<{ id: string }>; - }; - expect(manifest.dialect).toBe('postgres'); - expect(manifest.baselineFirstRun).toBe(true); - expect(manifest.templates).toEqual([ - { id: 'db5_q901', fingerprint: 'fp_1', subClusterId: null, path: 
'templates/db5_q901/page.md' }, - ]); - await expect(readFile(baselinePath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' }); - - await adapter.onPullSucceeded({ - connectionId: 'conn_pg', + await adapter.fetch({ dialect: 'postgres', minExecutions: 5 }, stagedDir, { + connectionId: 'warehouse', sourceKey: 'historic-sql', - syncId: 'sync_pg', - trigger: 'scheduled_pull', - completedAt: new Date('2026-05-08T12:01:00.000Z'), - stagedDir, }); - const baseline = JSON.parse(await readFile(baselinePath, 'utf-8')) as { - fetchedAt: string; - templates: Record }>; - }; - expect(baseline.fetchedAt).toBe('2026-05-08T12:00:00.000Z'); - expect(baseline.templates.db5_q901.perUser['11'].calls).toBe(9); - }); - - it('fails postgres fetches clearly when no PGSS reader is configured', async () => { - const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} }); - - await expect( - adapter.fetch( - { - dialect: 'postgres', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - await tempDir(), - { connectionId: 'conn_pg', sourceKey: 'historic-sql' }, - ), - ).rejects.toThrow('Historic SQL Postgres fetch requires deps.postgresReader'); - }); - - it('forwards manifest cursor through onPullSucceeded without changing the SourceAdapter signature', async () => { - const stagedDir = await tempDir(); - await writeJson(stagedDir, 'manifest.json', { - source: 'historic-sql', - connectionId: 'conn_1', - dialect: 'snowflake', - fetchedAt: '2026-05-04T12:00:00.000Z', - windowStart: '2026-02-03T12:00:00.000Z', - windowEnd: '2026-05-04T12:00:00.000Z', - nextSuccessfulCursor: '2026-05-04T11:55:00.000Z', - templateCount: 0, - capped: false, - warnings: [], - templates: [], - }); - const onPullSucceeded = vi.fn(async () => {}); - const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {}, onPullSucceeded }); - const completedAt = new 
Date('2026-05-04T12:01:00.000Z'); - - await adapter.onPullSucceeded({ - connectionId: 'conn_1', - sourceKey: 'historic-sql', - syncId: 'sync_1', - trigger: 'scheduled_pull', - completedAt, - stagedDir, - }); - - expect(onPullSucceeded).toHaveBeenCalledWith({ - connectionId: 'conn_1', - sourceKey: 'historic-sql', - syncId: 'sync_1', - trigger: 'scheduled_pull', - completedAt, - stagedDir, - nextSuccessfulCursor: '2026-05-04T11:55:00.000Z', + await expect(adapter.detect(stagedDir)).resolves.toBe(true); + await expect(adapter.chunk(stagedDir)).resolves.toMatchObject({ + workUnits: [ + { unitKey: 'historic-sql-table-public-customers' }, + { unitKey: 'historic-sql-table-public-orders' }, + { unitKey: 'historic-sql-patterns-part-0001' }, + ], }); }); }); diff --git a/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts b/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts index e66b1cd1..aee051e7 100644 --- a/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts +++ b/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.ts @@ -1,39 +1,16 @@ -import { readFile } from 'node:fs/promises'; +import { rm } from 'node:fs/promises'; import { join } from 'node:path'; -import type { - ChunkResult, - DiffSet, - FetchContext, - IngestTrigger, - ScopeDescriptor, - SourceAdapter, - TriageSignals, -} from '../../types.js'; -import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js'; +import type { ChunkResult, DiffSet, FetchContext, ScopeDescriptor, SourceAdapter } from '../../types.js'; +import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js'; import { detectHistoricSqlStagedDir } from './detect.js'; -import { stageHistoricSqlTemplates } from './stage.js'; -import { - pgssBaselinePath, - stagePgStatStatementsTemplates, - writePgssBaselineAtomic, - type StagePgStatStatementsTemplatesResult, -} from './stage-pgss.js'; -import { - 
historicSqlManifestSchema, - historicSqlMetadataSchema, - historicSqlPullConfigSchema, - historicSqlUsageSchema, - type HistoricSqlSourceAdapterDeps, -} from './types.js'; +import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js'; +import { type HistoricSqlSourceAdapterDeps } from './types.js'; export class HistoricSqlSourceAdapter implements SourceAdapter { readonly source = 'historic-sql'; - readonly skillNames = ['historic_sql_ingest']; - readonly reconcileSkillNames = ['historic_sql_curator']; - readonly evidenceIndexing = 'documents' as const; - readonly triageSupported = true; - - private readonly pendingPgssBaselines = new Map(); + readonly skillNames = ['historic_sql_table_digest', 'historic_sql_patterns']; + readonly reconcileSkillNames: string[] = []; + readonly triageSupported = false; constructor(private readonly deps: HistoricSqlSourceAdapterDeps) {} @@ -42,94 +19,27 @@ export class HistoricSqlSourceAdapter implements SourceAdapter { } async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise { - const config = historicSqlPullConfigSchema.parse(pullConfig); - if (config.dialect === 'postgres') { - if (!this.deps.postgresReader) { - throw new Error('Historic SQL Postgres fetch requires deps.postgresReader'); - } - const postgresQueryClient = this.deps.postgresQueryClient ?? 
this.deps.queryClient; - if ( - !postgresQueryClient || - typeof postgresQueryClient !== 'object' || - !('executeQuery' in postgresQueryClient) || - typeof (postgresQueryClient as { executeQuery?: unknown }).executeQuery !== 'function' - ) { - throw new Error('Historic SQL Postgres fetch requires deps.postgresQueryClient with executeQuery(sql, params?)'); - } - const result = await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: ctx.connectionId, - queryClient: postgresQueryClient as NonNullable, - reader: this.deps.postgresReader, - sqlAnalysis: this.deps.sqlAnalysis, - pullConfig: config, - baselinePath: pgssBaselinePath(this.deps.postgresBaselineRootDir, ctx.connectionId), - now: this.deps.now?.(), - }); - this.pendingPgssBaselines.set(stagedDir, result); - return; - } - - await stageHistoricSqlTemplates({ + await stageHistoricSqlAggregatedSnapshot({ stagedDir, connectionId: ctx.connectionId, queryClient: this.deps.queryClient, reader: this.deps.reader, sqlAnalysis: this.deps.sqlAnalysis, - pullConfig: config, + pullConfig, now: this.deps.now?.(), }); + if (this.deps.legacyPostgresBaselineRootDir) { + await rm(join(this.deps.legacyPostgresBaselineRootDir, ctx.connectionId, ['pgss', 'baseline.json'].join('-')), { + force: true, + }); + } } chunk(stagedDir: string, diffSet?: DiffSet): Promise { - return chunkHistoricSqlStagedDir(stagedDir, diffSet); + return chunkHistoricSqlUnifiedStagedDir(stagedDir, diffSet); } describeScope(stagedDir: string): Promise { - return describeHistoricSqlScope(stagedDir); - } - - async getTriageSignals(stagedDir: string, externalId: string): Promise { - const manifest = historicSqlManifestSchema.parse( - JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')), - ); - const template = manifest.templates.find((entry) => entry.id === externalId); - if (!template) { - return {}; - } - const templateDir = template.path.replace(/\/page\.md$/, ''); - const metadata = historicSqlMetadataSchema.parse( - 
JSON.parse(await readFile(join(stagedDir, templateDir, 'metadata.json'), 'utf-8')), - ); - const usage = historicSqlUsageSchema.parse( - JSON.parse(await readFile(join(stagedDir, templateDir, 'usage.json'), 'utf-8')), - ); - - return { - objectType: metadata.objectType, - lastEditedAt: usage.stats.last_seen, - propertyHints: metadata.properties.triage_signals, - }; - } - - async onPullSucceeded(ctx: { - connectionId: string; - sourceKey: string; - syncId: string; - trigger: IngestTrigger; - completedAt: Date; - stagedDir: string; - }): Promise { - const manifest = historicSqlManifestSchema.parse( - JSON.parse(await readFile(join(ctx.stagedDir, 'manifest.json'), 'utf-8')), - ); - if (manifest.dialect === 'postgres') { - const pending = this.pendingPgssBaselines.get(ctx.stagedDir); - if (pending) { - await writePgssBaselineAtomic(pending.baselinePath, pending.baseline); - this.pendingPgssBaselines.delete(ctx.stagedDir); - } - } - await this.deps.onPullSucceeded?.({ ...ctx, nextSuccessfulCursor: manifest.nextSuccessfulCursor }); + return describeHistoricSqlUnifiedScope(stagedDir); } } diff --git a/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts b/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts new file mode 100644 index 00000000..22f35cfc --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts @@ -0,0 +1,304 @@ +import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import YAML from 'yaml'; +import { AgentRunnerService } from '../../../agent/index.js'; +import { initKtxProject, loadKtxProject, type KtxLocalProject } from '../../../project/index.js'; +import { + type SqlAnalysisBatchItem, + type SqlAnalysisBatchResult, + type SqlAnalysisDialect, + type SqlAnalysisPort, +} from '../../../sql-analysis/index.js'; +import { searchLocalSlSources } from '../../../sl/local-sl.js'; 
+import { searchLocalKnowledgePages } from '../../../wiki/local-knowledge.js'; +import { runLocalIngest } from '../../local-ingest.js'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js'; +import type { AggregatedTemplate, HistoricSqlReader, HistoricSqlUnifiedPullConfig } from './types.js'; + +class AcceptanceHistoricSqlReader implements HistoricSqlReader { + async probe() { + return { warnings: [], info: [] }; + } + + async *fetchAggregated( + _client: unknown, + _window: { start: Date; end: Date }, + _config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable { + yield { + templateId: 'pg:orders-lifecycle', + canonicalSql: + 'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.status = $1 group by o.status, c.segment', + dialect: 'postgres', + stats: { + executions: 42, + distinctUsers: 4, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 18, + p95RuntimeMs: 84, + errorRate: 0, + rowsProduced: 420, + }, + topUsers: [{ user: 'analyst@example.test', executions: 42 }], + }; + } +} + +class HistoricSqlAcceptanceAgentRunner extends AgentRunnerService { + override runLoop = vi.fn(async (params: any) => { + if (params.telemetryTags?.operationName !== 'ingest-bundle-wu') { + return { stopReason: 'natural' as const }; + } + + const emitEvidence = params.toolSet.emit_historic_sql_evidence; + if (!emitEvidence?.execute) { + throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit'); + } + + if (params.telemetryTags.unitKey === 'historic-sql-table-public-orders') { + const result = await emitEvidence.execute( + { + kind: 'table_usage', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Analysts repeatedly inspect paid order lifecycle by customer segment.', + frequencyTier: 'high', + commonFilters: 
['status'], + commonGroupBys: ['status', 'segment'], + commonJoins: [{ table: 'public.customers', on: ['customer_id', 'id'] }], + staleSince: null, + }, + }, + { toolCallId: 'historic-sql-orders-usage' }, + ); + if (!String(result).includes('Recorded historic-SQL table_usage evidence')) { + throw new Error(`Unexpected orders evidence result: ${String(result)}`); + } + } + + if (params.telemetryTags.unitKey === 'historic-sql-table-public-customers') { + const result = await emitEvidence.execute( + { + kind: 'table_usage', + table: 'public.customers', + rawPath: 'tables/public.customers.json', + usage: { + narrative: 'Customers provide segment context for paid order lifecycle analysis.', + frequencyTier: 'mid', + commonFilters: [], + commonGroupBys: ['segment'], + commonJoins: [{ table: 'public.orders', on: ['id', 'customer_id'] }], + staleSince: null, + }, + }, + { toolCallId: 'historic-sql-customers-usage' }, + ); + if (!String(result).includes('Recorded historic-SQL table_usage evidence')) { + throw new Error(`Unexpected customers evidence result: ${String(result)}`); + } + } + + if (params.telemetryTags.unitKey === 'historic-sql-patterns-part-0001') { + const result = await emitEvidence.execute( + { + kind: 'pattern', + rawPath: 'patterns-input/part-0001.json', + pattern: { + slug: 'paid-order-lifecycle', + title: 'Paid Order Lifecycle', + narrative: 'Analysts join orders and customers to compare paid order lifecycle by segment.', + definitionSql: + 'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status, c.segment', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:orders-lifecycle'], + }, + }, + { toolCallId: 'historic-sql-pattern' }, + ); + if (!String(result).includes('Recorded historic-SQL pattern evidence')) { + throw new Error(`Unexpected pattern evidence result: ${String(result)}`); + } + } + + return { stopReason: 
'natural' as const }; + }); + + constructor() { + super({ llmProvider: { getModel: () => ({}) as never } as never }); + } +} + +function acceptanceSqlAnalysis(): SqlAnalysisPort { + return { + analyzeForFingerprint: async () => { + throw new Error('analyzeForFingerprint should not be used by unified historic-SQL ingest'); + }, + analyzeBatch: vi.fn( + async ( + items: SqlAnalysisBatchItem[], + _dialect: SqlAnalysisDialect, + ): Promise> => { + return new Map( + items.map((item) => [ + item.id, + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: ['status', 'segment'], + where: ['status'], + join: ['customer_id', 'id'], + groupBy: ['status', 'segment'], + }, + }, + ]), + ); + }, + ), + }; +} + +async function writeHistoricSqlProject(project: KtxLocalProject): Promise { + await writeFile( + join(project.projectDir, 'ktx.yaml'), + [ + 'project: warehouse', + 'connections:', + ' warehouse:', + ' driver: postgres', + ' historicSql:', + ' enabled: true', + ' dialect: postgres', + ' minExecutions: 2', + 'ingest:', + ' adapters:', + ' - historic-sql', + ' embeddings:', + ' backend: deterministic', + 'storage:', + ' state: sqlite', + ' search: sqlite-fts5', + ' git:', + ' auto_commit: false', + ' author: KTX Test ', + '', + ].join('\n'), + 'utf-8', + ); + + const loaded = await loadKtxProject({ projectDir: project.projectDir }); + await loaded.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml', + YAML.stringify({ + tables: { + orders: { + table: 'public.orders', + columns: [ + { name: 'id', type: 'string' }, + { name: 'status', type: 'string' }, + { name: 'customer_id', type: 'string' }, + ], + }, + customers: { + table: 'public.customers', + columns: [ + { name: 'id', type: 'string' }, + { name: 'segment', type: 'string' }, + ], + }, + }, + }), + 'KTX Test', + 'system@ktx.local', + 'Seed schema shard', + ); + return loaded; +} + +describe('historic-SQL local ingest retrieval acceptance', () => { + let tempDir: 
string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-historic-sql-acceptance-')); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('projects table and pattern evidence into semantic-layer and wiki retrieval surfaces', async () => { + const initialized = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' }); + const project = await writeHistoricSqlProject(initialized); + const sqlAnalysis = acceptanceSqlAnalysis(); + const agentRunner = new HistoricSqlAcceptanceAgentRunner(); + const adapter = new HistoricSqlSourceAdapter({ + reader: new AcceptanceHistoricSqlReader(), + queryClient: {}, + sqlAnalysis, + now: () => new Date('2026-05-11T00:00:00.000Z'), + }); + + const result = await runLocalIngest({ + project, + adapters: [adapter], + adapter: 'historic-sql', + connectionId: 'warehouse', + jobId: 'historic-sql-retrieval-acceptance', + agentRunner, + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1); + expect(result.result.failedWorkUnits).toEqual([]); + expect(result.result.workUnitCount).toBe(3); + expect(agentRunner.runLoop).toHaveBeenCalledTimes(3); + const postProcessor = result.report.body.postProcessor; + expect(postProcessor).toBeDefined(); + if (!postProcessor) { + throw new Error('Expected historic-SQL post-processor result'); + } + expect(postProcessor).toMatchObject({ + sourceKey: 'historic-sql', + status: 'success', + result: { + tableUsageMerged: 2, + patternPagesWritten: 1, + }, + }); + expect(postProcessor.touchedSources).toEqual( + expect.arrayContaining([ + { connectionId: 'warehouse', sourceName: 'customers' }, + { connectionId: 'warehouse', sourceName: 'orders' }, + ]), + ); + + await expect(readFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves + .toContain('Analysts repeatedly inspect paid order lifecycle by customer segment.'); + await 
expect(readFile(join(project.projectDir, 'knowledge/global/historic-sql/paid-order-lifecycle.md'), 'utf-8')) + .resolves.toContain('Paid Order Lifecycle'); + + const reloaded = await loadKtxProject({ projectDir: project.projectDir }); + await expect( + searchLocalSlSources(reloaded, { connectionId: 'warehouse', query: 'paid order lifecycle', limit: 5 }), + ).resolves.toEqual(expect.arrayContaining([ + expect.objectContaining({ + name: 'orders', + frequencyTier: 'high', + snippet: expect.stringContaining(''), + matchReasons: expect.arrayContaining(['lexical']), + }), + ])); + await expect( + searchLocalKnowledgePages(reloaded, { query: 'paid order lifecycle', userId: 'local', limit: 5 }), + ).resolves.toEqual([ + expect.objectContaining({ + key: 'historic-sql/paid-order-lifecycle', + summary: 'Paid Order Lifecycle', + matchReasons: expect.arrayContaining(['lexical']), + }), + ]); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts b/packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts new file mode 100644 index 00000000..d37ed193 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/pattern-inputs.test.ts @@ -0,0 +1,89 @@ +import { describe, expect, it } from 'vitest'; +import { + HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES, + isHistoricSqlPatternInputShardPath, + serializedStagedPatternsInputByteLength, + splitHistoricSqlPatternInputs, +} from './pattern-inputs.js'; +import type { StagedPatternsInput } from './types.js'; + +type PatternTemplate = StagedPatternsInput['templates'][number]; + +function template(id: string, tablesTouched: string[], canonicalSql = 'select 1'): PatternTemplate { + return { + id, + canonicalSql, + tablesTouched, + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }; +} + +describe('historic-SQL pattern input sharding', () => { + it('keeps the audit input complete while sharding only cross-table pattern candidates', () => { + const 
largeSql = `select * from public.orders join public.customers on true where marker = '${'x'.repeat(260)}'`; + const input: StagedPatternsInput = { + templates: [ + template('single-table-orders', ['public.orders']), + template('orders-customers-2', ['public.orders', 'public.customers'], largeSql), + template('orders-customers-1', ['public.customers', 'public.orders'], largeSql), + template('orders-customers-payments', ['public.orders', 'public.customers', 'public.payments'], largeSql), + ], + }; + + const result = splitHistoricSqlPatternInputs(input, { maxBytes: 760 }); + + expect(result.auditInput.templates.map((entry) => entry.id)).toEqual([ + 'orders-customers-1', + 'orders-customers-2', + 'orders-customers-payments', + 'single-table-orders', + ]); + expect(result.shards.length).toBeGreaterThan(1); + expect(result.shards.map((shard) => shard.path)).toEqual([ + 'patterns-input/part-0001.json', + 'patterns-input/part-0002.json', + 'patterns-input/part-0003.json', + ]); + expect(result.shards.flatMap((shard) => shard.input.templates.map((entry) => entry.id))).toEqual([ + 'orders-customers-payments', + 'orders-customers-1', + 'orders-customers-2', + ]); + expect(result.shards.every((shard) => shard.byteLength <= 760)).toBe(true); + expect(result.shards.flatMap((shard) => shard.input.templates).some((entry) => entry.id === 'single-table-orders')).toBe(false); + expect(result.warnings).toEqual([]); + }); + + it('omits a single oversized template from shards and reports a manifest warning', () => { + const input: StagedPatternsInput = { + templates: [ + template( + 'oversized-cross-table', + ['public.orders', 'public.customers'], + `select * from public.orders join public.customers on true where payload = '${'x'.repeat(500)}'`, + ), + ], + }; + + const result = splitHistoricSqlPatternInputs(input, { maxBytes: 240 }); + + expect(result.auditInput.templates.map((entry) => entry.id)).toEqual(['oversized-cross-table']); + expect(result.shards).toEqual([]); + 
expect(result.warnings).toEqual(['patterns_input_template_too_large:oversized-cross-table']); + }); + + it('recognizes only generated pattern shard paths', () => { + expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0001.json')).toBe(true); + expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0012.json')).toBe(true); + expect(isHistoricSqlPatternInputShardPath('patterns-input.json')).toBe(false); + expect(isHistoricSqlPatternInputShardPath('patterns-input/part-1.json')).toBe(false); + expect(isHistoricSqlPatternInputShardPath('patterns-input/readme.md')).toBe(false); + }); + + it('uses a production byte budget below read_raw_file maximum size', () => { + expect(HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES).toBeLessThan(120_000); + expect(serializedStagedPatternsInputByteLength({ templates: [] })).toBeGreaterThan(0); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts b/packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts new file mode 100644 index 00000000..c9380239 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/pattern-inputs.ts @@ -0,0 +1,99 @@ +import { Buffer } from 'node:buffer'; +import type { StagedPatternsInput } from './types.js'; + +export const HISTORIC_SQL_PATTERN_WORKUNIT_DIR = 'patterns-input'; +export const HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES = 110_000; +export const HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE = /^patterns-input\/part-\d{4}\.json$/; + +type PatternTemplate = StagedPatternsInput['templates'][number]; + +export interface HistoricSqlPatternInputShard { + path: string; + input: StagedPatternsInput; + byteLength: number; +} + +export interface HistoricSqlPatternInputSplitResult { + auditInput: StagedPatternsInput; + shards: HistoricSqlPatternInputShard[]; + warnings: string[]; +} + +export interface HistoricSqlPatternInputSplitOptions { + maxBytes?: number; +} + +export function isHistoricSqlPatternInputShardPath(path: string): boolean { + 
return HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE.test(path); +} + +export function serializeStagedPatternsInput(input: StagedPatternsInput): string { + return `${JSON.stringify(input, null, 2)}\n`; +} + +export function serializedStagedPatternsInputByteLength(input: StagedPatternsInput): number { + return Buffer.byteLength(serializeStagedPatternsInput(input), 'utf-8'); +} + +function sortedAuditTemplates(templates: readonly PatternTemplate[]): PatternTemplate[] { + return [...templates].sort((left, right) => left.id.localeCompare(right.id)); +} + +function sortedPatternCandidates(templates: readonly PatternTemplate[]): PatternTemplate[] { + return [...templates] + .filter((template) => template.tablesTouched.length >= 2) + .map((template) => ({ ...template, tablesTouched: [...template.tablesTouched].sort() })) + .sort((left, right) => { + const cardinality = right.tablesTouched.length - left.tablesTouched.length; + if (cardinality !== 0) return cardinality; + const tableSignature = left.tablesTouched.join('\0').localeCompare(right.tablesTouched.join('\0')); + if (tableSignature !== 0) return tableSignature; + return left.id.localeCompare(right.id); + }); +} + +function shardPath(index: number): string { + return `${HISTORIC_SQL_PATTERN_WORKUNIT_DIR}/part-${String(index).padStart(4, '0')}.json`; +} + +export function splitHistoricSqlPatternInputs( + input: StagedPatternsInput, + options: HistoricSqlPatternInputSplitOptions = {}, +): HistoricSqlPatternInputSplitResult { + const maxBytes = options.maxBytes ?? 
HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES; + const auditInput: StagedPatternsInput = { templates: sortedAuditTemplates(input.templates) }; + const warnings: string[] = []; + const shards: HistoricSqlPatternInputShard[] = []; + let current: PatternTemplate[] = []; + + const flush = () => { + if (current.length === 0) { + return; + } + const shardInput: StagedPatternsInput = { templates: current }; + shards.push({ + path: shardPath(shards.length + 1), + input: shardInput, + byteLength: serializedStagedPatternsInputByteLength(shardInput), + }); + current = []; + }; + + for (const template of sortedPatternCandidates(input.templates)) { + const singleInput: StagedPatternsInput = { templates: [template] }; + if (serializedStagedPatternsInputByteLength(singleInput) > maxBytes) { + warnings.push(`patterns_input_template_too_large:${template.id}`); + continue; + } + + const nextInput: StagedPatternsInput = { templates: [...current, template] }; + if (current.length > 0 && serializedStagedPatternsInputByteLength(nextInput) > maxBytes) { + flush(); + } + current.push(template); + } + + flush(); + + return { auditInput, shards, warnings }; +} diff --git a/packages/context/src/ingest/adapters/historic-sql/post-processor.test.ts b/packages/context/src/ingest/adapters/historic-sql/post-processor.test.ts new file mode 100644 index 00000000..c96461c1 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/post-processor.test.ts @@ -0,0 +1,74 @@ +import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import YAML from 'yaml'; +import { describe, expect, it } from 'vitest'; +import { HistoricSqlProjectionPostProcessor } from './post-processor.js'; + +async function tempWorkdir(): Promise { + return mkdtemp(join(tmpdir(), 'historic-sql-post-processor-')); +} + +async function writeJson(root: string, relPath: string, value: unknown): Promise { + const target = join(root, relPath); + await 
mkdir(join(target, '..'), { recursive: true }); + await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); +} + +describe('HistoricSqlProjectionPostProcessor', () => { + it('projects current run evidence before the ingest squash commit', async () => { + const workdir = await tempWorkdir(); + await mkdir(join(workdir, 'semantic-layer/warehouse/_schema'), { recursive: true }); + await writeFile( + join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), + YAML.stringify({ tables: { orders: { table: 'public.orders', columns: [{ name: 'id', type: 'string' }] } } }), + 'utf-8', + ); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 1, + touchedTableCount: 1, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', { + kind: 'table_usage', + connectionId: 'warehouse', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [], + staleSince: null, + }, + }); + + const result = await new HistoricSqlProjectionPostProcessor().run({ + connectionId: 'warehouse', + sourceKey: 'historic-sql', + syncId: 'sync-1', + jobId: 'job-1', + runId: 'run-1', + workdir, + parseArtifacts: null, + }); + + expect(result.errors).toEqual([]); + expect(result.warnings).toEqual([]); + expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]); + expect(result.result).toMatchObject({ 
tableUsageMerged: 1 }); + await expect(readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves.toContain( + 'Orders are repeatedly queried by lifecycle status.', + ); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/post-processor.ts b/packages/context/src/ingest/adapters/historic-sql/post-processor.ts new file mode 100644 index 00000000..815b6798 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/post-processor.ts @@ -0,0 +1,41 @@ +import type { IngestBundlePostProcessorInput, IngestBundlePostProcessorPort, IngestBundlePostProcessorResult } from '../../ports.js'; +import { createSimpleGit } from '../../../core/git-env.js'; +import { projectHistoricSqlEvidence } from './projection.js'; + +async function commitProjectionChanges(workdir: string): Promise { + const git = createSimpleGit(workdir); + if (!(await git.checkIsRepo().catch(() => false))) { + return; + } + const status = await git.status(); + const paths = status.files + .map((file) => file.path) + .filter((path) => path.startsWith('semantic-layer/') || path.startsWith('knowledge/global/historic-sql/')); + if (paths.length === 0) { + return; + } + await git.add(paths); + const staged = await git.diff(['--cached', '--name-only']); + if (!staged.trim()) { + return; + } + await git.commit('Project historic SQL evidence', { '--author': 'System User ' }); +} + +export class HistoricSqlProjectionPostProcessor implements IngestBundlePostProcessorPort { + async run(input: IngestBundlePostProcessorInput): Promise { + const projection = await projectHistoricSqlEvidence({ + workdir: input.workdir, + connectionId: input.connectionId, + syncId: input.syncId, + runId: input.runId, + }); + await commitProjectionChanges(input.workdir); + return { + result: projection, + warnings: projection.warnings, + errors: [], + touchedSources: projection.touchedSources, + }; + } +} diff --git 
a/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.test.ts b/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts similarity index 65% rename from packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.test.ts rename to packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts index 3f7b3fca..3bf4b2f5 100644 --- a/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts @@ -4,7 +4,7 @@ import { HistoricSqlGrantsMissingError, HistoricSqlVersionUnsupportedError, } from './errors.js'; -import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js'; +import { PostgresPgssReader } from './postgres-pgss-reader.js'; interface FakeQueryResult { headers: string[]; @@ -35,7 +35,7 @@ function executedSql(client: ReturnType, index: number): str return call[0]; } -describe('PostgresPgssQueryHistoryReader', () => { +describe('PostgresPgssReader aggregate path', () => { it('probes version, extension presence, grants, and tracking state', async () => { const client = queryClient([ { @@ -47,11 +47,12 @@ describe('PostgresPgssQueryHistoryReader', () => { { headers: ['track'], rows: [['top']] }, { headers: ['max'], rows: [['5000']] }, ]); - const reader = new PostgresPgssQueryHistoryReader(); + const reader = new PostgresPgssReader(); await expect(reader.probe(client)).resolves.toEqual({ pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin', warnings: [], + info: [], }); expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int"); @@ -69,12 +70,8 @@ describe('PostgresPgssQueryHistoryReader', () => { headers: ['server_version_num', 'server_version'], rows: [[130012, 'PostgreSQL 13.12']], }, - { - headers: ['stats_reset', 'dealloc'], - rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]], - }, ]); - 
const reader = new PostgresPgssQueryHistoryReader(); + const reader = new PostgresPgssReader(); const promise = reader.probe(client); await expect(promise).rejects.toMatchObject({ @@ -95,7 +92,7 @@ describe('PostgresPgssQueryHistoryReader', () => { }, new Error('relation "pg_stat_statements" does not exist'), ]); - const reader = new PostgresPgssQueryHistoryReader(); + const reader = new PostgresPgssReader(); const promise = reader.probe(client); await expect(promise).rejects.toMatchObject({ @@ -113,7 +110,7 @@ describe('PostgresPgssQueryHistoryReader', () => { }, new Error('pg_stat_statements must be loaded via shared_preload_libraries'), ]); - const reader = new PostgresPgssQueryHistoryReader(); + const reader = new PostgresPgssReader(); const promise = reader.probe(client); await expect(promise).rejects.toMatchObject({ @@ -134,7 +131,7 @@ describe('PostgresPgssQueryHistoryReader', () => { { headers: ['?column?'], rows: [[1]] }, { headers: ['has_role'], rows: [[false]] }, ]); - const reader = new PostgresPgssQueryHistoryReader(); + const reader = new PostgresPgssReader(); const promise = reader.probe(client); await expect(promise).rejects.toMatchObject({ @@ -156,17 +153,18 @@ describe('PostgresPgssQueryHistoryReader', () => { { headers: ['track'], rows: [['none']] }, { headers: ['max'], rows: [['5000']] }, ]); - const reader = new PostgresPgssQueryHistoryReader(); + const reader = new PostgresPgssReader(); await expect(reader.probe(client)).resolves.toEqual({ pgServerVersion: 'PostgreSQL 16.4', warnings: [ "pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config", ], + info: [], }); }); - it('warns when pg_stat_statements.max is below the recommended floor', async () => { + it('returns an info note when pg_stat_statements.max is below the recommended floor', async () => { const client = queryClient([ { headers: ['server_version_num', 'server_version'], @@ -177,105 +175,68 @@ describe('PostgresPgssQueryHistoryReader', () => 
{ { headers: ['track'], rows: [['top']] }, { headers: ['max'], rows: [['1000']] }, ]); - const reader = new PostgresPgssQueryHistoryReader(); + const reader = new PostgresPgssReader(); await expect(reader.probe(client)).resolves.toEqual({ pgServerVersion: 'PostgreSQL 16.4', - warnings: [ + warnings: [], + info: [ 'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn', ], }); }); - it('reads a parameterized pg_stat_statements snapshot and stats info', async () => { - const client = queryClient([ - { - headers: [ - 'queryid', - 'userid', - 'username', - 'dbid', - 'database', - 'query', - 'calls', - 'total_exec_time', - 'mean_exec_time', - 'total_rows', - ], + it('aggregates pg_stat_statements rows by queryid and query', async () => { + const executeQuery = vi.fn(async (sql: string, params?: unknown[]) => { + if (sql.includes('pg_stat_statements_info')) { + return { headers: ['stats_reset', 'dealloc'], rows: [['2026-05-01T00:00:00.000Z', 1]] }; + } + expect(sql).toContain('GROUP BY queryid, query'); + expect(sql).toContain('HAVING SUM(calls) >= $1'); + expect(params).toEqual([5]); + return { + headers: ['template_id', 'canonical_sql', 'executions', 'distinct_users', 'mean_ms', 'rows_produced', 'top_users'], rows: [ [ - '922337203685477580', - '16384', - 'analyst', - '16385', - 'warehouse', - 'SELECT count(*) FROM public.orders WHERE status = $1', + '123', + 'select status from public.orders', '42', - '2100.5', - '50.0119', - '9001', - ], - [ - '922337203685477581', - '16386', - 'unknown', - '16385', - 'warehouse', - 'SELECT * FROM public.customers WHERE id = $1', - 5, - 30, - 6, - 5, + '3', + '11.5', + '100', + JSON.stringify([{ user: 'analyst', executions: 40 }]), ], ], - }, - { - headers: ['stats_reset', 'dealloc'], - rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]], - }, - ]); - const reader = new PostgresPgssQueryHistoryReader(); - - await expect(reader.readSnapshot(client, { minCalls: 5, maxTemplates: 500 
})).resolves.toEqual({ - statsResetAt: '2026-05-01T00:00:00.000Z', - deallocCount: 7, - rows: [ - { - queryid: '922337203685477580', - userid: '16384', - username: 'analyst', - dbid: '16385', - database: 'warehouse', - query: 'SELECT count(*) FROM public.orders WHERE status = $1', - calls: 42, - totalExecTime: 2100.5, - meanExecTime: 50.0119, - totalRows: 9001, - }, - { - queryid: '922337203685477581', - userid: '16386', - username: 'unknown', - dbid: '16385', - database: 'warehouse', - query: 'SELECT * FROM public.customers WHERE id = $1', - calls: 5, - totalExecTime: 30, - meanExecTime: 6, - totalRows: 5, - }, - ], + }; }); - const snapshotSql = executedSql(client, 0); - expect(snapshotSql).toContain('FROM pg_stat_statements s'); - expect(snapshotSql).toContain('LEFT JOIN pg_roles'); - expect(snapshotSql).toContain('LEFT JOIN pg_database'); - expect(snapshotSql).toContain('WHERE s.toplevel = true'); - expect(snapshotSql).toContain('AND s.calls >= $1'); - expect(snapshotSql).toContain('ORDER BY s.total_exec_time DESC'); - expect(snapshotSql).toContain('LIMIT $2'); - expect(client.executeQuery.mock.calls[0]?.[1]).toEqual([5, 500]); - expect(executedSql(client, 1)).toBe('SELECT stats_reset, dealloc FROM pg_stat_statements_info'); + const reader = new PostgresPgssReader(); + const rows = []; + for await (const row of reader.fetchAggregated( + { executeQuery }, + { start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') }, + { dialect: 'postgres', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, + )) { + rows.push(row); + } + + expect(rows).toEqual([ + { + templateId: '123', + canonicalSql: 'select status from public.orders', + dialect: 'postgres', + stats: { + executions: 42, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 11.5, + p95RuntimeMs: 11.5, + errorRate: 0, + 
rowsProduced: 100, + }, + topUsers: [{ user: 'analyst', executions: 40 }], + }, + ]); }); }); diff --git a/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.ts b/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts similarity index 68% rename from packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.ts rename to packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts index 116233dc..8887acb8 100644 --- a/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-query-history-reader.ts +++ b/packages/context/src/ingest/adapters/historic-sql/postgres-pgss-reader.ts @@ -3,12 +3,13 @@ import { HistoricSqlGrantsMissingError, HistoricSqlVersionUnsupportedError, } from './errors.js'; -import type { - KtxPostgresQueryClient, - PostgresPgssProbeResult, - PostgresPgssReader, - PostgresPgssRow, - PostgresPgssSnapshot, +import { + aggregatedTemplateSchema, + type AggregatedTemplate, + type HistoricSqlTimeWindow, + type HistoricSqlUnifiedPullConfig, + type KtxPostgresQueryClient, + type PostgresPgssProbeResult, } from './types.js'; interface QueryResultLike { @@ -18,37 +19,35 @@ interface QueryResultLike { error?: string; } +const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info'; const VERSION_SQL = ` SELECT current_setting('server_version_num')::int AS server_version_num, version() AS server_version `.trim(); - const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1'; const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role"; const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track"; const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max"; const RECOMMENDED_PGSS_MAX = 5000; -const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info'; -const SNAPSHOT_SQL = ` -SELECT - s.queryid::text AS queryid, - 
s.userid::text AS userid, - COALESCE(r.rolname, 'unknown') AS username, - s.dbid::text AS dbid, - d.datname AS database, - s.query, - s.calls, - s.total_exec_time, - s.mean_exec_time, - s.rows AS total_rows -FROM pg_stat_statements s -LEFT JOIN pg_roles r ON s.userid = r.oid -LEFT JOIN pg_database d ON s.dbid = d.oid -WHERE s.toplevel = true - AND s.calls >= $1 -ORDER BY s.total_exec_time DESC -LIMIT $2 +const AGGREGATE_SQL = ` +SELECT queryid::text AS template_id, + query AS canonical_sql, + SUM(calls)::bigint AS executions, + COUNT(DISTINCT userid) AS distinct_users, + SUM(total_exec_time) / NULLIF(SUM(calls), 0) AS mean_ms, + SUM(rows)::bigint AS rows_produced, + COALESCE( + json_agg(json_build_object('user', rolname, 'executions', calls) ORDER BY calls DESC) + FILTER (WHERE userid IS NOT NULL), + '[]'::json + )::text AS top_users +FROM pg_stat_statements +LEFT JOIN pg_roles ON pg_roles.oid = pg_stat_statements.userid +WHERE toplevel = true +GROUP BY queryid, query +HAVING SUM(calls) >= $1 +ORDER BY SUM(total_exec_time) DESC `.trim(); const POSTGRES_EXTENSION_REMEDIATION = [ @@ -78,7 +77,7 @@ async function execute(client: KtxPostgresQueryClient, sql: string, params?: unk return result; } -function indexes(headers: string[]): Map { +function indexByHeader(headers: string[]): Map { const out = new Map(); headers.forEach((header, index) => out.set(header.toLowerCase(), index)); return out; @@ -113,12 +112,21 @@ function requiredFiniteNumber(raw: unknown, field: string): number { return number; } -function nullableInteger(raw: unknown): number | null { +function requiredInteger(raw: unknown, field: string): number { + return Math.trunc(requiredFiniteNumber(raw, field)); +} + +function nullableNumber(raw: unknown): number | null { if (raw === null || raw === undefined || raw === '') { return null; } const number = typeof raw === 'number' ? raw : Number(raw); - return Number.isFinite(number) ? Math.trunc(number) : null; + return Number.isFinite(number) ? 
number : null; +} + +function nullableInteger(raw: unknown): number | null { + const number = nullableNumber(raw); + return number === null ? null : Math.trunc(number); } function nullableIsoTimestamp(raw: unknown): string | null { @@ -137,7 +145,7 @@ function firstRow(result: QueryResultLike, context: string): { row: unknown[]; h if (!row) { throw new Error(`Postgres historic-SQL ${context} query returned no rows`); } - return { row, headers: indexes(result.headers) }; + return { row, headers: indexByHeader(result.headers) }; } function isMissingPgssRelation(error: unknown): boolean { @@ -167,22 +175,30 @@ function grantsMissingError(): HistoricSqlGrantsMissingError { }); } -function mapSnapshotRow(row: unknown[], headerIndexes: Map): PostgresPgssRow { - return { - queryid: requiredString(value(row, headerIndexes, 'queryid'), 'queryid'), - userid: requiredString(value(row, headerIndexes, 'userid'), 'userid'), - username: nullableString(value(row, headerIndexes, 'username')), - dbid: requiredString(value(row, headerIndexes, 'dbid'), 'dbid'), - database: nullableString(value(row, headerIndexes, 'database')), - query: requiredString(value(row, headerIndexes, 'query'), 'query'), - calls: Math.trunc(requiredFiniteNumber(value(row, headerIndexes, 'calls'), 'calls')), - totalExecTime: requiredFiniteNumber(value(row, headerIndexes, 'total_exec_time'), 'total_exec_time'), - meanExecTime: requiredFiniteNumber(value(row, headerIndexes, 'mean_exec_time'), 'mean_exec_time'), - totalRows: Math.trunc(requiredFiniteNumber(value(row, headerIndexes, 'total_rows'), 'total_rows')), - }; +function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> { + const text = nullableString(raw); + if (!text) { + return []; + } + try { + const parsed = JSON.parse(text) as unknown; + if (!Array.isArray(parsed)) { + return []; + } + return parsed.flatMap((entry) => { + if (!entry || typeof entry !== 'object') { + return []; + } + const user = nullableString((entry as { 
user?: unknown }).user); + const executions = nullableInteger((entry as { executions?: unknown }).executions); + return executions === null ? [] : [{ user, executions }]; + }); + } catch { + return []; + } } -export class PostgresPgssQueryHistoryReader implements PostgresPgssReader { +export class PostgresPgssReader { async probe(client: unknown): Promise { const pgClient = queryClient(client); const versionResult = await execute(pgClient, VERSION_SQL); @@ -231,32 +247,47 @@ export class PostgresPgssQueryHistoryReader implements PostgresPgssReader { const pgssMax = nullableInteger(value(maxRow, maxHeaders, 'max')); const warnings: string[] = []; + const info: string[] = []; if (track === 'none') { warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'); } if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) { - warnings.push( + info.push( `pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`, ); } - return { pgServerVersion, warnings }; + return { pgServerVersion, warnings, info }; } - async readSnapshot( + async *fetchAggregated( client: unknown, - options: { minCalls: number; maxTemplates: number }, - ): Promise { + window: HistoricSqlTimeWindow, + config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable { const pgClient = queryClient(client); - const snapshotResult = await execute(pgClient, SNAPSHOT_SQL, [options.minCalls, options.maxTemplates]); - const snapshotHeaders = indexes(snapshotResult.headers); const statsResult = await execute(pgClient, STATS_INFO_SQL); const { row: statsRow, headers: statsHeaders } = firstRow(statsResult, 'stats-info'); - - return { - statsResetAt: nullableIsoTimestamp(value(statsRow, statsHeaders, 'stats_reset')), - deallocCount: nullableInteger(value(statsRow, statsHeaders, 'dealloc')), - rows: snapshotResult.rows.map((row) => mapSnapshotRow(row, snapshotHeaders)), - }; + const firstSeen = 
nullableIsoTimestamp(value(statsRow, statsHeaders, 'stats_reset')) ?? window.start.toISOString(); + const result = await execute(pgClient, AGGREGATE_SQL, [config.minExecutions]); + const indexes = indexByHeader(result.headers); + for (const row of result.rows) { + yield aggregatedTemplateSchema.parse({ + templateId: requiredString(value(row, indexes, 'template_id'), 'template_id'), + canonicalSql: requiredString(value(row, indexes, 'canonical_sql'), 'canonical_sql'), + dialect: 'postgres', + stats: { + executions: requiredInteger(value(row, indexes, 'executions'), 'executions'), + distinctUsers: requiredInteger(value(row, indexes, 'distinct_users'), 'distinct_users'), + firstSeen, + lastSeen: window.end.toISOString(), + p50RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')), + p95RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')), + errorRate: 0, + rowsProduced: nullableInteger(value(row, indexes, 'rows_produced')), + }, + topUsers: parseTopUsers(value(row, indexes, 'top_users')), + }); + } } } diff --git a/packages/context/src/ingest/adapters/historic-sql/projection.test.ts b/packages/context/src/ingest/adapters/historic-sql/projection.test.ts new file mode 100644 index 00000000..e6cb736a --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/projection.test.ts @@ -0,0 +1,372 @@ +import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import YAML from 'yaml'; +import { describe, expect, it } from 'vitest'; +import { projectHistoricSqlEvidence } from './projection.js'; + +async function tempWorkdir(): Promise { + return mkdtemp(join(tmpdir(), 'historic-sql-projection-')); +} + +async function writeText(root: string, relPath: string, content: string): Promise { + const target = join(root, relPath); + await mkdir(join(target, '..'), { recursive: true }); + await writeFile(target, content, 'utf-8'); +} + +async function writeJson(root: string, relPath: 
string, value: unknown): Promise { + await writeText(root, relPath, `${JSON.stringify(value, null, 2)}\n`); +} + +describe('projectHistoricSqlEvidence', () => { + it('merges table usage into matching _schema shards and preserves external usage keys', async () => { + const workdir = await tempWorkdir(); + await writeText( + workdir, + 'semantic-layer/warehouse/_schema/public.yaml', + YAML.stringify({ + tables: { + orders: { + table: 'public.orders', + usage: { + narrative: 'Old generated usage.', + frequencyTier: 'low', + commonFilters: ['old_status'], + commonJoins: [], + ownerNote: 'keep me', + }, + columns: [{ name: 'id', type: 'string' }], + }, + }, + }), + ); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 1, + touchedTableCount: 1, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', { + kind: 'table_usage', + connectionId: 'warehouse', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried for lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + }, + }); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]); + const shard = YAML.parse(await readFile(join(workdir, 
'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')); + expect(shard.tables.orders.usage).toEqual({ + ownerNote: 'keep me', + narrative: 'Orders are repeatedly queried for lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + }); + }); + + it('writes pattern pages, reuses similar slugs, and marks missing old pattern pages stale', async () => { + const workdir = await tempWorkdir(); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 2, + touchedTableCount: 2, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' }); + await writeText( + workdir, + 'knowledge/global/historic-sql/old-order-lifecycle.md', + [ + '---', + YAML.stringify({ + summary: 'Old order lifecycle page', + tags: ['historic-sql', 'pattern'], + refs: [], + sl_refs: ['orders'], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.orders', 'public.customers'], + fingerprints: ['pg:1'], + }).trimEnd(), + '---', + '', + 'Old body', + '', + ].join('\n'), + ); + await writeText( + workdir, + 'knowledge/global/historic-sql/retired-pattern.md', + [ + '---', + YAML.stringify({ + summary: 'Retired pattern', + tags: ['historic-sql', 'pattern'], + refs: [], + sl_refs: [], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.tickets'], + fingerprints: ['pg:9'], + }).trimEnd(), + 
'---', + '', + 'Retired body', + '', + ].join('\n'), + ); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', { + kind: 'pattern', + connectionId: 'warehouse', + rawPath: 'patterns-input.json', + pattern: { + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Analysts compare order status with customer segment.', + definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:1', 'pg:2'], + }, + }); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.patternPagesWritten).toBe(1); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/old-order-lifecycle.md'), 'utf-8')).resolves.toContain( + 'Order Lifecycle Analysis', + ); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/retired-pattern.md'), 'utf-8')).resolves.toContain( + 'stale_since: "2026-05-11T00:00:00.000Z"', + ); + }); + + it('writes a reappearing pattern to the active slug instead of reusing an archived page key', async () => { + const workdir = await tempWorkdir(); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 2, + touchedTableCount: 2, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 30, + }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' }); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' }); + await 
writeText( + workdir, + 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md', + [ + '---', + YAML.stringify({ + summary: 'Archived order lifecycle page', + tags: ['historic-sql', 'pattern', 'archived'], + refs: [], + sl_refs: ['orders'], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.orders', 'public.customers'], + fingerprints: ['pg:1'], + stale_since: '2026-01-01T00:00:00.000Z', + }).trimEnd(), + '---', + '', + 'Archived body', + '', + ].join('\n'), + ); + await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', { + kind: 'pattern', + connectionId: 'warehouse', + rawPath: 'patterns-input.json', + pattern: { + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Analysts compare order status with customer segment again.', + definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['pg:1', 'pg:2'], + }, + }); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.patternPagesWritten).toBe(1); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain( + 'Order Lifecycle Analysis', + ); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain( + 'Archived body', + ); + await expect( + readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/order-lifecycle-analysis.md'), 'utf-8'), + ).rejects.toMatchObject({ code: 'ENOENT' }); + }); + + it('leaves already archived pattern pages stable when they are still absent', async () => { + const workdir = await tempWorkdir(); + await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + 
source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 0, + touchedTableCount: 0, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 30, + }); + await writeText( + workdir, + 'knowledge/global/historic-sql/_archived/retired-pattern.md', + [ + '---', + YAML.stringify({ + summary: 'Retired pattern', + tags: ['historic-sql', 'pattern', 'archived'], + refs: [], + sl_refs: [], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.tickets'], + fingerprints: ['pg:9'], + stale_since: '2026-01-01T00:00:00.000Z', + }).trimEnd(), + '---', + '', + 'Archived retired body', + '', + ].join('\n'), + ); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.archivedPatternPages).toBe(0); + expect(result.stalePatternPagesMarked).toBe(0); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/retired-pattern.md'), 'utf-8')).resolves.toContain( + 'Archived retired body', + ); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/retired-pattern.md'), 'utf-8')).rejects.toMatchObject({ + code: 'ENOENT', + }); + }); + + it('marks missing table usage stale and deletes legacy historic SQL query pages', async () => { + const workdir = await tempWorkdir(); + await writeText( + workdir, + 'semantic-layer/warehouse/_schema/public.yaml', + YAML.stringify({ + tables: { + orders: { + table: 'public.orders', + usage: { + narrative: 'Orders were active before.', + frequencyTier: 'high', + commonFilters: ['status'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + ownerNote: 'keep analyst annotation', + }, + columns: [{ name: 'id', type: 'string' }], + }, + }, + }), + ); + await 
writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 0, + touchedTableCount: 0, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }); + await writeText( + workdir, + 'knowledge/global/historic-sql/legacy-template.md', + [ + '---', + YAML.stringify({ + summary: 'Legacy template page', + tags: ['historic-sql', 'query-pattern'], + refs: [], + sl_refs: ['orders'], + usage_mode: 'auto', + source: 'historic-sql', + tables: ['public.orders'], + fingerprints: ['legacy:1'], + }).trimEnd(), + '---', + '', + 'Legacy body', + '', + ].join('\n'), + ); + + const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' }); + + expect(result.staleTablesMarked).toBe(1); + expect(result.legacyPagesDeleted).toBe(1); + expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]); + const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')); + expect(shard.tables.orders.usage).toEqual({ + ownerNote: 'keep analyst annotation', + narrative: 'No recent historic SQL usage was observed in the latest snapshot.', + frequencyTier: 'unused', + commonFilters: [], + commonGroupBys: [], + commonJoins: [], + staleSince: '2026-05-11T00:00:00.000Z', + }); + await expect(readFile(join(workdir, 'knowledge/global/historic-sql/legacy-template.md'), 'utf-8')).rejects.toMatchObject({ + code: 'ENOENT', + }); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/projection.ts b/packages/context/src/ingest/adapters/historic-sql/projection.ts new file mode 100644 index 00000000..ca24a67f --- /dev/null +++ 
b/packages/context/src/ingest/adapters/historic-sql/projection.ts @@ -0,0 +1,334 @@ +import { access, mkdir, readdir, readFile, rename, rm, writeFile } from 'node:fs/promises'; +import { dirname, join, relative } from 'node:path'; +import YAML from 'yaml'; +import { rawSourcesDirForSync } from '../../raw-sources-paths.js'; +import { mergeUsagePreservingExternal } from '../live-database/manifest.js'; +import { historicSqlEvidenceEnvelopeSchema, type HistoricSqlEvidenceEnvelope } from './evidence.js'; +import type { TableUsageOutput } from './skill-schemas.js'; +import { stagedManifestSchema } from './types.js'; + +export interface HistoricSqlProjectionInput { + workdir: string; + connectionId: string; + syncId: string; + runId: string; +} + +export interface HistoricSqlProjectionResult { + tableUsageMerged: number; + staleTablesMarked: number; + patternPagesWritten: number; + stalePatternPagesMarked: number; + archivedPatternPages: number; + legacyPagesDeleted: number; + touchedSources: Array<{ connectionId: string; sourceName: string }>; + warnings: string[]; +} + +interface ManifestShard { + tables?: Record; columns?: unknown[]; [key: string]: unknown }>; +} + +interface HistoricSqlPatternPage { + key: string; + path: string; + frontmatter: Record; + content: string; +} + +function safeKnowledgeSlug(value: string): string { + return value.toLowerCase().replace(/[^a-z0-9/-]+/g, '-').replace(/^-+|-+$/g, ''); +} + +async function pathExists(path: string): Promise { + try { + await access(path); + return true; + } catch { + return false; + } +} + +async function walkFiles(root: string): Promise { + if (!(await pathExists(root))) return []; + const result: string[] = []; + async function visit(dir: string): Promise { + const entries = await readdir(dir, { withFileTypes: true }); + for (const entry of entries) { + const absolute = join(dir, entry.name); + if (entry.isDirectory()) { + await visit(absolute); + } else if (entry.isFile()) { + result.push(relative(root, 
absolute).replace(/\\/g, '/')); + } + } + } + await visit(root); + return result.sort(); +} + +async function readJson(path: string): Promise { + return JSON.parse(await readFile(path, 'utf-8')) as unknown; +} + +async function writeYamlAtomic(path: string, value: unknown): Promise { + await mkdir(dirname(path), { recursive: true }); + const tmp = `${path}.tmp`; + await writeFile(tmp, YAML.stringify(value, { indent: 2, lineWidth: 0 }), 'utf-8'); + await rename(tmp, path); +} + +function tableSourceName(tableRef: string): string { + return tableRef.split('.').filter(Boolean).at(-1) ?? tableRef; +} + +function staleUsage(fetchedAt: string) { + return { + narrative: 'No recent historic SQL usage was observed in the latest snapshot.', + frequencyTier: 'unused' as const, + commonFilters: [], + commonGroupBys: [], + commonJoins: [], + staleSince: fetchedAt, + }; +} + +async function loadEvidence(workdir: string, runId: string): Promise { + const root = join(workdir, '.ktx/ingest-evidence/historic-sql', runId); + const files = await walkFiles(root); + const evidence: HistoricSqlEvidenceEnvelope[] = []; + for (const file of files.filter((candidate) => candidate.endsWith('.json'))) { + evidence.push(historicSqlEvidenceEnvelopeSchema.parse(await readJson(join(root, file)))); + } + return evidence; +} + +function renderPatternMarkdown(pattern: HistoricSqlEvidenceEnvelope & { kind: 'pattern' }): string { + return [ + `# ${pattern.pattern.title}`, + '', + pattern.pattern.narrative, + '', + '## Representative SQL', + '', + '```sql', + pattern.pattern.definitionSql, + '```', + '', + '## Tables', + '', + ...pattern.pattern.tablesInvolved.map((table) => `- ${table}`), + '', + '## Constituent Templates', + '', + ...pattern.pattern.constituentTemplateIds.map((id) => `- ${id}`), + '', + ].join('\n'); +} + +function overlapRatio(left: string[], right: string[]): number { + const rightSet = new Set(right); + const intersection = left.filter((value) => rightSet.has(value)).length; + 
return left.length === 0 ? 0 : intersection / left.length; +} + +function parseMarkdownPage(key: string, path: string, raw: string): HistoricSqlPatternPage | null { + const match = raw.match(/^---\n([\s\S]*?)\n---\n?([\s\S]*)$/); + if (!match) return null; + return { + key, + path, + frontmatter: (YAML.parse(match[1] ?? '') ?? {}) as Record, + content: match[2] ?? '', + }; +} + +function isHistoricPatternPage(page: HistoricSqlPatternPage): boolean { + const tags = Array.isArray(page.frontmatter.tags) ? page.frontmatter.tags : []; + return ( + page.frontmatter.source === 'historic-sql' && + tags.includes('historic-sql') && + tags.includes('pattern') + ); +} + +function isLegacyQueryPage(page: HistoricSqlPatternPage): boolean { + const tags = Array.isArray(page.frontmatter.tags) ? page.frontmatter.tags : []; + return page.frontmatter.source === 'historic-sql' && tags.includes('query-pattern') && !tags.includes('pattern'); +} + +function isArchivedPatternPage(page: HistoricSqlPatternPage): boolean { + const tags = Array.isArray(page.frontmatter.tags) ? page.frontmatter.tags : []; + return page.key.startsWith('_archived/') || tags.includes('archived'); +} + +function stringArray(value: unknown): string[] { + return Array.isArray(value) ? 
value.filter((entry): entry is string => typeof entry === 'string') : []; +} + +function renderMarkdownPage(frontmatter: Record, content: string): string { + let yaml = YAML.stringify(frontmatter, { indent: 2, lineWidth: 0 }).trimEnd(); + const staleSince = frontmatter.stale_since; + if (typeof staleSince === 'string') { + yaml = yaml.replace(`stale_since: ${staleSince}`, `stale_since: "${staleSince}"`); + } + return `---\n${yaml}\n---\n\n${content.trim()}\n`; +} + +function existingPageSignals(page: HistoricSqlPatternPage): string[] { + return [...stringArray(page.frontmatter.tables), ...stringArray(page.frontmatter.fingerprints)]; +} + +function shouldArchive(staleSince: unknown, fetchedAt: string, days: number): boolean { + if (typeof staleSince !== 'string') return false; + const staleTime = Date.parse(staleSince); + const fetchedTime = Date.parse(fetchedAt); + if (!Number.isFinite(staleTime) || !Number.isFinite(fetchedTime)) return false; + return fetchedTime - staleTime > days * 24 * 60 * 60 * 1000; +} + +async function loadPatternPages(root: string): Promise { + const files = await walkFiles(root); + const pages: HistoricSqlPatternPage[] = []; + for (const file of files.filter((candidate) => candidate.endsWith('.md'))) { + const key = file.replace(/\.md$/, ''); + const path = join(root, file); + const page = parseMarkdownPage(key, path, await readFile(path, 'utf-8')); + if (page) { + pages.push(page); + } + } + return pages; +} + +async function currentStagedTables(rawDir: string): Promise> { + const tablesRoot = join(rawDir, 'tables'); + const files = await walkFiles(tablesRoot); + const tables = new Set(); + for (const file of files.filter((candidate) => candidate.endsWith('.json'))) { + const value = await readJson(join(tablesRoot, file)); + if (typeof value === 'object' && value !== null && 'table' in value && typeof value.table === 'string') { + tables.add(value.table); + } + } + return tables; +} + +export async function 
projectHistoricSqlEvidence(input: HistoricSqlProjectionInput): Promise { + const result: HistoricSqlProjectionResult = { + tableUsageMerged: 0, + staleTablesMarked: 0, + patternPagesWritten: 0, + stalePatternPagesMarked: 0, + archivedPatternPages: 0, + legacyPagesDeleted: 0, + touchedSources: [], + warnings: [], + }; + const touchedKeys = new Set(); + const rawDir = join(input.workdir, rawSourcesDirForSync(input.connectionId, 'historic-sql', input.syncId)); + const manifest = stagedManifestSchema.parse(await readJson(join(rawDir, 'manifest.json'))); + const currentTables = await currentStagedTables(rawDir); + const evidence = await loadEvidence(input.workdir, input.runId); + const tableEvidence = evidence.filter((entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'table_usage' } => entry.kind === 'table_usage'); + const patternEvidence = evidence.filter((entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'pattern' } => entry.kind === 'pattern'); + + const schemaRoot = join(input.workdir, 'semantic-layer', input.connectionId, '_schema'); + for (const file of (await walkFiles(schemaRoot)).filter((candidate) => candidate.endsWith('.yaml') || candidate.endsWith('.yml'))) { + const path = join(schemaRoot, file); + const before = await readFile(path, 'utf-8'); + const shard = (YAML.parse(before) ?? {}) as ManifestShard; + if (!shard.tables) continue; + for (const [tableName, entry] of Object.entries(shard.tables)) { + const tableRef = entry.table ?? tableName; + const matchingEvidence = tableEvidence.find( + (candidate) => candidate.table === tableRef || tableSourceName(candidate.table) === tableName, + ); + if (matchingEvidence) { + const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, matchingEvidence.usage); + if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? 
null)) { + entry.usage = merged as Record; + result.tableUsageMerged += 1; + const sourceName = tableSourceName(matchingEvidence.table); + const key = `${input.connectionId}:${sourceName}`; + if (!touchedKeys.has(key)) { + touchedKeys.add(key); + result.touchedSources.push({ connectionId: input.connectionId, sourceName }); + } + } + } else if (entry.usage && !currentTables.has(tableRef)) { + const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, staleUsage(manifest.fetchedAt)); + if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? null)) { + entry.usage = merged as Record; + result.staleTablesMarked += 1; + const sourceName = tableSourceName(tableRef); + const key = `${input.connectionId}:${sourceName}`; + if (!touchedKeys.has(key)) { + touchedKeys.add(key); + result.touchedSources.push({ connectionId: input.connectionId, sourceName }); + } + } + } + } + const after = YAML.stringify(shard, { indent: 2, lineWidth: 0 }); + if (after !== before) { + await writeYamlAtomic(path, shard); + } + } + + const wikiRoot = join(input.workdir, 'knowledge/global/historic-sql'); + await mkdir(wikiRoot, { recursive: true }); + const allPages = await loadPatternPages(wikiRoot); + const activePages = allPages.filter((page) => !isArchivedPatternPage(page)); + const patternPages = activePages.filter(isHistoricPatternPage); + const writtenKeys = new Set(); + + for (const pattern of patternEvidence) { + const incomingSignals = [...pattern.pattern.tablesInvolved, ...pattern.pattern.constituentTemplateIds]; + const reusable = patternPages.find((page) => overlapRatio(incomingSignals, existingPageSignals(page)) >= 0.6); + const key = reusable?.key ?? 
safeKnowledgeSlug(pattern.pattern.slug); + const pagePath = join(wikiRoot, `${key}.md`); + const frontmatter = { + summary: pattern.pattern.title, + tags: ['historic-sql', 'pattern'], + refs: [], + sl_refs: pattern.pattern.slRefs, + usage_mode: 'auto', + source: 'historic-sql', + tables: pattern.pattern.tablesInvolved, + representative_sql: pattern.pattern.definitionSql, + fingerprints: pattern.pattern.constituentTemplateIds, + }; + await mkdir(dirname(pagePath), { recursive: true }); + await writeFile(pagePath, renderMarkdownPage(frontmatter, renderPatternMarkdown(pattern)), 'utf-8'); + writtenKeys.add(key); + result.patternPagesWritten += 1; + } + + for (const page of patternPages) { + if (writtenKeys.has(page.key)) continue; + if (shouldArchive(page.frontmatter.stale_since, manifest.fetchedAt, manifest.staleArchiveAfterDays)) { + const archivePath = join(wikiRoot, '_archived', `${page.key}.md`); + const tags = [...new Set([...stringArray(page.frontmatter.tags), 'archived'])]; + await mkdir(dirname(archivePath), { recursive: true }); + await writeFile(archivePath, renderMarkdownPage({ ...page.frontmatter, tags }, page.content), 'utf-8'); + await rm(page.path, { force: true }); + result.archivedPatternPages += 1; + continue; + } + const tags = [...new Set([...stringArray(page.frontmatter.tags), 'stale'])]; + await writeFile( + page.path, + renderMarkdownPage({ ...page.frontmatter, tags, stale_since: manifest.fetchedAt }, page.content), + 'utf-8', + ); + result.stalePatternPagesMarked += 1; + } + + for (const page of allPages.filter(isLegacyQueryPage)) { + await rm(page.path, { force: true }); + result.legacyPagesDeleted += 1; + } + + return result; +} diff --git a/packages/context/src/ingest/adapters/historic-sql/redaction.test.ts b/packages/context/src/ingest/adapters/historic-sql/redaction.test.ts new file mode 100644 index 00000000..c8f1d78b --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/redaction.test.ts @@ -0,0 +1,36 @@ +import { 
describe, expect, it } from 'vitest'; +import { compileHistoricSqlRedactionPatterns, redactHistoricSqlText } from './redaction.js'; + +describe('historic-SQL redaction', () => { + it('redacts regex matches and supports the (?i) case-insensitive prefix', () => { + const redactors = compileHistoricSqlRedactionPatterns([ + 'sk_live_[A-Za-z0-9]+', + '(?i)secret_token_[a-z0-9]+', + ]); + + const sql = + "select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'"; + + expect(redactHistoricSqlText(sql, redactors)).toBe( + "select * from public.api_events where api_key = '[REDACTED]' and note = '[REDACTED]'", + ); + }); + + it('returns the original SQL text when no redaction patterns are configured', () => { + const sql = "select * from public.orders where status = 'paid'"; + + expect(redactHistoricSqlText(sql, compileHistoricSqlRedactionPatterns([]))).toBe(sql); + }); + + it('throws a config-focused error for invalid redaction regex patterns', () => { + expect(() => compileHistoricSqlRedactionPatterns(['[broken'])).toThrow( + 'Invalid historicSql.redactionPatterns entry "[broken"', + ); + }); + + it('throws a config-focused error for empty redaction regex patterns', () => { + expect(() => compileHistoricSqlRedactionPatterns([' '])).toThrow( + 'Invalid historicSql.redactionPatterns entry " "', + ); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/redaction.ts b/packages/context/src/ingest/adapters/historic-sql/redaction.ts new file mode 100644 index 00000000..a047b70f --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/redaction.ts @@ -0,0 +1,37 @@ +export interface HistoricSqlRedactionPattern { + pattern: string; + expression: RegExp; +} + +const CASE_INSENSITIVE_PREFIX = '(?i)'; +const REDACTION_TOKEN = '[REDACTED]'; + +export function compileHistoricSqlRedactionPatterns(patterns: readonly string[]): HistoricSqlRedactionPattern[] { + return patterns.map((pattern) => { + const trimmed = 
pattern.trim(); + const caseInsensitive = trimmed.startsWith(CASE_INSENSITIVE_PREFIX); + const source = caseInsensitive ? trimmed.slice(CASE_INSENSITIVE_PREFIX.length) : trimmed; + if (source.length === 0) { + throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": pattern must not be empty`); + } + + try { + return { + pattern, + expression: new RegExp(source, caseInsensitive ? 'gi' : 'g'), + }; + } catch (error) { + const reason = error instanceof Error ? error.message : String(error); + throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": ${reason}`); + } + }); +} + +export function redactHistoricSqlText(text: string, redactors: readonly HistoricSqlRedactionPattern[]): string { + let next = text; + for (const redactor of redactors) { + redactor.expression.lastIndex = 0; + next = next.replace(redactor.expression, REDACTION_TOKEN); + } + return next; +} diff --git a/packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts b/packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts new file mode 100644 index 00000000..b384c0c0 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/skill-schemas.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, it } from 'vitest'; +import { z } from 'zod'; +import { + patternOutputSchema, + patternsArraySchema, + tableUsageOutputSchema, +} from './skill-schemas.js'; + +describe('historic-sql skill schemas', () => { + it('accepts table usage output and preserves future keys', () => { + const parsed = tableUsageOutputSchema.parse({ + narrative: 'Orders are queried for paid/refunded lifecycle analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + analystNote: 'preserve me', + }); + + expect(parsed).toMatchObject({ + narrative: 'Orders are queried for paid/refunded lifecycle analysis.', + frequencyTier: 
'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: null, + analystNote: 'preserve me', + }); + }); + + it('rejects invalid frequency tiers', () => { + const result = tableUsageOutputSchema.safeParse({ + narrative: 'Orders are queried often.', + frequencyTier: 'sometimes', + commonFilters: [], + commonJoins: [], + }); + + expect(result.success).toBe(false); + }); + + it('accepts pattern outputs used for wiki projection', () => { + const parsed = patternsArraySchema.parse([ + { + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Teams inspect order status by customer and month.', + definitionSql: 'select status, count(*) from public.orders group by status', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['template_1', 'template_2'], + }, + ]); + + expect(parsed[0]).toEqual({ + slug: 'order-lifecycle-analysis', + title: 'Order Lifecycle Analysis', + narrative: 'Teams inspect order status by customer and month.', + definitionSql: 'select status, count(*) from public.orders group by status', + tablesInvolved: ['public.orders', 'public.customers'], + slRefs: ['orders', 'customers'], + constituentTemplateIds: ['template_1', 'template_2'], + }); + }); + + it('exports zod schemas that can produce JSON schema for prompt prefixes', () => { + const tableUsageJsonSchema = z.toJSONSchema(tableUsageOutputSchema); + const patternJsonSchema = z.toJSONSchema(patternOutputSchema); + + expect(tableUsageJsonSchema).toMatchObject({ type: 'object' }); + expect(patternJsonSchema).toMatchObject({ type: 'object' }); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts b/packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts new file mode 100644 index 00000000..340cd5b1 --- /dev/null +++ 
b/packages/context/src/ingest/adapters/historic-sql/skill-schemas.ts @@ -0,0 +1,31 @@ +import { z } from 'zod'; + +export const tableUsageOutputSchema = z + .object({ + narrative: z.string(), + frequencyTier: z.enum(['high', 'mid', 'low', 'unused']), + commonFilters: z.array(z.string()), + commonGroupBys: z.array(z.string()).optional(), + commonJoins: z.array( + z.object({ + table: z.string(), + on: z.array(z.string()), + }), + ), + staleSince: z.iso.datetime().nullable().optional(), + }) + .passthrough(); +export type TableUsageOutput = z.infer; + +export const patternOutputSchema = z.object({ + slug: z.string(), + title: z.string(), + narrative: z.string(), + definitionSql: z.string(), + tablesInvolved: z.array(z.string()), + slRefs: z.array(z.string()), + constituentTemplateIds: z.array(z.string()), +}); +export type PatternOutput = z.infer; + +export const patternsArraySchema = z.array(patternOutputSchema); diff --git a/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts b/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts index d8253df9..a3288223 100644 --- a/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts @@ -33,7 +33,7 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => { const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]); const reader = new SnowflakeHistoricSqlQueryHistoryReader(); - await expect(reader.probe(client)).resolves.toBeUndefined(); + await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] }); expect(client.executeQuery).toHaveBeenCalledWith( 'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1', @@ -62,130 +62,85 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => { await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError); }); - 
it('fetches query-history rows with cursor and maps them into RawQueryRow shape', async () => { + it('fetches aggregated Snowflake query templates', async () => { const client = queryClient([ { headers: [ - 'QUERY_ID', - 'QUERY_TEXT', - 'USER_NAME', - 'ROLE_NAME', - 'WAREHOUSE_NAME', - 'DATABASE_NAME', - 'SCHEMA_NAME', - 'START_TIME', - 'END_TIME', - 'TOTAL_ELAPSED_TIME', - 'ROWS_PRODUCED', - 'EXECUTION_STATUS', - 'ERROR_CODE', - 'ERROR_MESSAGE', + 'template_id', + 'canonical_sql', + 'executions', + 'distinct_users', + 'first_seen', + 'last_seen', + 'p50_ms', + 'p95_ms', + 'error_rate', + 'rows_produced', + 'top_users', ], rows: [ [ - '01a', - "SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'", - 'ANALYST_A', - 'ANALYST_ROLE', - 'WH_XS', - 'ANALYTICS', - 'PUBLIC', - '2026-05-04T10:00:00.000Z', - '2026-05-04T10:00:01.250Z', - 1250, + 'hash-1', + 'select status from orders', + 42, + 3, + '2026-05-01T00:00:00.000Z', + '2026-05-11T00:00:00.000Z', 12, - 'SUCCESS', - null, - null, - ], - [ - '01b', - 'SELECT * FROM MISSING_TABLE', - 'ANALYST_B', - 'ANALYST_ROLE', - 'WH_XS', - 'ANALYTICS', - 'PUBLIC', - new Date('2026-05-04T10:05:00.000Z'), - null, - null, - null, - 'FAILED_WITH_ERROR', - '002003', - 'SQL compilation error', + 40, + 0.05, + 100, + JSON.stringify([{ user: 'ANALYST', executions: 1 }]), ], ], - totalRows: 2, + totalRows: 1, }, ]); const reader = new SnowflakeHistoricSqlQueryHistoryReader(); const rows = []; - for await (const row of reader.fetch( + for await (const row of reader.fetchAggregated( client, - { - start: new Date('2026-05-01T00:00:00.000Z'), - end: new Date('2026-05-04T12:00:00.000Z'), - }, - '2026-05-03T00:00:00.000Z', + { start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') }, + { dialect: 'snowflake', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, )) { rows.push(row); } - 
expect(client.executeQuery).toHaveBeenCalledTimes(1); const sql = firstQuery(client); - expect(sql).toContain('FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY'); - expect(sql).toContain("START_TIME >= '2026-05-03T00:00:00.000Z'::TIMESTAMP_TZ"); - expect(sql).toContain("START_TIME < '2026-05-04T12:00:00.000Z'::TIMESTAMP_TZ"); - expect(sql).toContain('ORDER BY START_TIME ASC, QUERY_ID ASC'); - expect(sql).toContain('ROWS_PRODUCED'); - - expect(rows).toEqual([ + expect(sql).toContain('SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY'); + expect(sql).toContain('COUNT(*) AS executions'); + expect(sql).toContain('GROUP BY query_hash'); + expect(sql).toContain('HAVING COUNT(*) >= 5'); + expect(rows).toMatchObject([ { - id: '01a', - sql: "SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'", - user: 'ANALYST_A', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: '2026-05-04T10:00:01.250Z', - runtimeMs: 1250, - rowsProduced: 12, - success: true, - errorMessage: null, - }, - { - id: '01b', - sql: 'SELECT * FROM MISSING_TABLE', - user: 'ANALYST_B', - startedAt: '2026-05-04T10:05:00.000Z', - endedAt: null, - runtimeMs: null, - rowsProduced: null, - success: false, - errorMessage: '002003: SQL compilation error', + templateId: 'hash-1', + stats: { + executions: 42, + errorRate: 0.05, + }, + topUsers: [{ user: 'ANALYST', executions: 1 }], }, ]); }); - it('uses the window start when no cursor is available', async () => { - const client = queryClient([{ headers: ['QUERY_ID'], rows: [], totalRows: 0 }]); - const reader = new SnowflakeHistoricSqlQueryHistoryReader(); - - for await (const _row of reader.fetch(client, { - start: new Date('2026-02-03T12:00:00.000Z'), - end: new Date('2026-05-04T12:00:00.000Z'), - })) { - throw new Error('empty result should not yield rows'); - } - - const sql = firstQuery(client); - expect(sql).toContain("START_TIME >= '2026-02-03T12:00:00.000Z'::TIMESTAMP_TZ"); - }); - it('throws a clear error when the query client cannot execute SQL', async () => { const 
reader = new SnowflakeHistoricSqlQueryHistoryReader(); await expect(async () => { - for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) { + for await (const _row of reader.fetchAggregated( + {}, + { start: new Date(), end: new Date() }, + { + dialect: 'snowflake', + minExecutions: 5, + windowDays: 90, + concurrency: 12, + filters: { dropTrivialProbes: true }, + redactionPatterns: [], + staleArchiveAfterDays: 90, + }, + )) { throw new Error('unreachable'); } }).rejects.toThrow('Historic SQL Snowflake reader requires a query client with executeQuery(query)'); diff --git a/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts b/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts index b149a34b..539df3c3 100644 --- a/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts +++ b/packages/context/src/ingest/adapters/historic-sql/snowflake-query-history-reader.ts @@ -1,5 +1,10 @@ import { HistoricSqlGrantsMissingError } from './errors.js'; -import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js'; +import { + aggregatedTemplateSchema, + type AggregatedTemplate, + type HistoricSqlTimeWindow, + type HistoricSqlUnifiedPullConfig, +} from './types.js'; interface QueryResultLike { headers: string[]; @@ -52,32 +57,6 @@ function timestampLiteral(value: Date | string): string { return `'${date.toISOString().replace(/'/g, "''")}'::TIMESTAMP_TZ`; } -function queryHistorySql(window: HistoricSqlTimeWindow, cursor?: string | null): string { - const start = timestampLiteral(cursor ?? 
window.start); - const end = timestampLiteral(window.end); - return ` -SELECT - QUERY_ID, - QUERY_TEXT, - USER_NAME, - ROLE_NAME, - WAREHOUSE_NAME, - DATABASE_NAME, - SCHEMA_NAME, - START_TIME, - END_TIME, - TOTAL_ELAPSED_TIME, - ROWS_PRODUCED, - EXECUTION_STATUS, - ERROR_CODE, - ERROR_MESSAGE -FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY -WHERE START_TIME >= ${start} - AND START_TIME < ${end} - AND QUERY_TEXT IS NOT NULL -ORDER BY START_TIME ASC, QUERY_ID ASC`.trim(); -} - function indexByHeader(headers: string[]): Map { const out = new Map(); headers.forEach((header, index) => { @@ -87,7 +66,7 @@ function indexByHeader(headers: string[]): Map { } function value(row: unknown[], indexes: Map, name: string): unknown { - const index = indexes.get(name); + const index = indexes.get(name.toUpperCase()); return index === undefined ? null : row[index]; } @@ -118,6 +97,18 @@ function nullableNumber(raw: unknown): number | null { return number; } +function requiredNumber(raw: unknown, field: string): number { + const number = nullableNumber(raw); + if (number === null) { + throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: ${String(raw)}`); + } + return number; +} + +function requiredInteger(raw: unknown, field: string): number { + return Math.trunc(requiredNumber(raw, field)); +} + function nullableInteger(raw: unknown): number | null { const number = nullableNumber(raw); return number === null ? 
null : Math.trunc(number); @@ -135,46 +126,50 @@ function isoTimestamp(raw: unknown, field: string): string { return date.toISOString(); } -function nullableIsoTimestamp(raw: unknown): string | null { - if (raw === null || raw === undefined || raw === '') { - return null; +function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> { + const text = nullableString(raw); + if (!text) { + return []; } - return isoTimestamp(raw, 'END_TIME'); -} - -function executionSucceeded(status: string | null, errorCode: string | null, errorMessage: string | null): boolean { - if (errorCode || errorMessage) { - return false; + try { + const parsed = JSON.parse(text) as unknown; + if (!Array.isArray(parsed)) { + return []; + } + return parsed.flatMap((entry) => { + if (!entry || typeof entry !== 'object') { + return []; + } + const user = nullableString((entry as { user?: unknown }).user); + const executions = nullableInteger((entry as { executions?: unknown }).executions); + return executions === null ? [] : [{ user, executions }]; + }); + } catch { + return []; } - return status === null || status.toUpperCase().startsWith('SUCCESS'); } -function combinedErrorMessage(errorCode: string | null, errorMessage: string | null): string | null { - if (errorCode && errorMessage) { - return `${errorCode}: ${errorMessage}`; - } - return errorMessage ?? 
errorCode; +function mapAggregatedRow(row: unknown[], indexes: Map): AggregatedTemplate { + return aggregatedTemplateSchema.parse({ + templateId: requiredString(value(row, indexes, 'template_id'), 'template_id'), + canonicalSql: requiredString(value(row, indexes, 'canonical_sql'), 'canonical_sql'), + dialect: 'snowflake', + stats: { + executions: requiredInteger(value(row, indexes, 'executions'), 'executions'), + distinctUsers: requiredInteger(value(row, indexes, 'distinct_users'), 'distinct_users'), + firstSeen: isoTimestamp(value(row, indexes, 'first_seen'), 'first_seen'), + lastSeen: isoTimestamp(value(row, indexes, 'last_seen'), 'last_seen'), + p50RuntimeMs: nullableNumber(value(row, indexes, 'p50_ms')), + p95RuntimeMs: nullableNumber(value(row, indexes, 'p95_ms')), + errorRate: requiredNumber(value(row, indexes, 'error_rate'), 'error_rate'), + rowsProduced: nullableInteger(value(row, indexes, 'rows_produced')), + }, + topUsers: parseTopUsers(value(row, indexes, 'top_users')), + }); } -function mapRow(row: unknown[], indexes: Map): HistoricSqlRawQueryRow { - const errorCode = nullableString(value(row, indexes, 'ERROR_CODE')); - const errorMessage = nullableString(value(row, indexes, 'ERROR_MESSAGE')); - const rowsProduced = nullableInteger(value(row, indexes, 'ROWS_PRODUCED')); - return { - id: requiredString(value(row, indexes, 'QUERY_ID'), 'QUERY_ID'), - sql: requiredString(value(row, indexes, 'QUERY_TEXT'), 'QUERY_TEXT'), - user: nullableString(value(row, indexes, 'USER_NAME')), - startedAt: isoTimestamp(value(row, indexes, 'START_TIME'), 'START_TIME'), - endedAt: nullableIsoTimestamp(value(row, indexes, 'END_TIME')), - runtimeMs: nullableNumber(value(row, indexes, 'TOTAL_ELAPSED_TIME')), - rowsProduced, - success: executionSucceeded(nullableString(value(row, indexes, 'EXECUTION_STATUS')), errorCode, errorMessage), - errorMessage: combinedErrorMessage(errorCode, errorMessage), - }; -} - -export class SnowflakeHistoricSqlQueryHistoryReader implements 
HistoricSqlQueryHistoryReader { - async probe(client: unknown): Promise { +export class SnowflakeHistoricSqlQueryHistoryReader { + async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> { let result: QueryResultLike; try { result = await queryClient(client).executeQuery(PROBE_SQL); @@ -184,20 +179,42 @@ export class SnowflakeHistoricSqlQueryHistoryReader implements HistoricSqlQueryH if (result.error) { throw grantsError(result.error); } + return { warnings: [], info: [] }; } - async *fetch( + async *fetchAggregated( client: unknown, window: HistoricSqlTimeWindow, - cursor?: string | null, - ): AsyncIterable { - const result = await queryClient(client).executeQuery(queryHistorySql(window, cursor)); + config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable { + const sql = ` +SELECT + query_hash AS template_id, + MIN(query_text) AS canonical_sql, + COUNT(*) AS executions, + COUNT(DISTINCT user_name) AS distinct_users, + MIN(start_time) AS first_seen, + MAX(start_time) AS last_seen, + APPROX_PERCENTILE(total_elapsed_time, 0.50) AS p50_ms, + APPROX_PERCENTILE(total_elapsed_time, 0.95) AS p95_ms, + DIV0(COUNT_IF(execution_status != 'SUCCESS'), COUNT(*)) AS error_rate, + SUM(rows_produced) AS rows_produced, + ARRAY_AGG(OBJECT_CONSTRUCT('user', user_name, 'executions', 1)) WITHIN GROUP (ORDER BY start_time DESC)::string AS top_users +FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY +WHERE query_text IS NOT NULL + AND query_type IN ('SELECT', 'MERGE') + AND start_time >= ${timestampLiteral(window.start)} + AND start_time < ${timestampLiteral(window.end)} +GROUP BY query_hash +HAVING COUNT(*) >= ${config.minExecutions} +ORDER BY executions DESC`.trim(); + const result = await queryClient(client).executeQuery(sql); if (result.error) { throw grantsError(result.error); } const indexes = indexByHeader(result.headers); for (const row of result.rows) { - yield mapRow(row, indexes); + yield mapAggregatedRow(row, indexes); } } } diff --git 
a/packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts b/packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts deleted file mode 100644 index 98a907fd..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/stage-pgss-golden.test.ts +++ /dev/null @@ -1,152 +0,0 @@ -import { mkdir, mkdtemp, readdir, readFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; -import { dirname, join, relative } from 'node:path'; -import { describe, expect, it } from 'vitest'; -import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; -import { stagePgStatStatementsTemplates, writePgssBaselineAtomic, type PgssBaseline } from './stage-pgss.js'; -import type { HistoricSqlPullConfig, KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js'; - -const FIXTURE_ROOT = join(__dirname, '__fixtures__/postgres'); - -interface GoldenFixture { - name: string; - now: string; - connectionId: string; - probe: { - pgServerVersion: string; - warnings: string[]; - }; - snapshot: { - statsResetAt: string | null; - deallocCount: number | null; - rows: PostgresPgssRow[]; - }; - pullConfig: HistoricSqlPullConfig & { dialect: 'postgres' }; - analysisBySql: Record< - string, - { - fingerprint: string; - normalizedSql: string; - tablesTouched: string[]; - literalSlots: []; - error?: string; - } - >; - baseline: PgssBaseline | null; - expectedBaseline: PgssBaseline; - expectedFiles: Record; -} - -async function readFixture(name: string): Promise { - return JSON.parse(await readFile(join(FIXTURE_ROOT, name, 'input.json'), 'utf-8')) as GoldenFixture; -} - -async function tempDir(prefix: string): Promise { - return mkdtemp(join(tmpdir(), prefix)); -} - -function fakePgClient(): KtxPostgresQueryClient { - return { - async executeQuery() { - return { headers: [], rows: [] }; - }, - }; -} - -function fixtureReader(fixture: GoldenFixture): PostgresPgssReader { - return { - async probe() { - return fixture.probe; - }, 
- async readSnapshot(_client, options) { - return { - statsResetAt: fixture.snapshot.statsResetAt, - deallocCount: fixture.snapshot.deallocCount, - rows: fixture.snapshot.rows.slice(0, options.maxTemplates), - }; - }, - }; -} - -function fixtureSqlAnalysis(fixture: GoldenFixture): SqlAnalysisPort { - return { - async analyzeForFingerprint(sql) { - const result = fixture.analysisBySql[sql]; - if (!result) { - return { - fingerprint: '', - normalizedSql: '', - tablesTouched: [], - literalSlots: [], - error: `missing fixture analysis for ${sql}`, - }; - } - return result; - }, - }; -} - -async function writeFixtureBaseline(path: string, baseline: PgssBaseline | null): Promise { - if (!baseline) { - return; - } - await writePgssBaselineAtomic(path, baseline); -} - -async function listFiles(root: string, current = root): Promise { - const entries = await readdir(current, { withFileTypes: true }); - const files: string[] = []; - for (const entry of entries) { - const fullPath = join(current, entry.name); - if (entry.isDirectory()) { - files.push(...(await listFiles(root, fullPath))); - } else { - files.push(relative(root, fullPath)); - } - } - return files; -} - -async function expectGoldenFiles(stagedDir: string, expectedFiles: GoldenFixture['expectedFiles']): Promise { - const actualFiles = await listFiles(stagedDir); - const expectedPaths = Object.keys(expectedFiles).sort(); - expect(actualFiles.sort()).toEqual(expectedPaths); - - for (const path of expectedPaths) { - const expected = expectedFiles[path]; - const actual = await readFile(join(stagedDir, path), 'utf-8'); - if ('json' in expected) { - expect(JSON.parse(actual)).toEqual(expected.json); - } else { - expect(actual).toBe(expected.text); - } - } -} - -describe('stagePgStatStatementsTemplates golden fixtures', () => { - it.each(['first-run', 'normal-delta', 'reset-detected', 'version-change', 'eviction-churn'] as const)( - 'matches the committed %s golden output', - async (fixtureName) => { - const fixture = 
await readFixture(fixtureName); - const root = await tempDir(`pgss-golden-${fixtureName}-`); - const stagedDir = join(root, 'staged'); - const baselinePath = join(root, 'cache', fixture.connectionId, 'pgss-baseline.json'); - await mkdir(dirname(baselinePath), { recursive: true }); - await writeFixtureBaseline(baselinePath, fixture.baseline); - - const result = await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: fixture.connectionId, - queryClient: fakePgClient(), - reader: fixtureReader(fixture), - sqlAnalysis: fixtureSqlAnalysis(fixture), - pullConfig: fixture.pullConfig, - baselinePath, - now: new Date(fixture.now), - }); - - await expectGoldenFiles(stagedDir, fixture.expectedFiles); - expect(result.baseline).toEqual(fixture.expectedBaseline); - }, - ); -}); diff --git a/packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts b/packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts deleted file mode 100644 index 901a0ae2..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/stage-pgss.test.ts +++ /dev/null @@ -1,652 +0,0 @@ -import { mkdtemp, readFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; -import { join } from 'node:path'; -import { describe, expect, it, vi } from 'vitest'; -import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; -import { - pgssBaselinePath, - readPgssBaseline, - stagePgStatStatementsTemplates, - writePgssBaselineAtomic, - type PgssBaseline, -} from './stage-pgss.js'; -import { historicSqlManifestSchema, historicSqlMetadataSchema, historicSqlUsageSchema } from './types.js'; -import type { KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js'; - -async function tempDir(prefix: string): Promise { - return mkdtemp(join(tmpdir(), prefix)); -} - -async function readJson(root: string, relPath: string): Promise { - return JSON.parse(await readFile(join(root, relPath), 'utf-8')) as T; -} - -function fakePgClient(): 
KtxPostgresQueryClient { - return { - async executeQuery() { - return { headers: [], rows: [] }; - }, - }; -} - -function row(overrides: Partial & Pick): PostgresPgssRow { - return { - userid: '11', - username: 'analyst', - dbid: '5', - database: 'warehouse', - calls: 10, - totalExecTime: 250, - meanExecTime: 25, - totalRows: 20, - ...overrides, - }; -} - -function fakeReader(input: { - pgServerVersion?: string; - warnings?: string[]; - statsResetAt?: string | null; - deallocCount?: number | null; - rows: PostgresPgssRow[]; -}): PostgresPgssReader { - return { - probe: vi.fn(async () => ({ - pgServerVersion: input.pgServerVersion ?? 'PostgreSQL 16.4', - warnings: input.warnings ?? [], - })), - readSnapshot: vi.fn(async (_client, options) => ({ - statsResetAt: input.statsResetAt ?? '2026-05-08T08:00:00.000Z', - deallocCount: input.deallocCount ?? 0, - rows: input.rows.slice(0, options.maxTemplates), - })), - }; -} - -const sqlAnalysis: SqlAnalysisPort = { - async analyzeForFingerprint(sql) { - if (sql.includes('broken')) { - return { - fingerprint: '', - normalizedSql: '', - tablesTouched: [], - literalSlots: [], - error: 'parse failed', - }; - } - if (sql.includes('customers')) { - return { - fingerprint: 'fp_customers', - normalizedSql: 'SELECT count(*) FROM analytics.customers', - tablesTouched: ['analytics.customers'], - literalSlots: [], - }; - } - return { - fingerprint: 'fp_orders', - normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - tablesTouched: ['analytics.orders'], - literalSlots: [], - }; - }, -}; - -function postgresPullConfig(maxTemplatesPerRun = 5000) { - return { - dialect: 'postgres' as const, - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: ['^svc_'], - redactionPatterns: ['secret'], - maxTemplatesPerRun, - minCalls: 5, - }; -} - -describe('stagePgStatStatementsTemplates', () => { - it('stages first-run PGSS templates as degraded aggregate templates and builds a next baseline', async () => { - 
const stagedDir = await tempDir('pgss-stage-first-'); - const baselineRootDir = await tempDir('pgss-baseline-first-'); - const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg'); - - const result = await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: 'conn_pg', - queryClient: fakePgClient(), - reader: fakeReader({ - warnings: ['pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'], - deallocCount: 2, - rows: [ - row({ - queryid: '101', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 10, - totalExecTime: 250, - totalRows: 20, - }), - row({ - queryid: '102', - query: 'SELECT * FROM pg_catalog.pg_class', - calls: 50, - totalExecTime: 500, - }), - row({ - queryid: '103', - query: 'BEGIN', - calls: 75, - totalExecTime: 75, - }), - row({ - queryid: '104', - query: 'SELECT broken FROM analytics.orders', - calls: 8, - totalExecTime: 80, - }), - ], - }), - sqlAnalysis, - pullConfig: postgresPullConfig(), - baselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - expect(manifest).toMatchObject({ - source: 'historic-sql', - connectionId: 'conn_pg', - dialect: 'postgres', - fetchedAt: '2026-05-08T12:00:00.000Z', - windowEnd: '2026-05-08T12:00:00.000Z', - nextSuccessfulCursor: '2026-05-08T12:00:00.000Z', - templateCount: 1, - capped: false, - degraded: true, - statsResetAt: '2026-05-08T08:00:00.000Z', - baselineFirstRun: true, - pgServerVersion: 'PostgreSQL 16.4', - deallocCount: 2, - }); - expect(manifest.warnings).toEqual([ - 'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config', - 'pgss_dealloc_count:2; pg_stat_statements.max may be too low, causing template eviction churn', - 'baseline_first_run:no_previous_pgss_baseline', - 'analysis_failed:db5_q104', - ]); - expect(manifest.templates).toEqual([ - { - id: 'db5_q101', - 
fingerprint: 'fp_orders', - subClusterId: null, - path: 'templates/db5_q101/page.md', - }, - ]); - - const metadata = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q101/metadata.json')); - expect(metadata).toMatchObject({ - id: 'db5_q101', - title: 'postgres · analytics.orders [db5_q101]', - path: 'templates/db5_q101/page.md', - objectType: 'historic_sql_template', - lastEditedAt: null, - properties: { - fingerprint: 'fp_orders', - sub_cluster_id: null, - dialect: 'postgres', - tables_touched: ['analytics.orders'], - literal_slots: [], - }, - }); - expect(metadata.properties.triage_signals).toEqual({ - executions_bucket: 'mid', - distinct_users_bucket: 'solo', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - runtime_bucket: 'fast', - }); - - const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q101/usage.json')); - expect(usage).toEqual({ - stats: { - executions: 10, - distinct_users: 1, - first_seen: '2026-05-08T12:00:00.000Z', - last_seen: '2026-05-08T12:00:00.000Z', - p50_runtime_ms: null, - p95_runtime_ms: null, - mean_runtime_ms: 25, - error_rate: 0, - rows_produced: 20, - }, - literal_slots: [], - samples: [], - }); - - expect(await readFile(join(stagedDir, 'templates/db5_q101/page.md'), 'utf-8')).toContain( - 'SELECT count(*) FROM analytics.orders WHERE status = $1', - ); - expect(result.baselinePath).toBe(baselinePath); - expect(result.baseline.templates.db5_q101.perUser['11']).toEqual({ - calls: 10, - totalExecTime: 250, - totalRows: 20, - }); - await expect(readPgssBaseline(baselinePath)).resolves.toBeNull(); - }); - - it('warns when pg_stat_statements reports dealloc churn', async () => { - const root = await tempDir('pgss-churn-'); - const stagedDir = join(root, 'staged'); - const baselinePath = join(root, 'cache', 'warehouse', 'pgss-baseline.json'); - - await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: 'warehouse', - queryClient: 
fakePgClient(), - reader: fakeReader({ - rows: [ - row({ - queryid: '901', - query: 'SELECT COUNT(*) FROM public.orders WHERE status = $1', - calls: 20, - totalExecTime: 500, - meanExecTime: 25, - }), - ], - deallocCount: 3, - }), - sqlAnalysis, - pullConfig: postgresPullConfig(50), - baselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const manifest = await readJson<{ warnings: string[]; deallocCount: number }>(stagedDir, 'manifest.json'); - expect(manifest.deallocCount).toBe(3); - expect(manifest.warnings).toContain( - 'pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn', - ); - }); - - it('uses the saved cumulative baseline to stage only positive deltas on later runs', async () => { - const stagedDir = await tempDir('pgss-stage-delta-'); - const baselineRootDir = await tempDir('pgss-baseline-delta-'); - const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg'); - const baseline: PgssBaseline = { - version: 1, - fetchedAt: '2026-05-08T10:00:00.000Z', - statsResetAt: '2026-05-08T08:00:00.000Z', - pgServerVersion: 'PostgreSQL 16.4', - templates: { - db5_q201: { - firstObservedAt: '2026-05-08T09:00:00.000Z', - perUser: { - '11': { calls: 10, totalExecTime: 100, totalRows: 50 }, - '12': { calls: 5, totalExecTime: 50, totalRows: 25 }, - }, - }, - }, - }; - await writePgssBaselineAtomic(baselinePath, baseline); - - await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: 'conn_pg', - queryClient: fakePgClient(), - reader: fakeReader({ - rows: [ - row({ - queryid: '201', - userid: '11', - username: 'analyst', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 12, - totalExecTime: 160, - totalRows: 58, - }), - row({ - queryid: '201', - userid: '12', - username: 'svc_loader', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 5, - totalExecTime: 50, - totalRows: 25, - }), - row({ - queryid: '202', - userid: '13', - username: 'analyst_2', - 
query: 'SELECT count(*) FROM analytics.customers', - calls: 7, - totalExecTime: 210, - totalRows: 7, - }), - ], - }), - sqlAnalysis, - pullConfig: postgresPullConfig(), - baselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - expect(manifest.baselineFirstRun).toBe(false); - expect(manifest.windowStart).toBe('2026-05-08T10:00:00.000Z'); - expect(manifest.templateCount).toBe(2); - expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q202', 'db5_q201']); - - const usage201 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q201/usage.json')); - expect(usage201.stats).toMatchObject({ - executions: 2, - distinct_users: 1, - first_seen: '2026-05-08T09:00:00.000Z', - last_seen: '2026-05-08T12:00:00.000Z', - mean_runtime_ms: 30, - rows_produced: 8, - }); - const metadata201 = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q201/metadata.json')); - expect(metadata201.properties.triage_signals.service_account_only).toBe('false'); - - const usage202 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q202/usage.json')); - expect(usage202.stats).toMatchObject({ - executions: 7, - distinct_users: 1, - first_seen: '2026-05-08T12:00:00.000Z', - mean_runtime_ms: 30, - rows_produced: 7, - }); - }); - - it('keeps matching queryid values from different databases as distinct templates and baseline entries', async () => { - const stagedDir = await tempDir('pgss-stage-db-key-'); - const baselineRootDir = await tempDir('pgss-baseline-db-key-'); - const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg'); - await writePgssBaselineAtomic(baselinePath, { - version: 1, - fetchedAt: '2026-05-08T10:00:00.000Z', - statsResetAt: '2026-05-08T08:00:00.000Z', - pgServerVersion: 'PostgreSQL 16.4', - templates: { - db5_q701: { - firstObservedAt: '2026-05-08T09:00:00.000Z', - perUser: { - '11': { 
calls: 10, totalExecTime: 100, totalRows: 50 }, - }, - }, - db6_q701: { - firstObservedAt: '2026-05-08T09:30:00.000Z', - perUser: { - '11': { calls: 4, totalExecTime: 40, totalRows: 20 }, - }, - }, - }, - }); - - const result = await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: 'conn_pg', - queryClient: fakePgClient(), - reader: fakeReader({ - rows: [ - row({ - queryid: '701', - dbid: '5', - database: 'warehouse', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 12, - totalExecTime: 160, - totalRows: 58, - }), - row({ - queryid: '701', - dbid: '6', - database: 'app', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 9, - totalExecTime: 130, - totalRows: 35, - }), - ], - }), - sqlAnalysis, - pullConfig: postgresPullConfig(), - baselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - expect(manifest.templates.map((template) => template.id).sort()).toEqual(['db5_q701', 'db6_q701']); - - const warehouseUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q701/usage.json')); - expect(warehouseUsage.stats).toMatchObject({ - executions: 2, - rows_produced: 8, - first_seen: '2026-05-08T09:00:00.000Z', - }); - - const appUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db6_q701/usage.json')); - expect(appUsage.stats).toMatchObject({ - executions: 5, - rows_produced: 15, - first_seen: '2026-05-08T09:30:00.000Z', - }); - - expect(result.baseline.templates.db5_q701.perUser['11']).toEqual({ - calls: 12, - totalExecTime: 160, - totalRows: 58, - }); - expect(result.baseline.templates.db6_q701.perUser['11']).toEqual({ - calls: 9, - totalExecTime: 130, - totalRows: 35, - }); - }); - - it('treats stats_reset advancement and major-version changes as fresh baselines', async () => { - const resetStagedDir = await tempDir('pgss-stage-reset-'); - const 
resetBaselineRootDir = await tempDir('pgss-baseline-reset-'); - const resetBaselinePath = pgssBaselinePath(resetBaselineRootDir, 'conn_pg'); - await writePgssBaselineAtomic(resetBaselinePath, { - version: 1, - fetchedAt: '2026-05-08T10:00:00.000Z', - statsResetAt: '2026-05-08T08:00:00.000Z', - pgServerVersion: 'PostgreSQL 16.4', - templates: { - db5_q301: { - firstObservedAt: '2026-05-08T09:00:00.000Z', - perUser: { - '11': { calls: 100, totalExecTime: 1000, totalRows: 500 }, - }, - }, - }, - }); - - await stagePgStatStatementsTemplates({ - stagedDir: resetStagedDir, - connectionId: 'conn_pg', - queryClient: fakePgClient(), - reader: fakeReader({ - statsResetAt: '2026-05-08T11:00:00.000Z', - rows: [ - row({ - queryid: '301', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 3, - totalExecTime: 90, - totalRows: 9, - }), - ], - }), - sqlAnalysis, - pullConfig: postgresPullConfig(), - baselinePath: resetBaselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const resetManifest = historicSqlManifestSchema.parse(await readJson(resetStagedDir, 'manifest.json')); - expect(resetManifest.baselineFirstRun).toBe(true); - expect(resetManifest.warnings).toContain( - 'baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z', - ); - const resetUsage = historicSqlUsageSchema.parse(await readJson(resetStagedDir, 'templates/db5_q301/usage.json')); - expect(resetUsage.stats.executions).toBe(3); - - const versionStagedDir = await tempDir('pgss-stage-version-'); - const versionBaselineRootDir = await tempDir('pgss-baseline-version-'); - const versionBaselinePath = pgssBaselinePath(versionBaselineRootDir, 'conn_pg'); - await writePgssBaselineAtomic(versionBaselinePath, { - version: 1, - fetchedAt: '2026-05-08T10:00:00.000Z', - statsResetAt: '2026-05-08T08:00:00.000Z', - pgServerVersion: 'PostgreSQL 15.7', - templates: { - db5_q302: { - firstObservedAt: '2026-05-08T09:00:00.000Z', - perUser: { - '11': { calls: 
100, totalExecTime: 1000, totalRows: 500 }, - }, - }, - }, - }); - - await stagePgStatStatementsTemplates({ - stagedDir: versionStagedDir, - connectionId: 'conn_pg', - queryClient: fakePgClient(), - reader: fakeReader({ - pgServerVersion: 'PostgreSQL 16.4', - rows: [ - row({ - queryid: '302', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 4, - totalExecTime: 80, - totalRows: 8, - }), - ], - }), - sqlAnalysis, - pullConfig: postgresPullConfig(), - baselinePath: versionBaselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const versionManifest = historicSqlManifestSchema.parse(await readJson(versionStagedDir, 'manifest.json')); - expect(versionManifest.baselineFirstRun).toBe(true); - expect(versionManifest.warnings).toContain('baseline_reset:pg_server_major changed from 15 to 16'); - }); - - it('handles scoped counter regressions without forcing a global first-run baseline', async () => { - const stagedDir = await tempDir('pgss-stage-scoped-'); - const baselineRootDir = await tempDir('pgss-baseline-scoped-'); - const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg'); - await writePgssBaselineAtomic(baselinePath, { - version: 1, - fetchedAt: '2026-05-08T10:00:00.000Z', - statsResetAt: '2026-05-08T08:00:00.000Z', - pgServerVersion: 'PostgreSQL 16.4', - templates: { - db5_q401: { - firstObservedAt: '2026-05-08T09:00:00.000Z', - perUser: { - '11': { calls: 100, totalExecTime: 1000, totalRows: 500 }, - '12': { calls: 50, totalExecTime: 500, totalRows: 250 }, - }, - }, - }, - }); - - await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: 'conn_pg', - queryClient: fakePgClient(), - reader: fakeReader({ - statsResetAt: '2026-05-08T08:00:00.000Z', - rows: [ - row({ - queryid: '401', - userid: '11', - username: 'analyst', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 2, - totalExecTime: 30, - totalRows: 6, - }), - row({ - queryid: '401', - userid: '12', - username: 'svc_loader', 
- query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 55, - totalExecTime: 650, - totalRows: 275, - }), - ], - }), - sqlAnalysis, - pullConfig: postgresPullConfig(), - baselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - expect(manifest.baselineFirstRun).toBe(false); - expect(manifest.warnings).toContain('scoped_reset:dbid=5 queryid=401 userid=11'); - - const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q401/usage.json')); - expect(usage.stats).toMatchObject({ - executions: 7, - distinct_users: 2, - mean_runtime_ms: 25.714285714285715, - rows_produced: 31, - }); - }); - - it('ranks and caps selected PGSS templates after skip and analysis filtering', async () => { - const stagedDir = await tempDir('pgss-stage-cap-'); - const baselineRootDir = await tempDir('pgss-baseline-cap-'); - const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg'); - - await stagePgStatStatementsTemplates({ - stagedDir, - connectionId: 'conn_pg', - queryClient: fakePgClient(), - reader: fakeReader({ - rows: [ - row({ - queryid: '501', - username: 'analyst_a', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 2, - totalExecTime: 20, - }), - row({ - queryid: '502', - username: 'analyst_b', - query: 'SELECT count(*) FROM analytics.customers', - calls: 20, - totalExecTime: 200, - }), - row({ - queryid: '503', - username: 'analyst_c', - query: 'SELECT count(*) FROM analytics.orders WHERE status = $1', - calls: 10, - totalExecTime: 100, - }), - ], - }), - sqlAnalysis, - pullConfig: postgresPullConfig(2), - baselinePath, - now: new Date('2026-05-08T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - expect(manifest.capped).toBe(true); - expect(manifest.warnings).toContain('templates_truncated: kept 2 of 3 templates'); - 
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q502', 'db5_q503']); - }); -}); diff --git a/packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts b/packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts deleted file mode 100644 index f33fa6db..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts +++ /dev/null @@ -1,508 +0,0 @@ -import { mkdir, readFile, rename, writeFile } from 'node:fs/promises'; -import { dirname, join } from 'node:path'; -import { z } from 'zod'; -import type { SqlAnalysisFingerprintResult, SqlAnalysisPort } from '../../../sql-analysis/index.js'; -import { - HISTORIC_SQL_OBJECT_TYPE, - HISTORIC_SQL_SOURCE_KEY, - historicSqlPullConfigSchema, - type HistoricSqlManifest, - type HistoricSqlMetadata, - type HistoricSqlPullConfig, - type HistoricSqlUsage, - type KtxPostgresQueryClient, - type PostgresPgssAggregateRow, - type PostgresPgssReader, - type PostgresPgssRow, -} from './types.js'; - -const PGSS_BASELINE_VERSION = 1 as const; - -const pgssCounterSchema = z.object({ - calls: z.number().int().nonnegative(), - totalExecTime: z.number().nonnegative(), - totalRows: z.number().int().nonnegative(), -}); - -const pgssBaselineSchema = z.object({ - version: z.literal(PGSS_BASELINE_VERSION), - fetchedAt: z.string().datetime(), - statsResetAt: z.string().datetime().nullable(), - pgServerVersion: z.string(), - templates: z.record( - z.string(), - z.object({ - firstObservedAt: z.string().datetime(), - perUser: z.record(z.string(), pgssCounterSchema), - }), - ), -}); - -export type PgssBaseline = z.infer; - -export interface StagePgStatStatementsTemplatesInput { - stagedDir: string; - connectionId: string; - queryClient: KtxPostgresQueryClient; - reader: PostgresPgssReader; - sqlAnalysis: SqlAnalysisPort; - pullConfig: HistoricSqlPullConfig; - baselinePath: string; - now?: Date; -} - -export interface StagePgStatStatementsTemplatesResult { - baselinePath: string; - baseline: PgssBaseline; 
-} - -interface PgssBaselineCounter { - calls: number; - totalExecTime: number; - totalRows: number; -} - -interface PgssAggregateMutable { - id: string; - queryid: string; - dbid: string; - database: string | null; - query: string; - deltaCalls: number; - deltaExecTime: number; - deltaRows: number; - users: Set; - firstObservedAt: string; -} - -interface AnalyzedPgssTemplate { - aggregate: PostgresPgssAggregateRow; - analysis: SqlAnalysisFingerprintResult; -} - -const ZERO_COUNTER: PgssBaselineCounter = { - calls: 0, - totalExecTime: 0, - totalRows: 0, -}; - -const PGSS_SNAPSHOT_READ_LIMIT = 5000; -const PGSS_HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET|BEGIN|COMMIT|ROLLBACK|VACUUM|ANALYZE)\b/i; -const PGSS_HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|pg_catalog\.|pg_toast\.|pg_stat_)/i; - -function pgssTemplateId(row: Pick): string { - return `db${row.dbid}_q${row.queryid}`; -} - -export function pgssBaselinePath(rootDir: string | undefined, connectionId: string): string { - return join(rootDir ?? 
join(process.cwd(), '.ktx/cache/historic-sql'), connectionId, 'pgss-baseline.json'); -} - -export async function readPgssBaseline(path: string): Promise { - try { - return pgssBaselineSchema.parse(JSON.parse(await readFile(path, 'utf-8'))); - } catch (error) { - if (error && typeof error === 'object' && 'code' in error && error.code === 'ENOENT') { - return null; - } - throw error; - } -} - -export async function writePgssBaselineAtomic(path: string, baseline: PgssBaseline): Promise { - const parsed = pgssBaselineSchema.parse(baseline); - await mkdir(dirname(path), { recursive: true }); - const tempPath = `${path}.tmp`; - await writeFile(tempPath, `${JSON.stringify(parsed, null, 2)}\n`, 'utf-8'); - await rename(tempPath, path); -} - -export async function stagePgStatStatementsTemplates( - input: StagePgStatStatementsTemplatesInput, -): Promise { - const config = historicSqlPullConfigSchema.parse(input.pullConfig); - if (config.dialect !== 'postgres') { - throw new Error(`stagePgStatStatementsTemplates requires dialect postgres, got ${config.dialect}`); - } - - const now = input.now ?? 
new Date(); - const fetchedAt = now.toISOString(); - const probe = await input.reader.probe(input.queryClient); - const warnings = [...probe.warnings]; - const baseline = await readPgssBaseline(input.baselinePath); - const snapshot = await input.reader.readSnapshot(input.queryClient, { - minCalls: config.minCalls, - maxTemplates: PGSS_SNAPSHOT_READ_LIMIT, - }); - if (snapshot.deallocCount !== null && snapshot.deallocCount > 0) { - warnings.push( - `pgss_dealloc_count:${snapshot.deallocCount}; pg_stat_statements.max may be too low, causing template eviction churn`, - ); - } - const reset = detectBaselineReset({ - baseline, - snapshotStatsResetAt: snapshot.statsResetAt, - currentPgServerVersion: probe.pgServerVersion, - }); - warnings.push(...reset.warnings); - - const aggregates = aggregatePgssRows({ - rows: snapshot.rows, - baseline, - baselineFirstRun: reset.baselineFirstRun, - fetchedAt, - warnings, - }).filter((aggregate) => !shouldSkipPgssSql(aggregate.query)); - - const analyzed: AnalyzedPgssTemplate[] = []; - for (const aggregate of aggregates) { - const analysis = await input.sqlAnalysis.analyzeForFingerprint(aggregate.query, 'postgres'); - if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) { - warnings.push(`analysis_failed:${aggregate.id}`); - continue; - } - analyzed.push({ aggregate, analysis }); - } - - const selected = selectPgssTemplates(analyzed, config.maxTemplatesPerRun); - if (selected.length < analyzed.length) { - warnings.push(`templates_truncated: kept ${selected.length} of ${analyzed.length} templates`); - } - - await mkdir(input.stagedDir, { recursive: true }); - const templates: HistoricSqlManifest['templates'] = []; - for (const template of selected) { - const staged = buildPgssStagedTemplate(template, config, now); - const basePath = `templates/${staged.metadata.id}`; - await writeJson(input.stagedDir, `${basePath}/metadata.json`, staged.metadata); - await writeText(input.stagedDir, `${basePath}/page.md`, 
staged.pageMarkdown); - await writeJson(input.stagedDir, `${basePath}/usage.json`, staged.usage); - templates.push({ - id: staged.metadata.id, - fingerprint: staged.metadata.properties.fingerprint, - subClusterId: staged.metadata.properties.sub_cluster_id, - path: staged.metadata.path, - }); - } - - await writeJson(input.stagedDir, 'manifest.json', { - source: HISTORIC_SQL_SOURCE_KEY, - connectionId: input.connectionId, - dialect: 'postgres', - fetchedAt, - windowStart: baseline?.fetchedAt ?? snapshot.statsResetAt ?? fetchedAt, - windowEnd: fetchedAt, - nextSuccessfulCursor: fetchedAt, - templateCount: selected.length, - capped: selected.length < analyzed.length, - warnings, - degraded: true, - statsResetAt: snapshot.statsResetAt, - baselineFirstRun: reset.baselineFirstRun, - pgServerVersion: probe.pgServerVersion, - deallocCount: snapshot.deallocCount, - templates, - } satisfies HistoricSqlManifest); - - return { - baselinePath: input.baselinePath, - baseline: buildNextBaseline({ - rows: snapshot.rows, - fetchedAt, - statsResetAt: snapshot.statsResetAt, - pgServerVersion: probe.pgServerVersion, - previousBaseline: reset.baselineFirstRun ? 
null : baseline, - }), - }; -} - -function detectBaselineReset(input: { - baseline: PgssBaseline | null; - snapshotStatsResetAt: string | null; - currentPgServerVersion: string; -}): { baselineFirstRun: boolean; warnings: string[] } { - if (!input.baseline) { - return { baselineFirstRun: true, warnings: ['baseline_first_run:no_previous_pgss_baseline'] }; - } - - const warnings: string[] = []; - if ( - input.baseline.statsResetAt && - input.snapshotStatsResetAt && - input.baseline.statsResetAt < input.snapshotStatsResetAt - ) { - warnings.push( - `baseline_reset:stats_reset advanced from ${input.baseline.statsResetAt} to ${input.snapshotStatsResetAt}`, - ); - } - - const previousMajor = postgresMajor(input.baseline.pgServerVersion); - const currentMajor = postgresMajor(input.currentPgServerVersion); - if (previousMajor && currentMajor && previousMajor !== currentMajor) { - warnings.push(`baseline_reset:pg_server_major changed from ${previousMajor} to ${currentMajor}`); - } - - return { baselineFirstRun: warnings.length > 0, warnings }; -} - -function postgresMajor(version: string): string | null { - return version.match(/PostgreSQL\s+(\d+)/i)?.[1] ?? version.match(/^(\d+)(?:\.|$)/)?.[1] ?? null; -} - -function aggregatePgssRows(input: { - rows: PostgresPgssRow[]; - baseline: PgssBaseline | null; - baselineFirstRun: boolean; - fetchedAt: string; - warnings: string[]; -}): PostgresPgssAggregateRow[] { - const aggregates = new Map(); - - for (const row of input.rows) { - const templateId = pgssTemplateId(row); - const baselineTemplate = input.baselineFirstRun ? 
undefined : input.baseline?.templates[templateId]; - const baselineCounter = baselineTemplate?.perUser[row.userid]; - const previous = scopedCounterBaseline(row, baselineCounter, input.baselineFirstRun, input.warnings); - const deltaCalls = row.calls - previous.calls; - const deltaExecTime = row.totalExecTime - previous.totalExecTime; - const deltaRows = row.totalRows - previous.totalRows; - if (deltaCalls === 0 && !input.baselineFirstRun) { - continue; - } - - const existing = - aggregates.get(templateId) ?? - ({ - id: templateId, - queryid: row.queryid, - dbid: row.dbid, - database: row.database, - query: row.query, - deltaCalls: 0, - deltaExecTime: 0, - deltaRows: 0, - users: new Set(), - firstObservedAt: baselineTemplate?.firstObservedAt ?? input.fetchedAt, - } satisfies PgssAggregateMutable); - - existing.deltaCalls += Math.max(0, deltaCalls); - existing.deltaExecTime += Math.max(0, deltaExecTime); - existing.deltaRows += Math.max(0, deltaRows); - if (deltaCalls > 0) { - existing.users.add(row.username ?? 
'unknown'); - } - aggregates.set(templateId, existing); - } - - return [...aggregates.values()] - .filter((aggregate) => aggregate.deltaCalls > 0) - .map((aggregate) => ({ - id: aggregate.id, - queryid: aggregate.queryid, - dbid: aggregate.dbid, - database: aggregate.database, - query: aggregate.query, - deltaCalls: aggregate.deltaCalls, - deltaExecTime: aggregate.deltaExecTime, - deltaRows: aggregate.deltaRows, - meanExecTime: aggregate.deltaExecTime / Math.max(aggregate.deltaCalls, 1), - distinctUsersDelta: aggregate.users.size, - users: [...aggregate.users].sort(), - firstObservedAt: aggregate.firstObservedAt, - })); -} - -function scopedCounterBaseline( - row: PostgresPgssRow, - baselineCounter: PgssBaselineCounter | undefined, - baselineFirstRun: boolean, - warnings: string[], -): PgssBaselineCounter { - if (!baselineCounter || baselineFirstRun) { - return ZERO_COUNTER; - } - if ( - baselineCounter.calls > row.calls || - baselineCounter.totalExecTime > row.totalExecTime || - baselineCounter.totalRows > row.totalRows - ) { - warnings.push(`scoped_reset:dbid=${row.dbid} queryid=${row.queryid} userid=${row.userid}`); - return ZERO_COUNTER; - } - return baselineCounter; -} - -function shouldSkipPgssSql(sql: string): boolean { - return PGSS_HARD_SKIP_PREFIX_RE.test(sql) || PGSS_HARD_SKIP_TABLE_RE.test(sql); -} - -function selectPgssTemplates(templates: AnalyzedPgssTemplate[], maxTemplatesPerRun: number): AnalyzedPgssTemplate[] { - return templates - .map((template) => ({ - template, - score: template.aggregate.users.length * Math.log1p(template.aggregate.deltaCalls), - })) - .sort( - (left, right) => right.score - left.score || left.template.aggregate.id.localeCompare(right.template.aggregate.id), - ) - .slice(0, maxTemplatesPerRun) - .map((entry) => entry.template); -} - -function buildPgssStagedTemplate( - template: AnalyzedPgssTemplate, - config: HistoricSqlPullConfig, - now: Date, -): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: 
HistoricSqlUsage } { - const tablesTouched = [...template.analysis.tablesTouched].sort(); - const firstTable = tablesTouched[0] ?? 'query'; - const id = template.aggregate.id; - - const metadata: HistoricSqlMetadata = { - id, - title: `postgres · ${firstTable} [${id.slice(0, 12)}]`, - path: `templates/${id}/page.md`, - objectType: HISTORIC_SQL_OBJECT_TYPE, - lastEditedAt: null, - properties: { - fingerprint: template.analysis.fingerprint, - sub_cluster_id: null, - dialect: 'postgres', - tables_touched: tablesTouched, - literal_slots: [], - triage_signals: buildPgssTriageSignals({ - executions: template.aggregate.deltaCalls, - distinctUsers: template.aggregate.distinctUsersDelta, - firstSeen: template.aggregate.firstObservedAt, - lastSeen: now.toISOString(), - meanRuntimeMs: template.aggregate.meanExecTime, - serviceAccountOnly: isServiceAccountOnly(template.aggregate.users, config.serviceAccountUserPatterns), - now, - }), - }, - }; - - return { - metadata, - pageMarkdown: renderTemplatePage(id, template.analysis.normalizedSql, tablesTouched), - usage: { - stats: { - executions: template.aggregate.deltaCalls, - distinct_users: template.aggregate.distinctUsersDelta, - first_seen: template.aggregate.firstObservedAt, - last_seen: now.toISOString(), - p50_runtime_ms: null, - p95_runtime_ms: null, - mean_runtime_ms: template.aggregate.meanExecTime, - error_rate: 0, - rows_produced: template.aggregate.deltaRows, - }, - literal_slots: [], - samples: [], - }, - }; -} - -function buildPgssTriageSignals(input: { - executions: number; - distinctUsers: number; - firstSeen: string; - lastSeen: string; - meanRuntimeMs: number; - serviceAccountOnly: boolean; - now: Date; -}): Record { - return { - executions_bucket: input.executions < 3 ? 'low' : input.executions < 50 ? 'mid' : 'high', - distinct_users_bucket: input.distinctUsers <= 1 ? 'solo' : input.distinctUsers <= 5 ? 
'team' : 'broad', - error_rate_bucket: 'ok', - recency_bucket: recencyBucket(input.lastSeen, input.now), - service_account_only: String(input.serviceAccountOnly), - runtime_bucket: runtimeBucket(input.meanRuntimeMs), - }; -} - -function runtimeBucket(meanRuntimeMs: number): string { - if (meanRuntimeMs < 100) { - return 'fast'; - } - if (meanRuntimeMs < 1000) { - return 'moderate'; - } - return 'slow'; -} - -function recencyBucket(lastSeen: string, now: Date): string { - const ageDays = Math.max(0, (now.getTime() - new Date(lastSeen).getTime()) / 86400000); - if (ageDays <= 14) { - return 'active'; - } - if (ageDays <= 60) { - return 'warm'; - } - return 'cold'; -} - -function isServiceAccountOnly(users: string[], patterns: string[]): boolean { - if (users.length === 0 || patterns.length === 0) { - return false; - } - const regexes = patterns.map((pattern) => new RegExp(pattern)); - return users.every((user) => regexes.some((regex) => regex.test(user))); -} - -function renderTemplatePage(id: string, normalizedSql: string, tablesTouched: string[]): string { - return [ - `# ${id}`, - '', - '## Normalized SQL', - '```sql', - normalizedSql, - '```', - '', - '## Tables touched', - ...tablesTouched.map((table) => `- ${table}`), - '', - ].join('\n'); -} - -function buildNextBaseline(input: { - rows: PostgresPgssRow[]; - fetchedAt: string; - statsResetAt: string | null; - pgServerVersion: string; - previousBaseline: PgssBaseline | null; -}): PgssBaseline { - const templates: PgssBaseline['templates'] = {}; - for (const row of input.rows) { - const templateId = pgssTemplateId(row); - const previous = input.previousBaseline?.templates[templateId]; - const template = templates[templateId] ?? { - firstObservedAt: previous?.firstObservedAt ?? 
input.fetchedAt, - perUser: {}, - }; - template.perUser[row.userid] = { - calls: row.calls, - totalExecTime: row.totalExecTime, - totalRows: row.totalRows, - }; - templates[templateId] = template; - } - return { - version: PGSS_BASELINE_VERSION, - fetchedAt: input.fetchedAt, - statsResetAt: input.statsResetAt, - pgServerVersion: input.pgServerVersion, - templates, - }; -} - -async function writeJson(root: string, relPath: string, value: unknown): Promise { - await writeText(root, relPath, `${JSON.stringify(value, null, 2)}\n`); -} - -async function writeText(root: string, relPath: string, value: string): Promise { - const target = join(root, relPath); - await mkdir(dirname(target), { recursive: true }); - await writeFile(target, value, 'utf-8'); -} diff --git a/packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts b/packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts new file mode 100644 index 00000000..421970bf --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts @@ -0,0 +1,358 @@ +import { mkdtemp, readFile, readdir } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { describe, expect, it, vi } from 'vitest'; +import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; +import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js'; +import type { AggregatedTemplate, HistoricSqlReader } from './types.js'; + +async function tempDir(): Promise { + return mkdtemp(join(tmpdir(), 'historic-sql-unified-stage-')); +} + +async function readJson(root: string, relPath: string): Promise { + return JSON.parse(await readFile(join(root, relPath), 'utf-8')) as T; +} + +function aggregate(overrides: Partial & { templateId: string; canonicalSql: string }): AggregatedTemplate { + return { + templateId: overrides.templateId, + canonicalSql: overrides.canonicalSql, + dialect: overrides.dialect ?? 
'postgres', + stats: overrides.stats ?? { + executions: 42, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 20, + p95RuntimeMs: 80, + errorRate: 0, + rowsProduced: 100, + }, + topUsers: overrides.topUsers ?? [{ user: 'analyst', executions: 40 }], + }; +} + +describe('stageHistoricSqlAggregatedSnapshot', () => { + it('batch parses templates and writes stable table and patterns artifacts', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: ['pg_stat_statements.track is none; aggregation still proceeds'], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'orders-by-status', + canonicalSql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status', + }); + yield aggregate({ + templateId: 'service-account-only', + canonicalSql: 'select * from public.orders where id = $1', + stats: { + executions: 20, + distinctUsers: 1, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 5, + p95RuntimeMs: 10, + errorRate: 0, + rowsProduced: 1, + }, + topUsers: [{ user: 'svc_loader', executions: 20 }], + }); + yield aggregate({ + templateId: 'bad-parse', + canonicalSql: 'select broken from', + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + [ + 'orders-by-status', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: ['status'], + where: ['created_at'], + join: ['customer_id'], + groupBy: ['status'], + }, + }, + ], + ['bad-parse', { tablesTouched: [], columnsByClause: {}, error: 'parse failed' }], + ])), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 
'postgres', + filters: { + serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' }, + }, + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1); + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith( + [ + { + id: 'orders-by-status', + sql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status', + }, + { id: 'bad-parse', sql: 'select broken from' }, + ], + 'postgres', + ); + + expect(await readdir(join(stagedDir, 'tables'))).toEqual(['public.customers.json', 'public.orders.json']); + + const manifest = await readJson>(stagedDir, 'manifest.json'); + expect(manifest).toMatchObject({ + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + snapshotRowCount: 3, + touchedTableCount: 2, + parseFailures: 1, + warnings: ['parse_failed:bad-parse'], + probeWarnings: ['pg_stat_statements.track is none; aggregation still proceeds'], + staleArchiveAfterDays: 90, + }); + + const orders = await readJson>(stagedDir, 'tables/public.orders.json'); + expect(orders).toMatchObject({ + table: 'public.orders', + stats: { + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + errorRateBucket: 'none', + p95RuntimeBucket: '<100ms', + recencyBucket: 'current', + }, + columnsByClause: { + select: [['status', 'high']], + where: [['created_at', 'high']], + join: [['customer_id', 'high']], + groupBy: [['status', 'high']], + }, + observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }], + topTemplates: [ + { + id: 'orders-by-status', + topUsers: [{ user: 'analyst' }], + }, + ], + }); + expect(orders.topTemplates[0].canonicalSql).toContain('group by o.status'); + + const patterns = await readJson>(stagedDir, 'patterns-input.json'); + expect(patterns.templates).toEqual([ + { + id: 'orders-by-status', + canonicalSql: expect.stringContaining('public.orders'), + tablesTouched: 
['public.customers', 'public.orders'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ]); + }); + + it('redacts configured SQL substrings in staged artifacts while analyzing original SQL', async () => { + const stagedDir = await tempDir(); + const originalSql = + "select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'"; + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'api-events-with-secret', + canonicalSql: originalSql, + stats: { + executions: 15, + distinctUsers: 2, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 12, + p95RuntimeMs: 25, + errorRate: 0, + rowsProduced: 15, + }, + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + [ + 'api-events-with-secret', + { + tablesTouched: ['public.api_events'], + columnsByClause: { + select: [], + where: ['api_key', 'note'], + join: [], + groupBy: [], + }, + }, + ], + ])), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'postgres', + redactionPatterns: ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'], + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith( + [{ id: 'api-events-with-secret', sql: originalSql }], + 'postgres', + ); + + const tableJson = await readFile(join(stagedDir, 'tables/public.api_events.json'), 'utf-8'); + const patternsJson = await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8'); + expect(tableJson).not.toContain('sk_live_abc123'); + expect(tableJson).not.toContain('Secret_Token_9f'); + expect(patternsJson).not.toContain('sk_live_abc123'); + 
expect(patternsJson).not.toContain('Secret_Token_9f'); + expect(tableJson).toContain('[REDACTED]'); + expect(patternsJson).toContain('[REDACTED]'); + }); + + it('preserves full patterns audit input and writes bounded cross-table pattern shards', async () => { + const stagedDir = await tempDir(); + const largeSql = `select * from public.orders o join public.customers c on c.id = o.customer_id where payload = '${'x'.repeat(8000)}'`; + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'orders-customers-a', + canonicalSql: largeSql, + stats: { + executions: 25, + distinctUsers: 4, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 15, + p95RuntimeMs: 90, + errorRate: 0, + rowsProduced: 250, + }, + }); + yield aggregate({ + templateId: 'orders-customers-b', + canonicalSql: largeSql.replace('payload', 'payload_b'), + stats: { + executions: 22, + distinctUsers: 3, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 20, + p95RuntimeMs: 95, + errorRate: 0, + rowsProduced: 220, + }, + }); + yield aggregate({ + templateId: 'orders-single-table', + canonicalSql: 'select count(*) from public.orders', + stats: { + executions: 30, + distinctUsers: 2, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 10, + p95RuntimeMs: 20, + errorRate: 0, + rowsProduced: 30, + }, + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + [ + 'orders-customers-a', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: [], + where: ['payload'], + join: ['customer_id', 'id'], + groupBy: [], + }, + }, + ], + [ + 'orders-customers-b', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: { + select: [], + where: 
['payload_b'], + join: ['customer_id', 'id'], + groupBy: [], + }, + }, + ], + [ + 'orders-single-table', + { + tablesTouched: ['public.orders'], + columnsByClause: { + select: [], + where: [], + join: [], + groupBy: [], + }, + }, + ], + ])), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { dialect: 'postgres' }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + const audit = await readJson>(stagedDir, 'patterns-input.json'); + expect(audit.templates.map((entry: any) => entry.id)).toEqual([ + 'orders-customers-a', + 'orders-customers-b', + 'orders-single-table', + ]); + + const firstShard = await readJson>(stagedDir, 'patterns-input/part-0001.json'); + expect(firstShard.templates.map((entry: any) => entry.id)).toEqual(['orders-customers-a', 'orders-customers-b']); + expect(firstShard.templates.some((entry: any) => entry.id === 'orders-single-table')).toBe(false); + + const manifest = await readJson>(stagedDir, 'manifest.json'); + expect(manifest.warnings).toEqual([]); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/stage-unified.ts b/packages/context/src/ingest/adapters/historic-sql/stage-unified.ts new file mode 100644 index 00000000..a95052d1 --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/stage-unified.ts @@ -0,0 +1,308 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; +import { + bucketDistinctUsers, + bucketErrorRate, + bucketExecutions, + bucketFrequency, + bucketP95Runtime, + bucketRecency, +} from './buckets.js'; +import { splitHistoricSqlPatternInputs } from './pattern-inputs.js'; +import { + compileHistoricSqlRedactionPatterns, + redactHistoricSqlText, + type HistoricSqlRedactionPattern, +} from './redaction.js'; +import { + HISTORIC_SQL_SOURCE_KEY, + aggregatedTemplateSchema, + 
historicSqlUnifiedPullConfigSchema, + type AggregatedTemplate, + type HistoricSqlReader, + type HistoricSqlUnifiedPullConfig, + type StagedPatternsInput, + type StagedTableInput, +} from './types.js'; + +interface StageHistoricSqlAggregatedSnapshotInput { + stagedDir: string; + connectionId: string; + queryClient: unknown; + reader: HistoricSqlReader; + sqlAnalysis: SqlAnalysisPort; + pullConfig: unknown; + now?: Date; +} + +interface ParsedTemplate { + template: AggregatedTemplate; + tablesTouched: string[]; + columnsByClause: Record; +} + +interface TableAccumulator { + table: string; + executions: number; + distinctUsers: number; + errorRateNumerator: number; + p95RuntimeMs: number | null; + lastSeen: string; + columnsByClause: Map>; + observedJoins: Map>; + topTemplates: AggregatedTemplate[]; +} + +const TRIVIAL_SQL_RE = /^\s*SELECT\s+(1|NOW\(\)|CURRENT_TIMESTAMP|VERSION\(\))\s*;?\s*$/i; +const NOISE_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i; +const SYSTEM_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i; + +function writeJson(root: string, relPath: string, value: unknown): Promise { + const target = join(root, relPath); + return mkdir(dirname(target), { recursive: true }).then(() => + writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'), + ); +} + +function compilePatterns(patterns: string[]): RegExp[] { + return patterns.map((pattern) => new RegExp(pattern)); +} + +function matchesAny(value: string | null, patterns: RegExp[]): boolean { + return !!value && patterns.some((pattern) => pattern.test(value)); +} + +function shouldDropBySql(sql: string, config: HistoricSqlUnifiedPullConfig): boolean { + if (NOISE_PREFIX_RE.test(sql) || SYSTEM_TABLE_RE.test(sql)) return true; + if (config.filters.dropTrivialProbes !== false && TRIVIAL_SQL_RE.test(sql)) return true; + return false; +} + +function shouldDropByUsers(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean { + const service = 
config.filters.serviceAccounts; + if (!service || service.mode === 'mark-only' || service.patterns.length === 0) return false; + const patterns = compilePatterns(service.patterns); + const matchingExecutions = template.topUsers + .filter((entry) => matchesAny(entry.user, patterns)) + .reduce((sum, entry) => sum + entry.executions, 0); + const allExecutions = template.topUsers.reduce((sum, entry) => sum + entry.executions, 0); + const serviceOnly = allExecutions > 0 && matchingExecutions >= allExecutions; + return service.mode === 'exclude' ? serviceOnly : !serviceOnly; +} + +function shouldDropByFailure(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean { + const failed = config.filters.dropFailedBelow; + return !!failed && template.stats.errorRate > failed.errorRate && template.stats.executions < failed.executions; +} + +function shouldDropTemplate(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean { + if (shouldDropBySql(template.canonicalSql, config)) return true; + if (shouldDropByUsers(template, config)) return true; + if (shouldDropByFailure(template, config)) return true; + return false; +} + +function redactTemplateSql( + template: AggregatedTemplate, + redactors: readonly HistoricSqlRedactionPattern[], +): AggregatedTemplate { + if (redactors.length === 0) { + return template; + } + return { + ...template, + canonicalSql: redactHistoricSqlText(template.canonicalSql, redactors), + }; +} + +function recordColumn(acc: TableAccumulator, clause: string, column: string, executions: number): void { + const byColumn = acc.columnsByClause.get(clause) ?? new Map(); + byColumn.set(column, (byColumn.get(column) ?? 0) + executions); + acc.columnsByClause.set(clause, byColumn); +} + +function recordJoin(acc: TableAccumulator, otherTable: string, columns: string[], executions: number): void { + const byColumns = acc.observedJoins.get(otherTable) ?? 
new Map(); + const key = [...new Set(columns)].sort().join(','); + if (key.length > 0) { + byColumns.set(key, (byColumns.get(key) ?? 0) + executions); + acc.observedJoins.set(otherTable, byColumns); + } +} + +function accumulatorFor(table: string): TableAccumulator { + return { + table, + executions: 0, + distinctUsers: 0, + errorRateNumerator: 0, + p95RuntimeMs: null, + lastSeen: '1970-01-01T00:00:00.000Z', + columnsByClause: new Map(), + observedJoins: new Map(), + topTemplates: [], + }; +} + +function addTemplate(acc: TableAccumulator, parsed: ParsedTemplate): void { + const executions = parsed.template.stats.executions; + acc.executions += executions; + acc.distinctUsers = Math.max(acc.distinctUsers, parsed.template.stats.distinctUsers); + acc.errorRateNumerator += parsed.template.stats.errorRate * executions; + acc.p95RuntimeMs = + acc.p95RuntimeMs === null + ? parsed.template.stats.p95RuntimeMs + : parsed.template.stats.p95RuntimeMs === null + ? acc.p95RuntimeMs + : Math.max(acc.p95RuntimeMs, parsed.template.stats.p95RuntimeMs); + acc.lastSeen = parsed.template.stats.lastSeen > acc.lastSeen ? parsed.template.stats.lastSeen : acc.lastSeen; + for (const [clause, columns] of Object.entries(parsed.columnsByClause)) { + for (const column of columns) { + recordColumn(acc, clause, column, executions); + } + } + const joinColumns = parsed.columnsByClause.join ?? []; + for (const otherTable of parsed.tablesTouched.filter((table) => table !== acc.table)) { + recordJoin(acc, otherTable, joinColumns, executions); + } + acc.topTemplates.push(parsed.template); +} + +function toStagedTable(acc: TableAccumulator, now: Date): StagedTableInput { + const errorRate = acc.executions > 0 ? 
acc.errorRateNumerator / acc.executions : 0; + const columnsByClause: Record> = Object.fromEntries( + [...acc.columnsByClause.entries()] + .sort(([left], [right]) => left.localeCompare(right)) + .map(([clause, counts]) => [ + clause, + [...counts.entries()] + .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0])) + .map(([column, count]) => [column, bucketFrequency(count, acc.executions)] as [string, string]), + ]), + ); + const observedJoins = [...acc.observedJoins.entries()] + .flatMap(([withTable, byColumns]) => + [...byColumns.entries()].map(([columns, count]) => ({ + withTable, + on: columns.split(',').filter(Boolean), + freq: bucketFrequency(count, acc.executions), + })), + ) + .sort((left, right) => left.withTable.localeCompare(right.withTable) || left.on.join(',').localeCompare(right.on.join(','))); + const topTemplates = [...acc.topTemplates] + .sort((left, right) => right.stats.executions - left.stats.executions || left.templateId.localeCompare(right.templateId)) + .slice(0, 5) + .map((template) => ({ + id: template.templateId, + canonicalSql: template.canonicalSql, + topUsers: template.topUsers.slice(0, 5).map((entry) => ({ user: entry.user })), + })); + + return { + table: acc.table, + stats: { + executionsBucket: bucketExecutions(acc.executions), + distinctUsersBucket: bucketDistinctUsers(acc.distinctUsers), + errorRateBucket: bucketErrorRate(errorRate), + p95RuntimeBucket: bucketP95Runtime(acc.p95RuntimeMs), + recencyBucket: bucketRecency(acc.lastSeen, now), + }, + columnsByClause, + observedJoins, + topTemplates, + }; +} + +function toPatternsInput(parsedTemplates: ParsedTemplate[]): StagedPatternsInput { + return { + templates: parsedTemplates + .map(({ template, tablesTouched }) => ({ + id: template.templateId, + canonicalSql: template.canonicalSql, + tablesTouched: [...tablesTouched].sort(), + executionsBucket: bucketExecutions(template.stats.executions), + distinctUsersBucket: 
bucketDistinctUsers(template.stats.distinctUsers), + dialect: template.dialect, + })) + .sort((left, right) => left.id.localeCompare(right.id)), + }; +} + +export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSqlAggregatedSnapshotInput): Promise { + const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig); + const redactors = compileHistoricSqlRedactionPatterns(config.redactionPatterns); + const now = input.now ?? new Date(); + const windowStart = new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000); + const probe = await input.reader.probe(input.queryClient); + const snapshot: AggregatedTemplate[] = []; + let snapshotRowCount = 0; + + for await (const row of input.reader.fetchAggregated(input.queryClient, { start: windowStart, end: now }, config)) { + snapshotRowCount += 1; + const parsed = aggregatedTemplateSchema.parse(row); + if (!shouldDropTemplate(parsed, config)) { + snapshot.push(parsed); + } + } + + const analysis = await input.sqlAnalysis.analyzeBatch( + snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql })), + config.dialect, + ); + const warnings: string[] = []; + const parsedTemplates: ParsedTemplate[] = []; + for (const template of snapshot) { + const parsed = analysis.get(template.templateId); + if (!parsed || parsed.error) { + warnings.push(`parse_failed:${template.templateId}`); + continue; + } + const tablesTouched = [...new Set(parsed.tablesTouched)].filter((table) => table.length > 0).sort(); + if (tablesTouched.length === 0) { + continue; + } + parsedTemplates.push({ + template: redactTemplateSql(template, redactors), + tablesTouched, + columnsByClause: Object.fromEntries( + Object.entries(parsed.columnsByClause).map(([clause, columns]) => [clause, [...new Set(columns)].sort()]), + ), + }); + } + + const byTable = new Map(); + for (const parsed of parsedTemplates) { + for (const table of parsed.tablesTouched) { + const acc = byTable.get(table) ?? 
accumulatorFor(table); + addTemplate(acc, parsed); + byTable.set(table, acc); + } + } + + await mkdir(input.stagedDir, { recursive: true }); + for (const [table, acc] of [...byTable.entries()].sort(([left], [right]) => left.localeCompare(right))) { + await writeJson(input.stagedDir, `tables/${table}.json`, toStagedTable(acc, now)); + } + const patternsInput = toPatternsInput(parsedTemplates); + const patternInputSplit = splitHistoricSqlPatternInputs(patternsInput); + const allWarnings = [...warnings, ...patternInputSplit.warnings]; + await writeJson(input.stagedDir, 'patterns-input.json', patternInputSplit.auditInput); + for (const shard of patternInputSplit.shards) { + await writeJson(input.stagedDir, shard.path, shard.input); + } + await writeJson(input.stagedDir, 'manifest.json', { + source: HISTORIC_SQL_SOURCE_KEY, + connectionId: input.connectionId, + dialect: config.dialect, + fetchedAt: now.toISOString(), + windowStart: windowStart.toISOString(), + windowEnd: now.toISOString(), + snapshotRowCount, + touchedTableCount: byTable.size, + parseFailures: allWarnings.filter((warning) => warning.startsWith('parse_failed:')).length, + warnings: allWarnings, + probeWarnings: probe.warnings, + staleArchiveAfterDays: config.staleArchiveAfterDays, + }); +} diff --git a/packages/context/src/ingest/adapters/historic-sql/stage.test.ts b/packages/context/src/ingest/adapters/historic-sql/stage.test.ts deleted file mode 100644 index dfaed511..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/stage.test.ts +++ /dev/null @@ -1,798 +0,0 @@ -import { mkdtemp, readFile, readdir } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; -import { join } from 'node:path'; -import { describe, expect, it } from 'vitest'; -import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; -import { stageHistoricSqlTemplates } from './stage.js'; -import { - historicSqlManifestSchema, - historicSqlMetadataSchema, - historicSqlUsageSchema, - type 
HistoricSqlQueryHistoryReader, - type HistoricSqlRawQueryRow, -} from './types.js'; - -async function tempDir(): Promise { - return mkdtemp(join(tmpdir(), 'historic-sql-stage-')); -} - -async function readJson(root: string, relPath: string): Promise { - return JSON.parse(await readFile(join(root, relPath), 'utf-8')) as T; -} - -function fakeReader(rows: HistoricSqlRawQueryRow[]): HistoricSqlQueryHistoryReader { - return { - async probe() {}, - async *fetch() { - for (const row of rows) { - yield row; - } - }, - }; -} - -const fakeSqlAnalysis: SqlAnalysisPort = { - async analyzeForFingerprint(sql) { - if (sql.includes('paid')) { - return { - fingerprint: 'fp_paid_orders', - normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?', - tablesTouched: ['analytics.orders'], - literalSlots: [ - { position: 1, type: 'string', exampleValue: 'paid' }, - { position: 2, type: 'date', exampleValue: '2026-04-01' }, - ], - }; - } - return { - fingerprint: 'fp_refunds', - normalizedSql: 'SELECT count(*) FROM analytics.refunds WHERE state = ?', - tablesTouched: ['analytics.refunds'], - literalSlots: [{ position: 1, type: 'string', exampleValue: 'complete' }], - }; - }, -}; - -const categoricalSqlAnalysis: SqlAnalysisPort = { - async analyzeForFingerprint(sql) { - const status = sql.includes("'refunded'") ? 
'refunded' : 'paid'; - return { - fingerprint: 'fp_order_status', - normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?', - tablesTouched: ['analytics.orders'], - literalSlots: [{ position: 1, type: 'string', exampleValue: status }], - }; - }, -}; - -function categoricalRows(): HistoricSqlRawQueryRow[] { - return [ - { - id: 'paid-1', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'", - user: 'analyst-a', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: null, - runtimeMs: 100, - rowsProduced: 11, - success: true, - errorMessage: null, - }, - { - id: 'paid-2', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'", - user: 'analyst-b', - startedAt: '2026-05-04T10:01:00.000Z', - endedAt: null, - runtimeMs: 110, - rowsProduced: 12, - success: true, - errorMessage: null, - }, - { - id: 'paid-3', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'", - user: 'analyst-c', - startedAt: '2026-05-04T10:02:00.000Z', - endedAt: null, - runtimeMs: 120, - rowsProduced: 13, - success: true, - errorMessage: null, - }, - { - id: 'refunded-1', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'refunded'", - user: 'analyst-a', - startedAt: '2026-05-04T10:03:00.000Z', - endedAt: null, - runtimeMs: 130, - rowsProduced: 21, - success: true, - errorMessage: null, - }, - { - id: 'refunded-2', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'refunded'", - user: 'analyst-b', - startedAt: '2026-05-04T10:04:00.000Z', - endedAt: null, - runtimeMs: 140, - rowsProduced: 22, - success: true, - errorMessage: null, - }, - { - id: 'refunded-3', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'refunded'", - user: 'analyst-c', - startedAt: '2026-05-04T10:05:00.000Z', - endedAt: null, - runtimeMs: 150, - rowsProduced: 23, - success: true, - errorMessage: null, - }, - ]; -} - -const diverseSqlAnalysis: SqlAnalysisPort = { - async analyzeForFingerprint(sql) { - const value = sql.match(/status = 
'([^']+)'/)?.[1] ?? 'unknown'; - return { - fingerprint: 'fp_diverse_samples', - normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?', - tablesTouched: ['analytics.orders'], - literalSlots: [{ position: 1, type: 'string', exampleValue: value }], - }; - }, -}; - -const classificationMatrixSqlAnalysis: SqlAnalysisPort = { - async analyzeForFingerprint(sql) { - if (sql.includes('stale_orders')) { - return { - fingerprint: 'fp_stale_date', - normalizedSql: 'SELECT count(*) FROM analytics.stale_orders WHERE created_at >= ?', - tablesTouched: ['analytics.stale_orders'], - literalSlots: [{ position: 1, type: 'date', exampleValue: '2026-04-01' }], - }; - } - - const stringValue = (field: string): string => sql.match(new RegExp(`${field} = '([^']+)'`))?.[1] ?? 'unknown'; - const amount = sql.match(/amount >= (\d+)/)?.[1] ?? '0'; - const asOf = sql.match(/created_at >= '([^']+)'/)?.[1] ?? '2026-05-01'; - - return { - fingerprint: 'fp_classification_matrix', - normalizedSql: - 'SELECT count(*) FROM analytics.orders WHERE region = ? AND plan = ? AND status = ? AND amount >= ? AND created_at >= ?', - tablesTouched: ['analytics.orders'], - literalSlots: [ - { position: 1, type: 'string', exampleValue: stringValue('region') }, - { position: 2, type: 'string', exampleValue: stringValue('plan') }, - { position: 3, type: 'string', exampleValue: stringValue('status') }, - { position: 4, type: 'number', exampleValue: amount }, - { position: 5, type: 'date', exampleValue: asOf }, - ], - }; - }, -}; - -function classificationMatrixRows(): HistoricSqlRawQueryRow[] { - const rows: HistoricSqlRawQueryRow[] = Array.from({ length: 20 }, (_, index) => { - const status = index < 10 ? 'paid' : 'refunded'; - const plan = index === 19 ? 
'self_serve' : 'enterprise'; - const amount = 100 + index; - const asOf = `2026-05-${String(1 + Math.floor(index / 5)).padStart(2, '0')}`; - return { - id: `matrix-${index + 1}`, - sql: `SELECT count(*) FROM analytics.orders WHERE region = 'us' AND plan = '${plan}' AND status = '${status}' AND amount >= ${amount} AND created_at >= '${asOf}'`, - user: `analyst-${(index % 4) + 1}`, - startedAt: `2026-05-04T10:${String(index).padStart(2, '0')}:00.000Z`, - endedAt: null, - runtimeMs: 100 + index, - rowsProduced: 1, - success: true, - errorMessage: null, - }; - }); - - return [ - ...rows, - { - id: 'stale-date-1', - sql: "SELECT count(*) FROM analytics.stale_orders WHERE created_at >= '2026-04-01'", - user: 'analyst-1', - startedAt: '2026-05-04T11:00:00.000Z', - endedAt: null, - runtimeMs: 75, - rowsProduced: 1, - success: true, - errorMessage: null, - }, - ]; -} - -describe('stageHistoricSqlTemplates', () => { - it('compresses rows by fingerprint into document-shaped staged templates', async () => { - const stagedDir = await tempDir(); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader([ - { - id: 'q1', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01' AND email = 'analyst@example.com'", - user: 'analyst@example.com', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: '2026-05-04T10:00:01.000Z', - runtimeMs: 100, - rowsProduced: 1, - success: true, - errorMessage: null, - }, - { - id: 'q2', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-05-01' AND email = 'analyst-2@example.com'", - user: 'analyst-2@example.com', - startedAt: '2026-05-04T11:00:00.000Z', - endedAt: '2026-05-04T11:00:01.000Z', - runtimeMs: 300, - rowsProduced: 1, - success: true, - errorMessage: null, - }, - ]), - sqlAnalysis: fakeSqlAnalysis, - pullConfig: { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - 
serviceAccountUserPatterns: ['^svc_'], - redactionPatterns: ['[\\w.+-]+@[\\w-]+\\.[\\w.-]+'], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - expect(manifest).toMatchObject({ - source: 'historic-sql', - connectionId: 'conn_1', - dialect: 'snowflake', - nextSuccessfulCursor: '2026-05-04T11:00:00.000Z', - templateCount: 1, - capped: false, - }); - - const files = (await readdir(join(stagedDir, 'templates', 'fp_paid_orders'))).sort(); - expect(files).toEqual(['metadata.json', 'page.md', 'usage.json']); - - const metadata = historicSqlMetadataSchema.parse( - await readJson(stagedDir, 'templates/fp_paid_orders/metadata.json'), - ); - expect(metadata).toEqual({ - id: 'fp_paid_orders', - title: 'snowflake · analytics.orders [fp_pai]', - path: 'templates/fp_paid_orders/page.md', - objectType: 'historic_sql_template', - lastEditedAt: null, - properties: { - fingerprint: 'fp_paid_orders', - sub_cluster_id: null, - dialect: 'snowflake', - tables_touched: ['analytics.orders'], - literal_slots: [ - { position: 1, type: 'string', classification: 'constant' }, - { position: 2, type: 'date', classification: 'runtime' }, - ], - triage_signals: { - executions_bucket: 'low', - distinct_users_bucket: 'team', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '1 constant, 1 runtime', - }, - }, - }); - - const page = await readFile(join(stagedDir, 'templates/fp_paid_orders/page.md'), 'utf-8'); - expect(page).toContain('## Normalized SQL'); - expect(page).toContain('SELECT count(*) FROM analytics.orders WHERE status = ? 
AND created_at >= ?'); - expect(page).toContain('- analytics.orders'); - - const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json')); - expect(usage.stats).toMatchObject({ - executions: 2, - distinct_users: 2, - first_seen: '2026-05-04T10:00:00.000Z', - last_seen: '2026-05-04T11:00:00.000Z', - p50_runtime_ms: 100, - p95_runtime_ms: 300, - error_rate: 0, - }); - expect(usage.samples).toHaveLength(1); - expect(usage.samples[0].bound_sql).toContain(''); - expect(usage.samples[0].bound_sql).not.toContain('analyst@example.com'); - expect(usage.samples[0].bound_sql).not.toContain('analyst-2@example.com'); - }); - - it('skips hard-noise SQL and caps templates deterministically', async () => { - const stagedDir = await tempDir(); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader([ - { - id: 'show-1', - sql: 'SHOW TABLES', - user: 'analyst', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: null, - runtimeMs: null, - success: true, - errorMessage: null, - }, - { - id: 'q3', - sql: "SELECT count(*) FROM analytics.refunds WHERE state = 'complete'", - user: 'analyst', - startedAt: '2026-05-04T11:00:00.000Z', - endedAt: null, - runtimeMs: 50, - success: true, - errorMessage: null, - }, - { - id: 'q4', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01'", - user: 'analyst', - startedAt: '2026-05-04T11:30:00.000Z', - endedAt: null, - runtimeMs: 40, - success: true, - errorMessage: null, - }, - ]), - sqlAnalysis: fakeSqlAnalysis, - pullConfig: { - dialect: 'bigquery', - windowDays: 7, - lastSuccessfulCursor: '2026-05-01T00:00:00.000Z', - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 1, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - 
expect(manifest.templateCount).toBe(1); - expect(manifest.capped).toBe(true); - expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']); - expect(manifest.templates.map((template) => template.id)).toEqual(['fp_paid_orders']); - }); - - it('splits categorical fingerprints into one document directory per dominant value', async () => { - const stagedDir = await tempDir(); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader(categoricalRows()), - sqlAnalysis: categoricalSqlAnalysis, - pullConfig: { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - const templates = manifest.templates - .map((template) => ({ - id: template.id, - fingerprint: template.fingerprint, - subClusterId: template.subClusterId, - path: template.path, - })) - .sort((left, right) => left.id.localeCompare(right.id)); - - expect(manifest.templateCount).toBe(2); - expect(templates).toEqual([ - { - id: 'fp_order_status__cat_2b2ff2318877', - fingerprint: 'fp_order_status', - subClusterId: 'cat_2b2ff2318877', - path: 'templates/fp_order_status__cat_2b2ff2318877/page.md', - }, - { - id: 'fp_order_status__cat_34f037ddcbfa', - fingerprint: 'fp_order_status', - subClusterId: 'cat_34f037ddcbfa', - path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md', - }, - ]); - - const paidMetadata = historicSqlMetadataSchema.parse( - await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/metadata.json'), - ); - expect(paidMetadata).toMatchObject({ - id: 'fp_order_status__cat_34f037ddcbfa', - title: 'snowflake · analytics.orders [fp_ord:ddcbfa]', - path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md', - properties: { - fingerprint: 
'fp_order_status', - sub_cluster_id: 'cat_34f037ddcbfa', - dialect: 'snowflake', - tables_touched: ['analytics.orders'], - literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }], - }, - }); - - const paidUsage = historicSqlUsageSchema.parse( - await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'), - ); - expect(paidUsage.stats).toMatchObject({ - executions: 3, - distinct_users: 3, - first_seen: '2026-05-04T10:00:00.000Z', - last_seen: '2026-05-04T10:02:00.000Z', - rows_produced: 36, - }); - expect(paidUsage.literal_slots).toEqual([{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }]); - - const refundedUsage = historicSqlUsageSchema.parse( - await readJson(stagedDir, 'templates/fp_order_status__cat_2b2ff2318877/usage.json'), - ); - expect(refundedUsage.stats).toMatchObject({ - executions: 3, - distinct_users: 3, - first_seen: '2026-05-04T10:03:00.000Z', - last_seen: '2026-05-04T10:05:00.000Z', - rows_produced: 66, - }); - expect(refundedUsage.literal_slots).toEqual([ - { position: 1, distinct_values: 1, top_values: [['refunded', 3]] }, - ]); - }); - - it('classifies literal slots across the spec matrix and stale-date demotion', async () => { - const stagedDir = await tempDir(); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader(classificationMatrixRows()), - sqlAnalysis: classificationMatrixSqlAnalysis, - pullConfig: { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - const matrixTemplates = manifest.templates.filter((template) => template.fingerprint === 'fp_classification_matrix'); - expect(matrixTemplates).toHaveLength(2); - 
expect(matrixTemplates.every((template) => template.subClusterId?.startsWith('cat_'))).toBe(true); - - const matrixTemplate = matrixTemplates[0]; - if (!matrixTemplate) { - throw new Error('expected classification matrix template'); - } - const matrixMetadata = historicSqlMetadataSchema.parse( - await readJson(stagedDir, matrixTemplate.path.replace('/page.md', '/metadata.json')), - ); - expect(matrixMetadata.properties.literal_slots).toMatchInlineSnapshot(` - [ - { - "classification": "constant", - "position": 1, - "type": "string", - }, - { - "classification": "constant", - "position": 2, - "type": "string", - }, - { - "classification": "categorical", - "position": 3, - "type": "string", - }, - { - "classification": "runtime", - "position": 4, - "type": "number", - }, - { - "classification": "runtime", - "position": 5, - "type": "date", - }, - ] - `); - expect(matrixMetadata.properties.triage_signals.slot_summary).toBe('2 constant, 2 runtime'); - - const staleMetadata = historicSqlMetadataSchema.parse( - await readJson(stagedDir, 'templates/fp_stale_date/metadata.json'), - ); - expect(staleMetadata.properties.literal_slots).toMatchInlineSnapshot(` - [ - { - "classification": "runtime", - "position": 1, - "type": "date", - }, - ] - `); - expect(staleMetadata.properties.triage_signals.slot_summary).toBe('0 constant, 1 runtime'); - }); - - it('applies the templates-per-run cap after categorical expansion', async () => { - const stagedDir = await tempDir(); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader(categoricalRows()), - sqlAnalysis: categoricalSqlAnalysis, - pullConfig: { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 1, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - 
expect(manifest.templateCount).toBe(1); - expect(manifest.capped).toBe(true); - expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']); - expect(manifest.templates).toHaveLength(1); - expect(manifest.templates[0].id).toMatch(/^fp_order_status__cat_/); - }); - - it('omits rows_produced for BigQuery templates when reader rows have no row counts', async () => { - const stagedDir = await tempDir(); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_bq', - queryClient: {}, - reader: fakeReader([ - { - id: 'bq-1', - sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'", - user: 'analyst-a@example.com', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: null, - runtimeMs: 100, - success: true, - errorMessage: null, - }, - ]), - sqlAnalysis: fakeSqlAnalysis, - pullConfig: { - dialect: 'bigquery', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json')); - expect(usage.stats).not.toHaveProperty('rows_produced'); - expect(usage.samples[0]).not.toHaveProperty('rows_produced'); - }); - - it('keeps at most five diverse samples, preferring recent successful representatives per literal tuple', async () => { - const stagedDir = await tempDir(); - const statuses = [ - 'paid', - 'refunded', - 'pending', - 'failed', - 'trial', - 'cancelled', - 'draft', - 'returned', - 'review', - 'held', - 'archived', - ]; - const rows: HistoricSqlRawQueryRow[] = statuses.flatMap((status, index) => [ - { - id: `${status}-old`, - sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`, - user: 'analyst-a', - startedAt: `2026-05-04T10:${String(index).padStart(2, '0')}:00.000Z`, - endedAt: null, - runtimeMs: 100, - rowsProduced: 1, - success: false, - errorMessage: 
'old failed sample', - }, - { - id: `${status}-new`, - sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`, - user: 'analyst-a', - startedAt: `2026-05-04T11:${String(index).padStart(2, '0')}:00.000Z`, - endedAt: null, - runtimeMs: 90, - rowsProduced: 2, - success: true, - errorMessage: null, - }, - ]); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader(rows), - sqlAnalysis: diverseSqlAnalysis, - pullConfig: { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_diverse_samples/usage.json')); - expect(usage.samples).toHaveLength(5); - expect(usage.samples.every((sample) => sample.success)).toBe(true); - expect(new Set(usage.samples.map((sample) => sample.bound_sql.match(/status = '([^']+)'/)?.[1])).size).toBe(5); - expect(usage.samples.map((sample) => sample.started_at)).toEqual([ - '2026-05-04T11:10:00.000Z', - '2026-05-04T11:09:00.000Z', - '2026-05-04T11:08:00.000Z', - '2026-05-04T11:07:00.000Z', - '2026-05-04T11:06:00.000Z', - ]); - }); - - it('uses recency as a tie-breaker when the templates-per-run cap overflows', async () => { - const stagedDir = await tempDir(); - const sqlAnalysis: SqlAnalysisPort = { - async analyzeForFingerprint(sql) { - const table = sql.includes('fresh_orders') ? 
'fresh_orders' : 'stale_orders'; - return { - fingerprint: `fp_${table}`, - normalizedSql: `SELECT count(*) FROM analytics.${table}`, - tablesTouched: [`analytics.${table}`], - literalSlots: [], - }; - }, - }; - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader([ - { - id: 'stale-1', - sql: 'SELECT count(*) FROM analytics.stale_orders', - user: 'analyst-a', - startedAt: '2026-02-04T10:00:00.000Z', - endedAt: null, - runtimeMs: 100, - rowsProduced: 1, - success: true, - errorMessage: null, - }, - { - id: 'fresh-1', - sql: 'SELECT count(*) FROM analytics.fresh_orders', - user: 'analyst-a', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: null, - runtimeMs: 100, - rowsProduced: 1, - success: true, - errorMessage: null, - }, - ]), - sqlAnalysis, - pullConfig: { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: [], - maxTemplatesPerRun: 1, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - expect(manifest.templates.map((template) => template.id)).toEqual(['fp_fresh_orders']); - }); - - it('does not persist bound SQL samples when redaction patterns are invalid', async () => { - const stagedDir = await tempDir(); - - await stageHistoricSqlTemplates({ - stagedDir, - connectionId: 'conn_1', - queryClient: {}, - reader: fakeReader([ - { - id: 'q1', - sql: "SELECT * FROM analytics.orders WHERE email = 'analyst@example.com'", - user: 'analyst@example.com', - startedAt: '2026-05-04T10:00:00.000Z', - endedAt: null, - runtimeMs: 100, - rowsProduced: 1, - success: true, - errorMessage: null, - }, - ]), - sqlAnalysis: { - async analyzeForFingerprint() { - return { - fingerprint: 'fp_redaction', - normalizedSql: 'SELECT * FROM analytics.orders WHERE email = ?', - tablesTouched: ['analytics.orders'], - literalSlots: [{ 
position: 1, type: 'string', exampleValue: 'analyst@example.com' }], - }; - }, - }, - pullConfig: { - dialect: 'snowflake', - windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: [], - redactionPatterns: ['['], - maxTemplatesPerRun: 5000, - minCalls: 5, - }, - now: new Date('2026-05-04T12:00:00.000Z'), - }); - - const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json')); - const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_redaction/usage.json')); - expect(manifest.warnings.some((warning) => warning.startsWith('redaction_skipped:invalid_redaction_pattern'))).toBe( - true, - ); - expect(usage.samples).toEqual([]); - }); -}); diff --git a/packages/context/src/ingest/adapters/historic-sql/stage.ts b/packages/context/src/ingest/adapters/historic-sql/stage.ts deleted file mode 100644 index 9c380a9f..00000000 --- a/packages/context/src/ingest/adapters/historic-sql/stage.ts +++ /dev/null @@ -1,630 +0,0 @@ -import { createHash } from 'node:crypto'; -import { mkdir, writeFile } from 'node:fs/promises'; -import { dirname, join } from 'node:path'; -import type { - SqlAnalysisFingerprintResult, - SqlAnalysisLiteralSlot, - SqlAnalysisLiteralSlotType, - SqlAnalysisPort, -} from '../../../sql-analysis/index.js'; -import { - HISTORIC_SQL_OBJECT_TYPE, - HISTORIC_SQL_SOURCE_KEY, - historicSqlPullConfigSchema, - historicSqlRawQueryRowSchema, - type HistoricSqlLiteralSlotClassification, - type HistoricSqlManifest, - type HistoricSqlMetadata, - type HistoricSqlPullConfig, - type HistoricSqlQueryHistoryReader, - type HistoricSqlRawQueryRow, - type HistoricSqlUsage, -} from './types.js'; - -interface StageHistoricSqlTemplatesInput { - stagedDir: string; - connectionId: string; - queryClient: unknown; - reader: HistoricSqlQueryHistoryReader; - sqlAnalysis: SqlAnalysisPort; - pullConfig: HistoricSqlPullConfig; - now?: Date; -} - -interface SlotObservation { - value: string; - rowStartedAt: string; -} - 
-interface SlotStats { - position: number; - type: SqlAnalysisLiteralSlotType; - values: Map; - observations: SlotObservation[]; -} - -interface TemplateAccumulator { - fingerprint: string; - normalizedSql: string; - tablesTouched: Set; - rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>; - slotStats: Map; -} - -interface ClassifiedLiteralSlot { - position: number; - type: SqlAnalysisLiteralSlotType; - classification: HistoricSqlLiteralSlotClassification; -} - -interface TemplateVariant { - id: string; - fingerprint: string; - subClusterId: string | null; - normalizedSql: string; - tablesTouched: Set; - rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>; - slotStats: Map; - slotClassifications: ClassifiedLiteralSlot[]; -} - -interface CategoricalTupleEntry { - position: number; - value: string; -} - -interface RedactionPolicy { - redactors: RegExp[]; - samplesAllowed: boolean; -} - -const HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i; -const HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i; - -export async function stageHistoricSqlTemplates(input: StageHistoricSqlTemplatesInput): Promise { - const config = historicSqlPullConfigSchema.parse(input.pullConfig); - const now = input.now ?? new Date(); - const windowStart = config.lastSuccessfulCursor - ? 
new Date(config.lastSuccessfulCursor) - : new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000); - const warnings: string[] = []; - const redaction = compileRedactors(config.redactionPatterns, warnings); - const groups = new Map(); - let nextSuccessfulCursor: string | null = null; - - await input.reader.probe(input.queryClient); - - for await (const rawRow of input.reader.fetch( - input.queryClient, - { start: windowStart, end: now }, - config.lastSuccessfulCursor, - )) { - const row = historicSqlRawQueryRowSchema.parse(rawRow); - if (!nextSuccessfulCursor || row.startedAt > nextSuccessfulCursor) { - nextSuccessfulCursor = row.startedAt; - } - if (shouldSkipSql(row.sql)) { - continue; - } - - const analysis = await input.sqlAnalysis.analyzeForFingerprint(row.sql, config.dialect); - if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) { - warnings.push(`analysis_failed:${row.id}`); - continue; - } - - const group = - groups.get(analysis.fingerprint) ?? - { - fingerprint: analysis.fingerprint, - normalizedSql: analysis.normalizedSql, - tablesTouched: new Set(), - rows: [], - slotStats: new Map(), - }; - - for (const table of analysis.tablesTouched) { - group.tablesTouched.add(table); - } - for (const slot of analysis.literalSlots) { - recordSlot(group.slotStats, slot, redaction.redactors, row.startedAt); - } - group.rows.push({ row, analysis }); - groups.set(analysis.fingerprint, group); - } - - const expandedTemplates = expandCategoricalTemplates([...groups.values()], redaction.redactors); - const selected = selectTemplates(expandedTemplates, config.maxTemplatesPerRun, now); - if (selected.length < expandedTemplates.length) { - warnings.push(`templates_truncated: kept ${selected.length} of ${expandedTemplates.length} templates`); - } - - await mkdir(input.stagedDir, { recursive: true }); - const templates: HistoricSqlManifest['templates'] = []; - for (const template of selected) { - const staged = buildStagedTemplate(template, config, 
redaction, now); - const basePath = `templates/${staged.metadata.id}`; - await writeJson(input.stagedDir, `${basePath}/metadata.json`, staged.metadata); - await writeText(input.stagedDir, `${basePath}/page.md`, staged.pageMarkdown); - await writeJson(input.stagedDir, `${basePath}/usage.json`, staged.usage); - templates.push({ - id: staged.metadata.id, - fingerprint: staged.metadata.properties.fingerprint, - subClusterId: staged.metadata.properties.sub_cluster_id, - path: staged.metadata.path, - }); - } - - await writeJson(input.stagedDir, 'manifest.json', { - source: HISTORIC_SQL_SOURCE_KEY, - connectionId: input.connectionId, - dialect: config.dialect, - fetchedAt: now.toISOString(), - windowStart: windowStart.toISOString(), - windowEnd: now.toISOString(), - nextSuccessfulCursor, - templateCount: selected.length, - capped: selected.length < expandedTemplates.length, - warnings, - degraded: false, - statsResetAt: null, - baselineFirstRun: false, - pgServerVersion: null, - deallocCount: null, - templates, - } satisfies HistoricSqlManifest); -} - -function shouldSkipSql(sql: string): boolean { - return HARD_SKIP_PREFIX_RE.test(sql) || HARD_SKIP_TABLE_RE.test(sql); -} - -function recordSlot( - slotStats: Map, - slot: SqlAnalysisLiteralSlot, - redactors: RegExp[], - rowStartedAt: string, -): void { - const existing = slotStats.get(slot.position) ?? { - position: slot.position, - type: slot.type, - values: new Map(), - observations: [], - }; - const persistedValue = redactText(slot.exampleValue, redactors); - existing.values.set(persistedValue, (existing.values.get(persistedValue) ?? 
0) + 1); - existing.observations.push({ value: persistedValue, rowStartedAt }); - slotStats.set(slot.position, existing); -} - -function expandCategoricalTemplates(groups: TemplateAccumulator[], redactors: RegExp[]): TemplateVariant[] { - return groups.flatMap((group) => expandTemplateGroup(group, redactors)); -} - -function expandTemplateGroup(group: TemplateAccumulator, redactors: RegExp[]): TemplateVariant[] { - const rows = [...group.rows].sort((left, right) => left.row.startedAt.localeCompare(right.row.startedAt)); - const firstSeen = rows[0]?.row.startedAt; - if (!firstSeen) { - return []; - } - - const slotClassifications = classifySlots(group.slotStats, rows.length, firstSeen); - const categoricalPositions = slotClassifications - .filter((slot) => slot.classification === 'categorical') - .map((slot) => slot.position) - .sort((left, right) => left - right); - - if (categoricalPositions.length === 0) { - return [ - { - id: group.fingerprint, - fingerprint: group.fingerprint, - subClusterId: null, - normalizedSql: group.normalizedSql, - tablesTouched: group.tablesTouched, - rows, - slotStats: group.slotStats, - slotClassifications, - }, - ]; - } - - const byTuple = new Map< - string, - { - tuple: CategoricalTupleEntry[]; - rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>; - } - >(); - - for (const entry of rows) { - const tuple = categoricalTuple(entry.analysis.literalSlots, categoricalPositions, redactors); - const key = JSON.stringify(tuple); - const existing = byTuple.get(key) ?? 
{ tuple, rows: [] }; - existing.rows.push(entry); - byTuple.set(key, existing); - } - - return [...byTuple.values()] - .map(({ tuple, rows: tupleRows }) => { - const subClusterId = subClusterIdForTuple(tuple); - return { - id: `${group.fingerprint}__${subClusterId}`, - fingerprint: group.fingerprint, - subClusterId, - normalizedSql: group.normalizedSql, - tablesTouched: group.tablesTouched, - rows: tupleRows, - slotStats: collectSlotStats(tupleRows, redactors), - slotClassifications, - }; - }) - .sort((left, right) => left.id.localeCompare(right.id)); -} - -function classifySlots( - slotStats: Map, - executions: number, - firstSeen: string, -): ClassifiedLiteralSlot[] { - return [...slotStats.values()] - .sort((left, right) => left.position - right.position) - .map((slot) => ({ - position: slot.position, - type: slot.type, - classification: classifySlot(slot, executions, firstSeen), - })); -} - -function collectSlotStats( - rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>, - redactors: RegExp[], -): Map { - const slotStats = new Map(); - for (const entry of rows) { - for (const slot of entry.analysis.literalSlots) { - recordSlot(slotStats, slot, redactors, entry.row.startedAt); - } - } - return slotStats; -} - -function categoricalTuple( - literalSlots: SqlAnalysisLiteralSlot[], - categoricalPositions: number[], - redactors: RegExp[], -): CategoricalTupleEntry[] { - const valuesByPosition = new Map( - literalSlots.map((slot) => [slot.position, redactText(slot.exampleValue, redactors)] as const), - ); - return categoricalPositions.map((position) => ({ - position, - value: valuesByPosition.get(position) ?? 
'', - })); -} - -function subClusterIdForTuple(tuple: CategoricalTupleEntry[]): string { - return `cat_${createHash('sha256').update(JSON.stringify(tuple)).digest('hex').slice(0, 12)}`; -} - -function buildStagedTemplate( - template: TemplateVariant, - config: HistoricSqlPullConfig, - redaction: RedactionPolicy, - now: Date, -): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } { - const rows = template.rows - .map((entry) => entry.row) - .sort((left, right) => left.startedAt.localeCompare(right.startedAt)); - const firstSeen = rows[0].startedAt; - const lastSeen = rows[rows.length - 1].startedAt; - const distinctUsers = new Set(rows.map((row) => row.user).filter((user): user is string => !!user)).size; - const errorCount = rows.filter((row) => !row.success).length; - const runtimes = rows - .map((row) => row.runtimeMs) - .filter((runtime): runtime is number => typeof runtime === 'number') - .sort((left, right) => left - right); - const triageSignals = buildTriageSignals({ - executions: rows.length, - distinctUsers, - errorRate: rows.length === 0 ? 0 : errorCount / rows.length, - lastSeen, - now, - serviceAccountOnly: isServiceAccountOnly(rows, config.serviceAccountUserPatterns), - slotClassifications: template.slotClassifications.map((slot) => slot.classification), - }); - const tablesTouched = [...template.tablesTouched].sort(); - const firstTable = tablesTouched[0] ?? 
'query'; - const id = template.id; - const rowsProduced = sumRowsProduced(rows); - const metadata: HistoricSqlMetadata = { - id, - title: buildTemplateTitle(config.dialect, firstTable, template.fingerprint, template.subClusterId), - path: `templates/${id}/page.md`, - objectType: HISTORIC_SQL_OBJECT_TYPE, - lastEditedAt: null, - properties: { - fingerprint: template.fingerprint, - sub_cluster_id: template.subClusterId, - dialect: config.dialect, - tables_touched: tablesTouched, - literal_slots: template.slotClassifications, - triage_signals: triageSignals, - }, - }; - - return { - metadata, - pageMarkdown: renderTemplatePage(id, template.normalizedSql, tablesTouched), - usage: { - stats: { - executions: rows.length, - distinct_users: distinctUsers, - first_seen: firstSeen, - last_seen: lastSeen, - p50_runtime_ms: percentile(runtimes, 0.5), - p95_runtime_ms: percentile(runtimes, 0.95), - error_rate: rows.length === 0 ? 0 : errorCount / rows.length, - ...(rowsProduced === null ? {} : { rows_produced: rowsProduced }), - }, - literal_slots: [...template.slotStats.values()] - .sort((left, right) => left.position - right.position) - .map((slot) => ({ - position: slot.position, - distinct_values: slot.values.size, - top_values: [...slot.values.entries()] - .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0])) - .slice(0, 10), - })), - samples: selectSamples(template.rows, redaction), - }, - }; -} - -const TEMPORAL_SLOT_TYPES = new Set(['date', 'timestamp']); - -function isStaleDateConstant(slot: SlotStats, value: string, firstSeen: string): boolean { - return slot.type === 'date' && parseTemporalSlotValue(value) !== null && value < firstSeen.slice(0, 10); -} - -function isMovingTemporalSlot(slot: SlotStats): boolean { - if (!TEMPORAL_SLOT_TYPES.has(slot.type) || slot.values.size < 2) { - return false; - } - - const observations: Array<{ rowStartedAt: number; literalTime: number }> = []; - for (const observation of slot.observations) { - const 
rowStartedAt = Date.parse(observation.rowStartedAt); - const literalTime = parseTemporalSlotValue(observation.value); - if (Number.isNaN(rowStartedAt) || literalTime === null) { - return false; - } - observations.push({ rowStartedAt, literalTime }); - } - - const literalTimes = observations - .sort((left, right) => left.rowStartedAt - right.rowStartedAt) - .map((observation) => observation.literalTime); - - return isMonotonic(literalTimes); -} - -function parseTemporalSlotValue(value: string): number | null { - const parsed = Date.parse(value); - return Number.isNaN(parsed) ? null : parsed; -} - -function isMonotonic(values: number[]): boolean { - if (values.length < 2) { - return false; - } - - let nonDecreasing = true; - let nonIncreasing = true; - for (let index = 1; index < values.length; index += 1) { - if (values[index] < values[index - 1]) { - nonDecreasing = false; - } - if (values[index] > values[index - 1]) { - nonIncreasing = false; - } - } - - return nonDecreasing || nonIncreasing; -} - -function classifySlot( - slot: SlotStats, - executions: number, - firstSeen: string, -): HistoricSqlLiteralSlotClassification { - const ordered = [...slot.values.entries()].sort((left, right) => right[1] - left[1]); - const distinct = ordered.length; - const topCount = ordered[0]?.[1] ?? 0; - const topValue = ordered[0]?.[0] ?? 
''; - const staleDateConstant = isStaleDateConstant(slot, topValue, firstSeen); - - if (distinct === 1 && !staleDateConstant) { - return 'constant'; - } - if (executions > 0 && topCount / executions >= 0.95 && !staleDateConstant) { - return 'constant'; - } - if (isMovingTemporalSlot(slot)) { - return 'runtime'; - } - if (executions > 0 && distinct >= 2 && distinct <= 10 && ordered.every(([, count]) => count / executions >= 0.05)) { - return 'categorical'; - } - return 'runtime'; -} - -function buildTriageSignals(input: { - executions: number; - distinctUsers: number; - errorRate: number; - lastSeen: string; - now: Date; - serviceAccountOnly: boolean; - slotClassifications: HistoricSqlLiteralSlotClassification[]; -}): Record { - const runtimeCount = input.slotClassifications.filter((classification) => classification === 'runtime').length; - const constantCount = input.slotClassifications.filter((classification) => classification === 'constant').length; - return { - executions_bucket: input.executions < 3 ? 'low' : input.executions < 50 ? 'mid' : 'high', - distinct_users_bucket: input.distinctUsers <= 1 ? 'solo' : input.distinctUsers <= 5 ? 'team' : 'broad', - error_rate_bucket: input.errorRate <= 0.01 ? 'ok' : input.errorRate <= 0.1 ? 
'noisy' : 'broken', - recency_bucket: recencyBucket(input.lastSeen, input.now), - service_account_only: String(input.serviceAccountOnly), - slot_summary: `${constantCount} constant, ${runtimeCount} runtime`, - }; -} - -function recencyBucket(lastSeen: string, now: Date): string { - const ageDays = Math.max(0, (now.getTime() - new Date(lastSeen).getTime()) / (24 * 60 * 60 * 1000)); - if (ageDays <= 14) { - return 'active'; - } - if (ageDays <= 60) { - return 'warm'; - } - return 'cold'; -} - -function isServiceAccountOnly(rows: HistoricSqlRawQueryRow[], patterns: string[]): boolean { - const users = rows.map((row) => row.user).filter((user): user is string => !!user); - if (users.length === 0 || patterns.length === 0) { - return false; - } - const regexes = patterns.map((pattern) => new RegExp(pattern)); - return users.every((user) => regexes.some((regex) => regex.test(user))); -} - -function buildTemplateTitle( - dialect: HistoricSqlPullConfig['dialect'], - firstTable: string, - fingerprint: string, - subClusterId: string | null, -): string { - if (!subClusterId) { - return `${dialect} · ${firstTable} [${fingerprint.slice(0, 6)}]`; - } - return `${dialect} · ${firstTable} [${fingerprint.slice(0, 6)}:${subClusterId.slice(-6)}]`; -} - -function renderTemplatePage(fingerprint: string, normalizedSql: string, tablesTouched: string[]): string { - return [ - `# ${fingerprint}`, - '', - '## Normalized SQL', - '```sql', - normalizedSql, - '```', - '', - '## Tables touched', - ...tablesTouched.map((table) => `- ${table}`), - '', - ].join('\n'); -} - -function selectSamples( - rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>, - redaction: RedactionPolicy, -): HistoricSqlUsage['samples'] { - if (!redaction.samplesAllowed) { - return []; - } - - const byLiteralTuple = new Map(); - const preferred = [...rows].sort((left, right) => { - if (left.row.success !== right.row.success) { - return left.row.success ? 
-1 : 1; - } - return right.row.startedAt.localeCompare(left.row.startedAt); - }); - - for (const entry of preferred) { - const key = [...entry.analysis.literalSlots] - .sort((left, right) => left.position - right.position) - .map((slot) => slot.exampleValue) - .join('\u001f'); - if (!byLiteralTuple.has(key)) { - byLiteralTuple.set(key, entry); - } - } - - return [...byLiteralTuple.values()] - .sort((left, right) => right.row.startedAt.localeCompare(left.row.startedAt)) - .slice(0, 5) - .map(({ row }) => ({ - started_at: row.startedAt, - user: row.user, - bound_sql: redactText(row.sql, redaction.redactors), - ...(row.rowsProduced === undefined ? {} : { rows_produced: row.rowsProduced ?? null }), - runtime_ms: row.runtimeMs, - success: row.success, - })); -} - -function selectTemplates(templates: TemplateVariant[], maxTemplatesPerRun: number, now: Date): TemplateVariant[] { - return templates - .map((template) => ({ template, score: rankTemplate(template, now) })) - .sort((left, right) => right.score - left.score || left.template.id.localeCompare(right.template.id)) - .slice(0, maxTemplatesPerRun) - .map((entry) => entry.template); -} - -function rankTemplate(template: TemplateVariant, now: Date): number { - const users = new Set(template.rows.map(({ row }) => row.user).filter((user): user is string => !!user)).size; - const latestStartedAt = template.rows.reduce( - (latest, { row }) => (latest === null || row.startedAt > latest ? row.startedAt : latest), - null, - ); - const ageDays = - latestStartedAt === null ? 
365 : Math.max(0, (now.getTime() - new Date(latestStartedAt).getTime()) / 86400000); - const recencyWeight = 1 / (1 + ageDays / 30); - return users * Math.log1p(template.rows.length) * recencyWeight; -} - -function percentile(values: number[], percentileValue: number): number | null { - if (values.length === 0) { - return null; - } - const index = Math.min(values.length - 1, Math.max(0, Math.ceil(values.length * percentileValue) - 1)); - return values[index]; -} - -function sumRowsProduced(rows: HistoricSqlRawQueryRow[]): number | null { - const values = rows.map((row) => row.rowsProduced).filter((value): value is number => typeof value === 'number'); - return values.length > 0 ? values.reduce((sum, value) => sum + value, 0) : null; -} - -function compileRedactors(patterns: string[], warnings: string[]): RedactionPolicy { - let samplesAllowed = true; - const redactors = patterns.flatMap((pattern) => { - try { - return [new RegExp(pattern, 'g')]; - } catch (error) { - samplesAllowed = false; - warnings.push( - `redaction_skipped:invalid_redaction_pattern:${pattern}:${error instanceof Error ? 
error.message : String(error)}`, - ); - return []; - } - }); - return { redactors, samplesAllowed }; -} - -function redactText(value: string, redactors: RegExp[]): string { - return redactors.reduce((current, regex) => current.replace(regex, ''), value); -} - -async function writeJson(stagedDir: string, relPath: string, value: unknown): Promise { - await writeText(stagedDir, relPath, `${JSON.stringify(value, null, 2)}\n`); -} - -async function writeText(stagedDir: string, relPath: string, value: string): Promise { - const target = join(stagedDir, relPath); - await mkdir(dirname(target), { recursive: true }); - await writeFile(target, value, 'utf-8'); -} diff --git a/packages/context/src/ingest/adapters/historic-sql/types.test.ts b/packages/context/src/ingest/adapters/historic-sql/types.test.ts new file mode 100644 index 00000000..076e5d8e --- /dev/null +++ b/packages/context/src/ingest/adapters/historic-sql/types.test.ts @@ -0,0 +1,98 @@ +import { describe, expect, it } from 'vitest'; +import { + aggregatedTemplateSchema, + historicSqlUnifiedPullConfigSchema, + stagedManifestSchema, + stagedPatternsInputSchema, + stagedTableInputSchema, +} from './types.js'; + +describe('historic-sql unified contracts', () => { + it('parses minExecutions and accepts minCalls as a one-release alias', () => { + expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minExecutions: 9 })).toMatchObject({ + dialect: 'postgres', + minExecutions: 9, + windowDays: 90, + concurrency: 12, + redactionPatterns: [], + staleArchiveAfterDays: 90, + }); + + expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minCalls: 7 }).minExecutions).toBe(7); + }); + + it('validates aggregate templates from warehouse readers', () => { + const parsed = aggregatedTemplateSchema.parse({ + templateId: 'pg:123', + canonicalSql: 'select status, count(*) from public.orders group by status', + dialect: 'postgres', + stats: { + executions: 42, + distinctUsers: 3, + firstSeen: 
'2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 12.5, + p95RuntimeMs: 40, + errorRate: 0, + rowsProduced: 100, + }, + topUsers: [{ user: 'analyst', executions: 40 }], + }); + + expect(parsed.templateId).toBe('pg:123'); + expect(parsed.topUsers).toEqual([{ user: 'analyst', executions: 40 }]); + }); + + it('validates staged table, patterns, and manifest artifacts', () => { + expect( + stagedTableInputSchema.parse({ + table: 'public.orders', + stats: { + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + errorRateBucket: 'none', + p95RuntimeBucket: '<100ms', + recencyBucket: 'current', + }, + columnsByClause: { + select: [['status', 'high']], + where: [['created_at', 'mid']], + }, + observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }], + topTemplates: [{ id: 'pg:123', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }], + }).table, + ).toBe('public.orders'); + + expect( + stagedPatternsInputSchema.parse({ + templates: [ + { + id: 'pg:123', + canonicalSql: 'select * from public.orders', + tablesTouched: ['public.orders'], + executionsBucket: '10-100', + distinctUsersBucket: '2-5', + dialect: 'postgres', + }, + ], + }).templates, + ).toHaveLength(1); + + expect( + stagedManifestSchema.parse({ + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 2, + touchedTableCount: 1, + parseFailures: 1, + warnings: ['parse_failed:bad'], + probeWarnings: [], + staleArchiveAfterDays: 90, + }).staleArchiveAfterDays, + ).toBe(90); + }); +}); diff --git a/packages/context/src/ingest/adapters/historic-sql/types.ts b/packages/context/src/ingest/adapters/historic-sql/types.ts index 0cd3d01a..a827e8ae 100644 --- a/packages/context/src/ingest/adapters/historic-sql/types.ts +++ 
b/packages/context/src/ingest/adapters/historic-sql/types.ts @@ -2,200 +2,161 @@ import { z } from 'zod'; import type { SqlAnalysisPort } from '../../../sql-analysis/index.js'; export const HISTORIC_SQL_SOURCE_KEY = 'historic-sql' as const; -export const HISTORIC_SQL_OBJECT_TYPE = 'historic_sql_template' as const; const historicSqlDialectSchema = z.enum(['snowflake', 'bigquery', 'postgres']); export type HistoricSqlDialect = z.infer; -export const historicSqlPullConfigSchema = z.object({ +const filterModeSchema = z.enum(['exclude', 'include', 'mark-only']); + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +export const historicSqlUnifiedPullConfigSchema = z.preprocess((value) => { + if (!isRecord(value)) { + return value; + } + const next: Record = { ...value }; + if (next.minExecutions === undefined && typeof next.minCalls === 'number') { + next.minExecutions = next.minCalls; + } + if (!next.filters && Array.isArray(next.serviceAccountUserPatterns)) { + next.filters = { + serviceAccounts: { patterns: next.serviceAccountUserPatterns, mode: 'exclude' }, + dropTrivialProbes: true, + }; + } + return next; +}, z.object({ dialect: historicSqlDialectSchema, - windowDays: z.number().int().min(1).max(365).default(90), - lastSuccessfulCursor: z.string().datetime().nullable().default(null), - serviceAccountUserPatterns: z.array(z.string()).default([]), + windowDays: z.number().int().positive().default(90), + minExecutions: z.number().int().nonnegative().default(5), + concurrency: z.number().int().positive().default(12), + filters: z.object({ + serviceAccounts: z.object({ + patterns: z.array(z.string()).default([]), + mode: filterModeSchema.default('exclude'), + }).optional(), + orchestrators: z.object({ + mode: filterModeSchema.default('mark-only'), + }).optional(), + dropTrivialProbes: z.boolean().default(true), + dropFailedBelow: z.object({ + errorRate: z.number().min(0).max(1), + 
executions: z.number().int().nonnegative(), + }).optional(), + }).default({ dropTrivialProbes: true }), redactionPatterns: z.array(z.string()).default([]), - maxTemplatesPerRun: z.number().int().min(1).max(5000).default(5000), - minCalls: z.number().int().min(1).default(5), + staleArchiveAfterDays: z.number().int().positive().default(90), +})); + +export type HistoricSqlUnifiedPullConfig = z.infer; + +export const aggregatedTemplateSchema = z.object({ + templateId: z.string().min(1), + canonicalSql: z.string().min(1), + dialect: historicSqlDialectSchema, + stats: z.object({ + executions: z.number().int().nonnegative(), + distinctUsers: z.number().int().nonnegative(), + firstSeen: z.iso.datetime(), + lastSeen: z.iso.datetime(), + p50RuntimeMs: z.number().nonnegative().nullable(), + p95RuntimeMs: z.number().nonnegative().nullable(), + errorRate: z.number().min(0).max(1), + rowsProduced: z.number().int().nonnegative().nullable(), + }), + topUsers: z.array(z.object({ + user: z.string().nullable(), + executions: z.number().int().nonnegative(), + })).default([]), }); -export type HistoricSqlPullConfig = z.infer; +export type AggregatedTemplate = z.infer; + +export const stagedTableInputSchema = z.object({ + table: z.string().min(1), + stats: z.object({ + executionsBucket: z.string(), + distinctUsersBucket: z.string(), + errorRateBucket: z.string(), + p95RuntimeBucket: z.string(), + recencyBucket: z.string(), + }), + columnsByClause: z.record(z.string(), z.array(z.tuple([z.string(), z.string()]))), + observedJoins: z.array(z.object({ + withTable: z.string(), + on: z.array(z.string()), + freq: z.string(), + })), + topTemplates: z.array(z.object({ + id: z.string(), + canonicalSql: z.string(), + topUsers: z.array(z.object({ user: z.string().nullable() })), + })), +}); +export type StagedTableInput = z.infer; + +export const stagedPatternsInputSchema = z.object({ + templates: z.array(z.object({ + id: z.string(), + canonicalSql: z.string(), + tablesTouched: 
z.array(z.string()), + executionsBucket: z.string(), + distinctUsersBucket: z.string(), + dialect: historicSqlDialectSchema, + })), +}); +export type StagedPatternsInput = z.infer; + +export const stagedManifestSchema = z.object({ + source: z.literal(HISTORIC_SQL_SOURCE_KEY), + connectionId: z.string().min(1), + dialect: historicSqlDialectSchema, + fetchedAt: z.iso.datetime(), + windowStart: z.iso.datetime(), + windowEnd: z.iso.datetime(), + snapshotRowCount: z.number().int().nonnegative(), + touchedTableCount: z.number().int().nonnegative(), + parseFailures: z.number().int().nonnegative(), + warnings: z.array(z.string()), + probeWarnings: z.array(z.string()), + staleArchiveAfterDays: z.number().int().positive().default(90), +}); +export type StagedManifest = z.infer; + +export interface HistoricSqlProbeResult { + warnings: string[]; + info?: string[]; +} + +export interface HistoricSqlReader { + probe(client: unknown): Promise; + fetchAggregated( + client: unknown, + window: HistoricSqlTimeWindow, + config: HistoricSqlUnifiedPullConfig, + ): AsyncIterable; +} export interface HistoricSqlTimeWindow { start: Date; end: Date; } -export const historicSqlRawQueryRowSchema = z.object({ - id: z.string().min(1), - sql: z.string().min(1), - user: z.string().nullable().default(null), - startedAt: z.string().datetime(), - endedAt: z.string().datetime().nullable().default(null), - runtimeMs: z.number().nonnegative().nullable().default(null), - rowsProduced: z.number().int().nonnegative().nullable().optional(), - success: z.boolean().default(true), - errorMessage: z.string().nullable().default(null), -}); -export type HistoricSqlRawQueryRow = z.infer; - -export interface HistoricSqlQueryHistoryReader { - probe(client: unknown): Promise; - fetch( - client: unknown, - window: HistoricSqlTimeWindow, - cursor?: string | null, - ): AsyncIterable; -} - export interface KtxPostgresQueryClient { executeQuery(sql: string, params?: unknown[]): Promise<{ headers: string[]; rows: 
unknown[][]; totalRows?: number }>; } -export interface PostgresPgssProbeResult { +export interface PostgresPgssProbeResult extends HistoricSqlProbeResult { pgServerVersion: string; warnings: string[]; -} - -export interface PostgresPgssSnapshot { - statsResetAt: string | null; - deallocCount: number | null; - rows: PostgresPgssRow[]; -} - -export interface PostgresPgssReader { - probe(client: KtxPostgresQueryClient): Promise; - readSnapshot( - client: KtxPostgresQueryClient, - options: { minCalls: number; maxTemplates: number }, - ): Promise; -} - -export interface PostgresPgssRow { - queryid: string; - userid: string; - username: string | null; - dbid: string; - database: string | null; - query: string; - calls: number; - totalExecTime: number; - meanExecTime: number; - totalRows: number; -} - -export interface PostgresPgssAggregateRow { - id: string; - queryid: string; - dbid: string; - database: string | null; - query: string; - deltaCalls: number; - deltaExecTime: number; - deltaRows: number; - meanExecTime: number; - distinctUsersDelta: number; - users: string[]; - firstObservedAt: string; + info: string[]; } export interface HistoricSqlSourceAdapterDeps { sqlAnalysis: SqlAnalysisPort; - reader: HistoricSqlQueryHistoryReader; + reader: HistoricSqlReader; queryClient: unknown; - postgresReader?: PostgresPgssReader; - postgresQueryClient?: KtxPostgresQueryClient; - postgresBaselineRootDir?: string; + legacyPostgresBaselineRootDir?: string; now?: () => Date; - onPullSucceeded?: (ctx: { - connectionId: string; - sourceKey: string; - syncId: string; - trigger: import('../../types.js').IngestTrigger; - completedAt: Date; - stagedDir: string; - nextSuccessfulCursor: string | null; - }) => Promise; } - -const historicSqlLiteralSlotClassificationSchema = z.enum(['constant', 'runtime', 'categorical']); -export type HistoricSqlLiteralSlotClassification = z.infer; - -export const historicSqlMetadataSchema = z.object({ - id: z.string().min(1), - title: z.string().min(1), 
- path: z.string().min(1), - objectType: z.literal(HISTORIC_SQL_OBJECT_TYPE), - lastEditedAt: z.null(), - properties: z.object({ - fingerprint: z.string().min(1), - sub_cluster_id: z.string().nullable(), - dialect: historicSqlDialectSchema, - tables_touched: z.array(z.string()), - literal_slots: z.array( - z.object({ - position: z.number().int().min(1), - type: z.enum(['string', 'number', 'timestamp', 'date', 'boolean', 'null', 'unknown']), - classification: historicSqlLiteralSlotClassificationSchema, - }), - ), - triage_signals: z.record(z.string(), z.string()), - }), -}); -export type HistoricSqlMetadata = z.infer; - -export const historicSqlUsageSchema = z.object({ - stats: z.object({ - executions: z.number().int().nonnegative(), - distinct_users: z.number().int().nonnegative(), - first_seen: z.string().datetime(), - last_seen: z.string().datetime(), - p50_runtime_ms: z.number().nonnegative().nullable(), - p95_runtime_ms: z.number().nonnegative().nullable(), - mean_runtime_ms: z.number().nonnegative().nullable().optional(), - error_rate: z.number().min(0).max(1), - rows_produced: z.number().int().nonnegative().nullable().optional(), - }), - literal_slots: z.array( - z.object({ - position: z.number().int().min(1), - distinct_values: z.number().int().nonnegative(), - top_values: z.array(z.tuple([z.string(), z.number().int().nonnegative()])), - }), - ), - samples: z.array( - z.object({ - started_at: z.string().datetime(), - user: z.string().nullable(), - bound_sql: z.string(), - rows_produced: z.number().int().nonnegative().nullable().optional(), - runtime_ms: z.number().nonnegative().nullable(), - success: z.boolean(), - }), - ), -}); -export type HistoricSqlUsage = z.infer; - -export const historicSqlManifestSchema = z.object({ - source: z.literal(HISTORIC_SQL_SOURCE_KEY), - connectionId: z.string().min(1), - dialect: historicSqlDialectSchema, - fetchedAt: z.string().datetime(), - windowStart: z.string().datetime(), - windowEnd: z.string().datetime(), - 
nextSuccessfulCursor: z.string().datetime().nullable(), - templateCount: z.number().int().nonnegative(), - capped: z.boolean(), - warnings: z.array(z.string()), - degraded: z.boolean().default(false), - statsResetAt: z.string().datetime().nullable().default(null), - baselineFirstRun: z.boolean().default(false), - pgServerVersion: z.string().nullable().default(null), - deallocCount: z.number().int().nonnegative().nullable().default(null), - templates: z.array( - z.object({ - id: z.string().min(1), - fingerprint: z.string().min(1), - subClusterId: z.string().nullable(), - path: z.string().min(1), - }), - ), -}); -export type HistoricSqlManifest = z.infer; diff --git a/packages/context/src/ingest/adapters/live-database/manifest.test.ts b/packages/context/src/ingest/adapters/live-database/manifest.test.ts index 75a41067..a97140a9 100644 --- a/packages/context/src/ingest/adapters/live-database/manifest.test.ts +++ b/packages/context/src/ingest/adapters/live-database/manifest.test.ts @@ -186,6 +186,62 @@ describe('buildLiveDatabaseManifestShards', () => { }); }); + it('preserves external usage keys while replacing historic SQL managed keys', () => { + const existingUsage = new Map([ + [ + 'orders', + { + narrative: 'Old generated usage narrative.', + frequencyTier: 'low' as const, + commonFilters: ['old_status'], + commonJoins: [], + ownerNote: 'Pinned analyst note', + }, + ], + ]); + + const result = buildLiveDatabaseManifestShards({ + connectionType: 'POSTGRESQL', + mapColumnType: (nativeType) => nativeType.toLowerCase(), + existingUsage, + tables: [ + { + name: 'orders', + catalog: null, + db: 'public', + usage: { + narrative: 'Fresh generated usage narrative.', + frequencyTier: 'high', + commonFilters: ['status'], + commonGroupBys: ['created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + columns: [{ name: 'id', type: 'INTEGER' }], + }, + ], + joins: [], + }); + + expect(shardObject(result.shards)).toEqual({ + public: { + tables: { + 
orders: { + table: 'public.orders', + usage: { + ownerNote: 'Pinned analyst note', + narrative: 'Fresh generated usage narrative.', + frequencyTier: 'high', + commonFilters: ['status'], + commonGroupBys: ['created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + columns: [{ name: 'id', type: 'integer' }], + }, + }, + }, + }); + }); + it('renders ordered multi-column joins in both directions', () => { const result = buildLiveDatabaseManifestShards({ connectionType: 'POSTGRESQL', diff --git a/packages/context/src/ingest/adapters/live-database/manifest.ts b/packages/context/src/ingest/adapters/live-database/manifest.ts index d7315f9e..c6a6e2d5 100644 --- a/packages/context/src/ingest/adapters/live-database/manifest.ts +++ b/packages/context/src/ingest/adapters/live-database/manifest.ts @@ -1,3 +1,5 @@ +import type { TableUsageOutput } from '../historic-sql/skill-schemas.js'; + const RELATIONSHIP_MAP: Record = { MANY_TO_ONE: 'many_to_one', ONE_TO_MANY: 'one_to_many', @@ -11,6 +13,14 @@ const RELATIONSHIP_INVERSE: Record = { }; const SCAN_MANAGED_DESCRIPTION_KEYS = new Set(['db', 'ai']); +const HISTORIC_SQL_MANAGED_USAGE_KEYS = new Set([ + 'narrative', + 'frequencyTier', + 'commonFilters', + 'commonGroupBys', + 'commonJoins', + 'staleSince', +]); export interface LiveDatabaseManifestColumn { name: string; @@ -30,6 +40,7 @@ export interface LiveDatabaseManifestJoinEntry { export interface LiveDatabaseManifestTableEntry { table: string; descriptions?: Record; + usage?: TableUsageOutput; columns: LiveDatabaseManifestColumn[]; joins?: LiveDatabaseManifestJoinEntry[]; } @@ -43,6 +54,7 @@ export interface LiveDatabaseManifestTableData { catalog: string | null; db: string | null; descriptions?: Record; + usage?: TableUsageOutput; columns: Array<{ name: string; type: string; @@ -73,6 +85,7 @@ export interface BuildLiveDatabaseManifestShardsInput { mapColumnType: (nativeType: string) => string; existingPreservedJoins?: Map; existingDescriptions?: 
Map; + existingUsage?: Map; } export interface BuildLiveDatabaseManifestShardsResult { @@ -101,6 +114,28 @@ function mergeDescriptionsPreservingExternal( return Object.keys(result).length > 0 ? result : undefined; } +export function mergeUsagePreservingExternal( + existing: TableUsageOutput | undefined, + incoming: TableUsageOutput | undefined, +): TableUsageOutput | undefined { + if (!existing && !incoming) { + return undefined; + } + if (!incoming) { + return existing ? { ...existing } : undefined; + } + const result: Record = {}; + if (existing) { + for (const [key, value] of Object.entries(existing)) { + if (!HISTORIC_SQL_MANAGED_USAGE_KEYS.has(key)) { + result[key] = value; + } + } + } + Object.assign(result, incoming); + return Object.keys(result).length > 0 ? (result as TableUsageOutput) : undefined; +} + function getShardKey(connectionType: string, catalog: string | null, db: string | null): string { const normalized = connectionType.toUpperCase(); @@ -254,6 +289,11 @@ export function buildLiveDatabaseManifestShards( entry.descriptions = tableDescriptions; } + const usage = mergeUsagePreservingExternal(input.existingUsage?.get(table.name), table.usage); + if (usage) { + entry.usage = usage; + } + const tableJoins = joinsByTable.get(table.name); if (tableJoins && tableJoins.length > 0) { entry.joins = tableJoins; diff --git a/packages/context/src/ingest/index.ts b/packages/context/src/ingest/index.ts index 9991391f..3bcd5770 100644 --- a/packages/context/src/ingest/index.ts +++ b/packages/context/src/ingest/index.ts @@ -317,7 +317,8 @@ export type { export { NOTION_ORG_KNOWLEDGE_WARNING } from './adapters/notion/chunk.js'; export { NotionSourceAdapter, type NotionSourceAdapterDeps } from './adapters/notion/notion.adapter.js'; export { NotionClient, type NotionApi, type NotionBotInfo } from './adapters/notion/notion-client.js'; -export { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './adapters/historic-sql/chunk.js'; +export { 
bucketDistinctUsers, bucketErrorRate, bucketExecutions, bucketP95Runtime, bucketRecency } from './adapters/historic-sql/buckets.js'; +export { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './adapters/historic-sql/chunk-unified.js'; export { detectHistoricSqlStagedDir } from './adapters/historic-sql/detect.js'; export { HistoricSqlExtensionMissingError, @@ -327,41 +328,55 @@ export { export { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js'; export { BigQueryHistoricSqlQueryHistoryReader } from './adapters/historic-sql/bigquery-query-history-reader.js'; export type { BigQueryHistoricSqlQueryHistoryReaderOptions } from './adapters/historic-sql/bigquery-query-history-reader.js'; -export { PostgresPgssQueryHistoryReader } from './adapters/historic-sql/postgres-pgss-query-history-reader.js'; +export { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js'; export { SnowflakeHistoricSqlQueryHistoryReader } from './adapters/historic-sql/snowflake-query-history-reader.js'; -export { stageHistoricSqlTemplates } from './adapters/historic-sql/stage.js'; +export { stageHistoricSqlAggregatedSnapshot } from './adapters/historic-sql/stage-unified.js'; export { - pgssBaselinePath, - readPgssBaseline, - stagePgStatStatementsTemplates, - writePgssBaselineAtomic, -} from './adapters/historic-sql/stage-pgss.js'; -export type { PgssBaseline, StagePgStatStatementsTemplatesResult } from './adapters/historic-sql/stage-pgss.js'; + historicSqlEvidenceEnvelopeSchema, + historicSqlEvidencePath, + historicSqlPatternEvidenceSchema, + historicSqlTableUsageEvidenceSchema, + serializeHistoricSqlEvidence, +} from './adapters/historic-sql/evidence.js'; export type { + HistoricSqlEvidenceEnvelope, + HistoricSqlPatternEvidence, + HistoricSqlTableUsageEvidence, +} from './adapters/historic-sql/evidence.js'; +export { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js'; +export { 
HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js'; +export { projectHistoricSqlEvidence } from './adapters/historic-sql/projection.js'; +export type { HistoricSqlProjectionInput, HistoricSqlProjectionResult } from './adapters/historic-sql/projection.js'; +export { + patternOutputSchema, + patternsArraySchema, + tableUsageOutputSchema, +} from './adapters/historic-sql/skill-schemas.js'; +export type { + PatternOutput, + TableUsageOutput, +} from './adapters/historic-sql/skill-schemas.js'; +export type { + AggregatedTemplate, HistoricSqlDialect, - HistoricSqlManifest, - HistoricSqlMetadata, - HistoricSqlPullConfig, - HistoricSqlQueryHistoryReader, - HistoricSqlRawQueryRow, + HistoricSqlProbeResult, + HistoricSqlReader, HistoricSqlSourceAdapterDeps, HistoricSqlTimeWindow, - HistoricSqlUsage, + HistoricSqlUnifiedPullConfig, KtxPostgresQueryClient, - PostgresPgssAggregateRow, PostgresPgssProbeResult, - PostgresPgssReader, - PostgresPgssRow, - PostgresPgssSnapshot, + StagedManifest, + StagedPatternsInput, + StagedTableInput, } from './adapters/historic-sql/types.js'; export { - HISTORIC_SQL_OBJECT_TYPE, HISTORIC_SQL_SOURCE_KEY, - historicSqlManifestSchema, - historicSqlMetadataSchema, - historicSqlPullConfigSchema, - historicSqlRawQueryRowSchema, - historicSqlUsageSchema, + aggregatedTemplateSchema, + historicSqlUnifiedPullConfigSchema, + stagedManifestSchema, + stagedPatternsInputSchema, + stagedTableInputSchema, } from './adapters/historic-sql/types.js'; export type { CanonicalPin } from './canonical-pins.js'; export { buildCanonicalPinsPromptBlock, selectRelevantCanonicalPins } from './canonical-pins.js'; diff --git a/packages/context/src/ingest/ingest-bundle.runner.test.ts b/packages/context/src/ingest/ingest-bundle.runner.test.ts index 0d1adf4c..ead6704d 100644 --- a/packages/context/src/ingest/ingest-bundle.runner.test.ts +++ b/packages/context/src/ingest/ingest-bundle.runner.test.ts @@ -405,44 +405,44 @@ 
describe('IngestBundleRunner — Stages 1 → 7', () => { ); }); - it('reuses document evidence indexing and page triage for historic-SQL WorkUnits', async () => { + it('reuses document evidence indexing and page triage for document WorkUnits', async () => { const deps = makeDeps(); - deps.adapter.source = 'historic-sql'; - deps.adapter.skillNames = ['historic_sql_ingest']; - deps.adapter.reconcileSkillNames = ['historic_sql_curator']; + deps.adapter.source = 'notion'; + deps.adapter.skillNames = ['notion_synthesize']; + deps.adapter.reconcileSkillNames = []; deps.adapter.evidenceIndexing = 'documents'; deps.adapter.triageSupported = true; deps.adapter.chunk.mockResolvedValue({ workUnits: [ - { unitKey: 'full', rawFiles: ['templates/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] }, - { unitKey: 'skip', rawFiles: ['templates/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] }, + { unitKey: 'full', rawFiles: ['pages/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] }, + { unitKey: 'skip', rawFiles: ['pages/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] }, ], }); deps.diffSetService.compute.mockResolvedValue({ - added: ['templates/full/metadata.json', 'templates/skip/metadata.json'], + added: ['pages/full/metadata.json', 'pages/skip/metadata.json'], modified: [], deleted: [], unchanged: [], }); deps.pageTriage.triageRun.mockResolvedValue({ enabled: true, - fullRawPaths: new Set(['templates/full/metadata.json']), + fullRawPaths: new Set(['pages/full/metadata.json']), warnings: [], }); const runner = buildRunner(deps); (runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({ currentHashes: new Map([ - ['templates/full/metadata.json', 'h-full'], - ['templates/skip/metadata.json', 'h-skip'], + ['pages/full/metadata.json', 'h-full'], + ['pages/skip/metadata.json', 'h-skip'], ]), - rawDirInWorktree: 'raw-sources/c1/historic-sql/s', + rawDirInWorktree: 'raw-sources/c1/notion/s', }); (runner as any).resolveStagedDir = 
vi.fn().mockResolvedValue('/tmp/stage/upload-x'); const result = await runner.run({ jobId: 'j1', connectionId: 'c1', - sourceKey: 'historic-sql', + sourceKey: 'notion', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload-x' }, }); @@ -1428,6 +1428,67 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { expect(deps.sessionWorktreeService.cleanup).toHaveBeenCalledWith(expect.any(Object), 'success'); }); + it('includes historic-sql post-processor output in memory-flow saved counts', async () => { + const deps = makeDeps(); + deps.adapter.source = 'historic-sql'; + deps.registry.get.mockReturnValue(deps.adapter); + deps.adapter.chunk.mockResolvedValue({ + workUnits: [ + { + unitKey: 'historic-sql-table-public-orders', + rawFiles: ['tables/public/orders.json'], + peerFileIndex: [], + dependencyPaths: [], + }, + ], + }); + const postProcessor = { + run: vi.fn().mockResolvedValue({ + result: { + tableUsageMerged: 2, + staleTablesMarked: 1, + patternPagesWritten: 3, + stalePatternPagesMarked: 1, + archivedPatternPages: 1, + legacyPagesDeleted: 1, + }, + warnings: [], + errors: [], + touchedSources: [{ connectionId: 'c1', sourceName: 'orders' }], + }), + }; + const runner = buildRunner(deps, { postProcessors: { 'historic-sql': postProcessor } }); + (runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({ + currentHashes: new Map([['tables/public/orders.json', 'h1']]), + rawDirInWorktree: 'raw-sources/c1/historic-sql/s', + }); + (runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x'); + const memoryFlow = createMemoryFlowLiveBuffer(bundleReplayInput()); + + await runner.run( + { + jobId: 'j1', + connectionId: 'c1', + sourceKey: 'historic-sql', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload-x' }, + }, + { + jobId: 'j1', + memoryFlow, + startPhase: () => new TestJobContext('j1', null, () => Promise.resolve(), () => Promise.resolve()), + }, + ); + + expect(memoryFlow.snapshot().events).toContainEqual( 
+ expect.objectContaining({ + type: 'saved', + wikiCount: 6, + slCount: 3, + }), + ); + }); + it('marks post-processor infrastructure failure as failed and preserves worktree cleanup state', async () => { const deps = makeDeps(); deps.adapter.source = 'metricflow'; diff --git a/packages/context/src/ingest/ingest-bundle.runner.ts b/packages/context/src/ingest/ingest-bundle.runner.ts index 0515842a..a226bdd0 100644 --- a/packages/context/src/ingest/ingest-bundle.runner.ts +++ b/packages/context/src/ingest/ingest-bundle.runner.ts @@ -15,6 +15,7 @@ import type { ContextEvidenceIndexSummary, IngestBundleRunnerDeps, PageTriageRun import { buildSyncId, rawSourcesDirForSync } from './raw-sources-paths.js'; import { buildStageIndexFromReportBody, + postProcessorSavedMemoryCounts, type IngestReportPostProcessorOutcome, type IngestReportSnapshot, } from './reports.js'; @@ -1111,11 +1112,12 @@ export class IngestBundleRunner { } const commitSha = mergeResult.touchedPaths.length === 0 ? null : mergeResult.squashSha; const memoryFlowSavedActions = stageIndex.workUnits.flatMap((wu) => wu.actions).concat(reconcileActions); + const postProcessorMemoryCounts = postProcessorSavedMemoryCounts(postProcessorOutcome); memoryFlow?.emit({ type: 'saved', commitSha, - wikiCount: countMemoryFlowActions(memoryFlowSavedActions, 'wiki'), - slCount: countMemoryFlowActions(memoryFlowSavedActions, 'sl'), + wikiCount: countMemoryFlowActions(memoryFlowSavedActions, 'wiki') + postProcessorMemoryCounts.wikiCount, + slCount: countMemoryFlowActions(memoryFlowSavedActions, 'sl') + postProcessorMemoryCounts.slCount, }); await stage6?.updateProgress(1.0, commitSha ? 
`Saved changes (${commitSha.slice(0, 8)})` : 'No changes to save'); diff --git a/packages/context/src/ingest/ingest-prompts.test.ts b/packages/context/src/ingest/ingest-prompts.test.ts index b59ab81f..43985ee9 100644 --- a/packages/context/src/ingest/ingest-prompts.test.ts +++ b/packages/context/src/ingest/ingest-prompts.test.ts @@ -29,48 +29,10 @@ describe('ingest prompt assets', () => { expect(prompt).not.toMatch(forbiddenProductPattern()); }); - it('pins historic-SQL triage rules with synthetic signal fixtures', async () => { + it('does not route historic-SQL through page-triage prompt examples', async () => { const prompt = await readFile(new URL('../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8'); - expect(prompt).toContain('signals.objectType === "historic_sql_template"'); - expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo'); - expect(prompt).toContain('service_account_only=true AND below the frequency floor'); - expect(prompt).toContain('shared human usage with mid or high execution volume'); - - const fixtures = [ - { - label: 'skip low solo template', - objectType: '"objectType": "historic_sql_template"', - executions: '"executions_bucket": "low"', - users: '"distinct_users_bucket": "solo"', - serviceAccount: '"service_account_only": "false"', - lane: '-> `skip`', - }, - { - label: 'light service-account-only template', - objectType: '"objectType": "historic_sql_template"', - executions: '"executions_bucket": "high"', - users: '"distinct_users_bucket": "solo"', - serviceAccount: '"service_account_only": "true"', - lane: '-> `light`', - }, - { - label: 'full shared human template', - objectType: '"objectType": "historic_sql_template"', - executions: '"executions_bucket": "high"', - users: '"distinct_users_bucket": "team"', - serviceAccount: '"service_account_only": "false"', - lane: '-> `full`', - }, - ]; - - for (const fixture of fixtures) { - expect(prompt).toContain(fixture.label); - 
expect(prompt).toContain(fixture.objectType); - expect(prompt).toContain(fixture.executions); - expect(prompt).toContain(fixture.users); - expect(prompt).toContain(fixture.serviceAccount); - expect(prompt).toContain(fixture.lane); - } + expect(prompt).not.toContain(['historic_sql', 'template'].join('_')); + expect(prompt).not.toContain('service_account_only=true AND below the frequency floor'); }); }); diff --git a/packages/context/src/ingest/ingest-runtime-assets.test.ts b/packages/context/src/ingest/ingest-runtime-assets.test.ts index 9af8fcf6..2fafd69b 100644 --- a/packages/context/src/ingest/ingest-runtime-assets.test.ts +++ b/packages/context/src/ingest/ingest-runtime-assets.test.ts @@ -14,14 +14,14 @@ const adapterSkillNames = [ 'metabase_ingest', 'metricflow_ingest', 'notion_synthesize', - 'historic_sql_ingest', + 'historic_sql_table_digest', + 'historic_sql_patterns', 'ingest_triage', 'knowledge_capture', 'sl_capture', ] as const; const adapterReconcileSkillNames = [ - 'historic_sql_curator', 'ingest_triage', 'knowledge_capture', 'sl_capture', @@ -58,75 +58,37 @@ describe('ingest runtime assets', () => { } await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain('# Page Triage Classifier'); - await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain( - 'signals.objectType === "historic_sql_template"', - ); - await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain( - 'service_account_only=true AND below the frequency floor', - ); await expect(prompts.loadPrompt('skills/light_extraction')).resolves.toContain('# Light Context Extraction'); }); - it('packages historic-SQL WorkUnit skill guidance from KTX assets', async () => { + it('packages historic-SQL table digest guidance from KTX assets', async () => { const registry = new SkillsRegistryService({ skillsDir }); - const skills = await registry.listSkills(['historic_sql_ingest'], 'memory_agent'); + const skills = await 
registry.listSkills(['historic_sql_table_digest'], 'memory_agent'); - expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_ingest']); + expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_table_digest']); - const [skill] = skills; - if (!skill) { - throw new Error('historic_sql_ingest skill missing'); - } - - expect(skill.path.startsWith(skillsDir)).toBe(true); - - const body = await readFile(join(skill.path, 'SKILL.md'), 'utf-8'); - expect(body).toContain('# Historic SQL Ingest'); - expect(body).toContain('Read exactly one historic-SQL template WorkUnit'); - expect(body).toContain('metadata.json'); - expect(body).toContain('page.md'); - expect(body).toContain('usage.json'); - expect(body).toContain('manifest.json'); - expect(body).toContain('wiki_write'); - expect(body).toContain('key: "queries/"'); - expect(body).toContain('"source": "historic-sql"'); - expect(body).toContain('representative_sql'); - expect(body).toContain('fingerprints'); - expect(body).toContain('usage'); - expect(body).toContain('SL proposal threshold'); - expect(body).toContain('Do not group sibling templates'); - expect(body).toContain('Do not copy sample bound_sql'); - expect(body).not.toContain('store historic-SQL provenance in the markdown body'); + const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8'); + expect(body).toContain('# Historic SQL Table Digest'); + expect(body).toContain('tables/..json'); + expect(body).toContain('tableUsageOutputSchema'); + expect(body).toContain('emit_historic_sql_evidence'); + expect(body).toContain('Do not call wiki_write'); + expect(body).toContain('Do not call sl_write_source'); expect(body).not.toMatch(forbiddenProductPattern()); }); - it('packages historic-SQL curator reconcile guidance from KTX assets', async () => { + it('packages historic-SQL patterns guidance from KTX assets', async () => { const registry = new SkillsRegistryService({ skillsDir }); - const skills = await 
registry.listSkills(['historic_sql_curator'], 'memory_agent'); + const skills = await registry.listSkills(['historic_sql_patterns'], 'memory_agent'); - expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_curator']); + expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_patterns']); - const [skill] = skills; - if (!skill) { - throw new Error('historic_sql_curator skill missing'); - } - - expect(skill.path.startsWith(skillsDir)).toBe(true); - - const body = await readFile(join(skill.path, 'SKILL.md'), 'utf-8'); - expect(body).toContain('# Historic SQL Curator'); - expect(body).toContain('curator pagination'); - expect(body).toContain('stage_list'); - expect(body).toContain('stage_diff'); - expect(body).toContain('read_raw_span'); - expect(body).toContain('wiki_search'); - expect(body).toContain('wiki_read'); - expect(body).toContain('wiki_write'); - expect(body).toContain('emit_artifact_resolution'); - expect(body).toContain('emit_eviction_decision'); - expect(body).toContain('categorical sub-cluster'); - expect(body).toContain('historic-sql-demoted'); - expect(body).toContain('Do not call `context_candidate_write`'); + const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8'); + expect(body).toContain('# Historic SQL Patterns'); + expect(body).toContain('patterns-input/part-0001.json'); + expect(body).toContain('patternsArraySchema'); + expect(body).toContain('emit_historic_sql_evidence'); + expect(body).toContain('cross-table'); expect(body).not.toMatch(forbiddenProductPattern()); }); }); diff --git a/packages/context/src/ingest/local-adapters.test.ts b/packages/context/src/ingest/local-adapters.test.ts index 009cdda2..48bb2a80 100644 --- a/packages/context/src/ingest/local-adapters.test.ts +++ b/packages/context/src/ingest/local-adapters.test.ts @@ -4,6 +4,7 @@ import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { initKtxProject, type KtxLocalProject, 
loadKtxProject } from '../project/index.js'; import type { SqlAnalysisPort } from '../sql-analysis/index.js'; +import type { HistoricSqlReader } from './adapters/historic-sql/types.js'; import { LocalLookerRuntimeStore } from './adapters/looker/local-runtime-store.js'; import { createDefaultLocalIngestAdapters, localPullConfigForAdapter } from './local-adapters.js'; @@ -92,6 +93,9 @@ describe('local ingest adapters', () => { literalSlots: [], }; }, + async analyzeBatch() { + return new Map(); + }, }; const adapters = createDefaultLocalIngestAdapters(project, { historicSql: { @@ -107,6 +111,44 @@ describe('local ingest adapters', () => { expect(adapters.map((adapter) => adapter.source)).toContain('historic-sql'); expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.fetch).toBeTypeOf('function'); + expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([ + 'historic_sql_table_digest', + 'historic_sql_patterns', + ]); + }); + + it('registers historic-sql with an injected non-Postgres reader and query client', () => { + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() {}, + }; + const queryClient = { executeQuery: async () => ({ headers: [], rows: [], totalRows: 0 }) }; + + const adapters = createDefaultLocalIngestAdapters(project, { + historicSql: { + sqlAnalysis: { + async analyzeForFingerprint(sql) { + return { + fingerprint: 'fp', + normalizedSql: sql, + tablesTouched: [], + literalSlots: [], + }; + }, + async analyzeBatch() { + return new Map(); + }, + }, + reader, + queryClient, + }, + }); + + const adapter = adapters.find((candidate) => candidate.source === 'historic-sql'); + expect(adapter).toBeDefined(); + expect(adapter?.fetch).toBeTypeOf('function'); }); it('builds Postgres historic-sql pull config from a local connection', async () => { @@ -121,6 +163,9 @@ describe('local ingest adapters', () => { literalSlots: [], }; }, + async 
analyzeBatch() { + return new Map(); + }, }, postgresQueryClient: { async executeQuery() { @@ -146,11 +191,14 @@ describe('local ingest adapters', () => { await expect(localPullConfigForAdapter(postgresProject, historicSql!, 'warehouse')).resolves.toEqual({ dialect: 'postgres', windowDays: 90, - lastSuccessfulCursor: null, - serviceAccountUserPatterns: ['^svc_'], + minExecutions: 7, + concurrency: 12, + filters: { + serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' }, + dropTrivialProbes: true, + }, redactionPatterns: [], - maxTemplatesPerRun: 123, - minCalls: 7, + staleArchiveAfterDays: 90, }); }); @@ -166,6 +214,9 @@ describe('local ingest adapters', () => { literalSlots: [], }; }, + async analyzeBatch() { + return new Map(); + }, }, postgresQueryClient: { async executeQuery() { diff --git a/packages/context/src/ingest/local-adapters.ts b/packages/context/src/ingest/local-adapters.ts index 51681774..93d6b063 100644 --- a/packages/context/src/ingest/local-adapters.ts +++ b/packages/context/src/ingest/local-adapters.ts @@ -6,11 +6,11 @@ import type { SqlAnalysisPort } from '../sql-analysis/index.js'; import { DbtSourceAdapter } from './adapters/dbt/dbt.adapter.js'; import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js'; import { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js'; -import { PostgresPgssQueryHistoryReader } from './adapters/historic-sql/postgres-pgss-query-history-reader.js'; -import { SnowflakeHistoricSqlQueryHistoryReader } from './adapters/historic-sql/snowflake-query-history-reader.js'; +import { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js'; import { HISTORIC_SQL_SOURCE_KEY, - historicSqlPullConfigSchema, + historicSqlUnifiedPullConfigSchema, + type HistoricSqlReader, type KtxPostgresQueryClient, } from './adapters/historic-sql/types.js'; import { @@ -43,7 +43,9 @@ export interface DefaultLocalIngestAdaptersOptions { databaseIntrospection?: Omit; historicSql?: { 
sqlAnalysis: SqlAnalysisPort; - postgresQueryClient: KtxPostgresQueryClient; + reader?: HistoricSqlReader; + queryClient?: unknown; + postgresQueryClient?: KtxPostgresQueryClient; postgresBaselineRootDir?: string; now?: () => Date; }; @@ -91,18 +93,16 @@ export function createDefaultLocalIngestAdapters( ]; if (options.historicSql) { + const queryClient = options.historicSql.queryClient ?? options.historicSql.postgresQueryClient; + if (!queryClient) { + throw new Error('Historic SQL local adapter requires queryClient or postgresQueryClient'); + } adapters.push( new HistoricSqlSourceAdapter({ sqlAnalysis: options.historicSql.sqlAnalysis, - reader: new SnowflakeHistoricSqlQueryHistoryReader(), - queryClient: { - executeQuery: async () => { - throw new Error('Local historic-SQL currently supports Postgres pg_stat_statements only'); - }, - }, - postgresReader: new PostgresPgssQueryHistoryReader(), - postgresQueryClient: options.historicSql.postgresQueryClient, - postgresBaselineRootDir: options.historicSql.postgresBaselineRootDir, + reader: options.historicSql.reader ?? 
new PostgresPgssReader(), + queryClient, + legacyPostgresBaselineRootDir: options.historicSql.postgresBaselineRootDir, now: options.historicSql.now, }), ); @@ -180,9 +180,8 @@ export async function localPullConfigForAdapter( if (historicSql?.enabled !== true) { throw new Error(`Connection "${connectionId}" does not have historicSql.enabled: true`); } - return historicSqlPullConfigSchema.parse({ + return historicSqlUnifiedPullConfigSchema.parse({ ...historicSql, - lastSuccessfulCursor: stringField(historicSql.lastSuccessfulCursor), }); } if (adapter.source === 'looker') { diff --git a/packages/context/src/ingest/local-bundle-ingest.test.ts b/packages/context/src/ingest/local-bundle-ingest.test.ts index 6e9aa4aa..2fa014d0 100644 --- a/packages/context/src/ingest/local-bundle-ingest.test.ts +++ b/packages/context/src/ingest/local-bundle-ingest.test.ts @@ -2,6 +2,7 @@ import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import Database from 'better-sqlite3'; +import YAML from 'yaml'; import { AgentRunnerService } from '../agent/index.js'; import { initKtxProject, type KtxLocalProject, loadKtxProject } from '../project/index.js'; import { makeLocalGitRepo } from '../test/make-local-git-repo.js'; @@ -10,6 +11,7 @@ import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js'; import { LocalLookerRuntimeStore } from './adapters/looker/local-runtime-store.js'; import { createDefaultLocalIngestAdapters, localPullConfigForAdapter } from './local-adapters.js'; import { getLocalIngestStatus, runLocalIngest } from './local-ingest.js'; +import type { ChunkResult, DiffSet, SourceAdapter } from './types.js'; class TestAgentRunner extends AgentRunnerService { override runLoop = vi.fn().mockResolvedValue({ stopReason: 'natural' as const }); @@ -86,6 +88,70 @@ class WikiWritingAgentRunner extends AgentRunnerService { } } +class HistoricSqlEvidenceAgentRunner extends AgentRunnerService { + 
override runLoop = vi.fn(async (params: any) => { + if ( + params.telemetryTags?.operationName === 'ingest-bundle-wu' && + params.telemetryTags?.unitKey === 'historic-sql-table-public-orders' + ) { + const emitEvidence = params.toolSet.emit_historic_sql_evidence; + if (!emitEvidence?.execute) { + throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit'); + } + const result = await emitEvidence.execute( + { + kind: 'table_usage', + table: 'public.orders', + rawPath: 'tables/public.orders.json', + usage: { + narrative: 'Orders are repeatedly queried by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [], + staleSince: null, + }, + }, + { toolCallId: 'historic-sql-evidence' }, + ); + if (!String(result).includes('Recorded historic-SQL table_usage evidence')) { + throw new Error(`Unexpected historic-SQL evidence result: ${String(result)}`); + } + } + return { stopReason: 'natural' as const }; + }); + + constructor() { + super({ llmProvider: { getModel: () => ({}) as never } as never }); + } +} + +class HistoricSqlEvidenceTestAdapter implements SourceAdapter { + readonly source = 'historic-sql'; + readonly skillNames = ['historic_sql_table_digest']; + readonly reconcileSkillNames: string[] = []; + readonly triageSupported = false; + + detect(): Promise { + return Promise.resolve(true); + } + + chunk(_stagedDir: string, _diffSet?: DiffSet): Promise { + return Promise.resolve({ + workUnits: [ + { + unitKey: 'historic-sql-table-public-orders', + displayLabel: 'public.orders', + rawFiles: ['tables/public.orders.json'], + peerFileIndex: [], + dependencyPaths: ['manifest.json'], + notes: + 'Use historic_sql_table_digest. 
Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence.', + }, + ], + }); + } +} + function makeLookerRuntimeClient() { const lookerModels = { models: [{ name: 'ecommerce', label: 'Ecommerce', explores: [{ name: 'orders', label: 'Orders' }] }], @@ -308,6 +374,90 @@ describe('canonical local ingest', () => { } }); + it('runs historic-SQL evidence projection through the local bundle post-processor', async () => { + const projectDir = join(tempDir, 'historic-sql-project'); + await initKtxProject({ projectDir, projectName: 'warehouse' }); + await writeFile( + join(projectDir, 'ktx.yaml'), + [ + 'project: warehouse', + 'connections:', + ' warehouse:', + ' driver: postgres', + 'ingest:', + ' adapters:', + ' - historic-sql', + ' embeddings:', + ' backend: deterministic', + 'storage:', + ' state: sqlite', + ' search: sqlite-fts5', + ' git:', + ' auto_commit: false', + ' author: KTX Test ', + '', + ].join('\n'), + 'utf-8', + ); + const historicProject = await loadKtxProject({ projectDir }); + await historicProject.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml', + YAML.stringify({ tables: { orders: { table: 'public.orders', columns: [{ name: 'id', type: 'string' }] } } }), + 'KTX Test', + 'system@ktx.local', + 'Seed schema shard', + ); + + const sourceDir = join(tempDir, 'historic-sql-source'); + await mkdir(join(sourceDir, 'tables'), { recursive: true }); + await writeFile( + join(sourceDir, 'manifest.json'), + `${JSON.stringify( + { + source: 'historic-sql', + connectionId: 'warehouse', + dialect: 'postgres', + fetchedAt: '2026-05-11T00:00:00.000Z', + windowStart: '2026-02-10T00:00:00.000Z', + windowEnd: '2026-05-11T00:00:00.000Z', + snapshotRowCount: 1, + touchedTableCount: 1, + parseFailures: 0, + warnings: [], + probeWarnings: [], + staleArchiveAfterDays: 90, + }, + null, + 2, + )}\n`, + 'utf-8', + ); + await writeFile(join(sourceDir, 'tables/public.orders.json'), '{"table":"public.orders"}\n', 'utf-8'); 
+ await writeFile(join(sourceDir, 'patterns-input.json'), '{"templates":[]}\n', 'utf-8'); + const agentRunner = new HistoricSqlEvidenceAgentRunner(); + + const result = await runLocalIngest({ + project: historicProject, + adapters: [new HistoricSqlEvidenceTestAdapter()], + adapter: 'historic-sql', + connectionId: 'warehouse', + sourceDir, + jobId: 'historic-sql-local-projection', + agentRunner, + }); + + expect(result.result.failedWorkUnits).toEqual([]); + expect(result.report.body.postProcessor).toMatchObject({ + sourceKey: 'historic-sql', + status: 'success', + result: { tableUsageMerged: 1 }, + touchedSources: [{ connectionId: 'warehouse', sourceName: 'orders' }], + }); + await expect(readFile(join(projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves.toContain( + 'Orders are repeatedly queried by lifecycle status.', + ); + }); + it('rejects direct Metabase scheduled pulls before requiring a local ingest LLM provider', async () => { const projectDir = join(tempDir, 'metabase-project'); await initKtxProject({ projectDir, projectName: 'warehouse' }); diff --git a/packages/context/src/ingest/local-bundle-runtime.ts b/packages/context/src/ingest/local-bundle-runtime.ts index f7c8be80..43d0247b 100644 --- a/packages/context/src/ingest/local-bundle-runtime.ts +++ b/packages/context/src/ingest/local-bundle-runtime.ts @@ -2,6 +2,7 @@ import { mkdirSync } from 'node:fs'; import { join } from 'node:path'; import { fileURLToPath } from 'node:url'; import type { KtxLlmProvider } from '@ktx/llm'; +import type { Tool } from 'ai'; import YAML from 'yaml'; import type { AgentRunnerService } from '../agent/index.js'; import { AgentRunnerService as DefaultAgentRunnerService } from '../agent/index.js'; @@ -69,6 +70,8 @@ import { ContextCandidateCarryforwardService, CuratorPaginationService, } from './context-candidates/index.js'; +import { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js'; +import { 
HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js'; import { ContextEvidenceIndexService, SqliteContextEvidenceStore } from './context-evidence/index.js'; import { DiffSetService } from './diff-set.service.js'; import { IngestBundleRunner } from './ingest-bundle.runner.js'; @@ -427,10 +430,16 @@ class NoopKnowledgeEventPort implements KnowledgeEventPort { } class LocalIngestToolSet implements IngestToolsetLike { - constructor(private readonly tools: BaseTool[]) {} + constructor( + private readonly tools: BaseTool[], + private readonly sourceTools: Record = {}, + ) {} toAiSdkTools(context: ToolContext) { - return Object.fromEntries(this.tools.map((tool) => [tool.name, tool.toAiSdkTool(context)])); + return { + ...Object.fromEntries(this.tools.map((tool) => [tool.name, tool.toAiSdkTool(context)])), + ...this.sourceTools, + }; } } @@ -498,9 +507,19 @@ class LocalIngestToolsetFactory implements IngestToolsetFactoryPort { ]; } - createIngestWuToolset(_session: ToolSession, options?: { includeContextEvidenceTools?: boolean }): IngestToolsetLike { + createIngestWuToolset(session: ToolSession, options?: { includeContextEvidenceTools?: boolean }): IngestToolsetLike { + const sourceTools: Record = + session.ingest?.sourceKey === 'historic-sql' + ? { + emit_historic_sql_evidence: createEmitHistoricSqlEvidenceTool({ + connectionId: session.connectionId, + session, + }), + } + : {}; return new LocalIngestToolSet( options?.includeContextEvidenceTools ? 
[...this.baseTools, ...this.contextTools] : this.baseTools, + sourceTools, ); } } @@ -656,6 +675,9 @@ export function createLocalBundleIngestRuntime( settings: { batchSize: 8, maxPasses: 8, stepBudgetPerPass: 60 }, logger, }), + postProcessors: { + 'historic-sql': new HistoricSqlProjectionPostProcessor(), + }, logger, }; diff --git a/packages/context/src/ingest/page-triage/page-triage.service.test.ts b/packages/context/src/ingest/page-triage/page-triage.service.test.ts index 5e53d233..940b0fc0 100644 --- a/packages/context/src/ingest/page-triage/page-triage.service.test.ts +++ b/packages/context/src/ingest/page-triage/page-triage.service.test.ts @@ -1,4 +1,4 @@ -import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; @@ -120,14 +120,6 @@ describe('PageTriageService', () => { await rm(stagedDir, { recursive: true, force: true }); }); - function parseSignalsFromClassifierPrompt(prompt: string): unknown { - const match = /\n([\s\S]*?)\n<\/signals>/.exec(prompt); - if (!match) { - throw new Error('classifier prompt did not include a block'); - } - return JSON.parse(match[1]); - } - it('writes light-lane candidates and keeps the page out of full WorkUnits', async () => { generateTextMock .mockResolvedValueOnce({ text: JSON.stringify({ lane: 'light', reason: 'short durable policy' }) } as any) @@ -282,163 +274,6 @@ describe('PageTriageService', () => { expect(repository.setDocumentTriageLane).toHaveBeenCalledWith('run-1', 'pages/page-1/page.md', 'light'); }); - it.each([ - { - name: 'skip low solo template', - propertyHints: { - executions_bucket: 'low', - distinct_users_bucket: 'solo', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '1 constant, 1 runtime', - }, - expectedLane: 'skip', - 
expectedReport: { skip: 1, light: 0, full: 0 }, - }, - { - name: 'light service-account-only template', - propertyHints: { - executions_bucket: 'high', - distinct_users_bucket: 'solo', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'true', - slot_summary: '1 constant, 0 runtime', - }, - expectedLane: 'light', - expectedReport: { skip: 0, light: 1, full: 0 }, - }, - { - name: 'full shared human template', - propertyHints: { - executions_bucket: 'high', - distinct_users_bucket: 'team', - error_rate_bucket: 'ok', - recency_bucket: 'active', - service_account_only: 'false', - slot_summary: '2 constant, 1 runtime', - }, - expectedLane: 'full', - expectedReport: { skip: 0, light: 0, full: 1 }, - }, - ] as const)('triages historic-SQL synthetic signal fixture as $expectedLane for $name', async ({ - name, - propertyHints, - expectedLane, - expectedReport, - }) => { - const externalId = name.replace(/[^a-z0-9]+/g, '_'); - const templateDir = join(stagedDir, 'templates', externalId); - await mkdir(templateDir, { recursive: true }); - await writeFile( - join(templateDir, 'metadata.json'), - JSON.stringify({ - id: externalId, - title: `snowflake - analytics.orders [${externalId.slice(0, 6)}]`, - path: `templates/${externalId}/page.md`, - objectType: 'historic_sql_template', - lastEditedAt: null, - properties: { - fingerprint: externalId, - sub_cluster_id: null, - dialect: 'snowflake', - tables_touched: ['analytics.orders'], - literal_slots: [{ position: 1, type: 'string', classification: 'constant' }], - triage_signals: propertyHints, - }, - }), - 'utf-8', - ); - await writeFile( - join(templateDir, 'page.md'), - [ - `# ${externalId}`, - '', - '## Normalized SQL', - '```sql', - 'SELECT count(*) FROM analytics.orders WHERE status = ?', - '```', - '', - '## Tables touched', - '- analytics.orders', - ].join('\n'), - 'utf-8', - ); - - adapter.getTriageSignals.mockResolvedValueOnce({ - objectType: 'historic_sql_template', - lastEditedAt: 
'2026-05-04T12:00:00.000Z', - propertyHints, - }); - promptService.loadPrompt.mockImplementation((promptName: string) => { - if (promptName === 'skills/page_triage_classifier') { - return readFile(new URL('../../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8'); - } - return Promise.resolve(`prompt:${promptName}`); - }); - generateTextMock.mockImplementationOnce((args: any) => { - const prompt = args.messages[0].content as string; - expect(prompt).toContain('signals.objectType === "historic_sql_template"'); - expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo'); - expect(prompt).toContain('service_account_only=true AND below the frequency floor'); - expect(prompt).toContain('shared human usage with mid or high execution volume'); - expect(parseSignalsFromClassifierPrompt(prompt)).toEqual({ - objectType: 'historic_sql_template', - lastEditedAt: '2026-05-04T12:00:00.000Z', - propertyHints, - }); - return { text: JSON.stringify({ lane: expectedLane, reason: `${name} fixture` }) } as any; - }); - if (expectedLane === 'light') { - generateTextMock.mockResolvedValueOnce({ - text: JSON.stringify({ - candidates: [ - { - candidateKey: 'historic-sql-service-account-template', - topic: 'Historic SQL Service Account Template', - assertion: 'A service-account-only historic SQL template can remain as light evidence.', - rationale: 'The synthetic historic-SQL fixture is service-account-only and below the frequency floor.', - evidenceChunkIds: ['00000000-0000-0000-0000-000000000101'], - suggestedPageKey: 'historic-sql-service-account-template', - actionHint: 'create', - durabilityScore: 2, - authorityScore: 1, - reuseScore: 2, - noveltyScore: 1, - riskScore: 0, - }, - ], - }), - } as any); - } - - const result = await service.triageRun({ - stagedDir, - runId: 'run-1', - connectionId: 'conn-1', - sourceKey: 'historic-sql', - syncId: 'sync-1', - jobId: 'job-1', - diffSet: { - added: [`templates/${externalId}/metadata.json`, 
`templates/${externalId}/page.md`], - modified: [], - deleted: [], - unchanged: [], - }, - adapter: adapter as any, - }); - - expect(result.report).toMatchObject({ pageCount: 1, ...expectedReport }); - expect(repository.setDocumentTriageLane).toHaveBeenCalledWith( - 'run-1', - `templates/${externalId}/page.md`, - expectedLane, - ); - expect(result.fullRawPaths.has(`templates/${externalId}/metadata.json`)).toBe(expectedLane === 'full'); - expect(result.fullRawPaths.has(`templates/${externalId}/page.md`)).toBe(expectedLane === 'full'); - }); - it('triages Notion data-source row pages without reading data-source metadata as page markdown', async () => { triageSettings.lightExtractionEnabled = false; diff --git a/packages/context/src/ingest/reports.ts b/packages/context/src/ingest/reports.ts index 7cf4418a..6f60f149 100644 --- a/packages/context/src/ingest/reports.ts +++ b/packages/context/src/ingest/reports.ts @@ -79,6 +79,50 @@ export interface IngestReportSnapshot { createdAt: string; } +export interface IngestSavedMemoryCounts { + wikiCount: number; + slCount: number; +} + +function numericResultField(result: Record, field: string): number { + const value = result[field]; + return typeof value === 'number' && Number.isFinite(value) && value > 0 ? 
value : 0; +} + +export function postProcessorSavedMemoryCounts( + postProcessor: IngestReportPostProcessorOutcome | undefined, +): IngestSavedMemoryCounts { + if (!postProcessor || postProcessor.sourceKey !== 'historic-sql') { + return { wikiCount: 0, slCount: 0 }; + } + const result = postProcessor.result; + if (!result || typeof result !== 'object' || Array.isArray(result)) { + return { wikiCount: 0, slCount: 0 }; + } + const record = result as Record; + return { + wikiCount: + numericResultField(record, 'patternPagesWritten') + + numericResultField(record, 'stalePatternPagesMarked') + + numericResultField(record, 'archivedPatternPages') + + numericResultField(record, 'legacyPagesDeleted'), + slCount: numericResultField(record, 'tableUsageMerged') + numericResultField(record, 'staleTablesMarked'), + }; +} + +export function savedMemoryCountsForReport(report: IngestReportSnapshot): IngestSavedMemoryCounts { + const actions = report.body.workUnits.flatMap((workUnit) => workUnit.actions); + const directCounts = { + wikiCount: actions.filter((action) => action.target === 'wiki').length, + slCount: actions.filter((action) => action.target === 'sl').length, + }; + const postProcessorCounts = postProcessorSavedMemoryCounts(report.body.postProcessor); + return { + wikiCount: directCounts.wikiCount + postProcessorCounts.wikiCount, + slCount: directCounts.slCount + postProcessorCounts.slCount, + }; +} + export function buildStageIndexFromReportBody(jobId: string, connectionId: string, body: IngestReportBody): StageIndex { return { jobId, diff --git a/packages/context/src/mcp/local-project-ports.test.ts b/packages/context/src/mcp/local-project-ports.test.ts index f5aa52c0..8692ab7d 100644 --- a/packages/context/src/mcp/local-project-ports.test.ts +++ b/packages/context/src/mcp/local-project-ports.test.ts @@ -520,6 +520,54 @@ describe('createLocalProjectMcpContextPorts', () => { }); }); + it('returns historic SQL usage frequency and snippet through semantic-layer list 
search', async () => { + const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml', + `tables: + orders: + table: public.orders + usage: + narrative: Analysts inspect paid order lifecycle by customer segment. + frequencyTier: high + commonFilters: + - status + commonGroupBys: + - customer_segment + commonJoins: + - table: public.customers + on: + - customer_id + columns: + - name: order_id + type: string + - name: status + type: string +`, + 'ktx', + 'ktx@example.com', + 'Seed usage-backed manifest shard', + ); + + const ports = createLocalProjectMcpContextPorts(project); + await expect( + ports.semanticLayer?.listSources({ connectionId: 'warehouse', query: 'paid order lifecycle' }), + ).resolves.toEqual({ + sources: [ + expect.objectContaining({ + connectionId: 'warehouse', + connectionName: 'warehouse', + name: 'orders', + frequencyTier: 'high', + snippet: expect.stringContaining(''), + score: expect.any(Number), + matchReasons: expect.arrayContaining(['lexical']), + }), + ], + totalSources: 1, + }); + }); + it('uses configured local embeddings for semantic-layer search when available', async () => { const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); project.config.ingest.embeddings = { backend: 'none', dimensions: 2 }; diff --git a/packages/context/src/mcp/local-project-ports.ts b/packages/context/src/mcp/local-project-ports.ts index 331a14ae..41d1f916 100644 --- a/packages/context/src/mcp/local-project-ports.ts +++ b/packages/context/src/mcp/local-project-ports.ts @@ -479,6 +479,8 @@ export function createLocalProjectMcpContextPorts( columnCount: source.columnCount, measureCount: source.measureCount, joinCount: source.joinCount, + ...(hasSlSearchMetadata(source) && source.frequencyTier ? { frequencyTier: source.frequencyTier } : {}), + ...(hasSlSearchMetadata(source) && source.snippet ? 
{ snippet: source.snippet } : {}), ...(hasSlSearchMetadata(source) ? { score: source.score } : {}), ...(hasSlSearchMetadata(source) && source.matchReasons ? { matchReasons: source.matchReasons } : {}), ...(hasSlSearchMetadata(source) && source.dictionaryMatches diff --git a/packages/context/src/mcp/types.ts b/packages/context/src/mcp/types.ts index 58f8e22b..f68444b2 100644 --- a/packages/context/src/mcp/types.ts +++ b/packages/context/src/mcp/types.ts @@ -1,4 +1,4 @@ -import type { IngestReportSnapshot, MemoryFlowReplayInput } from '../ingest/index.js'; +import type { IngestReportSnapshot, MemoryFlowReplayInput, TableUsageOutput } from '../ingest/index.js'; import type { MemoryCaptureService } from '../memory/index.js'; import type { KtxScanMode, KtxScanReport } from '../scan/index.js'; import type { @@ -131,6 +131,8 @@ export interface KtxSemanticLayerSourceSummary { columnCount: number; measureCount: number; joinCount: number; + frequencyTier?: TableUsageOutput['frequencyTier']; + snippet?: string; score?: number; matchReasons?: SlSearchMatchReason[]; dictionaryMatches?: SlDictionaryMatch[]; diff --git a/packages/context/src/memory/memory-runtime-assets.test.ts b/packages/context/src/memory/memory-runtime-assets.test.ts index 204461ec..36d4dc7c 100644 --- a/packages/context/src/memory/memory-runtime-assets.test.ts +++ b/packages/context/src/memory/memory-runtime-assets.test.ts @@ -15,7 +15,8 @@ const expectedSkillHeadings: Record = { sl_capture: '# Semantic Layer', }; const expectedAdapterSkillHeadings: Record = { - historic_sql_ingest: '# Historic SQL Ingest', + historic_sql_patterns: '# Historic SQL Patterns', + historic_sql_table_digest: '# Historic SQL Table Digest', live_database_ingest: '# Live Database Ingest', looker_ingest: '# Looker Runtime Ingest', lookml_ingest: '# LookML to KTX Semantic Layer', diff --git a/packages/context/src/package-exports.test.ts b/packages/context/src/package-exports.test.ts index e22d64fa..4fd7e502 100644 --- 
a/packages/context/src/package-exports.test.ts +++ b/packages/context/src/package-exports.test.ts @@ -232,14 +232,17 @@ describe('@ktx/context package exports', () => { expect(ingest.HistoricSqlSourceAdapter).toBeTypeOf('function'); expect(ingest.SnowflakeHistoricSqlQueryHistoryReader).toBeTypeOf('function'); expect(ingest.BigQueryHistoricSqlQueryHistoryReader).toBeTypeOf('function'); - expect(ingest.PostgresPgssQueryHistoryReader).toBeTypeOf('function'); - expect(ingest.stagePgStatStatementsTemplates).toBeTypeOf('function'); - expect(ingest.pgssBaselinePath).toBeTypeOf('function'); - expect(ingest.readPgssBaseline).toBeTypeOf('function'); - expect(ingest.writePgssBaselineAtomic).toBeTypeOf('function'); + expect(ingest.PostgresPgssReader).toBeTypeOf('function'); expect(ingest.HistoricSqlExtensionMissingError).toBeTypeOf('function'); expect(ingest.HistoricSqlVersionUnsupportedError).toBeTypeOf('function'); expect(ingest.HISTORIC_SQL_SOURCE_KEY).toBe('historic-sql'); + expect(ingest.historicSqlUnifiedPullConfigSchema).toBeDefined(); + expect(ingest.aggregatedTemplateSchema).toBeDefined(); + expect(ingest.stagedTableInputSchema).toBeDefined(); + expect(ingest.historicSqlEvidenceEnvelopeSchema).toBeDefined(); + expect(ingest.historicSqlEvidencePath).toBeTypeOf('function'); + expect(ingest.createEmitHistoricSqlEvidenceTool).toBeTypeOf('function'); + expect(ingest.HistoricSqlProjectionPostProcessor).toBeTypeOf('function'); expect(ingest.SqliteContextEvidenceStore).toBeTypeOf('function'); expect(ingest.SqliteBundleIngestStore).toBeTypeOf('function'); expect(ingest.CuratorPaginationService).toBeTypeOf('function'); diff --git a/packages/context/src/scan/local-enrichment-artifacts.test.ts b/packages/context/src/scan/local-enrichment-artifacts.test.ts index d34da036..0123f086 100644 --- a/packages/context/src/scan/local-enrichment-artifacts.test.ts +++ b/packages/context/src/scan/local-enrichment-artifacts.test.ts @@ -742,6 +742,13 @@ 
describe('writeLocalScanEnrichmentArtifacts', () => { orders: { table: 'public.orders', descriptions: { user: 'Pinned structural description', ai: 'Old generated text' }, + usage: { + narrative: 'Orders are commonly filtered by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + ownerNote: 'Preserve analyst note', + }, columns: [ { name: 'id', @@ -797,6 +804,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => { tables: { orders: { descriptions: Record; + usage?: Record; columns: Array<{ name: string; descriptions?: Record }>; joins: Array<{ to: string; on: string; source: string }>; }; @@ -807,6 +815,13 @@ describe('writeLocalScanEnrichmentArtifacts', () => { user: 'Pinned structural description', db: 'DB orders table', }); + expect(manifest.tables.orders.usage).toEqual({ + narrative: 'Orders are commonly filtered by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + ownerNote: 'Preserve analyst note', + }); expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({ user: 'Pinned structural id', db: 'DB order id', diff --git a/packages/context/src/scan/local-enrichment-artifacts.ts b/packages/context/src/scan/local-enrichment-artifacts.ts index b8186b0e..101d062e 100644 --- a/packages/context/src/scan/local-enrichment-artifacts.ts +++ b/packages/context/src/scan/local-enrichment-artifacts.ts @@ -6,6 +6,7 @@ import { type LiveDatabaseManifestJoinEntry, type LiveDatabaseManifestShard, type LiveDatabaseManifestTableData, + type TableUsageOutput, } from '../ingest/index.js'; import type { KtxScanRelationshipConfig } from '../project/config.js'; import type { KtxLocalProject } from '../project/index.js'; @@ -56,6 +57,7 @@ export interface WriteLocalScanEnrichmentArtifactsResult extends WriteLocalScanM interface ExistingManifestState { 
descriptions: Map; preservedJoins: Map; + usage: Map; } type LocalDescriptionUpdates = KtxLocalScanEnrichmentResult['descriptionUpdates']; @@ -196,6 +198,7 @@ async function loadExistingManifestState( ): Promise { const descriptions = new Map(); const preservedJoins = new Map(); + const usage = new Map(); const validTableNames = new Set(snapshot.tables.map((table) => table.name)); const columnsByTable = validColumns(snapshot); @@ -203,7 +206,7 @@ async function loadExistingManifestState( try { files = (await project.fileStore.listFiles(schemaDir(connectionId))).files.filter((file) => file.endsWith('.yaml')); } catch { - return { descriptions, preservedJoins }; + return { descriptions, preservedJoins, usage }; } for (const file of files) { @@ -225,6 +228,9 @@ async function loadExistingManifestState( ), ), }); + if (entry.usage) { + usage.set(tableName, { ...entry.usage }); + } const joins = (entry.joins ?? []).filter((join) => { return ( (join.source === 'manual' || join.source === 'inferred') && @@ -241,7 +247,7 @@ async function loadExistingManifestState( } } - return { descriptions, preservedJoins }; + return { descriptions, preservedJoins, usage }; } async function writeJsonArtifact( @@ -276,6 +282,7 @@ export async function writeLocalScanManifestShards( joins: relationshipJoins(input.snapshot, input.relationshipUpdate), existingDescriptions: existing.descriptions, existingPreservedJoins: existing.preservedJoins, + existingUsage: existing.usage, mapColumnType: (dimensionType) => dimensionType, }); diff --git a/packages/context/src/sl/local-sl.test.ts b/packages/context/src/sl/local-sl.test.ts index 9af28bca..3cdfaefe 100644 --- a/packages/context/src/sl/local-sl.test.ts +++ b/packages/context/src/sl/local-sl.test.ts @@ -187,6 +187,53 @@ describe('local semantic-layer helpers', () => { await expect(access(join(project.projectDir, '.ktx/db.sqlite'))).resolves.toBeUndefined(); }); + it('searches historic SQL usage and returns frequency tier plus FTS snippet', 
async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml', + `tables: + orders: + table: public.orders + usage: + narrative: Analysts inspect paid order lifecycle by customer segment. + frequencyTier: high + commonFilters: + - status + - created_at + commonGroupBys: + - customer_segment + commonJoins: + - table: public.customers + on: + - customer_id + columns: + - name: order_id + type: string + - name: status + type: string +`, + 'ktx', + 'ktx@example.com', + 'Add usage-backed manifest shard', + ); + + const results = await searchLocalSlSources(project, { + connectionId: 'warehouse', + query: 'paid lifecycle customer segment', + }); + + expect(results).toEqual([ + expect.objectContaining({ + connectionId: 'warehouse', + name: 'orders', + path: 'semantic-layer/warehouse/_schema/public.yaml#orders', + frequencyTier: 'high', + snippet: expect.stringContaining(''), + matchReasons: expect.arrayContaining(['lexical']), + }), + ]); + expect(results[0]?.snippet).toContain('lifecycle'); + }); + it('searches all connections with one global hybrid ranking pass', async () => { await writeLocalSlSource(project, { connectionId: 'warehouse', diff --git a/packages/context/src/sl/local-sl.ts b/packages/context/src/sl/local-sl.ts index 676b2522..14559ffe 100644 --- a/packages/context/src/sl/local-sl.ts +++ b/packages/context/src/sl/local-sl.ts @@ -26,6 +26,8 @@ export interface LocalSlSourceSummary { export interface LocalSlSourceSearchResult extends LocalSlSourceSummary { score: number; + frequencyTier?: NonNullable['frequencyTier']; + snippet?: string; matchReasons?: SlSearchMatchReason[]; dictionaryMatches?: SlDictionaryMatch[]; lanes?: SlSearchLaneSummary[]; @@ -367,6 +369,10 @@ function candidateKey(summary: LocalSlSourceSummary): string { return `${summary.connectionId}/${summary.name}`; } +function searchResultUsageFields(source: SemanticLayerSource): Pick { + return source.usage?.frequencyTier ? 
{ frequencyTier: source.usage.frequencyTier } : {}; +} + function tokenLaneCandidates(candidates: LocalSlSearchCandidate[], terms: readonly string[]) { if (terms.length === 0) { return []; @@ -483,6 +489,7 @@ export async function searchLocalSlSources( ...result.candidate.summary, score: result.score, matchReasons: ['token'], + ...searchResultUsageFields(result.candidate.source), })) .sort( (left, right) => @@ -500,6 +507,7 @@ export async function searchLocalSlSources( const finalLimit = input.limit ?? candidates.length; const core = new HybridSearchCore(); const dictionaryEvidence = new Map(); + const lexicalSnippets = new Map(); const generators: SearchCandidateGenerator[] = [ { @@ -510,6 +518,11 @@ export async function searchLocalSlSources( queryText: args.queryText, limit: args.laneCandidatePoolLimit, }); + for (const row of rows) { + if (row.snippet) { + lexicalSnippets.set(row.id, row.snippet); + } + } return { candidates: rows.map((row) => ({ id: row.id, rank: row.rank, rawScore: row.rawScore })), }; @@ -584,9 +597,12 @@ export async function searchLocalSlSources( continue; } const dictionaryMatches = dictionaryEvidence.get(fused.id); + const snippet = lexicalSnippets.get(fused.id); hydrated.push({ ...candidate.summary, score: fused.score, + ...searchResultUsageFields(candidate.source), + ...(snippet ? { snippet } : {}), matchReasons: fused.matchReasons as SlSearchMatchReason[], ...(dictionaryMatches && dictionaryMatches.length > 0 ? 
{ dictionaryMatches } : {}), lanes: result.lanes, diff --git a/packages/context/src/sl/pglite-sl-search-prototype.ts b/packages/context/src/sl/pglite-sl-search-prototype.ts index 77c8c7d5..4a521437 100644 --- a/packages/context/src/sl/pglite-sl-search-prototype.ts +++ b/packages/context/src/sl/pglite-sl-search-prototype.ts @@ -554,9 +554,11 @@ export async function searchLocalSlSourcesWithPglitePrototype( continue; } const dictionaryMatches = dictionaryEvidence.get(result.id); + const frequencyTier = candidate.source.usage?.frequencyTier; hydrated.push({ ...candidate.summary, score: result.score, + ...(frequencyTier ? { frequencyTier } : {}), matchReasons: result.matchReasons as SlSearchMatchReason[], ...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}), lanes: fused.lanes, diff --git a/packages/context/src/sl/ports.ts b/packages/context/src/sl/ports.ts index d2426460..08e1ca6d 100644 --- a/packages/context/src/sl/ports.ts +++ b/packages/context/src/sl/ports.ts @@ -49,5 +49,5 @@ export interface SlSourcesIndexPort { queryText: string, limit: number, minRrfScore?: number, - ): Promise>; + ): Promise>; } diff --git a/packages/context/src/sl/schemas.ts b/packages/context/src/sl/schemas.ts index 218c0435..706a4add 100644 --- a/packages/context/src/sl/schemas.ts +++ b/packages/context/src/sl/schemas.ts @@ -1,4 +1,5 @@ import { z } from 'zod'; +import { tableUsageOutputSchema } from '../ingest/adapters/historic-sql/skill-schemas.js'; // Literal vocabularies — kept in lockstep with the Python Pydantic model at // python/ktx-sl/semantic_layer/models.py (SourceColumn / ColumnRole / @@ -125,6 +126,7 @@ export const sourceDefinitionSchema = z default_time_dimension: defaultTimeDimensionDbtSchema.optional(), tags: sourceKeyedStringArraySchema.optional(), freshness: sourceFreshnessSchema.optional(), + usage: tableUsageOutputSchema.optional(), }) .strict() .refine((s) => (s.table || s.sql) && !(s.table && s.sql), { @@ -145,6 +147,7 @@ export const 
sourceOverlaySchema = z exclude_columns: z.array(z.string()).optional(), disable_joins: z.array(z.string()).optional(), default_time_dimension: defaultTimeDimensionDbtSchema.optional(), + usage: tableUsageOutputSchema.optional(), }) .strict(); diff --git a/packages/context/src/sl/semantic-layer.service.test.ts b/packages/context/src/sl/semantic-layer.service.test.ts index 0b9656de..3adde085 100644 --- a/packages/context/src/sl/semantic-layer.service.test.ts +++ b/packages/context/src/sl/semantic-layer.service.test.ts @@ -5,6 +5,7 @@ import { composeOverlay, enrichColumnsFromManifest, findDanglingSegmentRefs, + projectManifestEntry, SemanticLayerService, } from './semantic-layer.service.js'; import { sourceDefinitionSchema } from './schemas.js'; @@ -129,6 +130,39 @@ describe('composeOverlay', () => { dbt: 'dbt description', }); }); + + it('replaces manifest usage only when an overlay explicitly provides usage', () => { + const baseWithUsage: SemanticLayerSource = { + ...baseTable, + usage: { + narrative: 'Orders are commonly queried by lifecycle status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + }; + + expect(composeOverlay(baseWithUsage, { name: 'fct_labs', measures: [] }).usage).toEqual(baseWithUsage.usage); + + const composed = composeOverlay(baseWithUsage, { + name: 'fct_labs', + usage: { + narrative: 'Overlay-curated usage note.', + frequencyTier: 'mid', + commonFilters: ['created_at'], + commonGroupBys: ['created_at'], + commonJoins: [], + }, + }); + + expect(composed.usage).toEqual({ + narrative: 'Overlay-curated usage note.', + frequencyTier: 'mid', + commonFilters: ['created_at'], + commonGroupBys: ['created_at'], + commonJoins: [], + }); + }); }); describe('enrichColumnsFromManifest', () => { @@ -299,6 +333,61 @@ describe('sourceDefinitionSchema', () => { dbt: { loaded_at_field: 'updated_at', raw: { warn_after: { count: 12, period: 'hour' } } }, }); }); + + it('accepts 
historic SQL usage on standalone sources', () => { + const result = sourceDefinitionSchema.safeParse({ + name: 'orders', + table: 'public.orders', + grain: ['id'], + columns: [{ name: 'id', type: 'string' }], + joins: [], + measures: [], + usage: { + narrative: 'Orders are queried for fulfillment and revenue analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + externalOwner: 'analytics', + }, + }); + + expect(result.success).toBe(true); + if (!result.success) { + return; + } + expect(result.data.usage).toMatchObject({ + narrative: 'Orders are queried for fulfillment and revenue analysis.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + externalOwner: 'analytics', + }); + }); +}); + +describe('projectManifestEntry', () => { + it('projects manifest usage onto the semantic-layer source', () => { + const source = projectManifestEntry('orders', { + table: 'public.orders', + usage: { + narrative: 'Orders are frequently filtered by status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }, + columns: [ + { name: 'id', type: 'string', pk: true }, + { name: 'status', type: 'string' }, + ], + }); + + expect(source.usage).toEqual({ + narrative: 'Orders are frequently filtered by status.', + frequencyTier: 'high', + commonFilters: ['status'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + }); + }); }); describe('findManifestEntryByTableRef', () => { diff --git a/packages/context/src/sl/semantic-layer.service.ts b/packages/context/src/sl/semantic-layer.service.ts index 0ccce66a..ffae0b12 100644 --- a/packages/context/src/sl/semantic-layer.service.ts +++ b/packages/context/src/sl/semantic-layer.service.ts @@ -1,6 +1,7 @@ import YAML from 'yaml'; import type { KtxFileStorePort, KtxLogger } 
from '../core/index.js'; import { noopLogger } from '../core/index.js'; +import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js'; import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js'; import { normalizeSemanticLayerDescriptions } from './description-normalization.js'; import { isOverlaySource, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js'; @@ -884,6 +885,7 @@ export interface ManifestTableEntry { joins?: ManifestJoinEntry[]; tags?: { dbt?: string[] }; freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } }; + usage?: TableUsageOutput; } /** Migrate legacy flat description/db_description fields to a descriptions map. */ @@ -930,6 +932,7 @@ export function projectManifestEntry(name: string, entry: ManifestTableEntry): S measures: [], ...(entry.tags?.dbt?.length ? { tags: entry.tags } : {}), ...(entry.freshness?.dbt ? { freshness: entry.freshness } : {}), + ...(entry.usage ? { usage: entry.usage } : {}), }; } @@ -1005,6 +1008,7 @@ const COMPOSE_KNOWN_KEYS = new Set([ 'exclude_columns', 'disable_joins', 'default_time_dimension', + 'usage', ]); export function composeOverlay(base: SemanticLayerSource, overlay: Record): SemanticLayerSource { @@ -1028,6 +1032,10 @@ export function composeOverlay(base: SemanticLayerSource, overlay: Record !excluded.has(c.name)); diff --git a/packages/context/src/sl/sl-search.service.test.ts b/packages/context/src/sl/sl-search.service.test.ts index 3def9495..ffe27cbc 100644 --- a/packages/context/src/sl/sl-search.service.test.ts +++ b/packages/context/src/sl/sl-search.service.test.ts @@ -162,4 +162,65 @@ describe('SlSearchService', () => { expect(text).toContain('loaded_at=updated_at'); expect(text).toContain('warn_after'); }); + + it('includes historic SQL usage in semantic-layer search text', () => { + const source: SemanticLayerSource = { + name: 'orders', + descriptions: { user: 'Customer orders' }, + table: 'public.orders', + grain: ['order_id'], + 
columns: [{ name: 'order_id', type: 'string' }], + joins: [], + measures: [], + usage: { + narrative: 'Analysts inspect paid and refunded order lifecycle trends by customer segment.', + frequencyTier: 'high', + commonFilters: ['status', 'created_at'], + commonGroupBys: ['customer_segment'], + commonJoins: [{ table: 'public.customers', on: ['customer_id'] }], + staleSince: '2026-05-01T00:00:00.000Z', + }, + }; + + const text = buildSemanticLayerSourceSearchText(source); + + expect(text).toContain('usage: Analysts inspect paid and refunded order lifecycle trends by customer segment.'); + expect(text).toContain('frequency: high'); + expect(text).toContain('commonly filtered by: status, created_at'); + expect(text).toContain('commonly grouped by: customer_segment'); + expect(text).toContain('commonly joined to public.customers on customer_id'); + expect(text).toContain('stale since 2026-05-01T00:00:00.000Z'); + }); + + it('preserves FTS snippets returned by the source index', async () => { + const service = new SlSearchService( + { + maxBatchSize: 16, + computeEmbedding: vi.fn(async () => [1, 0]), + computeEmbeddingsBulk: vi.fn(), + }, + { + upsertSources: vi.fn(), + getExistingSearchTexts: vi.fn(), + deleteStale: vi.fn(), + deleteByConnection: vi.fn(), + deleteByConnectionAndName: vi.fn(), + search: vi.fn(async () => [ + { + sourceName: 'orders', + rrfScore: 0.75, + snippet: 'usage: paid order lifecycle', + }, + ]), + }, + ); + + await expect(service.search('warehouse', 'order lifecycle', 10)).resolves.toEqual([ + { + sourceName: 'orders', + score: 0.75, + snippet: 'usage: paid order lifecycle', + }, + ]); + }); }); diff --git a/packages/context/src/sl/sl-search.service.ts b/packages/context/src/sl/sl-search.service.ts index 47743ae1..68ae1557 100644 --- a/packages/context/src/sl/sl-search.service.ts +++ b/packages/context/src/sl/sl-search.service.ts @@ -71,6 +71,24 @@ export function buildSemanticLayerSourceSearchText( } } + if (source.usage) { + const usage = 
source.usage; + parts.push(`usage: ${usage.narrative}`); + parts.push(`frequency: ${usage.frequencyTier}`); + if (usage.commonFilters.length > 0) { + parts.push(`commonly filtered by: ${usage.commonFilters.join(', ')}`); + } + if (usage.commonGroupBys?.length) { + parts.push(`commonly grouped by: ${usage.commonGroupBys.join(', ')}`); + } + for (const join of usage.commonJoins) { + parts.push(`commonly joined to ${join.table} on ${join.on.join(',')}`); + } + if (usage.staleSince) { + parts.push(`stale since ${usage.staleSince}`); + } + } + return parts.join('. '); } @@ -150,7 +168,7 @@ export class SlSearchService { query: string, limit = 15, minRrfScore = 0, - ): Promise> { + ): Promise> { let queryEmbedding: number[] | null = null; try { queryEmbedding = await this.embeddingService.computeEmbedding(query); @@ -161,7 +179,11 @@ export class SlSearchService { } const results = await this.slSourcesRepository.search(connectionId, queryEmbedding, query, limit, minRrfScore); - return results.map((r) => ({ sourceName: r.sourceName, score: r.rrfScore })); + return results.map((result) => ({ + sourceName: result.sourceName, + score: result.rrfScore, + ...(result.snippet ? 
{ snippet: result.snippet } : {}), + })); } buildSearchText(source: SemanticLayerSource, priority: string[] = DEFAULT_PRIORITY): string { diff --git a/packages/context/src/sl/sqlite-sl-sources-index.test.ts b/packages/context/src/sl/sqlite-sl-sources-index.test.ts index 01f1be37..18258000 100644 --- a/packages/context/src/sl/sqlite-sl-sources-index.test.ts +++ b/packages/context/src/sl/sqlite-sl-sources-index.test.ts @@ -17,7 +17,7 @@ describe('SqliteSlSourcesIndex', () => { await rm(tempDir, { recursive: true, force: true }); }); - it('creates SQLite tables and searches indexed source text', async () => { + it('creates SQLite tables and searches indexed source text with FTS snippets', async () => { const index = new SqliteSlSourcesIndex({ dbPath }); await index.upsertSources('warehouse', [ @@ -34,10 +34,24 @@ describe('SqliteSlSourcesIndex', () => { ]); await expect(access(dbPath)).resolves.toBeUndefined(); - expect(await index.search('warehouse', null, 'gross revenue', 10)).toEqual([ + + const directResults = await index.search('warehouse', null, 'gross revenue', 10); + expect(directResults).toEqual([ expect.objectContaining({ sourceName: 'orders', rrfScore: expect.any(Number), + snippet: expect.stringContaining(''), + }), + ]); + expect(directResults[0]?.snippet).toContain('revenue'); + + const lexicalCandidates = await index.searchLexicalCandidates({ queryText: 'gross revenue', limit: 10 }); + expect(lexicalCandidates).toEqual([ + expect.objectContaining({ + id: 'warehouse/orders', + connectionId: 'warehouse', + sourceName: 'orders', + snippet: expect.stringContaining(''), }), ]); }); diff --git a/packages/context/src/sl/sqlite-sl-sources-index.ts b/packages/context/src/sl/sqlite-sl-sources-index.ts index a5000976..f53c8eef 100644 --- a/packages/context/src/sl/sqlite-sl-sources-index.ts +++ b/packages/context/src/sl/sqlite-sl-sources-index.ts @@ -19,6 +19,7 @@ type SearchRow = { connection_id?: string; source_name: string; rank: number; + snippet?: string | 
null; }; export interface SlSqliteLaneCandidate { @@ -27,6 +28,7 @@ export interface SlSqliteLaneCandidate { sourceName: string; rank: number; rawScore: number; + snippet?: string; } export interface SlSqliteDictionaryCandidate extends SlSqliteLaneCandidate { @@ -334,7 +336,11 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort { const rows = this.db .prepare( ` - SELECT connection_id, source_name, bm25(local_sl_sources_fts) AS rank + SELECT + connection_id, + source_name, + bm25(local_sl_sources_fts) AS rank, + snippet(local_sl_sources_fts, 2, '', '', '...', 12) AS snippet FROM local_sl_sources_fts WHERE local_sl_sources_fts MATCH ? ${connectionPredicate} @@ -350,6 +356,7 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort { sourceName: row.source_name, rank: index + 1, rawScore: Number(row.rank), + ...(typeof row.snippet === 'string' && row.snippet.length > 0 ? { snippet: row.snippet } : {}), })); } @@ -499,7 +506,7 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort { queryText: string, limit: number, minRrfScore = 0, - ): Promise> { + ): Promise> { const ftsQuery = normalizeFtsQuery(queryText); if (!ftsQuery) { return []; @@ -508,7 +515,10 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort { const rows = this.db .prepare( ` - SELECT source_name, bm25(local_sl_sources_fts) AS rank + SELECT + source_name, + bm25(local_sl_sources_fts) AS rank, + snippet(local_sl_sources_fts, 2, '', '', '...', 12) AS snippet FROM local_sl_sources_fts WHERE connection_id = ? AND local_sl_sources_fts MATCH ? @@ -519,7 +529,11 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort { .all(connectionId, ftsQuery, Math.max(1, limit)) as SearchRow[]; return rows - .map((row) => ({ sourceName: row.source_name, rrfScore: scoreFromRank(row.rank) })) + .map((row) => ({ + sourceName: row.source_name, + rrfScore: scoreFromRank(row.rank), + ...(typeof row.snippet === 'string' && row.snippet.length > 0 ? 
{ snippet: row.snippet } : {}), + })) .filter((row) => row.rrfScore >= minRrfScore); } diff --git a/packages/context/src/sl/types.ts b/packages/context/src/sl/types.ts index ff0334c1..7f153c58 100644 --- a/packages/context/src/sl/types.ts +++ b/packages/context/src/sl/types.ts @@ -1,3 +1,5 @@ +import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js'; + export interface SemanticLayerSource { name: string; descriptions?: Record; @@ -42,6 +44,7 @@ export interface SemanticLayerSource { default_time_dimension?: { dbt?: string }; tags?: { dbt?: string[] }; freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } }; + usage?: TableUsageOutput; } export interface SemanticLayerQueryInput { diff --git a/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts b/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts index f9bf513b..6e22fd47 100644 --- a/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts +++ b/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts @@ -45,6 +45,85 @@ describe('createHttpSqlAnalysisPort', () => { }); }); + it('calls the SQL batch endpoint and maps snake_case response fields into a Map', async () => { + const requestJson = vi.fn(async () => ({ + results: { + orders: { + tables_touched: ['public.orders', 'public.customers'], + columns_by_clause: { + select: ['status'], + where: ['created_at'], + join: ['customer_id', 'id'], + }, + error: null, + }, + broken: { + tables_touched: [], + columns_by_clause: {}, + error: 'Invalid expression / Unexpected token', + }, + }, + })); + const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson }); + + await expect( + port.analyzeBatch( + [ + { id: 'orders', sql: 'select status from public.orders' }, + { id: 'broken', sql: 'select * from where' }, + ], + 'postgres', + ), + ).resolves.toEqual( + new Map([ + [ + 'orders', + { + tablesTouched: ['public.orders', 'public.customers'], + columnsByClause: 
{ + select: ['status'], + where: ['created_at'], + join: ['customer_id', 'id'], + }, + error: null, + }, + ], + [ + 'broken', + { + tablesTouched: [], + columnsByClause: {}, + error: 'Invalid expression / Unexpected token', + }, + ], + ]), + ); + + expect(requestJson).toHaveBeenCalledWith('/sql/analyze-batch', { + dialect: 'postgres', + items: [ + { id: 'orders', sql: 'select status from public.orders' }, + { id: 'broken', sql: 'select * from where' }, + ], + }); + }); + + it('rejects malformed SQL batch responses instead of inventing defaults', async () => { + const requestJson = vi.fn(async () => ({ + results: { + orders: { + tables_touched: ['public.orders'], + columns_by_clause: { select: ['status'], where: [42] }, + error: null, + }, + }, + })); + const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson }); + + await expect(port.analyzeBatch([{ id: 'orders', sql: 'select status from public.orders' }], 'postgres')).rejects + .toThrow('sql analysis response is missing string[] field columns_by_clause.where'); + }); + it('rejects malformed daemon responses instead of inventing defaults', async () => { const requestJson = vi.fn(async () => ({ fingerprint: 'abc', diff --git a/packages/context/src/sql-analysis/http-sql-analysis-port.ts b/packages/context/src/sql-analysis/http-sql-analysis-port.ts index a26d69e4..9da37556 100644 --- a/packages/context/src/sql-analysis/http-sql-analysis-port.ts +++ b/packages/context/src/sql-analysis/http-sql-analysis-port.ts @@ -2,6 +2,8 @@ import { request as httpRequest } from 'node:http'; import { request as httpsRequest } from 'node:https'; import { URL } from 'node:url'; import type { + SqlAnalysisBatchItem, + SqlAnalysisBatchResult, SqlAnalysisDialect, SqlAnalysisFingerprintResult, SqlAnalysisLiteralSlot, @@ -94,6 +96,14 @@ function requiredStringArray(raw: Record, field: string): strin return value; } +function requiredObject(raw: Record, field: string): Record { + const value = raw[field]; + if 
(!value || typeof value !== 'object' || Array.isArray(value)) { + throw new Error(`sql analysis response is missing object field ${field}`); + } + return value as Record; +} + function isLiteralSlotType(value: unknown): value is SqlAnalysisLiteralSlotType { return ( value === 'string' || @@ -144,6 +154,39 @@ function mapResult(raw: Record): SqlAnalysisFingerprintResult { }; } +function mapColumnsByClause(raw: Record): SqlAnalysisBatchResult['columnsByClause'] { + const value = requiredObject(raw, 'columns_by_clause'); + const result: SqlAnalysisBatchResult['columnsByClause'] = {}; + for (const [clause, columns] of Object.entries(value)) { + if (!Array.isArray(columns) || columns.some((item) => typeof item !== 'string')) { + throw new Error(`sql analysis response is missing string[] field columns_by_clause.${clause}`); + } + result[clause] = columns; + } + return result; +} + +function mapBatchResult(raw: Record): SqlAnalysisBatchResult { + const error = optionalString(raw, 'error'); + return { + tablesTouched: requiredStringArray(raw, 'tables_touched'), + columnsByClause: mapColumnsByClause(raw), + ...(error !== undefined ? { error } : {}), + }; +} + +function mapBatchResponse(raw: Record): Map { + const results = requiredObject(raw, 'results'); + return new Map( + Object.entries(results).map(([id, value]) => { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + throw new Error(`sql analysis response contains invalid batch result ${id}`); + } + return [id, mapBatchResult(value as Record)]; + }), + ); +} + export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions): SqlAnalysisPort { const requestJson = options.requestJson ?? 
postJson(options.baseUrl); @@ -155,5 +198,12 @@ export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions): }); return mapResult(raw); }, + async analyzeBatch(items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect) { + const raw = await requestJson('/sql/analyze-batch', { + dialect, + items, + }); + return mapBatchResponse(raw); + }, }; } diff --git a/packages/context/src/sql-analysis/index.ts b/packages/context/src/sql-analysis/index.ts index 89e3ada9..8338b822 100644 --- a/packages/context/src/sql-analysis/index.ts +++ b/packages/context/src/sql-analysis/index.ts @@ -1,6 +1,9 @@ export { createHttpSqlAnalysisPort } from './http-sql-analysis-port.js'; export type { HttpSqlAnalysisPortOptions, KtxSqlAnalysisHttpJsonRunner } from './http-sql-analysis-port.js'; export type { + SqlAnalysisBatchItem, + SqlAnalysisBatchResult, + SqlAnalysisClause, SqlAnalysisDialect, SqlAnalysisFingerprintResult, SqlAnalysisLiteralSlot, diff --git a/packages/context/src/sql-analysis/ports.ts b/packages/context/src/sql-analysis/ports.ts index 69b15780..3361a7c4 100644 --- a/packages/context/src/sql-analysis/ports.ts +++ b/packages/context/src/sql-analysis/ports.ts @@ -25,6 +25,23 @@ export interface SqlAnalysisFingerprintResult { error?: string | null; } +export type SqlAnalysisClause = 'select' | 'where' | 'join' | 'groupBy' | 'having' | 'orderBy' | (string & {}); + +export interface SqlAnalysisBatchItem { + id: string; + sql: string; +} + +export interface SqlAnalysisBatchResult { + tablesTouched: string[]; + columnsByClause: Partial>; + error?: string | null; +} + export interface SqlAnalysisPort { analyzeForFingerprint(sql: string, dialect: SqlAnalysisDialect): Promise; + analyzeBatch( + items: SqlAnalysisBatchItem[], + dialect: SqlAnalysisDialect, + ): Promise>; } diff --git a/packages/context/src/wiki/types.ts b/packages/context/src/wiki/types.ts index cd11d49b..317b17ab 100644 --- a/packages/context/src/wiki/types.ts +++ b/packages/context/src/wiki/types.ts 
@@ -24,6 +24,7 @@ export interface WikiFrontmatter { representative_sql?: string; usage?: HistoricSqlWikiUsageFrontmatter; fingerprints?: string[]; + stale_since?: string; } export interface WikiPage { diff --git a/python/ktx-daemon/src/ktx_daemon/app.py b/python/ktx-daemon/src/ktx_daemon/app.py index 272b0c24..76325719 100644 --- a/python/ktx-daemon/src/ktx_daemon/app.py +++ b/python/ktx-daemon/src/ktx_daemon/app.py @@ -48,6 +48,11 @@ from ktx_daemon.source_generation import ( GenerateSourcesResponse, generate_sources_response, ) +from ktx_daemon.sql_analysis import ( + AnalyzeSqlBatchRequest, + AnalyzeSqlBatchResponse, + analyze_sql_batch_response, +) from ktx_daemon.table_identifier import ( ParseTableIdentifierBatchRequest, ParseTableIdentifierBatchResponse, @@ -193,6 +198,19 @@ def create_app( detail=f"Table identifier parsing failed: {error}", ) from error + @app.post("/sql/analyze-batch", response_model=AnalyzeSqlBatchResponse) + async def sql_analyze_batch( + request: AnalyzeSqlBatchRequest, + ) -> AnalyzeSqlBatchResponse: + try: + return analyze_sql_batch_response(request) + except Exception as error: + logger.exception("SQL batch analysis failed: %s", error) + raise HTTPException( + status_code=500, + detail=f"SQL batch analysis failed: {error}", + ) from error + @app.post( "/semantic-layer/generate-sources", response_model=GenerateSourcesResponse ) diff --git a/python/ktx-daemon/src/ktx_daemon/sql_analysis.py b/python/ktx-daemon/src/ktx_daemon/sql_analysis.py new file mode 100644 index 00000000..9a222098 --- /dev/null +++ b/python/ktx-daemon/src/ktx_daemon/sql_analysis.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import os +from concurrent.futures import ProcessPoolExecutor +from typing import Literal + +import sqlglot +from pydantic import BaseModel, ConfigDict, Field +from sqlglot import exp + +SqlAnalysisClause = Literal["select", "where", "join", "groupBy", "having", "orderBy"] + + +class AnalyzeSqlBatchItem(BaseModel): + id: str + 
sql: str + + +class AnalyzeSqlBatchRequest(BaseModel): + dialect: str + items: list[AnalyzeSqlBatchItem] + max_workers: int | None = Field(default=None, ge=1, le=32) + + +class AnalyzeSqlBatchResult(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + tables_touched: list[str] = Field(default_factory=list) + columns_by_clause: dict[SqlAnalysisClause, list[str]] = Field(default_factory=dict) + error: str | None = None + + +class AnalyzeSqlBatchResponse(BaseModel): + results: dict[str, AnalyzeSqlBatchResult] + + +def _ordered_unique(values: list[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for value in values: + if value and value not in seen: + seen.add(value) + result.append(value) + return result + + +def _table_ref(table: exp.Table) -> str: + parts: list[str] = [] + catalog = table.args.get("catalog") + db = table.args.get("db") + if catalog is not None and getattr(catalog, "name", None): + parts.append(str(catalog.name)) + if db is not None and getattr(db, "name", None): + parts.append(str(db.name)) + if table.name: + parts.append(str(table.name)) + return ".".join(parts) + + +def _column_name(column: exp.Column) -> str: + return str(column.name) + + +def _columns_from_nodes(nodes: list[object]) -> list[str]: + names: list[str] = [] + for node in nodes: + if not isinstance(node, exp.Expression): + continue + names.extend(_column_name(column) for column in node.find_all(exp.Column)) + return _ordered_unique(names) + + +def _columns_by_clause(tree: exp.Expression) -> dict[SqlAnalysisClause, list[str]]: + result: dict[SqlAnalysisClause, list[str]] = {} + + select_columns = _columns_from_nodes(list(tree.expressions)) + if select_columns: + result["select"] = select_columns + + where_columns = _columns_from_nodes([tree.args.get("where")]) + if where_columns: + result["where"] = where_columns + + join_columns = _columns_from_nodes( + [join.args.get("on") for join in tree.args.get("joins") or []] + ) + if join_columns: + 
result["join"] = join_columns + + group = tree.args.get("group") + group_columns = _columns_from_nodes( + list(group.expressions) if group is not None else [] + ) + if group_columns: + result["groupBy"] = group_columns + + having_columns = _columns_from_nodes([tree.args.get("having")]) + if having_columns: + result["having"] = having_columns + + order = tree.args.get("order") + order_columns = _columns_from_nodes( + list(order.expressions) if order is not None else [] + ) + if order_columns: + result["orderBy"] = order_columns + + return result + + +def _analyze_one( + item_id: str, sql: str, dialect: str +) -> tuple[str, AnalyzeSqlBatchResult]: + try: + tree = sqlglot.parse_one(sql, read=dialect) + except sqlglot.errors.SqlglotError as exc: + return item_id, AnalyzeSqlBatchResult(error=str(exc)) + + cte_names = {cte.alias_or_name.lower() for cte in tree.find_all(exp.CTE)} + table_refs = [ + table_ref + for table_ref in (_table_ref(table) for table in tree.find_all(exp.Table)) + if table_ref and table_ref.split(".")[-1].lower() not in cte_names + ] + + return item_id, AnalyzeSqlBatchResult( + tables_touched=_ordered_unique(table_refs), + columns_by_clause=_columns_by_clause(tree), + error=None, + ) + + +def _analyze_payload(payload: tuple[str, str, str]) -> tuple[str, AnalyzeSqlBatchResult]: + item_id, sql, dialect = payload + return _analyze_one(item_id, sql, dialect) + + +def _worker_count(request: AnalyzeSqlBatchRequest) -> int: + if len(request.items) <= 1: + return 1 + if request.max_workers is not None: + return min(request.max_workers, len(request.items)) + return min(os.cpu_count() or 1, len(request.items), 8) + + +def analyze_sql_batch_response( + request: AnalyzeSqlBatchRequest, +) -> AnalyzeSqlBatchResponse: + payloads = [(item.id, item.sql, request.dialect) for item in request.items] + if _worker_count(request) == 1: + analyzed = [_analyze_payload(payload) for payload in payloads] + else: + with ProcessPoolExecutor(max_workers=_worker_count(request)) as 
executor: + analyzed = list(executor.map(_analyze_payload, payloads)) + + return AnalyzeSqlBatchResponse( + results={item_id: result for item_id, result in analyzed} + ) diff --git a/python/ktx-daemon/tests/test_app.py b/python/ktx-daemon/tests/test_app.py index cd5c4f16..eb2c3d68 100644 --- a/python/ktx-daemon/tests/test_app.py +++ b/python/ktx-daemon/tests/test_app.py @@ -280,6 +280,37 @@ def test_sql_parse_table_identifier_endpoint() -> None: assert body["results"]["template"]["reason"] == "looker_template_unresolved" +def test_sql_analyze_batch_endpoint_returns_per_item_results() -> None: + client = TestClient(create_app()) + + response = client.post( + "/sql/analyze-batch", + json={ + "dialect": "postgres", + "max_workers": 1, + "items": [ + { + "id": "orders", + "sql": "select status from public.orders where created_at is not null", + }, + {"id": "broken", "sql": "select * from where"}, + ], + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["results"]["orders"]["tables_touched"] == ["public.orders"] + assert body["results"]["orders"]["columns_by_clause"] == { + "select": ["status"], + "where": ["created_at"], + } + assert body["results"]["orders"]["error"] is None + assert body["results"]["broken"]["tables_touched"] == [] + assert body["results"]["broken"]["columns_by_clause"] == {} + assert body["results"]["broken"]["error"] is not None + + def test_semantic_query_endpoint_returns_sql() -> None: client = TestClient(create_app()) diff --git a/python/ktx-daemon/tests/test_sql_analysis.py b/python/ktx-daemon/tests/test_sql_analysis.py new file mode 100644 index 00000000..c1fc35f8 --- /dev/null +++ b/python/ktx-daemon/tests/test_sql_analysis.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from ktx_daemon.sql_analysis import ( + AnalyzeSqlBatchItem, + AnalyzeSqlBatchRequest, + _columns_from_nodes, + analyze_sql_batch_response, +) + + +def test_analyze_sql_batch_extracts_tables_and_clause_columns() -> None: + 
response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="postgres", + items=[ + AnalyzeSqlBatchItem( + id="orders_by_customer", + sql=( + "select o.status, count(*) " + "from public.orders o " + "join public.customers c on o.customer_id = c.id " + "where o.created_at >= current_date - interval '30 day' " + "group by o.status" + ), + ) + ], + max_workers=1, + ) + ) + + result = response.results["orders_by_customer"] + assert result.error is None + assert result.tables_touched == ["public.orders", "public.customers"] + assert result.columns_by_clause == { + "select": ["status"], + "where": ["created_at"], + "join": ["customer_id", "id"], + "groupBy": ["status"], + } + + +def test_analyze_sql_batch_returns_per_item_parse_errors() -> None: + response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="postgres", + items=[AnalyzeSqlBatchItem(id="broken", sql="select * from where")], + max_workers=1, + ) + ) + + result = response.results["broken"] + assert result.tables_touched == [] + assert result.columns_by_clause == {} + assert result.error is not None + + +def test_columns_from_nodes_ignores_non_expression_clause_values() -> None: + assert _columns_from_nodes([True, False, None]) == [] diff --git a/scripts/examples-docs.test.mjs b/scripts/examples-docs.test.mjs index b9f63d65..24c83452 100644 --- a/scripts/examples-docs.test.mjs +++ b/scripts/examples-docs.test.mjs @@ -68,12 +68,18 @@ describe('standalone example docs', () => { const smoke = await readText('examples/postgres-historic/scripts/smoke.sh'); assert.match(examples, /postgres-historic/); - assert.match(examples, /pg_stat_statements/); + assert.match(examples, /unified Historic SQL artifacts/); assert.match(readme, /--enable-historic-sql/); - assert.match(readme, /--historic-sql-min-calls 2/); + assert.match(readme, /--historic-sql-min-executions 2/); assert.match(readme, /ktx dev doctor --project-dir/); assert.match(readme, /Postgres Historic SQL/); - assert.match(readme, 
/dev ingest run/); + assert.match(readme, /manifest\.json/); + assert.match(readme, /tables\/\*\.json/); + assert.match(readme, /patterns-input\.json/); + assert.match(readme, /patterns-input\/part-\*\.json/); + assert.match(readme, /full audit input/); + assert.match(readme, /bounded pattern WorkUnit shards/); + assert.match(readme, /workUnitCount: 0/); assert.match(compose, /postgres:14/); assert.match(compose, /shared_preload_libraries=pg_stat_statements/); assert.match(compose, /pg_stat_statements.track=top/); @@ -82,7 +88,13 @@ describe('standalone example docs', () => { assert.match(workload, /JOIN customers/); assert.match(workload, /app_user/); assert.match(workload, /etl_user/); - assert.match(smoke, /pg_stat_statements_reset/); + assert.match(smoke, /assert_unified_snapshot/); + assert.match(smoke, /assert_stage_record "\$UNCHANGED_RECORD" unchanged zero/); + assert.match(smoke, /assertPatternShards/); + assert.match(smoke, /historic-sql-patterns-part-/); + assert.match(smoke, /patterns-input\/part-/); + assert.doesNotMatch(smoke, new RegExp(["unitKey === 'historic", 'sql', "patterns'"].join('-'))); + assert.match(smoke, /--historic-sql-min-executions 2/); assert.match(smoke, /KTX_RUNTIME_ROOT/); assert.match(smoke, /managedDaemon/); assert.match(smoke, /installPolicy: 'auto'/); @@ -91,13 +103,36 @@ describe('standalone example docs', () => { assert.doesNotMatch(smoke, /PYTHON_SERVICE/); assert.doesNotMatch(smoke, /uvicorn app\.main:app/); assert.doesNotMatch(smoke, /export KTX_SQL_ANALYSIS_URL/); + assert.doesNotMatch( + smoke, + new RegExp( + [ + ['baseline', 'FirstRun'], + ['de', 'graded'], + ['stats', 'ResetAt'], + ['assert', '_manifest'], + ] + .map((parts) => parts.join('')) + .join('|'), + ), + ); assert.doesNotMatch(readme, /python-service/); assert.doesNotMatch(readme, /KTX_SQL_ANALYSIS_URL/); - assert.match(smoke, /assert_manifest "\$FIRST_MANIFEST" true/); - assert.match(smoke, /assert_manifest "\$SECOND_MANIFEST" false/); - assert.match(smoke, 
/assert_manifest "\$RESET_MANIFEST" true/); - assert.doesNotMatch(readme, /python-service/); - assert.doesNotMatch(smoke, /python-service|PYTHON_SERVICE|REPO_ROOT/); + assert.doesNotMatch( + readme, + new RegExp( + [ + ['baseline', 'FirstRun'], + ['de', 'graded: true'], + ['stats', 'ResetAt'], + ['fresh PGSS', ' baseline'], + ['delta', '-only'], + ] + .map((parts) => parts.join('')) + .join('|'), + ), + ); + assert.doesNotMatch(readme, /--historic-sql-min-calls/); }); it('lists every published TypeScript package in the package root README', async () => {