diff --git a/docs-site/content/docs/configuration/ktx-yaml.mdx b/docs-site/content/docs/configuration/ktx-yaml.mdx index 4008a45d..2220814a 100644 --- a/docs-site/content/docs/configuration/ktx-yaml.mdx +++ b/docs-site/content/docs/configuration/ktx-yaml.mdx @@ -157,6 +157,12 @@ connections: dataset_ids: [analytics, mart] ``` +For Snowflake connections, set `maxSessions` when deep ingest needs more or +fewer concurrent warehouse sessions. The default is `4`. This caps all +concurrent Snowflake SQL work for that connector instance, including schema +introspection, table sampling, relationship profiling, relationship +validation, and read-only SQL execution. + For Postgres, BigQuery, and Snowflake, `historicSql` and `context.queryHistory` toggle query-history ingest. The shape is connector-specific; the setup wizard writes these fields when you pass `--enable-query-history`. @@ -483,6 +489,7 @@ scan: maxLlmTablesPerBatch: 40 maxCandidatesPerColumn: 25 profileSampleRows: 10000 + profileConcurrency: 4 validationConcurrency: 4 validationBudget: all ``` @@ -510,6 +517,7 @@ the manifest. | `relationships.maxLlmTablesPerBatch` | `int > 0` | `40` | Max tables included in a single LLM relationship-proposal batch. | | `relationships.maxCandidatesPerColumn` | `int > 0` | `25` | Max join partners considered per column. | | `relationships.profileSampleRows` | `int > 0` | `10000` | Rows sampled per table when profiling values for relationship inference. | +| `relationships.profileConcurrency` | `int > 0` | `4` | Parallel relationship-profile queries against the database. For Snowflake, effective database concurrency is also bounded by the connection's `maxSessions`. | | `relationships.validationConcurrency` | `int > 0` | `4` | Parallel relationship validation queries against the database. | | `relationships.validationBudget` | `all` \| `int ≥ 0` | runtime default | Cap on validation queries per scan. `all` means unlimited. | diff --git a/docs-site/content/docs/integrations/primary-sources.mdx b/docs-site/content/docs/integrations/primary-sources.mdx index 5e9483f9..81b8d400 100644 --- a/docs-site/content/docs/integrations/primary-sources.mdx +++ b/docs-site/content/docs/integrations/primary-sources.mdx @@ -129,20 +129,18 @@ connections: account: xy12345 warehouse: ANALYTICS_WH database: PROD - schema_name: PUBLIC + schema_names: + - PUBLIC + - SALES + - MARKETING username: KTX_SERVICE password: env:SNOWFLAKE_PASSWORD role: ANALYST ``` -For multiple schemas: - -```yaml - schema_names: - - PUBLIC - - ANALYTICS - - STAGING -``` +`ktx setup` discovers schemas after the connection is verified and writes the +selected list to `schema_names`. You can also set this field manually. For a +single schema, `schema_name: PUBLIC` is accepted as an equivalent shorthand. ### Authentication diff --git a/packages/cli/src/connectors/bigquery/connector.test.ts b/packages/cli/src/connectors/bigquery/connector.test.ts index c517100a..be65af1e 100644 --- a/packages/cli/src/connectors/bigquery/connector.test.ts +++ b/packages/cli/src/connectors/bigquery/connector.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it, vi } from 'vitest'; import { bigQueryConnectionConfigFromConfig, isKtxBigQueryConnectionConfig, type KtxBigQueryClient, KtxBigQueryScanConnector, type KtxBigQueryClientFactory, type KtxBigQueryDataset, type KtxBigQueryQueryJob, type KtxBigQueryTableRef } from '../../connectors/bigquery/connector.js'; import { createBigQueryLiveDatabaseIntrospection } from '../../connectors/bigquery/live-database-introspection.js'; +import { tableRefSet } from '../../context/scan/table-ref.js'; function fakeClientFactory(): KtxBigQueryClientFactory { const queryResults = vi.fn(async (): ReturnType => [ @@ -234,6 +235,59 @@ describe('KtxBigQueryScanConnector', () => { await connector.cleanup(); }); + it('limits introspection to tables in tableScope', async () => { + const ordersGet = vi.fn(async (): ReturnType => [ + { + metadata: { + type: 'TABLE', + numRows: '12', + schema: { fields: [{ name: 'id', type: 'INT64', mode: 'REQUIRED' }] }, + }, + }, + ]); + const skippedGet = vi.fn(async (): ReturnType => [ + { metadata: { type: 'TABLE', numRows: '1', schema: { fields: [] } } }, + ]); + const clientFactory: KtxBigQueryClientFactory = { + createClient: vi.fn(() => ({ + getDatasets: vi.fn(async (): ReturnType => [[{ id: 'analytics' }]]), + dataset: vi.fn( + (): KtxBigQueryDataset => ({ + get: vi.fn(async () => [{ id: 'analytics' }]), + getTables: vi.fn(async (): ReturnType => [ + [ + { id: 'orders', get: ordersGet }, + { id: 'customers', get: skippedGet }, + ], + ]), + }), + ), + createQueryJob: vi.fn(async (): ReturnType => [ + { + getQueryResults: async (): ReturnType => [ + [], + undefined, + { schema: { fields: [{ name: 'table_name', type: 'STRING' }, { name: 'column_name', type: 'STRING' }] } }, + ], + }, + ]), + })), + }; + const connector = new KtxBigQueryScanConnector({ + connectionId: 'warehouse', + connection, + clientFactory, + }); + const scope = tableRefSet([{ catalog: 'project-1', db: 'analytics', name: 'orders' }]); + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'bigquery', tableScope: scope }, + { runId: 'scope-test' }, + ); + expect(snapshot.tables.map((table) => table.name)).toEqual(['orders']); + expect(ordersGet).toHaveBeenCalledTimes(1); + expect(skippedGet).not.toHaveBeenCalled(); + }); + it('constructs for discovery without dataset scope and lists tables through one region information schema query', async () => { const createQueryJob = vi.fn( async ( diff --git a/packages/cli/src/connectors/bigquery/connector.ts b/packages/cli/src/connectors/bigquery/connector.ts index 6a93ccb0..7810e251 100644 --- a/packages/cli/src/connectors/bigquery/connector.ts +++ b/packages/cli/src/connectors/bigquery/connector.ts @@ -2,6 +2,7 @@ import { BigQuery, type TableField } from '@google-cloud/bigquery'; import { normalizeBigQueryProjectId, normalizeBigQueryRegion } from '../../context/connections/bigquery-identifiers.js'; import { assertReadOnlySql, limitSqlForExecution } from '../../context/connections/read-only-sql.js'; import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaColumn, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableListEntry, type KtxTableRef, type KtxTableSampleInput, type KtxTableSampleResult } from '../../context/scan/types.js'; +import { scopedTableNames } from '../../context/scan/table-ref.js'; import { readFileSync } from 'node:fs'; import { homedir } from 'node:os'; import { resolve } from 'node:path'; @@ -289,7 +290,10 @@ export class KtxBigQueryScanConnector implements KtxScanConnector { const tables: KtxSchemaTable[] = []; const datasetIds = this.requireDatasetIdsForScan(); for (const datasetId of datasetIds) { - tables.push(...(await this.introspectDataset(datasetId))); + const scopedNames = input.tableScope + ? scopedTableNames(input.tableScope, { catalog: this.resolved.projectId, db: datasetId }) + : null; + tables.push(...(await this.introspectDataset(datasetId, scopedNames))); } return { connectionId: this.connectionId, @@ -362,7 +366,7 @@ export class KtxBigQueryScanConnector implements KtxScanConnector { if (!datasetId) { return 0; } - const tables = await this.introspectDataset(datasetId); + const tables = await this.introspectDataset(datasetId, null); return tables.find((table) => table.name === tableName)?.estimatedRows ?? 0; } @@ -463,12 +467,15 @@ export class KtxBigQueryScanConnector implements KtxScanConnector { return firstNumber(rows[0]?.[header]); } - private async introspectDataset(datasetId: string): Promise { + private async introspectDataset(datasetId: string, scopedNames: readonly string[] | null): Promise { + if (scopedNames && scopedNames.length === 0) return []; const dataset = this.getClient().dataset(datasetId); const [tableRefs] = await dataset.getTables(); + const scopeSet = scopedNames ? new Set(scopedNames) : null; + const filteredTableRefs = scopeSet ? tableRefs.filter((tableRef) => scopeSet.has(tableRef.id ?? '')) : tableRefs; const primaryKeys = await this.primaryKeys(datasetId); const tables: KtxSchemaTable[] = []; - for (const tableRef of tableRefs) { + for (const tableRef of filteredTableRefs) { const tableName = tableRef.id || ''; const [table] = await tableRef.get(); const fields = table.metadata.schema?.fields ?? []; diff --git a/packages/cli/src/connectors/bigquery/live-database-introspection.ts b/packages/cli/src/connectors/bigquery/live-database-introspection.ts index 5e854b9e..4e701dc4 100644 --- a/packages/cli/src/connectors/bigquery/live-database-introspection.ts +++ b/packages/cli/src/connectors/bigquery/live-database-introspection.ts @@ -1,4 +1,7 @@ -import type { LiveDatabaseIntrospectionPort } from '../../context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from '../../context/ingest/adapters/live-database/types.js'; import type { KtxProjectConnectionConfig } from '../../context/project/config.js'; import { KtxBigQueryScanConnector, @@ -16,7 +19,7 @@ export function createBigQueryLiveDatabaseIntrospection( options: CreateBigQueryLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions) { const connection = options.connections[connectionId] as KtxBigQueryConnectionConfig | undefined; const connector = new KtxBigQueryScanConnector({ connectionId, @@ -25,7 +28,14 @@ export function createBigQueryLiveDatabaseIntrospection( now: options.now, }); try { - return await connector.introspect({ connectionId, driver: 'bigquery' }, { runId: `bigquery-${connectionId}` }); + return await connector.introspect( + { + connectionId, + driver: 'bigquery', + ...(introspectionOptions?.tableScope ? { tableScope: introspectionOptions.tableScope } : {}), + }, + { runId: `bigquery-${connectionId}` }, + ); } finally { await connector.cleanup(); } diff --git a/packages/cli/src/connectors/clickhouse/connector.test.ts b/packages/cli/src/connectors/clickhouse/connector.test.ts index a3ab11f6..abc7cad5 100644 --- a/packages/cli/src/connectors/clickhouse/connector.test.ts +++ b/packages/cli/src/connectors/clickhouse/connector.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it, vi } from 'vitest'; import { clickHouseClientConfigFromConfig, isKtxClickHouseConnectionConfig, KtxClickHouseScanConnector, type KtxClickHouseClientFactory } from '../../connectors/clickhouse/connector.js'; import { createClickHouseLiveDatabaseIntrospection } from '../../connectors/clickhouse/live-database-introspection.js'; +import { tableRefSet } from '../../context/scan/table-ref.js'; function result(payload: T) { return { @@ -238,6 +239,57 @@ describe('KtxClickHouseScanConnector', () => { ]); }); + it('limits introspection to tables in tableScope', async () => { + const queries: Array<{ query: string; query_params?: Record }> = []; + const clientFactory: KtxClickHouseClientFactory = { + createClient: vi.fn(() => ({ + query: vi.fn(async (input: { query: string; format: string; query_params?: Record }) => { + queries.push({ query: input.query, query_params: input.query_params }); + if (input.query.includes('FROM system.tables')) { + return result([{ database: 'analytics', name: 'events', engine: 'MergeTree', comment: '' }]); + } + if (input.query.includes('FROM system.columns')) { + return result([ + { + database: 'analytics', + table: 'events', + name: 'id', + type: 'UInt64', + comment: '', + is_in_primary_key: 1, + }, + ]); + } + if (input.query.includes('FROM system.parts')) { + return result([{ database: 'analytics', table: 'events', row_count: '2' }]); + } + throw new Error(`Unexpected SQL: ${input.query}`); + }), + close: vi.fn(async () => undefined), + })), + }; + const connector = new KtxClickHouseScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'clickhouse', + host: 'ch.example.test', + database: 'analytics', + username: 'reader', + password: 'test-pass', // pragma: allowlist secret + }, + clientFactory, + }); + const scope = tableRefSet([{ catalog: null, db: 'analytics', name: 'events' }]); + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'clickhouse', tableScope: scope }, + { runId: 'scope-test' }, + ); + expect(snapshot.tables.map((table) => table.name)).toEqual(['events']); + const tablesQuery = queries.find((query) => query.query.includes('FROM system.tables')); + expect(tablesQuery?.query).toContain('AND name IN {table_names:Array(String)}'); + expect(tablesQuery?.query_params).toEqual({ databases: ['analytics'], table_names: ['events'] }); + }); + it('runs samples, distinct values, read-only SQL, row count, schema list, and cleanup', async () => { const clientFactory = fakeClientFactory(); const connector = new KtxClickHouseScanConnector({ diff --git a/packages/cli/src/connectors/clickhouse/connector.ts b/packages/cli/src/connectors/clickhouse/connector.ts index 1d851001..a2ee568c 100644 --- a/packages/cli/src/connectors/clickhouse/connector.ts +++ b/packages/cli/src/connectors/clickhouse/connector.ts @@ -1,6 +1,7 @@ import { createClient } from '@clickhouse/client'; import { assertReadOnlySql, limitSqlForExecution } from '../../context/connections/read-only-sql.js'; import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaColumn, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableRef, type KtxTableSampleInput, type KtxTableListEntry, type KtxTableSampleResult } from '../../context/scan/types.js'; +import { scopedTableNames } from '../../context/scan/table-ref.js'; import { readFileSync } from 'node:fs'; import { Agent as HttpsAgent } from 'node:https'; import { homedir } from 'node:os'; @@ -285,24 +286,42 @@ export class KtxClickHouseScanConnector implements KtxScanConnector { async introspect(input: KtxScanInput, _ctx: KtxScanContext): Promise { this.assertConnection(input.connectionId); const databases = configuredClickHouseDatabases(this.connection, this.clientConfig.database); + let allScopedTables: string[] | null = null; + if (input.tableScope) { + allScopedTables = []; + for (const database of databases) { + allScopedTables.push(...scopedTableNames(input.tableScope, { catalog: null, db: database })); + } + if (allScopedTables.length === 0) { + return this.emptySnapshot(databases); + } + } + const queryParams: Record = { databases }; + const tableNameClause = allScopedTables ? 'AND name IN {table_names:Array(String)}' : ''; + const columnTableNameClause = allScopedTables ? 'AND table IN {table_names:Array(String)}' : ''; + if (allScopedTables) { + queryParams.table_names = allScopedTables; + } const tables = await this.queryEachRow( ` SELECT database, name, engine, comment FROM system.tables WHERE database IN {databases:Array(String)} AND engine NOT IN ('Dictionary') + ${tableNameClause} ORDER BY database, name `, - { databases }, + queryParams, ); const columns = await this.queryEachRow( ` SELECT database, table, name, type, comment, is_in_primary_key FROM system.columns WHERE database IN {databases:Array(String)} + ${columnTableNameClause} ORDER BY database, table, position `, - { databases }, + queryParams, ); const rowCounts = await this.queryEachRow( ` @@ -310,9 +329,10 @@ export class KtxClickHouseScanConnector implements KtxScanConnector { FROM system.parts WHERE database IN {databases:Array(String)} AND active = 1 + ${columnTableNameClause} GROUP BY database, table `, - { databases }, + queryParams, ); const columnsByTable = new Map(); for (const column of columns) { @@ -347,6 +367,23 @@ export class KtxClickHouseScanConnector implements KtxScanConnector { }; } + private emptySnapshot(databases: string[]): KtxSchemaSnapshot { + return { + connectionId: this.connectionId, + driver: 'clickhouse', + extractedAt: this.now().toISOString(), + scope: { schemas: databases }, + metadata: { + database: this.clientConfig.database, + databases, + host: this.clientConfig.host, + table_count: 0, + total_columns: 0, + }, + tables: [], + }; + } + async sampleTable(input: KtxTableSampleInput, _ctx: KtxScanContext): Promise { this.assertConnection(input.connectionId); const result = await this.query( diff --git a/packages/cli/src/connectors/clickhouse/live-database-introspection.ts b/packages/cli/src/connectors/clickhouse/live-database-introspection.ts index 1e0ec918..74f9475d 100644 --- a/packages/cli/src/connectors/clickhouse/live-database-introspection.ts +++ b/packages/cli/src/connectors/clickhouse/live-database-introspection.ts @@ -1,4 +1,7 @@ -import type { LiveDatabaseIntrospectionPort } from '../../context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from '../../context/ingest/adapters/live-database/types.js'; import type { KtxProjectConnectionConfig } from '../../context/project/config.js'; import { KtxClickHouseScanConnector, @@ -18,7 +21,7 @@ export function createClickHouseLiveDatabaseIntrospection( options: CreateClickHouseLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions) { const connection = options.connections[connectionId] as KtxClickHouseConnectionConfig | undefined; const connector = new KtxClickHouseScanConnector({ connectionId, @@ -29,7 +32,11 @@ export function createClickHouseLiveDatabaseIntrospection( }); try { return await connector.introspect( - { connectionId, driver: 'clickhouse' }, + { + connectionId, + driver: 'clickhouse', + ...(introspectionOptions?.tableScope ? { tableScope: introspectionOptions.tableScope } : {}), + }, { runId: `clickhouse-${connectionId}` }, ); } finally { diff --git a/packages/cli/src/connectors/mysql/connector.test.ts b/packages/cli/src/connectors/mysql/connector.test.ts index f9f2d0ad..5a21ada7 100644 --- a/packages/cli/src/connectors/mysql/connector.test.ts +++ b/packages/cli/src/connectors/mysql/connector.test.ts @@ -2,6 +2,7 @@ import { describe, expect, it, vi } from 'vitest'; import type { FieldPacket, RowDataPacket } from 'mysql2/promise'; import { createMysqlLiveDatabaseIntrospection } from '../../connectors/mysql/live-database-introspection.js'; import { isKtxMysqlConnectionConfig, KtxMysqlScanConnector, mysqlConnectionPoolConfigFromConfig, type KtxMysqlPoolFactory } from '../../connectors/mysql/connector.js'; +import { tableRefSet } from '../../context/scan/table-ref.js'; function mysqlResult(rows: Record[], fields: Array<{ name: string; type?: number }>): [RowDataPacket[], FieldPacket[]] { return [rows as RowDataPacket[], fields as FieldPacket[]]; @@ -275,6 +276,71 @@ describe('KtxMysqlScanConnector', () => { ]); }); + it('limits introspection to tables in tableScope', async () => { + const queries: Array<{ sql: string; params?: unknown }> = []; + const poolFactory: KtxMysqlPoolFactory = { + createPool: vi.fn(() => ({ + getConnection: vi.fn(async () => ({ + query: vi.fn(async (sql: string, params?: unknown): Promise<[RowDataPacket[], FieldPacket[]]> => { + queries.push({ sql, params }); + if (sql.includes('INFORMATION_SCHEMA.TABLES')) { + return mysqlResult( + [ + { + TABLE_SCHEMA: 'analytics', + TABLE_NAME: 'orders', + TABLE_TYPE: 'BASE TABLE', + TABLE_COMMENT: '', + TABLE_ROWS: 2, + }, + ], + [], + ); + } + if (sql.includes('INFORMATION_SCHEMA.COLUMNS')) { + return mysqlResult( + [ + { + TABLE_SCHEMA: 'analytics', + TABLE_NAME: 'orders', + COLUMN_NAME: 'id', + DATA_TYPE: 'int', + IS_NULLABLE: 'NO', + COLUMN_COMMENT: '', + }, + ], + [], + ); + } + return mysqlResult([], []); + }), + release: vi.fn(), + })), + end: vi.fn(async () => undefined), + })), + }; + const connector = new KtxMysqlScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'mysql', + host: 'db.example.test', + database: 'analytics', + username: 'reader', + password: 'secret', // pragma: allowlist secret + }, + poolFactory, + }); + const scope = tableRefSet([{ catalog: null, db: 'analytics', name: 'orders' }]); + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'mysql', tableScope: scope }, + { runId: 'scope-test' }, + ); + expect(snapshot.tables.map((table) => table.name)).toEqual(['orders']); + const tablesQuery = queries.find((query) => query.sql.includes('INFORMATION_SCHEMA.TABLES')); + expect(tablesQuery?.sql).toMatch(/TABLE_NAME IN \(\?\)/); + expect(tablesQuery?.params).toEqual(['analytics', 'orders']); + }); + it('runs samples, distinct values, read-only SQL, row count, schema list, and cleanup', async () => { const poolFactory = fakePoolFactory(); const connector = new KtxMysqlScanConnector({ diff --git a/packages/cli/src/connectors/mysql/connector.ts b/packages/cli/src/connectors/mysql/connector.ts index 9d92c2e0..82a2384c 100644 --- a/packages/cli/src/connectors/mysql/connector.ts +++ b/packages/cli/src/connectors/mysql/connector.ts @@ -4,6 +4,7 @@ import { homedir } from 'node:os'; import { resolve } from 'node:path'; import { assertReadOnlySql, limitSqlForExecution } from '../../context/connections/read-only-sql.js'; import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaColumn, type KtxTableListEntry, type KtxSchemaForeignKey, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableRef, type KtxTableSampleInput, type KtxTableSampleResult } from '../../context/scan/types.js'; +import { scopedTableNames } from '../../context/scan/table-ref.js'; import { KtxMysqlDialect } from './dialect.js'; export interface KtxMysqlConnectionConfig { @@ -335,23 +336,37 @@ export class KtxMysqlScanConnector implements KtxScanConnector { this.assertConnection(input.connectionId); const databases = configuredMysqlSchemas(this.connection, this.poolConfig.database); const placeholders = databases.map(() => '?').join(', '); + let allScopedTables: string[] | null = null; + if (input.tableScope) { + allScopedTables = []; + for (const database of databases) { + allScopedTables.push(...scopedTableNames(input.tableScope, { catalog: null, db: database })); + } + if (allScopedTables.length === 0) { + return this.emptySnapshot(databases); + } + } + const tableNameClause = allScopedTables + ? `AND TABLE_NAME IN (${allScopedTables.map(() => '?').join(', ')})` + : ''; + const tableNameParams = allScopedTables ?? []; const tables = await this.queryRaw( ` SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE, TABLE_COMMENT, TABLE_ROWS FROM INFORMATION_SCHEMA.TABLES - WHERE TABLE_SCHEMA IN (${placeholders}) AND TABLE_TYPE IN ('BASE TABLE', 'VIEW') + WHERE TABLE_SCHEMA IN (${placeholders}) AND TABLE_TYPE IN ('BASE TABLE', 'VIEW') ${tableNameClause} ORDER BY TABLE_SCHEMA, TABLE_NAME `, - databases, + [...databases, ...tableNameParams], ); const columns = await this.queryRaw( ` SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, IS_NULLABLE, COLUMN_COMMENT FROM INFORMATION_SCHEMA.COLUMNS - WHERE TABLE_SCHEMA IN (${placeholders}) + WHERE TABLE_SCHEMA IN (${placeholders}) ${tableNameClause} ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION `, - databases, + [...databases, ...tableNameParams], ); const primaryKeys = await this.queryRaw( ` @@ -359,9 +374,10 @@ export class KtxMysqlScanConnector implements KtxScanConnector { FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE WHERE TABLE_SCHEMA IN (${placeholders}) AND CONSTRAINT_NAME = 'PRIMARY' + ${tableNameClause} ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION `, - databases, + [...databases, ...tableNameParams], ); const foreignKeys = await this.queryRaw( ` @@ -369,9 +385,10 @@ export class KtxMysqlScanConnector implements KtxScanConnector { FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE WHERE TABLE_SCHEMA IN (${placeholders}) AND REFERENCED_TABLE_NAME IS NOT NULL + ${tableNameClause} ORDER BY TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME `, - databases, + [...databases, ...tableNameParams], ); const columnsByTable = groupByTable(columns, this.poolConfig.database); @@ -403,6 +420,23 @@ export class KtxMysqlScanConnector implements KtxScanConnector { }; } + private emptySnapshot(databases: string[]): KtxSchemaSnapshot { + return { + connectionId: this.connectionId, + driver: 'mysql', + extractedAt: this.now().toISOString(), + scope: { schemas: databases }, + metadata: { + database: this.poolConfig.database, + schemas: databases, + host: this.poolConfig.host, + table_count: 0, + total_columns: 0, + }, + tables: [], + }; + } + async sampleTable(input: KtxTableSampleInput, _ctx: KtxScanContext): Promise { this.assertConnection(input.connectionId); const result = await this.query(this.dialect.generateSampleQuery(this.qTableName(input.table), input.limit, input.columns)); diff --git a/packages/cli/src/connectors/mysql/live-database-introspection.ts b/packages/cli/src/connectors/mysql/live-database-introspection.ts index ea649761..897244d5 100644 --- a/packages/cli/src/connectors/mysql/live-database-introspection.ts +++ b/packages/cli/src/connectors/mysql/live-database-introspection.ts @@ -1,4 +1,7 @@ -import type { LiveDatabaseIntrospectionPort } from '../../context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from '../../context/ingest/adapters/live-database/types.js'; import type { KtxProjectConnectionConfig } from '../../context/project/config.js'; import { KtxMysqlScanConnector, @@ -18,7 +21,7 @@ export function createMysqlLiveDatabaseIntrospection( options: CreateMysqlLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions) { const connection = options.connections[connectionId] as KtxMysqlConnectionConfig | undefined; const connector = new KtxMysqlScanConnector({ connectionId, @@ -28,7 +31,14 @@ export function createMysqlLiveDatabaseIntrospection( now: options.now, }); try { - return await connector.introspect({ connectionId, driver: 'mysql' }, { runId: `mysql-${connectionId}` }); + return await connector.introspect( + { + connectionId, + driver: 'mysql', + ...(introspectionOptions?.tableScope ? { tableScope: introspectionOptions.tableScope } : {}), + }, + { runId: `mysql-${connectionId}` }, + ); } finally { await connector.cleanup(); } diff --git a/packages/cli/src/connectors/postgres/connector.test.ts b/packages/cli/src/connectors/postgres/connector.test.ts index cf595f5c..346c2ef2 100644 --- a/packages/cli/src/connectors/postgres/connector.test.ts +++ b/packages/cli/src/connectors/postgres/connector.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it, vi } from 'vitest'; import { createPostgresLiveDatabaseIntrospection } from '../../connectors/postgres/live-database-introspection.js'; import { isKtxPostgresConnectionConfig, KtxPostgresScanConnector, postgresPoolConfigFromConfig, type KtxPostgresPoolFactory } from '../../connectors/postgres/connector.js'; +import { tableRefSet } from '../../context/scan/table-ref.js'; interface FakeQueryResult { rows: Record[]; @@ -259,6 +260,63 @@ describe('KtxPostgresScanConnector', () => { ).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally'); }); + it('limits introspection to tables in tableScope', async () => { + const queries: Array<{ sql: string; params?: unknown[] }> = []; + const poolFactory: KtxPostgresPoolFactory = { + createPool() { + return { + async connect() { + return { + query: vi.fn(async (sql: string, params?: unknown[]) => { + queries.push({ sql, params }); + if (sql.includes('FROM pg_catalog.pg_class c')) { + return { rows: [{ table_name: 'orders', table_kind: 'r', row_count: '3', table_comment: null }] }; + } + if (sql.includes('FROM pg_catalog.pg_attribute a')) { + return { + rows: [ + { + table_name: 'orders', + column_name: 'id', + data_type: 'integer', + is_nullable: false, + column_comment: null, + }, + ], + }; + } + return { rows: [] }; + }), + release: vi.fn(), + }; + }, + end: vi.fn(async () => undefined), + }; + }, + }; + const connector = new KtxPostgresScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'postgres', + host: 'db.example.test', + database: 'analytics', + username: 'reader', + password: 'test-password', // pragma: allowlist secret + schema: 'public', + }, + poolFactory, + }); + const scope = tableRefSet([{ catalog: null, db: 'public', name: 'orders' }]); + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'postgres', tableScope: scope }, + { runId: 'scope-test' }, + ); + expect(snapshot.tables.map((table) => table.name)).toEqual(['orders']); + const tablesQuery = queries.find((query) => query.sql.includes('FROM pg_catalog.pg_class c')); + expect(tablesQuery?.sql).toMatch(/c\.relname = ANY\(\$2\)/); + expect(tablesQuery?.params).toEqual(['public', ['orders']]); + }); + it('adapts native PostgreSQL snapshots to live-database introspection for local ingest', async () => { const introspection = createPostgresLiveDatabaseIntrospection({ connections: { diff --git a/packages/cli/src/connectors/postgres/connector.ts b/packages/cli/src/connectors/postgres/connector.ts index 36a2bda6..5cb94bf4 100644 --- a/packages/cli/src/connectors/postgres/connector.ts +++ b/packages/cli/src/connectors/postgres/connector.ts @@ -3,6 +3,7 @@ import { homedir } from 'node:os'; import { resolve } from 'node:path'; import { assertReadOnlySql, limitSqlForExecution } from '../../context/connections/read-only-sql.js'; import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaColumn, type KtxSchemaForeignKey, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableListEntry, type KtxTableRef, type KtxTableSampleInput, type KtxTableSampleResult } from '../../context/scan/types.js'; +import { scopedTableNames } from '../../context/scan/table-ref.js'; import { Pool } from 'pg'; import { KtxPostgresDialect } from './dialect.js'; @@ -379,7 +380,9 @@ export class KtxPostgresScanConnector implements KtxScanConnector { const schemas = schemasFromConnection(this.connection); const allTables: KtxSchemaTable[] = []; for (const schema of schemas) { - const tables = await this.loadSchemaTables(schema); + const scopedNames = input.tableScope ? scopedTableNames(input.tableScope, { catalog: null, db: schema }) : null; + if (scopedNames && scopedNames.length === 0) continue; + const tables = await this.loadSchemaTables(schema, scopedNames); allTables.push(...tables); } return { @@ -543,7 +546,11 @@ export class KtxPostgresScanConnector implements KtxScanConnector { } } - private async loadSchemaTables(schema: string): Promise { + private async loadSchemaTables(schema: string, scopedNames: readonly string[] | null): Promise { + if (scopedNames && scopedNames.length === 0) return []; + const pgCatalogScopeClause = scopedNames ? 'AND c.relname = ANY($2)' : ''; + const tableConstraintScopeClause = scopedNames ? 'AND tc.table_name = ANY($2)' : ''; + const scopeValues = scopedNames ? [scopedNames] : []; const tables = await this.queryRaw( ` SELECT @@ -557,9 +564,10 @@ export class KtxPostgresScanConnector implements KtxScanConnector { ON d.objoid = c.oid AND d.objsubid = 0 WHERE n.nspname = $1 AND c.relkind IN ('r', 'v') + ${pgCatalogScopeClause} ORDER BY c.relname `, - [schema], + [schema, ...scopeValues], ); const columns = await this.queryRaw( ` @@ -578,9 +586,10 @@ export class KtxPostgresScanConnector implements KtxScanConnector { AND c.relkind IN ('r', 'v') AND a.attnum > 0 AND NOT a.attisdropped + ${pgCatalogScopeClause} ORDER BY c.relname, a.attnum `, - [schema], + [schema, ...scopeValues], ); const primaryKeys = await this.queryRaw( ` @@ -591,9 +600,10 @@ export class KtxPostgresScanConnector implements KtxScanConnector { AND tc.table_schema = kcu.table_schema WHERE tc.constraint_type = 'PRIMARY KEY' AND tc.table_schema = $1 + ${tableConstraintScopeClause} ORDER BY tc.table_name, kcu.ordinal_position `, - [schema], + [schema, ...scopeValues], ); const foreignKeys = await this.queryRaw( ` @@ -613,9 +623,10 @@ export class KtxPostgresScanConnector implements KtxScanConnector { AND ccu.table_schema = tc.table_schema WHERE tc.constraint_type = 'FOREIGN KEY' AND tc.table_schema = $1 + ${tableConstraintScopeClause} ORDER BY tc.table_name, kcu.column_name `, - [schema], + [schema, ...scopeValues], ); const columnsByTable = groupByTable(columns); diff --git a/packages/cli/src/connectors/postgres/live-database-introspection.ts b/packages/cli/src/connectors/postgres/live-database-introspection.ts index 83e29489..8b4454bc 100644 --- a/packages/cli/src/connectors/postgres/live-database-introspection.ts +++ b/packages/cli/src/connectors/postgres/live-database-introspection.ts @@ -1,4 +1,7 @@ -import type { LiveDatabaseIntrospectionPort } from '../../context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from '../../context/ingest/adapters/live-database/types.js'; import type { KtxProjectConnectionConfig } from '../../context/project/config.js'; import { KtxPostgresScanConnector, @@ -18,7 +21,7 @@ export function createPostgresLiveDatabaseIntrospection( options: CreatePostgresLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions) { const connection = options.connections[connectionId] as KtxPostgresConnectionConfig | undefined; const connector = new KtxPostgresScanConnector({ connectionId, @@ -28,7 +31,14 @@ export function createPostgresLiveDatabaseIntrospection( now: options.now, }); try { - return await connector.introspect({ connectionId, driver: 'postgres' }, { runId: `postgres-${connectionId}` }); + return await connector.introspect( + { + connectionId, + driver: 'postgres', + ...(introspectionOptions?.tableScope ? { tableScope: introspectionOptions.tableScope } : {}), + }, + { runId: `postgres-${connectionId}` }, + ); } finally { await connector.cleanup(); } diff --git a/packages/cli/src/connectors/snowflake/connector.test.ts b/packages/cli/src/connectors/snowflake/connector.test.ts index 7e17117e..a321e289 100644 --- a/packages/cli/src/connectors/snowflake/connector.test.ts +++ b/packages/cli/src/connectors/snowflake/connector.test.ts @@ -1,6 +1,15 @@ import { describe, expect, it, vi } from 'vitest'; + +const createPool = vi.hoisted(() => vi.fn()); + +vi.mock('snowflake-sdk', () => ({ + default: { createPool }, + createPool, +})); + import { createSnowflakeLiveDatabaseIntrospection } from '../../connectors/snowflake/live-database-introspection.js'; import { isKtxSnowflakeConnectionConfig, KtxSnowflakeScanConnector, snowflakeConnectionConfigFromConfig, type KtxSnowflakeDriver, type KtxSnowflakeDriverFactory } from '../../connectors/snowflake/connector.js'; +import { tableRefSet } from '../../context/scan/table-ref.js'; function fakeDriverFactory(): KtxSnowflakeDriverFactory { const driver: KtxSnowflakeDriver = { @@ -63,6 +72,38 @@ function fakeDriverFactory(): KtxSnowflakeDriverFactory { return { createDriver: vi.fn(() => driver) }; } +function fakeSnowflakeStatement(headers: string[] = ['ONE']) { + return { + getColumns: () => headers.map((header) => ({ getName: () => header, getType: () => 'TEXT' })), + }; +} + +function installSnowflakePoolMock() { + const executedSql: string[] = []; + const connection = { + execute: vi.fn( + (input: { + sqlText: string; + complete: ( + error: Error | null, + statement: ReturnType, + rows: Array>, + ) => void; + }) => { + executedSql.push(input.sqlText); + input.complete(null, fakeSnowflakeStatement(), [{ ONE: 1 }]); + }, + ), + }; + const pool = { + use: vi.fn(async (fn: (conn: typeof connection) => Promise) => fn(connection)), + drain: vi.fn(async () => undefined), + clear: vi.fn(async () => undefined), + }; + createPool.mockReturnValue(pool); + return { connection, pool, executedSql }; +} + describe('KtxSnowflakeScanConnector', () => { it('resolves Snowflake connection configuration safely', () => { expect( @@ -99,6 +140,99 @@ describe('KtxSnowflakeScanConnector', () => { }); }); + it('defaults and validates Snowflake maxSessions', () => { + const baseConnection = { + driver: 'snowflake', + authMethod: 'password', + account: 'acct', + warehouse: 'WH', + database: 'ANALYTICS', + schema_name: 'PUBLIC', + username: 'reader', + password: 'fixture-pass', // pragma: allowlist secret + } as const; + + expect( + snowflakeConnectionConfigFromConfig({ + connectionId: 'warehouse', + connection: baseConnection, + }), + ).toMatchObject({ maxSessions: 4 }); + + expect( + snowflakeConnectionConfigFromConfig({ + connectionId: 'warehouse', + connection: { ...baseConnection, maxSessions: 8 }, + }), + ).toMatchObject({ maxSessions: 8 }); + + for (const maxSessions of [0, -1, 1.5, Number.NaN]) { + expect(() => + snowflakeConnectionConfigFromConfig({ + connectionId: 'warehouse', + connection: { ...baseConnection, maxSessions }, + }), + ).toThrow('connections.warehouse.maxSessions must be a positive integer'); + } + }); + + it('uses one lazy Snowflake pool and drains it during cleanup', async () => { + const { pool, executedSql } = installSnowflakePoolMock(); + const close = vi.fn(async () => undefined); + const connector = new KtxSnowflakeScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'snowflake', + authMethod: 'password', + account: 'acct', + warehouse: 'WH', + database: 'ANALYTICS', + schema_name: 'PUBLIC', + username: 'reader', + password: 'fixture-pass', // pragma: allowlist secret + role: 'ANALYST', + maxSessions: 3, + }, + sdkOptionsProvider: { + resolve: vi.fn(async () => ({ sdkOptions: { application: 'ktx-test' }, close })), + }, + }); + + expect(createPool).not.toHaveBeenCalled(); + + await connector.executeReadOnly({ connectionId: 'warehouse', sql: 'select 1', maxRows: 1 }, { runId: 'run-1' }); + await connector.executeReadOnly({ connectionId: 'warehouse', sql: 'select 1', maxRows: 1 }, { runId: 'run-1' }); + + expect(createPool).toHaveBeenCalledTimes(1); + expect(createPool).toHaveBeenCalledWith( + expect.objectContaining({ + account: 'acct', + username: 'reader', + warehouse: 'WH', + database: 'ANALYTICS', + schema: 'PUBLIC', + role: 'ANALYST', + password: 'fixture-pass', // pragma: allowlist secret + clientSessionKeepAlive: true, + clientSessionKeepAliveHeartbeatFrequency: 900, + application: 'ktx-test', + }), + expect.objectContaining({ + min: 0, + max: 3, + evictionRunIntervalMillis: 30_000, + acquireTimeoutMillis: 60_000, + }), + ); + expect(pool.use).toHaveBeenCalledTimes(2); + expect(executedSql.some((sql) => /^USE\s+/i.test(sql.trim()))).toBe(false); + + await connector.cleanup(); + expect(pool.drain).toHaveBeenCalledBefore(pool.clear); + expect(pool.clear).toHaveBeenCalledTimes(1); + expect(close).toHaveBeenCalledTimes(1); + }); + it('introspects schema, primary keys, comments, row counts, and dimensions', async () => { const connector = new KtxSnowflakeScanConnector({ connectionId: 'warehouse', @@ -157,6 +291,108 @@ describe('KtxSnowflakeScanConnector', () => { ]); }); + it('continues introspection when primary-key discovery is not authorized', async () => { + const driverFactory = fakeDriverFactory(); + const driver = (driverFactory.createDriver as ReturnType).getMockImplementation() as + | (() => KtxSnowflakeDriver) + | undefined; + if (!driver) throw new Error('driver mock missing'); + const built = driver(); + (built.query as ReturnType).mockImplementation(async (sql: string) => { + if (sql.includes('TABLE_CONSTRAINTS')) { + throw new Error( + "SQL compilation error: Object 'ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE' does not exist or not authorized.", + ); + } + throw new Error(`Unexpected SQL: ${sql}`); + }); + (driverFactory.createDriver as ReturnType).mockReturnValue(built); + + const warn = vi.spyOn(console, 'warn').mockImplementation(() => undefined); + try { + const connector = new KtxSnowflakeScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'snowflake', + authMethod: 'password', + account: 'acct', + warehouse: 'WH', + database: 'ANALYTICS', + schema_name: 'PUBLIC', + username: 'reader', + password: 'fixture-pass', // pragma: allowlist secret + }, + driverFactory, + }); + + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'snowflake' }, + { runId: 'scan-run-pk-skip' }, + ); + + expect(snapshot.tables.map((table) => table.name).sort()).toEqual(['ORDERS', 'ORDER_SUMMARY']); + expect(snapshot.tables.every((table) => table.columns.every((column) => column.primaryKey === false))).toBe(true); + expect(warn).not.toHaveBeenCalled(); + } finally { + warn.mockRestore(); + } + }); + + it('limits introspection to tables in tableScope', async () => { + const queries: Array<{ sql: string; params?: unknown }> = []; + const getSchemaMetadata = vi.fn(async (_schemaName?: string, scopedNames?: readonly string[] | null) => + scopedNames?.includes('ORDERS') + ? [ + { + name: 'ORDERS', + catalog: 'ANALYTICS', + db: 'MARTS', + rowCount: 10, + comment: null, + columns: [{ name: 'ID', type: 'NUMBER', nullable: false, comment: null }], + }, + ] + : [], + ); + const driverFactory: KtxSnowflakeDriverFactory = { + createDriver: vi.fn(() => ({ + test: vi.fn(async () => ({ success: true })), + query: vi.fn(async (sql: string, params?: unknown) => { + queries.push({ sql, params }); + return { headers: [], rows: [], totalRows: 0, rowCount: 0 }; + }), + getSchemaMetadata, + listSchemas: vi.fn(async () => []), + listTables: vi.fn(async () => []), + cleanup: vi.fn(async () => undefined), + })), + }; + const connector = new KtxSnowflakeScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'snowflake', + authMethod: 'password', + account: 'acct', + warehouse: 'WH', + database: 'ANALYTICS', + schema_name: 'MARTS', + username: 'reader', + password: 'fixture-pass', // pragma: allowlist secret + }, + driverFactory, + }); + const scope = tableRefSet([{ catalog: 'ANALYTICS', db: 'MARTS', name: 'ORDERS' }]); + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'snowflake', tableScope: scope }, + { runId: 'scope-test' }, + ); + expect(snapshot.tables.map((table) => table.name)).toEqual(['ORDERS']); + expect(getSchemaMetadata).toHaveBeenCalledWith('MARTS', ['ORDERS']); + const primaryKeysQuery = queries.find((query) => query.sql.includes('TABLE_CONSTRAINTS')); + expect(primaryKeysQuery?.sql).toMatch(/AND tc\.TABLE_NAME IN \(\?\)/); + expect(primaryKeysQuery?.params).toEqual(['MARTS', 'ANALYTICS', 'ORDERS']); + }); + it('supports read-only query, sampling, distinct values, row counts, schema listing, and cleanup', async () => { const driverFactory = fakeDriverFactory(); const connector = new KtxSnowflakeScanConnector({ diff --git a/packages/cli/src/connectors/snowflake/connector.ts b/packages/cli/src/connectors/snowflake/connector.ts index 41263d4b..0281b298 100644 --- a/packages/cli/src/connectors/snowflake/connector.ts +++ b/packages/cli/src/connectors/snowflake/connector.ts @@ -4,9 +4,12 @@ import { homedir } from 'node:os'; import { resolve } from 'node:path'; import { assertReadOnlySql, limitSqlForExecution } from '../../context/connections/read-only-sql.js'; import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaColumn, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableRef, type KtxTableSampleInput, type KtxTableListEntry, type KtxTableSampleResult } from '../../context/scan/types.js'; -import * as snowflake from 'snowflake-sdk'; +import { scopedTableNames } from '../../context/scan/table-ref.js'; +import snowflake from 'snowflake-sdk'; +import type { Bind, Binds, Connection, ConnectionOptions } from 'snowflake-sdk'; import { KtxSnowflakeDialect } from './dialect.js'; import { assertSafeSnowflakeIdentifier, quoteSnowflakeIdentifier } from './identifiers.js'; +import { configureSnowflakeSdkLogger } from './sdk-logger.js'; export interface KtxSnowflakeConnectionConfig { driver?: string; @@ -21,6 +24,7 @@ export interface KtxSnowflakeConnectionConfig { privateKey?: string; passphrase?: string; role?: string; + maxSessions?: number; [key: string]: unknown; } @@ -35,6 +39,7 @@ export interface KtxSnowflakeResolvedConnectionConfig { privateKey?: string; passphrase?: string; role?: string; + maxSessions: number; } export interface KtxSnowflakeRawColumnMetadata { @@ -56,7 +61,7 @@ export interface KtxSnowflakeRawTableMetadata { export interface KtxSnowflakeDriver { test(): Promise<{ success: boolean; error?: string }>; query(sql: string, params?: unknown): Promise; - getSchemaMetadata(schemaName?: string): Promise; + getSchemaMetadata(schemaName?: string, scopedTableNames?: readonly string[] | null): Promise; listSchemas(): Promise; listTables(schemas?: string[]): Promise; cleanup(): Promise; @@ -79,6 +84,12 @@ export interface KtxSnowflakeSdkOptionsProvider { export interface KtxSnowflakeScanConnectorOptions { connectionId: string; connection: KtxSnowflakeConnectionConfig | undefined; + /** + * KTX project directory. When provided, snowflake-sdk's logger is redirected to + * `/.ktx/logs/snowflake.log` so its JSON output does not bleed into + * the CLI's TTY. Tests that use a fake driverFactory can leave this undefined. + */ + projectDir?: string; driverFactory?: KtxSnowflakeDriverFactory; sdkOptionsProvider?: KtxSnowflakeSdkOptionsProvider; env?: NodeJS.ProcessEnv; @@ -123,13 +134,31 @@ function stringConfigValue( return typeof value === 'string' && value.trim().length > 0 ? resolveStringReference(value.trim(), env) : undefined; } +function positiveIntegerConfigValue(input: { + connection: KtxSnowflakeConnectionConfig; + key: keyof KtxSnowflakeConnectionConfig; + connectionId: string; + defaultValue: number; +}): number { + const value = input.connection[input.key]; + if (value === undefined) { + return input.defaultValue; + } + const numberValue = Number(value); + if (!Number.isInteger(numberValue) || numberValue < 1) { + throw new Error(`connections.${input.connectionId}.${String(input.key)} must be a positive integer`); + } + return numberValue; +} + function schemaNames(connection: KtxSnowflakeConnectionConfig, env: NodeJS.ProcessEnv): string[] { if (Array.isArray(connection.schema_names) && connection.schema_names.length > 0) { return connection.schema_names .filter((schema) => schema.trim().length > 0) .map((schema) => resolveStringReference(schema, env)); } - return [stringConfigValue(connection, 'schema_name', env) ?? 'PUBLIC']; + const single = stringConfigValue(connection, 'schema_name', env); + return single ? [single] : []; } function firstNumber(value: unknown): number | null { @@ -159,7 +188,7 @@ function normalizeSnowflakeValue(value: unknown, columnType?: string): unknown { return value; } -function toSnowflakeBind(value: unknown): snowflake.Bind { +function toSnowflakeBind(value: unknown): Bind { if (value === null || typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') { return value; } @@ -169,7 +198,7 @@ function toSnowflakeBind(value: unknown): snowflake.Bind { return String(value); } -function toSnowflakeBinds(params: unknown[] | undefined): snowflake.Binds | undefined { +function toSnowflakeBinds(params: unknown[] | undefined): Binds | undefined { return params?.map((value) => toSnowflakeBind(value)); } @@ -220,6 +249,12 @@ export function snowflakeConnectionConfigFromConfig(input: { database, schemas: resolvedSchemas, username, + maxSessions: positiveIntegerConfigValue({ + connection: input.connection, + key: 'maxSessions', + connectionId: input.connectionId, + defaultValue: 4, + }), }; const role = stringConfigValue(input.connection, 'role', env); if (role) { @@ -255,6 +290,7 @@ class DefaultSnowflakeDriverFactory implements KtxSnowflakeDriverFactory { class SnowflakeSdkDriver implements KtxSnowflakeDriver { private closeSdkOptions: Array<() => Promise> = []; + private pool: ReturnType | null = null; constructor( private readonly resolved: KtxSnowflakeResolvedConnectionConfig, @@ -275,37 +311,50 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver { } async query(sql: string, params?: unknown): Promise { - let connection: snowflake.Connection | null = null; + const binds = Array.isArray(params) ? toSnowflakeBinds(params) : undefined; try { - connection = await this.createConnection(); - const binds = Array.isArray(params) ? toSnowflakeBinds(params) : undefined; - const result = await this.executeSnowflakeQuery(connection, sql, binds); + const pool = await this.getPool(); + const result = await pool.use(async (connection: snowflake.Connection) => + this.executeSnowflakeQuery(connection, sql, binds), + ); return { ...result, totalRows: result.rows.length, rowCount: result.rows.length }; - } finally { - if (connection) { - await this.destroyConnection(connection); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + if (/timeout/i.test(message) && /pool|acquire/i.test(message)) { + throw new Error( + "Snowflake session pool exhausted after 60s - consider lowering maxSessions or increasing your account's concurrent-statement limit.", + ); } + throw error; } } - async getSchemaMetadata(schemaName = this.resolved.schemas[0] ?? 'PUBLIC'): Promise { + async getSchemaMetadata( + schemaName = this.resolved.schemas[0] ?? 'PUBLIC', + scopedTableNames: readonly string[] | null = null, + ): Promise { + const scopeClause = + scopedTableNames && scopedTableNames.length > 0 + ? `AND TABLE_NAME IN (${scopedTableNames.map(() => '?').join(', ')})` + : ''; + const scopeParams = scopedTableNames ?? []; const tablesResult = await this.query( ` SELECT TABLE_NAME, TABLE_TYPE, COMMENT, ROW_COUNT FROM INFORMATION_SCHEMA.TABLES - WHERE TABLE_SCHEMA = ? AND TABLE_CATALOG = ? + WHERE TABLE_SCHEMA = ? AND TABLE_CATALOG = ? ${scopeClause} ORDER BY TABLE_NAME `, - [schemaName, this.resolved.database], + [schemaName, this.resolved.database, ...scopeParams], ); const columnsResult = await this.query( ` SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, IS_NULLABLE, COMMENT, ORDINAL_POSITION FROM INFORMATION_SCHEMA.COLUMNS - WHERE TABLE_SCHEMA = ? AND TABLE_CATALOG = ? + WHERE TABLE_SCHEMA = ? AND TABLE_CATALOG = ? ${scopeClause} ORDER BY TABLE_NAME, ORDINAL_POSITION `, - [schemaName, this.resolved.database], + [schemaName, this.resolved.database, ...scopeParams], ); const columnsByTable = new Map(); for (const row of columnsResult.rows) { @@ -357,27 +406,41 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver { } async cleanup(): Promise { + const pool = this.pool; + this.pool = null; + if (pool) { + // Drain before clear so in-flight Snowflake statements finish before idle + // sessions are closed. + await pool.drain(); + await pool.clear(); + } const closers = this.closeSdkOptions; this.closeSdkOptions = []; - await Promise.all(closers.map((close) => close())); + await Promise.all(closers.map((close) => Promise.resolve(close()))); } private async runTest(): Promise<{ success: boolean; error?: string }> { - let connection: snowflake.Connection | null = null; try { - connection = await this.createConnection(); - await this.executeSnowflakeQuery(connection, 'SELECT 1'); + await this.query('SELECT 1'); return { success: true }; } catch (error) { return { success: false, error: error instanceof Error ? error.message : String(error) }; - } finally { - if (connection) { - await this.destroyConnection(connection); - } } } - private async createConnection(): Promise { + private async getPool(): Promise> { + if (!this.pool) { + this.pool = snowflake.createPool(await this.resolveConnectionOptions(), { + min: 0, + max: this.resolved.maxSessions, + evictionRunIntervalMillis: 30_000, + acquireTimeoutMillis: 60_000, + }); + } + return this.pool; + } + + private async resolveConnectionOptions(): Promise { const patch = await this.sdkOptionsProvider?.resolve({ account: this.resolved.account, connection: { ...this.resolved, driver: 'snowflake' }, @@ -385,60 +448,27 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver { if (patch?.close) { this.closeSdkOptions.push(patch.close); } - const baseConfig: snowflake.ConnectionOptions = { + const sessionSchema = this.resolved.schemas[0]; + const baseConfig: ConnectionOptions = { account: this.resolved.account, username: this.resolved.username, warehouse: this.resolved.warehouse, database: this.resolved.database, - schema: this.resolved.schemas[0] ?? 'PUBLIC', + ...(sessionSchema ? { schema: sessionSchema } : {}), role: this.resolved.role, + clientSessionKeepAlive: true, + clientSessionKeepAliveHeartbeatFrequency: 900, ...patch?.sdkOptions, }; - const connectionConfig: snowflake.ConnectionOptions = - this.resolved.authMethod === 'rsa' - ? { ...baseConfig, authenticator: 'SNOWFLAKE_JWT', privateKey: this.decryptPrivateKey() } - : { ...baseConfig, password: this.resolved.password }; - const connection = snowflake.createConnection(connectionConfig); - return new Promise((resolveConnection, rejectConnection) => { - connection.connect((error, connected) => { - if (error) { - rejectConnection(error); - return; - } - const resolvedConnection = connected ?? connection; - this.setConnectionContext(resolvedConnection).then( - () => resolveConnection(resolvedConnection), - (contextError) => { - resolvedConnection.destroy(() => undefined); - rejectConnection(contextError); - }, - ); - }); - }); - } - - private async setConnectionContext(connection: snowflake.Connection): Promise { - if (this.resolved.role) { - await this.executeSnowflakeQuery(connection, `USE ROLE ${quoteSnowflakeIdentifier(this.resolved.role, 'role')}`); - } - await this.executeSnowflakeQuery( - connection, - `USE WAREHOUSE ${quoteSnowflakeIdentifier(this.resolved.warehouse, 'warehouse')}`, - ); - await this.executeSnowflakeQuery( - connection, - `USE DATABASE ${quoteSnowflakeIdentifier(this.resolved.database, 'database')}`, - ); - await this.executeSnowflakeQuery( - connection, - `USE SCHEMA ${quoteSnowflakeIdentifier(this.resolved.schemas[0] ?? 'PUBLIC', 'schema')}`, - ); + return this.resolved.authMethod === 'rsa' + ? { ...baseConfig, authenticator: 'SNOWFLAKE_JWT', privateKey: this.decryptPrivateKey() } + : { ...baseConfig, password: this.resolved.password }; } private async executeSnowflakeQuery( - connection: snowflake.Connection, + connection: Connection, sqlText: string, - binds?: snowflake.Binds, + binds?: Binds, ): Promise<{ headers: string[]; headerTypes?: string[]; rows: unknown[][] }> { return new Promise((resolveQuery, rejectQuery) => { connection.execute({ @@ -461,18 +491,6 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver { }); } - private destroyConnection(connection: snowflake.Connection): Promise { - return new Promise((resolveDestroy, rejectDestroy) => { - connection.destroy((error) => { - if (error) { - rejectDestroy(error); - return; - } - resolveDestroy(); - }); - }); - } - private decryptPrivateKey(): string { if (!this.resolved.privateKey) { throw new Error('Private key is required for RSA authentication'); @@ -510,6 +528,9 @@ export class KtxSnowflakeScanConnector implements KtxScanConnector { this.driverFactory = options.driverFactory ?? new DefaultSnowflakeDriverFactory(); this.now = options.now ?? (() => new Date()); this.id = `snowflake:${options.connectionId}`; + if (options.projectDir) { + configureSnowflakeSdkLogger(options.projectDir); + } } async testConnection(): Promise<{ success: boolean; error?: string }> { @@ -520,7 +541,11 @@ export class KtxSnowflakeScanConnector implements KtxScanConnector { this.assertConnection(input.connectionId); const tables: KtxSchemaTable[] = []; for (const schemaName of this.resolved.schemas) { - const rawTables = await this.getDriver().getSchemaMetadata(schemaName); + const scopedNames = input.tableScope + ? scopedTableNames(input.tableScope, { catalog: this.resolved.database, db: schemaName }) + : null; + if (scopedNames && scopedNames.length === 0) continue; + const rawTables = await this.getDriver().getSchemaMetadata(schemaName, scopedNames); const primaryKeys = await this.primaryKeys(rawTables.map((table) => table.name), schemaName); tables.push(...rawTables.map((table) => this.toSchemaTable(table, primaryKeys))); } @@ -653,32 +678,39 @@ export class KtxSnowflakeScanConnector implements KtxScanConnector { } private async primaryKeys(tableNames: string[], schemaName: string): Promise>> { - if (tableNames.length === 0) { - return new Map(); - } - const result = await this.getDriver().query( - ` - SELECT tc.TABLE_NAME, kcu.COLUMN_NAME - FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc - JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE kcu - ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME - AND tc.TABLE_SCHEMA = kcu.TABLE_SCHEMA - AND tc.TABLE_CATALOG = kcu.TABLE_CATALOG - WHERE tc.CONSTRAINT_TYPE = 'PRIMARY KEY' - AND tc.TABLE_SCHEMA = ? - AND tc.TABLE_CATALOG = ? - ORDER BY tc.TABLE_NAME, kcu.ORDINAL_POSITION - `, - [schemaName, this.resolved.database], - ); const grouped = new Map>(); for (const tableName of tableNames) { grouped.set(tableName, new Set()); } - for (const row of result.rows) { - const tableName = String(row[0]); - const columnName = String(row[1]); - grouped.get(tableName)?.add(columnName); + if (tableNames.length === 0) { + return grouped; + } + const tableNamePlaceholders = tableNames.map(() => '?').join(', '); + try { + const result = await this.getDriver().query( + ` + SELECT tc.TABLE_NAME, kcu.COLUMN_NAME + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE kcu + ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME + AND tc.TABLE_SCHEMA = kcu.TABLE_SCHEMA + AND tc.TABLE_CATALOG = kcu.TABLE_CATALOG + WHERE tc.CONSTRAINT_TYPE = 'PRIMARY KEY' + AND tc.TABLE_SCHEMA = ? + AND tc.TABLE_CATALOG = ? + AND tc.TABLE_NAME IN (${tableNamePlaceholders}) + ORDER BY tc.TABLE_NAME, kcu.ORDINAL_POSITION + `, + [schemaName, this.resolved.database, ...tableNames], + ); + for (const row of result.rows) { + const tableName = String(row[0]); + const columnName = String(row[1]); + grouped.get(tableName)?.add(columnName); + } + } catch { + // INFORMATION_SCHEMA.KEY_COLUMN_USAGE often isn't granted to read-only roles; + // continue with empty PK map and let FK inference + profiling carry the slack. } return grouped; } diff --git a/packages/cli/src/connectors/snowflake/historic-sql-query-client.ts b/packages/cli/src/connectors/snowflake/historic-sql-query-client.ts new file mode 100644 index 00000000..7d4070f5 --- /dev/null +++ b/packages/cli/src/connectors/snowflake/historic-sql-query-client.ts @@ -0,0 +1,31 @@ +import { KtxSnowflakeScanConnector, type KtxSnowflakeScanConnectorOptions } from './connector.js'; + +export type KtxSnowflakeHistoricSqlQueryClientOptions = KtxSnowflakeScanConnectorOptions; + +export class KtxSnowflakeHistoricSqlQueryClient { + private readonly connectionId: string; + private readonly connector: KtxSnowflakeScanConnector; + + constructor(options: KtxSnowflakeHistoricSqlQueryClientOptions) { + this.connectionId = options.connectionId; + this.connector = new KtxSnowflakeScanConnector(options); + } + + async executeQuery( + sql: string, + ): Promise<{ headers: string[]; rows: unknown[][]; totalRows: number }> { + const result = await this.connector.executeReadOnly( + { connectionId: this.connectionId, sql }, + {} as never, + ); + return { + headers: result.headers, + rows: result.rows, + totalRows: result.totalRows, + }; + } + + async cleanup(): Promise { + await this.connector.cleanup(); + } +} diff --git a/packages/cli/src/connectors/snowflake/live-database-introspection.ts b/packages/cli/src/connectors/snowflake/live-database-introspection.ts index 58812c1a..2becd219 100644 --- a/packages/cli/src/connectors/snowflake/live-database-introspection.ts +++ b/packages/cli/src/connectors/snowflake/live-database-introspection.ts @@ -1,4 +1,7 @@ -import type { LiveDatabaseIntrospectionPort } from '../../context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from '../../context/ingest/adapters/live-database/types.js'; import type { KtxProjectConnectionConfig } from '../../context/project/config.js'; import { KtxSnowflakeScanConnector, @@ -9,6 +12,7 @@ import { interface CreateSnowflakeLiveDatabaseIntrospectionOptions { connections: Record; + projectDir?: string; driverFactory?: KtxSnowflakeDriverFactory; sdkOptionsProvider?: KtxSnowflakeSdkOptionsProvider; now?: () => Date; @@ -18,18 +22,23 @@ export function createSnowflakeLiveDatabaseIntrospection( options: CreateSnowflakeLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions) { const connection = options.connections[connectionId] as KtxSnowflakeConnectionConfig | undefined; const connector = new KtxSnowflakeScanConnector({ connectionId, connection, + ...(options.projectDir ? { projectDir: options.projectDir } : {}), driverFactory: options.driverFactory, sdkOptionsProvider: options.sdkOptionsProvider, now: options.now, }); try { return await connector.introspect( - { connectionId, driver: 'snowflake' }, + { + connectionId, + driver: 'snowflake', + ...(introspectionOptions?.tableScope ? { tableScope: introspectionOptions.tableScope } : {}), + }, { runId: `snowflake-${connectionId}` }, ); } finally { diff --git a/packages/cli/src/connectors/snowflake/sdk-logger.test.ts b/packages/cli/src/connectors/snowflake/sdk-logger.test.ts new file mode 100644 index 00000000..73bf0c76 --- /dev/null +++ b/packages/cli/src/connectors/snowflake/sdk-logger.test.ts @@ -0,0 +1,57 @@ +import { mkdtempSync, rmSync, statSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, resolve } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +const { configure } = vi.hoisted(() => ({ configure: vi.fn() })); +vi.mock('snowflake-sdk', () => ({ + default: { configure }, +})); + +import { + configureSnowflakeSdkLogger, + resetSnowflakeSdkLoggerConfigurationForTests, +} from './sdk-logger.js'; + +describe('configureSnowflakeSdkLogger', () => { + let projectDir: string; + + beforeEach(() => { + configure.mockReset(); + resetSnowflakeSdkLoggerConfigurationForTests(); + projectDir = mkdtempSync(join(tmpdir(), 'ktx-snowflake-logger-')); + }); + + afterEach(() => { + rmSync(projectDir, { recursive: true, force: true }); + }); + + it('routes logs to /.ktx/logs/snowflake.log with console output disabled', () => { + const expected = resolve(projectDir, '.ktx', 'logs', 'snowflake.log'); + const returned = configureSnowflakeSdkLogger(projectDir); + expect(returned).toBe(expected); + expect(configure).toHaveBeenCalledTimes(1); + expect(configure).toHaveBeenCalledWith({ + logFilePath: expected, + additionalLogToConsole: false, + }); + expect(statSync(resolve(projectDir, '.ktx', 'logs')).isDirectory()).toBe(true); + }); + + it('is idempotent for the same projectDir', () => { + configureSnowflakeSdkLogger(projectDir); + configureSnowflakeSdkLogger(projectDir); + expect(configure).toHaveBeenCalledTimes(1); + }); + + it('reconfigures when projectDir changes', () => { + const other = mkdtempSync(join(tmpdir(), 'ktx-snowflake-logger-other-')); + try { + configureSnowflakeSdkLogger(projectDir); + configureSnowflakeSdkLogger(other); + expect(configure).toHaveBeenCalledTimes(2); + } finally { + rmSync(other, { recursive: true, force: true }); + } + }); +}); diff --git a/packages/cli/src/connectors/snowflake/sdk-logger.ts b/packages/cli/src/connectors/snowflake/sdk-logger.ts new file mode 100644 index 00000000..f6eed277 --- /dev/null +++ b/packages/cli/src/connectors/snowflake/sdk-logger.ts @@ -0,0 +1,32 @@ +import { mkdirSync } from 'node:fs'; +import { resolve } from 'node:path'; +import snowflake from 'snowflake-sdk'; + +let configuredLogFilePath: string | null = null; + +/** + * Redirects the snowflake-sdk logger to a project-scoped file so its JSON output + * does not bleed into the CLI's TTY (which would pollute the setup wizard and + * break the in-place progress repainter in `context-build-view.ts`). + * + * Idempotent per process: re-calling with the same projectDir is a no-op. + */ +export function configureSnowflakeSdkLogger(projectDir: string): string { + const logDir = resolve(projectDir, '.ktx', 'logs'); + const logFilePath = resolve(logDir, 'snowflake.log'); + if (configuredLogFilePath === logFilePath) { + return logFilePath; + } + mkdirSync(logDir, { recursive: true }); + snowflake.configure({ + logFilePath, + additionalLogToConsole: false, + }); + configuredLogFilePath = logFilePath; + return logFilePath; +} + +/** @internal */ +export function resetSnowflakeSdkLoggerConfigurationForTests(): void { + configuredLogFilePath = null; +} diff --git a/packages/cli/src/connectors/sqlite/connector.test.ts b/packages/cli/src/connectors/sqlite/connector.test.ts index 77ec4b3c..ecd283b7 100644 --- a/packages/cli/src/connectors/sqlite/connector.test.ts +++ b/packages/cli/src/connectors/sqlite/connector.test.ts @@ -6,6 +6,7 @@ import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { createSqliteLiveDatabaseIntrospection } from '../../connectors/sqlite/live-database-introspection.js'; import { isKtxSqliteConnectionConfig, KtxSqliteScanConnector, sqliteDatabasePathFromConfig } from '../../connectors/sqlite/connector.js'; +import { tableRefSet } from '../../context/scan/table-ref.js'; describe('KtxSqliteScanConnector', () => { let tempDir: string; @@ -196,6 +197,19 @@ describe('KtxSqliteScanConnector', () => { ).resolves.toBeNull(); }); + it('limits introspection to tables in tableScope', async () => { + const connector = new KtxSqliteScanConnector({ + connectionId: 'warehouse', + connection: { driver: 'sqlite', path: dbPath }, + }); + const scope = tableRefSet([{ catalog: null, db: null, name: 'orders' }]); + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'sqlite', tableScope: scope }, + { runId: 'scope-test' }, + ); + expect(snapshot.tables.map((table) => table.name)).toEqual(['orders']); + }); + it('adapts native SQLite snapshots to live-database introspection for local ingest', async () => { const introspection = createSqliteLiveDatabaseIntrospection({ projectDir: tempDir, diff --git a/packages/cli/src/connectors/sqlite/connector.ts b/packages/cli/src/connectors/sqlite/connector.ts index e915c776..17b33a71 100644 --- a/packages/cli/src/connectors/sqlite/connector.ts +++ b/packages/cli/src/connectors/sqlite/connector.ts @@ -6,6 +6,7 @@ import { fileURLToPath } from 'node:url'; import { assertReadOnlySql, limitSqlForExecution } from '../../context/connections/read-only-sql.js'; import { normalizeQueryRows } from '../../context/connections/query-executor.js'; import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaForeignKey, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableRef, type KtxTableSampleInput, type KtxTableSampleResult } from '../../context/scan/types.js'; +import { scopedTableNames } from '../../context/scan/table-ref.js'; import { KtxSqliteDialect } from './dialect.js'; export interface KtxSqliteConnectionConfig { @@ -181,11 +182,16 @@ export class KtxSqliteScanConnector implements KtxScanConnector { async introspect(input: KtxScanInput, _ctx: KtxScanContext): Promise { this.assertConnection(input.connectionId); const database = this.database(); - const rawTables = database - .prepare( - `SELECT name, type FROM sqlite_master WHERE type IN ('table', 'view') AND name NOT LIKE 'sqlite_%' ORDER BY name`, - ) - .all() as SqliteMasterRow[]; + const scopedNames = input.tableScope ? scopedTableNames(input.tableScope, { catalog: null, db: null }) : null; + const scopeClause = scopedNames ? `AND name IN (${scopedNames.map(() => '?').join(', ')})` : ''; + const rawTables = + scopedNames && scopedNames.length === 0 + ? [] + : (database + .prepare( + `SELECT name, type FROM sqlite_master WHERE type IN ('table', 'view') AND name NOT LIKE 'sqlite_%' ${scopeClause} ORDER BY name`, + ) + .all(...(scopedNames ?? [])) as SqliteMasterRow[]); const tables = rawTables.map((table) => this.readTable(database, table)); const fileStats = existsSync(this.dbPath) ? statSync(this.dbPath) : null; return { diff --git a/packages/cli/src/connectors/sqlite/live-database-introspection.ts b/packages/cli/src/connectors/sqlite/live-database-introspection.ts index 62a1f8c5..93fae6a9 100644 --- a/packages/cli/src/connectors/sqlite/live-database-introspection.ts +++ b/packages/cli/src/connectors/sqlite/live-database-introspection.ts @@ -1,4 +1,7 @@ -import type { LiveDatabaseIntrospectionPort } from '../../context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from '../../context/ingest/adapters/live-database/types.js'; import type { KtxProjectConnectionConfig } from '../../context/project/config.js'; import { KtxSqliteScanConnector, type KtxSqliteConnectionConfig } from './connector.js'; @@ -12,7 +15,7 @@ export function createSqliteLiveDatabaseIntrospection( options: CreateSqliteLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions) { const connection = options.connections[connectionId] as KtxSqliteConnectionConfig | undefined; const connector = new KtxSqliteScanConnector({ connectionId, @@ -21,7 +24,14 @@ export function createSqliteLiveDatabaseIntrospection( now: options.now, }); try { - return await connector.introspect({ connectionId, driver: 'sqlite' }, { runId: `sqlite-${connectionId}` }); + return await connector.introspect( + { + connectionId, + driver: 'sqlite', + ...(introspectionOptions?.tableScope ? { tableScope: introspectionOptions.tableScope } : {}), + }, + { runId: `sqlite-${connectionId}` }, + ); } finally { await connector.cleanup(); } diff --git a/packages/cli/src/connectors/sqlserver/connector.test.ts b/packages/cli/src/connectors/sqlserver/connector.test.ts index bd9a8af1..ef00bd3a 100644 --- a/packages/cli/src/connectors/sqlserver/connector.test.ts +++ b/packages/cli/src/connectors/sqlserver/connector.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it, vi } from 'vitest'; import { createSqlServerLiveDatabaseIntrospection } from '../../connectors/sqlserver/live-database-introspection.js'; import { isKtxSqlServerConnectionConfig, KtxSqlServerScanConnector, sqlServerConnectionPoolConfigFromConfig, type KtxSqlServerPoolFactory, type KtxSqlServerQueryResult } from '../../connectors/sqlserver/connector.js'; +import { tableRefSet } from '../../context/scan/table-ref.js'; function recordset>( rows: T[], @@ -290,6 +291,55 @@ describe('KtxSqlServerScanConnector', () => { await connector.cleanup(); }); + it('limits introspection to tables in tableScope', async () => { + const queries: string[] = []; + const inputs: Array<{ name: string; value: unknown }> = []; + const request = { + input: vi.fn((name: string, value: unknown) => { + inputs.push({ name, value }); + return request; + }), + query: vi.fn(async (sql: string): Promise => { + queries.push(sql); + if (sql.includes('INFORMATION_SCHEMA.TABLES')) { + return result([{ table_name: 'orders', table_type: 'BASE TABLE' }], ['table_name', 'table_type']); + } + if (sql.includes('INFORMATION_SCHEMA.COLUMNS')) { + return result( + [{ table_name: 'orders', column_name: 'id', data_type: 'int', is_nullable: 'NO' }], + ['table_name', 'column_name', 'data_type', 'is_nullable'], + ); + } + return result([], []); + }), + }; + const poolFactory: KtxSqlServerPoolFactory = { + createPool: vi.fn(async () => ({ + request: () => request, + close: vi.fn(async () => undefined), + })), + }; + const connector = new KtxSqlServerScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'sqlserver', + host: 'db.example.test', + database: 'analytics', + username: 'reader', + schema: 'dbo', + }, + poolFactory, + }); + const scope = tableRefSet([{ catalog: 'analytics', db: 'dbo', name: 'orders' }]); + const snapshot = await connector.introspect( + { connectionId: 'warehouse', driver: 'sqlserver', tableScope: scope }, + { runId: 'scope-test' }, + ); + expect(snapshot.tables.map((table) => table.name)).toEqual(['orders']); + expect(queries.find((query) => query.includes('INFORMATION_SCHEMA.TABLES'))).toMatch(/TABLE_NAME IN \(@table_0\)/); + expect(inputs).toEqual(expect.arrayContaining([{ name: 'table_0', value: 'orders' }])); + }); + it('adapts native SQL Server snapshots to live-database introspection for local ingest', async () => { const introspection = createSqlServerLiveDatabaseIntrospection({ connections: { diff --git a/packages/cli/src/connectors/sqlserver/connector.ts b/packages/cli/src/connectors/sqlserver/connector.ts index d9c227d7..64b8075e 100644 --- a/packages/cli/src/connectors/sqlserver/connector.ts +++ b/packages/cli/src/connectors/sqlserver/connector.ts @@ -1,5 +1,6 @@ import { assertReadOnlySql } from '../../context/connections/read-only-sql.js'; import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaColumn, type KtxSchemaForeignKey, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableListEntry, type KtxTableRef, type KtxTableSampleInput, type KtxTableSampleResult } from '../../context/scan/types.js'; +import { scopedTableNames } from '../../context/scan/table-ref.js'; import { readFileSync } from 'node:fs'; import { homedir } from 'node:os'; import { resolve } from 'node:path'; @@ -121,6 +122,20 @@ function sqlRecordset( return recordset; } +function tableScopeSql( + scopedNames: readonly string[] | null, + columnExpression: string, +): { clause: string; params: Record } { + if (!scopedNames) return { clause: '', params: {} }; + const params: Record = {}; + const placeholders = scopedNames.map((name, index) => { + const key = `table_${index}`; + params[key] = name; + return `@${key}`; + }); + return { clause: `AND ${columnExpression} IN (${placeholders.join(', ')})`, params }; +} + class DefaultSqlServerPoolFactory implements KtxSqlServerPoolFactory { async createPool(config: KtxSqlServerPoolConfig): Promise { const pool = await new sql.ConnectionPool(config as sql.config).connect(); @@ -314,7 +329,10 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { this.assertConnection(input.connectionId); const tables: KtxSchemaTable[] = []; for (const schemaName of this.schemas) { - tables.push(...(await this.introspectSchema(schemaName))); + const scopedNames = input.tableScope + ? scopedTableNames(input.tableScope, { catalog: this.poolConfig.database, db: schemaName }) + : null; + tables.push(...(await this.introspectSchema(schemaName, scopedNames))); } return { connectionId: this.connectionId, @@ -461,16 +479,19 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { } } - private async introspectSchema(schemaName: string): Promise { + private async introspectSchema(schemaName: string, scopedNames: readonly string[] | null): Promise { + if (scopedNames && scopedNames.length === 0) return []; + const tableScope = tableScopeSql(scopedNames, 'TABLE_NAME'); const tables = await this.queryRaw<{ table_name: string; table_type: string }>( ` SELECT TABLE_NAME AS table_name, TABLE_TYPE AS table_type FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = @schemaName AND TABLE_TYPE IN ('BASE TABLE', 'VIEW') + ${tableScope.clause} ORDER BY TABLE_NAME `, - { schemaName }, + { schemaName, ...tableScope.params }, ); const columns = await this.queryRaw<{ table_name: string; @@ -482,15 +503,16 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { SELECT TABLE_NAME AS table_name, COLUMN_NAME AS column_name, DATA_TYPE AS data_type, IS_NULLABLE AS is_nullable FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = @schemaName + ${tableScope.clause} ORDER BY TABLE_NAME, ORDINAL_POSITION `, - { schemaName }, + { schemaName, ...tableScope.params }, ); - const tableComments = await this.tableComments(schemaName); - const columnComments = await this.columnComments(schemaName); - const primaryKeys = await this.primaryKeys(schemaName); - const foreignKeys = await this.foreignKeys(schemaName); - const rowCounts = await this.rowCounts(schemaName); + const tableComments = await this.tableComments(schemaName, scopedNames); + const columnComments = await this.columnComments(schemaName, scopedNames); + const primaryKeys = await this.primaryKeys(schemaName, scopedNames); + const foreignKeys = await this.foreignKeys(schemaName, scopedNames); + const rowCounts = await this.rowCounts(schemaName, scopedNames); const columnsByTable = groupByTable(columns); const foreignKeysByTable = groupByTable(foreignKeys); @@ -508,7 +530,8 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { })); } - private async tableComments(schemaName: string): Promise> { + private async tableComments(schemaName: string, scopedNames: readonly string[] | null): Promise> { + const tableScope = tableScopeSql(scopedNames, 'o.name'); const rows = await this.queryRaw<{ table_name: string; table_comment: string }>( ` SELECT o.name AS table_name, CAST(ep.value AS NVARCHAR(MAX)) AS table_comment @@ -519,13 +542,15 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { AND ep.name = 'MS_Description' WHERE s.name = @schemaName AND o.type IN ('U', 'V') + ${tableScope.clause} `, - { schemaName }, + { schemaName, ...tableScope.params }, ); return new Map(rows.map((row) => [row.table_name, row.table_comment])); } - private async columnComments(schemaName: string): Promise> { + private async columnComments(schemaName: string, scopedNames: readonly string[] | null): Promise> { + const tableScope = tableScopeSql(scopedNames, 'o.name'); const rows = await this.queryRaw<{ table_name: string; column_name: string; column_comment: string }>( ` SELECT o.name AS table_name, c.name AS column_name, CAST(ep.value AS NVARCHAR(MAX)) AS column_comment @@ -537,13 +562,18 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { AND ep.name = 'MS_Description' WHERE s.name = @schemaName AND o.type IN ('U', 'V') + ${tableScope.clause} `, - { schemaName }, + { schemaName, ...tableScope.params }, ); return new Map(rows.map((row) => [`${row.table_name}.${row.column_name}`, row.column_comment])); } - private async primaryKeys(schemaName: string): Promise>> { + private async primaryKeys( + schemaName: string, + scopedNames: readonly string[] | null, + ): Promise>> { + const tableScope = tableScopeSql(scopedNames, 'tc.TABLE_NAME'); const rows = await this.queryRaw<{ table_name: string; column_name: string }>( ` SELECT tc.TABLE_NAME AS table_name, kcu.COLUMN_NAME AS column_name @@ -553,9 +583,10 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { AND tc.TABLE_SCHEMA = kcu.TABLE_SCHEMA WHERE tc.CONSTRAINT_TYPE = 'PRIMARY KEY' AND tc.TABLE_SCHEMA = @schemaName + ${tableScope.clause} ORDER BY tc.TABLE_NAME, kcu.ORDINAL_POSITION `, - { schemaName }, + { schemaName, ...tableScope.params }, ); const grouped = new Map>(); for (const row of rows) { @@ -566,7 +597,10 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { return grouped; } - private async foreignKeys(schemaName: string): Promise< + private async foreignKeys( + schemaName: string, + scopedNames: readonly string[] | null, + ): Promise< Array<{ table_name: string; column_name: string; @@ -576,6 +610,7 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { constraint_name: string; }> > { + const tableScope = tableScopeSql(scopedNames, 'fk.TABLE_NAME'); return this.queryRaw( ` SELECT @@ -596,13 +631,15 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { AND pk.CONSTRAINT_NAME = rc.UNIQUE_CONSTRAINT_NAME AND pk.ORDINAL_POSITION = fk.ORDINAL_POSITION WHERE fk.TABLE_SCHEMA = @schemaName + ${tableScope.clause} ORDER BY fk.TABLE_NAME, fk.COLUMN_NAME `, - { schemaName }, + { schemaName, ...tableScope.params }, ); } - private async rowCounts(schemaName: string): Promise> { + private async rowCounts(schemaName: string, scopedNames: readonly string[] | null): Promise> { + const tableScope = tableScopeSql(scopedNames, 't.name'); const rows = await this.queryRaw<{ table_name: string; row_count: unknown }>( ` SELECT t.name AS table_name, SUM(p.rows) AS row_count @@ -611,9 +648,10 @@ export class KtxSqlServerScanConnector implements KtxScanConnector { INNER JOIN sys.schemas s ON t.schema_id = s.schema_id WHERE s.name = @schemaName AND p.index_id IN (0, 1) + ${tableScope.clause} GROUP BY t.name `, - { schemaName }, + { schemaName, ...tableScope.params }, ); return new Map(rows.map((row) => [row.table_name, firstNumber(row.row_count) ?? 0])); } diff --git a/packages/cli/src/connectors/sqlserver/live-database-introspection.ts b/packages/cli/src/connectors/sqlserver/live-database-introspection.ts index 6bd54ba1..6468856d 100644 --- a/packages/cli/src/connectors/sqlserver/live-database-introspection.ts +++ b/packages/cli/src/connectors/sqlserver/live-database-introspection.ts @@ -1,4 +1,7 @@ -import type { LiveDatabaseIntrospectionPort } from '../../context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from '../../context/ingest/adapters/live-database/types.js'; import type { KtxProjectConnectionConfig } from '../../context/project/config.js'; import { KtxSqlServerScanConnector, @@ -18,7 +21,7 @@ export function createSqlServerLiveDatabaseIntrospection( options: CreateSqlServerLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions) { const connection = options.connections[connectionId] as KtxSqlServerConnectionConfig | undefined; const connector = new KtxSqlServerScanConnector({ connectionId, @@ -29,7 +32,11 @@ export function createSqlServerLiveDatabaseIntrospection( }); try { return await connector.introspect( - { connectionId, driver: 'sqlserver' }, + { + connectionId, + driver: 'sqlserver', + ...(introspectionOptions?.tableScope ? { tableScope: introspectionOptions.tableScope } : {}), + }, { runId: `sqlserver-${connectionId}` }, ); } finally { diff --git a/packages/cli/src/context-build-view.ts b/packages/cli/src/context-build-view.ts index c734c6b9..9f6e5f78 100644 --- a/packages/cli/src/context-build-view.ts +++ b/packages/cli/src/context-build-view.ts @@ -319,7 +319,8 @@ function renderPhaseRow(phase: PhaseState, frame: number, styled: boolean): stri } else if (phase.status === 'skipped') { trailing = styled ? dim('skipped') : 'skipped'; } else if (phase.status === 'failed') { - trailing = styled ? red('failed') : 'failed'; + const label = styled ? red('failed') : 'failed'; + trailing = phase.summary ? `${label} ${phase.summary}` : label; } const bar = `${segments.join(' ')} ${trailing}`.trimEnd(); return ` ${icon} ${name} ${bar}`; diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/connection-dialect.ts b/packages/cli/src/context/ingest/adapters/historic-sql/connection-dialect.ts new file mode 100644 index 00000000..e81966b5 --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/historic-sql/connection-dialect.ts @@ -0,0 +1,48 @@ +import type { HistoricSqlDialect } from './types.js'; + +const KNOWN_DIALECTS = ['postgres', 'bigquery', 'snowflake'] as const; + +function isKnownDialect(value: string): value is HistoricSqlDialect { + return (KNOWN_DIALECTS as readonly string[]).includes(value); +} + +function recordOrNull(value: unknown): Record | null { + return value && typeof value === 'object' && !Array.isArray(value) ? (value as Record) : null; +} + +function historicSqlRecord(connection: unknown): Record | null { + const conn = recordOrNull(connection); + return conn ? recordOrNull(conn.historicSql) : null; +} + +function queryHistoryRecord(connection: unknown): Record | null { + const conn = recordOrNull(connection); + const context = conn ? recordOrNull(conn.context) : null; + return context ? recordOrNull(context.queryHistory) : null; +} + +export function isQueryHistoryEnabled(connection: unknown): boolean { + const queryHistory = queryHistoryRecord(connection); + if (queryHistory) { + return queryHistory.enabled === true; + } + return historicSqlRecord(connection)?.enabled === true; +} + +/** + * Resolves the query-history dialect for a connection. Returns null when + * query history is disabled, or when the connection's driver has no + * query-history reader. + */ +export function queryHistoryDialectForConnection(connection: unknown): HistoricSqlDialect | null { + if (!isQueryHistoryEnabled(connection)) { + return null; + } + const conn = recordOrNull(connection); + const driver = String(conn?.driver ?? '').toLowerCase(); + if (driver === 'postgres' || driver === 'postgresql') return 'postgres'; + if (driver === 'bigquery') return 'bigquery'; + if (driver === 'snowflake') return 'snowflake'; + const legacy = String(historicSqlRecord(connection)?.dialect ?? '').toLowerCase(); + return isKnownDialect(legacy) ? legacy : null; +} diff --git a/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.test.ts b/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.test.ts index 8237d903..9310f148 100644 --- a/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.test.ts +++ b/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.test.ts @@ -1,6 +1,7 @@ import { once } from 'node:events'; import { createServer } from 'node:http'; import { describe, expect, it, vi } from 'vitest'; +import { tableRefSet } from '../../../scan/table-ref.js'; import { createDaemonLiveDatabaseIntrospection } from './daemon-introspection.js'; const daemonResponse = { @@ -161,7 +162,11 @@ describe('createDaemonLiveDatabaseIntrospection', () => { baseUrl: `http://127.0.0.1:${address.port}`, }); - await expect(introspection.extractSchema('warehouse')).resolves.toMatchObject({ + await expect( + introspection.extractSchema('warehouse', { + tableScope: tableRefSet([{ catalog: 'warehouse', db: 'public', name: 'orders' }]), + }), + ).resolves.toMatchObject({ connectionId: 'warehouse', tables: [{ name: 'customers' }, { name: 'orders' }], }); @@ -176,6 +181,7 @@ describe('createDaemonLiveDatabaseIntrospection', () => { schemas: ['public'], statement_timeout_ms: 30_000, connection_timeout_seconds: 5, + table_scope: [{ catalog: 'warehouse', db: 'public', name: 'orders' }], }, }, ]); @@ -217,7 +223,7 @@ describe('createDaemonLiveDatabaseIntrospection', () => { expect(runJson).not.toHaveBeenCalled(); }); - it('filters out tables not on the enabled_tables allowlist', async () => { + it('does not use connection enabled_tables as a response filter', async () => { const runJson = vi.fn(async () => daemonResponse); const introspection = createDaemonLiveDatabaseIntrospection({ connections: { @@ -232,7 +238,8 @@ describe('createDaemonLiveDatabaseIntrospection', () => { }); const snapshot = await introspection.extractSchema('warehouse'); - expect(snapshot.tables.map((table) => `${table.db}.${table.name}`)).toEqual(['public.orders']); + expect(snapshot.tables.map((table) => `${table.db}.${table.name}`)).toEqual(['public.customers', 'public.orders']); + expect(runJson).toHaveBeenCalledWith('database-introspect', expect.not.objectContaining({ table_scope: expect.anything() })); }); it('passes through every table when enabled_tables is omitted or empty', async () => { diff --git a/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.ts b/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.ts index ff01fda9..f71e332d 100644 --- a/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.ts +++ b/packages/cli/src/context/ingest/adapters/live-database/daemon-introspection.ts @@ -3,10 +3,10 @@ import { request as httpRequest } from 'node:http'; import { request as httpsRequest } from 'node:https'; import { URL } from 'node:url'; import type { KtxProjectConnectionConfig } from '../../../project/config.js'; -import { filterSnapshotTables, resolveEnabledTables } from '../../../scan/enabled-tables.js'; +import { tableRefFromKey } from '../../../scan/table-ref.js'; import type { KtxSchemaColumn, KtxSchemaForeignKey, KtxSchemaSnapshot, KtxSchemaTable } from '../../../scan/types.js'; import { inferKtxDimensionType, normalizeKtxNativeType } from '../../../scan/type-normalization.js'; -import type { LiveDatabaseIntrospectionPort } from './types.js'; +import type { LiveDatabaseIntrospectionOptions, LiveDatabaseIntrospectionPort } from './types.js'; type KtxDaemonDatabaseIntrospectionCommand = 'database-introspect'; @@ -220,6 +220,18 @@ function mapDaemonSnapshot( }; } +function serializeTableScope(options: LiveDatabaseIntrospectionOptions | undefined): Array<{ + catalog: string | null; + db: string | null; + name: string; +}> | undefined { + if (!options?.tableScope) return undefined; + return [...options.tableScope].map((key) => { + const ref = tableRefFromKey(key); + return { catalog: ref.catalog, db: ref.db, name: ref.name }; + }); +} + export function createDaemonLiveDatabaseIntrospection( options: DaemonLiveDatabaseIntrospectionOptions, ): LiveDatabaseIntrospectionPort { @@ -231,8 +243,9 @@ export function createDaemonLiveDatabaseIntrospection( const now = options.now ?? (() => new Date()); return { - async extractSchema(connectionId: string): Promise { + async extractSchema(connectionId: string, introspectionOptions?: LiveDatabaseIntrospectionOptions): Promise { const connection = requirePostgresConnection(options.connections, connectionId); + const tableScope = serializeTableScope(introspectionOptions); const payload = { connection_id: connectionId, driver: normalizeDriver(connection.driver), @@ -240,17 +253,16 @@ export function createDaemonLiveDatabaseIntrospection( schemas, statement_timeout_ms: options.statementTimeoutMs ?? 30_000, connection_timeout_seconds: options.connectionTimeoutSeconds ?? 5, + ...(tableScope !== undefined ? { table_scope: tableScope } : {}), }; const raw = requestJson ? await requestJson('/database/introspect', payload) : await runJson('database-introspect', payload); - const snapshot = mapDaemonSnapshot(raw, { + return mapDaemonSnapshot(raw, { connectionId, extractedAt: now().toISOString(), schemas, }); - const enabledTables = resolveEnabledTables(connection); - return enabledTables ? filterSnapshotTables(snapshot, enabledTables) : snapshot; }, }; } diff --git a/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.test.ts b/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.test.ts index 7e7a3f74..6cd543e1 100644 --- a/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.test.ts +++ b/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.test.ts @@ -1,7 +1,8 @@ -import { mkdtemp } from 'node:fs/promises'; +import { mkdtemp, readdir, rm } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { describe, expect, it, vi } from 'vitest'; +import { tableRefSet, type KtxTableRefKey } from '../../../scan/table-ref.js'; import { LiveDatabaseSourceAdapter } from './live-database.adapter.js'; describe('LiveDatabaseSourceAdapter', () => { @@ -43,7 +44,7 @@ describe('LiveDatabaseSourceAdapter', () => { await adapter.fetch(undefined, dir, { connectionId: 'conn-1', sourceKey: 'live-database' }); - expect(extractSchema).toHaveBeenCalledWith('conn-1'); + expect(extractSchema).toHaveBeenCalledWith('conn-1', { tableScope: undefined }); await expect(adapter.detect(dir)).resolves.toBe(true); const chunked = await adapter.chunk(dir); expect(chunked.workUnits.map((wu) => wu.unitKey)).toEqual(['live-database-public-orders']); @@ -56,4 +57,55 @@ describe('LiveDatabaseSourceAdapter', () => { expect(adapter.source).toBe('live-database'); expect(adapter.skillNames).toEqual(['live_database_ingest']); }); + + it('threads tableScope from fetch context into the introspection port without post-filtering', async () => { + const extractSchema = vi.fn( + async (_connectionId: string, _options?: { tableScope?: ReadonlySet }) => ({ + connectionId: 'warehouse', + driver: 'snowflake' as const, + extractedAt: '2026-05-22T00:00:00.000Z', + scope: {}, + metadata: {}, + tables: [ + { + catalog: 'A', + db: 'MARTS', + name: 'IN_SCOPE', + kind: 'table' as const, + comment: null, + estimatedRows: 0, + columns: [], + foreignKeys: [], + }, + { + catalog: 'A', + db: 'MARTS', + name: 'OUT_OF_SCOPE', + kind: 'table' as const, + comment: null, + estimatedRows: 0, + columns: [], + foreignKeys: [], + }, + ], + }), + ); + const scope = tableRefSet([{ catalog: 'A', db: 'MARTS', name: 'IN_SCOPE' }]); + const adapter = new LiveDatabaseSourceAdapter({ + introspection: { extractSchema }, + }); + const stagedDir = await mkdtemp(join(tmpdir(), 'ktx-livedb-scope-')); + try { + await adapter.fetch(undefined, stagedDir, { + connectionId: 'warehouse', + sourceKey: 'live-database', + tableScope: scope, + }); + expect(extractSchema).toHaveBeenCalledWith('warehouse', { tableScope: scope }); + const tables = await readdir(join(stagedDir, 'tables')); + expect(tables).toHaveLength(2); + } finally { + await rm(stagedDir, { recursive: true, force: true }); + } + }); }); diff --git a/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.ts b/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.ts index 9e5076ab..68087bc0 100644 --- a/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.ts +++ b/packages/cli/src/context/ingest/adapters/live-database/live-database.adapter.ts @@ -14,7 +14,8 @@ export class LiveDatabaseSourceAdapter implements SourceAdapter { } async fetch(_pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise { - const snapshot = await this.deps.introspection.extractSchema(ctx.connectionId); + const tableScope = ctx.tableScope; + const snapshot = await this.deps.introspection.extractSchema(ctx.connectionId, { tableScope }); await writeLiveDatabaseSnapshot(stagedDir, { ...snapshot, connectionId: ctx.connectionId, diff --git a/packages/cli/src/context/ingest/adapters/live-database/types.ts b/packages/cli/src/context/ingest/adapters/live-database/types.ts index b9846b1b..6115387f 100644 --- a/packages/cli/src/context/ingest/adapters/live-database/types.ts +++ b/packages/cli/src/context/ingest/adapters/live-database/types.ts @@ -1,7 +1,12 @@ import type { KtxSchemaSnapshot } from '../../../scan/types.js'; +import type { KtxTableRefKey } from '../../../scan/table-ref.js'; + +export interface LiveDatabaseIntrospectionOptions { + tableScope?: ReadonlySet; +} export interface LiveDatabaseIntrospectionPort { - extractSchema(connectionId: string): Promise; + extractSchema(connectionId: string, options?: LiveDatabaseIntrospectionOptions): Promise; } export interface LiveDatabaseSourceAdapterDeps { diff --git a/packages/cli/src/context/ingest/local-stage-ingest.ts b/packages/cli/src/context/ingest/local-stage-ingest.ts index 5897281f..f10a4a78 100644 --- a/packages/cli/src/context/ingest/local-stage-ingest.ts +++ b/packages/cli/src/context/ingest/local-stage-ingest.ts @@ -9,6 +9,7 @@ import { sanitizeMemoryFlowError } from './memory-flow/live-buffer.js'; import type { MemoryFlowEventSink, MemoryFlowPlannedWorkUnit } from './memory-flow/types.js'; import { buildSyncId } from './raw-sources-paths.js'; import { SqliteLocalIngestStore } from './sqlite-local-ingest-store.js'; +import type { KtxTableRefKey } from '../scan/table-ref.js'; import type { IngestTrigger, SourceAdapter, WorkUnit } from './types.js'; type LocalIngestStatus = 'running' | 'done' | 'error'; @@ -62,6 +63,7 @@ export interface RunLocalStageOnlyIngestOptions { now?: () => Date; dryRun?: boolean; memoryFlow?: MemoryFlowEventSink; + tableScope?: ReadonlySet; } const LOCAL_AUTHOR = 'ktx'; @@ -225,6 +227,7 @@ async function prepareLocalStagedDir( stagedDir: string, sourceDir: string | undefined, connectionId: string, + tableScope: ReadonlySet | undefined, ): Promise { await rm(stagedDir, { recursive: true, force: true }); await mkdir(stagedDir, { recursive: true }); @@ -242,7 +245,7 @@ async function prepareLocalStagedDir( ); } const pullConfig = await localPullConfigForAdapter(project, adapter, connectionId); - await adapter.fetch(pullConfig, stagedDir, { connectionId, sourceKey: adapter.source }); + await adapter.fetch(pullConfig, stagedDir, { connectionId, sourceKey: adapter.source, tableScope }); return null; } @@ -274,7 +277,14 @@ async function runLocalStageOnlyIngestInner(options: RunLocalStageOnlyIngestOpti assertCompatibleExistingRun(existingRun, runId, adapter.source, connectionId); const stagedDir = join(options.project.projectDir, '.ktx/cache/local-ingest', runId, 'staged'); - const sourceDir = await prepareLocalStagedDir(options.project, adapter, stagedDir, options.sourceDir, connectionId); + const sourceDir = await prepareLocalStagedDir( + options.project, + adapter, + stagedDir, + options.sourceDir, + connectionId, + options.tableScope, + ); const detected = await adapter.detect(stagedDir); if (!detected) { diff --git a/packages/cli/src/context/ingest/types.ts b/packages/cli/src/context/ingest/types.ts index 991670f6..337885af 100644 --- a/packages/cli/src/context/ingest/types.ts +++ b/packages/cli/src/context/ingest/types.ts @@ -2,6 +2,7 @@ import type { KtxEmbeddingPort } from '../core/embedding.js'; import type { MemoryAction } from '../../context/memory/types.js'; import type { SemanticLayerService } from '../../context/sl/semantic-layer.service.js'; import type { TouchedSlSource } from '../../context/tools/touched-sl-sources.js'; +import type { KtxTableRefKey } from '../scan/table-ref.js'; import type { MemoryFlowEventSink } from './memory-flow/types.js'; import type { StageIndex } from './stages/stage-index.types.js'; import type { WorkUnitOutcome } from './stages/stage-3-work-units.js'; @@ -52,6 +53,7 @@ export interface ChunkResult { export interface FetchContext { connectionId: string; sourceKey: string; + tableScope?: ReadonlySet; memoryFlow?: MemoryFlowEventSink; } diff --git a/packages/cli/src/context/llm/claude-code-runtime.test.ts b/packages/cli/src/context/llm/claude-code-runtime.test.ts index 38959140..b1003b78 100644 --- a/packages/cli/src/context/llm/claude-code-runtime.test.ts +++ b/packages/cli/src/context/llm/claude-code-runtime.test.ts @@ -91,9 +91,14 @@ describe('ClaudeCodeKtxLlmRuntime', () => { }); }); - it('validates structured output with the caller schema', async () => { + it('validates structured output with the caller schema and whitelists the SDK StructuredOutput tool', async () => { const schema = z.object({ answer: z.string() }); - const query = vi.fn((_input: any) => stream([initMessage(), resultMessage({ structured_output: { answer: 'yes' } })])); + const query = vi.fn((_input: any) => + stream([ + initMessage({ tools: ['StructuredOutput'] }), + resultMessage({ structured_output: { answer: 'yes' } }), + ]), + ); const runtime = new ClaudeCodeKtxLlmRuntime({ projectDir: '/tmp/project', modelSlots: { default: 'sonnet' }, @@ -341,7 +346,10 @@ describe('ClaudeCodeKtxLlmRuntime', () => { it('passes scrubbed env to object generation and agent loops', async () => { const schema = z.object({ answer: z.string() }); const objectQuery = vi.fn((_input: any) => - stream([initMessage(), resultMessage({ structured_output: { answer: 'yes' } })]), + stream([ + initMessage({ tools: ['StructuredOutput'] }), + resultMessage({ structured_output: { answer: 'yes' } }), + ]), ); const objectRuntime = new ClaudeCodeKtxLlmRuntime({ projectDir: '/tmp/project', diff --git a/packages/cli/src/context/llm/claude-code-runtime.ts b/packages/cli/src/context/llm/claude-code-runtime.ts index c6783d71..0eb3eadb 100644 --- a/packages/cli/src/context/llm/claude-code-runtime.ts +++ b/packages/cli/src/context/llm/claude-code-runtime.ts @@ -47,6 +47,13 @@ const BUILTIN_TOOLS = [ const KTX_MCP_SERVER_NAME = 'ktx'; +// SDK-internal pseudo-tool that the Claude Code CLI announces in its +// system/init message whenever outputFormat: { type: 'json_schema' } is set. +// Structured output is returned via result.structured_output (not through +// canUseTool), so the tool only needs to be whitelisted for generateObject's +// init isolation check; generateText / runAgentLoop never see it. +const STRUCTURED_OUTPUT_TOOL_NAME = 'StructuredOutput'; + function isResult(message: SDKMessage): message is SDKResultMessage { return message.type === 'result'; } @@ -238,7 +245,12 @@ export class ClaudeCodeKtxLlmRuntime implements KtxLlmRuntimePort { projectDir: this.deps.projectDir, model: modelForRole(this.deps.modelSlots, input.role), env: this.deps.env, - maxTurns: 1, + // Structured output occasionally takes more than one assistant turn — + // the model may emit thinking/text before the StructuredOutput tool + // call, or the SDK may count assistant + tool-result as separate turns. + // 5 leaves headroom without enabling unbounded loops; the json_schema + // constraint still forces the final answer to be the schema. + maxTurns: 5, tools: input.tools, }), outputFormat: { type: 'json_schema' as const, schema: jsonSchema(input.schema as z.ZodType) }, @@ -247,7 +259,7 @@ export class ClaudeCodeKtxLlmRuntime implements KtxLlmRuntimePort { query: this.runQuery, prompt: [input.system, input.prompt].filter(Boolean).join('\n\n'), options, - allowedToolIds: new Set(mcpToolIds(input.tools ?? {})), + allowedToolIds: new Set([...mcpToolIds(input.tools ?? {}), STRUCTURED_OUTPUT_TOOL_NAME]), expectedMcpServerNames: expectedMcpServerNames(input.tools), }); const error = resultError(result); diff --git a/packages/cli/src/context/project/config.test.ts b/packages/cli/src/context/project/config.test.ts index 3b7f2feb..55188aa2 100644 --- a/packages/cli/src/context/project/config.test.ts +++ b/packages/cli/src/context/project/config.test.ts @@ -74,6 +74,7 @@ connections: maxLlmTablesPerBatch: 40, maxCandidatesPerColumn: 25, profileSampleRows: 10000, + profileConcurrency: 4, validationConcurrency: 4, }, }, @@ -278,6 +279,7 @@ scan: maxLlmTablesPerBatch: 12 maxCandidatesPerColumn: 7 profileSampleRows: 500 + profileConcurrency: 3 validationConcurrency: 2 validationBudget: 0 `); @@ -291,6 +293,7 @@ scan: maxLlmTablesPerBatch: 12, maxCandidatesPerColumn: 7, profileSampleRows: 500, + profileConcurrency: 3, validationConcurrency: 2, validationBudget: 0, }); @@ -302,6 +305,7 @@ scan: expect(serializeKtxProjectConfig(config)).toContain('maxLlmTablesPerBatch: 12'); expect(serializeKtxProjectConfig(config)).toContain('maxCandidatesPerColumn: 7'); expect(serializeKtxProjectConfig(config)).toContain('profileSampleRows: 500'); + expect(serializeKtxProjectConfig(config)).toContain('profileConcurrency: 3'); expect(serializeKtxProjectConfig(config)).toContain('validationConcurrency: 2'); expect(serializeKtxProjectConfig(config)).toContain('validationBudget: 0'); }); @@ -326,6 +330,7 @@ scan: maxLlmTablesPerBatch: 0 maxCandidatesPerColumn: -4 profileSampleRows: 0 + profileConcurrency: 0 validationConcurrency: 0 validationBudget: 1.5 `; @@ -341,6 +346,7 @@ scan: 'scan.relationships.maxLlmTablesPerBatch', 'scan.relationships.maxCandidatesPerColumn', 'scan.relationships.profileSampleRows', + 'scan.relationships.profileConcurrency', 'scan.relationships.validationConcurrency', 'scan.relationships.validationBudget', ]), diff --git a/packages/cli/src/context/project/config.ts b/packages/cli/src/context/project/config.ts index 2824ca59..e83f502e 100644 --- a/packages/cli/src/context/project/config.ts +++ b/packages/cli/src/context/project/config.ts @@ -163,6 +163,11 @@ const scanRelationshipsSchema = z .default(25) .describe('Maximum number of candidate join partners considered per column during relationship discovery.'), profileSampleRows: z.int().positive().default(10000).describe('Number of rows sampled per table when profiling values for relationship inference.'), + profileConcurrency: z + .int() + .positive() + .default(4) + .describe('Parallel relationship-profile queries run against the database during scan.'), validationConcurrency: z.int().positive().default(4).describe('Number of relationship validation queries run in parallel against the database.'), validationBudget: z .union([z.literal('all'), z.int().nonnegative()]) diff --git a/packages/cli/src/context/scan/description-generation.test.ts b/packages/cli/src/context/scan/description-generation.test.ts index e47d32be..bc7b1e25 100644 --- a/packages/cli/src/context/scan/description-generation.test.ts +++ b/packages/cli/src/context/scan/description-generation.test.ts @@ -378,6 +378,121 @@ describe('KtxDescriptionGenerator', () => { expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders'); expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders'); }); + + it('generates one structured table description and reuses table samples for all columns', async () => { + const llmRuntime = createLlmProvider('unused'); + llmRuntime.generateObject = vi.fn(async () => ({ + tableDescription: 'Commerce orders', + columns: [ + { name: 'status', description: 'Current order state' }, + { name: 'amount', description: 'Order amount in dollars' }, + ], + })); + const connector = createConnector(); + const generator = new KtxDescriptionGenerator({ + llmRuntime, + settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 }, + }); + + const result = await generator.generateBatchedTableDescriptions({ + connectionId: 'conn-1', + connector, + context: { runId: 'run-1' }, + dataSourceType: 'POSTGRESQL', + supportsNestedAnalysis: false, + table: { + catalog: null, + db: 'public', + name: 'orders', + rawDescriptions: { db: 'Orders fact table' }, + columns: [ + { name: 'status', type: 'text' }, + { name: 'amount', type: 'numeric' }, + ], + }, + }); + + expect(result.tableDescription).toBe('Commerce orders'); + expect(Object.fromEntries(result.columnDescriptions)).toEqual({ + status: 'Current order state', + amount: 'Order amount in dollars', + }); + expect(connector.sampleTable).toHaveBeenCalledTimes(1); + expect(connector.sampleColumn).not.toHaveBeenCalled(); + expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1); + expect(llmRuntime.generateText).not.toHaveBeenCalled(); + }); + + it('falls back to one column generateText call for each missing structured column', async () => { + const llmRuntime = createLlmProvider('Fallback status'); + llmRuntime.generateObject = vi.fn(async () => ({ + tableDescription: 'Commerce orders', + columns: [{ name: 'amount', description: 'Order amount in dollars' }], + })); + const connector = createConnector(); + const generator = new KtxDescriptionGenerator({ + llmRuntime, + settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 }, + }); + + const result = await generator.generateBatchedTableDescriptions({ + connectionId: 'conn-1', + connector, + context: { runId: 'run-1' }, + dataSourceType: 'POSTGRESQL', + supportsNestedAnalysis: false, + table: { + catalog: null, + db: 'public', + name: 'orders', + columns: [ + { name: 'status', type: 'text' }, + { name: 'amount', type: 'numeric' }, + ], + }, + }); + + expect(Object.fromEntries(result.columnDescriptions)).toEqual({ + status: 'Fallback status', + amount: 'Order amount in dollars', + }); + expect(connector.sampleColumn).not.toHaveBeenCalled(); + expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1); + expect(llmRuntime.generateText).toHaveBeenCalledTimes(1); + }); + + it('does not run per-column fallback when structured object generation throws', async () => { + const llmRuntime = createLlmProvider('Fallback description'); + llmRuntime.generateObject = vi.fn(async () => { + throw new Error('object output unavailable'); + }); + const warnings: string[] = []; + const generator = new KtxDescriptionGenerator({ + llmRuntime, + onWarning: (warning) => warnings.push(warning.code), + settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 }, + }); + + const result = await generator.generateBatchedTableDescriptions({ + connectionId: 'conn-1', + connector: createConnector(), + context: { runId: 'run-1' }, + dataSourceType: 'POSTGRESQL', + supportsNestedAnalysis: false, + table: { + catalog: null, + db: 'public', + name: 'orders', + columns: [{ name: 'status', type: 'text' }], + }, + }); + + expect(result.tableDescription).toBeNull(); + expect(Object.fromEntries(result.columnDescriptions)).toEqual({ status: null }); + expect(warnings).toContain('enrichment_failed'); + expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1); + expect(llmRuntime.generateText).not.toHaveBeenCalled(); + }); }); describe('KtxDescriptionGenerator resilience', () => { diff --git a/packages/cli/src/context/scan/description-generation.ts b/packages/cli/src/context/scan/description-generation.ts index 4526215d..c6a41449 100644 --- a/packages/cli/src/context/scan/description-generation.ts +++ b/packages/cli/src/context/scan/description-generation.ts @@ -1,4 +1,5 @@ import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js'; +import { z } from 'zod'; import type { KtxColumnSampleInput, KtxColumnSampleResult, @@ -53,7 +54,7 @@ export interface KtxDescriptionColumn { sampleValues?: unknown[]; } -export interface KtxDescriptionColumnTable extends KtxTableRef { +interface KtxDescriptionColumnTable extends KtxTableRef { columns: KtxDescriptionColumn[]; } @@ -112,6 +113,23 @@ export interface KtxGenerateTableDescriptionInput { table: KtxDescriptionTableInput; } +export interface KtxGenerateBatchedTableDescriptionsInput { + connectionId: string; + connector: KtxDescriptionSamplingPort; + context: KtxScanContext; + dataSourceType: string; + supportsNestedAnalysis: boolean; + table: KtxDescriptionColumnTable & { + rawDescriptions?: Record; + columns: Array; + }; +} + +export interface KtxBatchedTableDescriptionsResult { + tableDescription: string | null; + columnDescriptions: Map; +} + export interface KtxGenerateDataSourceDescriptionInput { connectionId: string; connector: KtxDescriptionSamplingPort; @@ -136,6 +154,18 @@ interface ColumnTaskResult { skipped: boolean; } +const batchedTableDescriptionSchema = z.object({ + tableDescription: z.string(), + columns: z.array( + z.object({ + name: z.string(), + description: z.string(), + }), + ), +}); + +type BatchedTableDescriptionOutput = z.infer; + function descriptionSources(rawDescriptions: Record | undefined): Array<[string, string]> { if (!rawDescriptions) { return []; @@ -250,6 +280,76 @@ function wordLimitLine(maxWords: number): string { return `Please provide a concise description in ${maxWords} words or less.`; } +function sampleValuesByColumn( + columns: readonly KtxDescriptionColumn[], + sampleData: KtxTableSampleResult | null, +): Map { + const values = new Map(); + for (const column of columns) { + const existingValues = column.sampleValues?.filter((value) => value !== null && value !== undefined) ?? []; + if (existingValues.length > 0) { + values.set(column.name, existingValues); + } + } + if (!sampleData) { + return values; + } + for (const column of columns) { + const index = sampleData.headers.findIndex((header) => header.toLowerCase() === column.name.toLowerCase()); + if (index < 0) { + continue; + } + const sampledValues = sampleData.rows + .map((row) => row[index]) + .filter((value) => value !== null && value !== undefined); + if (sampledValues.length > 0) { + values.set(column.name, sampledValues); + } + } + return values; +} + +function batchedPrompt(input: { + table: KtxGenerateBatchedTableDescriptionsInput['table']; + sampleData: KtxTableSampleResult | null; + dataSourceType: string; + tableMaxWords: number; + columnMaxWords: number; +}): KtxDescriptionPrompt { + const columnLines = input.table.columns + .map((column) => { + const typePart = column.type ? ` (${column.type})` : ''; + const commentPart = column.rawDescriptions?.db ? ` - ${column.rawDescriptions.db}` : ''; + return `- ${column.name}${typePart}${commentPart}`; + }) + .join('\n'); + const sampleLines = + input.sampleData && input.sampleData.rows.length > 0 + ? input.sampleData.rows + .slice(0, 5) + .map((row) => + input.sampleData!.headers.map((header, index) => `${header}=${String(row[index] ?? '')}`).join(', '), + ) + .join('\n') + : 'unavailable'; + return { + system: [ + 'Analyze one database table and return structured JSON matching the supplied schema.', + `The table description must be ${input.tableMaxWords} words or less.`, + `Each column description must be ${input.columnMaxWords} words or less.`, + 'Describe business meaning directly. Do not repeat table or column names as filler.', + ].join('\n'), + user: [ + `Table: ${input.table.name}`, + `Data source type: ${input.dataSourceType}`, + 'Columns:', + columnLines, + 'Sample rows:', + sampleLines, + ].join('\n'), + }; +} + /** @internal */ export function buildKtxColumnDescriptionPrompt( input: KtxColumnDescriptionPromptInput & { maxWords?: number }, @@ -463,11 +563,11 @@ export class KtxDescriptionGenerator { } } - const sampleTable = input.connector.sampleTable; + const connector = input.connector; let sampleData: KtxTableSampleResult | null = null; let fallbackReason: 'capability_missing' | 'sampling_failed' | 'empty_sample' | null = null; - if (!sampleTable) { + if (!connector.sampleTable) { fallbackReason = 'capability_missing'; this.logger?.warn('KTX scan connector does not support table sampling; falling back to metadata-only prompt', { connectorId: input.connector.id, @@ -484,7 +584,7 @@ export class KtxDescriptionGenerator { try { sampleData = await retryAsync( () => - sampleTable( + connector.sampleTable!( { connectionId: input.connectionId, table: tableRef, @@ -582,6 +682,156 @@ export class KtxDescriptionGenerator { } } + async generateBatchedTableDescriptions( + input: KtxGenerateBatchedTableDescriptionsInput, + ): Promise { + const tableRef = toTableRef(input.table); + let sampleData: KtxTableSampleResult | null = null; + let fallbackReason: 'capability_missing' | 'sampling_failed' | 'empty_sample' | null = null; + if (!input.connector.sampleTable) { + fallbackReason = 'capability_missing'; + this.logger?.warn('KTX scan connector does not support table sampling; falling back to metadata-only prompt', { + connectorId: input.connector.id, + table: input.table.name, + }); + this.onWarning?.({ + code: 'connector_capability_missing', + message: `Connector ${input.connector.id} does not support sampleTable; using metadata-only description prompt`, + table: input.table.name, + recoverable: true, + metadata: { connectorId: input.connector.id, capability: 'sampleTable' }, + }); + } else { + try { + sampleData = await retryAsync( + () => + input.connector.sampleTable!( + { + connectionId: input.connectionId, + table: tableRef, + limit: 20, + }, + input.context, + ), + { + attempts: 3, + baseDelayMs: 200, + signal: input.context.signal, + onAttemptFailure: (error, attempt) => { + this.logger?.warn(`sampleTable attempt ${attempt} failed for ${input.table.name}: ${errorMessage(error)}`, { + connectorId: input.connector.id, + table: input.table.name, + attempt, + }); + }, + }, + ); + if (sampleData.rows.length === 0) { + fallbackReason = 'empty_sample'; + this.logger?.warn('sampleTable returned no rows; using metadata-only prompt', { + connectorId: input.connector.id, + table: input.table.name, + }); + } + } catch (error) { + if (error instanceof KtxAbortedError) { + throw error; + } + fallbackReason = 'sampling_failed'; + this.logger?.error(`sampleTable exhausted retries for ${input.table.name}: ${errorMessage(error)}`, { + connectorId: input.connector.id, + table: input.table.name, + }); + this.onWarning?.({ + code: 'sampling_failed', + message: `Failed to sample table ${input.table.name} after retries: ${errorMessage(error)}`, + table: input.table.name, + recoverable: true, + metadata: { connectorId: input.connector.id, error: errorMessage(error) }, + }); + } + } + + const sampleValues = sampleValuesByColumn(input.table.columns, sampleData); + const descriptions = new Map(); + let tableDescription: string | null = null; + let structuredGenerationSucceeded = false; + + try { + const prompt = batchedPrompt({ + table: input.table, + sampleData, + dataSourceType: input.dataSourceType, + tableMaxWords: this.settings.tableMaxWords, + columnMaxWords: this.settings.columnMaxWords, + }); + const generated = await this.llmRuntime.generateObject< + BatchedTableDescriptionOutput, + typeof batchedTableDescriptionSchema + >({ + role: 'candidateExtraction', + system: prompt.system, + prompt: prompt.user, + schema: batchedTableDescriptionSchema, + temperature: this.settings.temperature, + }); + structuredGenerationSucceeded = true; + tableDescription = generated.tableDescription.trim() || null; + const generatedColumns = new Map( + generated.columns.map((column) => [column.name.toLowerCase(), column.description.trim() || null]), + ); + for (const column of input.table.columns) { + const description = generatedColumns.get(column.name.toLowerCase()) ?? null; + descriptions.set(column.name, description); + } + if (tableDescription && fallbackReason !== null) { + this.onWarning?.({ + code: 'description_fallback_used', + message: `Generated table description without sample rows for ${input.table.name} (reason: ${fallbackReason})`, + table: input.table.name, + recoverable: true, + metadata: { connectorId: input.connector.id, reason: fallbackReason }, + }); + } + } catch (error) { + this.logger?.warn(`Batched table description failed for ${input.table.name}: ${errorMessage(error)}`, { + connectorId: input.connector.id, + table: input.table.name, + }); + this.onWarning?.({ + code: 'enrichment_failed', + message: `Failed to generate batched description for table ${input.table.name}: ${errorMessage(error)}`, + table: input.table.name, + recoverable: true, + metadata: { connectorId: input.connector.id }, + }); + } + + if (!structuredGenerationSucceeded) { + for (const column of input.table.columns) { + descriptions.set(column.name, null); + } + return { tableDescription, columnDescriptions: descriptions }; + } + + const tableContext = `Table: ${input.table.name} | Columns: ${input.table.columns.map((column) => column.name).join(', ')} | Data source: ${input.dataSourceType}`; + for (const column of input.table.columns) { + if (descriptions.get(column.name)) { + continue; + } + const fallback = await this.generateColumnDescriptionFromPreparedValues({ + column, + columnValues: sampleValues.get(column.name) ?? [], + tableContext, + dataSourceType: input.dataSourceType, + supportsNestedAnalysis: input.supportsNestedAnalysis, + }); + descriptions.set(column.name, fallback); + } + + return { tableDescription, columnDescriptions: descriptions }; + } + async generateDataSourceDescription(input: KtxGenerateDataSourceDescriptionInput): Promise { if (input.tables.length === 0) { return 'No tables found in database'; @@ -684,11 +934,11 @@ export class KtxDescriptionGenerator { }); columnValues = []; } else { - const sampleColumn = input.connector.sampleColumn; + const connector = input.connector; try { const sample = await retryAsync( () => - sampleColumn( + connector.sampleColumn!( { connectionId: input.connectionId, table: tableRef, @@ -732,27 +982,13 @@ export class KtxDescriptionGenerator { } } - const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined); - const hasRawDescriptions = descriptionSources(column.rawDescriptions).length > 0; - if (nonNullValues.length === 0 && !hasRawDescriptions) { - return { - columnName: column.name, - description: null, - skipped: false, - processed: false, - }; - } - - const prompt = buildKtxColumnDescriptionPrompt({ - columnName: column.name, - columnValues: nonNullValues, + const description = await this.generateColumnDescriptionFromPreparedValues({ + column, + columnValues: columnValues ?? [], tableContext, dataSourceType: input.dataSourceType, supportsNestedAnalysis: input.supportsNestedAnalysis, - rawDescriptions: column.rawDescriptions, - maxWords: this.settings.columnMaxWords, }); - const description = await this.generateAiDescription(prompt, 'ktx-column-description'); if (cacheKey && description) { await this.cache?.set(cacheKey, description); @@ -782,6 +1018,30 @@ export class KtxDescriptionGenerator { } } + private async generateColumnDescriptionFromPreparedValues(input: { + column: KtxDescriptionColumn; + columnValues: unknown[]; + tableContext: string; + dataSourceType: string; + supportsNestedAnalysis: boolean; + }): Promise { + const nonNullValues = input.columnValues.filter((value) => value !== null && value !== undefined); + const hasRawDescriptions = descriptionSources(input.column.rawDescriptions).length > 0; + if (nonNullValues.length === 0 && !hasRawDescriptions) { + return null; + } + const prompt = buildKtxColumnDescriptionPrompt({ + columnName: input.column.name, + columnValues: nonNullValues, + tableContext: input.tableContext, + dataSourceType: input.dataSourceType, + supportsNestedAnalysis: input.supportsNestedAnalysis, + rawDescriptions: input.column.rawDescriptions, + maxWords: this.settings.columnMaxWords, + }); + return this.generateAiDescription(prompt, 'ktx-column-description'); + } + private async generateAiDescription(prompt: KtxDescriptionPrompt, _operationName: string): Promise { try { const text = await this.llmRuntime.generateText({ diff --git a/packages/cli/src/context/scan/enabled-tables.ts b/packages/cli/src/context/scan/enabled-tables.ts index f522d44f..327992ac 100644 --- a/packages/cli/src/context/scan/enabled-tables.ts +++ b/packages/cli/src/context/scan/enabled-tables.ts @@ -1,17 +1,63 @@ -import type { KtxSchemaSnapshot } from './types.js'; +import { tableRefSet, type KtxTableRefKey } from './table-ref.js'; +import type { KtxTableRef } from './types.js'; -export function resolveEnabledTables(connection: Record | undefined): Set | null { +/** + * Parses the `enabled_tables` field on a connection into a scope of + * fully-qualified table refs. Returns `null` when the field is absent or + * empty (meaning "no scope — include every table in the resolved schemas"). + * + * Accepted entry forms: + * "catalog.db.name" — fully qualified + * "db.name" — schema-qualified (catalog = null; legacy / Postgres-shape) + * "name" — bare (catalog = db = null; SQLite-shape) + * { catalog?, db?, name } — escape hatch for identifiers containing dots + * + * The setup wizard writes the fully-qualified form going forward; the lenient + * parser keeps existing project configs working. + */ +export function resolveEnabledTables( + connection: Record | undefined, +): ReadonlySet | null { const raw = connection?.enabled_tables; if (!Array.isArray(raw) || raw.length === 0) return null; - return new Set(raw.filter((v): v is string => typeof v === 'string')); + const refs: KtxTableRef[] = []; + for (const value of raw) { + const parsed = parseEnabledTableEntry(value); + if (parsed) refs.push(parsed); + } + if (refs.length === 0) return null; + return tableRefSet(refs); } -export function filterSnapshotTables(snapshot: KtxSchemaSnapshot, enabledTables: Set): KtxSchemaSnapshot { - return { - ...snapshot, - tables: snapshot.tables.filter((table) => { - const key = table.db ? `${table.db}.${table.name}` : table.name; - return enabledTables.has(key); - }), - }; +function parseEnabledTableEntry(value: unknown): KtxTableRef | null { + if (typeof value === 'string') { + return parseDottedEntry(value); + } + if (value && typeof value === 'object' && !Array.isArray(value)) { + const entry = value as { catalog?: unknown; db?: unknown; name?: unknown }; + const name = typeof entry.name === 'string' ? entry.name : null; + if (!name) return null; + return { + catalog: typeof entry.catalog === 'string' ? entry.catalog : null, + db: typeof entry.db === 'string' ? entry.db : null, + name, + }; + } + return null; +} + +function parseDottedEntry(value: string): KtxTableRef | null { + const trimmed = value.trim(); + if (trimmed.length === 0) return null; + const parts = trimmed.split('.'); + if (parts.length === 3) { + return { catalog: parts[0]!, db: parts[1]!, name: parts[2]! }; + } + if (parts.length === 2) { + return { catalog: null, db: parts[0]!, name: parts[1]! }; + } + if (parts.length === 1) { + return { catalog: null, db: null, name: parts[0]! }; + } + return null; } diff --git a/packages/cli/src/context/scan/local-enrichment-artifacts.test.ts b/packages/cli/src/context/scan/local-enrichment-artifacts.test.ts index 56994568..8a49fc78 100644 --- a/packages/cli/src/context/scan/local-enrichment-artifacts.test.ts +++ b/packages/cli/src/context/scan/local-enrichment-artifacts.test.ts @@ -289,6 +289,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => { maxLlmTablesPerBatch: 12, maxCandidatesPerColumn: 7, profileSampleRows: 500, + profileConcurrency: 3, validationConcurrency: 2, }, }); @@ -378,6 +379,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => { validationRequiredForManifest: true, maxCandidatesPerColumn: 7, profileSampleRows: 500, + profileConcurrency: 3, validationConcurrency: 2, }, profileWarnings: [], @@ -472,6 +474,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => { maxLlmTablesPerBatch: 40, maxCandidatesPerColumn: 25, profileSampleRows: 10000, + profileConcurrency: 4, validationConcurrency: 4, }, dryRun: false, @@ -741,6 +744,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => { maxLlmTablesPerBatch: 40, maxCandidatesPerColumn: 25, profileSampleRows: 10000, + profileConcurrency: 4, validationConcurrency: 4, }, dryRun: false, diff --git a/packages/cli/src/context/scan/local-enrichment-artifacts.ts b/packages/cli/src/context/scan/local-enrichment-artifacts.ts index 3a2d15f6..9de46a98 100644 --- a/packages/cli/src/context/scan/local-enrichment-artifacts.ts +++ b/packages/cli/src/context/scan/local-enrichment-artifacts.ts @@ -382,6 +382,7 @@ export async function writeLocalScanEnrichmentArtifacts( validationRequiredForManifest: input.relationshipSettings.validationRequiredForManifest, maxCandidatesPerColumn: input.relationshipSettings.maxCandidatesPerColumn, profileSampleRows: input.relationshipSettings.profileSampleRows, + profileConcurrency: input.relationshipSettings.profileConcurrency, validationConcurrency: input.relationshipSettings.validationConcurrency, } : undefined, diff --git a/packages/cli/src/context/scan/local-enrichment.test.ts b/packages/cli/src/context/scan/local-enrichment.test.ts index 66b66fc2..9647c8b9 100644 --- a/packages/cli/src/context/scan/local-enrichment.test.ts +++ b/packages/cli/src/context/scan/local-enrichment.test.ts @@ -299,6 +299,38 @@ describe('local scan enrichment', () => { ]); }); + it('uses the supplied snapshot without calling connector.introspect', async () => { + const scanConnector = connector(); + const introspect = vi.mocked(scanConnector.introspect); + + const result = await runLocalScanEnrichment({ + connectionId: 'warehouse', + mode: 'structural', + connector: scanConnector, + snapshot, + context: { runId: 'scan-run-snapshot' }, + providers: null, + }); + + expect(result.snapshot).toEqual(snapshot); + expect(introspect).not.toHaveBeenCalled(); + }); + + it('falls back to connector.introspect when no snapshot is supplied', async () => { + const scanConnector = connector(); + + const result = await runLocalScanEnrichment({ + connectionId: 'warehouse', + mode: 'structural', + connector: scanConnector, + context: { runId: 'scan-run-introspect' }, + providers: null, + }); + + expect(result.snapshot).toEqual(snapshot); + expect(scanConnector.introspect).toHaveBeenCalledTimes(1); + }); + it('runs deterministic relationship detection for relationship scans', async () => { const result = await runLocalScanEnrichment({ connectionId: 'warehouse', @@ -473,7 +505,7 @@ describe('local scan enrichment', () => { expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 }); }); - it('generates table descriptions with bounded table-level concurrency', async () => { + it('generates batched table descriptions with bounded table-level concurrency', async () => { const concurrentSnapshot: KtxSchemaSnapshot = { ...snapshot, tables: Array.from({ length: 8 }, (_, index) => ({ @@ -497,27 +529,27 @@ describe('local scan enrichment', () => { ], })), }; - let activeColumnSamples = 0; - let maxActiveColumnSamples = 0; + let activeTableSamples = 0; + let maxActiveTableSamples = 0; const scanConnector = { ...connector(), introspect: vi.fn(async () => concurrentSnapshot), - sampleColumn: vi.fn(async () => { - activeColumnSamples += 1; - maxActiveColumnSamples = Math.max(maxActiveColumnSamples, activeColumnSamples); + sampleColumn: vi.fn(async () => ({ + values: ['1'], + nullCount: 0, + distinctCount: 1, + })), + sampleTable: vi.fn(async () => { + activeTableSamples += 1; + maxActiveTableSamples = Math.max(maxActiveTableSamples, activeTableSamples); await new Promise((resolve) => setTimeout(resolve, 10)); - activeColumnSamples -= 1; + activeTableSamples -= 1; return { - values: ['1'], - nullCount: 0, - distinctCount: 1, + headers: ['id'], + rows: [[1]], + totalRows: 1, }; }), - sampleTable: vi.fn(async () => ({ - headers: ['id'], - rows: [[1]], - totalRows: 1, - })), }; const settings = { ...buildDefaultKtxProjectConfig().scan.relationships, @@ -533,7 +565,8 @@ describe('local scan enrichment', () => { relationshipSettings: settings, }); - expect(maxActiveColumnSamples).toBe(6); + expect(maxActiveTableSamples).toBe(4); + expect(scanConnector.sampleColumn).not.toHaveBeenCalled(); }); it('reports enrichment progress for countable stages', async () => { @@ -675,7 +708,7 @@ describe('local scan enrichment', () => { providerIdentity: { provider: 'fake', embeddingDimensions: 6 }, }); - const generateText = vi.spyOn(providers.llmRuntime, 'generateText'); + const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject'); const embedBatch = vi.spyOn(providers.embedding, 'embedBatch'); const second = await runLocalScanEnrichment({ connectionId: 'warehouse', @@ -693,7 +726,7 @@ describe('local scan enrichment', () => { expect(first.state.resumedStages).toEqual([]); expect(second.state.resumedStages).toEqual(['descriptions', 'embeddings', 'relationships']); expect(second.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']); - expect(generateText).not.toHaveBeenCalled(); + expect(generateObject).not.toHaveBeenCalled(); expect(embedBatch).not.toHaveBeenCalled(); expect(second.descriptionUpdates).toEqual(first.descriptionUpdates); expect(second.embeddingUpdates).toEqual(first.embeddingUpdates); @@ -731,7 +764,7 @@ describe('local scan enrichment', () => { tables: [{ ...firstTable, name: 'customers' }], })), }; - const generateText = vi.spyOn(providers.llmRuntime, 'generateText'); + const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject'); const result = await runLocalScanEnrichment({ connectionId: 'warehouse', @@ -747,7 +780,7 @@ describe('local scan enrichment', () => { expect(result.state.resumedStages).toEqual([]); expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']); - expect(generateText).toHaveBeenCalled(); + expect(generateObject).toHaveBeenCalled(); }); it('runs providerless enriched scans as relationship-only discovery enrichment', async () => { diff --git a/packages/cli/src/context/scan/local-enrichment.ts b/packages/cli/src/context/scan/local-enrichment.ts index 680f8f60..545b2ad6 100644 --- a/packages/cli/src/context/scan/local-enrichment.ts +++ b/packages/cli/src/context/scan/local-enrichment.ts @@ -1,7 +1,7 @@ import pLimit from 'p-limit'; import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js'; import { buildDefaultKtxProjectConfig, type KtxScanRelationshipConfig } from '../project/config.js'; -import { type KtxDescriptionColumnTable, KtxDescriptionGenerator } from './description-generation.js'; +import { KtxDescriptionGenerator } from './description-generation.js'; import { buildKtxColumnEmbeddingText } from './embedding-text.js'; import { completedKtxScanEnrichmentStateSummary, @@ -41,7 +41,7 @@ import type { KtxTableRef, } from './types.js'; -const DESCRIPTION_TABLE_CONCURRENCY = 6; +const DESCRIPTION_TABLE_CONCURRENCY = 4; export interface KtxLocalScanEnrichmentProviders { llmRuntime: KtxLlmRuntimePort; @@ -53,6 +53,7 @@ export interface KtxLocalScanEnrichmentInput { mode: KtxScanMode; detectRelationships?: boolean; connector: KtxScanConnector; + snapshot?: KtxSchemaSnapshot; context: KtxScanContext; providers: KtxLocalScanEnrichmentProviders | null; stateStore?: KtxScanEnrichmentStateStore | null; @@ -179,7 +180,17 @@ function deterministicLlmRuntime(): KtxLlmRuntimePort { async generateText(input) { return `Deterministic description for ${input.prompt.slice(0, 64).trim() || 'data source'}`; }, - async generateObject() { + async generateObject(input) { + if (input.prompt.includes('Sample rows:')) { + const columns = Array.from(input.prompt.matchAll(/^- ([^\s(]+)/gm), (match) => ({ + name: match[1] ?? 'column', + description: `Deterministic description for ${match[1] ?? 'column'}`, + })); + return { + tableDescription: `Deterministic description for ${input.prompt.slice(0, 64).trim() || 'table'}`, + columns, + } as never; + } return { pkCandidates: [], fkCandidates: [] } as never; }, async runAgentLoop() { @@ -234,30 +245,6 @@ export function snapshotToKtxEnrichedSchema( }; } -function descriptionTable(table: KtxSchemaTable): KtxDescriptionColumnTable { - return { - catalog: table.catalog, - db: table.db, - name: table.name, - columns: table.columns.map((column) => ({ - name: column.name, - ...(column.comment ? { sampleValues: [column.comment], rawDescriptions: { db: column.comment } } : {}), - })), - }; -} - -function tableMetadataColumns(table: KtxSchemaTable): Array<{ - name: string; - nativeType?: string | null; - comment?: string | null; -}> { - return table.columns.map((column) => ({ - name: column.name, - nativeType: column.nativeType ?? null, - comment: column.comment ?? null, - })); -} - function embeddingBatchSize(maxBatchSize: number): number { return Number.isInteger(maxBatchSize) && maxBatchSize > 0 ? maxBatchSize : 100; } @@ -306,32 +293,28 @@ async function generateDescriptions(input: { transient: true, }, ); - const tableInput = descriptionTable(table); - const columnResult = await generator.generateColumnDescriptions({ + const batched = await generator.generateBatchedTableDescriptions({ connectionId: input.snapshot.connectionId, connector: input.connector, context: input.context, dataSourceType: input.snapshot.driver, supportsNestedAnalysis: input.connector.capabilities.nestedAnalysis, - table: tableInput, - }); - const tableDescription = await generator.generateTableDescription({ - connectionId: input.snapshot.connectionId, - connector: input.connector, - context: input.context, - dataSourceType: input.snapshot.driver, table: { catalog: table.catalog, db: table.db, name: table.name, rawDescriptions: table.comment ? { db: table.comment } : {}, - columns: tableMetadataColumns(table), + columns: table.columns.map((column) => ({ + name: column.name, + type: column.nativeType, + ...(column.comment ? { rawDescriptions: { db: column.comment } } : {}), + })), }, }); return { table: tableRef(table), - tableDescription, - columnDescriptions: Object.fromEntries(columnResult.columnDescriptions), + tableDescription: batched.tableDescription, + columnDescriptions: Object.fromEntries(batched.columnDescriptions), }; }), ), @@ -472,15 +455,17 @@ export async function runLocalScanEnrichment( ): Promise { const progress = input.context.progress; await progress?.update(0, 'Loading enrichment schema snapshot'); - const snapshot = await input.connector.introspect( - { - connectionId: input.connectionId, - driver: input.connector.driver, - mode: input.mode, - detectRelationships: input.detectRelationships, - }, - input.context, - ); + const snapshot = + input.snapshot ?? + (await input.connector.introspect( + { + connectionId: input.connectionId, + driver: input.connector.driver, + mode: input.mode, + detectRelationships: input.detectRelationships, + }, + input.context, + )); await progress?.update(0.05, `Loaded schema snapshot with ${snapshot.tables.length} tables`); const now = input.now ?? (() => new Date()); diff --git a/packages/cli/src/context/scan/local-scan.test.ts b/packages/cli/src/context/scan/local-scan.test.ts index 081fa055..7b5af5b0 100644 --- a/packages/cli/src/context/scan/local-scan.test.ts +++ b/packages/cli/src/context/scan/local-scan.test.ts @@ -6,9 +6,15 @@ import YAML from 'yaml'; import type { SourceAdapter } from '../../context/ingest/types.js'; import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js'; import { initKtxProject, type KtxLocalProject, loadKtxProject } from '../../context/project/project.js'; -import { filterSnapshotTables, resolveEnabledTables } from './enabled-tables.js'; +import { resolveEnabledTables } from './enabled-tables.js'; import { getLocalScanReport, getLocalScanStatus, runLocalScan } from './local-scan.js'; -import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxSchemaSnapshot, KtxSchemaTable } from './types.js'; +import { tableRefKey, tableRefSet, type KtxTableRefKey } from './table-ref.js'; +import type { + KtxQueryResult, + KtxReadOnlyQueryInput, + KtxScanConnector, + KtxSchemaSnapshot, +} from './types.js'; function relationshipSqlResult( input: KtxReadOnlyQueryInput, @@ -120,7 +126,43 @@ async function writeDatabaseConfigWithoutIngestAdapters(projectDir: string): Pro ); } -function fetchOnlyAdapter(options: { extractedAt?: () => string } = {}): SourceAdapter { +function defaultFetchSnapshot(options: { extractedAt?: () => string } = {}): KtxSchemaSnapshot { + return { + connectionId: 'warehouse', + driver: 'postgres', + extractedAt: options.extractedAt?.() ?? '2026-04-29T09:00:00.000Z', + scope: { schemas: ['public'] }, + metadata: {}, + tables: [ + { + name: 'orders', + catalog: null, + db: 'public', + kind: 'table', + comment: null, + estimatedRows: null, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: null, + }, + ], + foreignKeys: [], + }, + ], + }; +} + +function fetchOnlyAdapter(options: { extractedAt?: () => string; snapshot?: KtxSchemaSnapshot } = {}): SourceAdapter { + const scanSnapshot = options.snapshot + ? { ...options.snapshot, ...(options.extractedAt ? { extractedAt: options.extractedAt() } : {}) } + : defaultFetchSnapshot(options); + return { source: 'live-database', skillNames: ['live_database_ingest'], @@ -129,39 +171,89 @@ function fetchOnlyAdapter(options: { extractedAt?: () => string } = {}): SourceA await writeFile( join(stagedDir, 'connection.json'), `${JSON.stringify({ - connectionId: 'warehouse', - driver: 'postgres', - ...(options.extractedAt ? { extractedAt: options.extractedAt() } : {}), - scope: { schemas: ['public'] }, - metadata: {}, + connectionId: scanSnapshot.connectionId, + driver: scanSnapshot.driver, + extractedAt: scanSnapshot.extractedAt, + scope: scanSnapshot.scope, + metadata: scanSnapshot.metadata, })}\n`, 'utf-8', ); await writeFile(join(stagedDir, 'foreign-keys.json'), '{"foreignKeys":[]}\n', 'utf-8'); - await writeFile( - join(stagedDir, 'tables', 'orders.json'), - '{"name":"orders","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n', - 'utf-8', - ); + for (const table of scanSnapshot.tables) { + await writeFile(join(stagedDir, 'tables', `${table.name}.json`), `${JSON.stringify(table)}\n`, 'utf-8'); + } }, async detect() { return true; }, async chunk() { return { - workUnits: [ - { - unitKey: 'live-database-public-orders', - rawFiles: ['tables/orders.json'], - dependencyPaths: ['connection.json', 'foreign-keys.json'], - peerFileIndex: [], - }, - ], + workUnits: scanSnapshot.tables.map((table) => ({ + unitKey: `live-database-${table.db ?? 'default'}-${table.name}`, + rawFiles: [`tables/${table.name}.json`], + dependencyPaths: ['connection.json', 'foreign-keys.json'], + peerFileIndex: [], + })), }; }, }; } +function nativeScanSnapshot(): KtxSchemaSnapshot { + return { + connectionId: 'warehouse', + driver: 'postgres', + extractedAt: '2026-04-29T09:00:00.000Z', + scope: { schemas: ['public'] }, + metadata: {}, + tables: [ + { + catalog: null, + db: 'public', + name: 'orders', + kind: 'table', + comment: 'Orders', + estimatedRows: 1, + foreignKeys: [], + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: 'Order id', + }, + ], + }, + ], + }; +} + +function nativeScanConnector(options: { cleanup?: () => Promise } = {}): KtxScanConnector { + return { + id: 'test:warehouse', + driver: 'postgres', + capabilities: { + structuralIntrospection: true, + tableSampling: true, + columnSampling: true, + columnStats: false, + readOnlySql: false, + nestedAnalysis: false, + eventStreamDiscovery: false, + formalForeignKeys: false, + estimatedRowCounts: false, + }, + introspect: vi.fn(async () => nativeScanSnapshot()), + sampleTable: vi.fn(async () => ({ headers: ['id'], rows: [[1]], totalRows: 1 })), + sampleColumn: vi.fn(async () => ({ values: ['1'], nullCount: 0, distinctCount: 1 })), + ...(options.cleanup ? { cleanup: options.cleanup } : {}), + }; +} + describe('local scan', () => { let tempDir: string; let project: KtxLocalProject; @@ -244,6 +336,73 @@ describe('local scan', () => { }); }); + it('passes enabled_tables as fetch context tableScope and does not post-filter staged snapshots', async () => { + project.config.connections.warehouse = { + ...project.config.connections.warehouse, + enabled_tables: ['public.orders'], + }; + let capturedTableScope: ReadonlySet | undefined; + const adapter: SourceAdapter = { + source: 'live-database', + skillNames: ['live_database_ingest'], + async fetch(_pullConfig, stagedDir, ctx) { + capturedTableScope = ctx.tableScope; + await mkdir(join(stagedDir, 'tables'), { recursive: true }); + await writeFile( + join(stagedDir, 'connection.json'), + '{"connectionId":"warehouse","driver":"postgres","scope":{"schemas":["public"]},"metadata":{}}\n', + 'utf-8', + ); + await writeFile(join(stagedDir, 'foreign-keys.json'), '{"foreignKeys":[]}\n', 'utf-8'); + await writeFile( + join(stagedDir, 'tables', 'customers.json'), + '{"name":"customers","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":100,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n', + 'utf-8', + ); + await writeFile( + join(stagedDir, 'tables', 'orders.json'), + '{"name":"orders","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":1000,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n', + 'utf-8', + ); + }, + async detect() { + return true; + }, + async chunk() { + return { + workUnits: [ + { + unitKey: 'live-database-public-customers', + rawFiles: ['tables/customers.json'], + dependencyPaths: ['connection.json', 'foreign-keys.json'], + peerFileIndex: [], + }, + { + unitKey: 'live-database-public-orders', + rawFiles: ['tables/orders.json'], + dependencyPaths: ['connection.json', 'foreign-keys.json'], + peerFileIndex: [], + }, + ], + }; + }, + }; + + const result = await runLocalScan({ + project, + adapters: [adapter], + connectionId: 'warehouse', + jobId: 'scan-strict-scope-fetch', + now: () => new Date('2026-05-22T00:00:00.000Z'), + }); + + expect([...(capturedTableScope ?? [])]).toEqual([...tableRefSet([{ catalog: null, db: 'public', name: 'orders' }])]); + expect(result.report.diffSummary.tablesAdded).toBe(2); + const structuralManifest = await readFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8'); + expect(structuralManifest).toContain('customers:'); + expect(structuralManifest).toContain('orders:'); + }); + it('runs a structural database scan when live-database is not listed in ktx.yaml', async () => { await writeDatabaseConfigWithoutIngestAdapters(project.projectDir); project = await loadKtxProject({ projectDir: project.projectDir }); @@ -265,6 +424,59 @@ describe('local scan', () => { }); }); + it('threads the structural snapshot into enrichment without connector re-introspection', async () => { + project.config.scan.enrichment = { mode: 'deterministic' }; + const connector = nativeScanConnector(); + const introspect = vi.mocked(connector.introspect); + + const result = await runLocalScan({ + project, + adapters: [fetchOnlyAdapter()], + connectionId: 'warehouse', + mode: 'enriched', + connector, + jobId: 'scan-enrichment-snapshot-threading', + now: () => new Date('2026-04-29T09:11:00.000Z'), + }); + + expect(result.report.enrichment.tableDescriptions).toBe('completed'); + expect(introspect).not.toHaveBeenCalled(); + }); + + it('cleans up a scan connector constructed by local scan', async () => { + const cleanup = vi.fn(async () => undefined); + + await runLocalScan({ + project, + adapters: [fetchOnlyAdapter()], + connectionId: 'warehouse', + mode: 'relationships', + detectRelationships: true, + createConnector: vi.fn(async () => nativeScanConnector({ cleanup })), + jobId: 'scan-owned-connector-cleanup', + now: () => new Date('2026-04-29T09:13:00.000Z'), + }); + + expect(cleanup).toHaveBeenCalledTimes(1); + }); + + it('does not clean up a caller-supplied scan connector', async () => { + const cleanup = vi.fn(async () => undefined); + + await runLocalScan({ + project, + adapters: [fetchOnlyAdapter()], + connectionId: 'warehouse', + mode: 'relationships', + detectRelationships: true, + connector: nativeScanConnector({ cleanup }), + jobId: 'scan-supplied-connector-cleanup', + now: () => new Date('2026-04-29T09:13:30.000Z'), + }); + + expect(cleanup).not.toHaveBeenCalled(); + }); + it('reuses scan report and raw-source paths when the same local scan run id is retried', async () => { const first = await runLocalScan({ project, @@ -447,10 +659,11 @@ describe('local scan', () => { }; }, }; + const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() }); const result = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'relationships', detectRelationships: true, @@ -534,10 +747,11 @@ describe('local scan', () => { return relationshipSqlResult(input); }, }; + const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() }); const result = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'relationships', detectRelationships: true, @@ -551,6 +765,142 @@ describe('local scan', () => { expect(result.report.warnings).toEqual([]); }); + it('keeps prototype connector methods when enabled_tables is configured', async () => { + project.config.connections.warehouse = { + ...project.config.connections.warehouse, + enabled_tables: ['public.customers', 'public.orders'], + }; + const scopedAdapter: SourceAdapter = { + source: 'live-database', + skillNames: ['live_database_ingest'], + async fetch(_pullConfig, stagedDir) { + await mkdir(join(stagedDir, 'tables'), { recursive: true }); + await writeFile( + join(stagedDir, 'connection.json'), + '{"connectionId":"warehouse","driver":"postgres","scope":{"schemas":["public"]},"metadata":{}}\n', + 'utf-8', + ); + await writeFile(join(stagedDir, 'foreign-keys.json'), '{"foreignKeys":[]}\n', 'utf-8'); + await writeFile( + join(stagedDir, 'tables', 'customers.json'), + '{"name":"customers","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":100,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n', + 'utf-8', + ); + await writeFile( + join(stagedDir, 'tables', 'orders.json'), + '{"name":"orders","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":1000,"columns":[{"name":"customer_id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":false,"comment":null}],"foreignKeys":[]}\n', + 'utf-8', + ); + }, + async detect() { + return true; + }, + async chunk() { + return { + workUnits: [ + { + unitKey: 'live-database-public-customers', + rawFiles: ['tables/customers.json'], + dependencyPaths: ['connection.json', 'foreign-keys.json'], + peerFileIndex: [], + }, + { + unitKey: 'live-database-public-orders', + rawFiles: ['tables/orders.json'], + dependencyPaths: ['connection.json', 'foreign-keys.json'], + peerFileIndex: [], + }, + ], + }; + }, + }; + class FakeClassConnector implements KtxScanConnector { + readonly id = 'test:warehouse'; + readonly driver = 'postgres' as const; + readonly capabilities = { + structuralIntrospection: true as const, + tableSampling: false, + columnSampling: false, + columnStats: true, + readOnlySql: true, + nestedAnalysis: false, + eventStreamDiscovery: false, + formalForeignKeys: false, + estimatedRowCounts: true, + }; + + async introspect(): Promise { + return { + connectionId: 'warehouse', + driver: 'postgres', + extractedAt: '2026-05-22T00:00:00.000Z', + scope: { schemas: ['public'] }, + metadata: {}, + tables: [ + { + catalog: null, + db: 'public', + name: 'customers', + kind: 'table', + comment: null, + estimatedRows: 100, + foreignKeys: [], + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: null, + }, + ], + }, + { + catalog: null, + db: 'public', + name: 'orders', + kind: 'table', + comment: null, + estimatedRows: 1000, + foreignKeys: [], + columns: [ + { + name: 'customer_id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: false, + comment: null, + }, + ], + }, + ], + }; + } + + async executeReadOnly(input: KtxReadOnlyQueryInput): Promise { + return relationshipSqlResult(input); + } + } + + const result = await runLocalScan({ + project, + adapters: [scopedAdapter], + connectionId: 'warehouse', + mode: 'relationships', + detectRelationships: true, + connector: new FakeClassConnector(), + jobId: 'scan-prototype-connector-scope', + now: () => new Date('2026-05-22T00:00:00.000Z'), + }); + + expect(result.report.relationships.accepted).toBe(1); + expect(result.report.warnings).toEqual([]); + }); + it('threads scan relationship settings into relationship-only local scans', async () => { project.config.scan.enrichment = { mode: 'deterministic' }; project.config.scan.relationships = { @@ -628,10 +978,11 @@ describe('local scan', () => { return relationshipSqlResult(input); }, }; + const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() }); const result = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'relationships', detectRelationships: true, @@ -737,10 +1088,11 @@ describe('local scan', () => { return relationshipSqlResult(input); }, }; + const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() }); const result = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'relationships', detectRelationships: true, @@ -863,10 +1215,11 @@ describe('local scan', () => { return relationshipSqlResult(input); }, }; + const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() }); const result = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'enriched', connector, @@ -993,10 +1346,11 @@ describe('local scan', () => { return relationshipSqlResult(input, { throwOnCoverage: true }); }, }; + const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() }); const result = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'relationships', detectRelationships: true, @@ -1128,7 +1482,8 @@ describe('local scan', () => { join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8', ); - expect(manifestRaw).toContain('ai: "Deterministic description'); + expect(manifestRaw).toContain('ai: |-'); + expect(manifestRaw).toContain('Deterministic description'); }); it('persists structural artifacts and a recoverable warning when standalone enrichment execution fails', async () => { @@ -1301,10 +1656,11 @@ describe('local scan', () => { }, }; const llmRuntime = deterministicLlmRuntime(); + const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() }); const first = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'enriched', connector, @@ -1333,7 +1689,7 @@ describe('local scan', () => { const generateObject = vi.spyOn(llmRuntime, 'generateObject'); const retry = await runLocalScan({ project, - adapters: [fetchOnlyAdapter()], + adapters: [adapter], connectionId: 'warehouse', mode: 'enriched', connector, @@ -1359,7 +1715,6 @@ describe('local scan', () => { failedStages: [], }); expect(retry.report.enrichment.embeddings).toBe('completed'); - expect(generateObject).toHaveBeenCalledTimes(1); expect(generateObject).toHaveBeenCalledWith(expect.objectContaining({ role: 'candidateExtraction' })); expect(embeddingAttempts).toBe(2); @@ -1512,69 +1867,18 @@ describe('resolveEnabledTables', () => { expect(resolveEnabledTables({ driver: 'postgres', enabled_tables: [] })).toBeNull(); }); - it('returns Set of enabled table names', () => { + it('returns a canonical set of enabled table refs', () => { const result = resolveEnabledTables({ driver: 'postgres', enabled_tables: ['public.users', 'public.orders'], }); expect(result).toBeInstanceOf(Set); expect(result!.size).toBe(2); - expect(result!.has('public.users')).toBe(true); - expect(result!.has('public.orders')).toBe(true); + expect(result!.has(tableRefKey({ catalog: null, db: 'public', name: 'users' }))).toBe(true); + expect(result!.has(tableRefKey({ catalog: null, db: 'public', name: 'orders' }))).toBe(true); }); it('returns null for undefined connection', () => { expect(resolveEnabledTables(undefined)).toBeNull(); }); }); - -describe('filterSnapshotTables', () => { - function makeSnapshot(tables: Array<{ db: string; name: string }>): KtxSchemaSnapshot { - return { - connectionId: 'test', - driver: 'postgres', - extractedAt: '2026-01-01T00:00:00Z', - scope: {}, - metadata: {}, - tables: tables.map( - (t): KtxSchemaTable => ({ - catalog: null, - db: t.db, - name: t.name, - kind: 'table', - comment: null, - estimatedRows: null, - columns: [], - foreignKeys: [], - }), - ), - }; - } - - it('keeps only enabled tables', () => { - const snapshot = makeSnapshot([ - { db: 'public', name: 'users' }, - { db: 'public', name: 'orders' }, - { db: 'public', name: 'logs' }, - ]); - const enabled = new Set(['public.users', 'public.orders']); - const filtered = filterSnapshotTables(snapshot, enabled); - expect(filtered.tables).toHaveLength(2); - expect(filtered.tables.map((t) => t.name)).toEqual(['users', 'orders']); - }); - - it('returns empty tables when none match', () => { - const snapshot = makeSnapshot([{ db: 'public', name: 'users' }]); - const enabled = new Set(['public.orders']); - const filtered = filterSnapshotTables(snapshot, enabled); - expect(filtered.tables).toHaveLength(0); - }); - - it('preserves other snapshot fields', () => { - const snapshot = makeSnapshot([{ db: 'public', name: 'users' }]); - const enabled = new Set(['public.users']); - const filtered = filterSnapshotTables(snapshot, enabled); - expect(filtered.connectionId).toBe('test'); - expect(filtered.driver).toBe('postgres'); - }); -}); diff --git a/packages/cli/src/context/scan/local-scan.ts b/packages/cli/src/context/scan/local-scan.ts index 35333f79..cb886991 100644 --- a/packages/cli/src/context/scan/local-scan.ts +++ b/packages/cli/src/context/scan/local-scan.ts @@ -10,7 +10,7 @@ import type { KtxProjectLlmConfig, KtxScanEnrichmentConfig, KtxScanRelationshipC import type { KtxLocalProject } from '../../context/project/project.js'; import { ktxLocalStateDbPath } from '../project/local-state-db.js'; import { redactKtxScanReport } from './credentials.js'; -import { filterSnapshotTables, resolveEnabledTables } from './enabled-tables.js'; +import { resolveEnabledTables } from './enabled-tables.js'; import { completedKtxScanEnrichmentStateSummary } from './enrichment-state.js'; import { failedKtxScanEnrichmentSummary, ktxScanErrorMessage } from './enrichment-summary.js'; import { @@ -25,9 +25,7 @@ import type { KtxConnectionDriver, KtxProgressPort, KtxScanConnector, - KtxScanContext, KtxScanEnrichmentStateSummary, - KtxScanInput, KtxScanMode, KtxScanReport, KtxScanTrigger, @@ -370,17 +368,6 @@ async function readScanReport( } } -function createFilteredConnector(connector: KtxScanConnector, enabledTables: Set): KtxScanConnector { - return { - ...connector, - async introspect(input: KtxScanInput, ctx: KtxScanContext): Promise { - const snapshot = await connector.introspect(input, ctx); - return filterSnapshotTables(snapshot, enabledTables); - }, - }; -} - - function withInternalLiveDatabaseAdapter(project: KtxLocalProject): KtxLocalProject { if (project.config.ingest.adapters.includes(LIVE_DATABASE_ADAPTER)) { return project; @@ -402,14 +389,17 @@ export async function runLocalScan(options: RunLocalScanOptions): Promise { - const take = Math.min(remaining, ds[field]); - ds[field] -= take; - remaining -= take; - }; - subFrom('tablesAdded'); - subFrom('tablesUnchanged'); - subFrom('tablesModified'); - await options.progress?.update(0.6, scanChangeSummary(report.diffSummary)); - } + enrichmentSnapshot = rawSnapshot; const manifestArtifacts = await writeLocalScanManifestShards({ project: options.project, connectionId: options.connectionId, syncId: record.syncId, driver, - snapshot: structuralSnapshot, + snapshot: rawSnapshot, dryRun: false, }); report.artifactPaths.manifestShards = manifestArtifacts.manifestShards; @@ -515,6 +493,7 @@ export async function runLocalScan(options: RunLocalScanOptions): Promise { scaleExecutor.close(); } }); + + it('profiles tables concurrently up to profileConcurrency', async () => { + let inFlight = 0; + let maxInFlight = 0; + const executor = { + executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => { + inFlight += 1; + maxInFlight = Math.max(maxInFlight, inFlight); + await new Promise((resolve) => setTimeout(resolve, 10)); + inFlight -= 1; + return { + headers: [ + 'column_name', + 'table_row_count', + 'row_count', + 'null_count', + 'distinct_count', + 'min_text_length', + 'max_text_length', + 'sample_values', + ], + rows: [[input.sql.includes('accounts') ? 'id' : 'account_id', 2, 2, 0, 2, 1, 2, '1\u001f2']], + totalRows: 1, + rowCount: 1, + }; + }), + }; + + await profileKtxRelationshipSchema({ + connectionId: 'warehouse', + driver: 'sqlite', + schema: schemaWithTables(['accounts', 'orders', 'payments', 'refunds']), + executor, + ctx: { runId: 'profile-concurrency' }, + profileConcurrency: 4, + }); + + expect(maxInFlight).toBe(4); + }); + + it('keeps profiling other tables when one table profile fails', async () => { + const executor = { + executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => { + if (input.sql.includes('"orders"')) { + throw new Error('orders unavailable'); + } + return { + headers: [ + 'column_name', + 'table_row_count', + 'row_count', + 'null_count', + 'distinct_count', + 'min_text_length', + 'max_text_length', + 'sample_values', + ], + rows: [['id', 2, 2, 0, 2, 1, 2, '1\u001f2']], + totalRows: 1, + rowCount: 1, + }; + }), + }; + + const result = await profileKtxRelationshipSchema({ + connectionId: 'warehouse', + driver: 'sqlite', + schema: schemaWithTables(['accounts', 'orders']), + executor, + ctx: { runId: 'profile-error-isolated' }, + profileConcurrency: 2, + }); + + expect(result.warnings).toContain('profile_failed:orders:orders unavailable'); + expect(result.tables).toHaveLength(2); + expect(Object.keys(result.columns)).toContain('accounts.id'); + }); }); + +function schemaWithTables(names: string[]): KtxEnrichedSchema { + return schema( + names.map((name) => + table(name, [ + column(name, name === 'orders' ? 'account_id' : 'id', { + nullable: false, + primaryKey: name !== 'orders', + }), + ]), + ), + ); +} diff --git a/packages/cli/src/context/scan/relationship-profiling.ts b/packages/cli/src/context/scan/relationship-profiling.ts index 2172ac24..1824d263 100644 --- a/packages/cli/src/context/scan/relationship-profiling.ts +++ b/packages/cli/src/context/scan/relationship-profiling.ts @@ -1,4 +1,5 @@ import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js'; +import { mapWithConcurrency } from './relationship-validation.js'; import type { KtxConnectionDriver, KtxQueryResult, @@ -60,6 +61,7 @@ export interface ProfileKtxRelationshipSchemaInput { ctx: KtxScanContext; sampleValuesPerColumn?: number; profileSampleRows?: number; + profileConcurrency?: number; cache?: KtxRelationshipProfileCache; } @@ -227,6 +229,9 @@ function sampleAggregateSql(driver: KtxConnectionDriver, innerSql: string): stri if (driver === 'clickhouse') { return `(SELECT arrayStringConcat(groupArray(toString(value)), '\\x1F') FROM (${innerSql}) AS relationship_profile_values)`; } + if (driver === 'snowflake') { + return `(SELECT LISTAGG(CAST(value AS VARCHAR), '\\x1f') FROM (${innerSql}) AS relationship_profile_values)`; + } return `(SELECT GROUP_CONCAT(CAST(value AS TEXT), char(31)) FROM (${innerSql}) AS relationship_profile_values)`; } @@ -386,6 +391,10 @@ async function queryTableProfile(input: { }; } +type TableProfileResult = + | { tableProfile: Awaited> } + | { cached: KtxRelationshipCachedTableProfile; queryCount: 0 }; + export async function profileKtxRelationshipSchema( input: ProfileKtxRelationshipSchemaInput, ): Promise { @@ -405,54 +414,68 @@ export async function profileKtxRelationshipSchema( const tables: KtxRelationshipTableProfile[] = []; const columns: Record = {}; const warnings: string[] = []; + const executor = input.executor; - for (const table of input.schema.tables.filter((candidate) => candidate.enabled)) { - const sampleValuesPerColumn = input.sampleValuesPerColumn ?? 5; - const profileSampleRows = input.profileSampleRows ?? 10000; - const cacheKey = tableProfileCacheKey({ - connectionId: input.connectionId, - driver: input.driver, - ctx: input.ctx, - table: table.ref, - sampleValuesPerColumn, - profileSampleRows, - }); - const cached = input.cache?.tableProfiles.get(cacheKey); - if (cached) { - tables.push(cached.table); - Object.assign(columns, cached.columns); - for (const warning of cached.warnings) { - warnings.push(warning); - } - continue; - } - - try { - const tableProfile = await queryTableProfile({ + const enabledTables = input.schema.tables.filter((candidate) => candidate.enabled); + const tableResults = await mapWithConcurrency( + enabledTables, + input.profileConcurrency ?? 4, + async (table) => { + const sampleValuesPerColumn = input.sampleValuesPerColumn ?? 5; + const profileSampleRows = input.profileSampleRows ?? 10000; + const cacheKey = tableProfileCacheKey({ connectionId: input.connectionId, driver: input.driver, - table, - executor: input.executor, ctx: input.ctx, + table: table.ref, sampleValuesPerColumn, profileSampleRows, }); - queryTotal += tableProfile.queryCount; - tables.push(tableProfile.table); - Object.assign(columns, tableProfile.columns); - input.cache?.tableProfiles.set(cacheKey, { - table: tableProfile.table, - columns: tableProfile.columns, - warnings: [], - }); - } catch (error) { - const failureWarning = `profile_failed:${table.ref.name}:${error instanceof Error ? error.message : String(error)}`; - warnings.push(failureWarning); - input.cache?.tableProfiles.set(cacheKey, { - table: { table: table.ref, rowCount: 0 }, - columns: {}, - warnings: [failureWarning], - }); + const cached = input.cache?.tableProfiles.get(cacheKey); + if (cached) { + return { cached, queryCount: 0 }; + } + + try { + const tableProfile = await queryTableProfile({ + connectionId: input.connectionId, + driver: input.driver, + table, + executor, + ctx: input.ctx, + sampleValuesPerColumn, + profileSampleRows, + }); + input.cache?.tableProfiles.set(cacheKey, { + table: tableProfile.table, + columns: tableProfile.columns, + warnings: [], + }); + return { tableProfile }; + } catch (error) { + const failureWarning = `profile_failed:${table.ref.name}:${error instanceof Error ? error.message : String(error)}`; + const cachedFailure = { + table: { table: table.ref, rowCount: 0 }, + columns: {}, + warnings: [failureWarning], + }; + input.cache?.tableProfiles.set(cacheKey, cachedFailure); + return { cached: cachedFailure, queryCount: 0 }; + } + }, + ); + + for (const result of tableResults) { + if ('tableProfile' in result) { + queryTotal += result.tableProfile.queryCount; + tables.push(result.tableProfile.table); + Object.assign(columns, result.tableProfile.columns); + continue; + } + tables.push(result.cached.table); + Object.assign(columns, result.cached.columns); + for (const warning of result.cached.warnings) { + warnings.push(warning); } } diff --git a/packages/cli/src/context/scan/relationship-validation.ts b/packages/cli/src/context/scan/relationship-validation.ts index 63d7328a..685d1ea9 100644 --- a/packages/cli/src/context/scan/relationship-validation.ts +++ b/packages/cli/src/context/scan/relationship-validation.ts @@ -193,7 +193,7 @@ function statusFor(input: { return 'rejected'; } -async function mapWithConcurrency( +export async function mapWithConcurrency( inputs: readonly TInput[], concurrency: number, mapOne: (input: TInput) => Promise, diff --git a/packages/cli/src/context/scan/table-ref.test.ts b/packages/cli/src/context/scan/table-ref.test.ts new file mode 100644 index 00000000..eb52ac9b --- /dev/null +++ b/packages/cli/src/context/scan/table-ref.test.ts @@ -0,0 +1,67 @@ +import { describe, expect, it } from 'vitest'; +import { + scopedTableNames, + tableRefFromKey, + tableRefKey, + tableRefSet, + type KtxTableRefKey, +} from './table-ref.js'; + +describe('tableRefKey roundtrip', () => { + it('encodes and decodes a three-part ref', () => { + const ref = { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' }; + expect(tableRefFromKey(tableRefKey(ref))).toEqual(ref); + }); + + it('treats null catalog/db as the empty segment', () => { + const ref = { catalog: null, db: 'public', name: 'users' }; + expect(tableRefFromKey(tableRefKey(ref))).toEqual(ref); + }); + + it('roundtrips a bare-name ref', () => { + const ref = { catalog: null, db: null, name: 'orders' }; + expect(tableRefFromKey(tableRefKey(ref))).toEqual(ref); + }); +}); + +describe('tableRefSet', () => { + it('produces a set with member-equality on canonical keys', () => { + const scope = tableRefSet([ + { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' }, + { catalog: 'ANALYTICS', db: 'MARTS', name: 'ITEMS' }, + ]); + expect(scope.size).toBe(2); + expect(scope.has(tableRefKey({ catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' }))).toBe(true); + expect(scope.has(tableRefKey({ catalog: 'ANALYTICS', db: 'MARTS', name: 'OTHER' }))).toBe(false); + }); +}); + +describe('scopedTableNames', () => { + it('projects to the requested (catalog, db) namespace', () => { + const scope = tableRefSet([ + { catalog: 'ANALYTICS', db: 'MARTS', name: 'LISTINGS' }, + { catalog: 'ANALYTICS', db: 'MARTS', name: 'ITEMS' }, + { catalog: 'ANALYTICS', db: 'STAGING', name: 'LISTINGS' }, + ]); + expect(scopedTableNames(scope, { catalog: 'ANALYTICS', db: 'MARTS' }).sort()).toEqual(['ITEMS', 'LISTINGS']); + expect(scopedTableNames(scope, { catalog: 'ANALYTICS', db: 'STAGING' })).toEqual(['LISTINGS']); + }); + + it('treats null in the scope entry as a wildcard for that segment', () => { + const scope = tableRefSet([{ catalog: null, db: 'public', name: 'users' }]); + expect(scopedTableNames(scope, { catalog: 'any-catalog', db: 'public' })).toEqual(['users']); + }); + + it('returns empty when no scope entry matches the namespace', () => { + const scope = tableRefSet([{ catalog: 'A', db: 'B', name: 'C' }]); + expect(scopedTableNames(scope, { catalog: 'X', db: 'Y' })).toEqual([]); + }); + + it('dedupes when the same name appears under different catalog projections', () => { + const scope: ReadonlySet = tableRefSet([ + { catalog: null, db: 'public', name: 'users' }, + { catalog: 'A', db: 'public', name: 'users' }, + ]); + expect(scopedTableNames(scope, { catalog: 'A', db: 'public' })).toEqual(['users']); + }); +}); diff --git a/packages/cli/src/context/scan/table-ref.ts b/packages/cli/src/context/scan/table-ref.ts new file mode 100644 index 00000000..1a2abd70 --- /dev/null +++ b/packages/cli/src/context/scan/table-ref.ts @@ -0,0 +1,53 @@ +import type { KtxTableRef } from './types.js'; + +/** + * Branded canonical string representation of a {@link KtxTableRef}. + * + * Connectors compare scopes for set membership via these keys instead of the + * raw object (JS `Set` uses identity equality, which would be useless + * here). Build a key with {@link tableRefKey} and decode with + * {@link tableRefFromKey}. + */ +export type KtxTableRefKey = string & { readonly __brand: 'KtxTableRefKey' }; + +const SEPARATOR = '\x1f'; + +/** @internal */ +export function tableRefKey(ref: KtxTableRef): KtxTableRefKey { + return `${ref.catalog ?? ''}${SEPARATOR}${ref.db ?? ''}${SEPARATOR}${ref.name}` as KtxTableRefKey; +} + +/** @internal */ +export function tableRefFromKey(key: KtxTableRefKey): KtxTableRef { + const [catalog = '', db = '', name = ''] = key.split(SEPARATOR); + return { + catalog: catalog.length > 0 ? catalog : null, + db: db.length > 0 ? db : null, + name, + }; +} + +export function tableRefSet(refs: readonly KtxTableRef[]): ReadonlySet { + return new Set(refs.map(tableRefKey)); +} + +/** + * Return the bare table names from a scope that fall within the given + * (catalog, db) namespace. `catalog: null` is treated as a wildcard so that + * legacy 2-part `"db.name"` entries continue to match. Same for `db: null`. + */ +export function scopedTableNames( + scope: ReadonlySet, + namespace: { catalog?: string | null; db?: string | null }, +): string[] { + const names = new Set(); + const wantCatalog = namespace.catalog ?? null; + const wantDb = namespace.db ?? null; + for (const key of scope) { + const ref = tableRefFromKey(key); + if (wantCatalog !== null && ref.catalog !== null && ref.catalog !== wantCatalog) continue; + if (wantDb !== null && ref.db !== null && ref.db !== wantDb) continue; + names.add(ref.name); + } + return [...names]; +} diff --git a/packages/cli/src/context/scan/types.ts b/packages/cli/src/context/scan/types.ts index f4299e86..95e6b590 100644 --- a/packages/cli/src/context/scan/types.ts +++ b/packages/cli/src/context/scan/types.ts @@ -1,3 +1,5 @@ +import type { KtxTableRefKey } from './table-ref.js'; + export type KtxConnectionDriver = | 'sqlite' | 'postgres' @@ -137,6 +139,14 @@ export interface KtxScanInput { connectionId: string; driver: KtxConnectionDriver; scope?: KtxSchemaScope; + /** + * Restricts introspection to a specific set of fully-qualified tables. + * `undefined` means "all tables within {@link scope}". Connectors that honor + * this field should push the filter into their metadata queries. Callers do + * not post-filter, so a connector that ignores `tableScope` will over-fetch + * and surface the extra tables in output. + */ + tableScope?: ReadonlySet; mode?: KtxScanMode; dryRun?: boolean; detectRelationships?: boolean; diff --git a/packages/cli/src/local-adapters.ts b/packages/cli/src/local-adapters.ts index 88ee9880..cfc57adc 100644 --- a/packages/cli/src/local-adapters.ts +++ b/packages/cli/src/local-adapters.ts @@ -12,10 +12,14 @@ import { isKtxSqliteConnectionConfig } from './connectors/sqlite/connector.js'; import { createSqlServerLiveDatabaseIntrospection } from './connectors/sqlserver/live-database-introspection.js'; import { isKtxSqlServerConnectionConfig } from './connectors/sqlserver/connector.js'; import { BigQueryHistoricSqlQueryHistoryReader } from './context/ingest/adapters/historic-sql/bigquery-query-history-reader.js'; +import { queryHistoryDialectForConnection } from './context/ingest/adapters/historic-sql/connection-dialect.js'; import { createDaemonLiveDatabaseIntrospection } from './context/ingest/adapters/live-database/daemon-introspection.js'; import { createDefaultLocalIngestAdapters, type DefaultLocalIngestAdaptersOptions } from './context/ingest/local-adapters.js'; import type { HistoricSqlReader } from './context/ingest/adapters/historic-sql/types.js'; -import type { LiveDatabaseIntrospectionPort } from './context/ingest/adapters/live-database/types.js'; +import type { + LiveDatabaseIntrospectionOptions, + LiveDatabaseIntrospectionPort, +} from './context/ingest/adapters/live-database/types.js'; import { LiveDatabaseSourceAdapter } from './context/ingest/adapters/live-database/live-database.adapter.js'; import { PostgresPgssReader } from './context/ingest/adapters/historic-sql/postgres-pgss-reader.js'; import { SnowflakeHistoricSqlQueryHistoryReader } from './context/ingest/adapters/historic-sql/snowflake-query-history-reader.js'; @@ -116,38 +120,39 @@ function createKtxCliLiveDatabaseIntrospection( connections: project.config.connections, }); return { - async extractSchema(connectionId: string) { + async extractSchema(connectionId: string, options?: LiveDatabaseIntrospectionOptions) { const connection = project.config.connections[connectionId]; if (isKtxPostgresConnectionConfig(connection)) { - return postgres.extractSchema(connectionId); + return postgres.extractSchema(connectionId, options); } if (isKtxSqliteConnectionConfig(connection)) { - return sqlite.extractSchema(connectionId); + return sqlite.extractSchema(connectionId, options); } if (isKtxMysqlConnectionConfig(connection)) { - return mysql.extractSchema(connectionId); + return mysql.extractSchema(connectionId, options); } if (isKtxClickHouseConnectionConfig(connection)) { - return clickhouse.extractSchema(connectionId); + return clickhouse.extractSchema(connectionId, options); } if (isKtxSqlServerConnectionConfig(connection)) { - return sqlserver.extractSchema(connectionId); + return sqlserver.extractSchema(connectionId, options); } if (isKtxBigQueryConnectionConfig(connection)) { - return bigquery.extractSchema(connectionId); + return bigquery.extractSchema(connectionId, options); } if (hasSnowflakeDriver(connection)) { const { createSnowflakeLiveDatabaseIntrospection } = await import('./connectors/snowflake/live-database-introspection.js'); const { isKtxSnowflakeConnectionConfig } = await import('./connectors/snowflake/connector.js');; if (!isKtxSnowflakeConnectionConfig(connection)) { - return daemon.extractSchema(connectionId); + return daemon.extractSchema(connectionId, options); } const snowflake = createSnowflakeLiveDatabaseIntrospection({ connections: project.config.connections, + projectDir: project.projectDir, }); - return snowflake.extractSchema(connectionId); + return snowflake.extractSchema(connectionId, options); } - return daemon.extractSchema(connectionId); + return daemon.extractSchema(connectionId, options); }, }; } @@ -160,47 +165,6 @@ export interface KtxCliLocalIngestAdaptersOptions extends DefaultLocalIngestAdap logger?: KtxOperationalLogger; } -function historicSqlRecord(connection: unknown): Record | null { - if ( - connection && - typeof connection === 'object' && - 'historicSql' in connection && - typeof (connection as { historicSql?: unknown }).historicSql === 'object' && - (connection as { historicSql?: unknown }).historicSql !== null && - !Array.isArray((connection as { historicSql?: unknown }).historicSql) - ) { - return (connection as { historicSql: Record }).historicSql; - } - return null; -} - -function enabledHistoricSqlDialect(connection: unknown): 'postgres' | 'bigquery' | 'snowflake' | null { - const direct = historicSqlRecord(connection); - const context = - connection && typeof connection === 'object' && !Array.isArray(connection) - ? (connection as { context?: unknown }).context - : null; - const queryHistory = - context && typeof context === 'object' && !Array.isArray(context) - ? (context as { queryHistory?: unknown }).queryHistory - : null; - const enabled = - queryHistory && typeof queryHistory === 'object' && !Array.isArray(queryHistory) - ? (queryHistory as { enabled?: unknown }).enabled === true - : direct?.enabled === true; - if (!enabled) { - return null; - } - const driver = String((connection as { driver?: unknown })?.driver ?? '').toLowerCase(); - if (driver === 'postgres' || driver === 'postgresql') return 'postgres'; - if (driver === 'bigquery') return 'bigquery'; - if (driver === 'snowflake') return 'snowflake'; - const legacyDialect = String(direct?.dialect ?? '').toLowerCase(); - return legacyDialect === 'postgres' || legacyDialect === 'bigquery' || legacyDialect === 'snowflake' - ? legacyDialect - : null; -} - function createEphemeralPostgresHistoricSqlClient(project: KtxLocalProject, connectionId: string) { const connection = project.config.connections[connectionId] as KtxPostgresConnectionConfig | undefined; const inputDriver = connection?.driver ?? 'unknown'; @@ -263,6 +227,7 @@ async function createEphemeralSnowflakeHistoricSqlClient( const connector = new connectorModule.KtxSnowflakeScanConnector({ connectionId, connection, + projectDir: project.projectDir, }); try { const result = await connector.executeReadOnly({ connectionId, sql: query }, {} as never); @@ -303,7 +268,7 @@ function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCli return undefined; } const connection = project.config.connections[connectionId]; - const dialect = enabledHistoricSqlDialect(connection); + const dialect = queryHistoryDialectForConnection(connection); if (!dialect) { return undefined; } diff --git a/packages/cli/src/local-scan-connectors.ts b/packages/cli/src/local-scan-connectors.ts index 10b2dd05..4f763be5 100644 --- a/packages/cli/src/local-scan-connectors.ts +++ b/packages/cli/src/local-scan-connectors.ts @@ -64,7 +64,7 @@ export async function createKtxCliScanConnector( if (!isKtxSnowflakeConnectionConfig(connection)) { throw invalidConnectionConfigError(connectionId, driver); } - return new KtxSnowflakeScanConnector({ connectionId, connection }); + return new KtxSnowflakeScanConnector({ connectionId, connection, projectDir: project.projectDir }); } throw new Error( `Connection "${connectionId}" uses driver "${driver}", which has no native standalone KTX scan connector. Supported drivers: ${SUPPORTED_DRIVERS}.`, diff --git a/packages/cli/src/public-ingest.test.ts b/packages/cli/src/public-ingest.test.ts index 1f5dd67d..7c400752 100644 --- a/packages/cli/src/public-ingest.test.ts +++ b/packages/cli/src/public-ingest.test.ts @@ -942,7 +942,7 @@ describe('runKtxPublicIngest', () => { expect(io.stdout()).not.toContain('Debug:'); }); - it('prints query-history retry guidance for query-history facet failures', async () => { + it('skips the query-history facet but keeps the target green when query-history fails', async () => { const io = makeIo(); const project = deepReadyProject({ warehouse: { driver: 'postgres', context: { depth: 'deep' } }, @@ -969,11 +969,13 @@ describe('runKtxPublicIngest', () => { io.io, { loadProject: vi.fn(async () => project), runScan, runIngest }, ), - ).resolves.toBe(1); + ).resolves.toBe(0); - expect(io.stdout()).toMatch(/warehouse\s+done\s+failed\s+skipped\s+skipped/); + expect(io.stdout()).toContain('Ingest finished with skipped query history'); + expect(io.stdout()).toMatch(/warehouse\s+done\s+skipped\s+skipped\s+skipped/); + expect(io.stdout()).toContain('Skipped query history:'); expect(io.stdout()).toContain( - 'warehouse failed: Query history failed for 60 tasks. First failure: Google Cloud authentication failed while analyzing query history', + 'Query history failed for 60 tasks. First failure: Google Cloud authentication failed while analyzing query history', ); expect(io.stdout()).not.toContain('warehouse failed: Error:'); expect(io.stdout()).toContain('Retry: ktx ingest warehouse --project-dir /tmp/project --deep --query-history'); @@ -1007,8 +1009,9 @@ describe('runKtxPublicIngest', () => { io.io, { loadProject: vi.fn(async () => project), runScan, runIngest }, ), - ).resolves.toBe(1); + ).resolves.toBe(0); + expect(io.stdout()).toContain('Ingest finished with skipped query history'); expect(io.stdout()).toContain('Missing bundled Python runtime manifest'); expect(io.stdout()).toContain( 'In a source checkout, build the local runtime assets with: pnpm run artifacts:build', diff --git a/packages/cli/src/public-ingest.ts b/packages/cli/src/public-ingest.ts index 498edb3a..ce6c6344 100644 --- a/packages/cli/src/public-ingest.ts +++ b/packages/cli/src/public-ingest.ts @@ -601,10 +601,47 @@ function markTargetResult( }; } +function markTargetWithSkippedQueryHistory( + target: KtxPublicIngestPlanTarget, + args: Extract, + detail: string, +): KtxPublicIngestTargetResult { + const baseline = markTargetResult(target, args, 'done'); + return { + ...baseline, + steps: baseline.steps.map((step) => + step.operation === 'query-history' ? { ...step, status: 'skipped', detail } : step, + ), + }; +} + +function queryHistoryFailureDetail(input: { + target: KtxPublicIngestPlanTarget; + args: Extract; + capturedOutput?: string; +}): string { + const captured = capturedFailureMessage(input.capturedOutput ?? ''); + return failureDetailWithRetry({ + target: input.target, + args: input.args, + failedOperation: 'query-history', + failureDetail: captured, + }); +} + function resultFailed(result: KtxPublicIngestTargetResult): boolean { return result.steps.some((step) => step.status === 'failed'); } +function resultSkippedQueryHistory( + result: KtxPublicIngestTargetResult, +): { connectionId: string; detail: string } | null { + const skipped = result.steps.find( + (step) => step.operation === 'query-history' && step.status === 'skipped' && step.detail !== undefined, + ); + return skipped?.detail ? { connectionId: result.connectionId, detail: skipped.detail } : null; +} + function rowsBucket(): '<10k' | '<100k' | '<1M' | '<10M' | '>=10M' { return '<10k'; } @@ -644,7 +681,17 @@ function stepStatus(result: KtxPublicIngestTargetResult, operation: KtxPublicIng function renderPlainResults(results: KtxPublicIngestTargetResult[], io: KtxCliIo): void { const failures = results.filter(resultFailed); - io.stdout.write(failures.length > 0 ? 'Ingest finished with partial failures\n' : 'Ingest finished\n'); + const skippedQueryHistory = results.map(resultSkippedQueryHistory).filter((entry) => entry !== null) as Array<{ + connectionId: string; + detail: string; + }>; + const headerSuffix = + failures.length > 0 + ? ' with partial failures' + : skippedQueryHistory.length > 0 + ? ' with skipped query history' + : ''; + io.stdout.write(`Ingest finished${headerSuffix}\n`); io.stdout.write('\n'); io.stdout.write('Source Database schema Query history Source ingest Memory update\n'); for (const result of results) { @@ -659,17 +706,22 @@ function renderPlainResults(results: KtxPublicIngestTargetResult[], io: KtxCliIo ); } - if (failures.length === 0) { - return; + if (failures.length > 0) { + io.stdout.write('\nFailed sources:\n'); + for (const result of failures) { + const failedStep = result.steps.find((step) => step.status === 'failed'); + if (!failedStep) { + continue; + } + io.stdout.write(` ${failedStep.detail ?? `${result.connectionId} failed.`}\n`); + } } - io.stdout.write('\nFailed sources:\n'); - for (const result of failures) { - const failedStep = result.steps.find((step) => step.status === 'failed'); - if (!failedStep) { - continue; + if (skippedQueryHistory.length > 0) { + io.stdout.write('\nSkipped query history:\n'); + for (const { detail } of skippedQueryHistory) { + io.stdout.write(` ${detail}\n`); } - io.stdout.write(` ${failedStep.detail ?? `${result.connectionId} failed.`}\n`); } } @@ -849,14 +901,13 @@ export async function executePublicIngestTarget( ? await runIngest(ingestArgs, ingestIo, ingestDeps) : await runIngest(ingestArgs, ingestIo); if (qhExitCode !== 0) { - deps.onPhaseEnd?.('query-history', 'failed'); - return markTargetResult( + const detail = queryHistoryFailureDetail({ target, args, - 'failed', - 'query-history', - capturedIngestIo ? capturedFailureMessage(capturedIngestIo.capturedOutput()) : undefined, - ); + capturedOutput: capturedIngestIo ? capturedIngestIo.capturedOutput() : undefined, + }); + deps.onPhaseEnd?.('query-history', 'failed', detail); + return markTargetWithSkippedQueryHistory(target, args, detail); } deps.onPhaseEnd?.('query-history', 'done'); } diff --git a/packages/cli/src/scan.test.ts b/packages/cli/src/scan.test.ts index 16cfdbd3..5ec745e6 100644 --- a/packages/cli/src/scan.test.ts +++ b/packages/cli/src/scan.test.ts @@ -96,14 +96,17 @@ const createSnowflakeLiveDatabaseIntrospection = vi.hoisted(() => const isKtxSnowflakeConnectionConfig = vi.hoisted(() => vi.fn((connection: { driver?: string } | undefined) => connection?.driver === 'snowflake'), ); +const snowflakeConnectorInstances = vi.hoisted(() => [] as Array<{ cleanup: ReturnType }>); const KtxSnowflakeScanConnector = vi.hoisted( () => class { readonly id: string; readonly driver = 'snowflake'; + readonly cleanup = vi.fn(async () => undefined); constructor(options: { connectionId: string }) { this.id = `snowflake:${options.connectionId}`; + snowflakeConnectorInstances.push(this); } }, ); @@ -1047,6 +1050,95 @@ describe('runKtxScan', () => { await rm(tempProject, { recursive: true, force: true }); }); + it('cleans up a constructed scan connector after an enriched scan succeeds', async () => { + await initKtxProject({ projectDir: tempDir }); + await writeFile( + join(tempDir, 'ktx.yaml'), + [ + 'connections:', + ' warehouse:', + ' driver: snowflake', + ' account: acct', + ' warehouse: WH', + ' database: ANALYTICS', + ' schema_name: PUBLIC', + ' username: reader', + ' password: env:SNOWFLAKE_PASSWORD', + '', + ].join('\n'), + 'utf-8', + ); + snowflakeConnectorInstances.length = 0; + const runLocalScan = vi.fn(async (): Promise => ({ + runId: 'scan-run-cleanup', + status: 'done', + done: true, + connectionId: 'warehouse', + mode: 'enriched', + dryRun: false, + syncId: 'sync-1', + report: { ...report, mode: 'enriched' }, + })); + + await expect( + runKtxScan( + { + command: 'run', + projectDir: tempDir, + connectionId: 'warehouse', + mode: 'enriched', + detectRelationships: false, + dryRun: false, + }, + makeIo().io, + { runLocalScan, createLocalIngestAdapters: noLocalIngestAdapters }, + ), + ).resolves.toBe(0); + + expect(snowflakeConnectorInstances[0]?.cleanup).toHaveBeenCalledTimes(1); + }); + + it('cleans up a constructed scan connector after runLocalScan throws', async () => { + await initKtxProject({ projectDir: tempDir }); + await writeFile( + join(tempDir, 'ktx.yaml'), + [ + 'connections:', + ' warehouse:', + ' driver: snowflake', + ' account: acct', + ' warehouse: WH', + ' database: ANALYTICS', + ' schema_name: PUBLIC', + ' username: reader', + ' password: env:SNOWFLAKE_PASSWORD', + '', + ].join('\n'), + 'utf-8', + ); + snowflakeConnectorInstances.length = 0; + const runLocalScan = vi.fn(async () => { + throw new Error('scan failed'); + }); + + await expect( + runKtxScan( + { + command: 'run', + projectDir: tempDir, + connectionId: 'warehouse', + mode: 'relationships', + detectRelationships: true, + dryRun: false, + }, + makeIo().io, + { runLocalScan, createLocalIngestAdapters: noLocalIngestAdapters }, + ), + ).resolves.toBe(1); + + expect(snowflakeConnectorInstances[0]?.cleanup).toHaveBeenCalledTimes(1); + }); + it('routes standalone postgres scans through the native connector before daemon fallback', async () => { const tempProject = await mkdtemp(join(tmpdir(), 'ktx-scan-cli-native-postgres-')); await initKtxProject({ projectDir: tempProject }); diff --git a/packages/cli/src/scan.ts b/packages/cli/src/scan.ts index f40da497..94b80f65 100644 --- a/packages/cli/src/scan.ts +++ b/packages/cli/src/scan.ts @@ -375,6 +375,7 @@ export async function runKtxScan(args: KtxScanArgs, io: KtxCliIo = process, deps writeRunSummary(result.report, args.projectDir, io); } finally { cliProgress?.flush(); + await connector?.cleanup?.(); } return 0; } catch (error) { diff --git a/packages/cli/src/setup-databases.test.ts b/packages/cli/src/setup-databases.test.ts index c401dc51..50a1c6ed 100644 --- a/packages/cli/src/setup-databases.test.ts +++ b/packages/cli/src/setup-databases.test.ts @@ -545,8 +545,8 @@ describe('setup databases step', () => { }, { driver: 'snowflake', - selectValues: ['no'], - textValues: ['', 'env:SNOWFLAKE_ACCOUNT', 'ANALYTICS_WH', 'ANALYTICS', '', 'env:SNOWFLAKE_USER', ''], + selectValues: ['password', 'no'], + textValues: ['', 'env:SNOWFLAKE_ACCOUNT', 'ANALYTICS_WH', 'ANALYTICS', 'env:SNOWFLAKE_USER', ''], passwordValues: ['env:SNOWFLAKE_PASSWORD'], expectedTextPrompts: [ { @@ -563,11 +563,6 @@ describe('setup databases step', () => { { message: 'Snowflake database name', }, - { - message: 'Snowflake schema\nPress Enter for PUBLIC, or enter a schema name.', - placeholder: 'PUBLIC', - initialValue: 'PUBLIC', - }, { message: 'Snowflake username', }, @@ -602,6 +597,8 @@ describe('setup databases step', () => { prompts, testConnection: vi.fn(async () => 0), scanConnection: vi.fn(async () => 0), + listSchemas: vi.fn(async () => []), + listTables: vi.fn(async () => []), }, ); @@ -775,6 +772,8 @@ describe('setup databases step', () => { }); const testConnection = vi.fn(async () => 0); const scanConnection = vi.fn(async () => 0); + const listSchemas = vi.fn(async () => []); + const listTables = vi.fn(async () => []); const result = await runKtxSetupDatabasesStep( { @@ -785,7 +784,7 @@ describe('setup databases step', () => { disableQueryHistory: true, }, makeIo().io, - { prompts, testConnection, scanConnection }, + { prompts, testConnection, scanConnection, listSchemas, listTables }, ); expect(result).toEqual({ @@ -1692,6 +1691,62 @@ describe('setup databases step', () => { expect(io.stdout()).toContain('✓ orbit_analytics, orbit_raw'); }); + it('falls back to comma-separated free-text when listSchemas fails interactively', async () => { + const io = makeIo(); + const prompts = makePromptAdapter({ + selectValues: ['url'], + textValues: ['', 'env:DATABASE_URL', 'orbit_analytics, orbit_raw'], + }); + const testConnection = vi.fn(async () => 0); + const scanConnection = vi.fn(async () => 0); + const listSchemas = vi.fn(async () => { + throw new Error('permission denied to list schemas'); + }); + const listTables = vi.fn(async (_projectDir: string, _connectionId: string, schemas?: string[]) => + (schemas ?? []).map((schema) => ({ schema, name: 'events', kind: 'table' as const })), + ); + const pickers = makePickerStubs({ + scopes: [ + { + schemas: ['orbit_analytics', 'orbit_raw'], + tables: ['orbit_analytics.events', 'orbit_raw.events'], + }, + ], + }); + + const result = await runKtxSetupDatabasesStep( + { + projectDir: tempDir, + inputMode: 'auto', + databaseDrivers: ['postgres'], + databaseSchemas: [], + skipDatabases: false, + }, + io.io, + { + prompts, + testConnection, + scanConnection, + listSchemas, + listTables, + pickDatabaseScope: pickers.pickDatabaseScope, + }, + ); + + expect(result.status).toBe('ready'); + expect(io.stderr()).toContain('Could not discover postgresql schemas'); + expect(vi.mocked(prompts.text).mock.calls.map(([options]) => options.message)).toContain( + textInputPrompt( + 'Enter schemas for postgres-warehouse as a comma-separated list (e.g. SALES, MARKETING).', + ), + ); + expect(pickers.scopeCalls[0]).toMatchObject({ + schemas: ['orbit_analytics', 'orbit_raw'], + initialSchemas: ['orbit_analytics', 'orbit_raw'], + schemaSuggestion: { suggested: new Set(['orbit_analytics', 'orbit_raw']) }, + }); + }); + it('passes schemas and a lazy table callback to the scope picker instead of eager table discovery', async () => { const listSchemas = vi.fn(async () => ['analytics', 'raw']); const listTables = vi.fn(async (_projectDir: string, _connectionId: string, schemas?: string[]) => @@ -2015,6 +2070,7 @@ describe('setup databases step', () => { it('writes query history config for supported Snowflake databases after validation succeeds', async () => { const io = makeIo(); + const historicSqlProbe = vi.fn(async () => ({ ok: true, lines: [] })); const result = await runKtxSetupDatabasesStep( { projectDir: tempDir, @@ -2032,12 +2088,21 @@ describe('setup databases step', () => { { testConnection: vi.fn(async () => 0), scanConnection: vi.fn(async () => 0), + historicSqlProbe, prompts: makePromptAdapter({ - textValues: ['env:SNOWFLAKE_ACCOUNT', 'WH', 'ANALYTICS', 'PUBLIC', 'reader', ''], + selectValues: ['password'], + textValues: ['env:SNOWFLAKE_ACCOUNT', 'WH', 'ANALYTICS', 'reader', ''], passwordValues: ['env:SNOWFLAKE_PASSWORD'], }), }, ); + expect(historicSqlProbe).toHaveBeenCalledWith( + expect.objectContaining({ + projectDir: tempDir, + connectionId: 'snowflake', + dialect: 'snowflake', + }), + ); expect(result.status).toBe('ready'); const configText = await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'); @@ -2067,6 +2132,51 @@ describe('setup databases step', () => { expect(config.ingest.adapters).toEqual([]); }); + it('configures Snowflake with RSA key-pair auth via setup wizard', async () => { + const io = makeIo(); + const result = await runKtxSetupDatabasesStep( + { + projectDir: tempDir, + inputMode: 'disabled', + databaseDrivers: ['snowflake'], + databaseConnectionId: 'snowflake', + databaseSchemas: [], + skipDatabases: false, + }, + io.io, + { + testConnection: vi.fn(async () => 0), + scanConnection: vi.fn(async () => 0), + prompts: makePromptAdapter({ + selectValues: ['rsa'], + textValues: [ + 'env:SNOWFLAKE_ACCOUNT', + 'WH', + 'ANALYTICS', + 'reader', + '~/.ssh/snowflake_rsa_key.p8', + '', + ], + passwordValues: ['env:SNOWFLAKE_KEY_PASS'], + }), + }, + ); + + expect(result.status).toBe('ready'); + const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')); + expect(config.connections.snowflake).toMatchObject({ + driver: 'snowflake', + authMethod: 'rsa', + account: 'env:SNOWFLAKE_ACCOUNT', + warehouse: 'WH', + database: 'ANALYTICS', + username: 'reader', + privateKey: 'file:~/.ssh/snowflake_rsa_key.p8', // pragma: allowlist secret + passphrase: 'env:SNOWFLAKE_KEY_PASS', // pragma: allowlist secret + }); + expect(config.connections.snowflake.password).toBeUndefined(); + }); + it('writes Postgres query history config with minExecutions and ignores window/redaction output', async () => { const io = makeIo(); const result = await runKtxSetupDatabasesStep( @@ -2427,7 +2537,53 @@ describe('setup databases step', () => { expect(io.stdout()).toContain('Query history probe...'); expect(io.stdout()).not.toContain('Historic SQL probe...'); expect(io.stdout()).toContain('pg_stat_statements extension is not installed'); - expect(io.stdout()).toContain('Setup written; first ingest run will fail until fixed.'); + expect(io.stdout()).toContain('Setup written; query history will be skipped until fixed.'); + }); + + it('prints a non-blocking Snowflake query history probe failure with the grants remediation', async () => { + const io = makeIo(); + const historicSqlProbe = vi.fn(async () => ({ + ok: false, + lines: [ + ' FAIL Snowflake role cannot read SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY', + ' Fix: Run (as ACCOUNTADMIN): GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE ;', + ], + })); + + const result = await runKtxSetupDatabasesStep( + { + projectDir: tempDir, + inputMode: 'disabled', + databaseDrivers: ['snowflake'], + databaseConnectionId: 'warehouse', + databaseSchemas: [], + enableQueryHistory: true, + skipDatabases: false, + }, + io.io, + { + testConnection: vi.fn(async () => 0), + scanConnection: vi.fn(async () => 0), + historicSqlProbe, + prompts: makePromptAdapter({ + textValues: ['env:SNOWFLAKE_ACCOUNT', 'WH', 'ANALYTICS', 'reader', ''], + passwordValues: ['env:SNOWFLAKE_PASSWORD'], + }), + }, + ); + + expect(result.status).toBe('ready'); + expect(historicSqlProbe).toHaveBeenCalledWith( + expect.objectContaining({ + projectDir: tempDir, + connectionId: 'warehouse', + dialect: 'snowflake', + }), + ); + expect(io.stdout()).toContain('Query history probe...'); + expect(io.stdout()).toContain('Snowflake role cannot read SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY'); + expect(io.stdout()).toContain('GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE'); + expect(io.stdout()).toContain('Setup written; query history will be skipped until fixed.'); }); it('does not run the query history probe when the regular connection test fails', async () => { diff --git a/packages/cli/src/setup-databases.ts b/packages/cli/src/setup-databases.ts index 1c21f9fe..30d4fa20 100644 --- a/packages/cli/src/setup-databases.ts +++ b/packages/cli/src/setup-databases.ts @@ -343,6 +343,13 @@ function historicSqlProbeFailureLines(error: unknown): string[] { ]; } if (error instanceof Error && error.name === 'HistoricSqlGrantsMissingError') { + const dialect = (error as { dialect?: unknown }).dialect; + if (dialect === 'snowflake') { + return [ + ' FAIL Snowflake role cannot read SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY', + ' Fix: Run (as ACCOUNTADMIN): GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE ;', + ]; + } return [ ' FAIL Postgres connection role lacks pg_read_all_stats', ' Fix: Run: GRANT pg_read_all_stats TO ;', @@ -355,10 +362,18 @@ function historicSqlProbeFailureLines(error: unknown): string[] { } async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Promise { - if (input.dialect !== 'postgres') { - return { ok: true, lines: [] }; + if (input.dialect === 'postgres') { + return probePostgresHistoricSql(input); } + if (input.dialect === 'snowflake') { + return probeSnowflakeHistoricSql(input); + } + return { ok: true, lines: [] }; +} +async function probePostgresHistoricSql( + input: KtxSetupHistoricSqlProbeInput, +): Promise { const project = await loadKtxProject({ projectDir: input.projectDir }); const connection = project.config.connections[input.connectionId]; const [{ PostgresPgssReader }, { KtxPostgresHistoricSqlQueryClient }, { isKtxPostgresConnectionConfig }] = @@ -396,6 +411,46 @@ async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Pr } } +async function probeSnowflakeHistoricSql( + input: KtxSetupHistoricSqlProbeInput, +): Promise { + const project = await loadKtxProject({ projectDir: input.projectDir }); + const connection = project.config.connections[input.connectionId]; + const [{ SnowflakeHistoricSqlQueryHistoryReader }, { KtxSnowflakeHistoricSqlQueryClient }, { isKtxSnowflakeConnectionConfig }] = + await Promise.all([ + import('./context/ingest/adapters/historic-sql/snowflake-query-history-reader.js'), + import('./connectors/snowflake/historic-sql-query-client.js'), + import('./connectors/snowflake/connector.js'), + ]); + + if (!isKtxSnowflakeConnectionConfig(connection)) { + return { + ok: false, + lines: [` FAIL Connection ${input.connectionId} is not a native Snowflake connection.`], + }; + } + + const client = new KtxSnowflakeHistoricSqlQueryClient({ + connectionId: input.connectionId, + connection, + projectDir: input.projectDir, + }); + try { + const result = await new SnowflakeHistoricSqlQueryHistoryReader().probe(client); + return { + ok: true, + lines: [ + ' OK SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY accessible', + ...result.warnings.map((warning: string) => ` ! ${warning}`), + ], + }; + } catch (error) { + return { ok: false, lines: historicSqlProbeFailureLines(error) }; + } finally { + await client.cleanup(); + } +} + async function defaultListSchemas(projectDir: string, connectionId: string): Promise { const project = await loadKtxProject({ projectDir }); const connection = project.config.connections[connectionId]; @@ -459,7 +514,7 @@ async function defaultListSchemas(projectDir: string, connectionId: string): Pro if (driver === 'snowflake') { const { KtxSnowflakeScanConnector, isKtxSnowflakeConnectionConfig } = await import('./connectors/snowflake/connector.js');; if (!isKtxSnowflakeConnectionConfig(connection)) return []; - const connector = new KtxSnowflakeScanConnector({ connectionId, connection }); + const connector = new KtxSnowflakeScanConnector({ connectionId, connection, projectDir }); try { return await connector.listSchemas(); } finally { @@ -535,7 +590,7 @@ async function defaultListTables( if (driver === 'snowflake') { const { KtxSnowflakeScanConnector, isKtxSnowflakeConnectionConfig } = await import('./connectors/snowflake/connector.js');; if (!isKtxSnowflakeConnectionConfig(connection)) return []; - const connector = new KtxSnowflakeScanConnector({ connectionId, connection }); + const connector = new KtxSnowflakeScanConnector({ connectionId, connection, projectDir }); try { return await connector.listTables(schemas); } finally { @@ -954,43 +1009,86 @@ async function buildConnectionConfig(input: { stringConfigField(input.existingConnection, 'database'), ); if (database === undefined) return 'back'; - const schemaName = await promptText( - prompts, - 'Snowflake schema\nPress Enter for PUBLIC, or enter a schema name.', - stringConfigField(input.existingConnection, 'schema_name') ?? 'PUBLIC', - ); - if (schemaName === undefined) return 'back'; const username = await promptText( prompts, 'Snowflake username', stringConfigField(input.existingConnection, 'username'), ); if (username === undefined) return 'back'; - const passwordRef = await promptCredential({ - prompts, - message: 'Snowflake password', - projectDir: args.projectDir, - connectionId: input.connectionId, - secretName: 'password', // pragma: allowlist secret + const authChoice = await prompts.select({ + message: 'Snowflake authentication method', + options: [ + { value: 'password', label: 'Password' }, + { value: 'rsa', label: 'Key-pair (RSA / JWT)' }, + { value: 'back', label: 'Back' }, + ], }); - if (passwordRef === 'back') return 'back'; // pragma: allowlist secret + if (authChoice === 'back') return 'back'; + const authMethod: 'password' | 'rsa' = authChoice === 'rsa' ? 'rsa' : 'password'; + let passwordRef: string | null = null; + let privateKeyInput: string | undefined; + let passphraseRef: string | null = null; + if (authMethod === 'password') { + const ref = await promptCredential({ + prompts, + message: 'Snowflake password', + projectDir: args.projectDir, + connectionId: input.connectionId, + secretName: 'password', // pragma: allowlist secret + }); + if (ref === 'back') return 'back'; // pragma: allowlist secret + passwordRef = ref; + } else { + privateKeyInput = await promptText( + prompts, + 'Path to Snowflake private key (PEM)\nFor example ~/.ssh/snowflake_rsa_key.p8, or $ENV_VAR / env:NAME / file:/abs/path.', + displayFileReference(stringConfigField(input.existingConnection, 'privateKey')), + ); + if (privateKeyInput === undefined) return 'back'; + const phr = await promptCredential({ + prompts, + message: 'Private key passphrase (optional)\nPress Enter to skip.', + projectDir: args.projectDir, + connectionId: input.connectionId, + secretName: 'snowflake-passphrase', // pragma: allowlist secret + }); + if (phr === 'back') return 'back'; + passphraseRef = phr; + } const role = await promptText( prompts, 'Snowflake role (optional)\nPress Enter to skip.', stringConfigField(input.existingConnection, 'role'), ); if (role === undefined) return 'back'; - const resolvedPasswordRef = passwordRef ?? stringConfigField(input.existingConnection, 'password'); - if (!account || !warehouse || !database || !schemaName || !username || !resolvedPasswordRef) return null; + if (authMethod === 'password') { + const resolvedPasswordRef = passwordRef ?? stringConfigField(input.existingConnection, 'password'); + if (!account || !warehouse || !database || !username || !resolvedPasswordRef) return null; + return { + driver: 'snowflake', + authMethod: 'password', + account, + warehouse, + database, + username, + password: resolvedPasswordRef, + ...(role ? { role } : {}), + }; + } + const resolvedPrivateKey = privateKeyInput + ? normalizeFileReference(privateKeyInput) + : stringConfigField(input.existingConnection, 'privateKey'); + if (!account || !warehouse || !database || !username || !resolvedPrivateKey) return null; + const resolvedPassphrase = passphraseRef ?? stringConfigField(input.existingConnection, 'passphrase'); return { driver: 'snowflake', - authMethod: 'password', + authMethod: 'rsa', account, warehouse, database, - schema_name: schemaName, username, - password: resolvedPasswordRef, + privateKey: resolvedPrivateKey, + ...(resolvedPassphrase ? { passphrase: resolvedPassphrase } : {}), ...(role ? { role } : {}), }; } @@ -1425,6 +1523,21 @@ async function writeScopeConfig(input: { }); } +async function promptCommaSeparatedScope(input: { + prompts: KtxSetupDatabasesPromptAdapter; + connectionId: string; + spec: ScopeDiscoverySpec; +}): Promise { + const example = + input.spec.nounPlural === 'datasets' ? 'sales, marketing' : 'SALES, MARKETING'; + const value = await promptText( + input.prompts, + `Enter ${input.spec.nounPlural} for ${input.connectionId} as a comma-separated list (e.g. ${example}).`, + ); + if (value === undefined) return undefined; + return unique(value.split(',').map((part) => part.trim())); +} + async function maybeConfigureDatabaseScope(input: { projectDir: string; connectionId: string; @@ -1494,28 +1607,48 @@ async function maybeConfigureDatabaseScope(input: { writeSetupSection(input.io, 'Discovering tables', [`Connecting to ${input.connectionId}…`]); - const schemas = unique( - cliSchemas.length > 0 - ? cliSchemas - : await (async (): Promise => { - if (!spec) return []; - try { - return await (input.deps.listSchemas ?? defaultListSchemas)(input.projectDir, input.connectionId); - } catch (error) { - const detail = error instanceof Error ? error.message : String(error); - input.io.stderr.write( - `Could not discover ${spec.promptLabel.toLowerCase()} for ${input.connectionId}; ${detail}\n`, - ); - return []; - } - })(), - ); + let effectiveCliSchemas = cliSchemas; + let listedSchemas: string[]; + if (cliSchemas.length > 0) { + listedSchemas = cliSchemas; + } else if (!spec) { + listedSchemas = []; + } else { + try { + listedSchemas = await (input.deps.listSchemas ?? defaultListSchemas)( + input.projectDir, + input.connectionId, + ); + } catch (error) { + const detail = error instanceof Error ? error.message : String(error); + input.io.stderr.write( + `Could not discover ${spec.promptLabel.toLowerCase()} for ${input.connectionId}; ${detail}\n`, + ); + const typed = await promptCommaSeparatedScope({ + prompts: input.prompts, + connectionId: input.connectionId, + spec, + }); + if (typed === undefined) return 'back'; + effectiveCliSchemas = typed; + listedSchemas = typed; + if (typed.length > 0) { + await writeScopeConfig({ + projectDir: input.projectDir, + connectionId: input.connectionId, + values: typed, + spec, + }); + } + } + } + const schemas = unique(listedSchemas); if (spec && schemas.length === 0) { return 'ready'; } const schemaSuggestion = - cliSchemas.length > 0 - ? { excluded: new Set(), suggested: new Set(cliSchemas) } + effectiveCliSchemas.length > 0 + ? { excluded: new Set(), suggested: new Set(effectiveCliSchemas) } : spec?.suggest(schemas) ?? { excluded: new Set(), suggested: new Set() }; const existingEnabled = hasExistingTables && input.forcePrompt === true @@ -1533,7 +1666,7 @@ async function maybeConfigureDatabaseScope(input: { schemaSuggestion, existing: { enabledTables: existingEnabled }, supportsSchemaScope: spec !== undefined, - initialSchemas: cliSchemas.length > 0 ? cliSchemas : undefined, + initialSchemas: effectiveCliSchemas.length > 0 ? effectiveCliSchemas : undefined, prompts: input.prompts, listTablesForSchemas: (selectedSchemas) => (input.deps.listTables ?? defaultListTables)(input.projectDir, input.connectionId, selectedSchemas), @@ -1638,7 +1771,12 @@ async function maybeRunHistoricSqlSetupProbe(input: { const connection = project.config.connections[input.connectionId]; const queryHistory = queryHistoryConfigRecord(connection) ?? historicSqlConfigRecord(connection); const driver = normalizeDriver(connection?.driver); - if (queryHistory?.enabled !== true || driver !== 'postgres') { + if (queryHistory?.enabled !== true) { + return; + } + const dialect: 'postgres' | 'snowflake' | null = + driver === 'postgres' ? 'postgres' : driver === 'snowflake' ? 'snowflake' : null; + if (!dialect) { return; } @@ -1647,13 +1785,13 @@ async function maybeRunHistoricSqlSetupProbe(input: { const result = await probe({ projectDir: input.projectDir, connectionId: input.connectionId, - dialect: 'postgres', + dialect, }); for (const line of result.lines) { input.io.stdout.write(`│${line}\n`); } if (!result.ok) { - input.io.stdout.write('│ Setup written; first ingest run will fail until fixed.\n'); + input.io.stdout.write('│ Setup written; query history will be skipped until fixed.\n'); } } diff --git a/packages/cli/src/status-project.test.ts b/packages/cli/src/status-project.test.ts index 9f8c879e..83862bfb 100644 --- a/packages/cli/src/status-project.test.ts +++ b/packages/cli/src/status-project.test.ts @@ -148,6 +148,161 @@ function withPostgresQueryHistory(config: KtxProjectConfig): KtxProjectConfig { }; } +function withSnowflakeQueryHistory(config: KtxProjectConfig): KtxProjectConfig { + return { + ...config, + connections: { + ...config.connections, + warehouse: { + driver: 'snowflake', + account: 'EMOVRJS-CZ07756', + warehouse: 'COMPUTE_WH', + database: 'ANALYTICS', + username: 'svc_ktx', + password: 'env:SNOWFLAKE_PASSWORD', // pragma: allowlist secret + context: { queryHistory: { enabled: true } }, + } as KtxProjectConfig['connections'][string], + }, + }; +} + +function withBigQueryQueryHistory(config: KtxProjectConfig): KtxProjectConfig { + return { + ...config, + connections: { + ...config.connections, + bq: { + driver: 'bigquery', + credentials_json: 'env:BQ_CREDENTIALS_JSON', + context: { queryHistory: { enabled: true } }, + } as KtxProjectConfig['connections'][string], + }, + }; +} + +function withMysqlQueryHistory(config: KtxProjectConfig): KtxProjectConfig { + return { + ...config, + connections: { + ...config.connections, + legacy: { + driver: 'mysql', + host: 'db.example.com', + database: 'analytics', + username: 'svc', + password: 'env:MYSQL_PASSWORD', // pragma: allowlist secret + context: { queryHistory: { enabled: true } }, + } as KtxProjectConfig['connections'][string], + }, + }; +} + +describe('buildProjectStatus query history dispatch', () => { + it('runs the snowflake probe for snowflake connections, not the postgres one', async () => { + let postgresCalls = 0; + let snowflakeCalls = 0; + const project = projectWithConfig(withSnowflakeQueryHistory(baseProjectConfig())); + + const status = await buildProjectStatus(project, { + claudeCodeAuthProbe: stubClaudeCodeAuthProbe, + postgresQueryHistoryProbe: async () => { + postgresCalls += 1; + throw new Error('postgres probe should not run for snowflake'); + }, + snowflakeQueryHistoryProbe: async () => { + snowflakeCalls += 1; + return { warnings: [], info: [] }; + }, + }); + + expect(postgresCalls).toBe(0); + expect(snowflakeCalls).toBe(1); + expect(status.queryHistory).toHaveLength(1); + expect(status.queryHistory[0]).toMatchObject({ + connection: 'warehouse', + driver: 'snowflake', + dialect: 'snowflake', + status: 'ok', + }); + expect(status.queryHistory[0].detail).toMatch(/SNOWFLAKE\.ACCOUNT_USAGE\.QUERY_HISTORY/); + expect(status.queryHistory[0].fix).toBeUndefined(); + expect(status.verdict).not.toBe('blocked'); + }); + + it('reports snowflake probe failures with the reader-provided remediation', async () => { + const project = projectWithConfig(withSnowflakeQueryHistory(baseProjectConfig())); + const { HistoricSqlGrantsMissingError } = await import( + './context/ingest/adapters/historic-sql/errors.js' + ); + + const status = await buildProjectStatus(project, { + claudeCodeAuthProbe: stubClaudeCodeAuthProbe, + snowflakeQueryHistoryProbe: async () => { + throw new HistoricSqlGrantsMissingError({ + dialect: 'snowflake', + message: 'role cannot read SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY', + remediation: 'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE ktx;', + }); + }, + }); + + expect(status.queryHistory[0]).toMatchObject({ + connection: 'warehouse', + driver: 'snowflake', + dialect: 'snowflake', + status: 'fail', + fix: 'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE ktx;', + }); + expect(status.queryHistory[0].detail).not.toMatch(/Set connections.*driver to postgres/); + }); + + it('runs the bigquery probe for bigquery connections', async () => { + let bigqueryCalls = 0; + const project = projectWithConfig(withBigQueryQueryHistory(baseProjectConfig())); + + const status = await buildProjectStatus(project, { + claudeCodeAuthProbe: stubClaudeCodeAuthProbe, + bigqueryQueryHistoryProbe: async () => { + bigqueryCalls += 1; + return { warnings: [], info: [] }; + }, + }); + + expect(bigqueryCalls).toBe(1); + expect(status.queryHistory[0]).toMatchObject({ + connection: 'bq', + driver: 'bigquery', + dialect: 'bigquery', + status: 'ok', + }); + expect(status.queryHistory[0].detail).toMatch(/INFORMATION_SCHEMA\.JOBS_BY_PROJECT/); + }); + + it('fails with an accurate message for drivers without a query history reader', async () => { + const project = projectWithConfig(withMysqlQueryHistory(baseProjectConfig())); + + const status = await buildProjectStatus(project, { + claudeCodeAuthProbe: stubClaudeCodeAuthProbe, + postgresQueryHistoryProbe: async () => { + throw new Error('postgres probe must not run for mysql'); + }, + }); + + expect(status.queryHistory).toHaveLength(1); + expect(status.queryHistory[0]).toMatchObject({ + connection: 'legacy', + driver: 'mysql', + dialect: 'mysql', + status: 'fail', + detail: 'query history is not supported for driver "mysql"', + }); + expect(status.queryHistory[0].fix).toMatch( + /Disable connections\.legacy\.context\.queryHistory/, + ); + expect(status.queryHistory[0].fix).not.toMatch(/driver to postgres/); + }); +}); + describe('buildProjectStatus --fast', () => { it('skips claude-code probe and Postgres query-history probe', async () => { let claudeProbeCalls = 0; diff --git a/packages/cli/src/status-project.ts b/packages/cli/src/status-project.ts index 9e257157..9b5f1af4 100644 --- a/packages/cli/src/status-project.ts +++ b/packages/cli/src/status-project.ts @@ -5,6 +5,10 @@ import type { KtxConfigIssue, KtxProjectConfig, KtxProjectConnectionConfig, KtxP import type { KtxLocalProject } from './context/project/project.js'; import { ktxLocalStateDbPath } from './context/project/local-state-db.js'; import type { PostgresPgssProbeResult } from './context/ingest/adapters/historic-sql/types.js'; +import { + isQueryHistoryEnabled, + queryHistoryDialectForConnection, +} from './context/ingest/adapters/historic-sql/connection-dialect.js'; import { formatClaudeCodePromptCachingFix, formatClaudeCodePromptCachingWarning, @@ -47,7 +51,8 @@ interface ConnectionStatus extends ProjectStatusLine { interface QueryHistoryStatus extends ProjectStatusLine { connection: string; - dialect: 'postgres'; + driver: string; + dialect: string; } interface PipelineStatus { @@ -396,45 +401,44 @@ function buildConnectionStatus( } } -interface PostgresQueryHistoryProbeInput { +interface QueryHistoryProbeInput { projectDir: string; connectionId: string; connection: KtxProjectConnectionConfig; env: NodeJS.ProcessEnv; } -type PostgresQueryHistoryProbe = ( - input: PostgresQueryHistoryProbeInput, -) => Promise; - -function recordValue(value: unknown): Record | null { - return value && typeof value === 'object' && !Array.isArray(value) ? (value as Record) : null; +interface GenericProbeResult { + warnings: string[]; + info?: string[]; } -function queryHistoryRecord(connection: KtxProjectConnectionConfig): Record | null { - const context = recordValue(connection.context); - return recordValue(context?.queryHistory); -} +type PostgresQueryHistoryProbe = (input: QueryHistoryProbeInput) => Promise; +type SnowflakeQueryHistoryProbe = (input: QueryHistoryProbeInput) => Promise; +type BigQueryQueryHistoryProbe = (input: QueryHistoryProbeInput) => Promise; -function legacyHistoricSqlRecord(connection: KtxProjectConnectionConfig): Record | null { - return recordValue(connection.historicSql); -} - -function isEnabledPostgresQueryHistory(connection: KtxProjectConnectionConfig): boolean { - const queryHistory = queryHistoryRecord(connection); - if (queryHistory) { - return queryHistory.enabled === true; +function failureDetail(error: unknown): string { + if (error instanceof Error && error.message.trim().length > 0) { + return error.message.trim().split('\n')[0] ?? error.message.trim(); } - const legacy = legacyHistoricSqlRecord(connection); - return legacy?.enabled === true && legacy.dialect === 'postgres'; + return String(error); } -function isPostgresDriver(connection: KtxProjectConnectionConfig): boolean { - const driver = String(connection.driver ?? '').toLowerCase(); - return driver === 'postgres' || driver === 'postgresql'; +function postgresReadinessDetail(result: PostgresPgssProbeResult): string { + const warningText = result.warnings.length > 0 ? ` with warnings: ${result.warnings.join('; ')}` : ''; + const info = result.info ?? []; + const infoText = info.length > 0 ? `; info: ${info.join('; ')}` : ''; + return `pg_stat_statements ready (${result.pgServerVersion})${warningText}${infoText}`; } -function queryHistoryFailureFix(error: unknown, connectionId: string, projectDir: string): string { +function genericReadinessDetail(label: string, result: GenericProbeResult): string { + const warningText = result.warnings.length > 0 ? ` with warnings: ${result.warnings.join('; ')}` : ''; + const info = result.info ?? []; + const infoText = info.length > 0 ? `; info: ${info.join('; ')}` : ''; + return `${label} ready${warningText}${infoText}`; +} + +function probeFailureFix(error: unknown, dialect: string, connectionId: string, projectDir: string): string { if (error instanceof Error && error.name === 'HistoricSqlExtensionMissingError' && 'remediation' in error) { return String(error.remediation); } @@ -444,25 +448,11 @@ function queryHistoryFailureFix(error: unknown, connectionId: string, projectDir if (error instanceof Error && error.name === 'HistoricSqlVersionUnsupportedError') { return 'Use PostgreSQL 14 or newer, or disable query history for this connection'; } - return `Fix connections.${connectionId} Postgres settings, then rerun \`ktx status --project-dir ${projectDir}\``; -} - -function failureDetail(error: unknown): string { - if (error instanceof Error && error.message.trim().length > 0) { - return error.message.trim().split('\n')[0] ?? error.message.trim(); - } - return String(error); -} - -function readinessDetail(result: PostgresPgssProbeResult): string { - const warningText = result.warnings.length > 0 ? ` with warnings: ${result.warnings.join('; ')}` : ''; - const info = result.info ?? []; - const infoText = info.length > 0 ? `; info: ${info.join('; ')}` : ''; - return `pg_stat_statements ready (${result.pgServerVersion})${warningText}${infoText}`; + return `Fix connections.${connectionId} ${dialect} settings, then rerun \`ktx status --project-dir ${projectDir}\``; } async function defaultPostgresQueryHistoryProbe( - input: PostgresQueryHistoryProbeInput, + input: QueryHistoryProbeInput, ): Promise { const [{ PostgresPgssReader }, { KtxPostgresHistoricSqlQueryClient }, { isKtxPostgresConnectionConfig }] = await Promise.all([ @@ -488,63 +478,225 @@ async function defaultPostgresQueryHistoryProbe( } } +async function defaultSnowflakeQueryHistoryProbe( + input: QueryHistoryProbeInput, +): Promise { + const [{ SnowflakeHistoricSqlQueryHistoryReader }, { KtxSnowflakeHistoricSqlQueryClient }, { isKtxSnowflakeConnectionConfig }] = + await Promise.all([ + import('./context/ingest/adapters/historic-sql/snowflake-query-history-reader.js'), + import('./connectors/snowflake/historic-sql-query-client.js'), + import('./connectors/snowflake/connector.js'), + ]); + + const inputDriver = input.connection.driver ?? 'unknown'; + if (!isKtxSnowflakeConnectionConfig(input.connection)) { + throw new Error(`Native Snowflake connector cannot run driver "${inputDriver}"`); + } + + const client = new KtxSnowflakeHistoricSqlQueryClient({ + connectionId: input.connectionId, + connection: input.connection, + projectDir: input.projectDir, + env: input.env, + }); + try { + return await new SnowflakeHistoricSqlQueryHistoryReader().probe(client); + } finally { + await client.cleanup(); + } +} + +async function defaultBigQueryQueryHistoryProbe( + input: QueryHistoryProbeInput, +): Promise { + const [ + { BigQueryHistoricSqlQueryHistoryReader }, + { KtxBigQueryScanConnector, isKtxBigQueryConnectionConfig }, + { resolveKtxConfigReference }, + ] = await Promise.all([ + import('./context/ingest/adapters/historic-sql/bigquery-query-history-reader.js'), + import('./connectors/bigquery/connector.js'), + import('./context/core/config-reference.js'), + ]); + + const inputDriver = input.connection.driver ?? 'unknown'; + if (!isKtxBigQueryConnectionConfig(input.connection)) { + throw new Error(`Native BigQuery connector cannot run driver "${inputDriver}"`); + } + + const rawCredentials = typeof input.connection.credentials_json === 'string' ? input.connection.credentials_json : ''; + const resolvedCredentials = resolveKtxConfigReference(rawCredentials, input.env); + if (!resolvedCredentials) { + throw new Error(`Query history BigQuery connection ${input.connectionId} requires credentials_json`); + } + const parsed = JSON.parse(resolvedCredentials) as { project_id?: unknown }; + if (typeof parsed.project_id !== 'string' || parsed.project_id.trim().length === 0) { + throw new Error(`Query history BigQuery connection ${input.connectionId} requires credentials_json.project_id`); + } + const region = + typeof input.connection.location === 'string' && input.connection.location.trim().length > 0 + ? input.connection.location.trim() + : 'us'; + + const connector = new KtxBigQueryScanConnector({ + connectionId: input.connectionId, + connection: input.connection, + }); + try { + return await new BigQueryHistoricSqlQueryHistoryReader({ + projectId: parsed.project_id, + region, + }).probe({ + async executeQuery(sql: string) { + const result = await connector.executeReadOnly({ connectionId: input.connectionId, sql }, {} as never); + return { + headers: result.headers, + rows: result.rows, + totalRows: result.totalRows, + }; + }, + }); + } finally { + await connector.cleanup(); + } +} + +interface DispatchedProbe { + label: string; + spinnerLabel: string; + fastSkipDetail: string; + run: () => Promise<{ status: ProjectStatusLevel; detail: string; fix?: string }>; +} + +function postgresProbeDispatch( + input: QueryHistoryProbeInput, + probe: PostgresQueryHistoryProbe, +): DispatchedProbe { + return { + label: 'postgres', + spinnerLabel: `Probing pg_stat_statements on ${input.connectionId}`, + fastSkipDetail: 'pg_stat_statements probe skipped (--fast)', + run: async () => { + const result = await probe(input); + return { + status: result.warnings.length > 0 ? 'warn' : 'ok', + detail: postgresReadinessDetail(result), + ...(result.warnings.length > 0 + ? { + fix: `Update the Postgres parameter group or config, then rerun \`ktx status --project-dir ${input.projectDir}\``, + } + : {}), + }; + }, + }; +} + +function snowflakeProbeDispatch( + input: QueryHistoryProbeInput, + probe: SnowflakeQueryHistoryProbe, +): DispatchedProbe { + return { + label: 'snowflake', + spinnerLabel: `Probing SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY on ${input.connectionId}`, + fastSkipDetail: 'SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY probe skipped (--fast)', + run: async () => { + const result = await probe(input); + return { + status: result.warnings.length > 0 ? 'warn' : 'ok', + detail: genericReadinessDetail('SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY', result), + }; + }, + }; +} + +function bigqueryProbeDispatch( + input: QueryHistoryProbeInput, + probe: BigQueryQueryHistoryProbe, +): DispatchedProbe { + return { + label: 'bigquery', + spinnerLabel: `Probing INFORMATION_SCHEMA.JOBS_BY_PROJECT on ${input.connectionId}`, + fastSkipDetail: 'INFORMATION_SCHEMA.JOBS_BY_PROJECT probe skipped (--fast)', + run: async () => { + const result = await probe(input); + return { + status: result.warnings.length > 0 ? 'warn' : 'ok', + detail: genericReadinessDetail('INFORMATION_SCHEMA.JOBS_BY_PROJECT', result), + }; + }, + }; +} + async function buildQueryHistoryStatus( project: KtxLocalProject, options: BuildProjectStatusOptions, ): Promise { const targets = Object.entries(project.config.connections) - .filter(([, connection]) => isEnabledPostgresQueryHistory(connection)) + .filter(([, connection]) => isQueryHistoryEnabled(connection)) .sort(([left], [right]) => left.localeCompare(right)); - const probe = options.postgresQueryHistoryProbe ?? defaultPostgresQueryHistoryProbe; + const postgresProbe = options.postgresQueryHistoryProbe ?? defaultPostgresQueryHistoryProbe; + const snowflakeProbe = options.snowflakeQueryHistoryProbe ?? defaultSnowflakeQueryHistoryProbe; + const bigqueryProbe = options.bigqueryQueryHistoryProbe ?? defaultBigQueryQueryHistoryProbe; const env = options.env ?? process.env; const statuses: QueryHistoryStatus[] = []; + for (const [connectionId, connection] of targets) { - if (!isPostgresDriver(connection)) { + const driver = String(connection.driver ?? 'unknown').toLowerCase(); + const dialect = queryHistoryDialectForConnection(connection); + + if (!dialect) { statuses.push({ connection: connectionId, - dialect: 'postgres', + driver, + dialect: driver, status: 'fail', - detail: `connections.${connectionId}.context.queryHistory is enabled but driver is ${String(connection.driver)}`, - fix: `Set connections.${connectionId}.driver to postgres or disable query history for this connection`, + detail: `query history is not supported for driver "${driver}"`, + fix: `Disable connections.${connectionId}.context.queryHistory, or use a postgres, snowflake, or bigquery connection`, }); continue; } + const probeInput: QueryHistoryProbeInput = { + projectDir: project.projectDir, + connectionId, + connection, + env, + }; + const dispatched = + dialect === 'postgres' + ? postgresProbeDispatch(probeInput, postgresProbe) + : dialect === 'snowflake' + ? snowflakeProbeDispatch(probeInput, snowflakeProbe) + : bigqueryProbeDispatch(probeInput, bigqueryProbe); + if (options.fast === true) { statuses.push({ connection: connectionId, - dialect: 'postgres', + driver, + dialect, status: 'skipped', - detail: 'pg_stat_statements probe skipped (--fast)', + detail: dispatched.fastSkipDetail, }); continue; } try { - const result = await withSpinner( - options.useSpinner === true, - `Probing pg_stat_statements on ${connectionId}`, - () => probe({ projectDir: project.projectDir, connectionId, connection, env }), - ); + const outcome = await withSpinner(options.useSpinner === true, dispatched.spinnerLabel, dispatched.run); statuses.push({ connection: connectionId, - dialect: 'postgres', - status: result.warnings.length > 0 ? 'warn' : 'ok', - detail: readinessDetail(result), - ...(result.warnings.length > 0 - ? { - fix: `Update the Postgres parameter group or config, then rerun \`ktx status --project-dir ${project.projectDir}\``, - } - : {}), + driver, + dialect, + ...outcome, }); } catch (error) { statuses.push({ connection: connectionId, - dialect: 'postgres', + driver, + dialect, status: 'fail', detail: failureDetail(error), - fix: queryHistoryFailureFix(error, connectionId, project.projectDir), + fix: probeFailureFix(error, dispatched.label, connectionId, project.projectDir), }); } } @@ -731,6 +883,8 @@ function buildVerdict( export interface BuildProjectStatusOptions { env?: NodeJS.ProcessEnv; postgresQueryHistoryProbe?: PostgresQueryHistoryProbe; + snowflakeQueryHistoryProbe?: SnowflakeQueryHistoryProbe; + bigqueryQueryHistoryProbe?: BigQueryQueryHistoryProbe; claudeCodeAuthProbe?: ClaudeCodeAuthProbe; configIssues?: KtxConfigIssue[]; fast?: boolean; diff --git a/packages/cli/src/telemetry/project-snapshot.test.ts b/packages/cli/src/telemetry/project-snapshot.test.ts index daf4e766..a1c06472 100644 --- a/packages/cli/src/telemetry/project-snapshot.test.ts +++ b/packages/cli/src/telemetry/project-snapshot.test.ts @@ -47,6 +47,7 @@ describe('buildProjectStackSnapshotFields', () => { maxLlmTablesPerBatch: 40, maxCandidatesPerColumn: 25, profileSampleRows: 10000, + profileConcurrency: 4, validationConcurrency: 4, }, }, diff --git a/python/ktx-daemon/src/ktx_daemon/database_introspection.py b/python/ktx-daemon/src/ktx_daemon/database_introspection.py index ba9fa1d8..82058f95 100644 --- a/python/ktx-daemon/src/ktx_daemon/database_introspection.py +++ b/python/ktx-daemon/src/ktx_daemon/database_introspection.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json from collections.abc import Callable, Mapping, Sequence from dataclasses import dataclass from datetime import datetime, timezone @@ -24,6 +25,16 @@ join pg_catalog.pg_class c and c.relname = t.table_name where t.table_schema = any(%s) and t.table_type = 'BASE TABLE' + and ( + %s::jsonb is null + or exists ( + select 1 + from jsonb_to_recordset(%s::jsonb) as scope(catalog text, db text, name text) + where (scope.catalog is null or scope.catalog = current_database()) + and (scope.db is null or scope.db = t.table_schema) + and scope.name = t.table_name + ) + ) order by t.table_schema, t.table_name """ @@ -52,6 +63,16 @@ where n.nspname = any(%s) and c.relkind in ('r', 'p') and a.attnum > 0 and not a.attisdropped + and ( + %s::jsonb is null + or exists ( + select 1 + from jsonb_to_recordset(%s::jsonb) as scope(catalog text, db text, name text) + where (scope.catalog is null or scope.catalog = current_database()) + and (scope.db is null or scope.db = n.nspname) + and scope.name = c.relname + ) + ) order by n.nspname, c.relname, a.attnum """ @@ -80,6 +101,16 @@ join information_schema.key_column_usage target_key and target_key.ordinal_position = source_key.position_in_unique_constraint where source_constraint.constraint_type = 'FOREIGN KEY' and source_constraint.table_schema = any(%s) + and ( + %s::jsonb is null + or exists ( + select 1 + from jsonb_to_recordset(%s::jsonb) as scope(catalog text, db text, name text) + where (scope.catalog is null or scope.catalog = current_database()) + and (scope.db is null or scope.db = source_constraint.table_schema) + and scope.name = source_constraint.table_name + ) + ) order by source_constraint.table_schema, source_constraint.table_name, source_constraint.constraint_name, source_key.ordinal_position """ @@ -108,6 +139,12 @@ class LiveDatabaseTable(BaseModel): foreign_keys: list[LiveDatabaseForeignKey] = Field(default_factory=list) +class LiveDatabaseTableScopeRef(BaseModel): + catalog: str | None = None + db: str | None = None + name: str + + class DatabaseIntrospectionRequest(BaseModel): connection_id: str driver: str = "postgres" @@ -115,6 +152,7 @@ class DatabaseIntrospectionRequest(BaseModel): schemas: list[str] = Field(default_factory=lambda: ["public"]) statement_timeout_ms: int = Field(default=30_000, ge=1) connection_timeout_seconds: int = Field(default=5, ge=1) + table_scope: list[LiveDatabaseTableScopeRef] | None = None @field_validator("schemas") @classmethod @@ -169,6 +207,23 @@ def _statement_timeout_config(statement_timeout_ms: int) -> tuple[str, tuple[str ) +def _table_scope_json( + table_scope: Sequence[LiveDatabaseTableScopeRef] | None, +) -> str | None: + if table_scope is None: + return None + return json.dumps( + [ + { + "catalog": ref.catalog, + "db": ref.db, + "name": ref.name, + } + for ref in table_scope + ] + ) + + def _load_postgres_rows( request: DatabaseIntrospectionRequest, ) -> DatabaseIntrospectionRows: @@ -190,7 +245,8 @@ def _load_postgres_rows( connection.execute("BEGIN READ ONLY") try: connection.execute(*_statement_timeout_config(request.statement_timeout_ms)) - params = (request.schemas,) + scope_json = _table_scope_json(request.table_scope) + params = (request.schemas, scope_json, scope_json) table_rows = list(connection.execute(TABLES_SQL, params)) column_rows = list(connection.execute(COLUMNS_SQL, params)) foreign_key_rows = list(connection.execute(FOREIGN_KEYS_SQL, params)) diff --git a/python/ktx-daemon/tests/test_app.py b/python/ktx-daemon/tests/test_app.py index e423a31e..9960daaf 100644 --- a/python/ktx-daemon/tests/test_app.py +++ b/python/ktx-daemon/tests/test_app.py @@ -155,6 +155,7 @@ def test_database_introspect_endpoint_returns_snapshot() -> None: "driver": "postgres", "url": "postgresql://readonly@example.test/warehouse", "schemas": ["public"], + "table_scope": [{"db": "public", "name": "orders"}], }, ) @@ -162,6 +163,8 @@ def test_database_introspect_endpoint_returns_snapshot() -> None: assert response.json()["connection_id"] == "warehouse" assert response.json()["tables"][0]["name"] == "orders" assert calls[0].connection_id == "warehouse" + assert calls[0].table_scope[0].db == "public" + assert calls[0].table_scope[0].name == "orders" def test_database_introspect_endpoint_maps_value_error_to_400() -> None: diff --git a/python/ktx-daemon/tests/test_cli.py b/python/ktx-daemon/tests/test_cli.py index 88e06de7..76376320 100644 --- a/python/ktx-daemon/tests/test_cli.py +++ b/python/ktx-daemon/tests/test_cli.py @@ -311,6 +311,9 @@ def test_database_introspect_command_reads_stdin_and_writes_json( assert request.connection_id == "warehouse" assert request.driver == "postgres" assert request.schemas == ["public"] + assert request.table_scope is not None + assert request.table_scope[0].db == "public" + assert request.table_scope[0].name == "orders" return DatabaseIntrospectionResponse( connection_id="warehouse", extracted_at="2026-04-28T10:00:00+00:00", @@ -337,7 +340,7 @@ def test_database_introspect_command_reads_stdin_and_writes_json( sys, "stdin", io.StringIO( - '{"connection_id":"warehouse","driver":"postgres","url":"postgresql://readonly@example.test/warehouse","schemas":["public"]}' + '{"connection_id":"warehouse","driver":"postgres","url":"postgresql://readonly@example.test/warehouse","schemas":["public"],"table_scope":[{"db":"public","name":"orders"}]}' ), ) diff --git a/python/ktx-daemon/tests/test_database_introspection.py b/python/ktx-daemon/tests/test_database_introspection.py index 7dd2f3f9..0a018046 100644 --- a/python/ktx-daemon/tests/test_database_introspection.py +++ b/python/ktx-daemon/tests/test_database_introspection.py @@ -5,7 +5,9 @@ import pytest from ktx_daemon.database_introspection import ( DatabaseIntrospectionRequest, DatabaseIntrospectionRows, + LiveDatabaseTableScopeRef, _statement_timeout_config, + _table_scope_json, introspect_database_response, ) @@ -146,6 +148,22 @@ def test_database_introspection_request_rejects_empty_schema_list() -> None: ) +def test_table_scope_json_serializes_null_wildcards() -> None: + assert _table_scope_json( + [ + LiveDatabaseTableScopeRef(catalog=None, db="public", name="orders"), + LiveDatabaseTableScopeRef( + catalog="warehouse", + db="marts", + name="customers", + ), + ] + ) == ( + '[{"catalog": null, "db": "public", "name": "orders"}, ' + '{"catalog": "warehouse", "db": "marts", "name": "customers"}]' + ) + + def test_statement_timeout_config_uses_parameterized_set_config() -> None: assert _statement_timeout_config(30_000) == ( "SELECT set_config('statement_timeout', %s, true)", diff --git a/scripts/ktx-reset.sh b/scripts/ktx-reset.sh new file mode 100755 index 00000000..808e0d25 --- /dev/null +++ b/scripts/ktx-reset.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# ktx-reset.sh - Reset a ktx project directory back to its seed state. +# +# Removes everything in except ktx.yaml and .ktx/, and prunes .ktx/ +# down to just .ktx/secrets/. Useful when re-running ingest/setup against +# a known-clean project tree. + +set -e +set -o pipefail + +if [ -z "$1" ]; then + echo "usage: ktx-reset " >&2 + exit 1 +fi + +dir="${1%/}" +if [ ! -d "$dir" ]; then + echo "ktx-reset: $dir is not a directory" >&2 + exit 1 +fi + +find "$dir" -mindepth 1 -maxdepth 1 ! -name ktx.yaml ! -name .ktx -exec rm -rf {} + +if [ -d "$dir/.ktx" ]; then + find "$dir/.ktx" -mindepth 1 -maxdepth 1 ! -name secrets -exec rm -rf {} + +fi +tree -a "$dir"