mirror of
https://github.com/Kaelio/ktx.git
synced 2026-07-01 08:59:39 +02:00
feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard Extends the interactive Snowflake setup flow with an authentication-method prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key path (env/file/absolute) and an optional passphrase; the resulting connection config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead of `password`. * feat(scan): pool Snowflake sessions * fix(scan): reuse structural snapshots and cleanup connectors * feat(scan): parallelize relationship profiling * feat(scan): batch table description generation * docs: document Snowflake ingest concurrency knobs * fix(scan): close Snowflake ingest perf verification gaps * fix(scan): keep batched description failure bounded
This commit is contained in:
parent
6e31687782
commit
4e654c43c1
23 changed files with 1247 additions and 237 deletions
|
|
@ -157,6 +157,12 @@ connections:
|
||||||
dataset_ids: [analytics, mart]
|
dataset_ids: [analytics, mart]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
For Snowflake connections, set `maxSessions` when deep ingest needs more or
|
||||||
|
fewer concurrent warehouse sessions. The default is `4`. This caps all
|
||||||
|
concurrent Snowflake SQL work for that connector instance, including schema
|
||||||
|
introspection, table sampling, relationship profiling, relationship
|
||||||
|
validation, and read-only SQL execution.
|
||||||
|
|
||||||
For Postgres, BigQuery, and Snowflake, `historicSql` and `context.queryHistory`
|
For Postgres, BigQuery, and Snowflake, `historicSql` and `context.queryHistory`
|
||||||
toggle query-history ingest. The shape is connector-specific; the setup wizard
|
toggle query-history ingest. The shape is connector-specific; the setup wizard
|
||||||
writes these fields when you pass `--enable-query-history`.
|
writes these fields when you pass `--enable-query-history`.
|
||||||
|
|
@ -483,6 +489,7 @@ scan:
|
||||||
maxLlmTablesPerBatch: 40
|
maxLlmTablesPerBatch: 40
|
||||||
maxCandidatesPerColumn: 25
|
maxCandidatesPerColumn: 25
|
||||||
profileSampleRows: 10000
|
profileSampleRows: 10000
|
||||||
|
profileConcurrency: 4
|
||||||
validationConcurrency: 4
|
validationConcurrency: 4
|
||||||
validationBudget: all
|
validationBudget: all
|
||||||
```
|
```
|
||||||
|
|
@ -510,6 +517,7 @@ the manifest.
|
||||||
| `relationships.maxLlmTablesPerBatch` | `int > 0` | `40` | Max tables included in a single LLM relationship-proposal batch. |
|
| `relationships.maxLlmTablesPerBatch` | `int > 0` | `40` | Max tables included in a single LLM relationship-proposal batch. |
|
||||||
| `relationships.maxCandidatesPerColumn` | `int > 0` | `25` | Max join partners considered per column. |
|
| `relationships.maxCandidatesPerColumn` | `int > 0` | `25` | Max join partners considered per column. |
|
||||||
| `relationships.profileSampleRows` | `int > 0` | `10000` | Rows sampled per table when profiling values for relationship inference. |
|
| `relationships.profileSampleRows` | `int > 0` | `10000` | Rows sampled per table when profiling values for relationship inference. |
|
||||||
|
| `relationships.profileConcurrency` | `int > 0` | `4` | Parallel relationship-profile queries against the database. For Snowflake, effective database concurrency is also bounded by the connection's `maxSessions`. |
|
||||||
| `relationships.validationConcurrency` | `int > 0` | `4` | Parallel relationship validation queries against the database. |
|
| `relationships.validationConcurrency` | `int > 0` | `4` | Parallel relationship validation queries against the database. |
|
||||||
| `relationships.validationBudget` | `all` \| `int ≥ 0` | runtime default | Cap on validation queries per scan. `all` means unlimited. |
|
| `relationships.validationBudget` | `all` \| `int ≥ 0` | runtime default | Cap on validation queries per scan. `all` means unlimited. |
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,12 @@
|
||||||
import { describe, expect, it, vi } from 'vitest';
|
import { describe, expect, it, vi } from 'vitest';
|
||||||
|
|
||||||
|
const createPool = vi.hoisted(() => vi.fn());
|
||||||
|
|
||||||
|
vi.mock('snowflake-sdk', () => ({
|
||||||
|
default: { createPool },
|
||||||
|
createPool,
|
||||||
|
}));
|
||||||
|
|
||||||
import { createSnowflakeLiveDatabaseIntrospection } from '../../connectors/snowflake/live-database-introspection.js';
|
import { createSnowflakeLiveDatabaseIntrospection } from '../../connectors/snowflake/live-database-introspection.js';
|
||||||
import { isKtxSnowflakeConnectionConfig, KtxSnowflakeScanConnector, snowflakeConnectionConfigFromConfig, type KtxSnowflakeDriver, type KtxSnowflakeDriverFactory } from '../../connectors/snowflake/connector.js';
|
import { isKtxSnowflakeConnectionConfig, KtxSnowflakeScanConnector, snowflakeConnectionConfigFromConfig, type KtxSnowflakeDriver, type KtxSnowflakeDriverFactory } from '../../connectors/snowflake/connector.js';
|
||||||
import { tableRefSet } from '../../context/scan/table-ref.js';
|
import { tableRefSet } from '../../context/scan/table-ref.js';
|
||||||
|
|
@ -64,6 +72,38 @@ function fakeDriverFactory(): KtxSnowflakeDriverFactory {
|
||||||
return { createDriver: vi.fn(() => driver) };
|
return { createDriver: vi.fn(() => driver) };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function fakeSnowflakeStatement(headers: string[] = ['ONE']) {
|
||||||
|
return {
|
||||||
|
getColumns: () => headers.map((header) => ({ getName: () => header, getType: () => 'TEXT' })),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function installSnowflakePoolMock() {
|
||||||
|
const executedSql: string[] = [];
|
||||||
|
const connection = {
|
||||||
|
execute: vi.fn(
|
||||||
|
(input: {
|
||||||
|
sqlText: string;
|
||||||
|
complete: (
|
||||||
|
error: Error | null,
|
||||||
|
statement: ReturnType<typeof fakeSnowflakeStatement>,
|
||||||
|
rows: Array<Record<string, unknown>>,
|
||||||
|
) => void;
|
||||||
|
}) => {
|
||||||
|
executedSql.push(input.sqlText);
|
||||||
|
input.complete(null, fakeSnowflakeStatement(), [{ ONE: 1 }]);
|
||||||
|
},
|
||||||
|
),
|
||||||
|
};
|
||||||
|
const pool = {
|
||||||
|
use: vi.fn(async (fn: (conn: typeof connection) => Promise<unknown>) => fn(connection)),
|
||||||
|
drain: vi.fn(async () => undefined),
|
||||||
|
clear: vi.fn(async () => undefined),
|
||||||
|
};
|
||||||
|
createPool.mockReturnValue(pool);
|
||||||
|
return { connection, pool, executedSql };
|
||||||
|
}
|
||||||
|
|
||||||
describe('KtxSnowflakeScanConnector', () => {
|
describe('KtxSnowflakeScanConnector', () => {
|
||||||
it('resolves Snowflake connection configuration safely', () => {
|
it('resolves Snowflake connection configuration safely', () => {
|
||||||
expect(
|
expect(
|
||||||
|
|
@ -100,6 +140,99 @@ describe('KtxSnowflakeScanConnector', () => {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('defaults and validates Snowflake maxSessions', () => {
|
||||||
|
const baseConnection = {
|
||||||
|
driver: 'snowflake',
|
||||||
|
authMethod: 'password',
|
||||||
|
account: 'acct',
|
||||||
|
warehouse: 'WH',
|
||||||
|
database: 'ANALYTICS',
|
||||||
|
schema_name: 'PUBLIC',
|
||||||
|
username: 'reader',
|
||||||
|
password: 'fixture-pass', // pragma: allowlist secret
|
||||||
|
} as const;
|
||||||
|
|
||||||
|
expect(
|
||||||
|
snowflakeConnectionConfigFromConfig({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
connection: baseConnection,
|
||||||
|
}),
|
||||||
|
).toMatchObject({ maxSessions: 4 });
|
||||||
|
|
||||||
|
expect(
|
||||||
|
snowflakeConnectionConfigFromConfig({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
connection: { ...baseConnection, maxSessions: 8 },
|
||||||
|
}),
|
||||||
|
).toMatchObject({ maxSessions: 8 });
|
||||||
|
|
||||||
|
for (const maxSessions of [0, -1, 1.5, Number.NaN]) {
|
||||||
|
expect(() =>
|
||||||
|
snowflakeConnectionConfigFromConfig({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
connection: { ...baseConnection, maxSessions },
|
||||||
|
}),
|
||||||
|
).toThrow('connections.warehouse.maxSessions must be a positive integer');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('uses one lazy Snowflake pool and drains it during cleanup', async () => {
|
||||||
|
const { pool, executedSql } = installSnowflakePoolMock();
|
||||||
|
const close = vi.fn(async () => undefined);
|
||||||
|
const connector = new KtxSnowflakeScanConnector({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
connection: {
|
||||||
|
driver: 'snowflake',
|
||||||
|
authMethod: 'password',
|
||||||
|
account: 'acct',
|
||||||
|
warehouse: 'WH',
|
||||||
|
database: 'ANALYTICS',
|
||||||
|
schema_name: 'PUBLIC',
|
||||||
|
username: 'reader',
|
||||||
|
password: 'fixture-pass', // pragma: allowlist secret
|
||||||
|
role: 'ANALYST',
|
||||||
|
maxSessions: 3,
|
||||||
|
},
|
||||||
|
sdkOptionsProvider: {
|
||||||
|
resolve: vi.fn(async () => ({ sdkOptions: { application: 'ktx-test' }, close })),
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(createPool).not.toHaveBeenCalled();
|
||||||
|
|
||||||
|
await connector.executeReadOnly({ connectionId: 'warehouse', sql: 'select 1', maxRows: 1 }, { runId: 'run-1' });
|
||||||
|
await connector.executeReadOnly({ connectionId: 'warehouse', sql: 'select 1', maxRows: 1 }, { runId: 'run-1' });
|
||||||
|
|
||||||
|
expect(createPool).toHaveBeenCalledTimes(1);
|
||||||
|
expect(createPool).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
account: 'acct',
|
||||||
|
username: 'reader',
|
||||||
|
warehouse: 'WH',
|
||||||
|
database: 'ANALYTICS',
|
||||||
|
schema: 'PUBLIC',
|
||||||
|
role: 'ANALYST',
|
||||||
|
password: 'fixture-pass', // pragma: allowlist secret
|
||||||
|
clientSessionKeepAlive: true,
|
||||||
|
clientSessionKeepAliveHeartbeatFrequency: 900,
|
||||||
|
application: 'ktx-test',
|
||||||
|
}),
|
||||||
|
expect.objectContaining({
|
||||||
|
min: 0,
|
||||||
|
max: 3,
|
||||||
|
evictionRunIntervalMillis: 30_000,
|
||||||
|
acquireTimeoutMillis: 60_000,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
expect(pool.use).toHaveBeenCalledTimes(2);
|
||||||
|
expect(executedSql.some((sql) => /^USE\s+/i.test(sql.trim()))).toBe(false);
|
||||||
|
|
||||||
|
await connector.cleanup();
|
||||||
|
expect(pool.drain).toHaveBeenCalledBefore(pool.clear);
|
||||||
|
expect(pool.clear).toHaveBeenCalledTimes(1);
|
||||||
|
expect(close).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
|
||||||
it('introspects schema, primary keys, comments, row counts, and dimensions', async () => {
|
it('introspects schema, primary keys, comments, row counts, and dimensions', async () => {
|
||||||
const connector = new KtxSnowflakeScanConnector({
|
const connector = new KtxSnowflakeScanConnector({
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ export interface KtxSnowflakeConnectionConfig {
|
||||||
privateKey?: string;
|
privateKey?: string;
|
||||||
passphrase?: string;
|
passphrase?: string;
|
||||||
role?: string;
|
role?: string;
|
||||||
|
maxSessions?: number;
|
||||||
[key: string]: unknown;
|
[key: string]: unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -38,6 +39,7 @@ export interface KtxSnowflakeResolvedConnectionConfig {
|
||||||
privateKey?: string;
|
privateKey?: string;
|
||||||
passphrase?: string;
|
passphrase?: string;
|
||||||
role?: string;
|
role?: string;
|
||||||
|
maxSessions: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface KtxSnowflakeRawColumnMetadata {
|
export interface KtxSnowflakeRawColumnMetadata {
|
||||||
|
|
@ -132,6 +134,23 @@ function stringConfigValue(
|
||||||
return typeof value === 'string' && value.trim().length > 0 ? resolveStringReference(value.trim(), env) : undefined;
|
return typeof value === 'string' && value.trim().length > 0 ? resolveStringReference(value.trim(), env) : undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function positiveIntegerConfigValue(input: {
|
||||||
|
connection: KtxSnowflakeConnectionConfig;
|
||||||
|
key: keyof KtxSnowflakeConnectionConfig;
|
||||||
|
connectionId: string;
|
||||||
|
defaultValue: number;
|
||||||
|
}): number {
|
||||||
|
const value = input.connection[input.key];
|
||||||
|
if (value === undefined) {
|
||||||
|
return input.defaultValue;
|
||||||
|
}
|
||||||
|
const numberValue = Number(value);
|
||||||
|
if (!Number.isInteger(numberValue) || numberValue < 1) {
|
||||||
|
throw new Error(`connections.${input.connectionId}.${String(input.key)} must be a positive integer`);
|
||||||
|
}
|
||||||
|
return numberValue;
|
||||||
|
}
|
||||||
|
|
||||||
function schemaNames(connection: KtxSnowflakeConnectionConfig, env: NodeJS.ProcessEnv): string[] {
|
function schemaNames(connection: KtxSnowflakeConnectionConfig, env: NodeJS.ProcessEnv): string[] {
|
||||||
if (Array.isArray(connection.schema_names) && connection.schema_names.length > 0) {
|
if (Array.isArray(connection.schema_names) && connection.schema_names.length > 0) {
|
||||||
return connection.schema_names
|
return connection.schema_names
|
||||||
|
|
@ -230,6 +249,12 @@ export function snowflakeConnectionConfigFromConfig(input: {
|
||||||
database,
|
database,
|
||||||
schemas: resolvedSchemas,
|
schemas: resolvedSchemas,
|
||||||
username,
|
username,
|
||||||
|
maxSessions: positiveIntegerConfigValue({
|
||||||
|
connection: input.connection,
|
||||||
|
key: 'maxSessions',
|
||||||
|
connectionId: input.connectionId,
|
||||||
|
defaultValue: 4,
|
||||||
|
}),
|
||||||
};
|
};
|
||||||
const role = stringConfigValue(input.connection, 'role', env);
|
const role = stringConfigValue(input.connection, 'role', env);
|
||||||
if (role) {
|
if (role) {
|
||||||
|
|
@ -265,6 +290,7 @@ class DefaultSnowflakeDriverFactory implements KtxSnowflakeDriverFactory {
|
||||||
|
|
||||||
class SnowflakeSdkDriver implements KtxSnowflakeDriver {
|
class SnowflakeSdkDriver implements KtxSnowflakeDriver {
|
||||||
private closeSdkOptions: Array<() => Promise<void>> = [];
|
private closeSdkOptions: Array<() => Promise<void>> = [];
|
||||||
|
private pool: ReturnType<typeof snowflake.createPool> | null = null;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
private readonly resolved: KtxSnowflakeResolvedConnectionConfig,
|
private readonly resolved: KtxSnowflakeResolvedConnectionConfig,
|
||||||
|
|
@ -285,16 +311,21 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver {
|
||||||
}
|
}
|
||||||
|
|
||||||
async query(sql: string, params?: unknown): Promise<KtxQueryResult> {
|
async query(sql: string, params?: unknown): Promise<KtxQueryResult> {
|
||||||
let connection: Connection | null = null;
|
const binds = Array.isArray(params) ? toSnowflakeBinds(params) : undefined;
|
||||||
try {
|
try {
|
||||||
connection = await this.createConnection();
|
const pool = await this.getPool();
|
||||||
const binds = Array.isArray(params) ? toSnowflakeBinds(params) : undefined;
|
const result = await pool.use(async (connection: snowflake.Connection) =>
|
||||||
const result = await this.executeSnowflakeQuery(connection, sql, binds);
|
this.executeSnowflakeQuery(connection, sql, binds),
|
||||||
|
);
|
||||||
return { ...result, totalRows: result.rows.length, rowCount: result.rows.length };
|
return { ...result, totalRows: result.rows.length, rowCount: result.rows.length };
|
||||||
} finally {
|
} catch (error) {
|
||||||
if (connection) {
|
const message = error instanceof Error ? error.message : String(error);
|
||||||
await this.destroyConnection(connection);
|
if (/timeout/i.test(message) && /pool|acquire/i.test(message)) {
|
||||||
|
throw new Error(
|
||||||
|
"Snowflake session pool exhausted after 60s - consider lowering maxSessions or increasing your account's concurrent-statement limit.",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
throw error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -375,27 +406,41 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver {
|
||||||
}
|
}
|
||||||
|
|
||||||
async cleanup(): Promise<void> {
|
async cleanup(): Promise<void> {
|
||||||
|
const pool = this.pool;
|
||||||
|
this.pool = null;
|
||||||
|
if (pool) {
|
||||||
|
// Drain before clear so in-flight Snowflake statements finish before idle
|
||||||
|
// sessions are closed.
|
||||||
|
await pool.drain();
|
||||||
|
await pool.clear();
|
||||||
|
}
|
||||||
const closers = this.closeSdkOptions;
|
const closers = this.closeSdkOptions;
|
||||||
this.closeSdkOptions = [];
|
this.closeSdkOptions = [];
|
||||||
await Promise.all(closers.map((close) => close()));
|
await Promise.all(closers.map((close) => Promise.resolve(close())));
|
||||||
}
|
}
|
||||||
|
|
||||||
private async runTest(): Promise<{ success: boolean; error?: string }> {
|
private async runTest(): Promise<{ success: boolean; error?: string }> {
|
||||||
let connection: Connection | null = null;
|
|
||||||
try {
|
try {
|
||||||
connection = await this.createConnection();
|
await this.query('SELECT 1');
|
||||||
await this.executeSnowflakeQuery(connection, 'SELECT 1');
|
|
||||||
return { success: true };
|
return { success: true };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
return { success: false, error: error instanceof Error ? error.message : String(error) };
|
return { success: false, error: error instanceof Error ? error.message : String(error) };
|
||||||
} finally {
|
|
||||||
if (connection) {
|
|
||||||
await this.destroyConnection(connection);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async createConnection(): Promise<Connection> {
|
private async getPool(): Promise<ReturnType<typeof snowflake.createPool>> {
|
||||||
|
if (!this.pool) {
|
||||||
|
this.pool = snowflake.createPool(await this.resolveConnectionOptions(), {
|
||||||
|
min: 0,
|
||||||
|
max: this.resolved.maxSessions,
|
||||||
|
evictionRunIntervalMillis: 30_000,
|
||||||
|
acquireTimeoutMillis: 60_000,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return this.pool;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async resolveConnectionOptions(): Promise<snowflake.ConnectionOptions> {
|
||||||
const patch = await this.sdkOptionsProvider?.resolve({
|
const patch = await this.sdkOptionsProvider?.resolve({
|
||||||
account: this.resolved.account,
|
account: this.resolved.account,
|
||||||
connection: { ...this.resolved, driver: 'snowflake' },
|
connection: { ...this.resolved, driver: 'snowflake' },
|
||||||
|
|
@ -411,47 +456,13 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver {
|
||||||
database: this.resolved.database,
|
database: this.resolved.database,
|
||||||
...(sessionSchema ? { schema: sessionSchema } : {}),
|
...(sessionSchema ? { schema: sessionSchema } : {}),
|
||||||
role: this.resolved.role,
|
role: this.resolved.role,
|
||||||
|
clientSessionKeepAlive: true,
|
||||||
|
clientSessionKeepAliveHeartbeatFrequency: 900,
|
||||||
...patch?.sdkOptions,
|
...patch?.sdkOptions,
|
||||||
};
|
};
|
||||||
const connectionConfig: ConnectionOptions =
|
return this.resolved.authMethod === 'rsa'
|
||||||
this.resolved.authMethod === 'rsa'
|
? { ...baseConfig, authenticator: 'SNOWFLAKE_JWT', privateKey: this.decryptPrivateKey() }
|
||||||
? { ...baseConfig, authenticator: 'SNOWFLAKE_JWT', privateKey: this.decryptPrivateKey() }
|
: { ...baseConfig, password: this.resolved.password };
|
||||||
: { ...baseConfig, password: this.resolved.password };
|
|
||||||
const connection = snowflake.createConnection(connectionConfig);
|
|
||||||
return new Promise((resolveConnection, rejectConnection) => {
|
|
||||||
connection.connect((error, connected) => {
|
|
||||||
if (error) {
|
|
||||||
rejectConnection(error);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const resolvedConnection = connected ?? connection;
|
|
||||||
this.setConnectionContext(resolvedConnection).then(
|
|
||||||
() => resolveConnection(resolvedConnection),
|
|
||||||
(contextError) => {
|
|
||||||
resolvedConnection.destroy(() => undefined);
|
|
||||||
rejectConnection(contextError);
|
|
||||||
},
|
|
||||||
);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private async setConnectionContext(connection: Connection): Promise<void> {
|
|
||||||
if (this.resolved.role) {
|
|
||||||
await this.executeSnowflakeQuery(connection, `USE ROLE ${quoteSnowflakeIdentifier(this.resolved.role, 'role')}`);
|
|
||||||
}
|
|
||||||
await this.executeSnowflakeQuery(
|
|
||||||
connection,
|
|
||||||
`USE WAREHOUSE ${quoteSnowflakeIdentifier(this.resolved.warehouse, 'warehouse')}`,
|
|
||||||
);
|
|
||||||
await this.executeSnowflakeQuery(
|
|
||||||
connection,
|
|
||||||
`USE DATABASE ${quoteSnowflakeIdentifier(this.resolved.database, 'database')}`,
|
|
||||||
);
|
|
||||||
await this.executeSnowflakeQuery(
|
|
||||||
connection,
|
|
||||||
`USE SCHEMA ${quoteSnowflakeIdentifier(this.resolved.schemas[0] ?? 'PUBLIC', 'schema')}`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private async executeSnowflakeQuery(
|
private async executeSnowflakeQuery(
|
||||||
|
|
@ -480,18 +491,6 @@ class SnowflakeSdkDriver implements KtxSnowflakeDriver {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private destroyConnection(connection: Connection): Promise<void> {
|
|
||||||
return new Promise((resolveDestroy, rejectDestroy) => {
|
|
||||||
connection.destroy((error) => {
|
|
||||||
if (error) {
|
|
||||||
rejectDestroy(error);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
resolveDestroy();
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private decryptPrivateKey(): string {
|
private decryptPrivateKey(): string {
|
||||||
if (!this.resolved.privateKey) {
|
if (!this.resolved.privateKey) {
|
||||||
throw new Error('Private key is required for RSA authentication');
|
throw new Error('Private key is required for RSA authentication');
|
||||||
|
|
|
||||||
|
|
@ -74,6 +74,7 @@ connections:
|
||||||
maxLlmTablesPerBatch: 40,
|
maxLlmTablesPerBatch: 40,
|
||||||
maxCandidatesPerColumn: 25,
|
maxCandidatesPerColumn: 25,
|
||||||
profileSampleRows: 10000,
|
profileSampleRows: 10000,
|
||||||
|
profileConcurrency: 4,
|
||||||
validationConcurrency: 4,
|
validationConcurrency: 4,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
@ -278,6 +279,7 @@ scan:
|
||||||
maxLlmTablesPerBatch: 12
|
maxLlmTablesPerBatch: 12
|
||||||
maxCandidatesPerColumn: 7
|
maxCandidatesPerColumn: 7
|
||||||
profileSampleRows: 500
|
profileSampleRows: 500
|
||||||
|
profileConcurrency: 3
|
||||||
validationConcurrency: 2
|
validationConcurrency: 2
|
||||||
validationBudget: 0
|
validationBudget: 0
|
||||||
`);
|
`);
|
||||||
|
|
@ -291,6 +293,7 @@ scan:
|
||||||
maxLlmTablesPerBatch: 12,
|
maxLlmTablesPerBatch: 12,
|
||||||
maxCandidatesPerColumn: 7,
|
maxCandidatesPerColumn: 7,
|
||||||
profileSampleRows: 500,
|
profileSampleRows: 500,
|
||||||
|
profileConcurrency: 3,
|
||||||
validationConcurrency: 2,
|
validationConcurrency: 2,
|
||||||
validationBudget: 0,
|
validationBudget: 0,
|
||||||
});
|
});
|
||||||
|
|
@ -302,6 +305,7 @@ scan:
|
||||||
expect(serializeKtxProjectConfig(config)).toContain('maxLlmTablesPerBatch: 12');
|
expect(serializeKtxProjectConfig(config)).toContain('maxLlmTablesPerBatch: 12');
|
||||||
expect(serializeKtxProjectConfig(config)).toContain('maxCandidatesPerColumn: 7');
|
expect(serializeKtxProjectConfig(config)).toContain('maxCandidatesPerColumn: 7');
|
||||||
expect(serializeKtxProjectConfig(config)).toContain('profileSampleRows: 500');
|
expect(serializeKtxProjectConfig(config)).toContain('profileSampleRows: 500');
|
||||||
|
expect(serializeKtxProjectConfig(config)).toContain('profileConcurrency: 3');
|
||||||
expect(serializeKtxProjectConfig(config)).toContain('validationConcurrency: 2');
|
expect(serializeKtxProjectConfig(config)).toContain('validationConcurrency: 2');
|
||||||
expect(serializeKtxProjectConfig(config)).toContain('validationBudget: 0');
|
expect(serializeKtxProjectConfig(config)).toContain('validationBudget: 0');
|
||||||
});
|
});
|
||||||
|
|
@ -326,6 +330,7 @@ scan:
|
||||||
maxLlmTablesPerBatch: 0
|
maxLlmTablesPerBatch: 0
|
||||||
maxCandidatesPerColumn: -4
|
maxCandidatesPerColumn: -4
|
||||||
profileSampleRows: 0
|
profileSampleRows: 0
|
||||||
|
profileConcurrency: 0
|
||||||
validationConcurrency: 0
|
validationConcurrency: 0
|
||||||
validationBudget: 1.5
|
validationBudget: 1.5
|
||||||
`;
|
`;
|
||||||
|
|
@ -341,6 +346,7 @@ scan:
|
||||||
'scan.relationships.maxLlmTablesPerBatch',
|
'scan.relationships.maxLlmTablesPerBatch',
|
||||||
'scan.relationships.maxCandidatesPerColumn',
|
'scan.relationships.maxCandidatesPerColumn',
|
||||||
'scan.relationships.profileSampleRows',
|
'scan.relationships.profileSampleRows',
|
||||||
|
'scan.relationships.profileConcurrency',
|
||||||
'scan.relationships.validationConcurrency',
|
'scan.relationships.validationConcurrency',
|
||||||
'scan.relationships.validationBudget',
|
'scan.relationships.validationBudget',
|
||||||
]),
|
]),
|
||||||
|
|
|
||||||
|
|
@ -163,6 +163,11 @@ const scanRelationshipsSchema = z
|
||||||
.default(25)
|
.default(25)
|
||||||
.describe('Maximum number of candidate join partners considered per column during relationship discovery.'),
|
.describe('Maximum number of candidate join partners considered per column during relationship discovery.'),
|
||||||
profileSampleRows: z.int().positive().default(10000).describe('Number of rows sampled per table when profiling values for relationship inference.'),
|
profileSampleRows: z.int().positive().default(10000).describe('Number of rows sampled per table when profiling values for relationship inference.'),
|
||||||
|
profileConcurrency: z
|
||||||
|
.int()
|
||||||
|
.positive()
|
||||||
|
.default(4)
|
||||||
|
.describe('Parallel relationship-profile queries run against the database during scan.'),
|
||||||
validationConcurrency: z.int().positive().default(4).describe('Number of relationship validation queries run in parallel against the database.'),
|
validationConcurrency: z.int().positive().default(4).describe('Number of relationship validation queries run in parallel against the database.'),
|
||||||
validationBudget: z
|
validationBudget: z
|
||||||
.union([z.literal('all'), z.int().nonnegative()])
|
.union([z.literal('all'), z.int().nonnegative()])
|
||||||
|
|
|
||||||
|
|
@ -378,6 +378,121 @@ describe('KtxDescriptionGenerator', () => {
|
||||||
expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
|
expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
|
||||||
expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
|
expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('generates one structured table description and reuses table samples for all columns', async () => {
|
||||||
|
const llmRuntime = createLlmProvider('unused');
|
||||||
|
llmRuntime.generateObject = vi.fn(async () => ({
|
||||||
|
tableDescription: 'Commerce orders',
|
||||||
|
columns: [
|
||||||
|
{ name: 'status', description: 'Current order state' },
|
||||||
|
{ name: 'amount', description: 'Order amount in dollars' },
|
||||||
|
],
|
||||||
|
}));
|
||||||
|
const connector = createConnector();
|
||||||
|
const generator = new KtxDescriptionGenerator({
|
||||||
|
llmRuntime,
|
||||||
|
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await generator.generateBatchedTableDescriptions({
|
||||||
|
connectionId: 'conn-1',
|
||||||
|
connector,
|
||||||
|
context: { runId: 'run-1' },
|
||||||
|
dataSourceType: 'POSTGRESQL',
|
||||||
|
supportsNestedAnalysis: false,
|
||||||
|
table: {
|
||||||
|
catalog: null,
|
||||||
|
db: 'public',
|
||||||
|
name: 'orders',
|
||||||
|
rawDescriptions: { db: 'Orders fact table' },
|
||||||
|
columns: [
|
||||||
|
{ name: 'status', type: 'text' },
|
||||||
|
{ name: 'amount', type: 'numeric' },
|
||||||
|
],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.tableDescription).toBe('Commerce orders');
|
||||||
|
expect(Object.fromEntries(result.columnDescriptions)).toEqual({
|
||||||
|
status: 'Current order state',
|
||||||
|
amount: 'Order amount in dollars',
|
||||||
|
});
|
||||||
|
expect(connector.sampleTable).toHaveBeenCalledTimes(1);
|
||||||
|
expect(connector.sampleColumn).not.toHaveBeenCalled();
|
||||||
|
expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
|
||||||
|
expect(llmRuntime.generateText).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('falls back to one column generateText call for each missing structured column', async () => {
|
||||||
|
const llmRuntime = createLlmProvider('Fallback status');
|
||||||
|
llmRuntime.generateObject = vi.fn(async () => ({
|
||||||
|
tableDescription: 'Commerce orders',
|
||||||
|
columns: [{ name: 'amount', description: 'Order amount in dollars' }],
|
||||||
|
}));
|
||||||
|
const connector = createConnector();
|
||||||
|
const generator = new KtxDescriptionGenerator({
|
||||||
|
llmRuntime,
|
||||||
|
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await generator.generateBatchedTableDescriptions({
|
||||||
|
connectionId: 'conn-1',
|
||||||
|
connector,
|
||||||
|
context: { runId: 'run-1' },
|
||||||
|
dataSourceType: 'POSTGRESQL',
|
||||||
|
supportsNestedAnalysis: false,
|
||||||
|
table: {
|
||||||
|
catalog: null,
|
||||||
|
db: 'public',
|
||||||
|
name: 'orders',
|
||||||
|
columns: [
|
||||||
|
{ name: 'status', type: 'text' },
|
||||||
|
{ name: 'amount', type: 'numeric' },
|
||||||
|
],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(Object.fromEntries(result.columnDescriptions)).toEqual({
|
||||||
|
status: 'Fallback status',
|
||||||
|
amount: 'Order amount in dollars',
|
||||||
|
});
|
||||||
|
expect(connector.sampleColumn).not.toHaveBeenCalled();
|
||||||
|
expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
|
||||||
|
expect(llmRuntime.generateText).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does not run per-column fallback when structured object generation throws', async () => {
|
||||||
|
const llmRuntime = createLlmProvider('Fallback description');
|
||||||
|
llmRuntime.generateObject = vi.fn(async () => {
|
||||||
|
throw new Error('object output unavailable');
|
||||||
|
});
|
||||||
|
const warnings: string[] = [];
|
||||||
|
const generator = new KtxDescriptionGenerator({
|
||||||
|
llmRuntime,
|
||||||
|
onWarning: (warning) => warnings.push(warning.code),
|
||||||
|
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await generator.generateBatchedTableDescriptions({
|
||||||
|
connectionId: 'conn-1',
|
||||||
|
connector: createConnector(),
|
||||||
|
context: { runId: 'run-1' },
|
||||||
|
dataSourceType: 'POSTGRESQL',
|
||||||
|
supportsNestedAnalysis: false,
|
||||||
|
table: {
|
||||||
|
catalog: null,
|
||||||
|
db: 'public',
|
||||||
|
name: 'orders',
|
||||||
|
columns: [{ name: 'status', type: 'text' }],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.tableDescription).toBeNull();
|
||||||
|
expect(Object.fromEntries(result.columnDescriptions)).toEqual({ status: null });
|
||||||
|
expect(warnings).toContain('enrichment_failed');
|
||||||
|
expect(llmRuntime.generateObject).toHaveBeenCalledTimes(1);
|
||||||
|
expect(llmRuntime.generateText).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('KtxDescriptionGenerator resilience', () => {
|
describe('KtxDescriptionGenerator resilience', () => {
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||||
|
import { z } from 'zod';
|
||||||
import type {
|
import type {
|
||||||
KtxColumnSampleInput,
|
KtxColumnSampleInput,
|
||||||
KtxColumnSampleResult,
|
KtxColumnSampleResult,
|
||||||
|
|
@ -53,7 +54,7 @@ export interface KtxDescriptionColumn {
|
||||||
sampleValues?: unknown[];
|
sampleValues?: unknown[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface KtxDescriptionColumnTable extends KtxTableRef {
|
interface KtxDescriptionColumnTable extends KtxTableRef {
|
||||||
columns: KtxDescriptionColumn[];
|
columns: KtxDescriptionColumn[];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -112,6 +113,23 @@ export interface KtxGenerateTableDescriptionInput {
|
||||||
table: KtxDescriptionTableInput;
|
table: KtxDescriptionTableInput;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface KtxGenerateBatchedTableDescriptionsInput {
|
||||||
|
connectionId: string;
|
||||||
|
connector: KtxDescriptionSamplingPort;
|
||||||
|
context: KtxScanContext;
|
||||||
|
dataSourceType: string;
|
||||||
|
supportsNestedAnalysis: boolean;
|
||||||
|
table: KtxDescriptionColumnTable & {
|
||||||
|
rawDescriptions?: Record<string, string>;
|
||||||
|
columns: Array<KtxDescriptionColumn & { type?: string; comment?: string | null }>;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface KtxBatchedTableDescriptionsResult {
|
||||||
|
tableDescription: string | null;
|
||||||
|
columnDescriptions: Map<string, string | null>;
|
||||||
|
}
|
||||||
|
|
||||||
export interface KtxGenerateDataSourceDescriptionInput {
|
export interface KtxGenerateDataSourceDescriptionInput {
|
||||||
connectionId: string;
|
connectionId: string;
|
||||||
connector: KtxDescriptionSamplingPort;
|
connector: KtxDescriptionSamplingPort;
|
||||||
|
|
@ -136,6 +154,18 @@ interface ColumnTaskResult {
|
||||||
skipped: boolean;
|
skipped: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const batchedTableDescriptionSchema = z.object({
|
||||||
|
tableDescription: z.string(),
|
||||||
|
columns: z.array(
|
||||||
|
z.object({
|
||||||
|
name: z.string(),
|
||||||
|
description: z.string(),
|
||||||
|
}),
|
||||||
|
),
|
||||||
|
});
|
||||||
|
|
||||||
|
type BatchedTableDescriptionOutput = z.infer<typeof batchedTableDescriptionSchema>;
|
||||||
|
|
||||||
function descriptionSources(rawDescriptions: Record<string, string> | undefined): Array<[string, string]> {
|
function descriptionSources(rawDescriptions: Record<string, string> | undefined): Array<[string, string]> {
|
||||||
if (!rawDescriptions) {
|
if (!rawDescriptions) {
|
||||||
return [];
|
return [];
|
||||||
|
|
@ -250,6 +280,76 @@ function wordLimitLine(maxWords: number): string {
|
||||||
return `Please provide a concise description in ${maxWords} words or less.`;
|
return `Please provide a concise description in ${maxWords} words or less.`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function sampleValuesByColumn(
|
||||||
|
columns: readonly KtxDescriptionColumn[],
|
||||||
|
sampleData: KtxTableSampleResult | null,
|
||||||
|
): Map<string, unknown[]> {
|
||||||
|
const values = new Map<string, unknown[]>();
|
||||||
|
for (const column of columns) {
|
||||||
|
const existingValues = column.sampleValues?.filter((value) => value !== null && value !== undefined) ?? [];
|
||||||
|
if (existingValues.length > 0) {
|
||||||
|
values.set(column.name, existingValues);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!sampleData) {
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
for (const column of columns) {
|
||||||
|
const index = sampleData.headers.findIndex((header) => header.toLowerCase() === column.name.toLowerCase());
|
||||||
|
if (index < 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const sampledValues = sampleData.rows
|
||||||
|
.map((row) => row[index])
|
||||||
|
.filter((value) => value !== null && value !== undefined);
|
||||||
|
if (sampledValues.length > 0) {
|
||||||
|
values.set(column.name, sampledValues);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
function batchedPrompt(input: {
|
||||||
|
table: KtxGenerateBatchedTableDescriptionsInput['table'];
|
||||||
|
sampleData: KtxTableSampleResult | null;
|
||||||
|
dataSourceType: string;
|
||||||
|
tableMaxWords: number;
|
||||||
|
columnMaxWords: number;
|
||||||
|
}): KtxDescriptionPrompt {
|
||||||
|
const columnLines = input.table.columns
|
||||||
|
.map((column) => {
|
||||||
|
const typePart = column.type ? ` (${column.type})` : '';
|
||||||
|
const commentPart = column.rawDescriptions?.db ? ` - ${column.rawDescriptions.db}` : '';
|
||||||
|
return `- ${column.name}${typePart}${commentPart}`;
|
||||||
|
})
|
||||||
|
.join('\n');
|
||||||
|
const sampleLines =
|
||||||
|
input.sampleData && input.sampleData.rows.length > 0
|
||||||
|
? input.sampleData.rows
|
||||||
|
.slice(0, 5)
|
||||||
|
.map((row) =>
|
||||||
|
input.sampleData!.headers.map((header, index) => `${header}=${String(row[index] ?? '')}`).join(', '),
|
||||||
|
)
|
||||||
|
.join('\n')
|
||||||
|
: 'unavailable';
|
||||||
|
return {
|
||||||
|
system: [
|
||||||
|
'Analyze one database table and return structured JSON matching the supplied schema.',
|
||||||
|
`The table description must be ${input.tableMaxWords} words or less.`,
|
||||||
|
`Each column description must be ${input.columnMaxWords} words or less.`,
|
||||||
|
'Describe business meaning directly. Do not repeat table or column names as filler.',
|
||||||
|
].join('\n'),
|
||||||
|
user: [
|
||||||
|
`Table: ${input.table.name}`,
|
||||||
|
`Data source type: ${input.dataSourceType}`,
|
||||||
|
'Columns:',
|
||||||
|
columnLines,
|
||||||
|
'Sample rows:',
|
||||||
|
sampleLines,
|
||||||
|
].join('\n'),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
/** @internal */
|
/** @internal */
|
||||||
export function buildKtxColumnDescriptionPrompt(
|
export function buildKtxColumnDescriptionPrompt(
|
||||||
input: KtxColumnDescriptionPromptInput & { maxWords?: number },
|
input: KtxColumnDescriptionPromptInput & { maxWords?: number },
|
||||||
|
|
@ -582,6 +682,156 @@ export class KtxDescriptionGenerator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async generateBatchedTableDescriptions(
|
||||||
|
input: KtxGenerateBatchedTableDescriptionsInput,
|
||||||
|
): Promise<KtxBatchedTableDescriptionsResult> {
|
||||||
|
const tableRef = toTableRef(input.table);
|
||||||
|
let sampleData: KtxTableSampleResult | null = null;
|
||||||
|
let fallbackReason: 'capability_missing' | 'sampling_failed' | 'empty_sample' | null = null;
|
||||||
|
if (!input.connector.sampleTable) {
|
||||||
|
fallbackReason = 'capability_missing';
|
||||||
|
this.logger?.warn('KTX scan connector does not support table sampling; falling back to metadata-only prompt', {
|
||||||
|
connectorId: input.connector.id,
|
||||||
|
table: input.table.name,
|
||||||
|
});
|
||||||
|
this.onWarning?.({
|
||||||
|
code: 'connector_capability_missing',
|
||||||
|
message: `Connector ${input.connector.id} does not support sampleTable; using metadata-only description prompt`,
|
||||||
|
table: input.table.name,
|
||||||
|
recoverable: true,
|
||||||
|
metadata: { connectorId: input.connector.id, capability: 'sampleTable' },
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
sampleData = await retryAsync(
|
||||||
|
() =>
|
||||||
|
input.connector.sampleTable!(
|
||||||
|
{
|
||||||
|
connectionId: input.connectionId,
|
||||||
|
table: tableRef,
|
||||||
|
limit: 20,
|
||||||
|
},
|
||||||
|
input.context,
|
||||||
|
),
|
||||||
|
{
|
||||||
|
attempts: 3,
|
||||||
|
baseDelayMs: 200,
|
||||||
|
signal: input.context.signal,
|
||||||
|
onAttemptFailure: (error, attempt) => {
|
||||||
|
this.logger?.warn(`sampleTable attempt ${attempt} failed for ${input.table.name}: ${errorMessage(error)}`, {
|
||||||
|
connectorId: input.connector.id,
|
||||||
|
table: input.table.name,
|
||||||
|
attempt,
|
||||||
|
});
|
||||||
|
},
|
||||||
|
},
|
||||||
|
);
|
||||||
|
if (sampleData.rows.length === 0) {
|
||||||
|
fallbackReason = 'empty_sample';
|
||||||
|
this.logger?.warn('sampleTable returned no rows; using metadata-only prompt', {
|
||||||
|
connectorId: input.connector.id,
|
||||||
|
table: input.table.name,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
if (error instanceof KtxAbortedError) {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
fallbackReason = 'sampling_failed';
|
||||||
|
this.logger?.error(`sampleTable exhausted retries for ${input.table.name}: ${errorMessage(error)}`, {
|
||||||
|
connectorId: input.connector.id,
|
||||||
|
table: input.table.name,
|
||||||
|
});
|
||||||
|
this.onWarning?.({
|
||||||
|
code: 'sampling_failed',
|
||||||
|
message: `Failed to sample table ${input.table.name} after retries: ${errorMessage(error)}`,
|
||||||
|
table: input.table.name,
|
||||||
|
recoverable: true,
|
||||||
|
metadata: { connectorId: input.connector.id, error: errorMessage(error) },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const sampleValues = sampleValuesByColumn(input.table.columns, sampleData);
|
||||||
|
const descriptions = new Map<string, string | null>();
|
||||||
|
let tableDescription: string | null = null;
|
||||||
|
let structuredGenerationSucceeded = false;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const prompt = batchedPrompt({
|
||||||
|
table: input.table,
|
||||||
|
sampleData,
|
||||||
|
dataSourceType: input.dataSourceType,
|
||||||
|
tableMaxWords: this.settings.tableMaxWords,
|
||||||
|
columnMaxWords: this.settings.columnMaxWords,
|
||||||
|
});
|
||||||
|
const generated = await this.llmRuntime.generateObject<
|
||||||
|
BatchedTableDescriptionOutput,
|
||||||
|
typeof batchedTableDescriptionSchema
|
||||||
|
>({
|
||||||
|
role: 'candidateExtraction',
|
||||||
|
system: prompt.system,
|
||||||
|
prompt: prompt.user,
|
||||||
|
schema: batchedTableDescriptionSchema,
|
||||||
|
temperature: this.settings.temperature,
|
||||||
|
});
|
||||||
|
structuredGenerationSucceeded = true;
|
||||||
|
tableDescription = generated.tableDescription.trim() || null;
|
||||||
|
const generatedColumns = new Map(
|
||||||
|
generated.columns.map((column) => [column.name.toLowerCase(), column.description.trim() || null]),
|
||||||
|
);
|
||||||
|
for (const column of input.table.columns) {
|
||||||
|
const description = generatedColumns.get(column.name.toLowerCase()) ?? null;
|
||||||
|
descriptions.set(column.name, description);
|
||||||
|
}
|
||||||
|
if (tableDescription && fallbackReason !== null) {
|
||||||
|
this.onWarning?.({
|
||||||
|
code: 'description_fallback_used',
|
||||||
|
message: `Generated table description without sample rows for ${input.table.name} (reason: ${fallbackReason})`,
|
||||||
|
table: input.table.name,
|
||||||
|
recoverable: true,
|
||||||
|
metadata: { connectorId: input.connector.id, reason: fallbackReason },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
this.logger?.warn(`Batched table description failed for ${input.table.name}: ${errorMessage(error)}`, {
|
||||||
|
connectorId: input.connector.id,
|
||||||
|
table: input.table.name,
|
||||||
|
});
|
||||||
|
this.onWarning?.({
|
||||||
|
code: 'enrichment_failed',
|
||||||
|
message: `Failed to generate batched description for table ${input.table.name}: ${errorMessage(error)}`,
|
||||||
|
table: input.table.name,
|
||||||
|
recoverable: true,
|
||||||
|
metadata: { connectorId: input.connector.id },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!structuredGenerationSucceeded) {
|
||||||
|
for (const column of input.table.columns) {
|
||||||
|
descriptions.set(column.name, null);
|
||||||
|
}
|
||||||
|
return { tableDescription, columnDescriptions: descriptions };
|
||||||
|
}
|
||||||
|
|
||||||
|
const tableContext = `Table: ${input.table.name} | Columns: ${input.table.columns.map((column) => column.name).join(', ')} | Data source: ${input.dataSourceType}`;
|
||||||
|
for (const column of input.table.columns) {
|
||||||
|
if (descriptions.get(column.name)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const fallback = await this.generateColumnDescriptionFromPreparedValues({
|
||||||
|
column,
|
||||||
|
columnValues: sampleValues.get(column.name) ?? [],
|
||||||
|
tableContext,
|
||||||
|
dataSourceType: input.dataSourceType,
|
||||||
|
supportsNestedAnalysis: input.supportsNestedAnalysis,
|
||||||
|
});
|
||||||
|
descriptions.set(column.name, fallback);
|
||||||
|
}
|
||||||
|
|
||||||
|
return { tableDescription, columnDescriptions: descriptions };
|
||||||
|
}
|
||||||
|
|
||||||
async generateDataSourceDescription(input: KtxGenerateDataSourceDescriptionInput): Promise<string | null> {
|
async generateDataSourceDescription(input: KtxGenerateDataSourceDescriptionInput): Promise<string | null> {
|
||||||
if (input.tables.length === 0) {
|
if (input.tables.length === 0) {
|
||||||
return 'No tables found in database';
|
return 'No tables found in database';
|
||||||
|
|
@ -732,27 +982,13 @@ export class KtxDescriptionGenerator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined);
|
const description = await this.generateColumnDescriptionFromPreparedValues({
|
||||||
const hasRawDescriptions = descriptionSources(column.rawDescriptions).length > 0;
|
column,
|
||||||
if (nonNullValues.length === 0 && !hasRawDescriptions) {
|
columnValues: columnValues ?? [],
|
||||||
return {
|
|
||||||
columnName: column.name,
|
|
||||||
description: null,
|
|
||||||
skipped: false,
|
|
||||||
processed: false,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const prompt = buildKtxColumnDescriptionPrompt({
|
|
||||||
columnName: column.name,
|
|
||||||
columnValues: nonNullValues,
|
|
||||||
tableContext,
|
tableContext,
|
||||||
dataSourceType: input.dataSourceType,
|
dataSourceType: input.dataSourceType,
|
||||||
supportsNestedAnalysis: input.supportsNestedAnalysis,
|
supportsNestedAnalysis: input.supportsNestedAnalysis,
|
||||||
rawDescriptions: column.rawDescriptions,
|
|
||||||
maxWords: this.settings.columnMaxWords,
|
|
||||||
});
|
});
|
||||||
const description = await this.generateAiDescription(prompt, 'ktx-column-description');
|
|
||||||
|
|
||||||
if (cacheKey && description) {
|
if (cacheKey && description) {
|
||||||
await this.cache?.set(cacheKey, description);
|
await this.cache?.set(cacheKey, description);
|
||||||
|
|
@ -782,6 +1018,30 @@ export class KtxDescriptionGenerator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private async generateColumnDescriptionFromPreparedValues(input: {
|
||||||
|
column: KtxDescriptionColumn;
|
||||||
|
columnValues: unknown[];
|
||||||
|
tableContext: string;
|
||||||
|
dataSourceType: string;
|
||||||
|
supportsNestedAnalysis: boolean;
|
||||||
|
}): Promise<string | null> {
|
||||||
|
const nonNullValues = input.columnValues.filter((value) => value !== null && value !== undefined);
|
||||||
|
const hasRawDescriptions = descriptionSources(input.column.rawDescriptions).length > 0;
|
||||||
|
if (nonNullValues.length === 0 && !hasRawDescriptions) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
const prompt = buildKtxColumnDescriptionPrompt({
|
||||||
|
columnName: input.column.name,
|
||||||
|
columnValues: nonNullValues,
|
||||||
|
tableContext: input.tableContext,
|
||||||
|
dataSourceType: input.dataSourceType,
|
||||||
|
supportsNestedAnalysis: input.supportsNestedAnalysis,
|
||||||
|
rawDescriptions: input.column.rawDescriptions,
|
||||||
|
maxWords: this.settings.columnMaxWords,
|
||||||
|
});
|
||||||
|
return this.generateAiDescription(prompt, 'ktx-column-description');
|
||||||
|
}
|
||||||
|
|
||||||
private async generateAiDescription(prompt: KtxDescriptionPrompt, _operationName: string): Promise<string | null> {
|
private async generateAiDescription(prompt: KtxDescriptionPrompt, _operationName: string): Promise<string | null> {
|
||||||
try {
|
try {
|
||||||
const text = await this.llmRuntime.generateText({
|
const text = await this.llmRuntime.generateText({
|
||||||
|
|
|
||||||
|
|
@ -289,6 +289,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
|
||||||
maxLlmTablesPerBatch: 12,
|
maxLlmTablesPerBatch: 12,
|
||||||
maxCandidatesPerColumn: 7,
|
maxCandidatesPerColumn: 7,
|
||||||
profileSampleRows: 500,
|
profileSampleRows: 500,
|
||||||
|
profileConcurrency: 3,
|
||||||
validationConcurrency: 2,
|
validationConcurrency: 2,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
@ -378,6 +379,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
|
||||||
validationRequiredForManifest: true,
|
validationRequiredForManifest: true,
|
||||||
maxCandidatesPerColumn: 7,
|
maxCandidatesPerColumn: 7,
|
||||||
profileSampleRows: 500,
|
profileSampleRows: 500,
|
||||||
|
profileConcurrency: 3,
|
||||||
validationConcurrency: 2,
|
validationConcurrency: 2,
|
||||||
},
|
},
|
||||||
profileWarnings: [],
|
profileWarnings: [],
|
||||||
|
|
@ -472,6 +474,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
|
||||||
maxLlmTablesPerBatch: 40,
|
maxLlmTablesPerBatch: 40,
|
||||||
maxCandidatesPerColumn: 25,
|
maxCandidatesPerColumn: 25,
|
||||||
profileSampleRows: 10000,
|
profileSampleRows: 10000,
|
||||||
|
profileConcurrency: 4,
|
||||||
validationConcurrency: 4,
|
validationConcurrency: 4,
|
||||||
},
|
},
|
||||||
dryRun: false,
|
dryRun: false,
|
||||||
|
|
@ -741,6 +744,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
|
||||||
maxLlmTablesPerBatch: 40,
|
maxLlmTablesPerBatch: 40,
|
||||||
maxCandidatesPerColumn: 25,
|
maxCandidatesPerColumn: 25,
|
||||||
profileSampleRows: 10000,
|
profileSampleRows: 10000,
|
||||||
|
profileConcurrency: 4,
|
||||||
validationConcurrency: 4,
|
validationConcurrency: 4,
|
||||||
},
|
},
|
||||||
dryRun: false,
|
dryRun: false,
|
||||||
|
|
|
||||||
|
|
@ -382,6 +382,7 @@ export async function writeLocalScanEnrichmentArtifacts(
|
||||||
validationRequiredForManifest: input.relationshipSettings.validationRequiredForManifest,
|
validationRequiredForManifest: input.relationshipSettings.validationRequiredForManifest,
|
||||||
maxCandidatesPerColumn: input.relationshipSettings.maxCandidatesPerColumn,
|
maxCandidatesPerColumn: input.relationshipSettings.maxCandidatesPerColumn,
|
||||||
profileSampleRows: input.relationshipSettings.profileSampleRows,
|
profileSampleRows: input.relationshipSettings.profileSampleRows,
|
||||||
|
profileConcurrency: input.relationshipSettings.profileConcurrency,
|
||||||
validationConcurrency: input.relationshipSettings.validationConcurrency,
|
validationConcurrency: input.relationshipSettings.validationConcurrency,
|
||||||
}
|
}
|
||||||
: undefined,
|
: undefined,
|
||||||
|
|
|
||||||
|
|
@ -299,6 +299,38 @@ describe('local scan enrichment', () => {
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('uses the supplied snapshot without calling connector.introspect', async () => {
|
||||||
|
const scanConnector = connector();
|
||||||
|
const introspect = vi.mocked(scanConnector.introspect);
|
||||||
|
|
||||||
|
const result = await runLocalScanEnrichment({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'structural',
|
||||||
|
connector: scanConnector,
|
||||||
|
snapshot,
|
||||||
|
context: { runId: 'scan-run-snapshot' },
|
||||||
|
providers: null,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.snapshot).toEqual(snapshot);
|
||||||
|
expect(introspect).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('falls back to connector.introspect when no snapshot is supplied', async () => {
|
||||||
|
const scanConnector = connector();
|
||||||
|
|
||||||
|
const result = await runLocalScanEnrichment({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'structural',
|
||||||
|
connector: scanConnector,
|
||||||
|
context: { runId: 'scan-run-introspect' },
|
||||||
|
providers: null,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.snapshot).toEqual(snapshot);
|
||||||
|
expect(scanConnector.introspect).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
|
||||||
it('runs deterministic relationship detection for relationship scans', async () => {
|
it('runs deterministic relationship detection for relationship scans', async () => {
|
||||||
const result = await runLocalScanEnrichment({
|
const result = await runLocalScanEnrichment({
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
|
|
@ -473,7 +505,7 @@ describe('local scan enrichment', () => {
|
||||||
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
||||||
});
|
});
|
||||||
|
|
||||||
it('generates table descriptions with bounded table-level concurrency', async () => {
|
it('generates batched table descriptions with bounded table-level concurrency', async () => {
|
||||||
const concurrentSnapshot: KtxSchemaSnapshot = {
|
const concurrentSnapshot: KtxSchemaSnapshot = {
|
||||||
...snapshot,
|
...snapshot,
|
||||||
tables: Array.from({ length: 8 }, (_, index) => ({
|
tables: Array.from({ length: 8 }, (_, index) => ({
|
||||||
|
|
@ -497,27 +529,27 @@ describe('local scan enrichment', () => {
|
||||||
],
|
],
|
||||||
})),
|
})),
|
||||||
};
|
};
|
||||||
let activeColumnSamples = 0;
|
let activeTableSamples = 0;
|
||||||
let maxActiveColumnSamples = 0;
|
let maxActiveTableSamples = 0;
|
||||||
const scanConnector = {
|
const scanConnector = {
|
||||||
...connector(),
|
...connector(),
|
||||||
introspect: vi.fn(async () => concurrentSnapshot),
|
introspect: vi.fn(async () => concurrentSnapshot),
|
||||||
sampleColumn: vi.fn(async () => {
|
sampleColumn: vi.fn(async () => ({
|
||||||
activeColumnSamples += 1;
|
values: ['1'],
|
||||||
maxActiveColumnSamples = Math.max(maxActiveColumnSamples, activeColumnSamples);
|
nullCount: 0,
|
||||||
|
distinctCount: 1,
|
||||||
|
})),
|
||||||
|
sampleTable: vi.fn(async () => {
|
||||||
|
activeTableSamples += 1;
|
||||||
|
maxActiveTableSamples = Math.max(maxActiveTableSamples, activeTableSamples);
|
||||||
await new Promise((resolve) => setTimeout(resolve, 10));
|
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||||
activeColumnSamples -= 1;
|
activeTableSamples -= 1;
|
||||||
return {
|
return {
|
||||||
values: ['1'],
|
headers: ['id'],
|
||||||
nullCount: 0,
|
rows: [[1]],
|
||||||
distinctCount: 1,
|
totalRows: 1,
|
||||||
};
|
};
|
||||||
}),
|
}),
|
||||||
sampleTable: vi.fn(async () => ({
|
|
||||||
headers: ['id'],
|
|
||||||
rows: [[1]],
|
|
||||||
totalRows: 1,
|
|
||||||
})),
|
|
||||||
};
|
};
|
||||||
const settings = {
|
const settings = {
|
||||||
...buildDefaultKtxProjectConfig().scan.relationships,
|
...buildDefaultKtxProjectConfig().scan.relationships,
|
||||||
|
|
@ -533,7 +565,8 @@ describe('local scan enrichment', () => {
|
||||||
relationshipSettings: settings,
|
relationshipSettings: settings,
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(maxActiveColumnSamples).toBe(6);
|
expect(maxActiveTableSamples).toBe(4);
|
||||||
|
expect(scanConnector.sampleColumn).not.toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
|
|
||||||
it('reports enrichment progress for countable stages', async () => {
|
it('reports enrichment progress for countable stages', async () => {
|
||||||
|
|
@ -675,7 +708,7 @@ describe('local scan enrichment', () => {
|
||||||
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
||||||
});
|
});
|
||||||
|
|
||||||
const generateText = vi.spyOn(providers.llmRuntime, 'generateText');
|
const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');
|
||||||
const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');
|
const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');
|
||||||
const second = await runLocalScanEnrichment({
|
const second = await runLocalScanEnrichment({
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
|
|
@ -693,7 +726,7 @@ describe('local scan enrichment', () => {
|
||||||
expect(first.state.resumedStages).toEqual([]);
|
expect(first.state.resumedStages).toEqual([]);
|
||||||
expect(second.state.resumedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
expect(second.state.resumedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||||
expect(second.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
expect(second.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||||
expect(generateText).not.toHaveBeenCalled();
|
expect(generateObject).not.toHaveBeenCalled();
|
||||||
expect(embedBatch).not.toHaveBeenCalled();
|
expect(embedBatch).not.toHaveBeenCalled();
|
||||||
expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
|
expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
|
||||||
expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
|
expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
|
||||||
|
|
@ -731,7 +764,7 @@ describe('local scan enrichment', () => {
|
||||||
tables: [{ ...firstTable, name: 'customers' }],
|
tables: [{ ...firstTable, name: 'customers' }],
|
||||||
})),
|
})),
|
||||||
};
|
};
|
||||||
const generateText = vi.spyOn(providers.llmRuntime, 'generateText');
|
const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');
|
||||||
|
|
||||||
const result = await runLocalScanEnrichment({
|
const result = await runLocalScanEnrichment({
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
|
|
@ -747,7 +780,7 @@ describe('local scan enrichment', () => {
|
||||||
|
|
||||||
expect(result.state.resumedStages).toEqual([]);
|
expect(result.state.resumedStages).toEqual([]);
|
||||||
expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||||
expect(generateText).toHaveBeenCalled();
|
expect(generateObject).toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
|
|
||||||
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
|
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import pLimit from 'p-limit';
|
import pLimit from 'p-limit';
|
||||||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||||
import { buildDefaultKtxProjectConfig, type KtxScanRelationshipConfig } from '../project/config.js';
|
import { buildDefaultKtxProjectConfig, type KtxScanRelationshipConfig } from '../project/config.js';
|
||||||
import { type KtxDescriptionColumnTable, KtxDescriptionGenerator } from './description-generation.js';
|
import { KtxDescriptionGenerator } from './description-generation.js';
|
||||||
import { buildKtxColumnEmbeddingText } from './embedding-text.js';
|
import { buildKtxColumnEmbeddingText } from './embedding-text.js';
|
||||||
import {
|
import {
|
||||||
completedKtxScanEnrichmentStateSummary,
|
completedKtxScanEnrichmentStateSummary,
|
||||||
|
|
@ -41,7 +41,7 @@ import type {
|
||||||
KtxTableRef,
|
KtxTableRef,
|
||||||
} from './types.js';
|
} from './types.js';
|
||||||
|
|
||||||
const DESCRIPTION_TABLE_CONCURRENCY = 6;
|
const DESCRIPTION_TABLE_CONCURRENCY = 4;
|
||||||
|
|
||||||
export interface KtxLocalScanEnrichmentProviders {
|
export interface KtxLocalScanEnrichmentProviders {
|
||||||
llmRuntime: KtxLlmRuntimePort;
|
llmRuntime: KtxLlmRuntimePort;
|
||||||
|
|
@ -53,6 +53,7 @@ export interface KtxLocalScanEnrichmentInput {
|
||||||
mode: KtxScanMode;
|
mode: KtxScanMode;
|
||||||
detectRelationships?: boolean;
|
detectRelationships?: boolean;
|
||||||
connector: KtxScanConnector;
|
connector: KtxScanConnector;
|
||||||
|
snapshot?: KtxSchemaSnapshot;
|
||||||
context: KtxScanContext;
|
context: KtxScanContext;
|
||||||
providers: KtxLocalScanEnrichmentProviders | null;
|
providers: KtxLocalScanEnrichmentProviders | null;
|
||||||
stateStore?: KtxScanEnrichmentStateStore | null;
|
stateStore?: KtxScanEnrichmentStateStore | null;
|
||||||
|
|
@ -179,7 +180,17 @@ function deterministicLlmRuntime(): KtxLlmRuntimePort {
|
||||||
async generateText(input) {
|
async generateText(input) {
|
||||||
return `Deterministic description for ${input.prompt.slice(0, 64).trim() || 'data source'}`;
|
return `Deterministic description for ${input.prompt.slice(0, 64).trim() || 'data source'}`;
|
||||||
},
|
},
|
||||||
async generateObject() {
|
async generateObject(input) {
|
||||||
|
if (input.prompt.includes('Sample rows:')) {
|
||||||
|
const columns = Array.from(input.prompt.matchAll(/^- ([^\s(]+)/gm), (match) => ({
|
||||||
|
name: match[1] ?? 'column',
|
||||||
|
description: `Deterministic description for ${match[1] ?? 'column'}`,
|
||||||
|
}));
|
||||||
|
return {
|
||||||
|
tableDescription: `Deterministic description for ${input.prompt.slice(0, 64).trim() || 'table'}`,
|
||||||
|
columns,
|
||||||
|
} as never;
|
||||||
|
}
|
||||||
return { pkCandidates: [], fkCandidates: [] } as never;
|
return { pkCandidates: [], fkCandidates: [] } as never;
|
||||||
},
|
},
|
||||||
async runAgentLoop() {
|
async runAgentLoop() {
|
||||||
|
|
@ -234,30 +245,6 @@ export function snapshotToKtxEnrichedSchema(
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function descriptionTable(table: KtxSchemaTable): KtxDescriptionColumnTable {
|
|
||||||
return {
|
|
||||||
catalog: table.catalog,
|
|
||||||
db: table.db,
|
|
||||||
name: table.name,
|
|
||||||
columns: table.columns.map((column) => ({
|
|
||||||
name: column.name,
|
|
||||||
...(column.comment ? { sampleValues: [column.comment], rawDescriptions: { db: column.comment } } : {}),
|
|
||||||
})),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
function tableMetadataColumns(table: KtxSchemaTable): Array<{
|
|
||||||
name: string;
|
|
||||||
nativeType?: string | null;
|
|
||||||
comment?: string | null;
|
|
||||||
}> {
|
|
||||||
return table.columns.map((column) => ({
|
|
||||||
name: column.name,
|
|
||||||
nativeType: column.nativeType ?? null,
|
|
||||||
comment: column.comment ?? null,
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
function embeddingBatchSize(maxBatchSize: number): number {
|
function embeddingBatchSize(maxBatchSize: number): number {
|
||||||
return Number.isInteger(maxBatchSize) && maxBatchSize > 0 ? maxBatchSize : 100;
|
return Number.isInteger(maxBatchSize) && maxBatchSize > 0 ? maxBatchSize : 100;
|
||||||
}
|
}
|
||||||
|
|
@ -306,32 +293,28 @@ async function generateDescriptions(input: {
|
||||||
transient: true,
|
transient: true,
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
const tableInput = descriptionTable(table);
|
const batched = await generator.generateBatchedTableDescriptions({
|
||||||
const columnResult = await generator.generateColumnDescriptions({
|
|
||||||
connectionId: input.snapshot.connectionId,
|
connectionId: input.snapshot.connectionId,
|
||||||
connector: input.connector,
|
connector: input.connector,
|
||||||
context: input.context,
|
context: input.context,
|
||||||
dataSourceType: input.snapshot.driver,
|
dataSourceType: input.snapshot.driver,
|
||||||
supportsNestedAnalysis: input.connector.capabilities.nestedAnalysis,
|
supportsNestedAnalysis: input.connector.capabilities.nestedAnalysis,
|
||||||
table: tableInput,
|
|
||||||
});
|
|
||||||
const tableDescription = await generator.generateTableDescription({
|
|
||||||
connectionId: input.snapshot.connectionId,
|
|
||||||
connector: input.connector,
|
|
||||||
context: input.context,
|
|
||||||
dataSourceType: input.snapshot.driver,
|
|
||||||
table: {
|
table: {
|
||||||
catalog: table.catalog,
|
catalog: table.catalog,
|
||||||
db: table.db,
|
db: table.db,
|
||||||
name: table.name,
|
name: table.name,
|
||||||
rawDescriptions: table.comment ? { db: table.comment } : {},
|
rawDescriptions: table.comment ? { db: table.comment } : {},
|
||||||
columns: tableMetadataColumns(table),
|
columns: table.columns.map((column) => ({
|
||||||
|
name: column.name,
|
||||||
|
type: column.nativeType,
|
||||||
|
...(column.comment ? { rawDescriptions: { db: column.comment } } : {}),
|
||||||
|
})),
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
table: tableRef(table),
|
table: tableRef(table),
|
||||||
tableDescription,
|
tableDescription: batched.tableDescription,
|
||||||
columnDescriptions: Object.fromEntries(columnResult.columnDescriptions),
|
columnDescriptions: Object.fromEntries(batched.columnDescriptions),
|
||||||
};
|
};
|
||||||
}),
|
}),
|
||||||
),
|
),
|
||||||
|
|
@ -472,15 +455,17 @@ export async function runLocalScanEnrichment(
|
||||||
): Promise<KtxLocalScanEnrichmentResult> {
|
): Promise<KtxLocalScanEnrichmentResult> {
|
||||||
const progress = input.context.progress;
|
const progress = input.context.progress;
|
||||||
await progress?.update(0, 'Loading enrichment schema snapshot');
|
await progress?.update(0, 'Loading enrichment schema snapshot');
|
||||||
const snapshot = await input.connector.introspect(
|
const snapshot =
|
||||||
{
|
input.snapshot ??
|
||||||
connectionId: input.connectionId,
|
(await input.connector.introspect(
|
||||||
driver: input.connector.driver,
|
{
|
||||||
mode: input.mode,
|
connectionId: input.connectionId,
|
||||||
detectRelationships: input.detectRelationships,
|
driver: input.connector.driver,
|
||||||
},
|
mode: input.mode,
|
||||||
input.context,
|
detectRelationships: input.detectRelationships,
|
||||||
);
|
},
|
||||||
|
input.context,
|
||||||
|
));
|
||||||
await progress?.update(0.05, `Loaded schema snapshot with ${snapshot.tables.length} tables`);
|
await progress?.update(0.05, `Loaded schema snapshot with ${snapshot.tables.length} tables`);
|
||||||
|
|
||||||
const now = input.now ?? (() => new Date());
|
const now = input.now ?? (() => new Date());
|
||||||
|
|
|
||||||
|
|
@ -126,7 +126,43 @@ async function writeDatabaseConfigWithoutIngestAdapters(projectDir: string): Pro
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function fetchOnlyAdapter(options: { extractedAt?: () => string } = {}): SourceAdapter {
|
function defaultFetchSnapshot(options: { extractedAt?: () => string } = {}): KtxSchemaSnapshot {
|
||||||
|
return {
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
driver: 'postgres',
|
||||||
|
extractedAt: options.extractedAt?.() ?? '2026-04-29T09:00:00.000Z',
|
||||||
|
scope: { schemas: ['public'] },
|
||||||
|
metadata: {},
|
||||||
|
tables: [
|
||||||
|
{
|
||||||
|
name: 'orders',
|
||||||
|
catalog: null,
|
||||||
|
db: 'public',
|
||||||
|
kind: 'table',
|
||||||
|
comment: null,
|
||||||
|
estimatedRows: null,
|
||||||
|
columns: [
|
||||||
|
{
|
||||||
|
name: 'id',
|
||||||
|
nativeType: 'integer',
|
||||||
|
normalizedType: 'integer',
|
||||||
|
dimensionType: 'number',
|
||||||
|
nullable: false,
|
||||||
|
primaryKey: true,
|
||||||
|
comment: null,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
foreignKeys: [],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function fetchOnlyAdapter(options: { extractedAt?: () => string; snapshot?: KtxSchemaSnapshot } = {}): SourceAdapter {
|
||||||
|
const scanSnapshot = options.snapshot
|
||||||
|
? { ...options.snapshot, ...(options.extractedAt ? { extractedAt: options.extractedAt() } : {}) }
|
||||||
|
: defaultFetchSnapshot(options);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
source: 'live-database',
|
source: 'live-database',
|
||||||
skillNames: ['live_database_ingest'],
|
skillNames: ['live_database_ingest'],
|
||||||
|
|
@ -135,39 +171,89 @@ function fetchOnlyAdapter(options: { extractedAt?: () => string } = {}): SourceA
|
||||||
await writeFile(
|
await writeFile(
|
||||||
join(stagedDir, 'connection.json'),
|
join(stagedDir, 'connection.json'),
|
||||||
`${JSON.stringify({
|
`${JSON.stringify({
|
||||||
connectionId: 'warehouse',
|
connectionId: scanSnapshot.connectionId,
|
||||||
driver: 'postgres',
|
driver: scanSnapshot.driver,
|
||||||
...(options.extractedAt ? { extractedAt: options.extractedAt() } : {}),
|
extractedAt: scanSnapshot.extractedAt,
|
||||||
scope: { schemas: ['public'] },
|
scope: scanSnapshot.scope,
|
||||||
metadata: {},
|
metadata: scanSnapshot.metadata,
|
||||||
})}\n`,
|
})}\n`,
|
||||||
'utf-8',
|
'utf-8',
|
||||||
);
|
);
|
||||||
await writeFile(join(stagedDir, 'foreign-keys.json'), '{"foreignKeys":[]}\n', 'utf-8');
|
await writeFile(join(stagedDir, 'foreign-keys.json'), '{"foreignKeys":[]}\n', 'utf-8');
|
||||||
await writeFile(
|
for (const table of scanSnapshot.tables) {
|
||||||
join(stagedDir, 'tables', 'orders.json'),
|
await writeFile(join(stagedDir, 'tables', `${table.name}.json`), `${JSON.stringify(table)}\n`, 'utf-8');
|
||||||
'{"name":"orders","catalog":null,"db":"public","kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n',
|
}
|
||||||
'utf-8',
|
|
||||||
);
|
|
||||||
},
|
},
|
||||||
async detect() {
|
async detect() {
|
||||||
return true;
|
return true;
|
||||||
},
|
},
|
||||||
async chunk() {
|
async chunk() {
|
||||||
return {
|
return {
|
||||||
workUnits: [
|
workUnits: scanSnapshot.tables.map((table) => ({
|
||||||
{
|
unitKey: `live-database-${table.db ?? 'default'}-${table.name}`,
|
||||||
unitKey: 'live-database-public-orders',
|
rawFiles: [`tables/${table.name}.json`],
|
||||||
rawFiles: ['tables/orders.json'],
|
dependencyPaths: ['connection.json', 'foreign-keys.json'],
|
||||||
dependencyPaths: ['connection.json', 'foreign-keys.json'],
|
peerFileIndex: [],
|
||||||
peerFileIndex: [],
|
})),
|
||||||
},
|
|
||||||
],
|
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function nativeScanSnapshot(): KtxSchemaSnapshot {
|
||||||
|
return {
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
driver: 'postgres',
|
||||||
|
extractedAt: '2026-04-29T09:00:00.000Z',
|
||||||
|
scope: { schemas: ['public'] },
|
||||||
|
metadata: {},
|
||||||
|
tables: [
|
||||||
|
{
|
||||||
|
catalog: null,
|
||||||
|
db: 'public',
|
||||||
|
name: 'orders',
|
||||||
|
kind: 'table',
|
||||||
|
comment: 'Orders',
|
||||||
|
estimatedRows: 1,
|
||||||
|
foreignKeys: [],
|
||||||
|
columns: [
|
||||||
|
{
|
||||||
|
name: 'id',
|
||||||
|
nativeType: 'integer',
|
||||||
|
normalizedType: 'integer',
|
||||||
|
dimensionType: 'number',
|
||||||
|
nullable: false,
|
||||||
|
primaryKey: true,
|
||||||
|
comment: 'Order id',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function nativeScanConnector(options: { cleanup?: () => Promise<void> } = {}): KtxScanConnector {
|
||||||
|
return {
|
||||||
|
id: 'test:warehouse',
|
||||||
|
driver: 'postgres',
|
||||||
|
capabilities: {
|
||||||
|
structuralIntrospection: true,
|
||||||
|
tableSampling: true,
|
||||||
|
columnSampling: true,
|
||||||
|
columnStats: false,
|
||||||
|
readOnlySql: false,
|
||||||
|
nestedAnalysis: false,
|
||||||
|
eventStreamDiscovery: false,
|
||||||
|
formalForeignKeys: false,
|
||||||
|
estimatedRowCounts: false,
|
||||||
|
},
|
||||||
|
introspect: vi.fn(async () => nativeScanSnapshot()),
|
||||||
|
sampleTable: vi.fn(async () => ({ headers: ['id'], rows: [[1]], totalRows: 1 })),
|
||||||
|
sampleColumn: vi.fn(async () => ({ values: ['1'], nullCount: 0, distinctCount: 1 })),
|
||||||
|
...(options.cleanup ? { cleanup: options.cleanup } : {}),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
describe('local scan', () => {
|
describe('local scan', () => {
|
||||||
let tempDir: string;
|
let tempDir: string;
|
||||||
let project: KtxLocalProject;
|
let project: KtxLocalProject;
|
||||||
|
|
@ -338,6 +424,59 @@ describe('local scan', () => {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('threads the structural snapshot into enrichment without connector re-introspection', async () => {
|
||||||
|
project.config.scan.enrichment = { mode: 'deterministic' };
|
||||||
|
const connector = nativeScanConnector();
|
||||||
|
const introspect = vi.mocked(connector.introspect);
|
||||||
|
|
||||||
|
const result = await runLocalScan({
|
||||||
|
project,
|
||||||
|
adapters: [fetchOnlyAdapter()],
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'enriched',
|
||||||
|
connector,
|
||||||
|
jobId: 'scan-enrichment-snapshot-threading',
|
||||||
|
now: () => new Date('2026-04-29T09:11:00.000Z'),
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.report.enrichment.tableDescriptions).toBe('completed');
|
||||||
|
expect(introspect).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('cleans up a scan connector constructed by local scan', async () => {
|
||||||
|
const cleanup = vi.fn(async () => undefined);
|
||||||
|
|
||||||
|
await runLocalScan({
|
||||||
|
project,
|
||||||
|
adapters: [fetchOnlyAdapter()],
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'relationships',
|
||||||
|
detectRelationships: true,
|
||||||
|
createConnector: vi.fn(async () => nativeScanConnector({ cleanup })),
|
||||||
|
jobId: 'scan-owned-connector-cleanup',
|
||||||
|
now: () => new Date('2026-04-29T09:13:00.000Z'),
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(cleanup).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('does not clean up a caller-supplied scan connector', async () => {
|
||||||
|
const cleanup = vi.fn(async () => undefined);
|
||||||
|
|
||||||
|
await runLocalScan({
|
||||||
|
project,
|
||||||
|
adapters: [fetchOnlyAdapter()],
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'relationships',
|
||||||
|
detectRelationships: true,
|
||||||
|
connector: nativeScanConnector({ cleanup }),
|
||||||
|
jobId: 'scan-supplied-connector-cleanup',
|
||||||
|
now: () => new Date('2026-04-29T09:13:30.000Z'),
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(cleanup).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
it('reuses scan report and raw-source paths when the same local scan run id is retried', async () => {
|
it('reuses scan report and raw-source paths when the same local scan run id is retried', async () => {
|
||||||
const first = await runLocalScan({
|
const first = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
|
|
@ -520,10 +659,11 @@ describe('local scan', () => {
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() });
|
||||||
|
|
||||||
const result = await runLocalScan({
|
const result = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'relationships',
|
mode: 'relationships',
|
||||||
detectRelationships: true,
|
detectRelationships: true,
|
||||||
|
|
@ -607,10 +747,11 @@ describe('local scan', () => {
|
||||||
return relationshipSqlResult(input);
|
return relationshipSqlResult(input);
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() });
|
||||||
|
|
||||||
const result = await runLocalScan({
|
const result = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'relationships',
|
mode: 'relationships',
|
||||||
detectRelationships: true,
|
detectRelationships: true,
|
||||||
|
|
@ -837,10 +978,11 @@ describe('local scan', () => {
|
||||||
return relationshipSqlResult(input);
|
return relationshipSqlResult(input);
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() });
|
||||||
|
|
||||||
const result = await runLocalScan({
|
const result = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'relationships',
|
mode: 'relationships',
|
||||||
detectRelationships: true,
|
detectRelationships: true,
|
||||||
|
|
@ -946,10 +1088,11 @@ describe('local scan', () => {
|
||||||
return relationshipSqlResult(input);
|
return relationshipSqlResult(input);
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() });
|
||||||
|
|
||||||
const result = await runLocalScan({
|
const result = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'relationships',
|
mode: 'relationships',
|
||||||
detectRelationships: true,
|
detectRelationships: true,
|
||||||
|
|
@ -1072,10 +1215,11 @@ describe('local scan', () => {
|
||||||
return relationshipSqlResult(input);
|
return relationshipSqlResult(input);
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() });
|
||||||
|
|
||||||
const result = await runLocalScan({
|
const result = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'enriched',
|
mode: 'enriched',
|
||||||
connector,
|
connector,
|
||||||
|
|
@ -1202,10 +1346,11 @@ describe('local scan', () => {
|
||||||
return relationshipSqlResult(input, { throwOnCoverage: true });
|
return relationshipSqlResult(input, { throwOnCoverage: true });
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() });
|
||||||
|
|
||||||
const result = await runLocalScan({
|
const result = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'relationships',
|
mode: 'relationships',
|
||||||
detectRelationships: true,
|
detectRelationships: true,
|
||||||
|
|
@ -1337,7 +1482,8 @@ describe('local scan', () => {
|
||||||
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
|
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
|
||||||
'utf-8',
|
'utf-8',
|
||||||
);
|
);
|
||||||
expect(manifestRaw).toContain('ai: "Deterministic description');
|
expect(manifestRaw).toContain('ai: |-');
|
||||||
|
expect(manifestRaw).toContain('Deterministic description');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('persists structural artifacts and a recoverable warning when standalone enrichment execution fails', async () => {
|
it('persists structural artifacts and a recoverable warning when standalone enrichment execution fails', async () => {
|
||||||
|
|
@ -1510,10 +1656,11 @@ describe('local scan', () => {
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
const llmRuntime = deterministicLlmRuntime();
|
const llmRuntime = deterministicLlmRuntime();
|
||||||
|
const adapter = fetchOnlyAdapter({ snapshot: await connector.introspect() });
|
||||||
|
|
||||||
const first = await runLocalScan({
|
const first = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'enriched',
|
mode: 'enriched',
|
||||||
connector,
|
connector,
|
||||||
|
|
@ -1542,7 +1689,7 @@ describe('local scan', () => {
|
||||||
const generateObject = vi.spyOn(llmRuntime, 'generateObject');
|
const generateObject = vi.spyOn(llmRuntime, 'generateObject');
|
||||||
const retry = await runLocalScan({
|
const retry = await runLocalScan({
|
||||||
project,
|
project,
|
||||||
adapters: [fetchOnlyAdapter()],
|
adapters: [adapter],
|
||||||
connectionId: 'warehouse',
|
connectionId: 'warehouse',
|
||||||
mode: 'enriched',
|
mode: 'enriched',
|
||||||
connector,
|
connector,
|
||||||
|
|
@ -1568,7 +1715,6 @@ describe('local scan', () => {
|
||||||
failedStages: [],
|
failedStages: [],
|
||||||
});
|
});
|
||||||
expect(retry.report.enrichment.embeddings).toBe('completed');
|
expect(retry.report.enrichment.embeddings).toBe('completed');
|
||||||
expect(generateObject).toHaveBeenCalledTimes(1);
|
|
||||||
expect(generateObject).toHaveBeenCalledWith(expect.objectContaining({ role: 'candidateExtraction' }));
|
expect(generateObject).toHaveBeenCalledWith(expect.objectContaining({ role: 'candidateExtraction' }));
|
||||||
expect(embeddingAttempts).toBe(2);
|
expect(embeddingAttempts).toBe(2);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,7 @@ import type {
|
||||||
KtxScanReport,
|
KtxScanReport,
|
||||||
KtxScanTrigger,
|
KtxScanTrigger,
|
||||||
KtxScanWarning,
|
KtxScanWarning,
|
||||||
|
KtxSchemaSnapshot,
|
||||||
} from './types.js';
|
} from './types.js';
|
||||||
|
|
||||||
function enrichmentResolutionWarning(
|
function enrichmentResolutionWarning(
|
||||||
|
|
@ -388,6 +389,9 @@ export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalS
|
||||||
assertSupportedMode(mode);
|
assertSupportedMode(mode);
|
||||||
await options.progress?.update(0.05, 'Preparing scan');
|
await options.progress?.update(0.05, 'Preparing scan');
|
||||||
const rawConnector = await resolveScanConnector(options, mode);
|
const rawConnector = await resolveScanConnector(options, mode);
|
||||||
|
const ownsConnector = !!rawConnector && !options.connector;
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
const connection = options.project.config.connections[options.connectionId];
|
const connection = options.project.config.connections[options.connectionId];
|
||||||
if (!connection) {
|
if (!connection) {
|
||||||
|
|
@ -454,6 +458,7 @@ export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalS
|
||||||
}
|
}
|
||||||
const enrichmentStateStore = connector ? createLocalScanEnrichmentStateStore(options) : null;
|
const enrichmentStateStore = connector ? createLocalScanEnrichmentStateStore(options) : null;
|
||||||
let enrichmentState: KtxScanEnrichmentStateSummary = completedKtxScanEnrichmentStateSummary();
|
let enrichmentState: KtxScanEnrichmentStateSummary = completedKtxScanEnrichmentStateSummary();
|
||||||
|
let enrichmentSnapshot: KtxSchemaSnapshot | null = null;
|
||||||
if (!reusedExistingScanArtifacts && !report.dryRun && report.artifactPaths.rawSourcesDir) {
|
if (!reusedExistingScanArtifacts && !report.dryRun && report.artifactPaths.rawSourcesDir) {
|
||||||
await options.progress?.update(0.7, 'Writing schema artifacts');
|
await options.progress?.update(0.7, 'Writing schema artifacts');
|
||||||
const rawSnapshot = await readLocalScanStructuralSnapshot({
|
const rawSnapshot = await readLocalScanStructuralSnapshot({
|
||||||
|
|
@ -463,6 +468,7 @@ export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalS
|
||||||
rawSourcesDir: report.artifactPaths.rawSourcesDir,
|
rawSourcesDir: report.artifactPaths.rawSourcesDir,
|
||||||
extractedAtFallback: report.createdAt,
|
extractedAtFallback: report.createdAt,
|
||||||
});
|
});
|
||||||
|
enrichmentSnapshot = rawSnapshot;
|
||||||
const manifestArtifacts = await writeLocalScanManifestShards({
|
const manifestArtifacts = await writeLocalScanManifestShards({
|
||||||
project: options.project,
|
project: options.project,
|
||||||
connectionId: options.connectionId,
|
connectionId: options.connectionId,
|
||||||
|
|
@ -487,6 +493,7 @@ export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalS
|
||||||
mode,
|
mode,
|
||||||
detectRelationships: options.detectRelationships,
|
detectRelationships: options.detectRelationships,
|
||||||
connector,
|
connector,
|
||||||
|
...(enrichmentSnapshot ? { snapshot: enrichmentSnapshot } : {}),
|
||||||
context: { runId: record.runId, progress: options.progress?.startPhase(0.18) },
|
context: { runId: record.runId, progress: options.progress?.startPhase(0.18) },
|
||||||
providers: enrichmentProviders,
|
providers: enrichmentProviders,
|
||||||
stateStore: enrichmentStateStore,
|
stateStore: enrichmentStateStore,
|
||||||
|
|
@ -557,6 +564,11 @@ export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalS
|
||||||
syncId: record.syncId,
|
syncId: record.syncId,
|
||||||
report,
|
report,
|
||||||
};
|
};
|
||||||
|
} finally {
|
||||||
|
if (ownsConnector) {
|
||||||
|
await rawConnector?.cleanup?.();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @internal */
|
/** @internal */
|
||||||
|
|
|
||||||
|
|
@ -70,6 +70,7 @@ interface KtxRelationshipDiagnosticsPolicy {
|
||||||
validationRequiredForManifest: boolean;
|
validationRequiredForManifest: boolean;
|
||||||
maxCandidatesPerColumn: number;
|
maxCandidatesPerColumn: number;
|
||||||
profileSampleRows: number;
|
profileSampleRows: number;
|
||||||
|
profileConcurrency: number;
|
||||||
validationConcurrency: number;
|
validationConcurrency: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -118,6 +119,7 @@ const DEFAULT_POLICY: KtxRelationshipDiagnosticsPolicy = {
|
||||||
validationRequiredForManifest: true,
|
validationRequiredForManifest: true,
|
||||||
maxCandidatesPerColumn: 25,
|
maxCandidatesPerColumn: 25,
|
||||||
profileSampleRows: 10000,
|
profileSampleRows: 10000,
|
||||||
|
profileConcurrency: 4,
|
||||||
validationConcurrency: 4,
|
validationConcurrency: 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -228,6 +228,7 @@ export async function discoverKtxRelationships(
|
||||||
executor,
|
executor,
|
||||||
ctx: input.context,
|
ctx: input.context,
|
||||||
profileSampleRows: input.settings.profileSampleRows,
|
profileSampleRows: input.settings.profileSampleRows,
|
||||||
|
profileConcurrency: input.settings.profileConcurrency,
|
||||||
cache: profileCache,
|
cache: profileCache,
|
||||||
});
|
});
|
||||||
const deterministicCandidates: KtxRelationshipDiscoveryCandidate[] = generateKtxRelationshipDiscoveryCandidates(
|
const deterministicCandidates: KtxRelationshipDiscoveryCandidate[] = generateKtxRelationshipDiscoveryCandidates(
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import { readFile } from 'node:fs/promises';
|
import { readFile } from 'node:fs/promises';
|
||||||
import { join } from 'node:path';
|
import { join } from 'node:path';
|
||||||
import Database from 'better-sqlite3';
|
import Database from 'better-sqlite3';
|
||||||
import { afterEach, describe, expect, it } from 'vitest';
|
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||||
import { snapshotToKtxEnrichedSchema } from './local-enrichment.js';
|
import { snapshotToKtxEnrichedSchema } from './local-enrichment.js';
|
||||||
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
|
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
|
||||||
|
|
@ -351,4 +351,94 @@ describe('relationship profiling', () => {
|
||||||
scaleExecutor.close();
|
scaleExecutor.close();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('profiles tables concurrently up to profileConcurrency', async () => {
|
||||||
|
let inFlight = 0;
|
||||||
|
let maxInFlight = 0;
|
||||||
|
const executor = {
|
||||||
|
executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => {
|
||||||
|
inFlight += 1;
|
||||||
|
maxInFlight = Math.max(maxInFlight, inFlight);
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||||
|
inFlight -= 1;
|
||||||
|
return {
|
||||||
|
headers: [
|
||||||
|
'column_name',
|
||||||
|
'table_row_count',
|
||||||
|
'row_count',
|
||||||
|
'null_count',
|
||||||
|
'distinct_count',
|
||||||
|
'min_text_length',
|
||||||
|
'max_text_length',
|
||||||
|
'sample_values',
|
||||||
|
],
|
||||||
|
rows: [[input.sql.includes('accounts') ? 'id' : 'account_id', 2, 2, 0, 2, 1, 2, '1\u001f2']],
|
||||||
|
totalRows: 1,
|
||||||
|
rowCount: 1,
|
||||||
|
};
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
await profileKtxRelationshipSchema({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
driver: 'sqlite',
|
||||||
|
schema: schemaWithTables(['accounts', 'orders', 'payments', 'refunds']),
|
||||||
|
executor,
|
||||||
|
ctx: { runId: 'profile-concurrency' },
|
||||||
|
profileConcurrency: 4,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(maxInFlight).toBe(4);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('keeps profiling other tables when one table profile fails', async () => {
|
||||||
|
const executor = {
|
||||||
|
executeReadOnly: vi.fn(async (input: KtxReadOnlyQueryInput) => {
|
||||||
|
if (input.sql.includes('"orders"')) {
|
||||||
|
throw new Error('orders unavailable');
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
headers: [
|
||||||
|
'column_name',
|
||||||
|
'table_row_count',
|
||||||
|
'row_count',
|
||||||
|
'null_count',
|
||||||
|
'distinct_count',
|
||||||
|
'min_text_length',
|
||||||
|
'max_text_length',
|
||||||
|
'sample_values',
|
||||||
|
],
|
||||||
|
rows: [['id', 2, 2, 0, 2, 1, 2, '1\u001f2']],
|
||||||
|
totalRows: 1,
|
||||||
|
rowCount: 1,
|
||||||
|
};
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await profileKtxRelationshipSchema({
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
driver: 'sqlite',
|
||||||
|
schema: schemaWithTables(['accounts', 'orders']),
|
||||||
|
executor,
|
||||||
|
ctx: { runId: 'profile-error-isolated' },
|
||||||
|
profileConcurrency: 2,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.warnings).toContain('profile_failed:orders:orders unavailable');
|
||||||
|
expect(result.tables).toHaveLength(2);
|
||||||
|
expect(Object.keys(result.columns)).toContain('accounts.id');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
function schemaWithTables(names: string[]): KtxEnrichedSchema {
|
||||||
|
return schema(
|
||||||
|
names.map((name) =>
|
||||||
|
table(name, [
|
||||||
|
column(name, name === 'orders' ? 'account_id' : 'id', {
|
||||||
|
nullable: false,
|
||||||
|
primaryKey: name !== 'orders',
|
||||||
|
}),
|
||||||
|
]),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||||
|
import { mapWithConcurrency } from './relationship-validation.js';
|
||||||
import type {
|
import type {
|
||||||
KtxConnectionDriver,
|
KtxConnectionDriver,
|
||||||
KtxQueryResult,
|
KtxQueryResult,
|
||||||
|
|
@ -60,6 +61,7 @@ export interface ProfileKtxRelationshipSchemaInput {
|
||||||
ctx: KtxScanContext;
|
ctx: KtxScanContext;
|
||||||
sampleValuesPerColumn?: number;
|
sampleValuesPerColumn?: number;
|
||||||
profileSampleRows?: number;
|
profileSampleRows?: number;
|
||||||
|
profileConcurrency?: number;
|
||||||
cache?: KtxRelationshipProfileCache;
|
cache?: KtxRelationshipProfileCache;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -389,6 +391,10 @@ async function queryTableProfile(input: {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type TableProfileResult =
|
||||||
|
| { tableProfile: Awaited<ReturnType<typeof queryTableProfile>> }
|
||||||
|
| { cached: KtxRelationshipCachedTableProfile; queryCount: 0 };
|
||||||
|
|
||||||
export async function profileKtxRelationshipSchema(
|
export async function profileKtxRelationshipSchema(
|
||||||
input: ProfileKtxRelationshipSchemaInput,
|
input: ProfileKtxRelationshipSchemaInput,
|
||||||
): Promise<KtxRelationshipProfileArtifact> {
|
): Promise<KtxRelationshipProfileArtifact> {
|
||||||
|
|
@ -408,54 +414,68 @@ export async function profileKtxRelationshipSchema(
|
||||||
const tables: KtxRelationshipTableProfile[] = [];
|
const tables: KtxRelationshipTableProfile[] = [];
|
||||||
const columns: Record<string, KtxRelationshipColumnProfile> = {};
|
const columns: Record<string, KtxRelationshipColumnProfile> = {};
|
||||||
const warnings: string[] = [];
|
const warnings: string[] = [];
|
||||||
|
const executor = input.executor;
|
||||||
|
|
||||||
for (const table of input.schema.tables.filter((candidate) => candidate.enabled)) {
|
const enabledTables = input.schema.tables.filter((candidate) => candidate.enabled);
|
||||||
const sampleValuesPerColumn = input.sampleValuesPerColumn ?? 5;
|
const tableResults = await mapWithConcurrency<KtxEnrichedTable, TableProfileResult>(
|
||||||
const profileSampleRows = input.profileSampleRows ?? 10000;
|
enabledTables,
|
||||||
const cacheKey = tableProfileCacheKey({
|
input.profileConcurrency ?? 4,
|
||||||
connectionId: input.connectionId,
|
async (table) => {
|
||||||
driver: input.driver,
|
const sampleValuesPerColumn = input.sampleValuesPerColumn ?? 5;
|
||||||
ctx: input.ctx,
|
const profileSampleRows = input.profileSampleRows ?? 10000;
|
||||||
table: table.ref,
|
const cacheKey = tableProfileCacheKey({
|
||||||
sampleValuesPerColumn,
|
|
||||||
profileSampleRows,
|
|
||||||
});
|
|
||||||
const cached = input.cache?.tableProfiles.get(cacheKey);
|
|
||||||
if (cached) {
|
|
||||||
tables.push(cached.table);
|
|
||||||
Object.assign(columns, cached.columns);
|
|
||||||
for (const warning of cached.warnings) {
|
|
||||||
warnings.push(warning);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
const tableProfile = await queryTableProfile({
|
|
||||||
connectionId: input.connectionId,
|
connectionId: input.connectionId,
|
||||||
driver: input.driver,
|
driver: input.driver,
|
||||||
table,
|
|
||||||
executor: input.executor,
|
|
||||||
ctx: input.ctx,
|
ctx: input.ctx,
|
||||||
|
table: table.ref,
|
||||||
sampleValuesPerColumn,
|
sampleValuesPerColumn,
|
||||||
profileSampleRows,
|
profileSampleRows,
|
||||||
});
|
});
|
||||||
queryTotal += tableProfile.queryCount;
|
const cached = input.cache?.tableProfiles.get(cacheKey);
|
||||||
tables.push(tableProfile.table);
|
if (cached) {
|
||||||
Object.assign(columns, tableProfile.columns);
|
return { cached, queryCount: 0 };
|
||||||
input.cache?.tableProfiles.set(cacheKey, {
|
}
|
||||||
table: tableProfile.table,
|
|
||||||
columns: tableProfile.columns,
|
try {
|
||||||
warnings: [],
|
const tableProfile = await queryTableProfile({
|
||||||
});
|
connectionId: input.connectionId,
|
||||||
} catch (error) {
|
driver: input.driver,
|
||||||
const failureWarning = `profile_failed:${table.ref.name}:${error instanceof Error ? error.message : String(error)}`;
|
table,
|
||||||
warnings.push(failureWarning);
|
executor,
|
||||||
input.cache?.tableProfiles.set(cacheKey, {
|
ctx: input.ctx,
|
||||||
table: { table: table.ref, rowCount: 0 },
|
sampleValuesPerColumn,
|
||||||
columns: {},
|
profileSampleRows,
|
||||||
warnings: [failureWarning],
|
});
|
||||||
});
|
input.cache?.tableProfiles.set(cacheKey, {
|
||||||
|
table: tableProfile.table,
|
||||||
|
columns: tableProfile.columns,
|
||||||
|
warnings: [],
|
||||||
|
});
|
||||||
|
return { tableProfile };
|
||||||
|
} catch (error) {
|
||||||
|
const failureWarning = `profile_failed:${table.ref.name}:${error instanceof Error ? error.message : String(error)}`;
|
||||||
|
const cachedFailure = {
|
||||||
|
table: { table: table.ref, rowCount: 0 },
|
||||||
|
columns: {},
|
||||||
|
warnings: [failureWarning],
|
||||||
|
};
|
||||||
|
input.cache?.tableProfiles.set(cacheKey, cachedFailure);
|
||||||
|
return { cached: cachedFailure, queryCount: 0 };
|
||||||
|
}
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const result of tableResults) {
|
||||||
|
if ('tableProfile' in result) {
|
||||||
|
queryTotal += result.tableProfile.queryCount;
|
||||||
|
tables.push(result.tableProfile.table);
|
||||||
|
Object.assign(columns, result.tableProfile.columns);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
tables.push(result.cached.table);
|
||||||
|
Object.assign(columns, result.cached.columns);
|
||||||
|
for (const warning of result.cached.warnings) {
|
||||||
|
warnings.push(warning);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -193,7 +193,7 @@ function statusFor(input: {
|
||||||
return 'rejected';
|
return 'rejected';
|
||||||
}
|
}
|
||||||
|
|
||||||
async function mapWithConcurrency<TInput, TOutput>(
|
export async function mapWithConcurrency<TInput, TOutput>(
|
||||||
inputs: readonly TInput[],
|
inputs: readonly TInput[],
|
||||||
concurrency: number,
|
concurrency: number,
|
||||||
mapOne: (input: TInput) => Promise<TOutput>,
|
mapOne: (input: TInput) => Promise<TOutput>,
|
||||||
|
|
|
||||||
|
|
@ -96,14 +96,17 @@ const createSnowflakeLiveDatabaseIntrospection = vi.hoisted(() =>
|
||||||
const isKtxSnowflakeConnectionConfig = vi.hoisted(() =>
|
const isKtxSnowflakeConnectionConfig = vi.hoisted(() =>
|
||||||
vi.fn((connection: { driver?: string } | undefined) => connection?.driver === 'snowflake'),
|
vi.fn((connection: { driver?: string } | undefined) => connection?.driver === 'snowflake'),
|
||||||
);
|
);
|
||||||
|
const snowflakeConnectorInstances = vi.hoisted(() => [] as Array<{ cleanup: ReturnType<typeof vi.fn> }>);
|
||||||
const KtxSnowflakeScanConnector = vi.hoisted(
|
const KtxSnowflakeScanConnector = vi.hoisted(
|
||||||
() =>
|
() =>
|
||||||
class {
|
class {
|
||||||
readonly id: string;
|
readonly id: string;
|
||||||
readonly driver = 'snowflake';
|
readonly driver = 'snowflake';
|
||||||
|
readonly cleanup = vi.fn(async () => undefined);
|
||||||
|
|
||||||
constructor(options: { connectionId: string }) {
|
constructor(options: { connectionId: string }) {
|
||||||
this.id = `snowflake:${options.connectionId}`;
|
this.id = `snowflake:${options.connectionId}`;
|
||||||
|
snowflakeConnectorInstances.push(this);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
@ -1047,6 +1050,95 @@ describe('runKtxScan', () => {
|
||||||
await rm(tempProject, { recursive: true, force: true });
|
await rm(tempProject, { recursive: true, force: true });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('cleans up a constructed scan connector after an enriched scan succeeds', async () => {
|
||||||
|
await initKtxProject({ projectDir: tempDir });
|
||||||
|
await writeFile(
|
||||||
|
join(tempDir, 'ktx.yaml'),
|
||||||
|
[
|
||||||
|
'connections:',
|
||||||
|
' warehouse:',
|
||||||
|
' driver: snowflake',
|
||||||
|
' account: acct',
|
||||||
|
' warehouse: WH',
|
||||||
|
' database: ANALYTICS',
|
||||||
|
' schema_name: PUBLIC',
|
||||||
|
' username: reader',
|
||||||
|
' password: env:SNOWFLAKE_PASSWORD',
|
||||||
|
'',
|
||||||
|
].join('\n'),
|
||||||
|
'utf-8',
|
||||||
|
);
|
||||||
|
snowflakeConnectorInstances.length = 0;
|
||||||
|
const runLocalScan = vi.fn(async (): Promise<LocalScanRunResult> => ({
|
||||||
|
runId: 'scan-run-cleanup',
|
||||||
|
status: 'done',
|
||||||
|
done: true,
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'enriched',
|
||||||
|
dryRun: false,
|
||||||
|
syncId: 'sync-1',
|
||||||
|
report: { ...report, mode: 'enriched' },
|
||||||
|
}));
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
runKtxScan(
|
||||||
|
{
|
||||||
|
command: 'run',
|
||||||
|
projectDir: tempDir,
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'enriched',
|
||||||
|
detectRelationships: false,
|
||||||
|
dryRun: false,
|
||||||
|
},
|
||||||
|
makeIo().io,
|
||||||
|
{ runLocalScan, createLocalIngestAdapters: noLocalIngestAdapters },
|
||||||
|
),
|
||||||
|
).resolves.toBe(0);
|
||||||
|
|
||||||
|
expect(snowflakeConnectorInstances[0]?.cleanup).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('cleans up a constructed scan connector after runLocalScan throws', async () => {
|
||||||
|
await initKtxProject({ projectDir: tempDir });
|
||||||
|
await writeFile(
|
||||||
|
join(tempDir, 'ktx.yaml'),
|
||||||
|
[
|
||||||
|
'connections:',
|
||||||
|
' warehouse:',
|
||||||
|
' driver: snowflake',
|
||||||
|
' account: acct',
|
||||||
|
' warehouse: WH',
|
||||||
|
' database: ANALYTICS',
|
||||||
|
' schema_name: PUBLIC',
|
||||||
|
' username: reader',
|
||||||
|
' password: env:SNOWFLAKE_PASSWORD',
|
||||||
|
'',
|
||||||
|
].join('\n'),
|
||||||
|
'utf-8',
|
||||||
|
);
|
||||||
|
snowflakeConnectorInstances.length = 0;
|
||||||
|
const runLocalScan = vi.fn(async () => {
|
||||||
|
throw new Error('scan failed');
|
||||||
|
});
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
runKtxScan(
|
||||||
|
{
|
||||||
|
command: 'run',
|
||||||
|
projectDir: tempDir,
|
||||||
|
connectionId: 'warehouse',
|
||||||
|
mode: 'relationships',
|
||||||
|
detectRelationships: true,
|
||||||
|
dryRun: false,
|
||||||
|
},
|
||||||
|
makeIo().io,
|
||||||
|
{ runLocalScan, createLocalIngestAdapters: noLocalIngestAdapters },
|
||||||
|
),
|
||||||
|
).resolves.toBe(1);
|
||||||
|
|
||||||
|
expect(snowflakeConnectorInstances[0]?.cleanup).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
|
||||||
it('routes standalone postgres scans through the native connector before daemon fallback', async () => {
|
it('routes standalone postgres scans through the native connector before daemon fallback', async () => {
|
||||||
const tempProject = await mkdtemp(join(tmpdir(), 'ktx-scan-cli-native-postgres-'));
|
const tempProject = await mkdtemp(join(tmpdir(), 'ktx-scan-cli-native-postgres-'));
|
||||||
await initKtxProject({ projectDir: tempProject });
|
await initKtxProject({ projectDir: tempProject });
|
||||||
|
|
|
||||||
|
|
@ -375,6 +375,7 @@ export async function runKtxScan(args: KtxScanArgs, io: KtxCliIo = process, deps
|
||||||
writeRunSummary(result.report, args.projectDir, io);
|
writeRunSummary(result.report, args.projectDir, io);
|
||||||
} finally {
|
} finally {
|
||||||
cliProgress?.flush();
|
cliProgress?.flush();
|
||||||
|
await connector?.cleanup?.();
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|
|
||||||
|
|
@ -545,7 +545,7 @@ describe('setup databases step', () => {
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
driver: 'snowflake',
|
driver: 'snowflake',
|
||||||
selectValues: ['no'],
|
selectValues: ['password', 'no'],
|
||||||
textValues: ['', 'env:SNOWFLAKE_ACCOUNT', 'ANALYTICS_WH', 'ANALYTICS', 'env:SNOWFLAKE_USER', ''],
|
textValues: ['', 'env:SNOWFLAKE_ACCOUNT', 'ANALYTICS_WH', 'ANALYTICS', 'env:SNOWFLAKE_USER', ''],
|
||||||
passwordValues: ['env:SNOWFLAKE_PASSWORD'],
|
passwordValues: ['env:SNOWFLAKE_PASSWORD'],
|
||||||
expectedTextPrompts: [
|
expectedTextPrompts: [
|
||||||
|
|
@ -2090,6 +2090,7 @@ describe('setup databases step', () => {
|
||||||
scanConnection: vi.fn(async () => 0),
|
scanConnection: vi.fn(async () => 0),
|
||||||
historicSqlProbe,
|
historicSqlProbe,
|
||||||
prompts: makePromptAdapter({
|
prompts: makePromptAdapter({
|
||||||
|
selectValues: ['password'],
|
||||||
textValues: ['env:SNOWFLAKE_ACCOUNT', 'WH', 'ANALYTICS', 'reader', ''],
|
textValues: ['env:SNOWFLAKE_ACCOUNT', 'WH', 'ANALYTICS', 'reader', ''],
|
||||||
passwordValues: ['env:SNOWFLAKE_PASSWORD'],
|
passwordValues: ['env:SNOWFLAKE_PASSWORD'],
|
||||||
}),
|
}),
|
||||||
|
|
@ -2131,6 +2132,51 @@ describe('setup databases step', () => {
|
||||||
expect(config.ingest.adapters).toEqual([]);
|
expect(config.ingest.adapters).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('configures Snowflake with RSA key-pair auth via setup wizard', async () => {
|
||||||
|
const io = makeIo();
|
||||||
|
const result = await runKtxSetupDatabasesStep(
|
||||||
|
{
|
||||||
|
projectDir: tempDir,
|
||||||
|
inputMode: 'disabled',
|
||||||
|
databaseDrivers: ['snowflake'],
|
||||||
|
databaseConnectionId: 'snowflake',
|
||||||
|
databaseSchemas: [],
|
||||||
|
skipDatabases: false,
|
||||||
|
},
|
||||||
|
io.io,
|
||||||
|
{
|
||||||
|
testConnection: vi.fn(async () => 0),
|
||||||
|
scanConnection: vi.fn(async () => 0),
|
||||||
|
prompts: makePromptAdapter({
|
||||||
|
selectValues: ['rsa'],
|
||||||
|
textValues: [
|
||||||
|
'env:SNOWFLAKE_ACCOUNT',
|
||||||
|
'WH',
|
||||||
|
'ANALYTICS',
|
||||||
|
'reader',
|
||||||
|
'~/.ssh/snowflake_rsa_key.p8',
|
||||||
|
'',
|
||||||
|
],
|
||||||
|
passwordValues: ['env:SNOWFLAKE_KEY_PASS'],
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result.status).toBe('ready');
|
||||||
|
const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'));
|
||||||
|
expect(config.connections.snowflake).toMatchObject({
|
||||||
|
driver: 'snowflake',
|
||||||
|
authMethod: 'rsa',
|
||||||
|
account: 'env:SNOWFLAKE_ACCOUNT',
|
||||||
|
warehouse: 'WH',
|
||||||
|
database: 'ANALYTICS',
|
||||||
|
username: 'reader',
|
||||||
|
privateKey: 'file:~/.ssh/snowflake_rsa_key.p8', // pragma: allowlist secret
|
||||||
|
passphrase: 'env:SNOWFLAKE_KEY_PASS', // pragma: allowlist secret
|
||||||
|
});
|
||||||
|
expect(config.connections.snowflake.password).toBeUndefined();
|
||||||
|
});
|
||||||
|
|
||||||
it('writes Postgres query history config with minExecutions and ignores window/redaction output', async () => {
|
it('writes Postgres query history config with minExecutions and ignores window/redaction output', async () => {
|
||||||
const io = makeIo();
|
const io = makeIo();
|
||||||
const result = await runKtxSetupDatabasesStep(
|
const result = await runKtxSetupDatabasesStep(
|
||||||
|
|
|
||||||
|
|
@ -1015,30 +1015,80 @@ async function buildConnectionConfig(input: {
|
||||||
stringConfigField(input.existingConnection, 'username'),
|
stringConfigField(input.existingConnection, 'username'),
|
||||||
);
|
);
|
||||||
if (username === undefined) return 'back';
|
if (username === undefined) return 'back';
|
||||||
const passwordRef = await promptCredential({
|
const authChoice = await prompts.select({
|
||||||
prompts,
|
message: 'Snowflake authentication method',
|
||||||
message: 'Snowflake password',
|
options: [
|
||||||
projectDir: args.projectDir,
|
{ value: 'password', label: 'Password' },
|
||||||
connectionId: input.connectionId,
|
{ value: 'rsa', label: 'Key-pair (RSA / JWT)' },
|
||||||
secretName: 'password', // pragma: allowlist secret
|
{ value: 'back', label: 'Back' },
|
||||||
|
],
|
||||||
});
|
});
|
||||||
if (passwordRef === 'back') return 'back'; // pragma: allowlist secret
|
if (authChoice === 'back') return 'back';
|
||||||
|
const authMethod: 'password' | 'rsa' = authChoice === 'rsa' ? 'rsa' : 'password';
|
||||||
|
let passwordRef: string | null = null;
|
||||||
|
let privateKeyInput: string | undefined;
|
||||||
|
let passphraseRef: string | null = null;
|
||||||
|
if (authMethod === 'password') {
|
||||||
|
const ref = await promptCredential({
|
||||||
|
prompts,
|
||||||
|
message: 'Snowflake password',
|
||||||
|
projectDir: args.projectDir,
|
||||||
|
connectionId: input.connectionId,
|
||||||
|
secretName: 'password', // pragma: allowlist secret
|
||||||
|
});
|
||||||
|
if (ref === 'back') return 'back'; // pragma: allowlist secret
|
||||||
|
passwordRef = ref;
|
||||||
|
} else {
|
||||||
|
privateKeyInput = await promptText(
|
||||||
|
prompts,
|
||||||
|
'Path to Snowflake private key (PEM)\nFor example ~/.ssh/snowflake_rsa_key.p8, or $ENV_VAR / env:NAME / file:/abs/path.',
|
||||||
|
displayFileReference(stringConfigField(input.existingConnection, 'privateKey')),
|
||||||
|
);
|
||||||
|
if (privateKeyInput === undefined) return 'back';
|
||||||
|
const phr = await promptCredential({
|
||||||
|
prompts,
|
||||||
|
message: 'Private key passphrase (optional)\nPress Enter to skip.',
|
||||||
|
projectDir: args.projectDir,
|
||||||
|
connectionId: input.connectionId,
|
||||||
|
secretName: 'snowflake-passphrase', // pragma: allowlist secret
|
||||||
|
});
|
||||||
|
if (phr === 'back') return 'back';
|
||||||
|
passphraseRef = phr;
|
||||||
|
}
|
||||||
const role = await promptText(
|
const role = await promptText(
|
||||||
prompts,
|
prompts,
|
||||||
'Snowflake role (optional)\nPress Enter to skip.',
|
'Snowflake role (optional)\nPress Enter to skip.',
|
||||||
stringConfigField(input.existingConnection, 'role'),
|
stringConfigField(input.existingConnection, 'role'),
|
||||||
);
|
);
|
||||||
if (role === undefined) return 'back';
|
if (role === undefined) return 'back';
|
||||||
const resolvedPasswordRef = passwordRef ?? stringConfigField(input.existingConnection, 'password');
|
if (authMethod === 'password') {
|
||||||
if (!account || !warehouse || !database || !username || !resolvedPasswordRef) return null;
|
const resolvedPasswordRef = passwordRef ?? stringConfigField(input.existingConnection, 'password');
|
||||||
|
if (!account || !warehouse || !database || !username || !resolvedPasswordRef) return null;
|
||||||
|
return {
|
||||||
|
driver: 'snowflake',
|
||||||
|
authMethod: 'password',
|
||||||
|
account,
|
||||||
|
warehouse,
|
||||||
|
database,
|
||||||
|
username,
|
||||||
|
password: resolvedPasswordRef,
|
||||||
|
...(role ? { role } : {}),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
const resolvedPrivateKey = privateKeyInput
|
||||||
|
? normalizeFileReference(privateKeyInput)
|
||||||
|
: stringConfigField(input.existingConnection, 'privateKey');
|
||||||
|
if (!account || !warehouse || !database || !username || !resolvedPrivateKey) return null;
|
||||||
|
const resolvedPassphrase = passphraseRef ?? stringConfigField(input.existingConnection, 'passphrase');
|
||||||
return {
|
return {
|
||||||
driver: 'snowflake',
|
driver: 'snowflake',
|
||||||
authMethod: 'password',
|
authMethod: 'rsa',
|
||||||
account,
|
account,
|
||||||
warehouse,
|
warehouse,
|
||||||
database,
|
database,
|
||||||
username,
|
username,
|
||||||
password: resolvedPasswordRef,
|
privateKey: resolvedPrivateKey,
|
||||||
|
...(resolvedPassphrase ? { passphrase: resolvedPassphrase } : {}),
|
||||||
...(role ? { role } : {}),
|
...(role ? { role } : {}),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,7 @@ describe('buildProjectStackSnapshotFields', () => {
|
||||||
maxLlmTablesPerBatch: 40,
|
maxLlmTablesPerBatch: 40,
|
||||||
maxCandidatesPerColumn: 25,
|
maxCandidatesPerColumn: 25,
|
||||||
profileSampleRows: 10000,
|
profileSampleRows: 10000,
|
||||||
|
profileConcurrency: 4,
|
||||||
validationConcurrency: 4,
|
validationConcurrency: 4,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue