feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)

* feat(cli): add RSA key-pair auth option to Snowflake setup wizard

Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.

* feat(scan): pool Snowflake sessions

* fix(scan): reuse structural snapshots and cleanup connectors

* feat(scan): parallelize relationship profiling

* feat(scan): batch table description generation

* docs: document Snowflake ingest concurrency knobs

* fix(scan): close Snowflake ingest perf verification gaps

* fix(scan): keep batched description failure bounded
This commit is contained in:
Andrey Avtomonov 2026-05-23 02:39:45 +02:00 committed by GitHub
parent 6e31687782
commit 4e654c43c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 1247 additions and 237 deletions

View file

@ -378,6 +378,121 @@ describe('KtxDescriptionGenerator', () => {
expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
});
it('generates one structured table description and reuses table samples for all columns', async () => {
  // The structured (object) response describes the table and both of its columns.
  const runtime = createLlmProvider('unused');
  runtime.generateObject = vi.fn(async () => {
    return {
      tableDescription: 'Commerce orders',
      columns: [
        { name: 'status', description: 'Current order state' },
        { name: 'amount', description: 'Order amount in dollars' },
      ],
    };
  });
  const fakeConnector = createConnector();
  const descriptionGenerator = new KtxDescriptionGenerator({
    llmRuntime: runtime,
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const outcome = await descriptionGenerator.generateBatchedTableDescriptions({
    connectionId: 'conn-1',
    connector: fakeConnector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      rawDescriptions: { db: 'Orders fact table' },
      columns: [
        { name: 'status', type: 'text' },
        { name: 'amount', type: 'numeric' },
      ],
    },
  });

  // Descriptions come straight from the structured response.
  expect(outcome.tableDescription).toBe('Commerce orders');
  expect(Object.fromEntries(outcome.columnDescriptions)).toEqual({
    status: 'Current order state',
    amount: 'Order amount in dollars',
  });

  // One table-level sample, no per-column sampling.
  expect(fakeConnector.sampleTable).toHaveBeenCalledTimes(1);
  expect(fakeConnector.sampleColumn).not.toHaveBeenCalled();

  // Exactly one structured LLM call and no plain-text calls.
  expect(runtime.generateObject).toHaveBeenCalledTimes(1);
  expect(runtime.generateText).not.toHaveBeenCalled();
});
it('falls back to one column generateText call for each missing structured column', async () => {
  // The structured response covers only `amount`; `status` is left out on purpose.
  // NOTE(review): createLlmProvider('Fallback status') presumably makes generateText
  // resolve to that string — confirm against the helper's definition.
  const runtime = createLlmProvider('Fallback status');
  runtime.generateObject = vi.fn(async () => {
    return {
      tableDescription: 'Commerce orders',
      columns: [{ name: 'amount', description: 'Order amount in dollars' }],
    };
  });
  const fakeConnector = createConnector();
  const descriptionGenerator = new KtxDescriptionGenerator({
    llmRuntime: runtime,
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const outcome = await descriptionGenerator.generateBatchedTableDescriptions({
    connectionId: 'conn-1',
    connector: fakeConnector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [
        { name: 'status', type: 'text' },
        { name: 'amount', type: 'numeric' },
      ],
    },
  });

  // The missing column gets the fallback text; the covered one keeps its structured text.
  expect(Object.fromEntries(outcome.columnDescriptions)).toEqual({
    status: 'Fallback status',
    amount: 'Order amount in dollars',
  });

  // No per-column sampling, and exactly one text call for the one missing column.
  expect(fakeConnector.sampleColumn).not.toHaveBeenCalled();
  expect(runtime.generateObject).toHaveBeenCalledTimes(1);
  expect(runtime.generateText).toHaveBeenCalledTimes(1);
});
it('does not run per-column fallback when structured object generation throws', async () => {
  // Structured generation always rejects in this scenario.
  const runtime = createLlmProvider('Fallback description');
  runtime.generateObject = vi.fn(async () => {
    throw new Error('object output unavailable');
  });

  // Collect the codes of every warning the generator reports.
  const warningCodes: string[] = [];
  const descriptionGenerator = new KtxDescriptionGenerator({
    llmRuntime: runtime,
    onWarning: (reported) => warningCodes.push(reported.code),
    settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
  });

  const fakeConnector = createConnector();
  const outcome = await descriptionGenerator.generateBatchedTableDescriptions({
    connectionId: 'conn-1',
    connector: fakeConnector,
    context: { runId: 'run-1' },
    dataSourceType: 'POSTGRESQL',
    supportsNestedAnalysis: false,
    table: {
      catalog: null,
      db: 'public',
      name: 'orders',
      columns: [{ name: 'status', type: 'text' }],
    },
  });

  // The failure yields null descriptions plus a warning, not an exception.
  expect(outcome.tableDescription).toBeNull();
  expect(Object.fromEntries(outcome.columnDescriptions)).toEqual({ status: null });
  expect(warningCodes).toContain('enrichment_failed');

  // One structured attempt only; the text fallback must stay untouched.
  expect(runtime.generateObject).toHaveBeenCalledTimes(1);
  expect(runtime.generateText).not.toHaveBeenCalled();
});
});
describe('KtxDescriptionGenerator resilience', () => {