2026-05-10 23:12:26 +02:00
|
|
|
|
import Database from 'better-sqlite3';
|
|
|
|
|
|
import { describe, expect, it, vi } from 'vitest';
|
test: split cli tests from source tree (#216)
* feat(cli): define full warehouse dialect contract
* test(cli): keep dialect edge tests focused
* fix(cli): stabilize dialect contract foundation
* refactor(connectors): own read-only query preparation
* refactor(connectors): resolve dialects through registry
* refactor(connectors): keep concrete dialect classes internal
* chore(workspace): enforce dialect import boundary
* refactor(cli): resolve relationship dialect at scan boundary
* refactor(cli): use dialect display parsing for entity details
* refactor(cli): use dialect display parsing for warehouse catalog
* refactor(cli): use dialect SQL in relationship workflows
* test(cli): verify solid dialect scan workflow closure
* test: split cli tests from source tree
* refactor(cli): standardize BigQuery scope listing
* feat(sqlite): implement connector scope listing
* test(connectors): cover required table listing
* feat(cli): add warehouse driver registry
* refactor(setup): route scope discovery through driver registry
* refactor(cli): route local query execution through driver registry
* refactor(historic-sql): route dialect support through driver registry
* refactor(cli): test warehouse connections through driver registry
* fix(cli): close driver registry type export gaps
* Improve setup daemon diagnostics
* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback
Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match
The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.
Align the picker boundary with the canonical 3-level KtxTableRef:
- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
(resolveEnabledTables already accepts the 3-part shape) and
schemasFromEnabledTables now goes through parseDottedTableEntry so it
recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
reuse.
Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).
* fix(cli): allow debug telemetry under opt-out env
2026-05-26 08:49:05 +02:00
|
|
|
|
import { buildDefaultKtxProjectConfig } from '../../../src/context/project/config.js';
|
2026-05-10 23:12:26 +02:00
|
|
|
|
import type {
|
2026-05-10 23:51:24 +02:00
|
|
|
|
KtxScanEnrichmentCompletedStage,
|
|
|
|
|
|
KtxScanEnrichmentFailedStage,
|
|
|
|
|
|
KtxScanEnrichmentStageLookup,
|
|
|
|
|
|
KtxScanEnrichmentStateStore,
|
test: split cli tests from source tree (#216)
* feat(cli): define full warehouse dialect contract
* test(cli): keep dialect edge tests focused
* fix(cli): stabilize dialect contract foundation
* refactor(connectors): own read-only query preparation
* refactor(connectors): resolve dialects through registry
* refactor(connectors): keep concrete dialect classes internal
* chore(workspace): enforce dialect import boundary
* refactor(cli): resolve relationship dialect at scan boundary
* refactor(cli): use dialect display parsing for entity details
* refactor(cli): use dialect display parsing for warehouse catalog
* refactor(cli): use dialect SQL in relationship workflows
* test(cli): verify solid dialect scan workflow closure
* test: split cli tests from source tree
* refactor(cli): standardize BigQuery scope listing
* feat(sqlite): implement connector scope listing
* test(connectors): cover required table listing
* feat(cli): add warehouse driver registry
* refactor(setup): route scope discovery through driver registry
* refactor(cli): route local query execution through driver registry
* refactor(historic-sql): route dialect support through driver registry
* refactor(cli): test warehouse connections through driver registry
* fix(cli): close driver registry type export gaps
* Improve setup daemon diagnostics
* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback
Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match
The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.
Align the picker boundary with the canonical 3-level KtxTableRef:
- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
(resolveEnabledTables already accepts the 3-part shape) and
schemasFromEnabledTables now goes through parseDottedTableEntry so it
recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
reuse.
Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).
* fix(cli): allow debug telemetry under opt-out env
2026-05-26 08:49:05 +02:00
|
|
|
|
} from '../../../src/context/scan/enrichment-state.js';
|
2026-05-10 23:12:26 +02:00
|
|
|
|
import {
|
|
|
|
|
|
createDeterministicLocalScanEnrichmentProviders,
|
|
|
|
|
|
runLocalScanEnrichment,
|
2026-05-10 23:51:24 +02:00
|
|
|
|
snapshotToKtxEnrichedSchema,
|
test: split cli tests from source tree (#216)
* feat(cli): define full warehouse dialect contract
* test(cli): keep dialect edge tests focused
* fix(cli): stabilize dialect contract foundation
* refactor(connectors): own read-only query preparation
* refactor(connectors): resolve dialects through registry
* refactor(connectors): keep concrete dialect classes internal
* chore(workspace): enforce dialect import boundary
* refactor(cli): resolve relationship dialect at scan boundary
* refactor(cli): use dialect display parsing for entity details
* refactor(cli): use dialect display parsing for warehouse catalog
* refactor(cli): use dialect SQL in relationship workflows
* test(cli): verify solid dialect scan workflow closure
* test: split cli tests from source tree
* refactor(cli): standardize BigQuery scope listing
* feat(sqlite): implement connector scope listing
* test(connectors): cover required table listing
* feat(cli): add warehouse driver registry
* refactor(setup): route scope discovery through driver registry
* refactor(cli): route local query execution through driver registry
* refactor(historic-sql): route dialect support through driver registry
* refactor(cli): test warehouse connections through driver registry
* fix(cli): close driver registry type export gaps
* Improve setup daemon diagnostics
* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback
Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match
The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.
Align the picker boundary with the canonical 3-level KtxTableRef:
- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
(resolveEnabledTables already accepts the 3-part shape) and
schemasFromEnabledTables now goes through parseDottedTableEntry so it
recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
reuse.
Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).
* fix(cli): allow debug telemetry under opt-out env
2026-05-26 08:49:05 +02:00
|
|
|
|
} from '../../../src/context/scan/local-enrichment.js';
|
2026-05-10 23:12:26 +02:00
|
|
|
|
import {
|
2026-05-10 23:51:24 +02:00
|
|
|
|
createKtxConnectorCapabilities,
|
|
|
|
|
|
type KtxQueryResult,
|
|
|
|
|
|
type KtxReadOnlyQueryInput,
|
2026-05-19 16:40:01 +02:00
|
|
|
|
type KtxEmbeddingPort,
|
2026-05-10 23:51:24 +02:00
|
|
|
|
type KtxScanConnector,
|
|
|
|
|
|
type KtxScanContext,
|
|
|
|
|
|
type KtxSchemaSnapshot,
|
test: split cli tests from source tree (#216)
* feat(cli): define full warehouse dialect contract
* test(cli): keep dialect edge tests focused
* fix(cli): stabilize dialect contract foundation
* refactor(connectors): own read-only query preparation
* refactor(connectors): resolve dialects through registry
* refactor(connectors): keep concrete dialect classes internal
* chore(workspace): enforce dialect import boundary
* refactor(cli): resolve relationship dialect at scan boundary
* refactor(cli): use dialect display parsing for entity details
* refactor(cli): use dialect display parsing for warehouse catalog
* refactor(cli): use dialect SQL in relationship workflows
* test(cli): verify solid dialect scan workflow closure
* test: split cli tests from source tree
* refactor(cli): standardize BigQuery scope listing
* feat(sqlite): implement connector scope listing
* test(connectors): cover required table listing
* feat(cli): add warehouse driver registry
* refactor(setup): route scope discovery through driver registry
* refactor(cli): route local query execution through driver registry
* refactor(historic-sql): route dialect support through driver registry
* refactor(cli): test warehouse connections through driver registry
* fix(cli): close driver registry type export gaps
* Improve setup daemon diagnostics
* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback
Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match
The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.
Align the picker boundary with the canonical 3-level KtxTableRef:
- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
(resolveEnabledTables already accepts the 3-part shape) and
schemasFromEnabledTables now goes through parseDottedTableEntry so it
recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
reuse.
Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).
* fix(cli): allow debug telemetry under opt-out env
2026-05-26 08:49:05 +02:00
|
|
|
|
} from '../../../src/context/scan/types.js';
|
2026-05-10 23:12:26 +02:00
|
|
|
|
|
2026-05-19 16:40:01 +02:00
|
|
|
|
/**
 * Builds a deterministic embedding port for scan tests.
 *
 * The returned port reports `options.dimensions` as its dimensionality and
 * `options.maxBatchSize` as its batch limit (64 when the option is nullish).
 * `embedBatch` ignores the text content entirely: the vector produced for the
 * text at batch position `t` holds the value `t + d` at dimension `d`.
 *
 * @param options.dimensions Number of entries in every generated vector.
 * @param options.maxBatchSize Optional batch limit advertised by the port.
 * @returns An embedding port whose output depends only on batch position.
 */
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
  // One vector per batch position; each entry is batch position + dimension index.
  const positionalVector = (textIndex: number): number[] =>
    Array.from({ length: options.dimensions }, (_unused, dimensionIndex) => textIndex + dimensionIndex);

  return {
    dimensions: options.dimensions,
    maxBatchSize: options.maxBatchSize ?? 64,
    async embedBatch(texts) {
      return texts.map((_text, textIndex) => positionalVector(textIndex));
    },
  };
}
|
|
|
|
|
|
|
2026-05-10 23:51:24 +02:00
|
|
|
|
/**
 * Shared schema snapshot fixture for the `warehouse` connection (postgres driver).
 *
 * It describes two tables in the `public` schema, `customers` and `orders`.
 * Neither table declares foreign keys, and every column is a non-nullable
 * integer carrying a comment. `orders.customer_id` is the only column that is
 * not a primary key.
 */
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    // public.customers: a single primary-key column.
    {
      catalog: null,
      db: 'public',
      name: 'customers',
      kind: 'table',
      comment: 'Customer accounts',
      estimatedRows: 2,
      foreignKeys: [],
      columns: [
        { name: 'id', nativeType: 'integer', normalizedType: 'integer', dimensionType: 'number', nullable: false, primaryKey: true, comment: 'Customer id' },
      ],
    },
    // public.orders: primary key plus a customer reference column (no declared FK).
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: 3,
      foreignKeys: [],
      columns: [
        { name: 'id', nativeType: 'integer', normalizedType: 'integer', dimensionType: 'number', nullable: false, primaryKey: true, comment: 'Order id' },
        { name: 'customer_id', nativeType: 'integer', normalizedType: 'integer', dimensionType: 'number', nullable: false, primaryKey: false, comment: 'Customer id' },
      ],
    },
  ],
};
|
|
|
|
|
|
|
2026-05-10 23:51:24 +02:00
|
|
|
|
/**
 * Creates a fake postgres scan connector (`test:warehouse`) built from
 * `vi.fn` mocks so tests can assert on how it was called.
 *
 * - `introspect` resolves to the shared `snapshot` fixture.
 * - `listSchemas` and `listTables` resolve to empty arrays.
 * - `sampleTable` resolves to a single `[1, 10]` row under the headers
 *   `id` / `customer_id`.
 * - `sampleColumn` resolves to two distinct values with no nulls.
 *
 * Every call returns a fresh connector with fresh mocks.
 */
function connector(): KtxScanConnector {
  // Sampling, read-only SQL and column stats are all switched on.
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    readOnlySql: true,
    columnStats: true,
  });

  return {
    id: 'test:warehouse',
    driver: 'postgres',
    capabilities,
    introspect: vi.fn(async () => {
      return snapshot;
    }),
    listSchemas: vi.fn(async () => {
      return [];
    }),
    listTables: vi.fn(async () => {
      return [];
    }),
    sampleTable: vi.fn(async () => {
      return {
        headers: ['id', 'customer_id'],
        rows: [[1, 10]],
        totalRows: 1,
      };
    }),
    sampleColumn: vi.fn(async () => {
      return {
        values: ['10', '11'],
        nullCount: 0,
        distinctCount: 2,
      };
    }),
  };
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Test double that runs read-only queries against an in-memory SQLite
 * database (better-sqlite3) and reports them in the `KtxQueryResult` shape.
 */
class InMemorySqliteExecutor {
  /** Backing in-memory database; public, presumably so tests can seed it directly. */
  readonly db = new Database(':memory:');

  /**
   * Executes `input.sql` and returns every row.
   *
   * Headers are taken from the prepared statement's column metadata rather
   * than from the first row, so an empty result set still reports its column
   * names. Rows are read in raw (array) mode, which keeps values aligned with
   * the headers even when a query yields duplicate or integer-like column names.
   *
   * Declared `async` so that invalid SQL surfaces as a rejected promise
   * instead of a synchronous throw from a Promise-returning method.
   *
   * @param input Query input; only `input.sql` is used.
   * @param _ctx Scan context (unused).
   * @returns Headers, rows as value arrays, and the row count (reported as
   *   both `totalRows` and `rowCount`).
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const statement = this.db.prepare(input.sql);
    const headers = statement.columns().map((column) => column.name);
    const rows = statement.raw(true).all() as unknown[][];
    return {
      headers,
      rows,
      totalRows: rows.length,
      rowCount: rows.length,
    };
  }

  /** Closes the underlying database connection. */
  close(): void {
    this.db.close();
  }
}
|
|
|
|
|
|
|
2026-05-10 23:51:24 +02:00
|
|
|
|
/**
 * Returns a fresh sqlite snapshot for the `warehouse` connection containing
 * two tables, `accounts` and `orders`, with no catalog, no db, no comments,
 * no primary keys and no declared foreign keys.
 *
 * All columns are non-nullable `INTEGER`s: `accounts.id`, `orders.id` and
 * `orders.account_id`. A new object graph is built on every call, so callers
 * may freely derive modified copies from the result.
 */
function noDeclaredRelationshipSnapshot(): KtxSchemaSnapshot {
  type SnapshotTable = KtxSchemaSnapshot['tables'][number];
  type SnapshotColumn = SnapshotTable['columns'][number];

  // Every column in this fixture differs only by name.
  const integerColumn = (name: string): SnapshotColumn => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  // Tables differ only by name, estimated row count and column names.
  const bareTable = (name: string, estimatedRows: number, columnNames: string[]): SnapshotTable => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns: columnNames.map((columnName) => integerColumn(columnName)),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [bareTable('accounts', 2, ['id']), bareTable('orders', 3, ['id', 'account_id'])],
  };
}
|
|
|
|
|
|
|
2026-05-10 23:51:24 +02:00
|
|
|
|
/**
 * Creates an in-memory enrichment state store for tests.
 *
 * Stage records are held in a `Map` keyed by `"<runId>:<stage>"`, so saving a
 * stage again for the same run replaces the earlier record regardless of
 * whether it was completed or failed.
 */
function memoryEnrichmentStateStore(): KtxScanEnrichmentStateStore {
  type StoredStage = KtxScanEnrichmentCompletedStage | KtxScanEnrichmentFailedStage;

  const stagesByKey = new Map<string, StoredStage>();
  const stageKey = (ref: Pick<KtxScanEnrichmentStageLookup, 'runId' | 'stage'>) => `${ref.runId}:${ref.stage}`;

  return {
    // Yields the stored stage only when it completed with a matching input hash; otherwise null.
    async findCompletedStage<TOutput>(input: KtxScanEnrichmentStageLookup) {
      const stored = stagesByKey.get(stageKey(input));
      if (stored?.status === 'completed' && stored.inputHash === input.inputHash) {
        return stored as KtxScanEnrichmentCompletedStage<TOutput>;
      }
      return null;
    },

    // Records the stage as completed, forcing `errorMessage` to null.
    async saveCompletedStage(input) {
      stagesByKey.set(stageKey(input), {
        ...input,
        status: 'completed',
        errorMessage: null,
      });
    },

    // Records the stage as failed, forcing `output` to null.
    async saveFailedStage(input) {
      stagesByKey.set(stageKey(input), {
        ...input,
        status: 'failed',
        output: null,
      });
    },

    // Lists every stored stage of the given run, in insertion order.
    async listRunStages(runId) {
      const runStages: StoredStage[] = [];
      for (const stored of stagesByKey.values()) {
        if (stored.runId === runId) {
          runStages.push(stored);
        }
      }
      return runStages;
    },
  };
}
|
|
|
|
|
|
|
|
|
|
|
|
describe('local scan enrichment', () => {
|
|
|
|
|
|
it('maps a scan snapshot into relationship detector schema', () => {
|
2026-05-10 23:51:24 +02:00
|
|
|
|
const schema = snapshotToKtxEnrichedSchema(snapshot);
|
2026-05-10 23:12:26 +02:00
|
|
|
|
|
|
|
|
|
|
expect(schema.connectionId).toBe('warehouse');
|
|
|
|
|
|
expect(schema.tables).toHaveLength(2);
|
|
|
|
|
|
expect(schema.tables[1]?.columns.map((column) => column.name)).toEqual(['id', 'customer_id']);
|
|
|
|
|
|
expect(schema.tables[1]?.columns[1]).toMatchObject({
|
|
|
|
|
|
id: 'public.orders.customer_id',
|
|
|
|
|
|
tableId: 'public.orders',
|
|
|
|
|
|
primaryKey: false,
|
|
|
|
|
|
sampleValues: null,
|
|
|
|
|
|
embedding: null,
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
it('maps snapshot foreign keys into formal schema relationships', () => {
|
|
|
|
|
|
const source = noDeclaredRelationshipSnapshot();
|
|
|
|
|
|
const snapshotWithForeignKey = {
|
|
|
|
|
|
...source,
|
|
|
|
|
|
tables: source.tables.map((table) =>
|
|
|
|
|
|
table.name === 'orders'
|
|
|
|
|
|
? {
|
|
|
|
|
|
...table,
|
|
|
|
|
|
foreignKeys: [
|
|
|
|
|
|
{
|
|
|
|
|
|
fromColumn: 'account_id',
|
|
|
|
|
|
toCatalog: null,
|
|
|
|
|
|
toDb: null,
|
|
|
|
|
|
toTable: 'accounts',
|
|
|
|
|
|
toColumn: 'id',
|
|
|
|
|
|
constraintName: 'orders_account_id_fkey',
|
|
|
|
|
|
},
|
|
|
|
|
|
],
|
|
|
|
|
|
}
|
|
|
|
|
|
: table.name === 'accounts'
|
|
|
|
|
|
? {
|
|
|
|
|
|
...table,
|
|
|
|
|
|
columns: table.columns.map((column) =>
|
|
|
|
|
|
column.name === 'id' ? { ...column, primaryKey: true } : column,
|
|
|
|
|
|
),
|
|
|
|
|
|
}
|
|
|
|
|
|
: table,
|
|
|
|
|
|
),
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2026-05-10 23:51:24 +02:00
|
|
|
|
const schema = snapshotToKtxEnrichedSchema(snapshotWithForeignKey);
|
2026-05-10 23:12:26 +02:00
|
|
|
|
|
|
|
|
|
|
expect(schema.relationships).toEqual([
|
|
|
|
|
|
{
|
|
|
|
|
|
id: 'orders:(orders.account_id)->accounts:(accounts.id)',
|
|
|
|
|
|
source: 'formal',
|
|
|
|
|
|
from: {
|
|
|
|
|
|
tableId: 'orders',
|
|
|
|
|
|
columnIds: ['orders.account_id'],
|
|
|
|
|
|
table: { catalog: null, db: null, name: 'orders' },
|
|
|
|
|
|
columns: ['account_id'],
|
|
|
|
|
|
},
|
|
|
|
|
|
to: {
|
|
|
|
|
|
tableId: 'accounts',
|
|
|
|
|
|
columnIds: ['accounts.id'],
|
|
|
|
|
|
table: { catalog: null, db: null, name: 'accounts' },
|
|
|
|
|
|
columns: ['id'],
|
|
|
|
|
|
},
|
|
|
|
|
|
relationshipType: 'many_to_one',
|
|
|
|
|
|
confidence: 1,
|
|
|
|
|
|
isPrimaryKeyReference: true,
|
|
|
|
|
|
},
|
|
|
|
|
|
]);
|
|
|
|
|
|
});
|
|
|
|
|
|
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
|
it('uses the supplied snapshot without calling connector.introspect', async () => {
  // A snapshot is passed in explicitly, so the connector's introspect spy must stay untouched.
  const suppliedConnector = connector();
  const introspectSpy = vi.mocked(suppliedConnector.introspect);

  const { snapshot: returnedSnapshot } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'structural',
    connector: suppliedConnector,
    snapshot,
    context: { runId: 'scan-run-snapshot' },
    providers: null,
  });

  // The result carries the same snapshot that was supplied.
  expect(returnedSnapshot).toEqual(snapshot);
  expect(introspectSpy).not.toHaveBeenCalled();
});
|
|
|
|
|
|
|
|
|
|
|
|
it('falls back to connector.introspect when no snapshot is supplied', async () => {
  // No `snapshot` option here: the run is expected to call introspect exactly once.
  const introspectingConnector = connector();

  const { snapshot: returnedSnapshot } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'structural',
    connector: introspectingConnector,
    context: { runId: 'scan-run-introspect' },
    providers: null,
  });

  expect(returnedSnapshot).toEqual(snapshot);
  expect(introspectingConnector.introspect).toHaveBeenCalledTimes(1);
});
|
|
|
|
|
|
|
test: split cli tests from source tree (#216)
* feat(cli): define full warehouse dialect contract
* test(cli): keep dialect edge tests focused
* fix(cli): stabilize dialect contract foundation
* refactor(connectors): own read-only query preparation
* refactor(connectors): resolve dialects through registry
* refactor(connectors): keep concrete dialect classes internal
* chore(workspace): enforce dialect import boundary
* refactor(cli): resolve relationship dialect at scan boundary
* refactor(cli): use dialect display parsing for entity details
* refactor(cli): use dialect display parsing for warehouse catalog
* refactor(cli): use dialect SQL in relationship workflows
* test(cli): verify solid dialect scan workflow closure
* test: split cli tests from source tree
* refactor(cli): standardize BigQuery scope listing
* feat(sqlite): implement connector scope listing
* test(connectors): cover required table listing
* feat(cli): add warehouse driver registry
* refactor(setup): route scope discovery through driver registry
* refactor(cli): route local query execution through driver registry
* refactor(historic-sql): route dialect support through driver registry
* refactor(cli): test warehouse connections through driver registry
* fix(cli): close driver registry type export gaps
* Improve setup daemon diagnostics
* refactor(setup): centralize rail-prefixed diagnostics + query-history fallback
Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput
into clack.ts so the setup wizard, managed daemons, and embedding/agent steps
share one rail-formatted writer. setup-databases.ts also adds a
"disable query history and retry" option when the schema-context build fails
and query history is the likely culprit, surfaced via a new
failed-query-history-unavailable status.
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match
The setup picker's KtxTableListEntry was a 2-level { schema, name }, so
qualifiedTableId always wrote db.name into enabled_tables. When BigQuery,
Snowflake, or SQL Server later ran fast ingest, their introspect step filtered
the scope set with scopedTableNames(scope, { catalog: projectId|database, db })
— catalog was non-null on the introspect side but null in the scope refs, so
every entry was rejected, the live-database adapter staged zero table files,
and detect() failed with 'Adapter "live-database" did not recognize fetched
source output'.
Align the picker boundary with the canonical 3-level KtxTableRef:
- Add catalog: string | null to KtxTableListEntry.
- BigQuery/Snowflake/SQL Server listTables populate catalog from the
resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null.
- qualifiedTableId emits catalog.schema.name when catalog is non-null
(resolveEnabledTables already accepts the 3-part shape) and
schemasFromEnabledTables now goes through parseDottedTableEntry so it
recovers the schema correctly from both 2-part and 3-part entries.
- Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker
reuse.
Update listTables expectations in all seven connector tests and the setup /
picker test fixtures. Add a picker regression test that covers the
catalog-bearing round-trip (save + refine).
* fix(cli): allow debug telemetry under opt-out env
2026-05-26 08:49:05 +02:00
|
|
|
|
it('fails when connector driver and snapshot driver differ', async () => {
  // Override only the driver so the connector disagrees with the supplied snapshot.
  const mysqlConnector: KtxScanConnector = {
    ...connector(),
    driver: 'mysql',
  };
  const expectedMessage =
    'ktx scan connector driver "mysql" does not match snapshot driver "postgres" for connection "warehouse"';

  const pendingScan = runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: mysqlConnector,
    snapshot,
    context: { runId: 'scan-run-driver-mismatch' },
    providers: null,
  });

  await expect(pendingScan).rejects.toThrow(expectedMessage);
});
|
|
|
|
|
|
|
2026-05-10 23:12:26 +02:00
|
|
|
|
it('runs deterministic relationship detection for relationship scans', async () => {
  const { summary, relationships, warnings } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-1' },
    providers: null,
  });

  // Warning asserted below: readOnlySql is advertised but executeReadOnly is not exposed.
  const missingExecutorWarning = {
    code: 'relationship_validation_failed',
    message: 'KTX scan connector advertises readOnlySql but does not expose executeReadOnly',
    recoverable: true,
    metadata: { capability: 'readOnlySql' },
  };

  expect(summary).toMatchObject({
    deterministicRelationships: 'completed',
    llmRelationshipValidation: 'skipped',
    embeddings: 'skipped',
  });
  expect(relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(summary.statisticalValidation).toBe('skipped');
  expect(warnings).toContainEqual(missingExecutorWarning);
});
|
|
|
|
|
|
|
|
|
|
|
|
it('runs relationship discovery with connector SQL evidence', async () => {
  const sqlite = new InMemorySqliteExecutor();
  try {
    // Seed two tables where every orders.account_id value exists in accounts.id.
    sqlite.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts (id) VALUES (1), (2);
      INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
    `);

    // SQLite-flavoured connector that exposes read-only SQL backed by the in-memory executor.
    const sqliteConnector = {
      ...connector(),
      driver: 'sqlite' as const,
      capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
      introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
      executeReadOnly: sqlite.executeReadOnly.bind(sqlite),
    };

    const outcome = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'relationships',
      detectRelationships: true,
      connector: sqliteConnector,
      context: { runId: 'scan-run-relationship-discovery' },
      providers: null,
    });

    // Expected endpoints of the single accepted relationship: orders.account_id -> accounts.id.
    const ordersAccountId = expect.objectContaining({
      table: expect.objectContaining({ name: 'orders' }),
      columns: ['account_id'],
    });
    const accountsId = expect.objectContaining({
      table: expect.objectContaining({ name: 'accounts' }),
      columns: ['id'],
    });

    expect(outcome.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(outcome.summary.statisticalValidation).toBe('completed');
    expect(outcome.relationshipProfile).toMatchObject({ sqlAvailable: true });
    expect(outcome.resolvedRelationships).toEqual([
      expect.objectContaining({
        status: 'accepted',
        from: ordersAccountId,
        to: accountsId,
      }),
    ]);
    expect(outcome.relationshipUpdate?.accepted).toHaveLength(1);
  } finally {
    // Always release the in-memory database, even when an assertion above throws.
    sqlite.close();
  }
});
|
|
|
|
|
|
|
|
|
|
|
|
it('honors scan relationship config when LLM proposals are disabled', async () => {
  const baseProviders = createDeterministicLocalScanEnrichmentProviders();
  const generateObjectSpy = vi.fn();

  // Swap in a spy for generateObject so any LLM proposal call would be recorded.
  const providersWithSpy = {
    ...baseProviders,
    llmRuntime: {
      ...baseProviders.llmRuntime,
      generateObject: generateObjectSpy as never,
    },
  };
  // Project defaults with LLM proposals turned off.
  const llmDisabledSettings = {
    ...buildDefaultKtxProjectConfig().scan.relationships,
    llmProposals: false,
    maxLlmTablesPerBatch: 40,
  };

  const { summary } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-llm-disabled' },
    providers: providersWithSpy,
    relationshipSettings: llmDisabledSettings,
  });

  expect(summary.llmRelationshipValidation).toBe('skipped');
  expect(generateObjectSpy).not.toHaveBeenCalled();
});
|
|
|
|
|
|
|
|
|
|
|
|
it('skips relationship detection when scan relationships are disabled', async () => {
  const {
    summary,
    relationships,
    relationshipUpdate,
    relationshipProfile,
    resolvedRelationships,
  } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    connector: connector(),
    context: { runId: 'disabled-relationships' },
    providers: createDeterministicLocalScanEnrichmentProviders(),
    // Project defaults with the relationship stage switched off.
    relationshipSettings: {
      ...buildDefaultKtxProjectConfig().scan.relationships,
      enabled: false,
    },
  });

  // Every relationship-related stage reports skipped and no relationship artifacts are produced.
  expect(summary.deterministicRelationships).toBe('skipped');
  expect(summary.statisticalValidation).toBe('skipped');
  expect(summary.llmRelationshipValidation).toBe('skipped');
  expect(relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
  expect(relationshipUpdate).toBeNull();
  expect(relationshipProfile).toBeNull();
  expect(resolvedRelationships).toBeNull();
});
|
|
|
|
|
|
|
fix(context): merge overlay columns onto manifest columns by name (#94)
* fix(context): merge overlay columns onto manifest columns by name
composeOverlay was appending overlay columns to the manifest column list,
producing duplicate entries when dbt/metabase overlays declared a column
just to attach descriptions. The duplicates carried no `type`, so the
pydantic SourceDefinition rejected them at semantic-query time and broke
`ktx sl query` for every overlay-backed measure. Now overlay columns
match base columns by name (case-insensitive): same-name entries merge
onto the manifest (overlay fields win, type/role fall back to the base,
descriptions merge per source key) and only new names append.
* refactor(sl): split overlay columns from column_overrides and enforce TS/Python wire contract
Overlay sources now have two distinct collections: `columns:` for computed
columns (requiring `expr` + `type`) and `column_overrides:` for metadata
patches to inherited manifest columns. Composing or loading an overlay that
mixes the two — or references an unknown column — fails with a typed error.
Introduce `ResolvedSemanticLayerSource` / `resolvedSourceSchema` /
`toResolvedWire` as the strict shape sent to the Python engine, and add a
schema contract test that diffs Zod against the Pydantic JSON schema dumped
by `python -m semantic_layer dump-schema`. `SourceDefinition` is now
`extra="forbid"` on the Python side.
`loadAllSources` surfaces per-file load errors instead of swallowing them,
so validation/query paths can report manifest shard parse failures.
* fix(context): make scan description generation resilient and quiet
A transient sampleTable failure during ingest used to take out every
table in a connection: generateTableDescription returned a hardcoded
'Table not found' string into descriptions.ai, and KtxDescriptionGenerator
was constructed without a logger, so the failure left no trail anywhere.
- sampleTable / sampleColumn calls retry 3x with 200/400/800ms backoff,
honouring KtxScanContext.signal via a new KtxAbortedError.
- On retry exhaustion or missing capability, table generation falls back
to a metadata-only prompt built from column name / native type / comment
/ rawDescriptions. The column path follows the same rule -- call the
LLM when any of samples or rawDescriptions are available; skip only
when both are absent.
- Logger is now threaded from KtxScanContext into the generator. Failures
emit structured KtxScanWarning entries (new description_fallback_used
code, plus existing sampling_failed / enrichment_failed /
connector_capability_missing). ktx scan groups warnings by code so a
batch of identical failures collapses to one summary line plus sample.
- Returns null on failure instead of the 'Table not found' sentinel; the
manifest writer's existing guard already skips empty descriptions, so
schema YAML no longer carries misleading text. SCAN_MANAGED_DESCRIPTION_KEYS
already strips stale 'ai' on merge, so existing YAML clears on next run.
Also suppress AI SDK v6 'system in messages' warning: pull system messages
out of KtxMessageBuilder.wrapSimple's output via a new splitKtxSystemMessages
helper and pass them top-level to generateText (preserves cacheControl
providerOptions on the SystemModelMessage). Agent-runner's local
splitSystemPromptMessages dedupes onto the shared helper.
* test(docs): align examples-docs assertions with revamped docs
PR #103 (setup/guide doc revamp) reworded several CLI examples and
connection labels; the assertions in scripts/examples-docs.test.mjs
still referenced the pre-revamp wording and were failing in CI on main.
Update the regexes to match the post-revamp content:
- drop the `--json` flag from the sl-query example expectation
- move the `Driver:` / `Status: ok` probe to the connection reference,
which is where that output now lives (driver id is lowercase
`postgres`, not the display name `PostgreSQL`)
- drop the obsolete `Install \`uv\`...` troubleshooting line
- accept `<connectionId>` everywhere; the docs no longer use the
hyphenated `<connection-id>` form
- match the `warehouse` connection id used in the quickstart instead of
the `postgres-warehouse` id only used in the README and setup ref
* fix(sl): skip TS/Python schema contract test when uv is unavailable
The TypeScript checks CI job does not install uv or Python, so the
module-level `execFileSync('uv', ...)` in schemas.contract.test.ts threw
ENOENT and failed the suite. Wrap the schema dump in a try/catch and
guard the describe block with `describe.skipIf` so the test skips in
environments without uv. Local dev and any CI job that has uv on PATH
still runs the cross-language contract assertion.
2026-05-15 02:11:04 +02:00
|
|
|
|
it('forwards context.logger and emits warnings when sampleTable fails repeatedly', async () => {
  // Connector whose sampleTable rejects on every call.
  const brokenSamplingConnector: KtxScanConnector = {
    ...connector(),
    sampleTable: vi.fn(async () => {
      throw new Error('pool: ECONNRESET');
    }),
  };
  // Spy logger passed through the scan context.
  const loggerSpy = {
    debug: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
  };

  const { warnings } = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: brokenSamplingConnector,
    context: { runId: 'scan-run-warnings', logger: loggerSpy },
    providers: createDeterministicLocalScanEnrichmentProviders(),
  });

  const warningCodes = warnings.map(({ code }) => code);
  expect(warningCodes).toContain('sampling_failed');
  expect(warningCodes).toContain('description_fallback_used');
  expect(warnings.some(({ table }) => table === 'customers')).toBe(true);
  expect(loggerSpy.warn).toHaveBeenCalled();
  expect(loggerSpy.error).toHaveBeenCalled();
  // Each of the two tables produced sampling_failed + description_fallback_used, so 2 + 2 = 4 warnings minimum.
  expect(warnings.length).toBeGreaterThanOrEqual(4);
  // Sampling was retried 3× for each of the 2 tables = 6 calls
  expect(brokenSamplingConnector.sampleTable).toHaveBeenCalledTimes(6);
});
|
|
|
|
|
|
|
2026-05-19 16:40:01 +02:00
|
|
|
|
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
  const enrichment = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-2' },
    providers: createDeterministicLocalScanEnrichmentProviders(),
  });

  // Description stages complete; embeddings are skipped and produce no updates.
  const expectedSummary = {
    dataDictionary: 'completed',
    tableDescriptions: 'completed',
    columnDescriptions: 'completed',
    embeddings: 'skipped',
    deterministicRelationships: 'completed',
  };

  expect(enrichment.summary).toMatchObject(expectedSummary);
  expect(enrichment.embeddingUpdates).toEqual([]);
  expect(enrichment.snapshot).toEqual(snapshot);
  expect(enrichment.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
});
|
|
|
|
|
|
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
|
it('generates batched table descriptions with bounded table-level concurrency', async () => {
|
2026-05-12 14:34:59 +02:00
|
|
|
|
const concurrentSnapshot: KtxSchemaSnapshot = {
|
|
|
|
|
|
...snapshot,
|
|
|
|
|
|
tables: Array.from({ length: 8 }, (_, index) => ({
|
|
|
|
|
|
catalog: null,
|
|
|
|
|
|
db: 'public',
|
|
|
|
|
|
name: `table_${index + 1}`,
|
|
|
|
|
|
kind: 'table' as const,
|
|
|
|
|
|
comment: null,
|
|
|
|
|
|
estimatedRows: 2,
|
|
|
|
|
|
foreignKeys: [],
|
|
|
|
|
|
columns: [
|
|
|
|
|
|
{
|
|
|
|
|
|
name: 'id',
|
|
|
|
|
|
nativeType: 'integer',
|
|
|
|
|
|
normalizedType: 'integer',
|
|
|
|
|
|
dimensionType: 'number' as const,
|
|
|
|
|
|
nullable: false,
|
|
|
|
|
|
primaryKey: true,
|
|
|
|
|
|
comment: null,
|
|
|
|
|
|
},
|
|
|
|
|
|
],
|
|
|
|
|
|
})),
|
|
|
|
|
|
};
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
|
let activeTableSamples = 0;
|
|
|
|
|
|
let maxActiveTableSamples = 0;
|
2026-05-12 14:34:59 +02:00
|
|
|
|
const scanConnector = {
|
|
|
|
|
|
...connector(),
|
|
|
|
|
|
introspect: vi.fn(async () => concurrentSnapshot),
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
|
sampleColumn: vi.fn(async () => ({
|
|
|
|
|
|
values: ['1'],
|
|
|
|
|
|
nullCount: 0,
|
|
|
|
|
|
distinctCount: 1,
|
|
|
|
|
|
})),
|
|
|
|
|
|
sampleTable: vi.fn(async () => {
|
|
|
|
|
|
activeTableSamples += 1;
|
|
|
|
|
|
maxActiveTableSamples = Math.max(maxActiveTableSamples, activeTableSamples);
|
2026-05-12 14:34:59 +02:00
|
|
|
|
await new Promise((resolve) => setTimeout(resolve, 10));
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
|
activeTableSamples -= 1;
|
2026-05-12 14:34:59 +02:00
|
|
|
|
return {
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
|
headers: ['id'],
|
|
|
|
|
|
rows: [[1]],
|
|
|
|
|
|
totalRows: 1,
|
2026-05-12 14:34:59 +02:00
|
|
|
|
};
|
|
|
|
|
|
}),
|
|
|
|
|
|
};
|
|
|
|
|
|
const settings = {
|
2026-05-14 17:39:31 +02:00
|
|
|
|
...buildDefaultKtxProjectConfig().scan.relationships,
|
2026-05-12 14:34:59 +02:00
|
|
|
|
enabled: false,
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
await runLocalScanEnrichment({
|
|
|
|
|
|
connectionId: 'warehouse',
|
|
|
|
|
|
mode: 'enriched',
|
|
|
|
|
|
connector: scanConnector,
|
|
|
|
|
|
context: { runId: 'scan-run-concurrent-descriptions' },
|
2026-05-19 16:40:01 +02:00
|
|
|
|
providers: createDeterministicLocalScanEnrichmentProviders(),
|
2026-05-12 14:34:59 +02:00
|
|
|
|
relationshipSettings: settings,
|
|
|
|
|
|
});
|
|
|
|
|
|
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
|
expect(maxActiveTableSamples).toBe(4);
|
|
|
|
|
|
expect(scanConnector.sampleColumn).not.toHaveBeenCalled();
|
2026-05-12 14:34:59 +02:00
|
|
|
|
});
|
|
|
|
|
|
|
2026-05-10 23:12:26 +02:00
|
|
|
|
// Runs an enriched scan with relationship detection enabled and checks that
// the per-stage progress messages reach the progress reporter.
it('reports enrichment progress for countable stages', async () => {
  type RecordedProgress = { progress: number; message?: string; transient?: boolean };

  // Every update the scan sends is appended here in call order.
  const recorded: RecordedProgress[] = [];

  // startPhase hands back this same reporter, so updates made through a
  // phase handle are appended to the same list.
  const reporter = {
    update: async (value: number, message?: string, options?: { transient?: boolean }): Promise<void> => {
      recorded.push({ progress: value, message, transient: options?.transient });
    },
    startPhase: () => reporter,
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-progress', progress: reporter },
    providers: {
      ...createDeterministicLocalScanEnrichmentProviders(),
      embedding: fakeScanEmbedding({ dimensions: 6 }),
    },
  });

  // Only presence is asserted (arrayContaining), not ordering or exclusivity.
  const expectedEvents = [
    expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
    expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
    expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
    expect.objectContaining({ message: 'Detecting relationships' }),
  ];
  expect(recorded).toEqual(expect.arrayContaining(expectedEvents));
});
|
|
|
|
|
|
|
|
|
|
|
|
// Checks that the 'Loading enrichment schema snapshot' progress message has
// already been recorded by the time the connector's introspect() is invoked.
it('reports progress before enrichment connector introspection starts', async () => {
  type RecordedProgress = { progress: number; message?: string; transient?: boolean };

  // Every update the scan sends is appended here in call order.
  const recorded: RecordedProgress[] = [];

  // startPhase hands back this same reporter, so updates made through a
  // phase handle are appended to the same list.
  const reporter = {
    update: async (value: number, message?: string, options?: { transient?: boolean }): Promise<void> => {
      recorded.push({ progress: value, message, transient: options?.transient });
    },
    startPhase: () => reporter,
  };

  const baseConnector = connector();

  // The ordering assertion runs inside the mock, i.e. at the moment the scan
  // asks the connector for its schema snapshot.
  const introspect = vi.fn(async () => {
    const loadingEvent = expect.objectContaining({ message: 'Loading enrichment schema snapshot' });
    expect(recorded).toContainEqual(loadingEvent);
    return snapshot;
  });

  const scanConnector = { ...baseConnector, introspect };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: scanConnector,
    context: { runId: 'scan-run-progress-before-introspection', progress: reporter },
    providers: null,
  });

  // Guards against the in-mock assertion being skipped because introspect
  // was never called.
  expect(scanConnector.introspect).toHaveBeenCalled();
});
|
|
|
|
|
|
|
|
|
|
|
|
// Feeds a single five-column table through enrichment with an embedding provider that
// declares maxBatchSize 2 and rejects larger batches, then checks the batch sizes used.
it('splits enrichment embedding requests by provider batch size', async () => {
  // Five integer metric columns, named and commented by their 1-based position.
  const metricColumns = Array.from({ length: 5 }, (_, index) => {
    const ordinal = index + 1;
    return {
      name: `metric_${ordinal}`,
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number' as const,
      nullable: false,
      primaryKey: false,
      comment: `Metric ${ordinal}`,
    };
  });
  const wideOrdersTable = {
    catalog: null,
    db: 'public',
    name: 'wide_orders',
    kind: 'table',
    comment: 'Wide order facts',
    estimatedRows: 3,
    foreignKeys: [],
    columns: metricColumns,
  };
  const manyColumnSnapshot: KtxSchemaSnapshot = { ...snapshot, tables: [wideOrdersTable] };

  const scanConnector = {
    ...connector(),
    introspect: vi.fn(async () => manyColumnSnapshot),
  };
  const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();

  // Fake embedder: fails loudly on any oversized batch, otherwise returns one 3-value vector per text.
  const embedBatch = vi.fn(async (texts: string[]) => {
    const withinLimit = texts.length <= 2;
    if (!withinLimit) {
      throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
    }
    return texts.map((_, index) => [index, index + 1, index + 2]);
  });

  const result = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: scanConnector,
    context: { runId: 'scan-run-batched-embeddings' },
    providers: {
      llmRuntime: deterministicProviders.llmRuntime,
      embedding: { dimensions: 3, maxBatchSize: 2, embedBatch },
    },
  });

  const requestedBatchSizes = embedBatch.mock.calls.map(([texts]) => texts.length);

  expect(result.embeddingUpdates).toHaveLength(5);
  expect(requestedBatchSizes).toEqual([2, 2, 1]);
});
|
|
|
|
|
|
|
|
|
|
|
|
// Runs the same enrichment twice against one state store with identical run id, sync id,
// connector and providers, and checks the second run resumes every stage without calling
// the LLM runtime or the embedding provider again.
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const scanConnector = connector();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };

  // Both runs use identical arguments; a fresh argument object is built for each call.
  const runEnrichment = () =>
    runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: true,
      connector: scanConnector,
      context: { runId: 'scan-run-resume-1' },
      providers,
      stateStore,
      syncId: 'sync-resume-1',
      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
    });

  const first = await runEnrichment();

  // Spies are installed only after the first run, so they observe the second run alone.
  const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');
  const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');

  const second = await runEnrichment();

  const allStages = ['descriptions', 'embeddings', 'relationships'];

  expect(first.state.completedStages).toEqual(allStages);
  expect(first.state.resumedStages).toEqual([]);
  expect(second.state.resumedStages).toEqual(allStages);
  expect(second.state.completedStages).toEqual(allStages);
  expect(generateObject).not.toHaveBeenCalled();
  expect(embedBatch).not.toHaveBeenCalled();
  expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
  expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
  expect(second.relationships).toEqual(first.relationships);
});
|
|
|
|
|
|
|
|
|
|
|
|
// Runs enrichment twice with the same run id, sync id and state store, but the second
// connector introspects a snapshot whose only table is renamed; nothing may be resumed
// and the LLM runtime must be called again.
it('does not reuse completed stages when the snapshot changes', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };
  const scanConnector = connector();

  // Everything except the connector is shared between the two runs.
  const runWith = (activeConnector: typeof scanConnector) =>
    runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: activeConnector,
      context: { runId: 'scan-run-resume-hash' },
      providers,
      stateStore,
      syncId: 'sync-resume-hash',
      providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
    });

  await runWith(scanConnector);

  const [firstTable] = snapshot.tables;
  if (!firstTable) {
    throw new Error('Expected test snapshot table');
  }

  // Same connector, except introspection reports a single table renamed to 'customers'.
  const changedConnector = {
    ...connector(),
    introspect: vi.fn(async () => {
      const renamedTable = { ...firstTable, name: 'customers' };
      return { ...snapshot, tables: [renamedTable] };
    }),
  };

  // Installed after the first run so only the second run's LLM calls are counted.
  const generateObject = vi.spyOn(providers.llmRuntime, 'generateObject');

  const result = await runWith(changedConnector);

  expect(result.state.resumedStages).toEqual([]);
  expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
  expect(generateObject).toHaveBeenCalled();
});
|
|
|
|
|
|
|
|
|
|
|
|
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
  // Scenario: an 'enriched' scan is requested with `providers: null`.
  // The assertions below pin that description/embedding stages report
  // 'skipped' while relationship discovery still completes against SQLite.
  const sqlite = new InMemorySqliteExecutor();
  try {
    // Fixture: every orders.account_id value (1, 1, 2) exists in accounts.id,
    // and the snapshot handed to the scan declares no relationships up front.
    sqlite.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts (id) VALUES (1), (2);
      INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
    `);

    // Base test connector, overridden to behave as a read-only SQLite source
    // whose queries are served by the in-memory executor.
    const providerlessConnector = {
      ...connector(),
      driver: 'sqlite' as const,
      capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
      introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
      executeReadOnly: sqlite.executeReadOnly.bind(sqlite),
    };

    const enrichment = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: providerlessConnector,
      context: { runId: 'scan-run-providerless-enriched' },
      providers: null,
    });

    // Per-stage outcome: only the two relationship stages that need no
    // provider are expected to complete.
    const expectedSummary = {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'completed',
    };
    expect(enrichment.summary).toEqual(expectedSummary);

    // No description or embedding output is produced.
    expect(enrichment.descriptionUpdates).toEqual([]);
    expect(enrichment.embeddingUpdates).toEqual([]);

    // Exactly one relationship is accepted: orders.account_id -> accounts.id.
    const expectedCounts = { accepted: 1, review: 0, rejected: 0, skipped: 0 };
    expect(enrichment.relationships).toEqual(expectedCounts);
    expect(enrichment.relationshipUpdate?.accepted).toHaveLength(1);
    expect(enrichment.relationshipProfile).toMatchObject({ sqlAvailable: true });

    const ordersAccountId = expect.objectContaining({
      table: expect.objectContaining({ name: 'orders' }),
      columns: ['account_id'],
    });
    const accountsId = expect.objectContaining({
      table: expect.objectContaining({ name: 'accounts' }),
      columns: ['id'],
    });
    expect(enrichment.resolvedRelationships).toEqual([
      expect.objectContaining({
        status: 'accepted',
        from: ordersAccountId,
        to: accountsId,
      }),
    ]);

    // The skipped stages are reported through a recoverable warning.
    const backendNotConfiguredWarning = {
      code: 'scan_enrichment_backend_not_configured',
      message:
        'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
      recoverable: true,
      metadata: {
        skippedStages: ['descriptions', 'embeddings'],
        relationshipDetection: true,
      },
    };
    expect(enrichment.warnings).toContainEqual(backendNotConfiguredWarning);
  } finally {
    // Release the in-memory database even when an assertion above throws.
    sqlite.close();
  }
});
|
|
|
|
|
|
|
|
|
|
|
|
});
|