2026-05-10 23:12:26 +02:00
|
|
|
import { describe, expect, it, vi } from 'vitest';
|
chore(workspace): gate dead-code with knip production mode (#196)
* refactor(workspace): relocate @ktx/llm source into packages/cli/src/llm
* refactor(workspace): rewrite @ktx/llm imports to relative paths
* refactor(workspace): fold internal packages into cli
* chore(workspace): gate dead-code with knip production mode
Turn on production-mode knip plus an autofix run in pre-commit and the
`pnpm dead-code` script, document the `/** @internal */` convention for
test-only exports in AGENTS.md, annotate test-only exports across the
CLI with that JSDoc, and drop dead exports/wrappers the new gate
surfaced (e.g. `cli-project.ts`, `lookerRuntimeSourceToFileAdapterSource`,
`createLocalScanEnrichmentProvidersFromConfig`,
`PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES`, stale type re-exports).
Replace the loose `ignoreIssues` allowlist in `knip.json` with explicit
production entries so cross-package barrel leaks are caught.
* refactor(cli): delete internal barrel index.ts files
The 34 `index.ts` re-export barrels inside `packages/cli/src/` were
holdovers from the pre-fold multi-workspace structure. Post-fold-in they
served no production purpose: external consumers go through the single
package main entry, and in-repo callers mostly imported through them
only because the path was short. Internally, knip flagged most barrel
re-exports as production-dead (only reached via tests).
This change:
- Deletes every internal barrel except `packages/cli/src/index.ts`
(the published package entry).
- Rewrites ~270 source/test files to import each name directly from
the file that defines it.
- Moves `tools/warehouse-verification/index.ts` to
`create-warehouse-verification-tools.ts` (the function it defined
locally) and updates its single consumer.
- Renames `search/backend-conformance.ts` → `.test-utils.ts` to match
the existing test-helper file convention.
- Deletes 13 dead test-only chains (dbt-descriptions/*,
live-database/extracted-schema, live-database/structural-sync,
relationship-* feedback/review chain) plus their tests and a
cascading orphan integration test.
- Updates test mocks that pointed at deleted barrel paths
(notion-client, connector barrels in scan/local-scan-connectors
tests) to mock the source files instead.
- Points the maintainer benchmark script
(`scripts/relationship-benchmark-report.mjs`) at source files
instead of `dist/context/scan/index.js`.
- Drops the barrel `!` entries from `knip.json`; adds explicit
production entries only for the benchmark code reached via dist by
the maintainer script.
Net: 413 files changed, ~1.2k insertions, ~9.4k deletions.
`pnpm run dead-code` (Biome + knip default + knip production) and
`pnpm run type-check` are clean; 2277 tests pass.
* refactor(workspace): rename @ktx/cli to @kaelio/ktx and pack it directly
Promote the CLI workspace package to the public name `@kaelio/ktx` and
drop the separate `scripts/build-public-npm-package.mjs` wrapper. The
CLI package is now publishable in place (`publishConfig.access: public`,
`provenance: true`), so artifact packing uses `pnpm pack` against
`packages/cli/` instead of assembling a parallel package tree.
Updates all workspace filter invocations, docs, tests, and release
readiness checks to reference the new package name, and folds the
tarball-name helper into `scripts/public-npm-release-metadata.mjs`.
* docs: align "agent clients" and "data agents" terminology
Replace "client agents" with "agent clients" and "database agents" with
"data agents" across AGENTS.md, README.md, the docs-site copy, and the
matching setup-agents test description, matching the canonical
vocabulary in docs/terminology.md.
Also moves packages/cli/tsconfig.json's tsBuildInfoFile from
node_modules/.cache/ to dist/.tsbuildinfo so incremental builds survive
node_modules reinstalls.
* refactor(release): single source of truth for package version
Make packages/cli/package.json the single source of truth for the
@kaelio/ktx version. publicNpmPackageVersion() now reads it directly,
so artifact filenames, release-readiness checks, and the Python wheel
version all derive from one field. The duplicate
release-policy.json.publicNpmPackageVersion is removed.
Previously the two fields could drift: tarballs were named
kaelio-ktx-0.4.1.tgz while internally containing
@kaelio/ktx@0.0.0-private.
- update-public-release-version.mjs rewrites both Python pyproject.toml
files (ktx-daemon, ktx-sl) alongside the npm package.jsons,
normalizing the version for PEP 440 (e.g. 0.1.0-rc.2 -> 0.1.0rc2).
- semantic-release-config.cjs adds the two pyproject.toml files to
@semantic-release/git assets so the release commit back to main
carries every version source in lockstep.
- The six "?? '0.0.0-private'" fallback literals across the CLI are
replaced with "?? getKtxCliPackageInfo().version", and
createDefaultKtxMcpServer makes its version arg required.
- docs/release.md describes the actual commit-back model: the dev tree
always reflects the most recent release; no sentinel pin to
maintain.
Verified: pnpm run artifacts:build now produces
kaelio-ktx-0.4.1.tgz and kaelio_ktx-0.4.1-py3-none-any.whl with
@kaelio/ktx@0.4.1 inside. Full type-check, dead-code, and
2287 vitests + 173 script tests pass.
* refactor(cli): inject embedding provider resolution and detect sentence-transformers runtime
Make resolveProjectEmbeddingProvider and runtimeIo injectable in ingest and
scan command entrypoints so tests can stub them, and teach
resolvePublicIngestRuntimeRequirements to flag the local-embeddings runtime
feature when ktx.yaml selects sentence-transformers.
* chore(cli): mark buildLocalStatsStatus and LocalStatsStatus as @internal
Both symbols are consumed only by status-project.test.ts. Annotating with
/** @internal */ keeps knip's production-mode check clean without changing
runtime behavior.
* fix(cli): use real package metadata in print-command-tree
The stubbed package name embedded a forbidden product identifier that
tripped the boundary check in CI. Read the metadata from package.json
instead — keeps the rendered tree unchanged and removes a duplicate
source of truth.
* feat(cli): show embedding coverage in `ktx status`, drop duplicate disk counts
Inline `(N embedded)` next to the Wiki scope counts and Semantic-layer
source counts, computed with `SUM(embedding_json IS NOT NULL)` over
`knowledge_pages` and `local_sl_sources`. Rename the "Knowledge" label to
"Wiki" (canonical per `docs/terminology.md`) and rename the matching
`localStats.knowledgePages` field to `localStats.wikiPages`.
Drop `wiki=N md` and `semantic-layer=N yaml` from the Disk row — those
duplicated the per-surface rows above. Disk now reports only actual byte
usage (db, cache, raw-sources). The unused `wikiGlobalMarkdownCount` /
`semanticLayerYamlCount` fields, the `isMarkdownEntry` / `isYamlEntry`
helpers, and the `filter` arg on `summarizeDir` are removed.
2026-05-21 15:28:58 +02:00
|
|
|
import { createPostgresLiveDatabaseIntrospection } from '../../connectors/postgres/live-database-introspection.js';
|
|
|
|
|
import { isKtxPostgresConnectionConfig, KtxPostgresScanConnector, postgresPoolConfigFromConfig, type KtxPostgresPoolFactory } from '../../connectors/postgres/connector.js';
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
import { tableRefSet } from '../../context/scan/table-ref.js';
|
2026-05-10 23:12:26 +02:00
|
|
|
|
|
|
|
|
/**
 * Canned query response handed back by the fake pool client used in these tests.
 */
interface FakeQueryResult {
  /** Result rows, each keyed by column name. */
  rows: Record<string, unknown>[];
  /** Optional column descriptors: the column name plus its numeric type id. */
  fields?: Array<{ name: string; dataTypeID: number }>;
}
|
|
|
|
|
|
2026-05-10 23:51:24 +02:00
|
|
|
/**
 * Builds a fake `KtxPostgresPoolFactory` whose clients answer queries from a canned lookup table.
 *
 * Incoming SQL is whitespace-normalized (runs of whitespace collapsed to one space, then trimmed)
 * and compared against the keys of `results` by substring, in Map insertion order; the first key
 * contained in the SQL wins. Every client returned by `connect()` shares the same `query` mock.
 *
 * @param results - Map from SQL fragment to the result that should be returned for it.
 * @returns A pool factory exposing `createPool()` → `{ connect, end }`.
 *
 * The `query` mock rejects with `Unexpected SQL: …` (including the normalized SQL and the
 * JSON-encoded params) when no key matches, so unplanned queries fail loudly.
 */
function fakePoolFactory(results: Map<string, FakeQueryResult>): KtxPostgresPoolFactory {
  const query = vi.fn(async (sql: string, params?: unknown[]) => {
    // Normalize so multi-line SQL in the connector still matches single-line keys.
    const normalized = sql.replace(/\s+/g, ' ').trim();
    for (const [key, value] of results.entries()) {
      if (normalized.includes(key)) {
        return value;
      }
    }
    throw new Error(`Unexpected SQL: ${normalized} params=${JSON.stringify(params ?? [])}`);
  });
  return {
    createPool() {
      return {
        async connect() {
          return {
            query,
            release: vi.fn(),
          };
        },
        end: vi.fn(async () => undefined),
      };
    },
  };
}
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the canned SQL-fragment → result table shared by the connector tests.
 *
 * The fixture describes a `public` schema with two tables (`customers`, `orders`) and one view
 * (`recent_orders`), a single foreign key from `orders.customer_id` to `customers.id`, and
 * primary keys on both tables' `id` columns. It also carries responses for the sampling,
 * distinct-value, row-count, statistics, connectivity (`SELECT 1`), and schema-listing queries.
 *
 * A fresh Map is built on every call, so tests cannot leak mutations into each other.
 *
 * @returns Map keyed by SQL fragment; the tests pass it to `fakePoolFactory`.
 */
function metadataResults(): Map<string, FakeQueryResult> {
  return new Map<string, FakeQueryResult>([
    // Table/view listing.
    [
      'FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n',
      {
        rows: [
          { table_name: 'customers', table_kind: 'r', row_count: '2', table_comment: 'Customers' },
          { table_name: 'orders', table_kind: 'r', row_count: '3', table_comment: null },
          { table_name: 'recent_orders', table_kind: 'v', row_count: '0', table_comment: 'Recent orders' },
        ],
      },
    ],
    // Column listing.
    [
      'FROM pg_catalog.pg_attribute a JOIN pg_catalog.pg_class c',
      {
        rows: [
          { table_name: 'customers', column_name: 'id', data_type: 'integer', is_nullable: false, column_comment: null },
          { table_name: 'customers', column_name: 'name', data_type: 'text', is_nullable: false, column_comment: 'Name' },
          { table_name: 'orders', column_name: 'id', data_type: 'integer', is_nullable: false, column_comment: null },
          { table_name: 'orders', column_name: 'customer_id', data_type: 'integer', is_nullable: false, column_comment: null },
          { table_name: 'orders', column_name: 'status', data_type: 'text', is_nullable: true, column_comment: null },
          { table_name: 'recent_orders', column_name: 'id', data_type: 'integer', is_nullable: true, column_comment: null },
        ],
      },
    ],
    // Foreign-key constraints.
    [
      "tc.constraint_type = 'FOREIGN KEY'",
      {
        rows: [
          {
            table_name: 'orders',
            column_name: 'customer_id',
            foreign_table_schema: 'public',
            foreign_table_name: 'customers',
            foreign_column_name: 'id',
            constraint_name: 'orders_customer_id_fkey',
          },
        ],
      },
    ],
    // Primary-key constraints.
    [
      "tc.constraint_type = 'PRIMARY KEY'",
      {
        rows: [
          { table_name: 'customers', column_name: 'id' },
          { table_name: 'orders', column_name: 'id' },
        ],
      },
    ],
    // Table and column sampling.
    ['SELECT "id" FROM "public"."orders" LIMIT 1', { rows: [{ id: 10 }], fields: [{ name: 'id', dataTypeID: 23 }] }],
    [
      'SELECT "status" FROM "public"."orders" WHERE "status" IS NOT NULL',
      { rows: [{ status: 'paid' }, { status: 'open' }], fields: [{ name: 'status', dataTypeID: 25 }] },
    ],
    // Distinct values, row count, and planner statistics.
    ['COUNT(DISTINCT val) AS cardinality', { rows: [{ cardinality: '2' }] }],
    ['SELECT DISTINCT "status"::text AS val', { rows: [{ val: 'open' }, { val: 'paid' }] }],
    ['SELECT COUNT(*) AS count FROM "public"."orders"', { rows: [{ count: '3' }] }],
    ['FROM pg_stats s', { rows: [{ column_name: 'status', estimated_cardinality: '2' }] }],
    // Connectivity probe and schema listing.
    ['SELECT 1', { rows: [{ '?column?': 1 }], fields: [{ name: '?column?', dataTypeID: 23 }] }],
    ['SELECT schema_name FROM information_schema.schemata', { rows: [{ schema_name: 'public' }] }],
  ]);
}
|
|
|
|
|
|
2026-05-10 23:51:24 +02:00
|
|
|
describe('KtxPostgresScanConnector', () => {
|
2026-05-10 23:12:26 +02:00
|
|
|
// Covers driver detection plus three pool-config shapes: explicit fields with SSL and
// multiple schemas, an env-referenced URL with `sslmode=prefer`, and a minimal host config.
it('resolves configuration safely', () => {
  expect(isKtxPostgresConnectionConfig({ driver: 'postgres', url: 'env:DATABASE_URL' })).toBe(true);
  expect(isKtxPostgresConnectionConfig({ driver: 'postgresql', host: 'db', database: 'analytics' })).toBe(true);
  expect(isKtxPostgresConnectionConfig({ driver: 'mysql', host: 'db' })).toBe(false);

  // Explicit fields: `schemas` surfaces as a search_path option and the SSL flags as an ssl object.
  expect(
    postgresPoolConfigFromConfig({
      connectionId: 'warehouse',
      connection: {
        driver: 'postgres',
        host: 'db.example.test',
        database: 'analytics',
        username: 'reader',
        password: 'test-password', // pragma: allowlist secret
        schemas: ['analytics', 'public'],
        ssl: true,
        rejectUnauthorized: false,
      },
    }),
  ).toMatchObject({
    host: 'db.example.test',
    port: 5432,
    database: 'analytics',
    user: 'reader',
    password: 'test-password', // pragma: allowlist secret
    options: '-c search_path=analytics,public',
    ssl: { rejectUnauthorized: false },
  });

  // URL resolved from the injected env: expect discrete fields, with neither the raw
  // connection string nor an ssl setting carried into the pool config.
  const libpqPreferConfig = postgresPoolConfigFromConfig({
    connectionId: 'warehouse',
    connection: {
      driver: 'postgres',
      url: 'env:DEMO_DATABASE_URL',
    },
    env: {
      DEMO_DATABASE_URL: 'postgresql://reader@demo.example.test:5432/demo?sslmode=prefer',
    },
  });
  expect(libpqPreferConfig).toMatchObject({
    host: 'demo.example.test',
    port: 5432,
    database: 'demo',
    user: 'reader',
  });
  expect(libpqPreferConfig).not.toHaveProperty('connectionString');
  expect(libpqPreferConfig).not.toHaveProperty('ssl');

  // Minimal host-based config without password or schema settings.
  expect(
    postgresPoolConfigFromConfig({
      connectionId: 'warehouse',
      connection: { driver: 'postgres', host: 'db.example.test', database: 'analytics', username: 'reader' },
    }),
  ).toMatchObject({
    host: 'db.example.test',
    database: 'analytics',
    user: 'reader',
  });
});
|
|
|
|
|
|
|
|
|
|
// Runs a full introspection against the canned catalog fixture and checks the snapshot
// envelope, per-table summary, column typing, and foreign-key mapping.
it('introspects schemas, tables, views, primary keys, comments, row counts, and foreign keys', async () => {
  const connector = new KtxPostgresScanConnector({
    connectionId: 'warehouse',
    connection: {
      driver: 'postgres',
      host: 'db.example.test',
      database: 'analytics',
      username: 'reader',
      password: 'test-password', // pragma: allowlist secret
      schema: 'public',
    },
    poolFactory: fakePoolFactory(metadataResults()),
    // Fixed clock so `extractedAt` is deterministic.
    now: () => new Date('2026-04-29T10:00:00.000Z'),
  });

  const snapshot = await connector.introspect(
    { connectionId: 'warehouse', driver: 'postgres' },
    { runId: 'scan-run-1' },
  );

  expect(snapshot).toMatchObject({
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-04-29T10:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {
      database: 'analytics',
      schemas: ['public'],
      host: 'db.example.test',
      table_count: 3,
      total_columns: 6,
    },
  });
  // The view's fixture row_count is '0', yet its estimatedRows is expected to be null.
  expect(snapshot.tables.map((table) => [table.db, table.name, table.kind, table.estimatedRows])).toEqual([
    ['public', 'customers', 'table', 2],
    ['public', 'orders', 'table', 3],
    ['public', 'recent_orders', 'view', null],
  ]);
  expect(snapshot.tables.find((table) => table.name === 'customers')?.columns[0]).toMatchObject({
    name: 'id',
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: true,
  });
  expect(snapshot.tables.find((table) => table.name === 'orders')?.foreignKeys).toEqual([
    {
      fromColumn: 'customer_id',
      toCatalog: null,
      toDb: 'public',
      toTable: 'customers',
      toColumn: 'id',
      constraintName: 'orders_customer_id_fkey',
    },
  ]);
});
|
|
|
|
|
|
|
|
|
|
// Exercises the connector's data-access helpers against the canned fixture, then verifies
// that a non-SELECT statement is rejected by `executeReadOnly`.
it('runs samples, distinct values, statistics, read-only SQL, and schema listing', async () => {
  const connector = new KtxPostgresScanConnector({
    connectionId: 'warehouse',
    connection: {
      driver: 'postgres',
      host: 'db.example.test',
      database: 'analytics',
      username: 'reader',
      password: 'test-password', // pragma: allowlist secret
      schema: 'public',
    },
    poolFactory: fakePoolFactory(metadataResults()),
  });

  // Fixture field type id 23 is expected to surface as the 'integer' header type.
  await expect(
    connector.sampleTable(
      { connectionId: 'warehouse', table: { catalog: null, db: 'public', name: 'orders' }, columns: ['id'], limit: 1 },
      { runId: 'scan-run-1' },
    ),
  ).resolves.toEqual({ headers: ['id'], headerTypes: ['integer'], rows: [[10]], totalRows: 1 });

  await expect(
    connector.sampleColumn(
      { connectionId: 'warehouse', table: { catalog: null, db: 'public', name: 'orders' }, column: 'status', limit: 5 },
      { runId: 'scan-run-1' },
    ),
  ).resolves.toMatchObject({ values: ['paid', 'open'], nullCount: null, distinctCount: null });

  await expect(
    connector.getColumnDistinctValues(
      { catalog: null, db: 'public', name: 'orders' },
      'status',
      { maxCardinality: 5, limit: 10, sampleSize: 100 },
    ),
  ).resolves.toEqual({ values: ['open', 'paid'], cardinality: 2 });

  await expect(connector.getColumnStatistics({ catalog: null, db: 'public', name: 'orders' })).resolves.toEqual({
    cardinalityByColumn: new Map([['status', 2]]),
  });
  await expect(connector.getTableRowCount({ db: 'public', name: 'orders' })).resolves.toBe(3);
  await expect(connector.listSchemas()).resolves.toEqual(['public']);
  await expect(connector.testConnection()).resolves.toEqual({ success: true });

  // Write statements must be refused.
  await expect(
    connector.executeReadOnly({ connectionId: 'warehouse', sql: 'delete from orders' }, { runId: 'scan-run-1' }),
  ).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally');
});
|
|
|
|
|
|
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204)
* feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure
Snowflake setup previously asked for a single schema as free text, then
ran a multiselect against the discovered schemas — two schema questions
back-to-back, with the first being only a session bootstrap. The SDK's
`schema` is optional, so the bootstrap step is unnecessary.
- Remove the free-text Snowflake schema prompt; only pass `schema` to
snowflake-sdk when one is configured.
- When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the
user for a comma-separated list, persist it as `schema_names`, and use
it as both the table-list filter and the multiselect default. Applies
to every driver with a scope-discovery spec, not just Snowflake.
- Update docs to lead with `schema_names`; keep `schema_name` as a
documented single-schema shorthand.
* fix(snowflake): keep introspecting when primary-key discovery is denied
The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and
INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the
connection role may not have. Previously a 'SQL compilation error:
Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist
or not authorized' aborted the entire introspect — schemas, columns,
and row counts were all discarded over a missing nice-to-have.
Wrap the constraint query in try/catch, log a one-line warning per
schema, and return an empty PK map. Columns end up with
primaryKey=false; relationship inference still has FK and profiling
to fall back on.
* fix(scan): unblock relationship discovery on Snowflake
Two adjacent bugs prevented the scan's relationship pipeline from producing
any joins on a Snowflake warehouse:
- relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch
for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table
profile query failed with "Unknown function GROUP_CONCAT". Add an explicit
Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter
(Snowflake requires the delimiter to be a constant, so CHR(31) is rejected).
- description-generation.ts destructured `connector.sampleTable` and
`connector.sampleColumn` into bare locals, losing the `this` binding when
the class-method connectors (Snowflake, Postgres, MySQL) were invoked.
Every sample call threw "Cannot read properties of undefined (reading
'assertConnection')" and degraded LLM descriptions to metadata-only
prompts. Call the methods through the connector instead.
Without these, even after the primary-key probe is allowed to fail softly,
the scan ends up with 0 validated relationships and an empty `joins:` block
in every shard YAML.
* test(scan): cover table-ref helpers
* feat(scan): plumb tableScope through live-database introspection port
* feat(scan): apply tableScope during metadata fetch
* feat(scan): enforce table scope at fetch boundary
* feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206)
* feat(cli): add RSA key-pair auth option to Snowflake setup wizard
Extends the interactive Snowflake setup flow with an authentication-method
prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key
path (env/file/absolute) and an optional passphrase; the resulting connection
config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead
of `password`.
* feat(scan): pool Snowflake sessions
* fix(scan): reuse structural snapshots and cleanup connectors
* feat(scan): parallelize relationship profiling
* feat(scan): batch table description generation
* docs: document Snowflake ingest concurrency knobs
* fix(scan): close Snowflake ingest perf verification gaps
* fix(scan): keep batched description failure bounded
* feat(scan): dispatch query-history probes by connection driver
Extract historic-sql dialect resolution into a shared helper so the
status-project readiness check and the local ingest factory agree on
which connections enable query history and which probe to run. The
status command now picks the postgres/snowflake/bigquery probe based on
the connection's driver instead of always reporting against postgres,
which previously caused snowflake connections with queryHistory.enabled
to surface a misleading "driver is snowflake" failure.
Also drops a noisy console.warn from Snowflake primary-key discovery —
INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only
roles and the FK + profiling paths handle the empty PK map already.
* fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject
The Claude Code agent SDK announces an internal pseudo-tool named
StructuredOutput in the system/init message whenever outputFormat is set
to { type: 'json_schema' }. The runtime's isolation check built its
allowedToolIds set only from MCP tool ids and treated StructuredOutput
as an unexpected host-injected tool, so every generateObject call threw
"Claude Code runtime isolation failed: tools=StructuredOutput ..." and
the table-descriptions and relationship-LLM-proposal enrichment stages
recorded null output across the board.
Whitelist StructuredOutput specifically in generateObject's
allowedToolIds — the check also enforces missing_tools symmetry, so
generateText and runAgentLoop, which do not see StructuredOutput, must
not require it.
generateObject also ran with maxTurns: 1, which the model intermittently
breached when it emitted thinking text before the structured response.
Raised to 5 to give the schema-bound call enough headroom without
allowing unbounded loops. The existing tests now exercise the path with
an init message that announces StructuredOutput so the regression cannot
slip back in.
* chore(scripts): add ktx-reset.sh project-cleanup helper
Convenience script for repeatable ingest testing: takes a project
directory and prunes everything except ktx.yaml and .ktx/secrets/, so
the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
|
|
|
// Verifies that a `tableScope` is pushed down into the catalog query (as a `$2` array
// parameter) rather than only filtering results afterwards. Uses a recording pool so the
// issued SQL and params can be inspected.
it('limits introspection to tables in tableScope', async () => {
  const queries: Array<{ sql: string; params?: unknown[] }> = [];
  const poolFactory: KtxPostgresPoolFactory = {
    createPool() {
      return {
        async connect() {
          return {
            query: vi.fn(async (sql: string, params?: unknown[]) => {
              // Record every call before answering so assertions can inspect it later.
              queries.push({ sql, params });
              if (sql.includes('FROM pg_catalog.pg_class c')) {
                return { rows: [{ table_name: 'orders', table_kind: 'r', row_count: '3', table_comment: null }] };
              }
              if (sql.includes('FROM pg_catalog.pg_attribute a')) {
                return {
                  rows: [
                    {
                      table_name: 'orders',
                      column_name: 'id',
                      data_type: 'integer',
                      is_nullable: false,
                      column_comment: null,
                    },
                  ],
                };
              }
              // Any other query (e.g. constraint lookups) yields no rows.
              return { rows: [] };
            }),
            release: vi.fn(),
          };
        },
        end: vi.fn(async () => undefined),
      };
    },
  };
  const connector = new KtxPostgresScanConnector({
    connectionId: 'warehouse',
    connection: {
      driver: 'postgres',
      host: 'db.example.test',
      database: 'analytics',
      username: 'reader',
      password: 'test-password', // pragma: allowlist secret
      schema: 'public',
    },
    poolFactory,
  });
  const scope = tableRefSet([{ catalog: null, db: 'public', name: 'orders' }]);
  const snapshot = await connector.introspect(
    { connectionId: 'warehouse', driver: 'postgres', tableScope: scope },
    { runId: 'scope-test' },
  );
  expect(snapshot.tables.map((table) => table.name)).toEqual(['orders']);
  const tablesQuery = queries.find((query) => query.sql.includes('FROM pg_catalog.pg_class c'));
  expect(tablesQuery?.sql).toMatch(/c\.relname = ANY\(\$2\)/);
  expect(tablesQuery?.params).toEqual(['public', ['orders']]);
});
|
|
|
|
|
|
2026-05-10 23:12:26 +02:00
|
|
|
// Runs extractSchema() through the live-database introspection adapter backed
// by the shared fake pool and checks the resulting snapshot shape.
it('adapts native PostgreSQL snapshots to live-database introspection for local ingest', async () => {
  // Fixed clock value; the snapshot's extractedAt is expected to equal it.
  const frozenTimestamp = '2026-04-29T10:00:00.000Z';
  const poolFactory = fakePoolFactory(metadataResults());

  const introspection = createPostgresLiveDatabaseIntrospection({
    connections: {
      warehouse: {
        driver: 'postgres',
        host: 'db.example.test',
        database: 'analytics',
        username: 'reader',
        password: 'test-password', // pragma: allowlist secret
        schema: 'public',
      },
    },
    poolFactory,
    now: () => new Date(frozenTimestamp),
  });

  const snapshot = await introspection.extractSchema('warehouse');

  expect(snapshot).toMatchObject({
    connectionId: 'warehouse',
    extractedAt: frozenTimestamp,
  });

  // Column metadata expected on the "customers" table.
  const expectedColumns = [
    {
      name: 'id',
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
      primaryKey: true,
      comment: null,
    },
    {
      name: 'name',
      nativeType: 'text',
      normalizedType: 'text',
      dimensionType: 'string',
      nullable: false,
      primaryKey: false,
      comment: 'Name',
    },
  ];

  const customers = snapshot.tables.find(({ name }) => name === 'customers');
  expect(customers).toMatchObject({
    name: 'customers',
    catalog: null,
    db: 'public',
    columns: expectedColumns,
    foreignKeys: [],
  });
});
|
|
|
|
|
|
|
|
|
|
// Wraps the shared fake pool so that connect() throws once end() has been
// called; extractSchema() can therefore only succeed if the pool is ended
// after all connections have been obtained.
it('does not end the pool before introspection completes', async () => {
  let poolEnded = false;

  const endAwarePoolFactory: KtxPostgresPoolFactory = {
    createPool() {
      const delegateOptions = { max: 1, idleTimeoutMillis: 1, connectionTimeoutMillis: 1 };
      const delegate = fakePoolFactory(metadataResults()).createPool(delegateOptions);

      return {
        async connect() {
          if (poolEnded) {
            throw new Error('Cannot use a pool after calling end on the pool');
          }
          return delegate.connect();
        },
        async end() {
          // Record the shutdown before delegating so later connect() calls fail.
          poolEnded = true;
          return delegate.end();
        },
      };
    },
  };

  const introspection = createPostgresLiveDatabaseIntrospection({
    connections: {
      warehouse: {
        driver: 'postgres',
        host: 'db.example.test',
        database: 'analytics',
        username: 'reader',
        password: 'test-password', // pragma: allowlist secret
        schema: 'public',
      },
    },
    poolFactory: endAwarePoolFactory,
    now: () => new Date('2026-04-29T10:00:00.000Z'),
  });

  const { tables } = await introspection.extractSchema('warehouse');

  expect(tables.length).toBeGreaterThan(0);
  // The pool must still have been ended by the time extraction resolves.
  expect(poolEnded).toBe(true);
});
|
2026-05-12 16:56:58 -04:00
|
|
|
|
|
|
|
|
// Confirms that the connector registers an 'error' handler on the pool it
// creates, using a spy in place of the pool's `on` method.
it('attaches an error listener to the pg pool', async () => {
  const onSpy = vi.fn();

  const poolFactory: KtxPostgresPoolFactory = {
    createPool() {
      return {
        on: onSpy,
        async connect() {
          return {
            // Every statement resolves to the same single-row, single-field result.
            query: vi.fn(async () => {
              const cannedResult = {
                rows: [{ '?column?': 1 }],
                fields: [{ name: '?column?', dataTypeID: 23 }],
              };
              return cannedResult;
            }),
            release: vi.fn(),
          };
        },
        end: vi.fn(async () => undefined),
      };
    },
  };

  const connector = new KtxPostgresScanConnector({
    connectionId: 'warehouse',
    connection: {
      driver: 'postgres',
      host: 'db.example.test',
      database: 'analytics',
      username: 'reader',
      password: 'test-password', // pragma: allowlist secret
    },
    poolFactory,
  });

  const outcome = connector.testConnection();
  await expect(outcome).resolves.toEqual({ success: true });

  expect(onSpy).toHaveBeenCalledWith('error', expect.any(Function));
});
|
2026-05-10 23:12:26 +02:00
|
|
|
});
|