ktx/packages/cli/src/connectors/sqlite/connector.ts

361 lines
12 KiB
TypeScript
Raw Normal View History

2026-05-10 23:12:26 +02:00
import Database from 'better-sqlite3';
import { existsSync, readFileSync, statSync } from 'node:fs';
import { homedir } from 'node:os';
import { isAbsolute, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';
chore(workspace): gate dead-code with knip production mode (#196) * refactor(workspace): relocate @ktx/llm source into packages/cli/src/llm * refactor(workspace): rewrite @ktx/llm imports to relative paths * refactor(workspace): fold internal packages into cli * chore(workspace): gate dead-code with knip production mode Turn on production-mode knip plus an autofix run in pre-commit and the `pnpm dead-code` script, document the `/** @internal */` convention for test-only exports in AGENTS.md, annotate test-only exports across the CLI with that JSDoc, and drop dead exports/wrappers the new gate surfaced (e.g. `cli-project.ts`, `lookerRuntimeSourceToFileAdapterSource`, `createLocalScanEnrichmentProvidersFromConfig`, `PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES`, stale type re-exports). Replace the loose `ignoreIssues` allowlist in `knip.json` with explicit production entries so cross-package barrel leaks are caught. * refactor(cli): delete internal barrel index.ts files The 34 `index.ts` re-export barrels inside `packages/cli/src/` were holdovers from the pre-fold multi-workspace structure. Post-fold-in they served no production purpose: external consumers go through the single package main entry, and in-repo callers mostly imported through them only because the path was short. Internally, knip flagged most barrel re-exports as production-dead (only reached via tests). This change: - Deletes every internal barrel except `packages/cli/src/index.ts` (the published package entry). - Rewrites ~270 source/test files to import each name directly from the file that defines it. - Moves `tools/warehouse-verification/index.ts` to `create-warehouse-verification-tools.ts` (the function it defined locally) and updates its single consumer. - Renames `search/backend-conformance.ts` → `.test-utils.ts` to match the existing test-helper file convention. - Deletes 13 dead test-only chains (dbt-descriptions/*, live-database/extracted-schema, live-database/structural-sync, relationship-* feedback/review chain) plus their tests and a cascading orphan integration test. - Updates test mocks that pointed at deleted barrel paths (notion-client, connector barrels in scan/local-scan-connectors tests) to mock the source files instead. - Points the maintainer benchmark script (`scripts/relationship-benchmark-report.mjs`) at source files instead of `dist/context/scan/index.js`. - Drops the barrel `!` entries from `knip.json`; adds explicit production entries only for the benchmark code reached via dist by the maintainer script. Net: 413 files changed, ~1.2k insertions, ~9.4k deletions. `pnpm run dead-code` (Biome + knip default + knip production) and `pnpm run type-check` are clean; 2277 tests pass. * refactor(workspace): rename @ktx/cli to @kaelio/ktx and pack it directly Promote the CLI workspace package to the public name `@kaelio/ktx` and drop the separate `scripts/build-public-npm-package.mjs` wrapper. The CLI package is now publishable in place (`publishConfig.access: public`, `provenance: true`), so artifact packing uses `pnpm pack` against `packages/cli/` instead of assembling a parallel package tree. Updates all workspace filter invocations, docs, tests, and release readiness checks to reference the new package name, and folds the tarball-name helper into `scripts/public-npm-release-metadata.mjs`. * docs: align "agent clients" and "data agents" terminology Replace "client agents" with "agent clients" and "database agents" with "data agents" across AGENTS.md, README.md, the docs-site copy, and the matching setup-agents test description, matching the canonical vocabulary in docs/terminology.md. Also moves packages/cli/tsconfig.json's tsBuildInfoFile from node_modules/.cache/ to dist/.tsbuildinfo so incremental builds survive node_modules reinstalls. * refactor(release): single source of truth for package version Make packages/cli/package.json the single source of truth for the @kaelio/ktx version. publicNpmPackageVersion() now reads it directly, so artifact filenames, release-readiness checks, and the Python wheel version all derive from one field. The duplicate release-policy.json.publicNpmPackageVersion is removed. Previously the two fields could drift: tarballs were named kaelio-ktx-0.4.1.tgz while internally containing @kaelio/ktx@0.0.0-private. - update-public-release-version.mjs rewrites both Python pyproject.toml files (ktx-daemon, ktx-sl) alongside the npm package.jsons, normalizing the version for PEP 440 (e.g. 0.1.0-rc.2 -> 0.1.0rc2). - semantic-release-config.cjs adds the two pyproject.toml files to @semantic-release/git assets so the release commit back to main carries every version source in lockstep. - The six "?? '0.0.0-private'" fallback literals across the CLI are replaced with "?? getKtxCliPackageInfo().version", and createDefaultKtxMcpServer makes its version arg required. - docs/release.md describes the actual commit-back model: the dev tree always reflects the most recent release; no sentinel pin to maintain. Verified: pnpm run artifacts:build now produces kaelio-ktx-0.4.1.tgz and kaelio_ktx-0.4.1-py3-none-any.whl with @kaelio/ktx@0.4.1 inside. Full type-check, dead-code, and 2287 vitests + 173 script tests pass. * refactor(cli): inject embedding provider resolution and detect sentence-transformers runtime Make resolveProjectEmbeddingProvider and runtimeIo injectable in ingest and scan command entrypoints so tests can stub them, and teach resolvePublicIngestRuntimeRequirements to flag the local-embeddings runtime feature when ktx.yaml selects sentence-transformers. * chore(cli): mark buildLocalStatsStatus and LocalStatsStatus as @internal Both symbols are consumed only by status-project.test.ts. Annotating with /** @internal */ keeps knip's production-mode check clean without changing runtime behavior. * fix(cli): use real package metadata in print-command-tree The stubbed package name embedded a forbidden product identifier that tripped the boundary check in CI. Read the metadata from package.json instead — keeps the rendered tree unchanged and removes a duplicate source of truth. * feat(cli): show embedding coverage in `ktx status`, drop duplicate disk counts Inline `(N embedded)` next to the Wiki scope counts and Semantic-layer source counts, computed with `SUM(embedding_json IS NOT NULL)` over `knowledge_pages` and `local_sl_sources`. Rename the "Knowledge" label to "Wiki" (canonical per `docs/terminology.md`) and rename the matching `localStats.knowledgePages` field to `localStats.wikiPages`. Drop `wiki=N md` and `semantic-layer=N yaml` from the Disk row — those duplicated the per-surface rows above. Disk now reports only actual byte usage (db, cache, raw-sources). The unused `wikiGlobalMarkdownCount` / `semanticLayerYamlCount` fields, the `isMarkdownEntry` / `isYamlEntry` helpers, and the `filter` arg on `summarizeDir` are removed.
2026-05-21 15:28:58 +02:00
import { assertReadOnlySql, limitSqlForExecution } from '../../context/connections/read-only-sql.js';
import { normalizeQueryRows } from '../../context/connections/query-executor.js';
import { createKtxConnectorCapabilities, type KtxColumnSampleInput, type KtxColumnSampleResult, type KtxColumnStatsInput, type KtxColumnStatsResult, type KtxQueryResult, type KtxReadOnlyQueryInput, type KtxScanConnector, type KtxScanContext, type KtxScanInput, type KtxSchemaForeignKey, type KtxSchemaSnapshot, type KtxSchemaTable, type KtxTableRef, type KtxTableSampleInput, type KtxTableSampleResult } from '../../context/scan/types.js';
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204) * feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure Snowflake setup previously asked for a single schema as free text, then ran a multiselect against the discovered schemas — two schema questions back-to-back, with the first being only a session bootstrap. The SDK's `schema` is optional, so the bootstrap step is unnecessary. - Remove the free-text Snowflake schema prompt; only pass `schema` to snowflake-sdk when one is configured. - When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the user for a comma-separated list, persist it as `schema_names`, and use it as both the table-list filter and the multiselect default. Applies to every driver with a scope-discovery spec, not just Snowflake. - Update docs to lead with `schema_names`; keep `schema_name` as a documented single-schema shorthand. * fix(snowflake): keep introspecting when primary-key discovery is denied The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the connection role may not have. Previously a 'SQL compilation error: Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist or not authorized' aborted the entire introspect — schemas, columns, and row counts were all discarded over a missing nice-to-have. Wrap the constraint query in try/catch, log a one-line warning per schema, and return an empty PK map. Columns end up with primaryKey=false; relationship inference still has FK and profiling to fall back on. * fix(scan): unblock relationship discovery on Snowflake Two adjacent bugs prevented the scan's relationship pipeline from producing any joins on a Snowflake warehouse: - relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table profile query failed with "Unknown function GROUP_CONCAT". Add an explicit Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected). - description-generation.ts destructured `connector.sampleTable` and `connector.sampleColumn` into bare locals, losing the `this` binding when the class-method connectors (Snowflake, Postgres, MySQL) were invoked. Every sample call threw "Cannot read properties of undefined (reading 'assertConnection')" and degraded LLM descriptions to metadata-only prompts. Call the methods through the connector instead. Without these, even after the primary-key probe is allowed to fail softly, the scan ends up with 0 validated relationships and an empty `joins:` block in every shard YAML. * test(scan): cover table-ref helpers * feat(scan): plumb tableScope through live-database introspection port * feat(scan): apply tableScope during metadata fetch * feat(scan): enforce table scope at fetch boundary * feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206) * feat(cli): add RSA key-pair auth option to Snowflake setup wizard Extends the interactive Snowflake setup flow with an authentication-method prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key path (env/file/absolute) and an optional passphrase; the resulting connection config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead of `password`. * feat(scan): pool Snowflake sessions * fix(scan): reuse structural snapshots and cleanup connectors * feat(scan): parallelize relationship profiling * feat(scan): batch table description generation * docs: document Snowflake ingest concurrency knobs * fix(scan): close Snowflake ingest perf verification gaps * fix(scan): keep batched description failure bounded * feat(scan): dispatch query-history probes by connection driver Extract historic-sql dialect resolution into a shared helper so the status-project readiness check and the local ingest factory agree on which connections enable query history and which probe to run. The status command now picks the postgres/snowflake/bigquery probe based on the connection's driver instead of always reporting against postgres, which previously caused snowflake connections with queryHistory.enabled to surface a misleading "driver is snowflake" failure. Also drops a noisy console.warn from Snowflake primary-key discovery — INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only roles and the FK + profiling paths handle the empty PK map already. * fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject The Claude Code agent SDK announces an internal pseudo-tool named StructuredOutput in the system/init message whenever outputFormat is set to { type: 'json_schema' }. The runtime's isolation check built its allowedToolIds set only from MCP tool ids and treated StructuredOutput as an unexpected host-injected tool, so every generateObject call threw "Claude Code runtime isolation failed: tools=StructuredOutput ..." and the table-descriptions and relationship-LLM-proposal enrichment stages recorded null output across the board. Whitelist StructuredOutput specifically in generateObject's allowedToolIds — the check also enforces missing_tools symmetry, so generateText and runAgentLoop, which do not see StructuredOutput, must not require it. generateObject also ran with maxTurns: 1, which the model intermittently breached when it emitted thinking text before the structured response. Raised to 5 to give the schema-bound call enough headroom without allowing unbounded loops. The existing tests now exercise the path with an init message that announces StructuredOutput so the regression cannot slip back in. * chore(scripts): add ktx-reset.sh project-cleanup helper Convenience script for repeatable ingest testing: takes a project directory and prunes everything except ktx.yaml and .ktx/secrets/, so the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
import { scopedTableNames } from '../../context/scan/table-ref.js';
2026-05-10 23:51:24 +02:00
import { KtxSqliteDialect } from './dialect.js';
export interface KtxSqliteConnectionConfig {
2026-05-10 23:12:26 +02:00
driver?: string;
path?: string;
url?: string;
[key: string]: unknown;
}
chore(workspace): gate dead-code with knip production mode (#196) * refactor(workspace): relocate @ktx/llm source into packages/cli/src/llm * refactor(workspace): rewrite @ktx/llm imports to relative paths * refactor(workspace): fold internal packages into cli * chore(workspace): gate dead-code with knip production mode Turn on production-mode knip plus an autofix run in pre-commit and the `pnpm dead-code` script, document the `/** @internal */` convention for test-only exports in AGENTS.md, annotate test-only exports across the CLI with that JSDoc, and drop dead exports/wrappers the new gate surfaced (e.g. `cli-project.ts`, `lookerRuntimeSourceToFileAdapterSource`, `createLocalScanEnrichmentProvidersFromConfig`, `PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES`, stale type re-exports). Replace the loose `ignoreIssues` allowlist in `knip.json` with explicit production entries so cross-package barrel leaks are caught. * refactor(cli): delete internal barrel index.ts files The 34 `index.ts` re-export barrels inside `packages/cli/src/` were holdovers from the pre-fold multi-workspace structure. Post-fold-in they served no production purpose: external consumers go through the single package main entry, and in-repo callers mostly imported through them only because the path was short. Internally, knip flagged most barrel re-exports as production-dead (only reached via tests). This change: - Deletes every internal barrel except `packages/cli/src/index.ts` (the published package entry). - Rewrites ~270 source/test files to import each name directly from the file that defines it. - Moves `tools/warehouse-verification/index.ts` to `create-warehouse-verification-tools.ts` (the function it defined locally) and updates its single consumer. - Renames `search/backend-conformance.ts` → `.test-utils.ts` to match the existing test-helper file convention. - Deletes 13 dead test-only chains (dbt-descriptions/*, live-database/extracted-schema, live-database/structural-sync, relationship-* feedback/review chain) plus their tests and a cascading orphan integration test. - Updates test mocks that pointed at deleted barrel paths (notion-client, connector barrels in scan/local-scan-connectors tests) to mock the source files instead. - Points the maintainer benchmark script (`scripts/relationship-benchmark-report.mjs`) at source files instead of `dist/context/scan/index.js`. - Drops the barrel `!` entries from `knip.json`; adds explicit production entries only for the benchmark code reached via dist by the maintainer script. Net: 413 files changed, ~1.2k insertions, ~9.4k deletions. `pnpm run dead-code` (Biome + knip default + knip production) and `pnpm run type-check` are clean; 2277 tests pass. * refactor(workspace): rename @ktx/cli to @kaelio/ktx and pack it directly Promote the CLI workspace package to the public name `@kaelio/ktx` and drop the separate `scripts/build-public-npm-package.mjs` wrapper. The CLI package is now publishable in place (`publishConfig.access: public`, `provenance: true`), so artifact packing uses `pnpm pack` against `packages/cli/` instead of assembling a parallel package tree. Updates all workspace filter invocations, docs, tests, and release readiness checks to reference the new package name, and folds the tarball-name helper into `scripts/public-npm-release-metadata.mjs`. * docs: align "agent clients" and "data agents" terminology Replace "client agents" with "agent clients" and "database agents" with "data agents" across AGENTS.md, README.md, the docs-site copy, and the matching setup-agents test description, matching the canonical vocabulary in docs/terminology.md. Also moves packages/cli/tsconfig.json's tsBuildInfoFile from node_modules/.cache/ to dist/.tsbuildinfo so incremental builds survive node_modules reinstalls. * refactor(release): single source of truth for package version Make packages/cli/package.json the single source of truth for the @kaelio/ktx version. publicNpmPackageVersion() now reads it directly, so artifact filenames, release-readiness checks, and the Python wheel version all derive from one field. The duplicate release-policy.json.publicNpmPackageVersion is removed. Previously the two fields could drift: tarballs were named kaelio-ktx-0.4.1.tgz while internally containing @kaelio/ktx@0.0.0-private. - update-public-release-version.mjs rewrites both Python pyproject.toml files (ktx-daemon, ktx-sl) alongside the npm package.jsons, normalizing the version for PEP 440 (e.g. 0.1.0-rc.2 -> 0.1.0rc2). - semantic-release-config.cjs adds the two pyproject.toml files to @semantic-release/git assets so the release commit back to main carries every version source in lockstep. - The six "?? '0.0.0-private'" fallback literals across the CLI are replaced with "?? getKtxCliPackageInfo().version", and createDefaultKtxMcpServer makes its version arg required. - docs/release.md describes the actual commit-back model: the dev tree always reflects the most recent release; no sentinel pin to maintain. Verified: pnpm run artifacts:build now produces kaelio-ktx-0.4.1.tgz and kaelio_ktx-0.4.1-py3-none-any.whl with @kaelio/ktx@0.4.1 inside. Full type-check, dead-code, and 2287 vitests + 173 script tests pass. * refactor(cli): inject embedding provider resolution and detect sentence-transformers runtime Make resolveProjectEmbeddingProvider and runtimeIo injectable in ingest and scan command entrypoints so tests can stub them, and teach resolvePublicIngestRuntimeRequirements to flag the local-embeddings runtime feature when ktx.yaml selects sentence-transformers. * chore(cli): mark buildLocalStatsStatus and LocalStatsStatus as @internal Both symbols are consumed only by status-project.test.ts. Annotating with /** @internal */ keeps knip's production-mode check clean without changing runtime behavior. * fix(cli): use real package metadata in print-command-tree The stubbed package name embedded a forbidden product identifier that tripped the boundary check in CI. Read the metadata from package.json instead — keeps the rendered tree unchanged and removes a duplicate source of truth. * feat(cli): show embedding coverage in `ktx status`, drop duplicate disk counts Inline `(N embedded)` next to the Wiki scope counts and Semantic-layer source counts, computed with `SUM(embedding_json IS NOT NULL)` over `knowledge_pages` and `local_sl_sources`. Rename the "Knowledge" label to "Wiki" (canonical per `docs/terminology.md`) and rename the matching `localStats.knowledgePages` field to `localStats.wikiPages`. Drop `wiki=N md` and `semantic-layer=N yaml` from the Disk row — those duplicated the per-surface rows above. Disk now reports only actual byte usage (db, cache, raw-sources). The unused `wikiGlobalMarkdownCount` / `semanticLayerYamlCount` fields, the `isMarkdownEntry` / `isYamlEntry` helpers, and the `filter` arg on `summarizeDir` are removed.
2026-05-21 15:28:58 +02:00
/** @internal */
2026-05-10 23:12:26 +02:00
export interface SqliteDatabasePathInput {
connectionId: string;
projectDir?: string;
2026-05-10 23:51:24 +02:00
connection: KtxSqliteConnectionConfig | undefined;
2026-05-10 23:12:26 +02:00
}
2026-05-10 23:51:24 +02:00
export interface KtxSqliteScanConnectorOptions extends SqliteDatabasePathInput {
2026-05-10 23:12:26 +02:00
now?: () => Date;
}
2026-05-10 23:51:24 +02:00
export interface KtxSqliteReadOnlyQueryInput extends KtxReadOnlyQueryInput {
2026-05-10 23:12:26 +02:00
params?: Record<string, unknown> | unknown[];
}
2026-05-10 23:51:24 +02:00
export interface KtxSqliteColumnDistinctValuesOptions {
2026-05-10 23:12:26 +02:00
maxCardinality: number;
limit: number;
sampleSize?: number;
}
2026-05-10 23:51:24 +02:00
export interface KtxSqliteColumnDistinctValuesResult {
2026-05-10 23:12:26 +02:00
values: string[] | null;
cardinality: number;
}
interface SqliteMasterRow {
name: string;
type: 'table' | 'view';
}
interface SqliteTableInfoRow {
cid: number;
name: string;
type: string;
notnull: number;
dflt_value: unknown;
pk: number;
}
interface SqliteForeignKeyRow {
id: number;
seq: number;
table: string;
from: string;
to: string;
}
function stringConfigValue(
2026-05-10 23:51:24 +02:00
connection: KtxSqliteConnectionConfig | undefined,
key: keyof KtxSqliteConnectionConfig,
2026-05-10 23:12:26 +02:00
): string | undefined {
const value = connection?.[key];
return typeof value === 'string' && value.trim().length > 0 ? resolveStringReference(key, value.trim()) : undefined;
}
2026-05-10 23:51:24 +02:00
function resolveStringReference(key: keyof KtxSqliteConnectionConfig, value: string): string {
2026-05-10 23:12:26 +02:00
if (value.startsWith('env:')) {
return process.env[value.slice('env:'.length)] ?? '';
}
fix: surface silent failures and drop unused dead-code paths (#193) Address overengineering audit findings across cli/context/connector packages: - F1 Snowflake `query`: drop bare catch that flattened all errors to empty result - F2 memory-agent: treat LLM `stopReason === 'error'` as crash (skip squash-merge) - F3 WikiSearchTool: description honest about token-only fallback vs sqlite-fts5 hybrid - F5 Scan enrichment provider resolution: return discriminated status and surface distinct `llm_unavailable` / `embedding_unavailable` warnings per failure mode - F6 Relationship validation budget: drop dead `tableCount === undefined → 'all'` branch; update tests to pass `tableCount` like production - F8 `ktx sql`: use canonical `resolveOutputMode` (now honors KTX_OUTPUT/CI/TTY) - F9 MCP stdio server: default `protocolIo.stderr` to `process.stderr` so memory_ingest startup failures are visible - F13/F14 Scan/setup JSON readers: distinguish ENOENT from corruption instead of silently treating both as missing - F15 `createKtxCliScanConnector`: throw config-shape error when driver matches but type guard rejects, instead of "no native connector" - F16 ContextEvidenceSearchTool: surface `embedding_unhealthy:<reason>` instead of silently dropping the semantic lane - F17 PromptService: default partials to `[]` (removes stale `clinical_policy` reference from a prior product) - F20 `contextBuildCommands`: drop unused `runId` parameter Dead-code removal: - F4 Delete `AgentRunnerService` (duplicated `RuntimeAgentRunner`, only test-used); migrate tests to exercise `AiSdkKtxLlmRuntime.runAgentLoop` directly - F7 Delete `KtxScanOrchestrator` and its test (no production callers; the inline pipeline in `runLocalScan` is the single source of truth) - F18 Delete `generateKtxText`/`generateKtxObject` pass-through helpers; inline the single `runtime.generateObject` call at its caller Plus a clarifying comment on the SQLite `resolveStringReference` `file:` carve-out (load-bearing for SQLite URI form, not a bug).
2026-05-21 02:38:18 +02:00
// `file:` on the `url` key is SQLite's native URI form (e.g. `file:///db.sqlite`), not a
// file-contents reference — skip the read so the URI passes through verbatim.
2026-05-10 23:12:26 +02:00
if (key !== 'url' && value.startsWith('file:')) {
const rawPath = value.slice('file:'.length);
const path = rawPath.startsWith('~') ? resolve(homedir(), rawPath.slice(1)) : rawPath;
return readFileSync(path, 'utf-8').trim();
}
return value;
}
function sqlitePathFromUrl(url: string): string {
if (url.startsWith('file:')) {
return fileURLToPath(url);
}
if (url.startsWith('sqlite:')) {
const parsed = new URL(url);
return decodeURIComponent(parsed.pathname);
}
return url;
}
function stripLeadingSqlComments(sql: string): string {
let index = 0;
while (index < sql.length) {
while (/\s/.test(sql[index] ?? '')) {
index += 1;
}
if (sql.startsWith('--', index)) {
const end = sql.indexOf('\n', index + 2);
index = end === -1 ? sql.length : end + 1;
continue;
}
if (sql.startsWith('/*', index)) {
const end = sql.indexOf('*/', index + 2);
if (end === -1) {
return sql.slice(index);
}
index = end + 2;
continue;
}
break;
}
return sql.slice(index);
}
export function isKtxSqliteConnectionConfig(
connection: KtxSqliteConnectionConfig | undefined,
): connection is KtxSqliteConnectionConfig {
2026-05-10 23:12:26 +02:00
const driver = String(connection?.driver ?? '').toLowerCase();
return driver === 'sqlite' || driver === 'sqlite3';
}
chore(workspace): gate dead-code with knip production mode (#196) * refactor(workspace): relocate @ktx/llm source into packages/cli/src/llm * refactor(workspace): rewrite @ktx/llm imports to relative paths * refactor(workspace): fold internal packages into cli * chore(workspace): gate dead-code with knip production mode Turn on production-mode knip plus an autofix run in pre-commit and the `pnpm dead-code` script, document the `/** @internal */` convention for test-only exports in AGENTS.md, annotate test-only exports across the CLI with that JSDoc, and drop dead exports/wrappers the new gate surfaced (e.g. `cli-project.ts`, `lookerRuntimeSourceToFileAdapterSource`, `createLocalScanEnrichmentProvidersFromConfig`, `PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES`, stale type re-exports). Replace the loose `ignoreIssues` allowlist in `knip.json` with explicit production entries so cross-package barrel leaks are caught. * refactor(cli): delete internal barrel index.ts files The 34 `index.ts` re-export barrels inside `packages/cli/src/` were holdovers from the pre-fold multi-workspace structure. Post-fold-in they served no production purpose: external consumers go through the single package main entry, and in-repo callers mostly imported through them only because the path was short. Internally, knip flagged most barrel re-exports as production-dead (only reached via tests). This change: - Deletes every internal barrel except `packages/cli/src/index.ts` (the published package entry). - Rewrites ~270 source/test files to import each name directly from the file that defines it. - Moves `tools/warehouse-verification/index.ts` to `create-warehouse-verification-tools.ts` (the function it defined locally) and updates its single consumer. - Renames `search/backend-conformance.ts` → `.test-utils.ts` to match the existing test-helper file convention. - Deletes 13 dead test-only chains (dbt-descriptions/*, live-database/extracted-schema, live-database/structural-sync, relationship-* feedback/review chain) plus their tests and a cascading orphan integration test. - Updates test mocks that pointed at deleted barrel paths (notion-client, connector barrels in scan/local-scan-connectors tests) to mock the source files instead. - Points the maintainer benchmark script (`scripts/relationship-benchmark-report.mjs`) at source files instead of `dist/context/scan/index.js`. - Drops the barrel `!` entries from `knip.json`; adds explicit production entries only for the benchmark code reached via dist by the maintainer script. Net: 413 files changed, ~1.2k insertions, ~9.4k deletions. `pnpm run dead-code` (Biome + knip default + knip production) and `pnpm run type-check` are clean; 2277 tests pass. * refactor(workspace): rename @ktx/cli to @kaelio/ktx and pack it directly Promote the CLI workspace package to the public name `@kaelio/ktx` and drop the separate `scripts/build-public-npm-package.mjs` wrapper. The CLI package is now publishable in place (`publishConfig.access: public`, `provenance: true`), so artifact packing uses `pnpm pack` against `packages/cli/` instead of assembling a parallel package tree. Updates all workspace filter invocations, docs, tests, and release readiness checks to reference the new package name, and folds the tarball-name helper into `scripts/public-npm-release-metadata.mjs`. * docs: align "agent clients" and "data agents" terminology Replace "client agents" with "agent clients" and "database agents" with "data agents" across AGENTS.md, README.md, the docs-site copy, and the matching setup-agents test description, matching the canonical vocabulary in docs/terminology.md. Also moves packages/cli/tsconfig.json's tsBuildInfoFile from node_modules/.cache/ to dist/.tsbuildinfo so incremental builds survive node_modules reinstalls. * refactor(release): single source of truth for package version Make packages/cli/package.json the single source of truth for the @kaelio/ktx version. publicNpmPackageVersion() now reads it directly, so artifact filenames, release-readiness checks, and the Python wheel version all derive from one field. The duplicate release-policy.json.publicNpmPackageVersion is removed. Previously the two fields could drift: tarballs were named kaelio-ktx-0.4.1.tgz while internally containing @kaelio/ktx@0.0.0-private. - update-public-release-version.mjs rewrites both Python pyproject.toml files (ktx-daemon, ktx-sl) alongside the npm package.jsons, normalizing the version for PEP 440 (e.g. 0.1.0-rc.2 -> 0.1.0rc2). - semantic-release-config.cjs adds the two pyproject.toml files to @semantic-release/git assets so the release commit back to main carries every version source in lockstep. - The six "?? '0.0.0-private'" fallback literals across the CLI are replaced with "?? getKtxCliPackageInfo().version", and createDefaultKtxMcpServer makes its version arg required. - docs/release.md describes the actual commit-back model: the dev tree always reflects the most recent release; no sentinel pin to maintain. Verified: pnpm run artifacts:build now produces kaelio-ktx-0.4.1.tgz and kaelio_ktx-0.4.1-py3-none-any.whl with @kaelio/ktx@0.4.1 inside. Full type-check, dead-code, and 2287 vitests + 173 script tests pass. * refactor(cli): inject embedding provider resolution and detect sentence-transformers runtime Make resolveProjectEmbeddingProvider and runtimeIo injectable in ingest and scan command entrypoints so tests can stub them, and teach resolvePublicIngestRuntimeRequirements to flag the local-embeddings runtime feature when ktx.yaml selects sentence-transformers. * chore(cli): mark buildLocalStatsStatus and LocalStatsStatus as @internal Both symbols are consumed only by status-project.test.ts. Annotating with /** @internal */ keeps knip's production-mode check clean without changing runtime behavior. * fix(cli): use real package metadata in print-command-tree The stubbed package name embedded a forbidden product identifier that tripped the boundary check in CI. Read the metadata from package.json instead — keeps the rendered tree unchanged and removes a duplicate source of truth. * feat(cli): show embedding coverage in `ktx status`, drop duplicate disk counts Inline `(N embedded)` next to the Wiki scope counts and Semantic-layer source counts, computed with `SUM(embedding_json IS NOT NULL)` over `knowledge_pages` and `local_sl_sources`. Rename the "Knowledge" label to "Wiki" (canonical per `docs/terminology.md`) and rename the matching `localStats.knowledgePages` field to `localStats.wikiPages`. Drop `wiki=N md` and `semantic-layer=N yaml` from the Disk row — those duplicated the per-surface rows above. Disk now reports only actual byte usage (db, cache, raw-sources). The unused `wikiGlobalMarkdownCount` / `semanticLayerYamlCount` fields, the `isMarkdownEntry` / `isYamlEntry` helpers, and the `filter` arg on `summarizeDir` are removed.
2026-05-21 15:28:58 +02:00
/** @internal */
2026-05-10 23:12:26 +02:00
export function sqliteDatabasePathFromConfig(input: SqliteDatabasePathInput): string {
const inputDriver = input.connection?.driver ?? 'unknown';
2026-05-10 23:51:24 +02:00
if (!isKtxSqliteConnectionConfig(input.connection)) {
throw new Error(`Native SQLite connector cannot run driver "${inputDriver}"`);
2026-05-10 23:12:26 +02:00
}
const configuredPath = stringConfigValue(input.connection, 'path') ?? sqlitePathFromUrl(stringConfigValue(input.connection, 'url') ?? '');
2026-05-10 23:12:26 +02:00
if (!configuredPath) {
throw new Error(`Native SQLite connector requires connections.${input.connectionId}.path or url`);
2026-05-10 23:12:26 +02:00
}
return isAbsolute(configuredPath) ? configuredPath : resolve(input.projectDir ?? process.cwd(), configuredPath);
}
2026-05-10 23:51:24 +02:00
export class KtxSqliteScanConnector implements KtxScanConnector {
2026-05-10 23:12:26 +02:00
readonly id: string;
readonly driver = 'sqlite' as const;
2026-05-10 23:51:24 +02:00
readonly capabilities = createKtxConnectorCapabilities({
2026-05-10 23:12:26 +02:00
tableSampling: true,
columnSampling: true,
columnStats: false,
readOnlySql: true,
nestedAnalysis: false,
formalForeignKeys: true,
estimatedRowCounts: true,
});
private readonly connectionId: string;
private readonly dbPath: string;
private readonly now: () => Date;
2026-05-10 23:51:24 +02:00
private readonly dialect = new KtxSqliteDialect();
2026-05-10 23:12:26 +02:00
private db: Database.Database | null = null;
2026-05-10 23:51:24 +02:00
constructor(options: KtxSqliteScanConnectorOptions) {
2026-05-10 23:12:26 +02:00
this.connectionId = options.connectionId;
this.dbPath = sqliteDatabasePathFromConfig(options);
this.now = options.now ?? (() => new Date());
this.id = `sqlite:${options.connectionId}`;
}
async testConnection(): Promise<{ success: boolean; error?: string }> {
try {
if (!existsSync(this.dbPath) || !statSync(this.dbPath).isFile()) {
return { success: false, error: `File not found: ${this.dbPath}` };
}
this.database().prepare('SELECT 1').get();
return { success: true };
} catch (error) {
return { success: false, error: error instanceof Error ? error.message : String(error) };
}
}
2026-05-10 23:51:24 +02:00
async introspect(input: KtxScanInput, _ctx: KtxScanContext): Promise<KtxSchemaSnapshot> {
2026-05-10 23:12:26 +02:00
this.assertConnection(input.connectionId);
const database = this.database();
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204) * feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure Snowflake setup previously asked for a single schema as free text, then ran a multiselect against the discovered schemas — two schema questions back-to-back, with the first being only a session bootstrap. The SDK's `schema` is optional, so the bootstrap step is unnecessary. - Remove the free-text Snowflake schema prompt; only pass `schema` to snowflake-sdk when one is configured. - When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the user for a comma-separated list, persist it as `schema_names`, and use it as both the table-list filter and the multiselect default. Applies to every driver with a scope-discovery spec, not just Snowflake. - Update docs to lead with `schema_names`; keep `schema_name` as a documented single-schema shorthand. * fix(snowflake): keep introspecting when primary-key discovery is denied The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the connection role may not have. Previously a 'SQL compilation error: Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist or not authorized' aborted the entire introspect — schemas, columns, and row counts were all discarded over a missing nice-to-have. Wrap the constraint query in try/catch, log a one-line warning per schema, and return an empty PK map. Columns end up with primaryKey=false; relationship inference still has FK and profiling to fall back on. * fix(scan): unblock relationship discovery on Snowflake Two adjacent bugs prevented the scan's relationship pipeline from producing any joins on a Snowflake warehouse: - relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table profile query failed with "Unknown function GROUP_CONCAT". Add an explicit Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected). - description-generation.ts destructured `connector.sampleTable` and `connector.sampleColumn` into bare locals, losing the `this` binding when the class-method connectors (Snowflake, Postgres, MySQL) were invoked. Every sample call threw "Cannot read properties of undefined (reading 'assertConnection')" and degraded LLM descriptions to metadata-only prompts. Call the methods through the connector instead. Without these, even after the primary-key probe is allowed to fail softly, the scan ends up with 0 validated relationships and an empty `joins:` block in every shard YAML. * test(scan): cover table-ref helpers * feat(scan): plumb tableScope through live-database introspection port * feat(scan): apply tableScope during metadata fetch * feat(scan): enforce table scope at fetch boundary * feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206) * feat(cli): add RSA key-pair auth option to Snowflake setup wizard Extends the interactive Snowflake setup flow with an authentication-method prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key path (env/file/absolute) and an optional passphrase; the resulting connection config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead of `password`. * feat(scan): pool Snowflake sessions * fix(scan): reuse structural snapshots and cleanup connectors * feat(scan): parallelize relationship profiling * feat(scan): batch table description generation * docs: document Snowflake ingest concurrency knobs * fix(scan): close Snowflake ingest perf verification gaps * fix(scan): keep batched description failure bounded * feat(scan): dispatch query-history probes by connection driver Extract historic-sql dialect resolution into a shared helper so the status-project readiness check and the local ingest factory agree on which connections enable query history and which probe to run. The status command now picks the postgres/snowflake/bigquery probe based on the connection's driver instead of always reporting against postgres, which previously caused snowflake connections with queryHistory.enabled to surface a misleading "driver is snowflake" failure. Also drops a noisy console.warn from Snowflake primary-key discovery — INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only roles and the FK + profiling paths handle the empty PK map already. * fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject The Claude Code agent SDK announces an internal pseudo-tool named StructuredOutput in the system/init message whenever outputFormat is set to { type: 'json_schema' }. The runtime's isolation check built its allowedToolIds set only from MCP tool ids and treated StructuredOutput as an unexpected host-injected tool, so every generateObject call threw "Claude Code runtime isolation failed: tools=StructuredOutput ..." and the table-descriptions and relationship-LLM-proposal enrichment stages recorded null output across the board. Whitelist StructuredOutput specifically in generateObject's allowedToolIds — the check also enforces missing_tools symmetry, so generateText and runAgentLoop, which do not see StructuredOutput, must not require it. generateObject also ran with maxTurns: 1, which the model intermittently breached when it emitted thinking text before the structured response. Raised to 5 to give the schema-bound call enough headroom without allowing unbounded loops. The existing tests now exercise the path with an init message that announces StructuredOutput so the regression cannot slip back in. * chore(scripts): add ktx-reset.sh project-cleanup helper Convenience script for repeatable ingest testing: takes a project directory and prunes everything except ktx.yaml and .ktx/secrets/, so the next ktx setup or ktx ingest run starts from a known-clean state.
2026-05-23 10:41:30 +02:00
const scopedNames = input.tableScope ? scopedTableNames(input.tableScope, { catalog: null, db: null }) : null;
const scopeClause = scopedNames ? `AND name IN (${scopedNames.map(() => '?').join(', ')})` : '';
const rawTables =
scopedNames && scopedNames.length === 0
? []
: (database
.prepare(
`SELECT name, type FROM sqlite_master WHERE type IN ('table', 'view') AND name NOT LIKE 'sqlite_%' ${scopeClause} ORDER BY name`,
)
.all(...(scopedNames ?? [])) as SqliteMasterRow[]);
2026-05-10 23:12:26 +02:00
const tables = rawTables.map((table) => this.readTable(database, table));
const fileStats = existsSync(this.dbPath) ? statSync(this.dbPath) : null;
return {
connectionId: this.connectionId,
driver: 'sqlite',
extractedAt: this.now().toISOString(),
scope: {},
metadata: {
file_path: this.dbPath,
file_size: fileStats ? fileStats.size : 0,
table_count: tables.length,
total_columns: tables.reduce((sum, table) => sum + table.columns.length, 0),
},
tables,
};
}
2026-05-10 23:51:24 +02:00
async sampleTable(input: KtxTableSampleInput, _ctx: KtxScanContext): Promise<KtxTableSampleResult> {
2026-05-10 23:12:26 +02:00
this.assertConnection(input.connectionId);
const result = this.query(this.dialect.generateSampleQuery(this.qTableName(input.table), input.limit, input.columns));
return { headers: result.headers, rows: result.rows, totalRows: result.totalRows };
}
2026-05-10 23:51:24 +02:00
async sampleColumn(input: KtxColumnSampleInput, _ctx: KtxScanContext): Promise<KtxColumnSampleResult> {
2026-05-10 23:12:26 +02:00
this.assertConnection(input.connectionId);
const result = this.query(
this.dialect.generateColumnSampleQuery(this.qTableName(input.table), input.column, input.limit),
);
const values = result.rows.filter((row) => row.length > 0 && row[0] !== null).map((row) => row[0]);
return { values, nullCount: null, distinctCount: null };
}
2026-05-10 23:51:24 +02:00
async columnStats(_input: KtxColumnStatsInput, _ctx: KtxScanContext): Promise<KtxColumnStatsResult | null> {
2026-05-10 23:12:26 +02:00
return null;
}
2026-05-10 23:51:24 +02:00
async executeReadOnly(input: KtxSqliteReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
2026-05-10 23:12:26 +02:00
this.assertConnection(input.connectionId);
const result = this.query(limitSqlForExecution(stripLeadingSqlComments(input.sql), input.maxRows), input.params);
return { ...result, rowCount: result.rows.length };
}
async getColumnDistinctValues(
2026-05-10 23:51:24 +02:00
table: KtxTableRef,
2026-05-10 23:12:26 +02:00
columnName: string,
2026-05-10 23:51:24 +02:00
options: KtxSqliteColumnDistinctValuesOptions,
): Promise<KtxSqliteColumnDistinctValuesResult | null> {
2026-05-10 23:12:26 +02:00
const sampleSize = options.sampleSize ?? 10000;
const tableName = this.qTableName(table);
const quotedColumn = this.dialect.quoteIdentifier(columnName);
const cardinalityResult = this.query(
this.dialect.generateCardinalitySampleQuery(tableName, quotedColumn, sampleSize),
);
if (cardinalityResult.rows.length === 0) {
return null;
}
const cardinality = Number(cardinalityResult.rows[0][0]);
if (Number.isNaN(cardinality)) {
return null;
}
if (cardinality === 0) {
return { values: [], cardinality: 0 };
}
if (cardinality > options.maxCardinality) {
return { values: null, cardinality };
}
const valuesResult = this.query(this.dialect.generateDistinctValuesQuery(tableName, quotedColumn, options.limit));
return {
values: valuesResult.rows.filter((row) => row.length > 0 && row[0] !== null).map((row) => String(row[0])),
cardinality,
};
}
async getTableRowCount(tableName: string): Promise<number> {
const result = this.query(`SELECT COUNT(*) AS count FROM ${this.dialect.quoteIdentifier(tableName)}`);
return Number(result.rows[0]?.[0] ?? 0);
}
2026-05-10 23:51:24 +02:00
qTableName(table: Pick<KtxTableRef, 'name'>): string {
2026-05-10 23:12:26 +02:00
return this.dialect.formatTableName(table);
}
quoteIdentifier(identifier: string): string {
return this.dialect.quoteIdentifier(identifier);
}
async cleanup(): Promise<void> {
if (this.db) {
this.db.close();
this.db = null;
}
}
private database(): Database.Database {
if (!this.db) {
this.db = new Database(this.dbPath, { readonly: true, fileMustExist: true });
}
return this.db;
}
2026-05-10 23:51:24 +02:00
private query(sql: string, params?: Record<string, unknown> | unknown[]): Omit<KtxQueryResult, 'rowCount'> {
2026-05-10 23:12:26 +02:00
const statement = this.database().prepare(assertReadOnlySql(sql));
const rows = (params ? statement.all(params) : statement.all()) as unknown[];
return {
headers: statement.columns().map((column) => column.name),
rows: normalizeQueryRows(rows),
totalRows: rows.length,
};
}
2026-05-10 23:51:24 +02:00
private readTable(database: Database.Database, table: SqliteMasterRow): KtxSchemaTable {
2026-05-10 23:12:26 +02:00
const columns = database
.prepare(`PRAGMA table_info(${this.dialect.quoteIdentifier(table.name)})`)
.all() as SqliteTableInfoRow[];
const foreignKeys = database
.prepare(`PRAGMA foreign_key_list(${this.dialect.quoteIdentifier(table.name)})`)
.all() as SqliteForeignKeyRow[];
const estimatedRows =
table.type === 'table'
? Number(
(
database
.prepare(`SELECT COUNT(*) AS count FROM ${this.dialect.quoteIdentifier(table.name)}`)
.get() as { count: unknown }
).count,
)
: null;
return {
catalog: null,
db: null,
name: table.name,
kind: table.type,
comment: null,
estimatedRows,
columns: columns.map((column) => ({
name: column.name,
nativeType: column.type,
normalizedType: this.dialect.mapDataType(column.type),
dimensionType: this.dialect.mapToDimensionType(column.type),
nullable: column.notnull === 0 && column.pk === 0,
primaryKey: column.pk > 0,
comment: null,
})),
foreignKeys: this.mapForeignKeys(foreignKeys),
};
}
2026-05-10 23:51:24 +02:00
private mapForeignKeys(rows: SqliteForeignKeyRow[]): KtxSchemaForeignKey[] {
2026-05-10 23:12:26 +02:00
return rows
.sort((a, b) => a.id - b.id || a.seq - b.seq)
.map((row) => ({
fromColumn: row.from,
toCatalog: null,
toDb: null,
toTable: row.table,
toColumn: row.to,
constraintName: null,
}));
}
private assertConnection(connectionId: string): void {
if (connectionId !== this.connectionId) {
2026-05-10 23:51:24 +02:00
throw new Error(`KTX SQLite connector ${this.id} cannot serve connection ${connectionId}`);
2026-05-10 23:12:26 +02:00
}
}
}