feat(connections): add execute-only warehouses; stop silent full-project scans

A configured warehouse was always a scan/ingest target. The only way to use a
connection purely for SQL execution (ktx sql / sql_execution) was the leaky
workaround of an empty setup.database_connection_ids — which actually re-includes
every warehouse via the 'fall back to all' branch — so e.g. a BigQuery connection
meant only for read-only queries triggered a full-billing-project scan.

- Add a per-connection scan_enabled flag (default true) to warehouse connections.
  scan_enabled: false registers the connection for execution only and never as a
  scan target.
- Apply the execute-only exclusion on every scan-target selection path. Ingest
  (primaryWarehouseConnectionIds, including the all-warehouses fallback) routes
  through a single predicate (isScanTargetWarehouse); setup
  (configuredPrimaryConnectionIds) applies the same isExecuteOnlyConnection check
  on top of its existing driver filter, so both now exclude execute-only
  connections. Setup validates the credential but skips scope discovery and scan
  for them. Execution paths are untouched — the warehouse descriptor still
  resolves, so ktx sql / sql_execution keep working.
- Scripted setup with no --database-schema no longer silently scopes the scan to
  every discovered schema/dataset: it warns with the count and names how to narrow
  (--database-schema) or opt out (scan_enabled: false).
This commit is contained in:
Andrey Avtomonov 2026-06-09 13:05:15 +02:00
parent a02fcab487
commit ece0dfb2c8
10 changed files with 187 additions and 3 deletions

View file

@ -72,6 +72,24 @@ export function localConnectionToWarehouseDescriptor(
return info;
}
/**
 * Reports whether a connection has opted out of scanning by setting `scan_enabled: false`.
 *
 * Only an explicit `false` counts: a missing connection, a missing flag, or `scan_enabled: true`
 * all yield `false`. Execution paths (`ktx sql`, `sql_execution`) are unaffected — they resolve
 * the warehouse via {@link localConnectionToWarehouseDescriptor}.
 *
 * @param connection - The connection config to inspect; may be `undefined` when the id is unknown.
 * @returns `true` only when the connection exists and its `scan_enabled` flag is exactly `false`.
 */
export function isExecuteOnlyConnection(connection: KtxProjectConnectionConfig | undefined): boolean {
  // No connection means no flag to read, so it cannot be execute-only.
  if (connection == null) {
    return false;
  }
  const { scan_enabled: scanEnabled } = connection as { scan_enabled?: boolean };
  return scanEnabled === false;
}
/**
 * Decides whether a connection may be selected as a scan/ingest target.
 *
 * A connection qualifies when {@link localConnectionToWarehouseDescriptor} resolves a descriptor
 * for it (the result is not `null`) and it is not marked execute-only. Selection paths that call
 * this predicate therefore exclude execute-only connections — including the "fall back to all
 * warehouses" path.
 *
 * @param id - The connection id, forwarded to the descriptor lookup.
 * @param connection - The connection config for `id`, or `undefined` when none is configured.
 * @returns `true` when the connection is a warehouse and is eligible for scanning.
 */
export function isScanTargetWarehouse(id: string, connection: KtxProjectConnectionConfig | undefined): boolean {
  const descriptor = localConnectionToWarehouseDescriptor(id, connection);
  if (descriptor === null) {
    return false;
  }
  return !isExecuteOnlyConnection(connection);
}
export function localConnectionTypeForConfig(id: string, connection: KtxProjectConnectionConfig | undefined): string {
const descriptor = localConnectionToWarehouseDescriptor(id, connection);
if (descriptor) {

View file

@ -1,5 +1,5 @@
import { join } from 'node:path';
import { localConnectionToWarehouseDescriptor } from '../../context/connections/local-warehouse-descriptor.js';
import { isScanTargetWarehouse, localConnectionToWarehouseDescriptor } from '../../context/connections/local-warehouse-descriptor.js';
import { notionConnectionToPullConfig, parseNotionConnectionConfig } from '../../context/connections/notion-config.js';
import { resolveKtxConfigReference } from '../core/config-reference.js';
import { ktxLocalStateDbPath } from '../../context/project/local-state-db.js';
@ -147,14 +147,14 @@ export function createDefaultLocalIngestAdapters(
/**
 * Picks the connection ids treated as primary warehouses for this project.
 *
 * Ids listed in `setup.database_connection_ids` take precedence: those that pass the warehouse
 * filter are returned de-duplicated, in their configured order. When none of them pass (or the
 * list is absent/empty), every entry in `project.config.connections` that passes the filter is
 * returned instead, sorted by id with `localeCompare`.
 */
function primaryWarehouseConnectionIds(project: KtxLocalProject): string[] {
const configuredPrimaryIds = project.config.setup?.database_connection_ids ?? [];
const configured = configuredPrimaryIds.filter((connectionId) =>
Boolean(localConnectionToWarehouseDescriptor(connectionId, project.config.connections[connectionId])),
isScanTargetWarehouse(connectionId, project.config.connections[connectionId]),
);
if (configured.length > 0) {
// A Set drops repeated ids while keeping first-seen order.
return [...new Set(configured)];
}
// Fallback: consider every configured connection rather than only the setup list.
return Object.entries(project.config.connections)
.filter(([connectionId, connection]) => Boolean(localConnectionToWarehouseDescriptor(connectionId, connection)))
.filter(([connectionId, connection]) => isScanTargetWarehouse(connectionId, connection))
.map(([connectionId]) => connectionId)
.sort((left, right) => left.localeCompare(right));
}

View file

@ -32,6 +32,12 @@ function warehouseConnectionSchema<const Driver extends WarehouseDriver>(driver:
.describe(
'Optional allowlist of fully-qualified table names ("schema.table") to ingest. When set, live-database ingest discards any table whose schema-qualified name is not in this list. Useful for smoke-testing ingest on a single table.',
),
scan_enabled: z
.boolean()
.optional()
.describe(
'When false, this connection is registered for SQL execution only (ktx sql / sql_execution) and is never used as a scan/ingest target. Omit (or true) to scan and ingest it as a primary warehouse.',
),
})
.describe(
`${driver} warehouse connection. Additional driver-tunable fields (e.g. context.queryHistory) are accepted and passed through.`,

View file

@ -4,6 +4,7 @@ import { delimiter, dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { promisify } from 'node:util';
import { getDriverRegistration } from './context/connections/drivers.js';
import { isExecuteOnlyConnection } from './context/connections/local-warehouse-descriptor.js';
import { createLocalKtxLlmRuntimeFromConfig } from './context/llm/local-config.js';
import type { KtxLlmRuntimePort } from './context/llm/runtime-port.js';
import { queryHistoryDialectForConnection } from './context/ingest/adapters/historic-sql/connection-dialect.js';
@ -459,12 +460,14 @@ function configuredPrimaryConnectionIds(
const configuredIds =
setupConnectionIds
?.filter((connectionId) => normalizeDriver(connections[connectionId]?.driver) !== null)
.filter((connectionId) => !isExecuteOnlyConnection(connections[connectionId]))
.filter((connectionId, index, ids) => ids.indexOf(connectionId) === index) ?? [];
if (configuredIds.length > 0) {
return configuredIds;
}
return Object.entries(connections)
.filter(([, connection]) => normalizeDriver(connection.driver) !== null)
.filter(([, connection]) => !isExecuteOnlyConnection(connection))
.map(([connectionId]) => connectionId)
.sort((left, right) => left.localeCompare(right));
}
@ -1384,11 +1387,13 @@ async function maybeConfigureDatabaseScope(input: {
if (input.args.inputMode === 'disabled') {
if (spec) {
let scopeToWrite: string[] = cliSchemas;
let scopedFromDiscovery = false;
if (scopeToWrite.length === 0) {
try {
scopeToWrite = unique(
await (input.deps.listSchemas ?? defaultListSchemas)(input.projectDir, input.connectionId),
);
scopedFromDiscovery = true;
} catch (error) {
const detail = error instanceof Error ? error.message : String(error);
input.io.stderr.write(
@ -1397,6 +1402,18 @@ async function maybeConfigureDatabaseScope(input: {
return okValidateResult();
}
}
// Scripted setup with no explicit --database-schema falls back to every schema/dataset the
// credential can discover — including unrelated ones on a shared billing account. When that
// fallback yields more than one entry, warn so the operator can narrow the scope or register
// the connection as execute-only, rather than finding out later that a full-warehouse scan ran.
if (scopedFromDiscovery && scopeToWrite.length > 1) {
input.io.stderr.write(
`No --database-schema given for ${input.connectionId}; scanning all ${scopeToWrite.length} ` +
`discovered ${spec.nounPlural} (${scopeToWrite.join(', ')}). Pass --database-schema to narrow ` +
'the scan, or set connections.' +
`${input.connectionId}.scan_enabled: false to register it for SQL execution only.\n`,
);
}
if (scopeToWrite.length > 0) {
await writeScopeConfig({
projectDir: input.projectDir,
@ -1894,6 +1911,15 @@ async function validateAndScanConnection(input: {
const testLines = ['✓ Connection test passed', `Driver: ${driverDisplay}`];
writeSetupSection(input.io, `Testing ${input.connectionId}`, testLines);
// Execute-only connections (scan_enabled: false) are registered for SQL execution only:
// the credential is validated above, but ktx never introspects/scans the warehouse.
if (isExecuteOnlyConnection(project.config.connections[input.connectionId])) {
writeSetupSection(input.io, `Registering ${input.connectionId}`, [
'Execute-only connection (scan_enabled: false) — skipping schema scan.',
]);
return okValidateResult();
}
const scopeStatus = await maybeConfigureDatabaseScope({ ...input, forcePrompt: input.forceScopeAndTables });
if (scopeStatus.status !== 'ok') {
return scopeStatus;