feat(connections): add execute-only warehouses; stop silent full-project scans

A configured warehouse was always a scan/ingest target. The only way to use a
connection purely for SQL execution (ktx sql / sql_execution) was the leaky
workaround of an empty setup.database_connection_ids — which actually re-includes
every warehouse via the 'fall back to all' branch — so e.g. a BigQuery connection
meant only for read-only queries triggered a full-billing-project scan.

- Add a per-connection scan_enabled flag (default true) to warehouse connections.
  scan_enabled: false registers the connection for execution only and never as a
  scan target.
- Route every scan-target selection path through one predicate
  (isScanTargetWarehouse): both ingest (primaryWarehouseConnectionIds, including
  the all-warehouses fallback) and setup (configuredPrimaryConnectionIds) now
  exclude execute-only connections. Setup validates the credential but skips
  scope discovery and scan for them. Execution paths are untouched — the warehouse
  descriptor still resolves, so ktx sql / sql_execution keep working.
- Scripted setup with no --database-schema no longer silently scopes the scan to
  every discovered schema/dataset: it now warns with their count and explains how
  to narrow the scan (--database-schema) or opt out (scan_enabled: false).
This commit is contained in:
Andrey Avtomonov 2026-06-09 13:05:15 +02:00
parent a02fcab487
commit ece0dfb2c8
10 changed files with 187 additions and 3 deletions

View file

@ -1,10 +1,32 @@
import { describe, expect, it } from 'vitest';
import {
isExecuteOnlyConnection,
isScanTargetWarehouse,
localConnectionInfoFromConfig,
localConnectionToWarehouseDescriptor,
localConnectionTypeForConfig,
} from '../../../src/context/connections/local-warehouse-descriptor.js';
describe('execute-only warehouse connections', () => {
  // A connection that omits scan_enabled stays on the default path and remains scannable.
  it('treats a warehouse without scan_enabled as a scan target', () => {
    const scannable = { driver: 'postgres', url: 'postgresql://db/a' } as const;

    expect(isExecuteOnlyConnection(scannable)).toBe(false);
    expect(isScanTargetWarehouse('w', scannable)).toBe(true);
  });

  it('excludes a warehouse with scan_enabled: false from scan targets but still resolves it as a warehouse', () => {
    const executeOnly = { driver: 'postgres', url: 'postgresql://db/a', scan_enabled: false } as const;

    expect(isExecuteOnlyConnection(executeOnly)).toBe(true);
    expect(isScanTargetWarehouse('w', executeOnly)).toBe(false);

    // `ktx sql` relies on the descriptor, so opting out of scans must not make it disappear.
    const descriptor = localConnectionToWarehouseDescriptor('w', executeOnly);
    expect(descriptor).not.toBeNull();
  });

  // Only warehouse drivers can ever be scan targets; a Notion connection is not one.
  it('does not treat non-warehouse connections as scan targets', () => {
    expect(
      isScanTargetWarehouse(
        'n',
        { driver: 'notion', auth_token: 'x' } as never,
      ),
    ).toBe(false);
  });
});
describe('localConnectionToWarehouseDescriptor', () => {
it('maps local Postgres URLs to canonical warehouse descriptors', () => {
expect(

View file

@ -634,6 +634,21 @@ describe('local ingest adapters', () => {
await expect(adapter?.listTargetConnectionIds?.('/tmp/staged-dbt')).resolves.toEqual(['warehouse']);
});
it('excludes execute-only (scan_enabled: false) warehouses from primary scan targets', async () => {
  // setup.database_connection_ids is intentionally absent, so target selection takes the
  // "all warehouses" fallback. That fallback must skip the execute-only connection
  // instead of re-including it.
  const project = projectWithConnections({
    scannable: { driver: 'postgres', url: 'postgresql://db/a' },
    executeonly: { driver: 'postgres', url: 'postgresql://db/b', scan_enabled: false },
    docs: { driver: 'dbt', source_dir: './dbt' },
  } as never);
  const adapters = createDefaultLocalIngestAdapters(project);

  const dbtAdapter = adapters.find(({ source }) => source === 'dbt');
  const targetIds = dbtAdapter?.listTargetConnectionIds?.('/tmp/staged-dbt');

  await expect(targetIds).resolves.toEqual(['scannable']);
});
it('passes primary warehouse connection ids to the local Notion adapter', async () => {
const adapters = createDefaultLocalIngestAdapters(
projectWithConnections({

View file

@ -129,6 +129,18 @@ connections:
expect(serialized).not.toContain('completed_steps:');
});
it('parses and serializes a warehouse connection marked execute-only (scan_enabled: false)', () => {
  // YAML nesting is indentation-sensitive: `public_bq` must sit under `connections`, and
  // `driver` / `scan_enabled` under `public_bq`, otherwise they parse as unrelated keys
  // and `config.connections.public_bq` never carries the flag.
  const config = parseKtxProjectConfig(`
connections:
  public_bq:
    driver: bigquery
    scan_enabled: false
`);

  // The flag has to survive parsing...
  expect(config.connections.public_bq).toMatchObject({ driver: 'bigquery', scan_enabled: false });
  // ...and be written back out on serialization rather than dropped.
  expect(serializeKtxProjectConfig(config)).toContain('scan_enabled: false');
});
it('parses global direct Anthropic LLM config', () => {
const config = parseKtxProjectConfig(`
llm:

View file

@ -1586,6 +1586,64 @@ describe('setup databases step', () => {
});
});
it('registers an execute-only connection (scan_enabled: false) without scanning it', async () => {
  // The fixture must nest `driver` / `scan_enabled` under `public_bq` (deeper indentation);
  // with equal indentation they become siblings of `public_bq` and the connection is empty.
  await writeFile(
    join(tempDir, 'ktx.yaml'),
    ['connections:', '  public_bq:', '    driver: bigquery', '    scan_enabled: false', ''].join('\n'),
    'utf-8',
  );
  const io = makeIo();
  const testConnection = vi.fn(async () => 0);
  const scanConnection = vi.fn(async () => 0);

  const result = await runKtxSetupDatabasesStep(
    {
      projectDir: tempDir,
      inputMode: 'disabled',
      databaseConnectionIds: ['public_bq'],
      databaseSchemas: [],
      skipDatabases: false,
    },
    io.io,
    // NOTE(review): listSchemas is supplied but never asserted on; consider also checking it
    // was not called if "skips scope discovery" is meant to be part of this contract.
    { testConnection, scanConnection, listSchemas: vi.fn(async () => ['a', 'b', 'c']) },
  );

  expect(result.status).toBe('ready');
  // The credential is validated, but the warehouse is never introspected/scanned.
  expect(testConnection).toHaveBeenCalledWith(tempDir, 'public_bq', expect.anything());
  expect(scanConnection).not.toHaveBeenCalled();
});
it('warns instead of silently scanning every discovered dataset when scripted setup has no scope', async () => {
  // `driver` must be indented deeper than `warehouse` so it is parsed as that connection's
  // property rather than as a second entry under `connections`.
  await writeFile(
    join(tempDir, 'ktx.yaml'),
    ['connections:', '  warehouse:', '    driver: bigquery', ''].join('\n'),
    'utf-8',
  );
  const io = makeIo();

  // Scripted run (input disabled) with an explicit connection id but no --database-schema.
  const result = await runKtxSetupDatabasesStep(
    {
      projectDir: tempDir,
      inputMode: 'disabled',
      databaseConnectionIds: ['warehouse'],
      databaseSchemas: [],
      skipDatabases: false,
    },
    io.io,
    {
      testConnection: vi.fn(async () => 0),
      scanConnection: vi.fn(async () => 0),
      listSchemas: vi.fn(async () => ['stripe', 'posthog', 'linear']),
      listTables: vi.fn(async () => []),
    },
  );

  expect(result.status).toBe('ready');
  // The warning names the connection and points at the scan_enabled: false opt-out.
  expect(io.stderr()).toContain('No --database-schema given for warehouse');
  expect(io.stderr()).toContain('scan_enabled: false');
});
it('keeps scripted database ids fail-fast even when input mode is auto', async () => {
await writeFile(
join(tempDir, 'ktx.yaml'),