Improve schema setup and Notion ingest UX

This commit is contained in:
Luca Martial 2026-05-11 12:34:33 -07:00
parent 155613c794
commit 72a4ace13c
21 changed files with 540 additions and 118 deletions

View file

@ -327,8 +327,19 @@ describe('createRepainter', () => {
repainter.paint('hello');
repainter.paint('bye');
expect(io.stdout()).toContain('\rbye');
expect(io.stdout()).not.toContain('\u001b[1A\rbye');
expect(io.stdout()).toContain('bye');
expect(io.stdout()).not.toMatch(/\[\d+A/);
});
// Regression test: repaint content whose first line is exactly as wide as the
// fake terminal (10 characters with `columns: 10`).
it('does not undershoot cursor-up when a line is exactly the terminal width', () => {
const io = makeIo({ isTTY: true, columns: 10 });
const repainter = createRepainter(io.io);
// Paint the same two-line content twice.
repainter.paint('0123456789\nsecond\n');
repainter.paint('0123456789\nsecond\n');
// Collect the row count of every cursor-up sequence (`[<n>A`) written to stdout.
const cursorMoves = [...io.stdout().matchAll(/\[(\d+)A/g)].map((m) => Number(m[1]));
// Across both paints exactly one cursor-up is expected, and it moves 2 rows.
expect(cursorMoves).toEqual([2]);
});
});

View file

@ -378,7 +378,8 @@ export function createRepainter(io: KtxCliIo) {
}
io.stdout.write('\r');
}
io.stdout.write(content.replaceAll('\n', `${ESC}[K\n`));
io.stdout.write(`${ESC}[2K`);
io.stdout.write(content.replaceAll('\n', `\n${ESC}[2K`));
io.stdout.write(`${ESC}[J`);
hasPainted = true;
lastCursorUpRows = cursorUpRowsAfterWrite(content);

View file

@ -531,7 +531,7 @@ describe('setup databases step', () => {
message: 'Primary sources already configured: warehouse\nWhat would you like to do?',
options: [
{ value: 'add', label: 'Add another primary source' },
{ value: 'continue', label: 'Continue setup' },
{ value: 'continue', label: 'Continue to knowledge sources' },
{ value: 'back', label: 'Back' },
],
});
@ -582,7 +582,7 @@ describe('setup databases step', () => {
message: 'Primary sources already configured: warehouse\nWhat would you like to do?',
options: [
{ value: 'add', label: 'Add another primary source' },
{ value: 'continue', label: 'Continue setup' },
{ value: 'continue', label: 'Continue to knowledge sources' },
{ value: 'back', label: 'Back' },
],
});
@ -617,7 +617,7 @@ describe('setup databases step', () => {
message: 'Primary sources already configured: postgres-warehouse\nWhat would you like to do?',
options: [
{ value: 'add', label: 'Add another primary source' },
{ value: 'continue', label: 'Continue setup' },
{ value: 'continue', label: 'Continue to knowledge sources' },
{ value: 'back', label: 'Back' },
],
});
@ -652,7 +652,7 @@ describe('setup databases step', () => {
message: 'Primary sources already configured: postgres-warehouse\nWhat would you like to do?',
options: [
{ value: 'add', label: 'Add another primary source' },
{ value: 'continue', label: 'Continue setup' },
{ value: 'continue', label: 'Continue to knowledge sources' },
{ value: 'back', label: 'Back' },
],
});
@ -695,7 +695,7 @@ describe('setup databases step', () => {
message: 'Primary sources already configured: warehouse\nWhat would you like to do?',
options: [
{ value: 'add', label: 'Add another primary source' },
{ value: 'continue', label: 'Continue setup' },
{ value: 'continue', label: 'Continue to knowledge sources' },
{ value: 'back', label: 'Back' },
],
});
@ -918,6 +918,10 @@ describe('setup databases step', () => {
'│ ✓ Connection test passed',
'│ Driver: PostgreSQL · Tables: 2',
'│',
].join('\n'),
);
expect(io.stdout()).toContain(
[
'◇ Scanning postgres-warehouse',
'│ ✓ Structural scan completed',
'│ Changes: 2 new tables',
@ -1007,7 +1011,7 @@ describe('setup databases step', () => {
expect(config.connections['postgres-warehouse']).toMatchObject({
schemas: ['orbit_analytics', 'orbit_raw'],
});
expect(io.stdout()).toContain('Schemas: orbit_analytics, orbit_raw');
expect(io.stdout()).toContain(' orbit_analytics, orbit_raw');
});
it('auto-selects all discovered Postgres schemas in non-interactive setup', async () => {
@ -1043,7 +1047,7 @@ describe('setup databases step', () => {
expect(config.connections.warehouse).toMatchObject({
schemas: ['orbit_analytics', 'orbit_raw', 'public'],
});
expect(io.stdout()).toContain('Schemas: orbit_analytics, orbit_raw, public');
expect(io.stdout()).toContain(' orbit_analytics, orbit_raw, public');
});
it('adds one non-interactive Postgres URL connection, tests it, scans it, and marks databases complete', async () => {

View file

@ -112,6 +112,56 @@ const DEFAULT_CONNECTION_IDS: Record<KtxSetupDatabaseDriver, string> = {
snowflake: 'snowflake-warehouse',
};
/**
 * Describes how one database driver names and stores its scan scope, so the
 * setup step can discover, prompt for, and persist that scope generically.
 */
interface ScopeDiscoverySpec {
/** Singular name of one scope unit; interpolated into prompt and warning text. */
noun: string;
/** Plural form of `noun`; interpolated into the prompt and, capitalised, the "saved" section title. */
nounPlural: string;
/** Driver-specific label for the multiselect prompt; lower-cased for the discovery title and warning. */
promptLabel: string;
/** Connection-config key that holds the list of selected values. */
configArrayField: string;
/** Connection-config key that may hold a single value; dropped from the config when the list is written. */
configSingleField: string;
/** Returns the values preselected in the prompt when no configured value matches the discovered ones. */
defaultSelection: (values: string[]) => string[];
}
/**
 * Scope-discovery settings keyed by driver. The record is partial: for a driver
 * with no entry, `maybeConfigureSchemaScope` returns early without discovering
 * or prompting.
 */
const SCOPE_DISCOVERY_SPECS: Partial<Record<KtxSetupDatabaseDriver, ScopeDiscoverySpec>> = {
postgres: {
noun: 'schema',
nounPlural: 'schemas',
promptLabel: 'PostgreSQL schemas',
configArrayField: 'schemas',
configSingleField: 'schema',
// Preselect everything except `public`; if that leaves nothing, preselect all.
defaultSelection(schemas) {
const nonPublic = schemas.filter((s) => s !== 'public');
return nonPublic.length > 0 ? nonPublic : schemas;
},
},
sqlserver: {
noun: 'schema',
nounPlural: 'schemas',
promptLabel: 'SQL Server schemas',
configArrayField: 'schemas',
configSingleField: 'schema',
// Preselect every discovered schema.
defaultSelection: (schemas) => schemas,
},
bigquery: {
noun: 'dataset',
nounPlural: 'datasets',
promptLabel: 'BigQuery datasets',
configArrayField: 'dataset_ids',
configSingleField: 'dataset_id',
// Preselect every discovered dataset.
defaultSelection: (datasets) => datasets,
},
snowflake: {
noun: 'schema',
nounPlural: 'schemas',
promptLabel: 'Snowflake schemas',
configArrayField: 'schema_names',
configSingleField: 'schema_name',
// Same rule as postgres, but the excluded name is upper-case `PUBLIC`.
defaultSelection(schemas) {
const nonPublic = schemas.filter((s) => s !== 'PUBLIC');
return nonPublic.length > 0 ? nonPublic : schemas;
},
},
};
type UrlDriverType = Extract<KtxSetupDatabaseDriver, 'postgres' | 'mysql' | 'clickhouse' | 'sqlserver'>;
const DRIVER_CONNECTION_DEFAULTS: Record<UrlDriverType, { port: string }> = {
@ -260,16 +310,53 @@ async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Pr
async function defaultListSchemas(projectDir: string, connectionId: string): Promise<string[]> {
const project = await loadKtxProject({ projectDir });
const connection = project.config.connections[connectionId];
const { KtxPostgresScanConnector, isKtxPostgresConnectionConfig } = await import('@ktx/connector-postgres');
if (!isKtxPostgresConnectionConfig(connection)) {
return [];
const driver = normalizeDriver(connection?.driver);
if (driver === 'postgres') {
const { KtxPostgresScanConnector, isKtxPostgresConnectionConfig } = await import('@ktx/connector-postgres');
if (!isKtxPostgresConnectionConfig(connection)) return [];
const connector = new KtxPostgresScanConnector({ connectionId, connection });
try {
return await connector.listSchemas();
} finally {
await connector.cleanup();
}
}
const connector = new KtxPostgresScanConnector({ connectionId, connection });
try {
return await connector.listSchemas();
} finally {
await connector.cleanup();
if (driver === 'sqlserver') {
const { KtxSqlServerScanConnector, isKtxSqlServerConnectionConfig } = await import('@ktx/connector-sqlserver');
if (!isKtxSqlServerConnectionConfig(connection)) return [];
const connector = new KtxSqlServerScanConnector({ connectionId, connection });
try {
return await connector.listSchemas();
} finally {
await connector.cleanup();
}
}
if (driver === 'bigquery') {
const { KtxBigQueryScanConnector, isKtxBigQueryConnectionConfig } = await import('@ktx/connector-bigquery');
if (!isKtxBigQueryConnectionConfig(connection)) return [];
const connector = new KtxBigQueryScanConnector({ connectionId, connection });
try {
return await connector.listDatasets();
} finally {
await connector.cleanup();
}
}
if (driver === 'snowflake') {
const { KtxSnowflakeScanConnector, isKtxSnowflakeConnectionConfig } = await import('@ktx/connector-snowflake');
if (!isKtxSnowflakeConnectionConfig(connection)) return [];
const connector = new KtxSnowflakeScanConnector({ connectionId, connection });
try {
return await connector.listSchemas();
} finally {
await connector.cleanup();
}
}
return [];
}
function existingConnectionIdsByDriver(
@ -307,7 +394,7 @@ function configuredPrimarySourcesPrompt(connectionIds: string[]): {
message: `Primary sources already configured: ${connectionIds.join(', ')}\nWhat would you like to do?`,
options: [
{ value: 'add', label: 'Add another primary source' },
{ value: 'continue', label: 'Continue setup' },
{ value: 'continue', label: 'Continue to knowledge sources' },
{ value: 'back', label: 'Back' },
],
};
@ -831,41 +918,44 @@ async function writeConnectionConfig(input: {
}
}
function configuredSchemas(connection: KtxProjectConnectionConfig | undefined): string[] {
function configuredScopeValues(
connection: KtxProjectConnectionConfig | undefined,
spec: ScopeDiscoverySpec,
): string[] {
if (!connection) return [];
if (Array.isArray(connection.schemas)) {
return connection.schemas
.filter((schema): schema is string => typeof schema === 'string' && schema.trim().length > 0)
.map((schema) => schema.trim());
const arrayVal = connection[spec.configArrayField];
if (Array.isArray(arrayVal)) {
return arrayVal
.filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
.map((v) => v.trim());
}
return typeof connection.schema === 'string' && connection.schema.trim().length > 0 ? [connection.schema.trim()] : [];
const singleVal = connection[spec.configSingleField];
return typeof singleVal === 'string' && singleVal.trim().length > 0 ? [singleVal.trim()] : [];
}
function defaultSchemaSelection(schemas: string[]): string[] {
const nonPublic = schemas.filter((schema) => schema !== 'public');
return nonPublic.length > 0 ? nonPublic : schemas;
}
async function writeConnectionSchemas(input: {
async function writeScopeConfig(input: {
projectDir: string;
connectionId: string;
schemas: string[];
values: string[];
spec: ScopeDiscoverySpec;
}): Promise<void> {
const project = await loadKtxProject({ projectDir: input.projectDir });
const connection = project.config.connections[input.connectionId];
if (!connection) return;
const { schema: _schema, ...connectionWithoutLegacySchema } = connection;
const cleaned = Object.fromEntries(
Object.entries(connection).filter(([key]) => key !== input.spec.configSingleField),
) as KtxProjectConnectionConfig;
await writeConnectionConfig({
projectDir: input.projectDir,
connectionId: input.connectionId,
connection: {
...connectionWithoutLegacySchema,
schemas: unique(input.schemas),
...cleaned,
[input.spec.configArrayField]: unique(input.values),
},
});
}
async function maybeConfigurePostgresSchemas(input: {
async function maybeConfigureSchemaScope(input: {
projectDir: string;
connectionId: string;
args: KtxSetupDatabasesArgs;
@ -875,65 +965,77 @@ async function maybeConfigurePostgresSchemas(input: {
}): Promise<boolean> {
const project = await loadKtxProject({ projectDir: input.projectDir });
const connection = project.config.connections[input.connectionId];
if (normalizeDriver(connection?.driver) !== 'postgres') {
return true;
}
const driver = normalizeDriver(connection?.driver);
if (!driver) return true;
if (configuredSchemas(connection).length > 0) {
const spec = SCOPE_DISCOVERY_SPECS[driver];
if (!spec) return true;
const arrayVal = connection?.[spec.configArrayField];
if (Array.isArray(arrayVal) && arrayVal.length > 0) {
return true;
}
if (input.args.databaseSchemas.length > 0) {
await writeConnectionSchemas({
await writeScopeConfig({
projectDir: input.projectDir,
connectionId: input.connectionId,
schemas: input.args.databaseSchemas,
values: input.args.databaseSchemas,
spec,
});
return true;
}
let discoveredSchemas: string[];
writeSetupSection(input.io, `Discovering ${spec.promptLabel.toLowerCase()}`, [
`Connecting to ${input.connectionId}`,
]);
let discovered: string[];
try {
discoveredSchemas = unique(
discovered = unique(
await (input.deps.listSchemas ?? defaultListSchemas)(input.projectDir, input.connectionId),
);
} catch (error) {
input.io.stderr.write(
`Could not discover PostgreSQL schemas for ${input.connectionId}; continuing with existing schema scope. ` +
`Could not discover ${spec.promptLabel.toLowerCase()} for ${input.connectionId}; continuing with existing ${spec.noun} scope. ` +
`Pass --database-schema to set it explicitly. ${error instanceof Error ? error.message : String(error)}\n`,
);
return true;
}
if (discoveredSchemas.length === 0) {
if (discovered.length === 0) {
return true;
}
let selectedSchemas: string[];
if (input.args.inputMode === 'disabled' || discoveredSchemas.length === 1) {
selectedSchemas = discoveredSchemas;
let selected: string[];
if (input.args.inputMode === 'disabled' || discovered.length === 1) {
selected = discovered;
} else {
const initialValues = defaultSchemaSelection(discoveredSchemas);
const preconfigured = configuredScopeValues(connection, spec).filter((v) => discovered.includes(v));
const initialValues = preconfigured.length > 0 ? preconfigured : spec.defaultSelection(discovered);
const choices = await input.prompts.multiselect({
message: withMultiselectNavigation(
'PostgreSQL schemas to scan\nKTX found multiple non-system schemas. Select every schema agents should use.',
`${spec.promptLabel} to scan\n` +
`KTX found multiple ${spec.nounPlural}. Select every ${spec.noun} agents should use.`,
),
options: discoveredSchemas.map((schema) => ({ value: schema, label: schema })),
options: discovered.map((v) => ({ value: v, label: v })),
initialValues,
required: true,
});
if (choices.includes('back')) {
return false;
}
selectedSchemas = choices.length > 0 ? choices : initialValues;
selected = choices.length > 0 ? choices : initialValues;
}
await writeConnectionSchemas({
await writeScopeConfig({
projectDir: input.projectDir,
connectionId: input.connectionId,
schemas: selectedSchemas,
values: selected,
spec,
});
writeSetupSection(input.io, `Selecting schemas for ${input.connectionId}`, [
`Schemas: ${selectedSchemas.join(', ')}`,
const capitalNounPlural = spec.nounPlural[0]!.toUpperCase() + spec.nounPlural.slice(1);
writeSetupSection(input.io, `${capitalNounPlural} saved for ${input.connectionId}`, [
`${selected.join(', ')}`,
]);
return true;
}
@ -1049,7 +1151,7 @@ async function validateAndScanConnection(input: {
testLines.push(`Driver: ${driverDisplay}${Number.isFinite(tableCount) ? ` · Tables: ${tableCount}` : ''}`);
writeSetupSection(input.io, `Testing ${input.connectionId}`, testLines);
if (!(await maybeConfigurePostgresSchemas(input))) {
if (!(await maybeConfigureSchemaScope(input))) {
return false;
}

View file

@ -211,6 +211,37 @@ describe('setup sources step', () => {
expect(runMapping).toHaveBeenCalledWith(projectDir, 'prod_metabase', io.io);
});
// Checks the per-run knowledge limits written for a Notion connection that is
// set up with prompts disabled and no initial ingest.
it('writes Notion config with the full default knowledge create budget', async () => {
await addPrimarySource();
// Stub Notion validation so it always reports success.
const validateNotion = vi.fn(async () => ({ ok: true as const, detail: 'roots=1' }));
await expect(
runKtxSetupSourcesStep(
{
projectDir,
inputMode: 'disabled',
source: 'notion',
sourceConnectionId: 'notion-main',
sourceApiKeyRef: 'env:NOTION_TOKEN',
notionCrawlMode: 'selected_roots',
notionRootPageIds: ['page-1'],
runInitialSourceIngest: false,
skipSources: false,
},
makeIo().io,
{ validateNotion },
),
).resolves.toEqual({ status: 'ready', projectDir, connectionIds: ['notion-main'] });
// The persisted connection must carry a create limit of 25 and an update limit of 20.
expect((await readConfig()).connections['notion-main']).toMatchObject({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
root_page_ids: ['page-1'],
max_knowledge_creates_per_run: 25,
max_knowledge_updates_per_run: 20,
});
});
it('defaults interactive Metabase and Looker source setup to the only warehouse connection', async () => {
await addPrimarySource();
const cases: Array<{

View file

@ -36,6 +36,8 @@ import { writeProjectLocalSecretReference } from './setup-secrets.js';
export type KtxSetupSourceType = 'dbt' | 'metricflow' | 'metabase' | 'looker' | 'lookml' | 'notion';
/** Value `buildNotionConnection` writes to `max_knowledge_creates_per_run` for a Notion connection. */
const DEFAULT_NOTION_MAX_KNOWLEDGE_CREATES_PER_RUN = 25;
export interface KtxSetupSourcesArgs {
projectDir: string;
inputMode: 'auto' | 'disabled';
@ -521,7 +523,7 @@ function buildNotionConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionC
root_database_ids: [],
root_data_source_ids: [],
max_pages_per_run: 1000,
max_knowledge_creates_per_run: 5,
max_knowledge_creates_per_run: DEFAULT_NOTION_MAX_KNOWLEDGE_CREATES_PER_RUN,
max_knowledge_updates_per_run: 20,
last_successful_cursor: null,
};