ktx/packages/cli/test/context/scan/local-enrichment-artifacts.test.ts

912 lines
29 KiB
TypeScript
Raw Permalink Normal View History

2026-05-10 23:12:26 +02:00
import { mkdtemp, readFile, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import YAML from 'yaml';
test: split cli tests from source tree (#216) * feat(cli): define full warehouse dialect contract * test(cli): keep dialect edge tests focused * fix(cli): stabilize dialect contract foundation * refactor(connectors): own read-only query preparation * refactor(connectors): resolve dialects through registry * refactor(connectors): keep concrete dialect classes internal * chore(workspace): enforce dialect import boundary * refactor(cli): resolve relationship dialect at scan boundary * refactor(cli): use dialect display parsing for entity details * refactor(cli): use dialect display parsing for warehouse catalog * refactor(cli): use dialect SQL in relationship workflows * test(cli): verify solid dialect scan workflow closure * test: split cli tests from source tree * refactor(cli): standardize BigQuery scope listing * feat(sqlite): implement connector scope listing * test(connectors): cover required table listing * feat(cli): add warehouse driver registry * refactor(setup): route scope discovery through driver registry * refactor(cli): route local query execution through driver registry * refactor(historic-sql): route dialect support through driver registry * refactor(cli): test warehouse connections through driver registry * fix(cli): close driver registry type export gaps * Improve setup daemon diagnostics * refactor(setup): centralize rail-prefixed diagnostics + query-history fallback Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput into clack.ts so the setup wizard, managed daemons, and embedding/agent steps share one rail-formatted writer. setup-databases.ts also adds a "disable query history and retry" option when the schema-context build fails and query history is the likely culprit, surfaced via a new failed-query-history-unavailable status. 
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match The setup picker's KtxTableListEntry was a 2-level { schema, name }, so qualifiedTableId always wrote db.name into enabled_tables. When BigQuery, Snowflake, or SQL Server later ran fast ingest, their introspect step filtered the scope set with scopedTableNames(scope, { catalog: projectId|database, db }) — catalog was non-null on the introspect side but null in the scope refs, so every entry was rejected, the live-database adapter staged zero table files, and detect() failed with 'Adapter "live-database" did not recognize fetched source output'. Align the picker boundary with the canonical 3-level KtxTableRef: - Add catalog: string | null to KtxTableListEntry. - BigQuery/Snowflake/SQL Server listTables populate catalog from the resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null. - qualifiedTableId emits catalog.schema.name when catalog is non-null (resolveEnabledTables already accepts the 3-part shape) and schemasFromEnabledTables now goes through parseDottedTableEntry so it recovers the schema correctly from both 2-part and 3-part entries. - Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker reuse. Update listTables expectations in all seven connector tests and the setup / picker test fixtures. Add a picker regression test that covers the catalog-bearing round-trip (save + refine). * fix(cli): allow debug telemetry under opt-out env
2026-05-26 08:49:05 +02:00
import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js';
import type { KtxLocalScanEnrichmentResult } from '../../../src/context/scan/local-enrichment.js';
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from '../../../src/context/scan/local-enrichment-artifacts.js';
import type { KtxSchemaSnapshot } from '../../../src/context/scan/types.js';
2026-05-10 23:12:26 +02:00
2026-05-10 23:51:24 +02:00
/**
 * Shared schema snapshot fixture used by every test in this file.
 *
 * Describes a `postgres` connection named `warehouse` with two tables in the
 * `public` schema: `customers` (primary key `id`) and `orders` (primary key
 * `id`, plus `customer_id` with a declared foreign key to `customers.id`).
 * Every table and column carries a `DB ...` comment.
 */
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    {
      catalog: null,
      db: 'public',
      name: 'customers',
      kind: 'table',
      comment: 'DB customer table',
      estimatedRows: 2,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'DB customer id',
        },
      ],
    },
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'DB orders table',
      estimatedRows: 3,
      // Declared (formal) foreign key: orders.customer_id -> customers.id.
      foreignKeys: [
        {
          fromColumn: 'customer_id',
          toCatalog: null,
          toDb: 'public',
          toTable: 'customers',
          toColumn: 'id',
          constraintName: 'orders_customer_id_fkey',
        },
      ],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'DB order id',
        },
        {
          name: 'customer_id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: 'DB customer id',
        },
      ],
    },
  ],
};
2026-05-10 23:51:24 +02:00
/**
 * Builds a fresh enrichment-result fixture on top of the shared `snapshot`.
 *
 * The fixture reports the description, embedding and deterministic-relationship
 * stages as completed and the LLM/statistical validation stages as skipped. It
 * carries AI descriptions for both tables, two embedding updates for `orders`
 * columns, and a single accepted relationship
 * `orders.customer_id -> customers.id`.
 *
 * A new object is returned on every call, so callers may spread and override
 * top-level fields without affecting one another.
 *
 * @returns The enrichment result fixture.
 */
function enrichment(): KtxLocalScanEnrichmentResult {
  return {
    snapshot,
    summary: {
      dataDictionary: 'completed',
      tableDescriptions: 'completed',
      columnDescriptions: 'completed',
      embeddings: 'completed',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    state: {
      resumedStages: [],
      completedStages: ['descriptions', 'embeddings', 'relationships'],
      failedStages: [],
    },
    warnings: [],
    descriptionUpdates: [
      {
        table: { catalog: null, db: 'public', name: 'orders' },
        tableDescription: 'AI orders table',
        columnDescriptions: {
          id: 'AI order id',
          customer_id: 'AI customer reference',
        },
      },
      {
        table: { catalog: null, db: 'public', name: 'customers' },
        tableDescription: 'AI customers table',
        columnDescriptions: {
          id: 'AI customer id',
        },
      },
    ],
    embeddingUpdates: [
      { columnId: 'public.orders.id', text: 'orders id', embedding: [0.1, 0.2] },
      { columnId: 'public.orders.customer_id', text: 'orders customer_id', embedding: [0.3, 0.4] },
    ],
    // Summary-level relationship record; tagged 'inferred' here, while the
    // detailed entry in `resolvedRelationships` below is tagged 'llm_proposal'.
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
          source: 'inferred',
          from: {
            tableId: 'public.orders',
            columnIds: ['public.orders.customer_id'],
            table: { catalog: null, db: 'public', name: 'orders' },
            columns: ['customer_id'],
          },
          to: {
            tableId: 'public.customers',
            columnIds: ['public.customers.id'],
            table: { catalog: null, db: 'public', name: 'customers' },
            columns: ['id'],
          },
          relationshipType: 'many_to_one',
          confidence: 0.95,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    relationshipProfile: {
      connectionId: 'warehouse',
      driver: 'postgres',
      sqlAvailable: true,
      queryCount: 6,
      tables: [{ table: { catalog: null, db: 'public', name: 'customers' }, rowCount: 2 }],
      columns: {
        'customers.id': {
          table: { catalog: null, db: 'public', name: 'customers' },
          column: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          rowCount: 2,
          nullCount: 0,
          distinctCount: 2,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: ['1', '2'],
          minTextLength: 1,
          maxTextLength: 1,
        },
      },
      warnings: [],
    },
    // Fully scored relationship, including evidence, validation and graph
    // details that the tests expect to find in relationships.json.
    resolvedRelationships: [
      {
        id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
        source: 'llm_proposal',
        status: 'accepted',
        from: {
          tableId: 'public.orders',
          columnIds: ['public.orders.customer_id'],
          table: { catalog: null, db: 'public', name: 'orders' },
          columns: ['customer_id'],
        },
        to: {
          tableId: 'public.customers',
          columnIds: ['public.customers.id'],
          table: { catalog: null, db: 'public', name: 'customers' },
          columns: ['id'],
        },
        relationshipType: 'many_to_one',
        confidence: 0.92,
        pkScore: 0.95,
        fkScore: 0.91,
        score: 0.9,
        evidence: {
          sourceColumnBase: 'buyer',
          targetTableBase: 'customer',
          targetColumnBase: 'id',
          targetKeyScore: 0.88,
          nameScore: 0.45,
          reasons: ['llm_proposal', 'llm_pk_proposal'],
          llmConfidence: 0.89,
          llmRationale: 'Buyer reference values align with customer identifiers.',
        },
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          sourceNullRate: 0,
          targetNullRate: 0,
          childDistinct: 2,
          parentDistinct: 2,
          overlap: 2,
          checkedValues: 2,
          reasons: ['validation_passed'],
        },
        graph: {
          targetPkScore: 0.95,
          incomingCandidateCount: 1,
          conflictRank: 1,
          reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'],
        },
      },
    ],
    compositeRelationships: null,
  };
}
describe('writeLocalScanEnrichmentArtifacts', () => {
// Per-test scratch state: each test gets its own temp directory and a freshly
// initialised project inside it.
let tempDir: string;
let project: KtxLocalProject;

// Create a unique temp directory and initialise a project under `<tempDir>/project`.
beforeEach(async () => {
  tempDir = await mkdtemp(join(tmpdir(), 'ktx-local-enrichment-artifacts-'));
  project = await initKtxProject({
    projectDir: join(tempDir, 'project'),
  });
});

// Remove the temp directory; `force` keeps cleanup from failing if it is already gone.
afterEach(async () => {
  await rm(tempDir, { recursive: true, force: true });
});
// End-to-end check of writeLocalScanEnrichmentArtifacts against a project that
// already contains a manifest shard with user-pinned descriptions and a manual join.
it('writes enrichment artifacts and manifest shards while preserving external descriptions', async () => {
  // Seed an existing shard so the writer has user/AI descriptions and a manual
  // join to merge with.
  await project.fileStore.writeFile(
    'semantic-layer/warehouse/_schema/public.yaml',
    YAML.stringify(
      {
        tables: {
          orders: {
            table: 'public.orders',
            descriptions: { user: 'Pinned analyst description', ai: 'Old AI description' },
            columns: [
              {
                name: 'id',
                type: 'number',
                descriptions: { user: 'Pinned id description', ai: 'Old AI id' },
              },
              { name: 'customer_id', type: 'number' },
            ],
            joins: [
              {
                to: 'customers',
                on: 'orders.id = customers.id',
                relationship: 'many_to_one',
                source: 'manual',
              },
            ],
          },
        },
      },
      { indent: 2, lineWidth: 0 },
    ),
    'ktx',
    'ktx@example.com',
    'Seed manifest shard',
  );

  const result = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-1',
    driver: 'postgres',
    enrichment: enrichment(),
    dryRun: false,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.91,
      reviewThreshold: 0.61,
      maxLlmTablesPerBatch: 12,
      maxCandidatesPerColumn: 7,
      profileSampleRows: 500,
      profileConcurrency: 3,
      validationConcurrency: 2,
    },
  });

  // The writer reports every artifact path plus the single rewritten shard.
  expect(result).toEqual({
    enrichmentArtifacts: [
      'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/embeddings.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json',
    ],
    manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
    manifestShardsWritten: 1,
  });

  await expect(
    readFile(
      join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json'),
      'utf-8',
    ),
  ).resolves.toContain('AI orders table');

  // relationships.json must carry the detailed resolved relationship.
  const relationshipsRaw = await readFile(
    join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json'),
    'utf-8',
  );
  const relationshipsArtifact = JSON.parse(relationshipsRaw) as {
    accepted: Array<{
      id: string;
      status: string;
      source: string;
      pkScore: number;
      fkScore: number;
      evidence: unknown;
      reasons: string[];
      validation: unknown;
      graph: unknown;
    }>;
    review: unknown[];
    rejected: unknown[];
    skipped: unknown[];
  };
  expect(relationshipsArtifact.accepted).toHaveLength(1);
  expect(relationshipsArtifact.accepted[0]).toMatchObject({
    id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
    status: 'accepted',
    source: 'llm_proposal',
    pkScore: 0.95,
    fkScore: 0.91,
    evidence: expect.objectContaining({
      llmConfidence: 0.89,
      llmRationale: 'Buyer reference values align with customer identifiers.',
    }),
    reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
    validation: expect.objectContaining({ reasons: ['validation_passed'] }),
    graph: expect.objectContaining({ reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'] }),
  });
  expect(relationshipsArtifact.review).toEqual([]);
  expect(relationshipsArtifact.rejected).toEqual([]);
  expect(relationshipsArtifact.skipped).toEqual([]);

  const profileRaw = await readFile(
    join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json'),
    'utf-8',
  );
  expect(JSON.parse(profileRaw)).toMatchObject({
    connectionId: 'warehouse',
    driver: 'postgres',
    sqlAvailable: true,
    queryCount: 6,
    warnings: [],
  });

  // Diagnostics echo the thresholds and policy passed in relationshipSettings.
  const diagnosticsRaw = await readFile(
    join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json'),
    'utf-8',
  );
  expect(JSON.parse(diagnosticsRaw)).toMatchObject({
    connectionId: 'warehouse',
    summary: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    noAcceptedReason: null,
    candidateCountsBySource: { llm_proposal: 1 },
    validation: { available: true, sqlAvailable: true, queryCount: 6 },
    thresholds: { acceptThreshold: 0.91, reviewThreshold: 0.61 },
    policy: {
      validationRequiredForManifest: true,
      maxCandidatesPerColumn: 7,
      profileSampleRows: 500,
      profileConcurrency: 3,
      validationConcurrency: 2,
    },
    profileWarnings: [],
  });

  const manifestRaw = await readFile(
    join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
    'utf-8',
  );
  const manifest = YAML.parse(manifestRaw) as {
    tables: {
      orders: {
        descriptions: Record<string, string>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
        joins: Array<{ to: string; on: string; source: string }>;
      };
    };
  };

  // User descriptions survive; db and ai descriptions come from the snapshot
  // comments and the enrichment updates respectively.
  expect(manifest.tables.orders.descriptions).toEqual({
    user: 'Pinned analyst description',
    db: 'DB orders table',
    ai: 'AI orders table',
  });
  expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
    user: 'Pinned id description',
    db: 'DB order id',
    ai: 'AI order id',
  });

  // Both the newly written formal join and the pre-existing manual join must be present.
  expect(manifest.tables.orders.joins).toEqual(
    expect.arrayContaining([
      expect.objectContaining({
        to: 'customers',
        on: 'orders.customer_id = customers.id',
        source: 'formal',
      }),
      expect.objectContaining({
        to: 'customers',
        on: 'orders.id = customers.id',
        source: 'manual',
      }),
    ]),
  );
});
// Verifies that an accepted relationship with source 'formal' (and no resolved
// relationship details) still reaches relationships.json and the manifest shard.
it('writes formal accepted relationships into relationship artifacts and manifest shards', async () => {
  const source = enrichment();
  // Override only the relationship fields; everything else comes from the base fixture.
  const formalEnrichment: KtxLocalScanEnrichmentResult = {
    ...source,
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
          source: 'formal',
          from: {
            tableId: 'public.orders',
            columnIds: ['public.orders.customer_id'],
            table: { catalog: null, db: 'public', name: 'orders' },
            columns: ['customer_id'],
          },
          to: {
            tableId: 'public.customers',
            columnIds: ['public.customers.id'],
            table: { catalog: null, db: 'public', name: 'customers' },
            columns: ['id'],
          },
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    resolvedRelationships: [],
    compositeRelationships: null,
  };

  const result = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    driver: 'sqlite',
    syncId: 'sync-formal',
    enrichment: formalEnrichment,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.85,
      reviewThreshold: 0.55,
      maxLlmTablesPerBatch: 40,
      maxCandidatesPerColumn: 25,
      profileSampleRows: 10000,
      profileConcurrency: 4,
      validationConcurrency: 4,
    },
    dryRun: false,
  });

  const relationshipsPath = 'raw-sources/warehouse/live-database/sync-formal/enrichment/relationships.json';
  const relationships = JSON.parse((await project.fileStore.readFile(relationshipsPath)).content) as {
    accepted: Array<{ source: string; reasons: string[] }>;
  };
  expect(relationships.accepted).toEqual([
    expect.objectContaining({
      source: 'formal',
      reasons: ['formal_metadata_accepted'],
    }),
  ]);

  // Fail with a clear message (and narrow the type) if no shard was reported.
  const manifestPath = result.manifestShards[0];
  if (!manifestPath) {
    throw new Error('Expected manifest shard path');
  }
  const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
    tables: { orders: { joins: Array<{ to: string; on: string; source: string }> } };
  };
  expect(manifest.tables.orders.joins).toEqual(
    expect.arrayContaining([
      expect.objectContaining({
        to: 'customers',
        on: 'orders.customer_id = customers.id',
        source: 'formal',
      }),
    ]),
  );
});
it('writes manually applied relationship joins with manual source', async () => {
  // One hand-curated orders -> customers relationship is passed directly as
  // `relationshipUpdate`; its `source` is 'manual'.
  const writeOutcome = await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-manual',
    driver: 'postgres',
    snapshot,
    dryRun: false,
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'public.orders:(public.orders.customer_id)->public.customers:(public.customers.id)',
          source: 'manual',
          from: {
            tableId: 'public.orders',
            columnIds: ['public.orders.customer_id'],
            table: { catalog: null, db: 'public', name: 'orders' },
            columns: ['customer_id'],
          },
          to: {
            tableId: 'public.customers',
            columnIds: ['public.customers.id'],
            table: { catalog: null, db: 'public', name: 'customers' },
            columns: ['id'],
          },
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
  });

  expect(writeOutcome.manifestShardsWritten).toBe(1);

  // Read the shard back from disk and confirm the join carries the manual source.
  const shardFile = join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml');
  const shardYaml = await readFile(shardFile, 'utf8');
  const publicShard = YAML.parse(shardYaml);
  const expectedJoin = {
    to: 'customers',
    on: 'orders.customer_id = customers.id',
    relationship: 'many_to_one',
    source: 'manual',
  };
  expect(publicShard.tables.orders.joins).toContainEqual(expectedJoin);
});
it('does not persist generated error descriptions in manifest shards', async () => {
  // Placeholder text used for both the table and the `id` column description.
  const failedGenerationText = 'Error generating description: timeout exceeded when trying to connect';

  await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-error-description',
    driver: 'postgres',
    snapshot,
    descriptionUpdates: [
      {
        table: { catalog: null, db: 'public', name: 'orders' },
        tableDescription: failedGenerationText,
        columnDescriptions: {
          id: failedGenerationText,
          customer_id: 'AI customer reference',
        },
      },
    ],
    dryRun: false,
  });

  const shardFile = join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml');
  const publicShard = YAML.parse(await readFile(shardFile, 'utf8')) as {
    tables: {
      orders: {
        descriptions?: Record<string, string>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
      };
    };
  };
  const ordersTable = publicShard.tables.orders;
  // Looks up the description map of a single `orders` column by name.
  const columnDescriptions = (columnName: string) =>
    ordersTable.columns.find((entry) => entry.name === columnName)?.descriptions;

  // The error placeholders must not appear as `ai` entries ...
  expect(ordersTable.descriptions).toEqual({ db: 'DB orders table' });
  expect(columnDescriptions('id')).toEqual({
    db: 'DB order id',
  });
  // ... while the genuine generated description is written.
  expect(columnDescriptions('customer_id')).toEqual({
    db: 'DB customer id',
    ai: 'AI customer reference',
  });
});
2026-05-10 23:12:26 +02:00
it('writes accepted composite relationships to relationship artifacts and manifest shards', async () => {
  // Two tables sharing the column pair (order_id, line_number); neither declares
  // a foreign key or primary key, so the relationship below is the only link.
  const compositeSnapshot: KtxSchemaSnapshot = {
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-05-07T12:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {},
    tables: [
      {
        catalog: null,
        db: 'public',
        name: 'order_lines',
        kind: 'table',
        comment: null,
        estimatedRows: 2,
        foreignKeys: [],
        columns: [
          {
            name: 'order_id',
            nativeType: 'integer',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: false,
            comment: null,
          },
          {
            name: 'line_number',
            nativeType: 'integer',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: false,
            comment: null,
          },
        ],
      },
      {
        catalog: null,
        db: 'public',
        name: 'order_line_allocations',
        kind: 'table',
        comment: null,
        estimatedRows: 2,
        foreignKeys: [],
        columns: [
          {
            name: 'order_id',
            nativeType: 'integer',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: false,
            comment: null,
          },
          {
            name: 'line_number',
            nativeType: 'integer',
            normalizedType: 'integer',
            dimensionType: 'number',
            nullable: false,
            primaryKey: false,
            comment: null,
          },
        ],
      },
    ],
  };

  // Start from the shared enrichment fixture and override it with one accepted
  // two-column relationship, present both in `relationshipUpdate` and in
  // `compositeRelationships` (the latter carries the validation details).
  const compositeEnrichment: KtxLocalScanEnrichmentResult = Object.assign(enrichment(), {
    snapshot: compositeSnapshot,
    relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    descriptionUpdates: [],
    embeddingUpdates: [],
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
          source: 'inferred',
          from: {
            tableId: 'public.order_line_allocations',
            columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
            table: { catalog: null, db: 'public', name: 'order_line_allocations' },
            columns: ['order_id', 'line_number'],
          },
          to: {
            tableId: 'public.order_lines',
            columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
            table: { catalog: null, db: 'public', name: 'order_lines' },
            columns: ['order_id', 'line_number'],
          },
          relationshipType: 'many_to_one',
          confidence: 0.95,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    resolvedRelationships: [],
    compositeRelationships: [
      {
        id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
        source: 'composite_profile_match',
        status: 'accepted',
        from: {
          tableId: 'public.order_line_allocations',
          columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
          table: { catalog: null, db: 'public', name: 'order_line_allocations' },
          columns: ['order_id', 'line_number'],
        },
        to: {
          tableId: 'public.order_lines',
          columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
          table: { catalog: null, db: 'public', name: 'order_lines' },
          columns: ['order_id', 'line_number'],
        },
        relationshipType: 'many_to_one',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          childDistinct: 2,
          parentDistinct: 2,
          overlap: 2,
          reasons: ['composite_validation_passed'],
        },
      },
    ],
  });

  const result = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    driver: 'postgres',
    syncId: 'sync-composite',
    enrichment: compositeEnrichment,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.85,
      reviewThreshold: 0.55,
      maxLlmTablesPerBatch: 40,
      maxCandidatesPerColumn: 25,
      profileSampleRows: 10000,
      profileConcurrency: 4,
      validationConcurrency: 4,
    },
    dryRun: false,
  });

  // The relationships artifact must keep both columns on each side and report
  // the composite validation reason.
  const relationships = JSON.parse(
    (await project.fileStore.readFile('raw-sources/warehouse/live-database/sync-composite/enrichment/relationships.json'))
      .content,
  ) as { accepted: Array<{ from: { columns: string[] }; to: { columns: string[] }; reasons: string[] }> };
  expect(relationships.accepted[0]).toMatchObject({
    from: { columns: ['order_id', 'line_number'] },
    to: { columns: ['order_id', 'line_number'] },
    reasons: ['composite_validation_passed'],
  });

  // Guard instead of a non-null assertion so a missing shard fails with a clear message.
  const manifestPath = result.manifestShards[0];
  if (!manifestPath) {
    throw new Error('Expected manifest shard path');
  }

  // The manifest shard must render the two-column join as a single AND-ed condition.
  const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
    tables: { order_line_allocations: { joins: Array<{ to: string; on: string; source: string }> } };
  };
  expect(manifest.tables.order_line_allocations.joins).toEqual([
    {
      to: 'order_lines',
      on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
      relationship: 'many_to_one',
      source: 'inferred',
    },
  ]);
});
it('writes structural manifest shards without enrichment artifacts', async () => {
  // Seed an existing shard containing user-pinned and AI descriptions, a usage
  // block, and a manual join, so the structural write has content to merge with.
  await project.fileStore.writeFile(
    'semantic-layer/warehouse/_schema/public.yaml',
    YAML.stringify(
      {
        tables: {
          orders: {
            table: 'public.orders',
            descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
            usage: {
              narrative: 'Orders are commonly filtered by lifecycle status.',
              frequencyTier: 'high',
              commonFilters: ['status'],
              commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
              ownerNote: 'Preserve analyst note',
            },
            columns: [
              {
                name: 'id',
                type: 'number',
                descriptions: { user: 'Pinned structural id', ai: 'Old generated id' },
              },
              { name: 'customer_id', type: 'number' },
            ],
            joins: [
              {
                to: 'customers',
                on: 'orders.id = customers.id',
                relationship: 'many_to_one',
                source: 'manual',
              },
            ],
          },
        },
      },
      { indent: 2, lineWidth: 0 },
    ),
    'ktx',
    'ktx@example.com',
    'Seed structural manifest shard',
  );

  // Structural write only: no description or relationship updates are supplied.
  const result = await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-structural-1',
    driver: 'postgres',
    snapshot,
    dryRun: false,
  });
  expect(result).toEqual({
    manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
    manifestShardsWritten: 1,
  });

  // No enrichment artifact may be created for this sync id.
  await expect(
    readFile(
      join(project.projectDir, 'raw-sources/warehouse/live-database/sync-structural-1/enrichment/descriptions.json'),
      'utf-8',
    ),
  ).rejects.toMatchObject({ code: 'ENOENT' });

  const manifestRaw = await readFile(
    join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
    'utf-8',
  );
  const manifest = YAML.parse(manifestRaw) as {
    tables: {
      orders: {
        descriptions: Record<string, string>;
        usage?: Record<string, unknown>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
        joins: Array<{ to: string; on: string; source: string }>;
      };
    };
  };

  // User descriptions survive and the seeded `ai` entries are gone; the `db`
  // values presumably originate from the shared `snapshot` fixture (defined
  // outside this block).
  expect(manifest.tables.orders.descriptions).toEqual({
    user: 'Pinned structural description',
    db: 'DB orders table',
  });
  // The usage block must be carried over unchanged.
  expect(manifest.tables.orders.usage).toEqual({
    narrative: 'Orders are commonly filtered by lifecycle status.',
    frequencyTier: 'high',
    commonFilters: ['status'],
    commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
    ownerNote: 'Preserve analyst note',
  });
  expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
    user: 'Pinned structural id',
    db: 'DB order id',
  });
  // The seeded manual join must coexist with the formal join (the latter is
  // presumably derived from the snapshot's declared foreign key).
  expect(manifest.tables.orders.joins).toEqual(
    expect.arrayContaining([
      expect.objectContaining({
        to: 'customers',
        on: 'orders.customer_id = customers.id',
        source: 'formal',
      }),
      expect.objectContaining({
        to: 'customers',
        on: 'orders.id = customers.id',
        source: 'manual',
      }),
    ]),
  );
});
it('returns planned empty paths without writing files during dry runs', async () => {
  const dryRunOutcome = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-dry-run',
    driver: 'postgres',
    enrichment: enrichment(),
    dryRun: true,
  });

  // A dry run reports no artifacts and no shards ...
  const nothingWritten = {
    enrichmentArtifacts: [],
    manifestShards: [],
    manifestShardsWritten: 0,
  };
  expect(dryRunOutcome).toEqual(nothingWritten);

  // ... and the descriptions artifact must not exist on disk.
  const descriptionsFile = join(
    project.projectDir,
    'raw-sources/warehouse/live-database/sync-dry-run/enrichment/descriptions.json',
  );
  await expect(readFile(descriptionsFile, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });
});
});