mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-22 08:38:08 +02:00
chore(workspace): gate dead-code with knip production mode (#196)
* refactor(workspace): relocate @ktx/llm source into packages/cli/src/llm * refactor(workspace): rewrite @ktx/llm imports to relative paths * refactor(workspace): fold internal packages into cli * chore(workspace): gate dead-code with knip production mode Turn on production-mode knip plus an autofix run in pre-commit and the `pnpm dead-code` script, document the `/** @internal */` convention for test-only exports in AGENTS.md, annotate test-only exports across the CLI with that JSDoc, and drop dead exports/wrappers the new gate surfaced (e.g. `cli-project.ts`, `lookerRuntimeSourceToFileAdapterSource`, `createLocalScanEnrichmentProvidersFromConfig`, `PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES`, stale type re-exports). Replace the loose `ignoreIssues` allowlist in `knip.json` with explicit production entries so cross-package barrel leaks are caught. * refactor(cli): delete internal barrel index.ts files The 34 `index.ts` re-export barrels inside `packages/cli/src/` were holdovers from the pre-fold multi-workspace structure. Post-fold-in they served no production purpose: external consumers go through the single package main entry, and in-repo callers mostly imported through them only because the path was short. Internally, knip flagged most barrel re-exports as production-dead (only reached via tests). This change: - Deletes every internal barrel except `packages/cli/src/index.ts` (the published package entry). - Rewrites ~270 source/test files to import each name directly from the file that defines it. - Moves `tools/warehouse-verification/index.ts` to `create-warehouse-verification-tools.ts` (the function it defined locally) and updates its single consumer. - Renames `search/backend-conformance.ts` → `.test-utils.ts` to match the existing test-helper file convention. 
- Deletes 13 dead test-only chains (dbt-descriptions/*, live-database/extracted-schema, live-database/structural-sync, relationship-* feedback/review chain) plus their tests and a cascading orphan integration test. - Updates test mocks that pointed at deleted barrel paths (notion-client, connector barrels in scan/local-scan-connectors tests) to mock the source files instead. - Points the maintainer benchmark script (`scripts/relationship-benchmark-report.mjs`) at source files instead of `dist/context/scan/index.js`. - Drops the barrel `!` entries from `knip.json`; adds explicit production entries only for the benchmark code reached via dist by the maintainer script. Net: 413 files changed, ~1.2k insertions, ~9.4k deletions. `pnpm run dead-code` (Biome + knip default + knip production) and `pnpm run type-check` are clean; 2277 tests pass. * refactor(workspace): rename @ktx/cli to @kaelio/ktx and pack it directly Promote the CLI workspace package to the public name `@kaelio/ktx` and drop the separate `scripts/build-public-npm-package.mjs` wrapper. The CLI package is now publishable in place (`publishConfig.access: public`, `provenance: true`), so artifact packing uses `pnpm pack` against `packages/cli/` instead of assembling a parallel package tree. Updates all workspace filter invocations, docs, tests, and release readiness checks to reference the new package name, and folds the tarball-name helper into `scripts/public-npm-release-metadata.mjs`. * docs: align "agent clients" and "data agents" terminology Replace "client agents" with "agent clients" and "database agents" with "data agents" across AGENTS.md, README.md, the docs-site copy, and the matching setup-agents test description, matching the canonical vocabulary in docs/terminology.md. Also moves packages/cli/tsconfig.json's tsBuildInfoFile from node_modules/.cache/ to dist/.tsbuildinfo so incremental builds survive node_modules reinstalls. 
* refactor(release): single source of truth for package version Make packages/cli/package.json the single source of truth for the @kaelio/ktx version. publicNpmPackageVersion() now reads it directly, so artifact filenames, release-readiness checks, and the Python wheel version all derive from one field. The duplicate release-policy.json.publicNpmPackageVersion is removed. Previously the two fields could drift: tarballs were named kaelio-ktx-0.4.1.tgz while internally containing @kaelio/ktx@0.0.0-private. - update-public-release-version.mjs rewrites both Python pyproject.toml files (ktx-daemon, ktx-sl) alongside the npm package.jsons, normalizing the version for PEP 440 (e.g. 0.1.0-rc.2 -> 0.1.0rc2). - semantic-release-config.cjs adds the two pyproject.toml files to @semantic-release/git assets so the release commit back to main carries every version source in lockstep. - The six "?? '0.0.0-private'" fallback literals across the CLI are replaced with "?? getKtxCliPackageInfo().version", and createDefaultKtxMcpServer makes its version arg required. - docs/release.md describes the actual commit-back model: the dev tree always reflects the most recent release; no sentinel pin to maintain. Verified: pnpm run artifacts:build now produces kaelio-ktx-0.4.1.tgz and kaelio_ktx-0.4.1-py3-none-any.whl with @kaelio/ktx@0.4.1 inside. Full type-check, dead-code, and 2287 vitests + 173 script tests pass. * refactor(cli): inject embedding provider resolution and detect sentence-transformers runtime Make resolveProjectEmbeddingProvider and runtimeIo injectable in ingest and scan command entrypoints so tests can stub them, and teach resolvePublicIngestRuntimeRequirements to flag the local-embeddings runtime feature when ktx.yaml selects sentence-transformers. * chore(cli): mark buildLocalStatsStatus and LocalStatsStatus as @internal Both symbols are consumed only by status-project.test.ts. 
Annotating with /** @internal */ keeps knip's production-mode check clean without changing runtime behavior. * fix(cli): use real package metadata in print-command-tree The stubbed package name embedded a forbidden product identifier that tripped the boundary check in CI. Read the metadata from package.json instead — keeps the rendered tree unchanged and removes a duplicate source of truth. * feat(cli): show embedding coverage in `ktx status`, drop duplicate disk counts Inline `(N embedded)` next to the Wiki scope counts and Semantic-layer source counts, computed with `SUM(embedding_json IS NOT NULL)` over `knowledge_pages` and `local_sl_sources`. Rename the "Knowledge" label to "Wiki" (canonical per `docs/terminology.md`) and rename the matching `localStats.knowledgePages` field to `localStats.wikiPages`. Drop `wiki=N md` and `semantic-layer=N yaml` from the Disk row — those duplicated the per-surface rows above. Disk now reports only actual byte usage (db, cache, raw-sources). The unused `wikiGlobalMarkdownCount` / `semanticLayerYamlCount` fields, the `isMarkdownEntry` / `isYamlEntry` helpers, and the `filter` arg on `summarizeDir` are removed.
This commit is contained in:
parent
a1cfb03d73
commit
2366b00301
1002 changed files with 2286 additions and 12051 deletions
183
packages/cli/src/context/scan/credentials.test.ts
Normal file
183
packages/cli/src/context/scan/credentials.test.ts
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { REDACTED_KTX_CREDENTIAL_VALUE } from '../core/redaction.js';
|
||||
import {
|
||||
redactKtxCredentialEnvelope,
|
||||
redactKtxCredentialValue,
|
||||
redactKtxScanMetadata,
|
||||
redactKtxScanReport,
|
||||
redactKtxScanWarning,
|
||||
} from './credentials.js';
|
||||
import type { KtxCredentialEnvelope, KtxScanReport, KtxScanWarning } from './types.js';
|
||||
|
||||
// Exercises the scan credential helpers: secret-bearing fields must come back
// masked with REDACTED_KTX_CREDENTIAL_VALUE while non-sensitive context stays intact.
describe('KTX scan credential redaction', () => {
  it('keeps credential references inspectable', () => {
    // `env` and `file` envelopes only point at a secret; they carry no secret value themselves.
    const envReference: KtxCredentialEnvelope = { kind: 'env', name: 'DATABASE_URL' };
    const fileReference: KtxCredentialEnvelope = { kind: 'file', path: '~/.config/ktx/warehouse' };

    expect(redactKtxCredentialEnvelope(envReference)).toEqual(envReference);
    expect(redactKtxCredentialEnvelope(fileReference)).toEqual(fileReference);
  });

  it('redacts resolved credential envelope values recursively', () => {
    expect(
      redactKtxCredentialEnvelope({
        kind: 'resolved',
        source: 'host',
        values: {
          username: 'readonly',
          password: 'secret-password', // pragma: allowlist secret
          nested: {
            api_key: 'phx_123', // pragma: allowlist secret
            warehouse: 'compute_wh',
          },
          headers: [{ authorizationToken: 'token-value' }, { label: 'safe' }],
        },
      }),
    ).toEqual({
      kind: 'resolved',
      source: 'host',
      redacted: true,
      values: {
        username: 'readonly',
        password: REDACTED_KTX_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KTX_CREDENTIAL_VALUE,
          warehouse: 'compute_wh',
        },
        headers: [{ authorizationToken: REDACTED_KTX_CREDENTIAL_VALUE }, { label: 'safe' }],
      },
    });
  });

  it('redacts scan metadata fields that commonly contain secrets', () => {
    expect(
      redactKtxScanMetadata({
        driver: 'postgres',
        url: 'postgres://user:pass@example.test/db', // pragma: allowlist secret
        serviceAccountJson: {
          client_email: 'reader@example.test',
          private_key: 'pem-value', // pragma: allowlist secret
        },
        safeCount: 3,
      }),
    ).toEqual({
      driver: 'postgres',
      url: REDACTED_KTX_CREDENTIAL_VALUE,
      serviceAccountJson: {
        client_email: 'reader@example.test',
        private_key: REDACTED_KTX_CREDENTIAL_VALUE,
      },
      safeCount: 3,
    });
  });

  it('redacts scan warning messages and metadata without hiding safe context', () => {
    const warning: KtxScanWarning = {
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:secret@example.test/db', // pragma: allowlist secret
      recoverable: true,
      metadata: {
        table: 'orders',
        url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
        nested: {
          api_key: 'sk_test_123', // pragma: allowlist secret
          schema: 'public',
        },
      },
    };

    // In free text only the password part of the URL is masked; in metadata the whole `url` value is.
    expect(redactKtxScanWarning(warning)).toEqual({
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:<redacted>@example.test/db',
      recoverable: true,
      metadata: {
        table: 'orders',
        url: REDACTED_KTX_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KTX_CREDENTIAL_VALUE,
          schema: 'public',
        },
      },
    });
  });

  it('redacts scan report warning metadata recursively', () => {
    const report: KtxScanReport = {
      connectionId: 'warehouse',
      driver: 'postgres',
      syncId: 'sync-1',
      runId: 'run-1',
      trigger: 'cli',
      mode: 'structural',
      dryRun: false,
      artifactPaths: {
        rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1',
        reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json',
        manifestShards: [],
        enrichmentArtifacts: [],
      },
      diffSummary: {
        tablesAdded: 0,
        tablesModified: 0,
        tablesDeleted: 0,
        tablesUnchanged: 0,
        columnsAdded: 0,
        columnsModified: 0,
        columnsDeleted: 0,
      },
      manifestShardsWritten: 0,
      structuralSyncStats: {
        tablesCreated: 0,
        tablesUpdated: 0,
        tablesDeleted: 0,
        columnsCreated: 0,
        columnsUpdated: 0,
        columnsDeleted: 0,
      },
      enrichment: {
        dataDictionary: 'skipped',
        tableDescriptions: 'skipped',
        columnDescriptions: 'skipped',
        embeddings: 'skipped',
        deterministicRelationships: 'skipped',
        llmRelationshipValidation: 'skipped',
        statisticalValidation: 'skipped',
      },
      capabilityGaps: [],
      warnings: [
        {
          code: 'credential_redacted',
          message: 'metadata redacted',
          recoverable: true,
          metadata: {
            credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
            safeCount: 2,
          },
        },
      ],
      relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
      enrichmentState: {
        resumedStages: [],
        completedStages: [],
        failedStages: [],
      },
      createdAt: '2026-04-29T00:00:00.000Z',
    };

    const redacted = redactKtxScanReport(report);

    expect(redacted.warnings[0]?.metadata).toEqual({
      credentials_json: REDACTED_KTX_CREDENTIAL_VALUE,
      safeCount: 2,
    });
    // The input report must not be mutated by redaction.
    expect(report.warnings[0]?.metadata).toEqual({
      credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
      safeCount: 2,
    });
  });

  it('redacts standalone primitive credential values only when the field key is sensitive', () => {
    expect(redactKtxCredentialValue('password', 'abc')).toBe(REDACTED_KTX_CREDENTIAL_VALUE);
    expect(redactKtxCredentialValue('schema', 'public')).toBe('public');
  });
});
|
||||
51
packages/cli/src/context/scan/credentials.ts
Normal file
51
packages/cli/src/context/scan/credentials.ts
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import {
|
||||
redactKtxSensitiveMetadata,
|
||||
redactKtxSensitiveText,
|
||||
redactKtxSensitiveValue,
|
||||
} from '../core/redaction.js';
|
||||
import type { KtxCredentialEnvelope, KtxScanReport, KtxScanWarning } from './types.js';
|
||||
|
||||
/**
 * Redacts a single credential field.
 *
 * Thin wrapper that forwards to `redactKtxSensitiveValue` from the core
 * redaction module; the decision of what counts as sensitive lives there.
 *
 * @param key - Name of the field the value is stored under.
 * @param value - Value to inspect.
 * @returns The result of `redactKtxSensitiveValue(key, value)`.
 * @internal
 */
export function redactKtxCredentialValue(key: string, value: unknown): unknown {
  return redactKtxSensitiveValue(key, value);
}
|
||||
|
||||
/**
 * Redacts a scan metadata record.
 *
 * Thin wrapper that forwards to `redactKtxSensitiveMetadata` from the core
 * redaction module.
 *
 * @param metadata - Metadata record to redact.
 * @returns The result of `redactKtxSensitiveMetadata(metadata)`.
 * @internal
 */
export function redactKtxScanMetadata(metadata: Record<string, unknown>): Record<string, unknown> {
  return redactKtxSensitiveMetadata(metadata);
}
|
||||
|
||||
/**
 * Produces a copy of a credential envelope that is safe to display.
 *
 * Envelopes whose `kind` is not `'resolved'` are returned as-is (same object
 * reference). A `'resolved'` envelope is rebuilt from its `source` and redacted
 * `values`, and is flagged with `redacted: true`.
 *
 * @param envelope - Envelope to redact.
 * @returns The original envelope for non-resolved kinds, otherwise a new redacted envelope.
 * @internal
 */
export function redactKtxCredentialEnvelope(envelope: KtxCredentialEnvelope): KtxCredentialEnvelope {
  if (envelope.kind !== 'resolved') {
    return envelope;
  }

  // Only `kind`, `source` and the redacted `values` are carried over to the new envelope.
  return {
    kind: 'resolved',
    source: envelope.source,
    redacted: true,
    values: redactKtxScanMetadata(envelope.values),
  };
}
|
||||
|
||||
/**
 * Returns a copy of a scan warning with its message and metadata redacted.
 *
 * The message always goes through `redactKtxSensitiveText`. The `metadata`
 * field is only replaced when the warning has a truthy `metadata`; otherwise it
 * is carried over unchanged by the spread.
 *
 * @param warning - Warning to redact; it is not mutated.
 * @returns A new warning object.
 * @internal
 */
export function redactKtxScanWarning(warning: KtxScanWarning): KtxScanWarning {
  const redacted: KtxScanWarning = {
    ...warning,
    message: redactKtxSensitiveText(warning.message),
  };

  if (warning.metadata) {
    redacted.metadata = redactKtxScanMetadata(warning.metadata);
  }

  return redacted;
}
|
||||
|
||||
/**
 * Returns a shallow copy of a scan report whose warnings have been redacted.
 *
 * Every other report field is copied over unchanged by the spread; only
 * `warnings` is rebuilt, one `redactKtxScanWarning` call per entry.
 *
 * @param report - Report to redact; it is not mutated.
 * @returns A new report object with a new `warnings` array.
 */
export function redactKtxScanReport(report: KtxScanReport): KtxScanReport {
  return {
    ...report,
    warnings: report.warnings.map((warning) => redactKtxScanWarning(warning)),
  };
}
|
||||
114
packages/cli/src/context/scan/data-dictionary.test.ts
Normal file
114
packages/cli/src/context/scan/data-dictionary.test.ts
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
defaultKtxDataDictionarySettings,
|
||||
isKtxDataDictionaryCandidate,
|
||||
shouldKtxSampleColumnForDictionary,
|
||||
} from './data-dictionary.js';
|
||||
|
||||
// Exclusion patterns shipped with the default settings; reused by most cases below.
const defaultPatterns = defaultKtxDataDictionarySettings.excludePatterns;

// Covers which columns qualify for data-dictionary sampling, by type and by name.
describe('KTX scan data dictionary policy', () => {
  it('includes text-like and boolean categorical types', () => {
    expect(isKtxDataDictionaryCandidate('varchar(50)', 'status', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('VARCHAR', 'category', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('text', 'region', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('string', 'payment_method', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('nvarchar(100)', 'tier', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('enum', 'status', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('boolean', 'active', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('bool', 'verified', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('character varying(50)', 'region', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('character(1)', 'flag', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('ntext', 'category', defaultPatterns)).toBe(true);
  });

  it('excludes non-categorical primitive types', () => {
    expect(isKtxDataDictionaryCandidate('integer', 'count', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('bigint', 'total', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('timestamp', 'created', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('date', 'birth', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('numeric', 'amount', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('decimal(10,2)', 'price', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('float', 'rate', defaultPatterns)).toBe(false);
  });

  it('excludes configured high-cardinality or sensitive name patterns', () => {
    expect(isKtxDataDictionaryCandidate('varchar', 'user_id', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'session_uuid', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'api_key', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'password_hash', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'auth_token', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'id', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'created_at', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'birth_date', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('text', 'description', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('text', 'email_body', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'image_url', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'email', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'phone_number', defaultPatterns)).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'street_address', defaultPatterns)).toBe(false);
  });

  it('keeps business categorical names eligible', () => {
    expect(isKtxDataDictionaryCandidate('varchar', 'status', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'region', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'country', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'payment_method', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'currency', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'plan', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'category', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'tier', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'gender', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'language', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'order_type', defaultPatterns)).toBe(true);
    expect(isKtxDataDictionaryCandidate('varchar', 'order_status', defaultPatterns)).toBe(true);
  });

  it('respects host-provided exclusion patterns and skips invalid regex patterns', () => {
    expect(isKtxDataDictionaryCandidate('varchar', 'company_size', ['company'])).toBe(false);
    expect(isKtxDataDictionaryCandidate('varchar', 'status', ['company'])).toBe(true);
    // Patterns that fail to compile are ignored instead of throwing.
    expect(isKtxDataDictionaryCandidate('varchar', 'status', ['[invalid', '(unclosed'])).toBe(true);
  });

  it('skips columns that already have persisted dictionary state', () => {
    expect(
      shouldKtxSampleColumnForDictionary({
        columnType: 'varchar',
        columnName: 'status',
        sampleValues: ['paid'],
        cardinality: null,
        settings: defaultKtxDataDictionarySettings,
      }),
    ).toEqual({ sample: false, reason: 'already_populated' });

    expect(
      shouldKtxSampleColumnForDictionary({
        columnType: 'varchar',
        columnName: 'empty_status',
        sampleValues: null,
        cardinality: 0,
        settings: defaultKtxDataDictionarySettings,
      }),
    ).toEqual({ sample: false, reason: 'empty_column' });

    expect(
      shouldKtxSampleColumnForDictionary({
        columnType: 'varchar',
        columnName: 'customer_name',
        sampleValues: null,
        cardinality: 300,
        settings: defaultKtxDataDictionarySettings,
      }),
    ).toEqual({ sample: false, reason: 'high_cardinality' });

    expect(
      shouldKtxSampleColumnForDictionary({
        columnType: 'varchar',
        columnName: 'status',
        sampleValues: null,
        cardinality: null,
        settings: defaultKtxDataDictionarySettings,
      }),
    ).toEqual({ sample: true });
  });
});
|
||||
112
packages/cli/src/context/scan/data-dictionary.ts
Normal file
112
packages/cli/src/context/scan/data-dictionary.ts
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
/** Tunables for data-dictionary sampling during a scan. */
export interface KtxDataDictionarySettings {
  /** Columns whose known cardinality is strictly greater than this are not sampled. */
  cardinalityThreshold: number;
  /** NOTE(review): presumably the cap on distinct values persisted per column — not used in this file; confirm at the call site. */
  maxValuesToStore: number;
  /** NOTE(review): presumably the number of rows sampled per column — not used in this file; confirm at the call site. */
  sampleSize: number;
  /** NOTE(review): presumably toggles use of database-provided statistics — not used in this file; confirm at the call site. */
  useDbStatistics: boolean;
  /** Regular-expression sources tested case-insensitively against the lowercased column name; a match excludes the column. */
  excludePatterns: string[];
}
|
||||
|
||||
/**
 * Default data-dictionary settings.
 *
 * `excludePatterns` holds regular-expression sources that are matched against
 * column names; the comments below group them by the kind of name they target.
 */
export const defaultKtxDataDictionarySettings: KtxDataDictionarySettings = {
  cardinalityThreshold: 200,
  maxValuesToStore: 100,
  sampleSize: 10000,
  useDbStatistics: true,
  excludePatterns: [
    // Identifier-, key- and token-like suffixes, plus bare `id` / `uuid`.
    '_id$',
    '_uuid$',
    '_key$',
    '_hash$',
    '_token$',
    '^id$',
    '^uuid$',
    // Temporal suffixes.
    '_at$',
    '_date$',
    '_time$',
    // Free-text fields.
    'description$',
    'comment$',
    'notes?$',
    'message$',
    'body$',
    'content$',
    // Locations of resources.
    '_url$',
    '_path$',
    // Contact details.
    'email$',
    '^phone',
    'address$',
  ],
};
|
||||
|
||||
/** Reasons a column can be skipped for data-dictionary sampling. */
type KtxDataDictionarySkipReason =
  | 'not_candidate'
  | 'already_populated'
  | 'empty_column'
  | 'high_cardinality';
|
||||
|
||||
/**
 * Outcome of deciding whether a column should be sampled for the data dictionary.
 *
 * @internal
 */
export interface KtxDataDictionarySampleDecision {
  /** `true` when the column should be sampled. */
  sample: boolean;
  /** Why the column was skipped; set alongside `sample: false`. */
  reason?: KtxDataDictionarySkipReason;
}
|
||||
|
||||
/**
 * What is known about a column when deciding whether to sample it.
 *
 * @internal
 */
export interface KtxDataDictionaryColumnState {
  /** Column type as reported by the source, e.g. `varchar(50)`. */
  columnType: string;
  /** Column name; checked against the exclusion patterns in `settings`. */
  columnName: string;
  /** Sample values already on record for the column, if any. */
  sampleValues?: readonly string[] | null;
  /** Known cardinality of the column; `null` or omitted when unknown. */
  cardinality?: number | null;
  /** Settings supplying the cardinality threshold and exclusion patterns. */
  settings: KtxDataDictionarySettings;
}
|
||||
|
||||
// Column types treated as categorical. Anchored at the start only, so
// parameterised forms such as `varchar(50)` or `character varying(50)` match too.
const categoricalCandidateTypes = /^(n?varchar|n?char|n?text|string|character|enum|bool(ean)?)/i;

/**
 * Decides whether a column is eligible for data-dictionary sampling based on
 * its type and name alone.
 *
 * A column qualifies when its type matches `categoricalCandidateTypes` and its
 * lowercased name matches none of the exclusion patterns.
 *
 * @param columnType - Column type as reported by the source.
 * @param columnName - Column name.
 * @param excludePatterns - Regular-expression sources compiled with the `i`
 *   flag; defaults to the patterns in `defaultKtxDataDictionarySettings`.
 *   Patterns that fail to compile are skipped.
 * @returns `true` when the column is a candidate, `false` otherwise.
 */
export function isKtxDataDictionaryCandidate(
  columnType: string,
  columnName: string,
  excludePatterns: readonly string[] = defaultKtxDataDictionarySettings.excludePatterns,
): boolean {
  const typeLower = columnType.toLowerCase();
  const nameLower = columnName.toLowerCase();

  if (!categoricalCandidateTypes.test(typeLower)) {
    return false;
  }

  for (const patternText of excludePatterns) {
    let pattern: RegExp;
    try {
      pattern = new RegExp(patternText, 'i');
    } catch {
      // Deliberate best-effort: an invalid host-provided pattern must not break the scan.
      continue;
    }

    if (pattern.test(nameLower)) {
      return false;
    }
  }

  return true;
}
|
||||
|
||||
/**
 * Decides whether a column should be sampled for the data dictionary.
 *
 * Checks run in this order and the first one that applies wins:
 * 1. sample values already on record -> `already_populated`
 * 2. cardinality of exactly 0 -> `empty_column`
 * 3. cardinality above `settings.cardinalityThreshold` -> `high_cardinality`
 * 4. type/name not eligible per `isKtxDataDictionaryCandidate` -> `not_candidate`
 *
 * @param input - Column state and the settings to evaluate it against.
 * @returns `{ sample: true }`, or `{ sample: false, reason }` for the first failed check.
 * @internal
 */
export function shouldKtxSampleColumnForDictionary(
  input: KtxDataDictionaryColumnState,
): KtxDataDictionarySampleDecision {
  // Normalise "omitted" and `null` to a single "unknown" representation.
  const sampleValues = input.sampleValues ?? null;
  const cardinality = input.cardinality ?? null;

  if (sampleValues && sampleValues.length > 0) {
    return { sample: false, reason: 'already_populated' };
  }

  if (cardinality === 0) {
    return { sample: false, reason: 'empty_column' };
  }

  // Unknown cardinality never trips the threshold.
  if (cardinality !== null && cardinality > input.settings.cardinalityThreshold) {
    return { sample: false, reason: 'high_cardinality' };
  }

  if (!isKtxDataDictionaryCandidate(input.columnType, input.columnName, input.settings.excludePatterns)) {
    return { sample: false, reason: 'not_candidate' };
  }

  return { sample: true };
}
|
||||
667
packages/cli/src/context/scan/description-generation.test.ts
Normal file
667
packages/cli/src/context/scan/description-generation.test.ts
Normal file
|
|
@ -0,0 +1,667 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
|
||||
// Swap only `generateText` from the `ai` package for a mock; every other export
// is taken from the real module.
vi.mock('ai', async (importOriginal) => {
  const actual = await importOriginal<typeof import('ai')>();
  return { ...actual, generateText: vi.fn() };
});
|
||||
|
||||
import { generateText } from 'ai';
|
||||
import {
|
||||
buildKtxColumnDescriptionPrompt,
|
||||
buildKtxDataSourceDescriptionPrompt,
|
||||
buildKtxTableDescriptionPrompt,
|
||||
type KtxDescriptionCachePort,
|
||||
KtxDescriptionGenerator,
|
||||
} from './description-generation.js';
|
||||
import { createKtxConnectorCapabilities, type KtxScanConnector } from './types.js';
|
||||
|
||||
/**
 * Builds an in-memory `KtxDescriptionCachePort` backed by a `Map`.
 *
 * Table and column keys are the dot-joined, non-empty parts of
 * catalog / db / table (/ column); connection keys are prefixed with
 * `__connection:`. `get` and `set` are `vi.fn` spies so tests can assert on them.
 *
 * @param initial - Entries to pre-populate the cache with.
 */
function createCache(initial: Record<string, string> = {}): KtxDescriptionCachePort {
  const data = new Map(Object.entries(initial));

  return {
    buildTableKey: (table) => [table.catalog, table.db, table.name].filter(Boolean).join('.'),
    buildColumnKey: (table, columnName) => [table.catalog, table.db, table.name, columnName].filter(Boolean).join('.'),
    buildConnectionKey: (connectionName) => `__connection:${connectionName}`,
    get: vi.fn(async (key: string) => data.get(key) ?? null),
    set: vi.fn(async (key: string, value: string) => {
      data.set(key, value);
    }),
  };
}
|
||||
|
||||
/**
 * Builds a stub LLM runtime whose `generateText` resolves to `text`.
 *
 * The stub forwards each call to the mocked `generateText` from `ai`, so tests
 * can assert on the exact request shape (system message, user message,
 * temperature) that reached it.
 *
 * @param text - Text the mocked `generateText` resolves with.
 */
function createLlmProvider(text = 'generated description') {
  vi.mocked(generateText).mockResolvedValue({ text } as never);

  return {
    generateText: vi.fn(async (input) => {
      const result = await generateText({
        system: input.system ? { role: 'system', content: input.system } : undefined,
        messages: [{ role: 'user', content: input.prompt }],
        temperature: input.temperature,
      } as never);
      return result.text;
    }),
    generateObject: vi.fn(),
    runAgentLoop: vi.fn(),
  } as any;
}
|
||||
|
||||
/**
 * Builds a stub LLM runtime whose `generateText` always rejects.
 *
 * Same request forwarding as `createLlmProvider`, but the mocked `generateText`
 * from `ai` is configured to reject with `new Error(message)`.
 *
 * @param message - Message of the error the mocked `generateText` rejects with.
 */
function createFailingLlmProvider(message = 'timeout exceeded when trying to connect') {
  vi.mocked(generateText).mockRejectedValue(new Error(message) as never);

  return {
    generateText: vi.fn(async (input) => {
      const result = await generateText({
        system: input.system ? { role: 'system', content: input.system } : undefined,
        messages: [{ role: 'user', content: input.prompt }],
        temperature: input.temperature,
      } as never);
      return result.text;
    }),
    generateObject: vi.fn(),
    runAgentLoop: vi.fn(),
  } as any;
}
|
||||
|
||||
/**
 * Builds a stub scan connector for description-generation tests.
 *
 * It advertises table sampling, column sampling and nested analysis, returns
 * fixed sample data, and throws if `introspect` is ever called.
 */
function createConnector(): KtxScanConnector {
  return {
    id: 'test-connector',
    driver: 'postgres',
    capabilities: createKtxConnectorCapabilities({
      tableSampling: true,
      columnSampling: true,
      nestedAnalysis: true,
    }),
    // Fails loudly if the code under test reaches for introspection.
    introspect: vi.fn(async () => {
      throw new Error('introspection is not used by description generation');
    }),
    sampleColumn: vi.fn(async () => ({
      values: ['paid', 'refunded', null],
      nullCount: 1,
      distinctCount: 2,
    })),
    sampleTable: vi.fn(async () => ({
      headers: ['id', 'status', 'amount'],
      rows: [
        [1, 'paid', 20],
        [2, 'refunded', 10],
      ],
      totalRows: 2,
    })),
  };
}
|
||||
|
||||
// Checks the text of the system/user prompts produced by the three prompt builders.
describe('KTX description prompt builders', () => {
  it('builds column prompts with sample values, source descriptions, and nested BigQuery guidance', () => {
    const { system, user } = buildKtxColumnDescriptionPrompt({
      columnName: 'payload',
      columnValues: [{ nested: true }, '[1,2]'],
      tableContext: 'Table: events | Columns: payload | Data source: BIGQUERY',
      dataSourceType: 'BIGQUERY',
      supportsNestedAnalysis: true,
      rawDescriptions: { db: 'Raw event payload', ai: 'Old AI text', user: 'User text' },
      maxWords: 12,
    });

    expect(user).toContain(
      '<table_context> Table: events | Columns: payload | Data source: BIGQUERY </table_context>',
    );
    expect(user).toContain('<column_name> payload </column_name>');
    // NOTE(review): this pins object sample values rendering as "[object Object]" — confirm that is intended.
    expect(user).toContain('<sample_values> [object Object], [1,2] </sample_values>');
    expect(user).toContain('<db_documentation> Raw event payload </db_documentation>');
    // Only the `db` description is surfaced; the `ai` and `user` descriptions must not leak into the prompt.
    expect(user).not.toContain('Old AI text');
    expect(user).not.toContain('User text');
    expect(system).toContain('nested/structured data');
    // The word limit belongs to the system prompt, not the user prompt.
    expect(system).toContain('12 words or less');
    expect(user).not.toContain('12 words or less');
  });

  it('builds table and data-source prompts from sampled rows', () => {
    const sample = {
      headers: ['id', 'status'],
      rows: [
        [1, 'paid'],
        [2, 'refunded'],
      ],
      totalRows: 2,
    };

    const table = buildKtxTableDescriptionPrompt({
      tableName: 'orders',
      sampleData: sample,
      dataSourceType: 'POSTGRESQL',
      rawDescriptions: { dbt: 'Fact table for commerce orders' },
    });
    expect(table.user).toContain('status: paid, refunded');
    expect(table.system).toContain('Analyze database tables');

    const datasource = buildKtxDataSourceDescriptionPrompt({
      tableSamples: [['orders', sample]],
      dataSourceType: 'POSTGRESQL',
    });
    expect(datasource.user).toContain('orders (2 columns, 2 sample rows)');
    expect(datasource.system).toContain('Analyze databases');
  });
});
|
||||
|
||||
describe('KtxDescriptionGenerator', () => {
  // Word budgets used by every generator in this suite; spread into a fresh settings object per test.
  const wordLimits = { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 };
  // Scan context handed to every generator call and expected back on the sampling mocks.
  const scanContext = { runId: 'run-1' };
  // Table reference without a catalog, shared by the sampling-oriented tests.
  const ordersRef = { catalog: null, db: 'public', name: 'orders' };
  // Arguments the generator is expected to pass to `sampleColumn` for the `status` column.
  const statusSampleRequest = {
    connectionId: 'conn-1',
    table: { ...ordersRef },
    column: 'status',
    limit: 50,
  };

  it('generates column descriptions with pre-fetched values, cache hits, and word-limit metadata', async () => {
    const descriptionCache = createCache({ 'warehouse.public.orders.cached_status': 'Cached status description' });
    const llm = createLlmProvider('Payment state');
    const scanConnector = createConnector();
    const subject = new KtxDescriptionGenerator({
      llmRuntime: llm,
      cache: descriptionCache,
      settings: { ...wordLimits, temperature: 0.2, concurrencyLimit: 2 },
    });

    const outcome = await subject.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: scanConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: {
        catalog: 'warehouse',
        db: 'public',
        name: 'orders',
        columns: [
          { name: 'status', sampleValues: ['paid', 'refunded'], rawDescriptions: { db: 'Payment lifecycle' } },
          { name: 'cached_status', sampleValues: ['open'] },
        ],
      },
      skipExisting: false,
      existingDescriptions: {},
    });

    expect(outcome).toEqual({
      columnDescriptions: [
        ['status', 'Payment state'],
        ['cached_status', 'Cached status description'],
      ],
      processedColumns: ['status'],
      skippedColumns: ['cached_status'],
    });
    // Values were supplied up front, so the connector must not be asked to sample.
    expect(scanConnector.sampleColumn).not.toHaveBeenCalled();

    const expectedSystem = expect.objectContaining({
      role: 'system',
      content: expect.stringContaining('Please provide a concise description in 12 words or less.'),
    });
    const expectedUserMessage = expect.objectContaining({
      role: 'user',
      content: expect.stringContaining('<column_name> status </column_name>'),
    });
    expect(generateText).toHaveBeenCalledWith(
      expect.objectContaining({
        temperature: 0.2,
        system: expectedSystem,
        messages: expect.arrayContaining([expectedUserMessage]),
      }),
    );
    // The system prompt travels in `system`, never as an entry of `messages`.
    const mostRecentRequest = vi.mocked(generateText).mock.calls.at(-1)?.[0];
    expect(mostRecentRequest?.messages?.some((message) => message.role === 'system')).toBe(false);
  });

  it('samples through the connector when column values are not pre-fetched', async () => {
    const scanConnector = createConnector();
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Current order state'),
      settings: { ...wordLimits },
    });

    const outcome = await subject.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: scanConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: { ...ordersRef, columns: [{ name: 'status' }] },
    });

    expect(scanConnector.sampleColumn).toHaveBeenCalledWith(statusSampleRequest, scanContext);
    expect(outcome.columnDescriptions).toEqual([['status', 'Current order state']]);
  });

  it('samples through a description sampling port without requiring structural introspection', async () => {
    // A bare sampling port: only `id`, `sampleColumn` and `sampleTable`, no `introspect`.
    const samplingPort = {
      id: 'description-sampler:conn-1',
      sampleColumn: vi.fn(async () => ({
        values: ['paid', 'refunded'],
        nullCount: null,
        distinctCount: null,
      })),
      sampleTable: vi.fn(async () => ({
        headers: ['id', 'status'],
        rows: [[1, 'paid']],
        totalRows: 1,
      })),
    };
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Generated through sampler'),
      settings: { ...wordLimits },
    });

    const outcome = await subject.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: samplingPort,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: { ...ordersRef, columns: [{ name: 'status' }] },
    });

    expect(outcome.columnDescriptions).toEqual([['status', 'Generated through sampler']]);
    expect(samplingPort.sampleColumn).toHaveBeenCalledWith(statusSampleRequest, scanContext);
    expect('introspect' in samplingPort).toBe(false);
  });

  it('does not turn LLM failures into generated descriptions', async () => {
    const descriptionCache = createCache();
    const scanConnector = createConnector();
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createFailingLlmProvider(),
      cache: descriptionCache,
      settings: { ...wordLimits },
    });

    const columnOutcome = await subject.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: scanConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: { ...ordersRef, columns: [{ name: 'status' }] },
    });

    const tableOutcome = subject.generateTableDescription({
      connectionId: 'conn-1',
      connector: scanConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      table: { ...ordersRef },
    });
    await expect(tableOutcome).resolves.toBeNull();

    expect(columnOutcome).toEqual({
      columnDescriptions: [['status', null]],
      processedColumns: [],
      skippedColumns: [],
    });
    // A failed generation must never be written to the cache.
    expect(descriptionCache.set).not.toHaveBeenCalled();
  });

  it('generates and caches table and data-source descriptions', async () => {
    const descriptionCache = createCache();
    const scanConnector = createConnector();
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Commerce orders'),
      cache: descriptionCache,
      settings: { ...wordLimits, concurrencyLimit: 2 },
    });

    const tableOutcome = subject.generateTableDescription({
      connectionId: 'conn-1',
      connector: scanConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      table: { catalog: 'warehouse', db: 'public', name: 'orders', rawDescriptions: { db: 'Raw orders' } },
    });
    await expect(tableOutcome).resolves.toBe('Commerce orders');

    const dataSourceOutcome = subject.generateDataSourceDescription({
      connectionId: 'conn-1',
      connector: scanConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      tables: [
        { catalog: 'warehouse', db: 'public', name: 'orders' },
        { catalog: 'warehouse', db: 'public', name: 'customers' },
      ],
      connectionName: 'Warehouse',
    });
    await expect(dataSourceOutcome).resolves.toBe('Commerce orders');

    expect(descriptionCache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
    expect(descriptionCache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
  });
});
|
||||
|
||||
describe('KtxDescriptionGenerator resilience', () => {
  // Word budgets used by every generator in this suite; spread into a fresh settings object per test.
  const wordLimits = { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 };
  // Scan context (without an abort signal) shared by most tests.
  const scanContext = { runId: 'run-1' };

  /** Builds a logger whose four level methods are independent spies. */
  const createLogger = () => ({
    debug: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
  });

  /** Returns the default test connector with the given members replaced (or removed via `undefined`). */
  function connectorWith(overrides: Partial<KtxScanConnector>): KtxScanConnector {
    return { ...createConnector(), ...overrides };
  }

  /** Extracts the content of the user message from the most recent `generateText` call. */
  function lastUserPrompt(): string | undefined {
    const request = vi.mocked(generateText).mock.calls.at(-1)?.[0] as {
      messages: Array<{ role: string; content: string }>;
    };
    return request.messages.find((message) => message.role === 'user')?.content;
  }

  it('retries sampleTable on transient failure and uses sampled rows when it eventually succeeds', async () => {
    // Two rejections followed by a successful sample.
    const sampleTable = vi
      .fn<NonNullable<KtxScanConnector['sampleTable']>>()
      .mockRejectedValueOnce(new Error('pool: transient ECONNRESET'))
      .mockRejectedValueOnce(new Error('pool: transient ECONNRESET'))
      .mockResolvedValue({
        headers: ['id', 'status'],
        rows: [
          [1, 'paid'],
          [2, 'refunded'],
        ],
        totalRows: 2,
      });
    const flakyConnector = connectorWith({ sampleTable });
    const logger = createLogger();
    const warnings: Array<{ code: string; table?: string }> = [];
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Commerce orders'),
      logger,
      onWarning: (warning) => {
        const entry: { code: string; table?: string } = { code: warning.code };
        if (warning.table) {
          entry.table = warning.table;
        }
        warnings.push(entry);
      },
      settings: { ...wordLimits, concurrencyLimit: 2 },
    });

    const description = await subject.generateTableDescription({
      connectionId: 'conn-1',
      connector: flakyConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      table: { catalog: null, db: 'public', name: 'orders' },
    });

    expect(description).toBe('Commerce orders');
    expect(sampleTable).toHaveBeenCalledTimes(3);
    // Each failed attempt is logged, but a recovered sample raises no scan warning.
    expect(logger.warn).toHaveBeenCalledTimes(2);
    expect(warnings).toEqual([]);
  });

  it('falls back to metadata-only prompt when sampleTable retries exhaust', async () => {
    const sampleTable = vi
      .fn<NonNullable<KtxScanConnector['sampleTable']>>()
      .mockRejectedValue(new Error('pool: connection refused'));
    const brokenConnector = connectorWith({ sampleTable });
    const logger = createLogger();
    const warnings: Array<{ code: string; table?: string; metadata?: Record<string, unknown> }> = [];
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Customer reference data'),
      logger,
      onWarning: (warning) => {
        const entry: { code: string; table?: string; metadata?: Record<string, unknown> } = { code: warning.code };
        if (warning.table) {
          entry.table = warning.table;
        }
        if (warning.metadata) {
          entry.metadata = warning.metadata;
        }
        warnings.push(entry);
      },
      settings: { ...wordLimits, concurrencyLimit: 2 },
    });

    const description = await subject.generateTableDescription({
      connectionId: 'conn-1',
      connector: brokenConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      table: {
        catalog: null,
        db: 'public',
        name: 'customers',
        columns: [
          { name: 'id', nativeType: 'uuid' },
          { name: 'email', nativeType: 'text', comment: 'Primary contact email' },
        ],
      },
    });

    expect(description).toBe('Customer reference data');
    expect(sampleTable).toHaveBeenCalledTimes(3);
    expect(warnings.map((warning) => warning.code)).toEqual(['sampling_failed', 'description_fallback_used']);
    expect(warnings[1]?.metadata?.reason).toBe('sampling_failed');
    const userPrompt = lastUserPrompt();
    expect(userPrompt).toContain('Columns (metadata only, no sample rows)');
    expect(userPrompt).toContain('email (text)');
    expect(userPrompt).toContain('Primary contact email');
  });

  it('emits enrichment_failed and returns null when both sampling and metadata-only LLM fail', async () => {
    const sampleTable = vi
      .fn<NonNullable<KtxScanConnector['sampleTable']>>()
      .mockRejectedValue(new Error('pool: connection refused'));
    const brokenConnector = connectorWith({ sampleTable });
    const warningCodes: string[] = [];
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createFailingLlmProvider(),
      onWarning: (warning) => {
        warningCodes.push(warning.code);
      },
      settings: { ...wordLimits },
    });

    const description = await subject.generateTableDescription({
      connectionId: 'conn-1',
      connector: brokenConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      table: { catalog: null, db: 'public', name: 'orphan', columns: [{ name: 'id' }] },
    });

    expect(description).toBeNull();
    expect(warningCodes).toEqual(['sampling_failed', 'enrichment_failed']);
  });

  it('uses metadata-only fallback when connector has no sampleTable', async () => {
    const samplerWithoutTable = connectorWith({ sampleTable: undefined });
    const warningCodes: string[] = [];
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Orders mart'),
      onWarning: (warning) => {
        warningCodes.push(warning.code);
      },
      settings: { ...wordLimits },
    });

    const description = await subject.generateTableDescription({
      connectionId: 'conn-1',
      connector: samplerWithoutTable,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      table: {
        catalog: null,
        db: 'public',
        name: 'mart_orders',
        columns: [{ name: 'order_id', nativeType: 'uuid' }],
      },
    });

    expect(description).toBe('Orders mart');
    expect(warningCodes).toEqual(['connector_capability_missing', 'description_fallback_used']);
  });

  it('aborts retry loop when the scan context signal fires', async () => {
    const controller = new AbortController();
    // The first (and only) attempt aborts the scan and then fails.
    const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>().mockImplementation(async () => {
      controller.abort();
      throw new Error('first attempt blew up');
    });
    const abortingConnector = connectorWith({ sampleTable });
    const warningCodes: string[] = [];
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('should not be called'),
      onWarning: (warning) => {
        warningCodes.push(warning.code);
      },
      settings: { ...wordLimits },
    });

    const pending = subject.generateTableDescription({
      connectionId: 'conn-1',
      connector: abortingConnector,
      context: { runId: 'run-1', signal: controller.signal },
      dataSourceType: 'POSTGRESQL',
      table: { catalog: null, db: 'public', name: 'orders' },
    });
    await expect(pending).rejects.toThrow('aborted');

    expect(sampleTable).toHaveBeenCalledTimes(1);
    expect(warningCodes).toEqual([]);
  });

  it('generates column descriptions from rawDescriptions when sampleColumn is unavailable', async () => {
    const samplerWithoutColumn = connectorWith({ sampleColumn: undefined });
    const logger = createLogger();
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Payment lifecycle state'),
      logger,
      settings: { ...wordLimits },
    });

    const outcome = await subject.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: samplerWithoutColumn,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: {
        catalog: null,
        db: 'public',
        name: 'orders',
        columns: [{ name: 'status', rawDescriptions: { db: 'order lifecycle state' } }],
      },
    });

    expect(outcome.columnDescriptions).toEqual([['status', 'Payment lifecycle state']]);
    expect(logger.warn).toHaveBeenCalled();
    const userPrompt = lastUserPrompt();
    expect(userPrompt).toContain('<sample_values> unavailable </sample_values>');
    expect(userPrompt).toContain('<db_documentation> order lifecycle state </db_documentation>');
  });

  it('generates column descriptions from rawDescriptions when sampleColumn retries exhaust', async () => {
    const sampleColumn = vi
      .fn<NonNullable<KtxScanConnector['sampleColumn']>>()
      .mockRejectedValue(new Error('pool: connection refused'));
    const flakyConnector = connectorWith({ sampleColumn });
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('Customer reference identifier'),
      settings: { ...wordLimits },
    });

    const outcome = await subject.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: flakyConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: {
        catalog: null,
        db: 'public',
        name: 'orders',
        columns: [{ name: 'customer_id', rawDescriptions: { db: 'FK to customers.id' } }],
      },
    });

    expect(sampleColumn).toHaveBeenCalledTimes(3);
    expect(outcome.columnDescriptions).toEqual([['customer_id', 'Customer reference identifier']]);
  });

  it('skips column LLM call only when neither samples nor rawDescriptions are available', async () => {
    // Sampling succeeds but yields nothing except nulls.
    const sampleColumn = vi
      .fn<NonNullable<KtxScanConnector['sampleColumn']>>()
      .mockResolvedValue({ values: [null, null], nullCount: 2, distinctCount: 0 });
    const emptyConnector = connectorWith({ sampleColumn });
    // Drop calls recorded by earlier tests so the negative assertion below is meaningful.
    vi.mocked(generateText).mockClear();
    const subject = new KtxDescriptionGenerator({
      llmRuntime: createLlmProvider('should not be called'),
      settings: { ...wordLimits },
    });

    const outcome = await subject.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: emptyConnector,
      context: { ...scanContext },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: {
        catalog: null,
        db: 'public',
        name: 'orders',
        columns: [{ name: 'opaque_blob' }],
      },
    });

    expect(outcome.columnDescriptions).toEqual([['opaque_blob', null]]);
    expect(generateText).not.toHaveBeenCalled();
  });
});
|
||||
800
packages/cli/src/context/scan/description-generation.ts
Normal file
800
packages/cli/src/context/scan/description-generation.ts
Normal file
|
|
@ -0,0 +1,800 @@
|
|||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||
import type {
|
||||
KtxColumnSampleInput,
|
||||
KtxColumnSampleResult,
|
||||
KtxScanContext,
|
||||
KtxScanLoggerPort,
|
||||
KtxScanWarning,
|
||||
KtxTableRef,
|
||||
KtxTableSampleInput,
|
||||
KtxTableSampleResult,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Column metadata attached to a table-description request.
 * The table prompt builder renders it as `name (nativeType) — comment`
 * when no sample rows are available.
 */
interface KtxDescriptionTableColumn {
  /** Column name as shown in the prompt. */
  name: string;
  /** Database-native type; omitted from the prompt when empty or missing. */
  nativeType?: string | null;
  /** Existing column comment; omitted from the prompt when empty or missing. */
  comment?: string | null;
}
|
||||
|
||||
/**
 * Storage port for previously generated descriptions.
 * Implementations own the key format; callers obtain keys from the
 * `build*Key` methods and pass them back to `get` / `set`.
 */
export interface KtxDescriptionCachePort {
  /** Builds the cache key for a table-level description. */
  buildTableKey(table: KtxTableRef): string;
  /** Builds the cache key for the description of one column of `table`. */
  buildColumnKey(table: KtxTableRef, columnName: string): string;
  /** Builds the cache key for a connection (data-source) level description. */
  buildConnectionKey(connectionName: string): string;
  /** Resolves to the stored description for `key`, or `null`. */
  get(key: string): Promise<string | null>;
  /** Stores `value` under `key`. */
  set(key: string, value: string): Promise<void>;
}
|
||||
|
||||
/**
 * Minimal sampling surface needed for description generation.
 * Both sampling methods are optional, so an implementation may offer
 * only one of them, or neither.
 */
interface KtxDescriptionSamplingPort {
  /** Identifier of the sampling source. */
  id: string;
  /** Samples values of a single column, when supported. */
  sampleColumn?(input: KtxColumnSampleInput, ctx: KtxScanContext): Promise<KtxColumnSampleResult>;
  /** Samples rows of a table, when supported. */
  sampleTable?(input: KtxTableSampleInput, ctx: KtxScanContext): Promise<KtxTableSampleResult>;
}
|
||||
|
||||
/**
 * Caller-supplied tuning knobs for description generation.
 * The optional members may be left out.
 */
interface KtxDescriptionGenerationSettings {
  /** Word budget for column descriptions. */
  columnMaxWords: number;
  /** Word budget for table descriptions. */
  tableMaxWords: number;
  /** Word budget for data-source descriptions. */
  dataSourceMaxWords: number;
  /** Sampling temperature for the LLM call, when set. */
  temperature?: number;
  /** Upper bound on concurrently running tasks, when set. */
  concurrencyLimit?: number;
}
|
||||
|
||||
/**
 * Settings after defaults have been applied: same shape as the
 * caller-facing settings except that `concurrencyLimit` is always present.
 */
interface ResolvedKtxDescriptionGenerationSettings {
  /** Word budget for column descriptions. */
  columnMaxWords: number;
  /** Word budget for table descriptions. */
  tableMaxWords: number;
  /** Word budget for data-source descriptions. */
  dataSourceMaxWords: number;
  /** Sampling temperature for the LLM call; remains optional after resolution. */
  temperature?: number;
  /** Upper bound on concurrently running tasks. */
  concurrencyLimit: number;
}
|
||||
|
||||
/** One column submitted for description generation. */
export interface KtxDescriptionColumn {
  /** Column name. */
  name: string;
  /** Column type, when known. */
  type?: string;
  /** Existing descriptions keyed by the source that produced them. */
  rawDescriptions?: Record<string, string>;
  /** Values fetched ahead of time for this column, if any. */
  sampleValues?: Array<unknown>;
}
|
||||
|
||||
/** A table reference together with the columns to describe. */
export interface KtxDescriptionColumnTable extends KtxTableRef {
  /** Columns of this table that take part in the request. */
  columns: Array<KtxDescriptionColumn>;
}
|
||||
|
||||
/** A table reference enriched with optional context for a table-level description. */
interface KtxDescriptionTableInput extends KtxTableRef {
  /** Existing table descriptions keyed by the source that produced them. */
  rawDescriptions?: Record<string, string>;
  /** Column metadata, used by the table prompt when no sample rows are available. */
  columns?: Array<KtxDescriptionTableColumn>;
}
|
||||
|
||||
/** Outcome of a column-description run for one table. */
export interface KtxColumnAnalysisResult {
  /** `[columnName, description]` pairs; the description is `null` when none was produced. */
  columnDescriptions: [string, string | null][];
  /** Names of the columns reported as processed. */
  processedColumns: string[];
  /** Names of the columns reported as skipped. */
  skippedColumns: string[];
}
|
||||
|
||||
/**
 * Input of the column-description prompt builder.
 *
 * @internal
 */
export interface KtxColumnDescriptionPromptInput {
  /** Column name, placed in the `<column_name>` section of the prompt. */
  columnName: string;
  /** Sampled values; the builder uses at most the first five and drops null/undefined entries. */
  columnValues: Array<unknown>;
  /** Free-form table context, placed in the `<table_context>` section. */
  tableContext: string;
  /** Data source type; the builder compares it against `'BIGQUERY'`. */
  dataSourceType: string;
  /** Together with a `'BIGQUERY'` source type, enables the nested-data instruction. */
  supportsNestedAnalysis: boolean;
  /** Existing descriptions keyed by source; `ai` and `user` entries are ignored by the builder. */
  rawDescriptions?: Record<string, string>;
}
|
||||
|
||||
/**
 * Input of the table-description prompt builder.
 *
 * @internal
 */
export interface KtxTableDescriptionPromptInput {
  /** Table name, printed on the `Table:` line of the prompt. */
  tableName: string;
  /** Sampled rows; takes precedence over `columns` when it contains at least one row. */
  sampleData?: KtxTableSampleResult;
  /** Column metadata used when no sample rows are available. */
  columns?: Array<KtxDescriptionTableColumn>;
  /** Data source type; the builder compares it against `'BIGQUERY'`. */
  dataSourceType: string;
  /** Existing descriptions keyed by source; `ai` and `user` entries are ignored by the builder. */
  rawDescriptions?: Record<string, string>;
}
|
||||
|
||||
/**
 * Input of the data-source description prompt builder.
 *
 * @internal
 */
export interface KtxDataSourceDescriptionPromptInput {
  /** `[tableName, sample]` pairs; the builder only reads each sample's header and row counts. */
  tableSamples: [string, KtxTableSampleResult][];
  /** Data source type; the builder compares it against `'BIGQUERY'`. */
  dataSourceType: string;
}
|
||||
|
||||
/** Request for generating the column descriptions of one table. */
export interface KtxGenerateColumnDescriptionsInput {
  /** Identifier of the connection being scanned. */
  connectionId: string;
  /** Sampling source for columns whose values are not supplied up front. */
  connector: KtxDescriptionSamplingPort;
  /** Scan context forwarded to the sampling calls. */
  context: KtxScanContext;
  /** Data source type, e.g. `'POSTGRESQL'` or `'BIGQUERY'`. */
  dataSourceType: string;
  /** Whether nested/structured column analysis is supported for this source. */
  supportsNestedAnalysis: boolean;
  /** Table and columns to describe. */
  table: KtxDescriptionColumnTable;
  /** Optional flag; presumably skips columns listed in `existingDescriptions` — confirm against the generator. */
  skipExisting?: boolean;
  /** Optional map from column name to an already known description (or `null`). */
  existingDescriptions?: Record<string, string | null>;
}
|
||||
|
||||
/** Request for generating the description of one table. */
export interface KtxGenerateTableDescriptionInput {
  /** Identifier of the connection being scanned. */
  connectionId: string;
  /** Sampling source used to fetch sample rows. */
  connector: KtxDescriptionSamplingPort;
  /** Scan context forwarded to the sampling calls. */
  context: KtxScanContext;
  /** Data source type, e.g. `'POSTGRESQL'` or `'BIGQUERY'`. */
  dataSourceType: string;
  /** Table to describe, with optional existing descriptions and column metadata. */
  table: KtxDescriptionTableInput;
}
|
||||
|
||||
/** Request for generating the description of a whole data source. */
export interface KtxGenerateDataSourceDescriptionInput {
  /** Identifier of the connection being scanned. */
  connectionId: string;
  /** Sampling source used to fetch sample rows. */
  connector: KtxDescriptionSamplingPort;
  /** Scan context forwarded to the sampling calls. */
  context: KtxScanContext;
  /** Data source type, e.g. `'POSTGRESQL'` or `'BIGQUERY'`. */
  dataSourceType: string;
  /** Tables that represent the data source. */
  tables: Array<KtxTableRef>;
  /** Optional connection name; presumably used for the cache key — confirm against the generator. */
  connectionName?: string;
}
|
||||
|
||||
/** Constructor options of `KtxDescriptionGenerator`. */
export interface KtxDescriptionGeneratorOptions {
  /** LLM runtime used to generate the description text. */
  llmRuntime: KtxLlmRuntimePort;
  /** Optional description cache. */
  cache?: KtxDescriptionCachePort;
  /** Optional logger. */
  logger?: KtxScanLoggerPort;
  /** Optional callback invoked with each scan warning the generator emits. */
  onWarning?: (warning: KtxScanWarning) => void;
  /** Word budgets and other tuning knobs. */
  settings: KtxDescriptionGenerationSettings;
}
|
||||
|
||||
/** Per-column record with the column name, its description, and two status flags. */
interface ColumnTaskResult {
  /** Name of the column this record belongs to. */
  columnName: string;
  /** Description for the column, or `null` when there is none. */
  description: string | null;
  /** Status flag; presumably feeds `processedColumns` — confirm against the generator. */
  processed: boolean;
  /** Status flag; presumably feeds `skippedColumns` — confirm against the generator. */
  skipped: boolean;
}
|
||||
|
||||
/**
 * Selects the existing descriptions that may be shown to the model.
 *
 * Entries keyed `ai` or `user` are excluded, as are entries whose text is
 * empty. The remaining entries keep the iteration order of the input object.
 *
 * @param rawDescriptions - Descriptions keyed by the source that produced them.
 * @returns `[source, text]` pairs, or an empty array when nothing qualifies.
 */
function descriptionSources(rawDescriptions: Record<string, string> | undefined): Array<[string, string]> {
  const usable: Array<[string, string]> = [];
  if (!rawDescriptions) {
    return usable;
  }

  for (const [source, text] of Object.entries(rawDescriptions)) {
    const excludedSource = source === 'ai' || source === 'user';
    if (excludedSource || !text) {
      continue;
    }
    usable.push([source, text]);
  }
  return usable;
}
|
||||
|
||||
/**
 * Produces a printable message for a caught value.
 *
 * @param error - Anything that was thrown.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
||||
|
||||
/**
 * Error raised when work is cancelled through an abort signal.
 * Its message is always `'aborted'`.
 */
class KtxAbortedError extends Error {
  override name = 'KtxAbortedError';

  constructor() {
    super('aborted');
  }
}
|
||||
|
||||
/**
 * Waits for `ms` milliseconds, optionally cancellable.
 *
 * Without a signal this is a plain timer. With a signal, the wait rejects
 * with `KtxAbortedError` if the signal is already aborted or aborts before
 * the timer fires; the timer is cleared in that case. When the timer fires
 * first, the abort listener is removed again.
 *
 * @param ms - Delay in milliseconds.
 * @param signal - Optional abort signal.
 * @throws KtxAbortedError when the signal is or becomes aborted.
 */
async function delayWithAbort(ms: number, signal?: AbortSignal): Promise<void> {
  if (signal?.aborted) {
    throw new KtxAbortedError();
  }

  await new Promise<void>((resolve, reject) => {
    // Only ever invoked after `timer` has been assigned below.
    const handleAbort = (): void => {
      clearTimeout(timer);
      reject(new KtxAbortedError());
    };
    const timer = setTimeout(() => {
      signal?.removeEventListener('abort', handleAbort);
      resolve();
    }, ms);
    signal?.addEventListener('abort', handleAbort, { once: true });
  });
}
|
||||
|
||||
/** Options accepted by `retryAsync`. */
interface RetryAsyncOptions {
  /** Maximum number of calls to the operation; values below 1 are treated as 1. */
  attempts: number;
  /** Delay before the second attempt, in milliseconds; doubled after each further failure. */
  baseDelayMs: number;
  /** Optional abort signal, checked before each attempt and during each delay. */
  signal?: AbortSignal;
  /** Optional hook called with the error and the 1-based attempt number after a failed attempt. */
  onAttemptFailure?: (error: unknown, attempt: number) => void;
}
|
||||
|
||||
/**
 * Runs `fn` until it succeeds or the attempt budget is used up.
 *
 * Between attempts the function waits `baseDelayMs * 2 ** (attempt - 1)`
 * milliseconds. `options.onAttemptFailure` is invoked after every failed
 * attempt, including the last one. A `KtxAbortedError` thrown by `fn` is
 * rethrown immediately without invoking the hook.
 *
 * The attempt budget is normalized to an integer of at least 1: `NaN` counts
 * as 1 and fractional values are rounded down. Without this, `NaN` skipped
 * the loop entirely (rejecting with `undefined` and never calling `fn`), and
 * a fractional budget triggered a pointless backoff delay after the final
 * failure.
 *
 * @param fn - Operation to run.
 * @param options - Attempt budget, backoff base, optional signal and failure hook.
 * @returns The value `fn` resolves with.
 * @throws KtxAbortedError when the signal is aborted before an attempt or during a delay.
 * @throws The error of the last failed attempt when every attempt fails.
 */
async function retryAsync<T>(fn: () => Promise<T>, options: RetryAsyncOptions): Promise<T> {
  const requestedAttempts = Number.isNaN(options.attempts) ? 1 : Math.floor(options.attempts);
  const attempts = Math.max(1, requestedAttempts);
  let lastError: unknown;
  for (let attempt = 1; attempt <= attempts; attempt += 1) {
    if (options.signal?.aborted) {
      throw new KtxAbortedError();
    }
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      if (error instanceof KtxAbortedError) {
        throw error;
      }
      options.onAttemptFailure?.(error, attempt);
      if (attempt === attempts) {
        break;
      }
      // Exponential backoff: base, 2x base, 4x base, ...
      const delay = options.baseDelayMs * 2 ** (attempt - 1);
      await delayWithAbort(delay, options.signal);
    }
  }
  throw lastError;
}
|
||||
|
||||
/**
 * Copies only the identifying fields of a table.
 *
 * @param table - Any object carrying `catalog`, `db` and `name`.
 * @returns A new object holding exactly those three fields.
 */
function toTableRef(table: KtxTableRef): KtxTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
|
||||
|
||||
/**
 * Maps `items` through an async `worker` with a cap on parallelism.
 *
 * A fixed number of runners pull the next unclaimed index until the list is
 * exhausted; each result is stored at the index of its input, so the output
 * order matches the input order regardless of completion order.
 *
 * The limit is normalized to an integer of at least 1 and at most
 * `items.length`: `NaN` counts as 1 and fractional values are rounded down.
 * Previously a `NaN` limit started zero runners and the function resolved
 * with an empty array without processing anything.
 *
 * Items that are strictly `undefined` are skipped and leave their result
 * slot unset.
 *
 * @param items - Inputs to process.
 * @param concurrencyLimit - Maximum number of concurrently running workers.
 * @param worker - Async mapper receiving the item and its index.
 * @returns Results positioned by input index.
 */
async function runWithConcurrency<TInput, TOutput>(
  items: readonly TInput[],
  concurrencyLimit: number,
  worker: (item: TInput, index: number) => Promise<TOutput>,
): Promise<TOutput[]> {
  const results: TOutput[] = [];
  let nextIndex = 0;
  const requestedLimit = Number.isNaN(concurrencyLimit) ? 1 : Math.floor(concurrencyLimit);
  const workerCount = Math.max(1, Math.min(requestedLimit, items.length));

  // Each runner claims an index synchronously before awaiting, so no index is processed twice.
  const drainQueue = async (): Promise<void> => {
    while (nextIndex < items.length) {
      const index = nextIndex;
      nextIndex += 1;
      const item = items[index];
      if (item !== undefined) {
        results[index] = await worker(item, index);
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => drainQueue()));

  return results;
}
|
||||
|
||||
/** A prompt split into its system and user parts. */
export interface KtxDescriptionPrompt {
  /** Instructions for the model, including the optional word limit. */
  system: string;
  /** The data to describe. */
  user: string;
}
|
||||
|
||||
/**
 * Builds the sentence that tells the model how long a description may be.
 *
 * @param maxWords - Word budget to mention.
 * @returns The instruction sentence.
 */
function wordLimitLine(maxWords: number): string {
  return ['Please provide a concise description in', String(maxWords), 'words or less.'].join(' ');
}
|
||||
|
||||
/**
 * Builds the system/user prompt pair for describing a single column.
 *
 * The system part holds the base instructions, a nested-data hint when the
 * source is `'BIGQUERY'` and nested analysis is supported, and the word limit
 * when `maxWords` is given. The user part holds the table context, the column
 * name, up to five sample values (null/undefined dropped, `unavailable` when
 * nothing remains) and any existing descriptions from other sources.
 *
 * @param input - Column data plus an optional `maxWords` budget.
 * @returns The prompt; the user part is trimmed.
 * @internal
 */
export function buildKtxColumnDescriptionPrompt(
  input: KtxColumnDescriptionPromptInput & { maxWords?: number },
): KtxDescriptionPrompt {
  // Only the first five sampled values are considered; empty ones are dropped afterwards.
  const renderedValues: string[] = [];
  for (const value of input.columnValues.slice(0, 5)) {
    if (value !== null && value !== undefined) {
      renderedValues.push(String(value));
    }
  }
  const joinedValues = renderedValues.join(', ');
  const sampleValuesContent = joinedValues.length === 0 ? 'unavailable' : joinedValues;

  const baseInstructions = [
    'Analyze database columns and provide a concise description.',
    '',
    'Provide a brief description of what the column contains without repeating the column name.',
    "Focus on the data's meaning and business purpose. Start directly with the content description.",
    'Example:',
    '"first names of individuals, likely employees or contacts" instead of "The column contains first names..."',
    '"Job titles or roles of individuals..." instead of "This column contains job titles..."',
  ].join('\n');

  const systemParts: string[] = [baseInstructions];
  const describeNestedData = input.dataSourceType === 'BIGQUERY' && input.supportsNestedAnalysis;
  if (describeNestedData) {
    systemParts.push(
      'If the sampled values indicate nested/structured data (JSON, STRUCT, or ARRAY), describe its general business purpose and data organization.',
    );
  }
  if (input.maxWords !== undefined) {
    systemParts.push(wordLimitLine(input.maxWords));
  }

  // The trailing empty entry keeps the newline that follows the sample values.
  let user = [
    `<table_context> ${input.tableContext} </table_context>`,
    '',
    `<column_name> ${input.columnName} </column_name>`,
    '',
    `<sample_values> ${sampleValuesContent} </sample_values>`,
    '',
  ].join('\n');

  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    const documentation = sources
      .map(([source, text]) => `<${source}_documentation> ${text} </${source}_documentation>\n`)
      .join('');
    user = `${user}\nExisting descriptions from other sources:\n${documentation}`;
    user = `${user}\nSynthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n`;
  }

  return { system: systemParts.join('\n\n'), user: user.trim() };
}
|
||||
|
||||
/**
 * Builds the system/user prompt pair for describing a table.
 *
 * The user part lists the table name, a column line, a row line and the data
 * source type. The column line comes from one of three sources, in order:
 * sampled rows (first ten headers, values from the first three rows), column
 * metadata (first thirty columns, no sample rows), or `unavailable`.
 * Existing descriptions from other sources are appended when present.
 *
 * @param input - Table data plus an optional `maxWords` budget.
 * @returns The prompt; the user part is trimmed.
 * @internal
 */
export function buildKtxTableDescriptionPrompt(
  input: KtxTableDescriptionPromptInput & { maxWords?: number },
): KtxDescriptionPrompt {
  const baseInstructions = [
    'Analyze database tables and provide a concise description.',
    '',
    'Provide a brief description of what the table represents and its business purpose.',
    'Do NOT list or describe individual columns or fields.',
    'Start directly with the content description without mentioning the table name.',
    "Focus on the data's meaning and business purpose.",
    'Example: "Information about healthcare professionals used for workforce management" instead of "The blahblah table contains information about healthcare professionals including their names, titles..."',
  ].join('\n');

  const systemParts: string[] = [baseInstructions];
  if (input.dataSourceType === 'BIGQUERY') {
    systemParts.push(
      "Note (don't include in the final answer): BigQuery tables may contain nested structures, arrays, or other complex data types.",
    );
  }
  if (input.maxWords !== undefined) {
    systemParts.push(wordLimitLine(input.maxWords));
  }

  const sampleData = input.sampleData;
  const metadataColumns = input.columns;
  let columnsLine = 'Columns: unavailable';
  let rowsLine = 'Sample rows: unavailable';
  if (sampleData && sampleData.rows.length > 0) {
    // Preview: at most ten columns, each with the non-empty values of the first three rows.
    const previewRows = sampleData.rows.slice(0, 3);
    const perColumn: string[] = [];
    for (const [position, header] of sampleData.headers.slice(0, 10).entries()) {
      const cells = previewRows
        .map((row) => row[position])
        .filter((cell) => cell !== null && cell !== undefined)
        .map((cell) => String(cell));
      perColumn.push(`${header}: ${cells.join(', ')}`);
    }
    columnsLine = `Columns and sample data: ${perColumn.join(' | ')}`;
    rowsLine = `Total rows in sample: ${sampleData.rows.length}`;
  } else if (metadataColumns && metadataColumns.length > 0) {
    const perColumn = metadataColumns.slice(0, 30).map((column) => {
      let rendered = column.name;
      if (column.nativeType) {
        rendered += ` (${column.nativeType})`;
      }
      if (column.comment) {
        rendered += ` — ${column.comment}`;
      }
      return rendered;
    });
    columnsLine = `Columns (metadata only, no sample rows): ${perColumn.join(' | ')}`;
  }

  let user = [
    `Table: ${input.tableName}`,
    columnsLine,
    rowsLine,
    `Data source type: ${input.dataSourceType}`,
  ].join('\n');

  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    const listed = sources.map(([source, text]) => `${source}: ${text}\n`).join('');
    user = `${user}\n\nExisting descriptions from other sources:\n${listed}`;
    user = `${user}\nSynthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.`;
  }

  return { system: systemParts.join('\n\n'), user: user.trim() };
}
|
||||
|
||||
/**
 * Builds the system/user prompt pair for describing a whole data source.
 *
 * Each sampled table is summarised as
 * `name (<header count> columns, <row count> sample rows)`; only those counts
 * are read from the samples. The user part is not trimmed.
 *
 * @param input - Table samples plus an optional `maxWords` budget.
 * @returns The prompt.
 * @internal
 */
export function buildKtxDataSourceDescriptionPrompt(
  input: KtxDataSourceDescriptionPromptInput & { maxWords?: number },
): KtxDescriptionPrompt {
  const tableSummaries = input.tableSamples.map(([tableName, sampleData]) => {
    const columnCount = sampleData.headers.length;
    const rowCount = sampleData.rows.length;
    return `${tableName} (${columnCount} columns, ${rowCount} sample rows)`;
  });

  const baseInstructions = [
    'Analyze databases and provide a concise description.',
    '',
    'Provide a direct, concise description of what the database represents and its business purpose.',
    'Do NOT start with phrases like "This database appears to represent" or "This BigQuery dataset".',
    'Start directly with the domain or business area description.',
    'Focus on the overall data model and its intended use.',
    'Example: "Healthcare-related database with a focus on patient management..." instead of "This database appears to represent a healthcare-related system..."',
  ].join('\n');

  const systemParts: string[] = [baseInstructions];
  if (input.dataSourceType === 'BIGQUERY') {
    systemParts.push(
      "Note (don't include in the final answer): BigQuery datasets may contain large-scale analytics data, nested structures, and complex data types.",
    );
  }
  if (input.maxWords !== undefined) {
    systemParts.push(wordLimitLine(input.maxWords));
  }

  const user = [
    `Tables: ${tableSummaries.join(' | ')}`,
    `Total tables analyzed: ${input.tableSamples.length}`,
    `Data source type: ${input.dataSourceType}`,
  ].join('\n');

  return { system: systemParts.join('\n\n'), user };
}
|
||||
|
||||
/**
 * Generates natural-language descriptions for columns, tables, and whole data sources.
 *
 * Each generation path optionally consults the description cache, samples data through the
 * scan connector when the connector exposes the relevant capability, builds a prompt, and
 * asks the LLM runtime for text. Recoverable problems are reported through the optional
 * logger and `onWarning` callback instead of being thrown.
 *
 * Cancellation: a `KtxAbortedError` raised anywhere inside a generation path is always
 * rethrown, never converted into a `null` or placeholder description.
 */
export class KtxDescriptionGenerator {
  private readonly llmRuntime: KtxLlmRuntimePort;
  private readonly cache?: KtxDescriptionCachePort;
  private readonly logger?: KtxScanLoggerPort;
  private readonly onWarning?: (warning: KtxScanWarning) => void;
  private readonly settings: ResolvedKtxDescriptionGenerationSettings;

  /**
   * @param options - Runtime ports and generation settings. `temperature` is only copied
   *   when explicitly provided; `concurrencyLimit` defaults to 5.
   */
  constructor(options: KtxDescriptionGeneratorOptions) {
    this.llmRuntime = options.llmRuntime;
    this.cache = options.cache;
    this.logger = options.logger;
    this.onWarning = options.onWarning;
    this.settings = {
      columnMaxWords: options.settings.columnMaxWords,
      tableMaxWords: options.settings.tableMaxWords,
      dataSourceMaxWords: options.settings.dataSourceMaxWords,
      ...(options.settings.temperature !== undefined ? { temperature: options.settings.temperature } : {}),
      concurrencyLimit: options.settings.concurrencyLimit ?? 5,
    };
  }

  /**
   * Describes every column of `input.table`, running up to `concurrencyLimit` columns at once.
   *
   * @returns One `[columnName, description]` pair per column, plus the names of columns that
   *   were skipped (existing or cached description reused) and of columns that were processed
   *   (a new description was generated).
   * @throws KtxAbortedError when the scan is aborted while a column is being processed.
   */
  async generateColumnDescriptions(input: KtxGenerateColumnDescriptionsInput): Promise<KtxColumnAnalysisResult> {
    const columnsToProcess = input.table.columns;
    const tableContext = `Table: ${input.table.name} | Columns: ${columnsToProcess.map((column) => column.name).join(', ')} | Data source: ${input.dataSourceType}`;

    const results = await runWithConcurrency(columnsToProcess, this.settings.concurrencyLimit, async (column) =>
      this.generateOneColumnDescription(input, column, tableContext),
    );

    const columnDescriptions: Array<[string, string | null]> = [];
    const processedColumns: string[] = [];
    const skippedColumns: string[] = [];

    for (const result of results) {
      columnDescriptions.push([result.columnName, result.description]);
      if (result.skipped) {
        skippedColumns.push(result.columnName);
      } else if (result.processed) {
        processedColumns.push(result.columnName);
      }
    }

    return {
      columnDescriptions,
      processedColumns,
      skippedColumns,
    };
  }

  /**
   * Describes a single table.
   *
   * A cached description is returned as-is. Otherwise the table is sampled (limit 20, up to
   * 3 attempts); when sampling is unsupported, fails, or returns no rows, the prompt falls
   * back to column metadata only and a `description_fallback_used` warning is emitted if a
   * description is still produced.
   *
   * @returns The generated description, or `null` when generation failed.
   * @throws KtxAbortedError when the scan is aborted during sampling or generation.
   */
  async generateTableDescription(input: KtxGenerateTableDescriptionInput): Promise<string | null> {
    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildTableKey(tableRef);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    const sampleTable = input.connector.sampleTable;
    let sampleData: KtxTableSampleResult | null = null;
    let fallbackReason: 'capability_missing' | 'sampling_failed' | 'empty_sample' | null = null;

    if (!sampleTable) {
      fallbackReason = 'capability_missing';
      this.logger?.warn('KTX scan connector does not support table sampling; falling back to metadata-only prompt', {
        connectorId: input.connector.id,
        table: input.table.name,
      });
      this.onWarning?.({
        code: 'connector_capability_missing',
        message: `Connector ${input.connector.id} does not support sampleTable; using metadata-only description prompt`,
        table: input.table.name,
        recoverable: true,
        metadata: { connectorId: input.connector.id, capability: 'sampleTable' },
      });
    } else {
      try {
        sampleData = await retryAsync(
          () =>
            sampleTable(
              {
                connectionId: input.connectionId,
                table: tableRef,
                limit: 20,
              },
              input.context,
            ),
          {
            attempts: 3,
            baseDelayMs: 200,
            signal: input.context.signal,
            onAttemptFailure: (error, attempt) => {
              this.logger?.warn(
                `sampleTable attempt ${attempt} failed for ${input.table.name}: ${errorMessage(error)}`,
                {
                  connectorId: input.connector.id,
                  table: input.table.name,
                  attempt,
                },
              );
            },
          },
        );
        if (sampleData.rows.length === 0) {
          fallbackReason = 'empty_sample';
          this.logger?.warn('sampleTable returned no rows; using metadata-only prompt', {
            connectorId: input.connector.id,
            table: input.table.name,
          });
        }
      } catch (error) {
        if (error instanceof KtxAbortedError) {
          throw error;
        }
        fallbackReason = 'sampling_failed';
        this.logger?.error(`sampleTable exhausted retries for ${input.table.name}: ${errorMessage(error)}`, {
          connectorId: input.connector.id,
          table: input.table.name,
        });
        this.onWarning?.({
          code: 'sampling_failed',
          message: `Failed to sample table ${input.table.name} after retries: ${errorMessage(error)}`,
          table: input.table.name,
          recoverable: true,
          metadata: { connectorId: input.connector.id, error: errorMessage(error) },
        });
      }
    }

    try {
      const prompt = buildKtxTableDescriptionPrompt({
        tableName: input.table.name,
        // Sample rows are only passed along when sampling actually produced usable data.
        ...(fallbackReason === null && sampleData ? { sampleData } : {}),
        ...(input.table.columns && input.table.columns.length > 0 ? { columns: input.table.columns } : {}),
        dataSourceType: input.dataSourceType,
        rawDescriptions: input.table.rawDescriptions,
        maxWords: this.settings.tableMaxWords,
      });
      const description = await this.generateAiDescription(prompt, 'ktx-table-description');
      if (cacheKey && description) {
        await this.cache?.set(cacheKey, description);
      }
      if (description && fallbackReason !== null) {
        this.onWarning?.({
          code: 'description_fallback_used',
          message: `Generated table description without sample rows for ${input.table.name} (reason: ${fallbackReason})`,
          table: input.table.name,
          recoverable: true,
          metadata: { connectorId: input.connector.id, reason: fallbackReason },
        });
      }
      if (!description) {
        this.onWarning?.({
          code: 'enrichment_failed',
          message: `Failed to generate description for table ${input.table.name}`,
          table: input.table.name,
          recoverable: true,
          metadata: { connectorId: input.connector.id, usedFallback: fallbackReason !== null },
        });
      }
      return description;
    } catch (error) {
      // Cancellation is not an enrichment failure; propagate it like the sampling path does.
      if (error instanceof KtxAbortedError) {
        throw error;
      }
      this.logger?.error(`Error generating table description: ${errorMessage(error)}`, {
        connectorId: input.connector.id,
        table: input.table.name,
      });
      this.onWarning?.({
        code: 'enrichment_failed',
        message: `Failed to generate description for table ${input.table.name}: ${errorMessage(error)}`,
        table: input.table.name,
        recoverable: true,
        metadata: { connectorId: input.connector.id },
      });
      return null;
    }
  }

  /**
   * Describes the data source as a whole from samples of at most its first 10 tables
   * (limit 5 rows each).
   *
   * @returns The generated description, a cached description, `null` when the LLM produced
   *   no text, or one of the fixed placeholder strings: `'No tables found in database'`,
   *   `'No accessible tables found in database'`, `'Failed to generate data source description'`.
   * @throws KtxAbortedError when the scan is aborted during sampling or generation, so an
   *   aborted run is never reported as a data source without accessible tables.
   */
  async generateDataSourceDescription(input: KtxGenerateDataSourceDescriptionInput): Promise<string | null> {
    if (input.tables.length === 0) {
      return 'No tables found in database';
    }

    const cacheKey = input.connectionName ? this.cache?.buildConnectionKey(input.connectionName) : undefined;
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KTX scan connector does not support table sampling for data-source description generation', {
        connectorId: input.connector.id,
      });
      return 'No accessible tables found in database';
    }

    const tablesToAnalyze = input.tables.slice(0, 10);
    const tableSamples = await runWithConcurrency(tablesToAnalyze, this.settings.concurrencyLimit, async (table) => {
      try {
        const sampleData = await input.connector.sampleTable!(
          {
            connectionId: input.connectionId,
            table: toTableRef(table),
            limit: 5,
          },
          input.context,
        );
        return [table.name, sampleData] as [string, KtxTableSampleResult];
      } catch (error) {
        // An abort must stop the run rather than look like an inaccessible table.
        if (error instanceof KtxAbortedError) {
          throw error;
        }
        this.logger?.warn(`Failed to sample table '${table.name}' for data source analysis - ${errorMessage(error)}`);
        return null;
      }
    });

    const accessibleSamples = tableSamples.filter(
      (sample): sample is [string, KtxTableSampleResult] => sample !== null,
    );
    if (accessibleSamples.length === 0) {
      return 'No accessible tables found in database';
    }

    try {
      const prompt = buildKtxDataSourceDescriptionPrompt({
        tableSamples: accessibleSamples,
        dataSourceType: input.dataSourceType,
        maxWords: this.settings.dataSourceMaxWords,
      });
      const description = await this.generateAiDescription(prompt, 'ktx-data-source-description');
      if (cacheKey && description) {
        await this.cache?.set(cacheKey, description);
      }
      return description;
    } catch (error) {
      if (error instanceof KtxAbortedError) {
        throw error;
      }
      this.logger?.error(`Error generating data source description: ${errorMessage(error)}`);
      return 'Failed to generate data source description';
    }
  }

  /**
   * Produces the description result for one column.
   *
   * Resolution order: an existing description (when `skipExisting` is set), then the cache,
   * then LLM generation from the column's sample values and/or raw descriptions. Sample
   * values are fetched through `connector.sampleColumn` (limit 50, up to 3 attempts) only
   * when the column carries none. A column with neither non-null sample values nor raw
   * descriptions yields a `null` description without calling the LLM.
   *
   * @throws KtxAbortedError when the scan is aborted during sampling or generation.
   */
  private async generateOneColumnDescription(
    input: KtxGenerateColumnDescriptionsInput,
    column: KtxDescriptionColumn,
    tableContext: string,
  ): Promise<ColumnTaskResult> {
    const existingDescription = input.existingDescriptions?.[column.name];
    if (input.skipExisting && existingDescription) {
      return {
        columnName: column.name,
        description: existingDescription,
        skipped: true,
        processed: false,
      };
    }

    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildColumnKey(tableRef, column.name);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return {
          columnName: column.name,
          description: cached,
          skipped: true,
          processed: false,
        };
      }
    }

    try {
      let columnValues = column.sampleValues;
      if (!columnValues || columnValues.length === 0) {
        if (!input.connector.sampleColumn) {
          this.logger?.warn('KTX scan connector does not support column sampling; using available metadata only', {
            connectorId: input.connector.id,
            table: input.table.name,
            column: column.name,
          });
          columnValues = [];
        } else {
          const sampleColumn = input.connector.sampleColumn;
          try {
            const sample = await retryAsync(
              () =>
                sampleColumn(
                  {
                    connectionId: input.connectionId,
                    table: tableRef,
                    column: column.name,
                    limit: 50,
                  },
                  input.context,
                ),
              {
                attempts: 3,
                baseDelayMs: 200,
                signal: input.context.signal,
                onAttemptFailure: (error, attempt) => {
                  this.logger?.warn(
                    `sampleColumn attempt ${attempt} failed for ${input.table.name}.${column.name}: ${errorMessage(error)}`,
                    {
                      connectorId: input.connector.id,
                      table: input.table.name,
                      column: column.name,
                      attempt,
                    },
                  );
                },
              },
            );
            columnValues = sample.values;
          } catch (error) {
            if (error instanceof KtxAbortedError) {
              throw error;
            }
            this.logger?.warn(
              `sampleColumn exhausted retries for ${input.table.name}.${column.name}; using available metadata only: ${errorMessage(error)}`,
              {
                connectorId: input.connector.id,
                table: input.table.name,
                column: column.name,
              },
            );
            columnValues = [];
          }
        }
      }

      const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined);
      const hasRawDescriptions = descriptionSources(column.rawDescriptions).length > 0;
      // Nothing to ground a description on: skip the LLM call entirely.
      if (nonNullValues.length === 0 && !hasRawDescriptions) {
        return {
          columnName: column.name,
          description: null,
          skipped: false,
          processed: false,
        };
      }

      const prompt = buildKtxColumnDescriptionPrompt({
        columnName: column.name,
        columnValues: nonNullValues,
        tableContext,
        dataSourceType: input.dataSourceType,
        supportsNestedAnalysis: input.supportsNestedAnalysis,
        rawDescriptions: column.rawDescriptions,
        maxWords: this.settings.columnMaxWords,
      });
      const description = await this.generateAiDescription(prompt, 'ktx-column-description');

      if (cacheKey && description) {
        await this.cache?.set(cacheKey, description);
      }

      return {
        columnName: column.name,
        description,
        skipped: false,
        processed: description !== null,
      };
    } catch (error) {
      if (error instanceof KtxAbortedError) {
        throw error;
      }
      this.logger?.error(`Error analyzing column '${column.name}': ${errorMessage(error)}`, {
        connectorId: input.connector.id,
        table: input.table.name,
        column: column.name,
      });
      return {
        columnName: column.name,
        description: null,
        skipped: false,
        processed: false,
      };
    }
  }

  /**
   * Sends the prompt to the LLM runtime and returns the trimmed text.
   *
   * @param prompt - System and user prompt to send.
   * @param _operationName - Label supplied by callers; currently unused.
   * @returns The trimmed text, or `null` when the text is empty or the call failed.
   * @throws KtxAbortedError when the runtime reports cancellation, so callers' abort
   *   handling is reachable instead of the abort being logged as a generation error.
   */
  private async generateAiDescription(prompt: KtxDescriptionPrompt, _operationName: string): Promise<string | null> {
    try {
      const text = await this.llmRuntime.generateText({
        role: 'candidateExtraction',
        system: prompt.system,
        prompt: prompt.user,
        temperature: this.settings.temperature,
      });
      const description = text.trim();
      return description || null;
    } catch (error) {
      if (error instanceof KtxAbortedError) {
        throw error;
      }
      this.logger?.error(`Error generating AI description: ${errorMessage(error)}`);
      return null;
    }
  }
}
|
||||
47
packages/cli/src/context/scan/embedding-text.test.ts
Normal file
47
packages/cli/src/context/scan/embedding-text.test.ts
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { buildKtxColumnEmbeddingText } from './embedding-text.js';
|
||||
|
||||
describe('KTX scan embedding text', () => {
  it('builds column embedding text with table, description, FK, and sample-value context', () => {
    // Every optional section is supplied; maxSampleValues trims the third sample value.
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'status',
      columnType: 'varchar',
      resolvedDescription: 'Payment lifecycle state',
      sampleValues: ['paid', 'refunded', 'pending'],
      resolvedTableDescription: 'Customer orders',
      foreignKeys: {
        outgoing: [{ toTable: 'customers', toColumn: 'id' }],
        incoming: [{ fromTable: 'refunds', fromColumn: 'order_status' }],
      },
      maxSampleValues: 2,
    });

    expect(text).toBe(
      'orders.status (varchar). Table: Customer orders. Payment lifecycle state. FK -> customers.id. FK <- refunds.order_status. Values: paid, refunded',
    );
  });

  it('omits optional sections when the scan has no enrichment context yet', () => {
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'id',
      columnType: 'integer',
      resolvedDescription: null,
    });

    expect(text).toBe('orders.id (integer)');
  });

  it('keeps all available sample values when no explicit max is supplied', () => {
    const text = buildKtxColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'status',
      columnType: 'varchar',
      resolvedDescription: null,
      sampleValues: ['paid', 'refunded'],
    });

    expect(text).toBe('orders.status (varchar). Values: paid, refunded');
  });
});
|
||||
45
packages/cli/src/context/scan/embedding-text.ts
Normal file
45
packages/cli/src/context/scan/embedding-text.ts
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
/** Foreign-key edges that touch a column, split by direction. */
interface KtxColumnEmbeddingForeignKeys {
  /** Keys where this column references another table's column. */
  outgoing: Array<{ toTable: string; toColumn: string }>;
  /** Keys where another table's column references this column. */
  incoming: Array<{ fromTable: string; fromColumn: string }>;
}

/** Everything that can contribute to a column's embedding text. */
export interface KtxColumnEmbeddingTextInput {
  tableName: string;
  columnName: string;
  columnType: string;
  /** Column description; omitted from the text when null or empty. */
  resolvedDescription: string | null;
  /** Sample values; at most `maxSampleValues` of them are included. */
  sampleValues?: readonly string[] | null;
  /** Table description; omitted from the text when null or empty. */
  resolvedTableDescription?: string | null;
  foreignKeys?: KtxColumnEmbeddingForeignKeys | null;
  /** Upper bound on included sample values. Defaults to 20. */
  maxSampleValues?: number;
}

/**
 * Builds the text that is embedded for a column.
 *
 * Sections are joined with `'. '` in this order: `table.column (type)`, table description,
 * column description, outgoing foreign keys, incoming foreign keys, sample values.
 * Only the first section is always present.
 *
 * @param input - Column identity plus optional enrichment context.
 * @returns The joined embedding text.
 */
export function buildKtxColumnEmbeddingText(input: KtxColumnEmbeddingTextInput): string {
  const parts: string[] = [];

  parts.push(`${input.tableName}.${input.columnName} (${input.columnType})`);

  if (input.resolvedTableDescription) {
    parts.push(`Table: ${input.resolvedTableDescription}`);
  }

  if (input.resolvedDescription) {
    parts.push(input.resolvedDescription);
  }

  if (input.foreignKeys) {
    for (const fk of input.foreignKeys.outgoing) {
      parts.push(`FK -> ${fk.toTable}.${fk.toColumn}`);
    }
    for (const fk of input.foreignKeys.incoming) {
      parts.push(`FK <- ${fk.fromTable}.${fk.fromColumn}`);
    }
  }

  // Clamp the limit at zero: a negative `slice` end would silently drop values from the
  // END of the list instead of limiting the count.
  const sampleLimit = Math.max(0, input.maxSampleValues ?? 20);
  const includedValues = input.sampleValues?.slice(0, sampleLimit) ?? [];
  // Skip the section entirely rather than emitting a dangling "Values: " label.
  if (includedValues.length > 0) {
    parts.push(`Values: ${includedValues.join(', ')}`);
  }

  return parts.join('. ');
}
|
||||
17
packages/cli/src/context/scan/enabled-tables.ts
Normal file
17
packages/cli/src/context/scan/enabled-tables.ts
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
import type { KtxSchemaSnapshot } from './types.js';
|
||||
|
||||
/**
 * Reads the `enabled_tables` allow-list from a connection config.
 *
 * @param connection - Raw connection config, if any.
 * @returns `null` when `enabled_tables` is missing, not an array, or empty; otherwise a set
 *   containing only the string entries of the array (non-string entries are ignored).
 */
export function resolveEnabledTables(connection: Record<string, unknown> | undefined): Set<string> | null {
  const configured = connection?.enabled_tables;
  if (Array.isArray(configured) && configured.length > 0) {
    const names = new Set<string>();
    for (const entry of configured) {
      if (typeof entry === 'string') {
        names.add(entry);
      }
    }
    return names;
  }
  return null;
}
|
||||
|
||||
/**
 * Returns a copy of the snapshot that keeps only the enabled tables.
 *
 * A table is matched by `db.name` when it has a `db`, otherwise by its bare `name`.
 *
 * @param snapshot - Snapshot to filter; it is not mutated.
 * @param enabledTables - Allow-list of table keys.
 */
export function filterSnapshotTables(snapshot: KtxSchemaSnapshot, enabledTables: Set<string>): KtxSchemaSnapshot {
  const tables = snapshot.tables.filter(({ db, name }) => enabledTables.has(db ? `${db}.${name}` : name));
  return { ...snapshot, tables };
}
|
||||
175
packages/cli/src/context/scan/enrichment-state.test.ts
Normal file
175
packages/cli/src/context/scan/enrichment-state.test.ts
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import {
|
||||
completedKtxScanEnrichmentStateSummary,
|
||||
computeKtxScanEnrichmentInputHash,
|
||||
summarizeKtxScanEnrichmentState,
|
||||
} from './enrichment-state.js';
|
||||
import { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
|
||||
import type { KtxSchemaSnapshot } from './types.js';
|
||||
|
||||
// Minimal single-table snapshot shared by the hashing tests below.
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: null,
      estimatedRows: 1,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: null,
        },
      ],
    },
  ],
};

describe('scan enrichment state', () => {
  let tempDir: string;
  let store: SqliteLocalScanEnrichmentStateStore;

  // Each test gets its own SQLite file inside a fresh temp directory.
  beforeEach(async () => {
    const prefix = join(tmpdir(), 'ktx-scan-enrichment-state-');
    tempDir = await mkdtemp(prefix);
    store = new SqliteLocalScanEnrichmentStateStore({ dbPath: join(tempDir, 'db.sqlite') });
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('computes stable input hashes without depending on object key order', () => {
    const baselineHash = computeKtxScanEnrichmentInputHash({
      snapshot,
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
    });

    // Same content, different key order and a re-created metadata object.
    const reorderedHash = computeKtxScanEnrichmentInputHash({
      snapshot: { ...snapshot, metadata: {} },
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { llmModel: 'a', provider: 'local-heuristic' },
    });

    const [firstTable] = snapshot.tables;
    if (!firstTable) {
      throw new Error('Expected test snapshot table');
    }
    const renamedTableHash = computeKtxScanEnrichmentInputHash({
      snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
    });

    expect(baselineHash).toMatch(/^[a-f0-9]{64}$/);
    expect(reorderedHash).toBe(baselineHash);
    expect(renamedTableHash).not.toBe(baselineHash);
  });

  it('persists completed stages and ignores stale hashes', async () => {
    const inputHash = computeKtxScanEnrichmentInputHash({
      snapshot,
      mode: 'enriched',
      detectRelationships: true,
      providerIdentity: { provider: 'local-heuristic' },
    });
    const stageOutput = [{ table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' }];

    await store.saveCompletedStage({
      runId: 'scan-run-1',
      connectionId: 'warehouse',
      syncId: 'sync-1',
      mode: 'enriched',
      stage: 'descriptions',
      inputHash,
      output: stageOutput,
      updatedAt: '2026-04-29T12:01:00.000Z',
    });

    const matchingLookup = store.findCompletedStage({
      runId: 'scan-run-1',
      stage: 'descriptions',
      inputHash,
    });
    await expect(matchingLookup).resolves.toMatchObject({
      runId: 'scan-run-1',
      stage: 'descriptions',
      status: 'completed',
      output: stageOutput,
    });

    const staleLookup = store.findCompletedStage({
      runId: 'scan-run-1',
      stage: 'descriptions',
      inputHash: 'different-hash',
    });
    await expect(staleLookup).resolves.toBeNull();
  });

  it('records failed stages without making them reusable', async () => {
    await store.saveFailedStage({
      runId: 'scan-run-2',
      connectionId: 'warehouse',
      syncId: 'sync-2',
      mode: 'enriched',
      stage: 'embeddings',
      inputHash: 'hash-2',
      errorMessage: 'embedding service timed out',
      updatedAt: '2026-04-29T12:02:00.000Z',
    });

    // A failed record must not be returned as a completed stage.
    const completedLookup = store.findCompletedStage({
      runId: 'scan-run-2',
      stage: 'embeddings',
      inputHash: 'hash-2',
    });
    await expect(completedLookup).resolves.toBeNull();

    await expect(store.listRunStages('scan-run-2')).resolves.toEqual([
      expect.objectContaining({
        runId: 'scan-run-2',
        stage: 'embeddings',
        status: 'failed',
        errorMessage: 'embedding service timed out',
      }),
    ]);
  });

  it('summarizes resumed, completed, and failed stages for reports', () => {
    const summary = summarizeKtxScanEnrichmentState({
      resumedStages: ['descriptions'],
      completedStages: ['descriptions', 'embeddings'],
      failedStages: ['relationships'],
    });
    expect(summary).toEqual({
      resumedStages: ['descriptions'],
      completedStages: ['descriptions', 'embeddings'],
      failedStages: ['relationships'],
    });

    expect(completedKtxScanEnrichmentStateSummary()).toEqual({
      resumedStages: [],
      completedStages: [],
      failedStages: [],
    });
  });
});
|
||||
108
packages/cli/src/context/scan/enrichment-state.ts
Normal file
108
packages/cli/src/context/scan/enrichment-state.ts
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import type { KtxScanEnrichmentStage, KtxScanEnrichmentStateSummary, KtxScanMode, KtxSchemaSnapshot } from './types.js';
|
||||
|
||||
/** Enrichment stages in the order used when stage lists are normalized. */
const KTX_SCAN_ENRICHMENT_STAGES: readonly KtxScanEnrichmentStage[] = ['descriptions', 'embeddings', 'relationships'];
|
||||
|
||||
/** Key used to look up a previously completed stage: run, stage, and input hash. */
export interface KtxScanEnrichmentStageLookup {
  runId: string;
  stage: KtxScanEnrichmentStage;
  /** Hash of the stage inputs, e.g. from `computeKtxScanEnrichmentInputHash`. */
  inputHash: string;
}
|
||||
|
||||
/**
 * Stored record of a stage that finished successfully.
 *
 * @typeParam TOutput - Shape of the stage output kept with the record.
 */
export interface KtxScanEnrichmentCompletedStage<TOutput = unknown> {
  runId: string;
  connectionId: string;
  syncId: string;
  mode: KtxScanMode;
  stage: KtxScanEnrichmentStage;
  inputHash: string;
  /** Discriminant that separates completed records from failed ones. */
  status: 'completed';
  output: TOutput;
  /** Always null for a completed stage. */
  errorMessage: null;
  updatedAt: string;
}
|
||||
|
||||
/** Stored record of a stage that failed; it carries an error message and no output. */
export interface KtxScanEnrichmentFailedStage {
  runId: string;
  connectionId: string;
  syncId: string;
  mode: KtxScanMode;
  stage: KtxScanEnrichmentStage;
  inputHash: string;
  /** Discriminant that separates failed records from completed ones. */
  status: 'failed';
  /** Always null for a failed stage. */
  output: null;
  errorMessage: string;
  updatedAt: string;
}
|
||||
|
||||
/** Any stored stage record; narrow on `status` to tell completed from failed. */
export type KtxScanEnrichmentStageRecord<TOutput = unknown> =
  | KtxScanEnrichmentCompletedStage<TOutput>
  | KtxScanEnrichmentFailedStage;
|
||||
|
||||
/** Persistence contract for per-run enrichment stage records. */
export interface KtxScanEnrichmentStateStore {
  /** Resolves to the completed record for the lookup key, or `null` when there is none. */
  findCompletedStage<TOutput = unknown>(
    input: KtxScanEnrichmentStageLookup,
  ): Promise<KtxScanEnrichmentCompletedStage<TOutput> | null>;

  /** Saves a completed stage; `status` and `errorMessage` are not supplied by the caller. */
  saveCompletedStage<TOutput = unknown>(
    input: Omit<KtxScanEnrichmentCompletedStage<TOutput>, 'status' | 'errorMessage'>,
  ): Promise<void>;

  /** Saves a failed stage; `status` and `output` are not supplied by the caller. */
  saveFailedStage(input: Omit<KtxScanEnrichmentFailedStage, 'status' | 'output'>): Promise<void>;

  /** Lists the stage records stored for a run. */
  listRunStages(runId: string): Promise<KtxScanEnrichmentStageRecord[]>;
}
|
||||
|
||||
/** Everything that is hashed to decide whether a stored stage result can be reused. */
export interface ComputeKtxScanEnrichmentInputHashInput {
  snapshot: KtxSchemaSnapshot;
  mode: KtxScanMode;
  detectRelationships: boolean;
  /** Identifies the provider configuration; key order does not affect the hash. */
  providerIdentity: Record<string, unknown>;
  relationshipSettings?: unknown;
}
|
||||
|
||||
/**
 * Serializes a value to JSON-like text with object keys in a fixed order, so that two
 * structurally equal values produce the same text regardless of key insertion order.
 *
 * Keys are ordered by UTF-16 code unit (`<` / `>`), not by `localeCompare`: locale-aware
 * collation depends on the runtime's default locale, which would make the resulting hash
 * input environment-dependent.
 *
 * Arrays keep their element order. Non-object values go through `JSON.stringify`; an
 * `undefined` nested inside an array or object therefore appears as the text `undefined`.
 */
function stableJson(value: unknown): string {
  if (Array.isArray(value)) {
    return `[${value.map(stableJson).join(',')}]`;
  }
  if (value && typeof value === 'object') {
    const entries = Object.entries(value as Record<string, unknown>).sort(([left], [right]) => {
      if (left < right) return -1;
      if (left > right) return 1;
      return 0;
    });
    return `{${entries.map(([key, item]) => `${JSON.stringify(key)}:${stableJson(item)}`).join(',')}}`;
  }
  return JSON.stringify(value);
}
|
||||
|
||||
/**
 * Computes the hash that identifies a set of enrichment inputs.
 *
 * @param input - Snapshot, mode, and provider/relationship configuration to fingerprint.
 * @returns Hex-encoded SHA-256 digest of the key-order-independent serialization of `input`.
 */
export function computeKtxScanEnrichmentInputHash(input: ComputeKtxScanEnrichmentInputHashInput): string {
  const canonicalText = stableJson(input);
  const hasher = createHash('sha256');
  hasher.update(canonicalText);
  return hasher.digest('hex');
}
|
||||
|
||||
/**
 * Returns the given stages without duplicates, in the order of `KTX_SCAN_ENRICHMENT_STAGES`.
 *
 * Filtering the canonical list is enough to de-duplicate because that list names each
 * stage once.
 */
function uniqueStages(stages: KtxScanEnrichmentStage[]): KtxScanEnrichmentStage[] {
  const requested = new Set<KtxScanEnrichmentStage>(stages);
  return KTX_SCAN_ENRICHMENT_STAGES.filter((stage) => requested.has(stage));
}
|
||||
|
||||
/**
 * Creates a state summary with no resumed, completed, or failed stages.
 *
 * @returns A new summary object with three empty stage lists on every call.
 */
export function completedKtxScanEnrichmentStateSummary(): KtxScanEnrichmentStateSummary {
  const emptySummary: KtxScanEnrichmentStateSummary = {
    resumedStages: [],
    completedStages: [],
    failedStages: [],
  };
  return emptySummary;
}
|
||||
|
||||
/**
 * Normalizes a state summary for reporting: each stage list is de-duplicated and put into
 * the canonical stage order.
 *
 * @param input - Summary whose lists may contain duplicates in any order.
 * @returns A new summary; `input` is not modified.
 */
export function summarizeKtxScanEnrichmentState(input: KtxScanEnrichmentStateSummary): KtxScanEnrichmentStateSummary {
  const { resumedStages, completedStages, failedStages } = input;
  return {
    resumedStages: uniqueStages(resumedStages),
    completedStages: uniqueStages(completedStages),
    failedStages: uniqueStages(failedStages),
  };
}
|
||||
42
packages/cli/src/context/scan/enrichment-summary.test.ts
Normal file
42
packages/cli/src/context/scan/enrichment-summary.test.ts
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
failedKtxScanEnrichmentSummary,
|
||||
ktxScanErrorMessage,
|
||||
skippedKtxScanEnrichmentSummary,
|
||||
} from './enrichment-summary.js';
|
||||
|
||||
describe('KTX scan enrichment summaries', () => {
  it('keeps structural scans skipped when no enrichment was requested', () => {
    const summary = failedKtxScanEnrichmentSummary('structural', false);

    expect(summary).toEqual(skippedKtxScanEnrichmentSummary);
  });

  it('marks relationship stages failed when relationship detection fails', () => {
    const summary = failedKtxScanEnrichmentSummary('relationships', true);

    // Only the deterministic and statistical relationship stages flip to failed.
    expect(summary).toEqual({
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'failed',
    });
  });

  it('marks every enriched-only stage failed when full enrichment fails', () => {
    const summary = failedKtxScanEnrichmentSummary('enriched', true);

    expect(summary).toEqual({
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    });
  });

  it('formats unknown thrown values for scan warnings', () => {
    const fromError = ktxScanErrorMessage(new Error('gateway timeout'));
    const fromString = ktxScanErrorMessage('plain failure');
    const fromObject = ktxScanErrorMessage({ code: 'E_SCAN' });

    expect(fromError).toBe('gateway timeout');
    expect(fromString).toBe('plain failure');
    expect(fromObject).toBe('{"code":"E_SCAN"}');
  });
});
|
||||
52
packages/cli/src/context/scan/enrichment-summary.ts
Normal file
52
packages/cli/src/context/scan/enrichment-summary.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import type { KtxScanEnrichmentSummary, KtxScanMode } from './types.js';
|
||||
|
||||
/**
 * Enrichment summary in which every stage is reported as `'skipped'`.
 *
 * Within this module it is both the result for failures that touched no
 * enrichment stage and the base object that relationship-only failures are
 * layered on top of.
 */
export const skippedKtxScanEnrichmentSummary: KtxScanEnrichmentSummary = {
  dataDictionary: 'skipped',
  tableDescriptions: 'skipped',
  columnDescriptions: 'skipped',
  embeddings: 'skipped',
  deterministicRelationships: 'skipped',
  llmRelationshipValidation: 'skipped',
  statisticalValidation: 'skipped',
};
|
||||
|
||||
/**
 * Builds the enrichment summary to report when a scan's enrichment work fails.
 *
 * - `mode === 'enriched'`: every stage is `'failed'`.
 * - `mode === 'relationships'`, or any other mode with `detectRelationships`
 *   set: `deterministicRelationships` and `statisticalValidation` are
 *   `'failed'`; all other stages stay `'skipped'`.
 * - Otherwise: every stage is `'skipped'`.
 *
 * @param mode - Scan mode that was running when the failure happened.
 * @param detectRelationships - Whether relationship detection was requested
 *   in addition to the mode. Defaults to `false`.
 * @returns A freshly allocated summary object; it is never the shared
 *   `skippedKtxScanEnrichmentSummary` instance, so callers may mutate it.
 */
export function failedKtxScanEnrichmentSummary(
  mode: KtxScanMode,
  detectRelationships = false,
): KtxScanEnrichmentSummary {
  if (mode === 'enriched') {
    return {
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    };
  }

  if (mode === 'relationships' || detectRelationships) {
    return {
      ...skippedKtxScanEnrichmentSummary,
      deterministicRelationships: 'failed',
      statisticalValidation: 'failed',
    };
  }

  // Copy instead of returning the module-level constant: handing out the shared
  // object would let one caller's mutation leak into every later summary.
  return { ...skippedKtxScanEnrichmentSummary };
}
|
||||
|
||||
/**
 * Renders an arbitrary thrown value as a human-readable message.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged.
 * - Anything else is JSON-serialised when possible, otherwise converted with
 *   `String(...)`.
 *
 * @param error - The caught value; may be of any type.
 * @returns A string in every case (never `undefined`).
 */
export function ktxScanErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'string') {
    return error;
  }

  try {
    // JSON.stringify yields `undefined` (not a string) for `undefined`,
    // functions and symbols, so the result has to be checked before returning.
    const serialized: unknown = JSON.stringify(error);
    if (typeof serialized === 'string') {
      return serialized;
    }
  } catch {
    // Not serialisable (e.g. circular structure or BigInt); use String() below.
  }

  return String(error);
}
|
||||
159
packages/cli/src/context/scan/enrichment-types.test.ts
Normal file
159
packages/cli/src/context/scan/enrichment-types.test.ts
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type {
|
||||
KtxColumnSampleUpdate,
|
||||
KtxDescriptionUpdate,
|
||||
KtxEmbeddingUpdate,
|
||||
KtxEnrichedSchema,
|
||||
KtxJoinUpdate,
|
||||
KtxRelationshipEndpoint,
|
||||
KtxRelationshipUpdate,
|
||||
KtxScanMetadataStore,
|
||||
KtxStructuralSyncPlan,
|
||||
} from './enrichment-types.js';
|
||||
|
||||
// These tests mainly exercise the exported types: they pass when the literals
// below type-check against the contracts and the trivial runtime checks hold.
describe('KTX scan enrichment contracts', () => {
  it('models an enriched schema with reusable table, column, and relationship metadata', () => {
    const ordersRef = { catalog: 'analytics', db: 'public', name: 'orders' };
    const customersRef = { catalog: 'analytics', db: 'public', name: 'customers' };

    const schema: KtxEnrichedSchema = {
      connectionId: 'warehouse',
      tables: [
        {
          id: 'table-orders',
          ref: ordersRef,
          enabled: true,
          descriptions: { db: 'Raw orders', ai: 'Customer orders' },
          columns: [
            {
              id: 'column-orders-status',
              tableId: 'table-orders',
              tableRef: ordersRef,
              name: 'status',
              nativeType: 'varchar',
              normalizedType: 'string',
              dimensionType: 'string',
              nullable: false,
              primaryKey: false,
              parentColumnId: null,
              descriptions: { db: 'Status code' },
              embedding: [0.1, 0.2],
              sampleValues: ['paid', 'refunded'],
              cardinality: 2,
            },
          ],
        },
      ],
      relationships: [
        {
          id: 'rel-orders-customers',
          source: 'formal',
          from: {
            tableId: 'table-orders',
            columnIds: ['column-orders-customer-id'],
            table: ordersRef,
            columns: ['customer_id'],
          },
          to: {
            tableId: 'table-customers',
            columnIds: ['column-customers-id'],
            table: customersRef,
            columns: ['id'],
          },
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        },
      ],
    };

    const [firstTable] = schema.tables;
    const [firstRelationship] = schema.relationships;
    expect(firstTable.columns[0].sampleValues).toEqual(['paid', 'refunded']);
    expect(firstRelationship.source).toBe('formal');
  });

  it('models metadata-store updates without requiring a concrete store implementation', async () => {
    const connectionId = 'warehouse';
    const statusColumnId = 'column-orders-status';

    const structuralPlan: KtxStructuralSyncPlan = {
      connectionId,
      snapshotId: 'snapshot-1',
      operations: [{ kind: 'create_table', table: 'orders' }],
    };
    const descriptionUpdate: KtxDescriptionUpdate = {
      connectionId,
      table: { catalog: 'analytics', db: 'public', name: 'orders' },
      source: 'ai',
      tableDescription: 'Customer orders',
      columnDescriptions: { status: 'Payment lifecycle state' },
    };
    const sampleUpdate: KtxColumnSampleUpdate = {
      columnId: statusColumnId,
      sampleValues: ['paid', 'refunded'],
      cardinality: 2,
    };
    const embeddingUpdate: KtxEmbeddingUpdate = {
      columnId: statusColumnId,
      text: 'orders.status (varchar). Values: paid, refunded',
      embedding: [0.25, 0.75],
    };
    const relationshipUpdate: KtxRelationshipUpdate = {
      connectionId,
      accepted: [],
      rejected: [],
      skipped: [{ reason: 'missing parent table', relationshipId: 'candidate-1' }],
    };

    // In-memory stand-in: each update method asserts it received the value
    // that this test passes to it further down.
    const store: KtxScanMetadataStore = {
      loadSchema: async () => null,
      applyStructuralPlan: async (plan) => ({
        connectionId: plan.connectionId,
        tables: [],
        relationships: [],
      }),
      updateDescriptions: async (input) => {
        expect(input).toEqual(descriptionUpdate);
      },
      updateColumnSamples: async (input) => {
        expect(input).toEqual([sampleUpdate]);
      },
      updateColumnEmbeddings: async (input) => {
        expect(input).toEqual([embeddingUpdate]);
      },
      updateInferredRelationships: async (input) => {
        expect(input).toEqual(relationshipUpdate);
      },
    };

    await expect(store.loadSchema('warehouse')).resolves.toBeNull();
    await expect(store.applyStructuralPlan(structuralPlan)).resolves.toEqual({
      connectionId: 'warehouse',
      tables: [],
      relationships: [],
    });
    await expect(store.updateDescriptions(descriptionUpdate)).resolves.toBeUndefined();
    await expect(store.updateColumnSamples([sampleUpdate])).resolves.toBeUndefined();
    await expect(store.updateColumnEmbeddings([embeddingUpdate])).resolves.toBeUndefined();
    await expect(store.updateInferredRelationships(relationshipUpdate)).resolves.toBeUndefined();
  });
});
|
||||
|
||||
// Verifies that multi-column (composite) keys keep their column order in both
// the endpoint and the join-update contracts.
describe('relationship tuple contracts', () => {
  it('represents relationship endpoints and join updates as ordered column tuples', () => {
    const keyColumns = ['order_id', 'line_number'];
    const keyColumnIds = ['public.order_lines.order_id', 'public.order_lines.line_number'];

    const endpoint: KtxRelationshipEndpoint = {
      tableId: 'public.order_lines',
      columnIds: [...keyColumnIds],
      table: { catalog: null, db: 'public', name: 'order_lines' },
      columns: [...keyColumns],
    };
    const update: KtxJoinUpdate = {
      connectionId: 'warehouse',
      fromTable: 'order_line_allocations',
      fromColumns: [...keyColumns],
      toTable: 'order_lines',
      toColumns: [...keyColumns],
      relationship: 'many_to_one',
      author: 'ktx',
      authorEmail: 'ktx@example.com',
    };

    expect(endpoint.columns).toEqual(['order_id', 'line_number']);
    expect(endpoint.columnIds).toEqual(['public.order_lines.order_id', 'public.order_lines.line_number']);
    expect(update.fromColumns).toEqual(['order_id', 'line_number']);
    expect(update.toColumns).toEqual(['order_id', 'line_number']);
  });
});
|
||||
119
packages/cli/src/context/scan/enrichment-types.ts
Normal file
119
packages/cli/src/context/scan/enrichment-types.ts
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
import type { KtxSchemaDimensionType, KtxTableRef } from './types.js';
|
||||
|
||||
/**
 * Where a description came from. The listed literals are the known sources;
 * any other string is accepted as well.
 */
type KtxDescriptionSource =
  | 'ai'
  | 'db'
  | 'dbt'
  | 'user'
  // `string & {}` admits arbitrary strings without collapsing the union to plain `string`.
  | (string & {});

/** How a relationship was established. */
type KtxRelationshipSource =
  | 'formal'
  | 'inferred'
  | 'manual';

/** Cardinality of a relationship, read from the "from" side to the "to" side. */
export type KtxRelationshipType =
  | 'many_to_one'
  | 'one_to_many'
  | 'one_to_one';
|
||||
|
||||
/** A single column together with the metadata attached to it by scan enrichment. */
export interface KtxEnrichedColumn {
  /** Identifier of this column. */
  id: string;
  /** Identifier of the table this column belongs to. */
  tableId: string;
  /** Catalog/db/name reference of the owning table. */
  tableRef: KtxTableRef;
  /** Column name. */
  name: string;
  /** Type name as reported by the source — NOTE(review): presumably the database's own type string; confirm. */
  nativeType: string;
  /** Type name after normalisation — NOTE(review): the normalisation vocabulary is defined elsewhere. */
  normalizedType: string;
  /** Dimension classification of the column. */
  dimensionType: KtxSchemaDimensionType;
  /** Whether the column accepts nulls. */
  nullable: boolean;
  /** Whether the column is part of the primary key. */
  primaryKey: boolean;
  /** Identifier of a parent column, or `null` when there is none. */
  parentColumnId: string | null;
  /** Description text keyed by the source that supplied it; sources may be absent. */
  descriptions: Partial<Record<KtxDescriptionSource, string>>;
  /** Embedding vector, or `null` when none is stored. */
  embedding: number[] | null;
  /** Sampled values, or `null` when none are stored. */
  sampleValues: string[] | null;
  /** Cardinality figure, or `null` when none is stored. */
  cardinality: number | null;
}
|
||||
|
||||
/** A table together with its enriched columns and descriptions. */
export interface KtxEnrichedTable {
  /** Identifier of this table. */
  id: string;
  /** Catalog/db/name reference of the table. */
  ref: KtxTableRef;
  /** Enabled flag — NOTE(review): what "enabled" gates is decided by consumers outside this file. */
  enabled: boolean;
  /** Description text keyed by the source that supplied it; sources may be absent. */
  descriptions: Partial<Record<KtxDescriptionSource, string>>;
  /** Columns of the table. */
  columns: KtxEnrichedColumn[];
}
|
||||
|
||||
/**
 * One side of a relationship: a table plus one or more of its columns.
 *
 * NOTE(review): `columnIds` and `columns` are presumably parallel arrays
 * (same length, same order) — confirm against producers.
 */
export interface KtxRelationshipEndpoint {
  /** Identifier of the table on this side. */
  tableId: string;
  /** Identifiers of the participating columns. */
  columnIds: string[];
  /** Catalog/db/name reference of the table on this side. */
  table: KtxTableRef;
  /** Names of the participating columns. */
  columns: string[];
}
|
||||
|
||||
/** A relationship between two table endpoints, with its provenance and strength. */
export interface KtxEnrichedRelationship {
  /** Identifier of this relationship. */
  id: string;
  /** How the relationship was established. */
  source: KtxRelationshipSource;
  /** Endpoint the relationship starts from. */
  from: KtxRelationshipEndpoint;
  /** Endpoint the relationship points to. */
  to: KtxRelationshipEndpoint;
  /** Cardinality of the relationship. */
  relationshipType: KtxRelationshipType;
  /** Confidence score — NOTE(review): the scale is not defined in this file. */
  confidence: number;
  /** Whether the relationship references a primary key. */
  isPrimaryKeyReference: boolean;
}
|
||||
|
||||
/** The enriched tables and relationships known for one connection. */
export interface KtxEnrichedSchema {
  /** Identifier of the connection the schema belongs to. */
  connectionId: string;
  /** Tables in the schema. */
  tables: KtxEnrichedTable[];
  /** Relationships between those tables. */
  relationships: KtxEnrichedRelationship[];
}
|
||||
|
||||
/**
 * A batch of structural operations to apply for one connection snapshot.
 *
 * @internal
 */
export interface KtxStructuralSyncPlan {
  /** Identifier of the connection the plan targets. */
  connectionId: string;
  /** Identifier of the snapshot the plan refers to. */
  snapshotId: string;
  /** Operations as loosely typed records; their shape is not constrained here. */
  operations: Record<string, unknown>[];
}
|
||||
|
||||
/**
 * Description changes for one table and, optionally, some of its columns.
 *
 * @internal
 */
export interface KtxDescriptionUpdate {
  /** Identifier of the connection the table belongs to. */
  connectionId: string;
  /** Catalog/db/name reference of the table being described. */
  table: KtxTableRef;
  /** Source the descriptions are attributed to. */
  source: KtxDescriptionSource;
  /** New table-level description, when one is provided. */
  tableDescription?: string;
  /**
   * Column descriptions keyed by a string (column names in this file's tests).
   * NOTE(review): a `null` value presumably clears the description — confirm.
   */
  columnDescriptions?: Record<string, string | null>;
}
|
||||
|
||||
|
||||
/**
 * A join between two tables expressed with plain table and column names,
 * attributed to an author.
 *
 * @internal
 */
export interface KtxJoinUpdate {
  /** Identifier of the connection both tables belong to. */
  connectionId: string;
  /** Name of the table the join starts from. */
  fromTable: string;
  /** Columns on the "from" side. */
  fromColumns: string[];
  /** Name of the table the join points to. */
  toTable: string;
  /** Columns on the "to" side. */
  toColumns: string[];
  /** Cardinality of the join. */
  relationship: KtxRelationshipType;
  /** Name of the author recorded for the change. */
  author: string;
  /** Email of the author recorded for the change. */
  authorEmail: string;
}
|
||||
|
||||
/**
 * New sample values and cardinality for a single column.
 *
 * @internal
 */
export interface KtxColumnSampleUpdate {
  /** Identifier of the column being updated. */
  columnId: string;
  /** Sampled values, or `null`. */
  sampleValues: string[] | null;
  /** Cardinality figure, or `null`. */
  cardinality: number | null;
}
|
||||
|
||||
/** A new embedding for a single column. */
export interface KtxEmbeddingUpdate {
  /** Identifier of the column being updated. */
  columnId: string;
  /** Text associated with the embedding — NOTE(review): presumably the text that was embedded; confirm. */
  text: string;
  /** The embedding vector. */
  embedding: number[];
}
|
||||
|
||||
/** Records that a relationship was skipped, and why. */
export interface KtxSkippedRelationship {
  /** Identifier of the skipped relationship. */
  relationshipId: string;
  /** Free-text reason for skipping it. */
  reason: string;
}
|
||||
|
||||
/** Relationships for one connection, grouped into accepted, rejected and skipped. */
export interface KtxRelationshipUpdate {
  /** Identifier of the connection the relationships belong to. */
  connectionId: string;
  /** Relationships that were accepted. */
  accepted: KtxEnrichedRelationship[];
  /** Relationships that were rejected. */
  rejected: KtxEnrichedRelationship[];
  /** Relationships that were skipped, each with a reason. */
  skipped: KtxSkippedRelationship[];
}
|
||||
|
||||
/**
 * Storage contract for scan metadata. All operations are asynchronous.
 *
 * @internal
 */
export interface KtxScanMetadataStore {
  /** Loads the enriched schema for a connection; resolves to `null` when there is none. */
  loadSchema(connectionId: string): Promise<KtxEnrichedSchema | null>;
  /** Applies a structural plan and resolves to an enriched schema. */
  applyStructuralPlan(plan: KtxStructuralSyncPlan): Promise<KtxEnrichedSchema>;
  /** Stores table and column descriptions. */
  updateDescriptions(input: KtxDescriptionUpdate): Promise<void>;
  /** Stores sample values and cardinality for a batch of columns. */
  updateColumnSamples(input: KtxColumnSampleUpdate[]): Promise<void>;
  /** Stores embeddings for a batch of columns. */
  updateColumnEmbeddings(input: KtxEmbeddingUpdate[]): Promise<void>;
  /** Stores the accepted/rejected/skipped outcome for relationships. */
  updateInferredRelationships(input: KtxRelationshipUpdate): Promise<void>;
}
|
||||
291
packages/cli/src/context/scan/entity-details.test.ts
Normal file
291
packages/cli/src/context/scan/entity-details.test.ts
Normal file
|
|
@ -0,0 +1,291 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../context/project/project.js';
|
||||
import { createKtxEntityDetailsService } from './entity-details.js';
|
||||
import type { KtxConnectionDriver, KtxScanReport, KtxSchemaTable } from './types.js';
|
||||
|
||||
// Integration-style tests: each case seeds scan artifacts into a throwaway
// project on disk and then reads them back through the entity-details service.
describe('createKtxEntityDetailsService', () => {
  let tempDir: string;
  let project: KtxLocalProject;

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'ktx-entity-details-service-'));
    project = await initKtxProject({ projectDir: join(tempDir, 'project') });
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  /** Builds a structural scan report whose artifact paths are derived from the connection and sync ids. */
  function scanReport(input: {
    connectionId: string;
    syncId: string;
    runId: string;
    driver?: KtxConnectionDriver;
    createdAt?: string;
  }): KtxScanReport {
    const { connectionId, syncId, runId } = input;
    const rawSourcesDir = `raw-sources/${connectionId}/live-database/${syncId}`;
    return {
      connectionId,
      driver: input.driver ?? 'postgres',
      syncId,
      runId,
      trigger: 'mcp',
      mode: 'structural',
      dryRun: false,
      artifactPaths: {
        rawSourcesDir,
        reportPath: `${rawSourcesDir}/scan-report.json`,
        manifestShards: [],
        enrichmentArtifacts: [],
      },
      diffSummary: {
        tablesAdded: 0,
        tablesModified: 0,
        tablesDeleted: 0,
        tablesUnchanged: 1,
        columnsAdded: 0,
        columnsModified: 0,
        columnsDeleted: 0,
      },
      manifestShardsWritten: 0,
      structuralSyncStats: {
        tablesCreated: 1,
        tablesUpdated: 0,
        tablesDeleted: 0,
        columnsCreated: 0,
        columnsUpdated: 0,
        columnsDeleted: 0,
      },
      enrichment: {
        dataDictionary: 'skipped',
        tableDescriptions: 'skipped',
        columnDescriptions: 'skipped',
        embeddings: 'skipped',
        deterministicRelationships: 'skipped',
        llmRelationshipValidation: 'skipped',
        statisticalValidation: 'skipped',
      },
      capabilityGaps: [],
      warnings: [],
      relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
      enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] },
      createdAt: input.createdAt ?? '2026-05-14T09:00:00.000Z',
    };
  }

  /** Builds an `orders` table fixture; schema and row estimate can be overridden. */
  function ordersTable(input: { db?: string | null; estimatedRows?: number | null } = {}): KtxSchemaTable {
    return {
      catalog: null,
      db: input.db ?? 'public',
      name: 'orders',
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: input.estimatedRows ?? 12,
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Order id',
        },
        {
          name: 'status',
          nativeType: 'text',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
          primaryKey: false,
          comment: 'Order status',
        },
      ],
      foreignKeys: [
        {
          fromColumn: 'customer_id',
          toCatalog: null,
          toDb: 'public',
          toTable: 'customers',
          toColumn: 'id',
          constraintName: 'orders_customer_id_fkey',
        },
      ],
    };
  }

  /** Writes the connection file, one file per table, and the scan report for a single scan. */
  async function seedScan(input: {
    connectionId?: string;
    syncId: string;
    runId: string;
    driver?: KtxConnectionDriver;
    extractedAt?: string;
    tables?: KtxSchemaTable[];
  }): Promise<void> {
    const connectionId = input.connectionId ?? 'warehouse';
    const report = scanReport({
      connectionId,
      syncId: input.syncId,
      runId: input.runId,
      driver: input.driver,
      createdAt: input.extractedAt,
    });
    const root = report.artifactPaths.rawSourcesDir;

    // Every seeded file is pretty-printed JSON written under the same author identity.
    const writeSeed = (path: string, payload: unknown, message: string) =>
      project.fileStore.writeFile(path, JSON.stringify(payload, null, 2), 'ktx', 'ktx@example.com', message);

    await writeSeed(
      `${root}/connection.json`,
      {
        connectionId,
        driver: report.driver,
        extractedAt: input.extractedAt ?? report.createdAt,
        scope: { schemas: ['public'] },
      },
      'seed connection',
    );

    const tables = input.tables ?? [ordersTable()];
    for (const table of tables) {
      await writeSeed(`${root}/tables/${table.db ?? 'default'}-${table.name}.json`, table, `seed ${table.name}`);
    }

    await writeSeed(`${root}/scan-report.json`, report, 'seed scan report');
  }

  it('returns the latest scan snapshot table details for a display string', async () => {
    await seedScan({ syncId: 'sync-1', runId: 'scan-old', extractedAt: '2026-05-14T08:00:00.000Z' });
    await seedScan({
      syncId: 'sync-2',
      runId: 'scan-new',
      extractedAt: '2026-05-14T09:00:00.000Z',
      tables: [ordersTable({ estimatedRows: 99 })],
    });
    const service = createKtxEntityDetailsService(project);

    const { results } = await service.read({
      connectionId: 'warehouse',
      entities: [{ table: 'public.orders' }],
    });

    expect(results).toHaveLength(1);
    expect(results[0]).toMatchObject({
      ok: true,
      connectionId: 'warehouse',
      display: 'public.orders',
      estimatedRows: 99,
      snapshot: {
        syncId: 'sync-2',
        scanRunId: 'scan-new',
        extractedAt: '2026-05-14T09:00:00.000Z',
      },
      columns: [
        { name: 'id', nativeType: 'integer', primaryKey: true },
        { name: 'status', nativeType: 'text', nullable: false },
      ],
    });
  });

  it('filters requested columns while keeping full-table foreign keys', async () => {
    await seedScan({ syncId: 'sync-1', runId: 'scan-1' });
    const service = createKtxEntityDetailsService(project);

    const { results } = await service.read({
      connectionId: 'warehouse',
      entities: [{ table: { catalog: null, db: 'public', name: 'orders' }, columns: ['status'] }],
    });

    expect(results[0]).toMatchObject({
      ok: true,
      columns: [{ name: 'status' }],
      foreignKeys: [
        {
          fromColumn: 'customer_id',
          toDb: 'public',
          toTable: 'customers',
          toColumn: 'id',
        },
      ],
    });
  });

  it('returns a structured missing-scan error', async () => {
    // Nothing is seeded here, so the service has no scan to read.
    const service = createKtxEntityDetailsService(project);

    const { results } = await service.read({
      connectionId: 'warehouse',
      entities: [{ table: 'public.orders' }],
    });

    expect(results).toEqual([
      {
        ok: false,
        connectionId: 'warehouse',
        table: 'public.orders',
        error: {
          code: 'scan_missing',
          message: 'No live-database scan found for connection "warehouse"; run `ktx ingest warehouse` or `ktx scan warehouse`.',
        },
      },
    ]);
  });

  it('reports ambiguous bare table names across schemas', async () => {
    await seedScan({
      syncId: 'sync-1',
      runId: 'scan-1',
      tables: [ordersTable({ db: 'public' }), ordersTable({ db: 'archive' })],
    });
    const service = createKtxEntityDetailsService(project);

    const { results } = await service.read({
      connectionId: 'warehouse',
      entities: [{ table: 'orders' }],
    });

    expect(results[0]).toMatchObject({
      ok: false,
      error: {
        code: 'ambiguous_table',
        candidates: [
          { tableRef: { catalog: null, db: 'archive', name: 'orders' }, display: 'archive.orders' },
          { tableRef: { catalog: null, db: 'public', name: 'orders' }, display: 'public.orders' },
        ],
      },
    });
  });

  it('reports missing requested columns with available column candidates', async () => {
    await seedScan({ syncId: 'sync-1', runId: 'scan-1' });
    const service = createKtxEntityDetailsService(project);

    const { results } = await service.read({
      connectionId: 'warehouse',
      entities: [{ table: 'public.orders', columns: ['status', 'plan_tier'] }],
    });

    expect(results[0]).toMatchObject({
      ok: false,
      error: {
        code: 'column_not_found',
        message: 'Column(s) not found on public.orders: plan_tier',
        candidates: ['id', 'status'],
      },
    });
  });
});
|
||||
315
packages/cli/src/context/scan/entity-details.ts
Normal file
315
packages/cli/src/context/scan/entity-details.ts
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
import type { KtxLocalProject } from '../../context/project/project.js';
|
||||
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
|
||||
import type {
|
||||
KtxConnectionDriver,
|
||||
KtxScanReport,
|
||||
KtxSchemaColumn,
|
||||
KtxSchemaSnapshot,
|
||||
KtxSchemaTable,
|
||||
KtxTableRef,
|
||||
} from './types.js';
|
||||
|
||||
/** How a caller names a table: a display string such as `public.orders`, or a structured ref. */
export type KtxEntityDetailsTableInput =
  | string
  | KtxTableRef;
|
||||
|
||||
/** Request for the details of one or more tables of a single connection. */
export interface KtxEntityDetailsInput {
  /** Identifier of the connection whose latest scan is consulted. */
  connectionId: string;
  /** Tables to look up, one result being produced per entry. */
  entities: Array<{
    /** The table to look up. */
    table: KtxEntityDetailsTableInput;
    /** Column names to restrict the result to; omitted or empty means all columns. */
    columns?: string[];
  }>;
}
|
||||
|
||||
/** Identifies the scan snapshot a result was read from. */
export interface KtxEntityDetailsSnapshotInfo {
  /** Sync id taken from the scan report. */
  syncId: string;
  /** Extraction timestamp taken from the snapshot. */
  extractedAt: string;
  /** Run id taken from the scan report, or `null` when the report has none. */
  scanRunId: string | null;
}
|
||||
|
||||
/** Column fields exposed in an entity-details result; copied one-to-one from the scanned column. */
interface KtxEntityDetailsColumn {
  /** Column name. */
  name: string;
  /** Native type string of the scanned column. */
  nativeType: string;
  /** Normalised type string of the scanned column. */
  normalizedType: string;
  /** Dimension classification of the scanned column. */
  dimensionType: KtxSchemaColumn['dimensionType'];
  /** Whether the column accepts nulls. */
  nullable: boolean;
  /** Whether the column is part of the primary key. */
  primaryKey: boolean;
  /** Column comment, or `null`. */
  comment: string | null;
}
|
||||
|
||||
/** Successful lookup result for one table (`ok: true`). */
interface KtxEntityDetailsRecord {
  /** Discriminant: always `true` for a successful result. */
  ok: true;
  /** Identifier of the connection that was queried. */
  connectionId: string;
  /** Structured reference of the resolved table. */
  tableRef: KtxTableRef;
  /** Display string of the resolved table. */
  display: string;
  /** Kind of the scanned table. */
  kind: KtxSchemaTable['kind'];
  /** Table comment, or `null`. */
  comment: string | null;
  /** Estimated row count, or `null`. */
  estimatedRows: number | null;
  /** Columns of the table, limited to the requested ones when a filter was given. */
  columns: KtxEntityDetailsColumn[];
  /** Foreign keys of the whole table (not filtered by requested columns). */
  foreignKeys: KtxSchemaTable['foreignKeys'];
  /** Snapshot the data was read from. */
  snapshot: KtxEntityDetailsSnapshotInfo;
}
|
||||
|
||||
/** Machine-readable reasons an entity-details lookup can fail. */
type KtxEntityDetailsErrorCode =
  | 'scan_missing'
  | 'table_not_found'
  | 'ambiguous_table'
  | 'column_not_found';
|
||||
|
||||
/** Failed lookup result for one table (`ok: false`). */
interface KtxEntityDetailsErrorResult {
  /** Discriminant: always `false` for a failed result. */
  ok: false;
  /** Identifier of the connection that was queried. */
  connectionId: string;
  /** The table input exactly as the caller supplied it. */
  table: KtxEntityDetailsTableInput;
  /** Snapshot that was consulted; absent when no scan was found at all. */
  snapshot?: KtxEntityDetailsSnapshotInfo;
  /** What went wrong. */
  error: {
    /** Machine-readable failure reason. */
    code: KtxEntityDetailsErrorCode;
    /** Human-readable explanation. */
    message: string;
    /** Suggestions: candidate tables for table errors, column names for column errors. */
    candidates?: Array<{ tableRef: KtxTableRef; display: string }> | string[];
  };
}
|
||||
|
||||
/** Response of an entity-details read: one entry per requested entity, in request order. */
export interface KtxEntityDetailsResponse {
  /** Per-entity outcome; discriminate on `ok`. */
  results: Array<KtxEntityDetailsRecord | KtxEntityDetailsErrorResult>;
}
|
||||
|
||||
/** The scan report selected as latest, paired with the structural snapshot read for it. */
interface LatestScan {
  /** Parsed `scan-report.json`. */
  report: KtxScanReport;
  /** Structural snapshot loaded from the report's raw-sources directory. */
  snapshot: KtxSchemaSnapshot;
}
|
||||
|
||||
/** Outcome of resolving a table input: the matched table, or `null` plus an error. */
interface ResolveResult {
  /** The resolved table, or `null` when resolution failed. */
  table: KtxSchemaTable | null;
  /** Failure details in the shape used by error results; set when `table` is `null`. */
  error?: Omit<KtxEntityDetailsErrorResult['error'], 'message'> & { message: string };
}
|
||||
|
||||
/**
 * Lower-cases an identifier for case-insensitive comparison.
 *
 * @param value - Identifier text; may be `null` or `undefined`.
 * @returns The lower-cased text, or `''` for `null`/`undefined`.
 */
function normalize(value: string | null | undefined): string {
  if (value == null) {
    return '';
  }
  return value.toLowerCase();
}
|
||||
|
||||
/**
 * Compares two table references case-insensitively, part by part.
 * A missing (`null`) part only matches another missing or empty part.
 */
function refsEqual(left: KtxTableRef, right: KtxTableRef): boolean {
  const parts = ['catalog', 'db', 'name'] as const;
  return parts.every((part) => normalize(left[part]) === normalize(right[part]));
}
|
||||
|
||||
/**
 * Trims one identifier segment and strips at most one quoting character from
 * each end: `"`, `'`, a backtick, or `[` at the start / `]` at the end.
 *
 * @param part - Raw segment, possibly padded or quoted.
 * @returns The bare segment.
 */
function cleanIdentifierPart(part: string): string {
  const edgeQuote = /^["'`\[]|["'`\]]$/g;
  const trimmed = part.trim();
  return trimmed.replace(edgeQuote, '');
}
|
||||
|
||||
/**
 * Splits a dotted display string into cleaned identifier segments, dropping
 * segments that are empty after cleaning.
 *
 * The split is on every `.`; dots inside quotes are not treated specially.
 */
function splitDisplay(display: string): string[] {
  const segments: string[] = [];
  for (const raw of display.trim().split('.')) {
    const cleaned = cleanIdentifierPart(raw);
    if (cleaned) {
      segments.push(cleaned);
    }
  }
  return segments;
}
|
||||
|
||||
/**
 * Formats a table reference for display.
 *
 * For `sqlite` only the table name is used; for every other driver the
 * non-empty parts of catalog, db and name are joined with dots.
 */
function displayForTable(driver: KtxConnectionDriver, table: KtxTableRef): string {
  if (driver === 'sqlite') {
    return table.name;
  }

  const segments: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    if (part) {
      segments.push(part);
    }
  }
  return segments.join('.');
}
|
||||
|
||||
/** Extracts just the catalog/db/name reference from a scanned table. */
function tableRef(table: KtxSchemaTable): KtxTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
|
||||
|
||||
/**
 * Turns tables into suggestion entries (structured ref plus display string),
 * ordered by display string using `localeCompare`.
 */
function candidateList(
  driver: KtxConnectionDriver,
  tables: KtxSchemaTable[],
): Array<{ tableRef: KtxTableRef; display: string }> {
  const candidates = tables.map((table) => {
    const ref = tableRef(table);
    const display = displayForTable(driver, table);
    return { tableRef: ref, display };
  });
  // `map` produced a fresh array, so sorting in place leaves `tables` untouched.
  candidates.sort((a, b) => a.display.localeCompare(b.display));
  return candidates;
}
|
||||
|
||||
/**
 * Parses a display string into a structured table reference, using the number
 * of dotted parts the driver expects.
 *
 * - `sqlite`: exactly one part (`name`).
 * - `bigquery`, `snowflake`, `sqlserver`: exactly three parts (`catalog.db.name`).
 * - any other driver: two parts (`db.name`) or three parts (`catalog.db.name`).
 *
 * @returns The reference, or `null` when the part count does not fit the driver.
 */
function parseDisplayRef(driver: KtxConnectionDriver, display: string): KtxTableRef | null {
  const parts = splitDisplay(display);
  const count = parts.length;

  if (driver === 'sqlite') {
    if (count !== 1) {
      return null;
    }
    return { catalog: null, db: null, name: parts[0]! };
  }

  if (count === 3) {
    return { catalog: parts[0]!, db: parts[1]!, name: parts[2]! };
  }

  const requiresCatalog = driver === 'bigquery' || driver === 'snowflake' || driver === 'sqlserver';
  if (count === 2 && !requiresCatalog) {
    return { catalog: null, db: parts[0]!, name: parts[1]! };
  }

  return null;
}
|
||||
|
||||
/**
 * Resolves a caller-supplied table input against the tables of a snapshot.
 *
 * Resolution order:
 * 1. A structured ref is matched case-insensitively on catalog, db and name.
 * 2. A display string that parses for the snapshot's driver is matched the same way.
 * 3. Otherwise the string is treated as a bare table name and matched against
 *    table names only; more than one match is reported as ambiguous.
 *
 * @param snapshot - Snapshot whose tables are searched.
 * @param input - Display string or structured ref naming the table.
 * @returns The matched table, or `table: null` with a `table_not_found` or
 *   `ambiguous_table` error that carries candidate tables.
 */
function resolveTable(snapshot: KtxSchemaSnapshot, input: KtxEntityDetailsTableInput): ResolveResult {
  // Single place that builds the "not found" result; it lists every table as a candidate.
  const notFound = (label: string): ResolveResult => ({
    table: null,
    error: {
      code: 'table_not_found',
      message: `Table not found in latest scan: ${label}`,
      candidates: candidateList(snapshot.driver, snapshot.tables),
    },
  });

  if (typeof input !== 'string') {
    const table = snapshot.tables.find((candidate) => refsEqual(candidate, input));
    return table ? { table } : notFound(displayForTable(snapshot.driver, input));
  }

  const parsed = parseDisplayRef(snapshot.driver, input);
  if (parsed) {
    const table = snapshot.tables.find((candidate) => refsEqual(candidate, parsed));
    return table ? { table } : notFound(input);
  }

  // Bare-name fallback. The raw input is always accepted as before; when the
  // input is a single segment, its trimmed and unquoted form is accepted too,
  // so `"orders"` or ` orders ` resolve the same way quoted qualified names do.
  const wanted = new Set<string>([normalize(input)]);
  const [onlySegment, ...otherSegments] = splitDisplay(input);
  if (onlySegment !== undefined && otherSegments.length === 0) {
    wanted.add(normalize(onlySegment));
  }

  const byName = snapshot.tables.filter((candidate) => wanted.has(normalize(candidate.name)));
  const [match, ...others] = byName;
  if (match === undefined) {
    return notFound(input);
  }
  if (others.length === 0) {
    return { table: match };
  }
  return {
    table: null,
    error: {
      code: 'ambiguous_table',
      message: `Table name "${input}" is ambiguous across schemas/catalogs; pass a structured table ref.`,
      candidates: candidateList(snapshot.driver, byName),
    },
  };
}
|
||||
|
||||
/** Projects a scanned column onto the fields exposed in entity-details results. */
function toColumn(column: KtxSchemaColumn): KtxEntityDetailsColumn {
  const { name, nativeType, normalizedType, dimensionType, nullable, primaryKey, comment } = column;
  return { name, nativeType, normalizedType, dimensionType, nullable, primaryKey, comment };
}
|
||||
|
||||
/**
 * Summarises which scan a result came from: sync id and run id from the
 * report, extraction time from the snapshot. A missing run id becomes `null`.
 */
function snapshotInfo(report: KtxScanReport, snapshot: KtxSchemaSnapshot): KtxEntityDetailsSnapshotInfo {
  const { syncId, runId } = report;
  const { extractedAt } = snapshot;
  return { syncId, extractedAt, scanRunId: runId ?? null };
}
|
||||
|
||||
/**
 * Reads a file through the project's file store and parses its content as JSON.
 *
 * The result is cast to `T` without validation. Read and parse errors propagate
 * to the caller.
 */
async function readJson<T>(project: KtxLocalProject, path: string): Promise<T> {
  const file = await project.fileStore.readFile(path);
  return JSON.parse(file.content) as T;
}
|
||||
|
||||
/**
 * Finds the most recent live-database scan for a connection and loads its
 * report and structural snapshot.
 *
 * "Most recent" means the `scan-report.json` path that sorts last under the
 * default string sort.
 * NOTE(review): this relies on sync ids sorting chronologically as strings
 * (e.g. `sync-10` would sort before `sync-2`) — confirm the id format.
 *
 * @returns The report and snapshot, or `null` when listing the connection's
 *   directory fails or it contains no scan report.
 */
async function latestScan(project: KtxLocalProject, connectionId: string): Promise<LatestScan | null> {
  const reportSuffix = '/scan-report.json';

  let listing;
  try {
    listing = await project.fileStore.listFiles(`raw-sources/${connectionId}/live-database`);
  } catch {
    // Any listing failure is treated as "no scan available".
    return null;
  }

  const reportPaths = listing.files.filter((path) => path.endsWith(reportSuffix));
  reportPaths.sort();
  const newestReportPath = reportPaths.at(-1);
  if (!newestReportPath) {
    return null;
  }

  const report = await readJson<KtxScanReport>(project, newestReportPath);
  // Fall back to the report's own directory when the report does not name one.
  const rawSourcesDir = report.artifactPaths.rawSourcesDir ?? newestReportPath.slice(0, -reportSuffix.length);
  const snapshot = await readLocalScanStructuralSnapshot({
    project,
    connectionId,
    driver: report.driver,
    rawSourcesDir,
    extractedAtFallback: report.createdAt,
  });

  return { report, snapshot };
}
|
||||
|
||||
/**
 * Creates a service that answers table-detail lookups from the latest
 * live-database scan stored in a project.
 *
 * @param project - Project whose file store holds the scan artifacts.
 * @returns An object with a single `read` method.
 */
export function createKtxEntityDetailsService(project: KtxLocalProject) {
  return {
    /**
     * Looks up every requested entity and returns one result per entity, in
     * request order. Failures are reported per entity instead of being thrown:
     *
     * - `scan_missing` for every entity when the connection has no scan;
     * - `table_not_found` / `ambiguous_table` when the table cannot be resolved;
     * - `column_not_found` when at least one requested column does not exist.
     *
     * Column names are matched case-insensitively. An omitted or empty
     * `columns` list selects all columns. Foreign keys are always those of
     * the whole table.
     */
    async read(input: KtxEntityDetailsInput): Promise<KtxEntityDetailsResponse> {
      const scan = await latestScan(project, input.connectionId);
      if (!scan) {
        return {
          results: input.entities.map((entity) => ({
            ok: false,
            connectionId: input.connectionId,
            table: entity.table,
            error: {
              code: 'scan_missing',
              message: `No live-database scan found for connection "${input.connectionId}"; run \`ktx ingest ${input.connectionId}\` or \`ktx scan ${input.connectionId}\`.`,
            },
          })),
        };
      }

      const info = snapshotInfo(scan.report, scan.snapshot);
      const results: KtxEntityDetailsResponse['results'] = [];

      for (const entity of input.entities) {
        const resolved = resolveTable(scan.snapshot, entity.table);
        if (!resolved.table) {
          results.push({
            ok: false,
            connectionId: input.connectionId,
            table: entity.table,
            snapshot: info,
            error: resolved.error!,
          });
          continue;
        }

        const table = resolved.table;
        const requested = new Set((entity.columns ?? []).map((column) => normalize(column)));
        const columns = requested.size
          ? table.columns.filter((column) => requested.has(normalize(column.name)))
          : table.columns;

        // Decide on the names that are actually missing, not on a count
        // comparison: a table whose column names differ only by case can match
        // more columns than were requested, which is not a "not found" case.
        const found = new Set(columns.map((column) => normalize(column.name)));
        const missing = [...requested].filter((column) => !found.has(column));
        if (missing.length > 0) {
          results.push({
            ok: false,
            connectionId: input.connectionId,
            table: entity.table,
            snapshot: info,
            error: {
              code: 'column_not_found',
              message: `Column(s) not found on ${displayForTable(scan.snapshot.driver, table)}: ${missing.join(', ')}`,
              candidates: table.columns.map((column) => column.name),
            },
          });
          continue;
        }

        results.push({
          ok: true,
          connectionId: input.connectionId,
          tableRef: tableRef(table),
          display: displayForTable(scan.snapshot.driver, table),
          kind: table.kind,
          comment: table.comment,
          estimatedRows: table.estimatedRows,
          columns: columns.map(toColumn),
          foreignKeys: table.foreignKeys,
          snapshot: info,
        });
      }

      return { results };
    },
  };
}
|
||||
907
packages/cli/src/context/scan/local-enrichment-artifacts.test.ts
Normal file
907
packages/cli/src/context/scan/local-enrichment-artifacts.test.ts
Normal file
|
|
@ -0,0 +1,907 @@
|
|||
import { mkdtemp, readFile, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import YAML from 'yaml';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../context/project/project.js';
|
||||
import type { KtxLocalScanEnrichmentResult } from './local-enrichment.js';
|
||||
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from './local-enrichment-artifacts.js';
|
||||
import type { KtxSchemaSnapshot } from './types.js';
|
||||
|
||||
const snapshot: KtxSchemaSnapshot = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
extractedAt: '2026-04-29T12:00:00.000Z',
|
||||
scope: { schemas: ['public'] },
|
||||
metadata: {},
|
||||
tables: [
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'customers',
|
||||
kind: 'table',
|
||||
comment: 'DB customer table',
|
||||
estimatedRows: 2,
|
||||
foreignKeys: [],
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: 'DB customer id',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
kind: 'table',
|
||||
comment: 'DB orders table',
|
||||
estimatedRows: 3,
|
||||
foreignKeys: [
|
||||
{
|
||||
fromColumn: 'customer_id',
|
||||
toCatalog: null,
|
||||
toDb: 'public',
|
||||
toTable: 'customers',
|
||||
toColumn: 'id',
|
||||
constraintName: 'orders_customer_id_fkey',
|
||||
},
|
||||
],
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: 'DB order id',
|
||||
},
|
||||
{
|
||||
name: 'customer_id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: 'DB customer id',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
function enrichment(): KtxLocalScanEnrichmentResult {
|
||||
return {
|
||||
snapshot,
|
||||
summary: {
|
||||
dataDictionary: 'completed',
|
||||
tableDescriptions: 'completed',
|
||||
columnDescriptions: 'completed',
|
||||
embeddings: 'completed',
|
||||
deterministicRelationships: 'completed',
|
||||
llmRelationshipValidation: 'skipped',
|
||||
statisticalValidation: 'skipped',
|
||||
},
|
||||
relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
|
||||
state: {
|
||||
resumedStages: [],
|
||||
completedStages: ['descriptions', 'embeddings', 'relationships'],
|
||||
failedStages: [],
|
||||
},
|
||||
warnings: [],
|
||||
descriptionUpdates: [
|
||||
{
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
tableDescription: 'AI orders table',
|
||||
columnDescriptions: {
|
||||
id: 'AI order id',
|
||||
customer_id: 'AI customer reference',
|
||||
},
|
||||
},
|
||||
{
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
tableDescription: 'AI customers table',
|
||||
columnDescriptions: {
|
||||
id: 'AI customer id',
|
||||
},
|
||||
},
|
||||
],
|
||||
embeddingUpdates: [
|
||||
{ columnId: 'public.orders.id', text: 'orders id', embedding: [0.1, 0.2] },
|
||||
{ columnId: 'public.orders.customer_id', text: 'orders customer_id', embedding: [0.3, 0.4] },
|
||||
],
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
source: 'inferred',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.95,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
relationshipProfile: {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
sqlAvailable: true,
|
||||
queryCount: 6,
|
||||
tables: [{ table: { catalog: null, db: 'public', name: 'customers' }, rowCount: 2 }],
|
||||
columns: {
|
||||
'customers.id': {
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
column: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 2,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
},
|
||||
warnings: [],
|
||||
},
|
||||
resolvedRelationships: [
|
||||
{
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
source: 'llm_proposal',
|
||||
status: 'accepted',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.92,
|
||||
pkScore: 0.95,
|
||||
fkScore: 0.91,
|
||||
score: 0.9,
|
||||
evidence: {
|
||||
sourceColumnBase: 'buyer',
|
||||
targetTableBase: 'customer',
|
||||
targetColumnBase: 'id',
|
||||
targetKeyScore: 0.88,
|
||||
nameScore: 0.45,
|
||||
reasons: ['llm_proposal', 'llm_pk_proposal'],
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Buyer reference values align with customer identifiers.',
|
||||
},
|
||||
validation: {
|
||||
targetUniqueness: 1,
|
||||
sourceCoverage: 1,
|
||||
violationCount: 0,
|
||||
violationRatio: 0,
|
||||
sourceNullRate: 0,
|
||||
targetNullRate: 0,
|
||||
childDistinct: 2,
|
||||
parentDistinct: 2,
|
||||
overlap: 2,
|
||||
checkedValues: 2,
|
||||
reasons: ['validation_passed'],
|
||||
},
|
||||
graph: {
|
||||
targetPkScore: 0.95,
|
||||
incomingCandidateCount: 1,
|
||||
conflictRank: 1,
|
||||
reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'],
|
||||
},
|
||||
},
|
||||
],
|
||||
compositeRelationships: null,
|
||||
};
|
||||
}
|
||||
|
||||
describe('writeLocalScanEnrichmentArtifacts', () => {
|
||||
let tempDir: string;
|
||||
let project: KtxLocalProject;
|
||||
|
||||
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'ktx-local-enrichment-artifacts-'));
|
||||
project = await initKtxProject({
|
||||
projectDir: join(tempDir, 'project'),
|
||||
});
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('writes enrichment artifacts and manifest shards while preserving external descriptions', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
YAML.stringify(
|
||||
{
|
||||
tables: {
|
||||
orders: {
|
||||
table: 'public.orders',
|
||||
descriptions: { user: 'Pinned analyst description', ai: 'Old AI description' },
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
type: 'number',
|
||||
descriptions: { user: 'Pinned id description', ai: 'Old AI id' },
|
||||
},
|
||||
{ name: 'customer_id', type: 'number' },
|
||||
],
|
||||
joins: [
|
||||
{
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
relationship: 'many_to_one',
|
||||
source: 'manual',
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
{ indent: 2, lineWidth: 0 },
|
||||
),
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Seed manifest shard',
|
||||
);
|
||||
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-1',
|
||||
driver: 'postgres',
|
||||
enrichment: enrichment(),
|
||||
dryRun: false,
|
||||
relationshipSettings: {
|
||||
enabled: true,
|
||||
llmProposals: false,
|
||||
validationRequiredForManifest: true,
|
||||
acceptThreshold: 0.91,
|
||||
reviewThreshold: 0.61,
|
||||
maxLlmTablesPerBatch: 12,
|
||||
maxCandidatesPerColumn: 7,
|
||||
profileSampleRows: 500,
|
||||
validationConcurrency: 2,
|
||||
},
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
enrichmentArtifacts: [
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/embeddings.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json',
|
||||
],
|
||||
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
|
||||
manifestShardsWritten: 1,
|
||||
});
|
||||
|
||||
await expect(
|
||||
readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json'),
|
||||
'utf-8',
|
||||
),
|
||||
).resolves.toContain('AI orders table');
|
||||
|
||||
const relationshipsRaw = await readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json'),
|
||||
'utf-8',
|
||||
);
|
||||
const relationshipsArtifact = JSON.parse(relationshipsRaw) as {
|
||||
accepted: Array<{
|
||||
id: string;
|
||||
status: string;
|
||||
source: string;
|
||||
pkScore: number;
|
||||
fkScore: number;
|
||||
evidence: unknown;
|
||||
reasons: string[];
|
||||
validation: unknown;
|
||||
graph: unknown;
|
||||
}>;
|
||||
review: unknown[];
|
||||
rejected: unknown[];
|
||||
skipped: unknown[];
|
||||
};
|
||||
expect(relationshipsArtifact.accepted).toHaveLength(1);
|
||||
expect(relationshipsArtifact.accepted[0]).toMatchObject({
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
status: 'accepted',
|
||||
source: 'llm_proposal',
|
||||
pkScore: 0.95,
|
||||
fkScore: 0.91,
|
||||
evidence: expect.objectContaining({
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Buyer reference values align with customer identifiers.',
|
||||
}),
|
||||
reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
|
||||
validation: expect.objectContaining({ reasons: ['validation_passed'] }),
|
||||
graph: expect.objectContaining({ reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'] }),
|
||||
});
|
||||
expect(relationshipsArtifact.review).toEqual([]);
|
||||
expect(relationshipsArtifact.rejected).toEqual([]);
|
||||
expect(relationshipsArtifact.skipped).toEqual([]);
|
||||
|
||||
const profileRaw = await readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(JSON.parse(profileRaw)).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
sqlAvailable: true,
|
||||
queryCount: 6,
|
||||
warnings: [],
|
||||
});
|
||||
|
||||
const diagnosticsRaw = await readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(JSON.parse(diagnosticsRaw)).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
summary: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
|
||||
noAcceptedReason: null,
|
||||
candidateCountsBySource: { llm_proposal: 1 },
|
||||
validation: { available: true, sqlAvailable: true, queryCount: 6 },
|
||||
thresholds: { acceptThreshold: 0.91, reviewThreshold: 0.61 },
|
||||
policy: {
|
||||
validationRequiredForManifest: true,
|
||||
maxCandidatesPerColumn: 7,
|
||||
profileSampleRows: 500,
|
||||
validationConcurrency: 2,
|
||||
},
|
||||
profileWarnings: [],
|
||||
});
|
||||
|
||||
const manifestRaw = await readFile(
|
||||
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
|
||||
'utf-8',
|
||||
);
|
||||
const manifest = YAML.parse(manifestRaw) as {
|
||||
tables: {
|
||||
orders: {
|
||||
descriptions: Record<string, string>;
|
||||
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
|
||||
joins: Array<{ to: string; on: string; source: string }>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
expect(manifest.tables.orders.descriptions).toEqual({
|
||||
user: 'Pinned analyst description',
|
||||
db: 'DB orders table',
|
||||
ai: 'AI orders table',
|
||||
});
|
||||
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
|
||||
user: 'Pinned id description',
|
||||
db: 'DB order id',
|
||||
ai: 'AI order id',
|
||||
});
|
||||
expect(manifest.tables.orders.joins).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
source: 'formal',
|
||||
}),
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
source: 'manual',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('writes formal accepted relationships into relationship artifacts and manifest shards', async () => {
|
||||
const source = enrichment();
|
||||
const formalEnrichment: KtxLocalScanEnrichmentResult = {
|
||||
...source,
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
source: 'formal',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 1,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
resolvedRelationships: [],
|
||||
compositeRelationships: null,
|
||||
};
|
||||
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
driver: 'sqlite',
|
||||
syncId: 'sync-formal',
|
||||
enrichment: formalEnrichment,
|
||||
relationshipSettings: {
|
||||
enabled: true,
|
||||
llmProposals: false,
|
||||
validationRequiredForManifest: true,
|
||||
acceptThreshold: 0.85,
|
||||
reviewThreshold: 0.55,
|
||||
maxLlmTablesPerBatch: 40,
|
||||
maxCandidatesPerColumn: 25,
|
||||
profileSampleRows: 10000,
|
||||
validationConcurrency: 4,
|
||||
},
|
||||
dryRun: false,
|
||||
});
|
||||
|
||||
const relationshipsPath = 'raw-sources/warehouse/live-database/sync-formal/enrichment/relationships.json';
|
||||
const relationships = JSON.parse((await project.fileStore.readFile(relationshipsPath)).content) as {
|
||||
accepted: Array<{ source: string; reasons: string[] }>;
|
||||
};
|
||||
expect(relationships.accepted).toEqual([
|
||||
expect.objectContaining({
|
||||
source: 'formal',
|
||||
reasons: ['formal_metadata_accepted'],
|
||||
}),
|
||||
]);
|
||||
|
||||
const manifestPath = result.manifestShards[0];
|
||||
if (!manifestPath) {
|
||||
throw new Error('Expected manifest shard path');
|
||||
}
|
||||
const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
|
||||
tables: { orders: { joins: Array<{ to: string; on: string; source: string }> } };
|
||||
};
|
||||
expect(manifest.tables.orders.joins).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
source: 'formal',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('writes manually applied relationship joins with manual source', async () => {
|
||||
const result = await writeLocalScanManifestShards({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-manual',
|
||||
driver: 'postgres',
|
||||
snapshot,
|
||||
dryRun: false,
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'public.orders:(public.orders.customer_id)->public.customers:(public.customers.id)',
|
||||
source: 'manual',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 1,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.manifestShardsWritten).toBe(1);
|
||||
const shard = YAML.parse(await readFile(join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml'), 'utf8'));
|
||||
expect(shard.tables.orders.joins).toContainEqual({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
relationship: 'many_to_one',
|
||||
source: 'manual',
|
||||
});
|
||||
});
|
||||
|
||||
it('does not persist generated error descriptions in manifest shards', async () => {
|
||||
await writeLocalScanManifestShards({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-error-description',
|
||||
driver: 'postgres',
|
||||
snapshot,
|
||||
descriptionUpdates: [
|
||||
{
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
tableDescription: 'Error generating description: timeout exceeded when trying to connect',
|
||||
columnDescriptions: {
|
||||
id: 'Error generating description: timeout exceeded when trying to connect',
|
||||
customer_id: 'AI customer reference',
|
||||
},
|
||||
},
|
||||
],
|
||||
dryRun: false,
|
||||
});
|
||||
|
||||
const shard = YAML.parse(
|
||||
await readFile(join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml'), 'utf8'),
|
||||
) as {
|
||||
tables: {
|
||||
orders: {
|
||||
descriptions?: Record<string, string>;
|
||||
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
expect(shard.tables.orders.descriptions).toEqual({ db: 'DB orders table' });
|
||||
expect(shard.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
|
||||
db: 'DB order id',
|
||||
});
|
||||
expect(shard.tables.orders.columns.find((column) => column.name === 'customer_id')?.descriptions).toEqual({
|
||||
db: 'DB customer id',
|
||||
ai: 'AI customer reference',
|
||||
});
|
||||
});
|
||||
|
||||
it('writes accepted composite relationships to relationship artifacts and manifest shards', async () => {
|
||||
const compositeSnapshot: KtxSchemaSnapshot = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
extractedAt: '2026-05-07T12:00:00.000Z',
|
||||
scope: { schemas: ['public'] },
|
||||
metadata: {},
|
||||
tables: [
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'order_lines',
|
||||
kind: 'table',
|
||||
comment: null,
|
||||
estimatedRows: 2,
|
||||
foreignKeys: [],
|
||||
columns: [
|
||||
{
|
||||
name: 'order_id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
{
|
||||
name: 'line_number',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'order_line_allocations',
|
||||
kind: 'table',
|
||||
comment: null,
|
||||
estimatedRows: 2,
|
||||
foreignKeys: [],
|
||||
columns: [
|
||||
{
|
||||
name: 'order_id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
{
|
||||
name: 'line_number',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
const compositeEnrichment: KtxLocalScanEnrichmentResult = Object.assign(enrichment(), {
|
||||
snapshot: compositeSnapshot,
|
||||
relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
|
||||
descriptionUpdates: [],
|
||||
embeddingUpdates: [],
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
|
||||
source: 'inferred',
|
||||
from: {
|
||||
tableId: 'public.order_line_allocations',
|
||||
columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_line_allocations' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.order_lines',
|
||||
columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_lines' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.95,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
resolvedRelationships: [],
|
||||
compositeRelationships: [
|
||||
{
|
||||
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
|
||||
source: 'composite_profile_match',
|
||||
status: 'accepted',
|
||||
from: {
|
||||
tableId: 'public.order_line_allocations',
|
||||
columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_line_allocations' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.order_lines',
|
||||
columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_lines' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.95,
|
||||
validation: {
|
||||
targetUniqueness: 1,
|
||||
sourceCoverage: 1,
|
||||
violationCount: 0,
|
||||
violationRatio: 0,
|
||||
childDistinct: 2,
|
||||
parentDistinct: 2,
|
||||
overlap: 2,
|
||||
reasons: ['composite_validation_passed'],
|
||||
},
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
syncId: 'sync-composite',
|
||||
enrichment: compositeEnrichment,
|
||||
relationshipSettings: {
|
||||
enabled: true,
|
||||
llmProposals: false,
|
||||
validationRequiredForManifest: true,
|
||||
acceptThreshold: 0.85,
|
||||
reviewThreshold: 0.55,
|
||||
maxLlmTablesPerBatch: 40,
|
||||
maxCandidatesPerColumn: 25,
|
||||
profileSampleRows: 10000,
|
||||
validationConcurrency: 4,
|
||||
},
|
||||
dryRun: false,
|
||||
});
|
||||
|
||||
const relationships = JSON.parse(
|
||||
(await project.fileStore.readFile('raw-sources/warehouse/live-database/sync-composite/enrichment/relationships.json'))
|
||||
.content,
|
||||
) as { accepted: Array<{ from: { columns: string[] }; to: { columns: string[] }; reasons: string[] }> };
|
||||
expect(relationships.accepted[0]).toMatchObject({
|
||||
from: { columns: ['order_id', 'line_number'] },
|
||||
to: { columns: ['order_id', 'line_number'] },
|
||||
reasons: ['composite_validation_passed'],
|
||||
});
|
||||
|
||||
const manifestPath = result.manifestShards[0];
|
||||
if (!manifestPath) {
|
||||
throw new Error('Expected manifest shard path');
|
||||
}
|
||||
const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
|
||||
tables: { order_line_allocations: { joins: Array<{ to: string; on: string; source: string }> } };
|
||||
};
|
||||
expect(manifest.tables.order_line_allocations.joins).toEqual([
|
||||
{
|
||||
to: 'order_lines',
|
||||
on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
|
||||
relationship: 'many_to_one',
|
||||
source: 'inferred',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('writes structural manifest shards without enrichment artifacts', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
YAML.stringify(
|
||||
{
|
||||
tables: {
|
||||
orders: {
|
||||
table: 'public.orders',
|
||||
descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
|
||||
usage: {
|
||||
narrative: 'Orders are commonly filtered by lifecycle status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
ownerNote: 'Preserve analyst note',
|
||||
},
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
type: 'number',
|
||||
descriptions: { user: 'Pinned structural id', ai: 'Old generated id' },
|
||||
},
|
||||
{ name: 'customer_id', type: 'number' },
|
||||
],
|
||||
joins: [
|
||||
{
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
relationship: 'many_to_one',
|
||||
source: 'manual',
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
{ indent: 2, lineWidth: 0 },
|
||||
),
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Seed structural manifest shard',
|
||||
);
|
||||
|
||||
const result = await writeLocalScanManifestShards({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-structural-1',
|
||||
driver: 'postgres',
|
||||
snapshot,
|
||||
dryRun: false,
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
|
||||
manifestShardsWritten: 1,
|
||||
});
|
||||
|
||||
await expect(
|
||||
readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-structural-1/enrichment/descriptions.json'),
|
||||
'utf-8',
|
||||
),
|
||||
).rejects.toMatchObject({ code: 'ENOENT' });
|
||||
|
||||
const manifestRaw = await readFile(
|
||||
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
|
||||
'utf-8',
|
||||
);
|
||||
const manifest = YAML.parse(manifestRaw) as {
|
||||
tables: {
|
||||
orders: {
|
||||
descriptions: Record<string, string>;
|
||||
usage?: Record<string, unknown>;
|
||||
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
|
||||
joins: Array<{ to: string; on: string; source: string }>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
expect(manifest.tables.orders.descriptions).toEqual({
|
||||
user: 'Pinned structural description',
|
||||
db: 'DB orders table',
|
||||
});
|
||||
expect(manifest.tables.orders.usage).toEqual({
|
||||
narrative: 'Orders are commonly filtered by lifecycle status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
ownerNote: 'Preserve analyst note',
|
||||
});
|
||||
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
|
||||
user: 'Pinned structural id',
|
||||
db: 'DB order id',
|
||||
});
|
||||
expect(manifest.tables.orders.joins).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
source: 'formal',
|
||||
}),
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
source: 'manual',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('returns planned empty paths without writing files during dry runs', async () => {
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-dry-run',
|
||||
driver: 'postgres',
|
||||
enrichment: enrichment(),
|
||||
dryRun: true,
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
enrichmentArtifacts: [],
|
||||
manifestShards: [],
|
||||
manifestShardsWritten: 0,
|
||||
});
|
||||
await expect(
|
||||
readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-dry-run/enrichment/descriptions.json'),
|
||||
'utf-8',
|
||||
),
|
||||
).rejects.toMatchObject({ code: 'ENOENT' });
|
||||
});
|
||||
});
|
||||
425
packages/cli/src/context/scan/local-enrichment-artifacts.ts
Normal file
425
packages/cli/src/context/scan/local-enrichment-artifacts.ts
Normal file
|
|
@ -0,0 +1,425 @@
|
|||
import YAML from 'yaml';
|
||||
import { buildLiveDatabaseManifestShards, type LiveDatabaseManifestExistingDescriptions, type LiveDatabaseManifestJoinData, type LiveDatabaseManifestJoinEntry, type LiveDatabaseManifestShard, type LiveDatabaseManifestTableData } from '../../context/ingest/adapters/live-database/manifest.js';
|
||||
import type { TableUsageOutput } from '../../context/ingest/adapters/historic-sql/skill-schemas.js';
|
||||
import type { KtxScanRelationshipConfig } from '../project/config.js';
|
||||
import type { KtxLocalProject } from '../../context/project/project.js';
|
||||
import type { KtxLocalScanEnrichmentResult } from './local-enrichment.js';
|
||||
import {
|
||||
buildKtxRelationshipArtifacts,
|
||||
buildKtxRelationshipDiagnostics,
|
||||
emptyKtxRelationshipProfileArtifact,
|
||||
} from './relationship-diagnostics.js';
|
||||
import type { KtxConnectionDriver, KtxSchemaColumn, KtxSchemaSnapshot, KtxSchemaTable } from './types.js';
|
||||
|
||||
// Path segment naming the adapter inside raw-source artifact directories.
const LIVE_DATABASE_ADAPTER = 'live-database';

// Author identity for locally written files.
// NOTE(review): the write calls that consume these are outside this excerpt — confirm usage.
const LOCAL_AUTHOR = 'ktx';
const LOCAL_AUTHOR_EMAIL = 'ktx@example.com';

// Directory name holding manifest schema shards under a connection's semantic-layer folder.
const SCHEMA_DIR = '_schema';

// Root folder of the semantic layer inside the project.
const SL_DIR_PREFIX = 'semantic-layer';
|
||||
|
||||
/** Options for writing semantic-layer manifest shards from a schema snapshot. */
export interface WriteLocalScanManifestShardsInput {
  /** Local ktx project the shards belong to. */
  project: KtxLocalProject;
  /** Connection identifier; it appears as a segment of the shard paths. */
  connectionId: string;
  /** Identifier of the sync run that produced the snapshot. */
  syncId: string;
  /** Driver of the scanned connection. */
  driver: KtxConnectionDriver;
  /** Schema snapshot the shards are derived from. */
  snapshot: KtxSchemaSnapshot;
  /** When true, the write is a dry run. */
  dryRun: boolean;
  /** Optional generated table/column descriptions to merge into the shards. */
  descriptionUpdates?: KtxLocalScanEnrichmentResult['descriptionUpdates'];
  /** Optional relationship decisions to turn into joins. */
  relationshipUpdate?: KtxLocalScanEnrichmentResult['relationshipUpdate'];
}
|
||||
|
||||
/** Outcome of a manifest shard write. */
export interface WriteLocalScanManifestShardsResult {
  /** Project-relative paths of the manifest shards. */
  manifestShards: string[];
  /** Number of manifest shards written. */
  manifestShardsWritten: number;
}
|
||||
|
||||
/** Options for writing enrichment artifacts (and the manifest shards derived from them). */
export interface WriteLocalScanEnrichmentArtifactsInput {
  /** Local ktx project the artifacts belong to. */
  project: KtxLocalProject;
  /** Connection identifier; it appears as a segment of the artifact paths. */
  connectionId: string;
  /** Identifier of the sync run; it appears as a segment of the artifact paths. */
  syncId: string;
  /** Driver of the scanned connection. */
  driver: KtxConnectionDriver;
  /** Enrichment result to persist. */
  enrichment: KtxLocalScanEnrichmentResult;
  /** When true, the write is a dry run. */
  dryRun: boolean;
  /** Optional relationship configuration for the scan. */
  relationshipSettings?: KtxScanRelationshipConfig;
}
|
||||
|
||||
/** Outcome of an enrichment artifact write: the manifest shard result plus artifact paths. */
export interface WriteLocalScanEnrichmentArtifactsResult extends WriteLocalScanManifestShardsResult {
  /** Project-relative paths of the enrichment artifact files. */
  enrichmentArtifacts: string[];
}
|
||||
|
||||
/**
 * Data carried over from manifest shards that already exist.
 *
 * NOTE(review): the key format of each map is set by the code that fills it,
 * which is outside this excerpt — confirm before relying on it.
 */
interface ExistingManifestState {
  /** Previously stored descriptions. */
  descriptions: Map<string, LiveDatabaseManifestExistingDescriptions>;
  /** Join entries to keep from the existing shards. */
  preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>;
  /** Previously stored table usage data. */
  usage: Map<string, TableUsageOutput>;
}
|
||||
|
||||
/** Type of the `descriptionUpdates` field of a local scan enrichment result. */
type LocalDescriptionUpdates = KtxLocalScanEnrichmentResult['descriptionUpdates'];
|
||||
|
||||
/**
 * Tells whether a description is a generation-failure placeholder rather than
 * real content.
 *
 * The text is trimmed and lower-cased, then treated as a placeholder when it
 * equals `failed to generate description` or starts with
 * `error generating description:`.
 *
 * @param description - Candidate description; `null`/`undefined` are never placeholders.
 * @returns `true` for a placeholder, `false` otherwise.
 */
function isGeneratedErrorDescription(description: string | null | undefined): boolean {
  if (description == null) {
    return false;
  }

  const text = description.trim().toLowerCase();
  if (text === 'failed to generate description') {
    return true;
  }
  return text.startsWith('error generating description:');
}
|
||||
|
||||
/**
 * Builds the directory path under which the enrichment artifacts of one sync are stored.
 *
 * @param connectionId - Connection identifier used as a path segment.
 * @param syncId - Sync identifier used as a path segment.
 * @returns `raw-sources/<connectionId>/<adapter>/<syncId>/enrichment`.
 */
function artifactDir(connectionId: string, syncId: string): string {
  const segments = ['raw-sources', connectionId, LIVE_DATABASE_ADAPTER, syncId, 'enrichment'];
  return segments.join('/');
}
|
||||
|
||||
/**
 * Builds the directory path that holds the manifest shards of a connection.
 *
 * @param connectionId - Connection identifier used as the middle path segment.
 * @returns `<SL_DIR_PREFIX>/<connectionId>/<SCHEMA_DIR>`.
 */
function schemaDir(connectionId: string): string {
  return [SL_DIR_PREFIX, connectionId, SCHEMA_DIR].join('/');
}
|
||||
|
||||
/**
 * Collects the descriptions available for a table, keyed by origin.
 *
 * `db` carries the table comment from the snapshot. `ai` carries the generated
 * description of the first update whose table name matches, unless that text
 * is a generation-failure placeholder.
 *
 * @param table - Snapshot table being described.
 * @param descriptionUpdates - Generated descriptions; defaults to none.
 * @returns The collected descriptions, or `undefined` when there are none.
 */
function tableDescription(
  table: KtxSchemaTable,
  descriptionUpdates: LocalDescriptionUpdates = [],
): Record<string, string> | undefined {
  const matched = descriptionUpdates.find((candidate) => candidate.table.name === table.name);
  const generated = matched?.tableDescription;
  const useGenerated = Boolean(generated) && !isGeneratedErrorDescription(generated);

  const collected: Record<string, string> = {
    ...(table.comment ? { db: table.comment } : {}),
    ...(generated && useGenerated ? { ai: generated } : {}),
  };
  return Object.keys(collected).length === 0 ? undefined : collected;
}
|
||||
|
||||
/**
 * Collects the descriptions available for a column, keyed by origin.
 *
 * `db` carries the column comment from the snapshot. `ai` carries the generated
 * description stored under the column's name in the first update whose table
 * name matches, unless that text is a generation-failure placeholder.
 *
 * @param table - Snapshot table that owns the column.
 * @param column - Snapshot column being described.
 * @param descriptionUpdates - Generated descriptions; defaults to none.
 * @returns The collected descriptions, or `undefined` when there are none.
 */
function columnDescription(
  table: KtxSchemaTable,
  column: KtxSchemaColumn,
  descriptionUpdates: LocalDescriptionUpdates = [],
): Record<string, string> | undefined {
  const matched = descriptionUpdates.find((candidate) => candidate.table.name === table.name);
  const generated = matched?.columnDescriptions[column.name] ?? null;
  const useGenerated = Boolean(generated) && !isGeneratedErrorDescription(generated);

  const collected: Record<string, string> = {
    ...(column.comment ? { db: column.comment } : {}),
    ...(generated && useGenerated ? { ai: generated } : {}),
  };
  return Object.keys(collected).length === 0 ? undefined : collected;
}
|
||||
|
||||
/**
 * Converts the tables of a schema snapshot into manifest table data.
 *
 * Columns expose their `dimensionType` as `type`. `pk` is emitted only for
 * primary-key columns and `nullable` only when it is explicitly `false`.
 *
 * @param snapshot - Snapshot whose tables are converted.
 * @param descriptionUpdates - Generated descriptions merged in per table and column.
 * @returns One manifest table entry per snapshot table, in snapshot order.
 */
function snapshotTablesToManifestData(
  snapshot: KtxSchemaSnapshot,
  descriptionUpdates: LocalDescriptionUpdates = [],
): LiveDatabaseManifestTableData[] {
  return snapshot.tables.map((table) => {
    const descriptions = tableDescription(table, descriptionUpdates);
    const columns = table.columns.map((column) => {
      const primaryKeyFlag = column.primaryKey ? { pk: true } : {};
      const nullableFlag = column.nullable === false ? { nullable: false } : {};
      return {
        name: column.name,
        type: column.dimensionType,
        ...primaryKeyFlag,
        ...nullableFlag,
        descriptions: columnDescription(table, column, descriptionUpdates),
      };
    });
    return {
      name: table.name,
      catalog: table.catalog,
      db: table.db,
      descriptions,
      columns,
    };
  });
}
|
||||
|
||||
/**
 * Turns every foreign key declared in the snapshot into a manifest join.
 *
 * Each foreign key yields a single-column `many_to_one` join with source `formal`.
 *
 * @param snapshot - Snapshot whose table foreign keys are read.
 * @returns Joins in table order, then foreign-key order within each table.
 */
function formalJoins(snapshot: KtxSchemaSnapshot): LiveDatabaseManifestJoinData[] {
  return snapshot.tables.flatMap((table) =>
    table.foreignKeys.map(
      (foreignKey): LiveDatabaseManifestJoinData => ({
        fromTable: table.name,
        fromColumns: [foreignKey.fromColumn],
        toTable: foreignKey.toTable,
        toColumns: [foreignKey.toColumn],
        relationship: 'many_to_one',
        source: 'formal',
      }),
    ),
  );
}
|
||||
|
||||
/**
 * Maps the accepted relationships of an update to manifest joins.
 *
 * @param relationshipUpdate - Relationship update; may be absent.
 * @returns One join per accepted relationship, or an empty list when there is no update.
 */
function acceptedRelationshipJoins(
  relationshipUpdate: KtxLocalScanEnrichmentResult['relationshipUpdate'] | undefined,
): LiveDatabaseManifestJoinData[] {
  const joins: LiveDatabaseManifestJoinData[] = [];
  for (const accepted of relationshipUpdate?.accepted ?? []) {
    const { from, to } = accepted;
    joins.push({
      fromTable: from.table.name,
      fromColumns: from.columns,
      toTable: to.table.name,
      toColumns: to.columns,
      relationship: accepted.relationshipType,
      source: accepted.source,
    });
  }
  return joins;
}
|
||||
|
||||
/**
 * Assembles the full join list handed to the manifest builder.
 *
 * The result is ordered as: accepted joins with source `manual`, then joins
 * derived from snapshot foreign keys, then the remaining accepted joins.
 *
 * @param snapshot - Snapshot supplying foreign-key joins.
 * @param relationshipUpdate - Relationship update supplying accepted joins; may be absent.
 * @returns The combined, ordered join list.
 */
function relationshipJoins(
  snapshot: KtxSchemaSnapshot,
  relationshipUpdate: KtxLocalScanEnrichmentResult['relationshipUpdate'] | undefined,
): LiveDatabaseManifestJoinData[] {
  const manualJoins: LiveDatabaseManifestJoinData[] = [];
  const generatedJoins: LiveDatabaseManifestJoinData[] = [];
  for (const join of acceptedRelationshipJoins(relationshipUpdate)) {
    if (join.source === 'manual') {
      manualJoins.push(join);
    } else {
      generatedJoins.push(join);
    }
  }
  return manualJoins.concat(formalJoins(snapshot), generatedJoins);
}
|
||||
|
||||
/**
 * Indexes the snapshot's column names by table name.
 *
 * @param snapshot - Snapshot whose tables and columns are indexed.
 * @returns A map from table name to the set of that table's column names.
 */
function validColumns(snapshot: KtxSchemaSnapshot): Map<string, Set<string>> {
  const columnsByTable = new Map<string, Set<string>>();
  for (const table of snapshot.tables) {
    const names = new Set<string>();
    for (const column of table.columns) {
      names.add(column.name);
    }
    columnsByTable.set(table.name, names);
  }
  return columnsByTable;
}
|
||||
|
||||
/**
 * Checks whether the columns named in a join's `on` clause still exist.
 *
 * The clause is trimmed, split on `AND` (case-insensitive), and each term is
 * expected to have the shape `table.column = table.column`. A term that does
 * not have that shape cannot be verified and is skipped; every remaining term
 * is still checked, so the verdict does not depend on term order. A table that
 * is absent from `columnsByTable` is likewise treated as unverifiable rather
 * than invalid.
 *
 * @param join - Manifest join entry whose `on` clause is inspected.
 * @param columnsByTable - Known column names keyed by table name.
 * @returns `false` when any verifiable term names a column missing from a
 *   known table; `true` otherwise.
 */
function joinReferencesExistingColumns(
  join: LiveDatabaseManifestJoinEntry,
  columnsByTable: Map<string, Set<string>>,
): boolean {
  const termPattern = /^(\w+)\.(\w+)\s*=\s*(\w+)\.(\w+)$/u;

  for (const term of join.on.trim().split(/\s+AND\s+/iu)) {
    const match = termPattern.exec(term);
    if (!match) {
      // Unverifiable term: skip it, but keep validating the rest of the clause.
      continue;
    }
    const [, leftTable, leftColumn, rightTable, rightColumn] = match;
    if (!leftTable || !leftColumn || !rightTable || !rightColumn) {
      continue;
    }
    // `=== false` only fires for a known table that lacks the column.
    const leftMissing = columnsByTable.get(leftTable)?.has(leftColumn) === false;
    const rightMissing = columnsByTable.get(rightTable)?.has(rightColumn) === false;
    if (leftMissing || rightMissing) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Reads the manifest shards already stored for a connection and extracts the
 * state that should survive a rewrite.
 *
 * Only tables present in `snapshot` are considered. Joins are kept when their
 * source is `manual` or `inferred`, their target table is still in the
 * snapshot, and their `on` clause passes `joinReferencesExistingColumns`.
 *
 * Loading is best-effort: if the shard directory cannot be listed, empty state
 * is returned; if a shard fails while being read or processed, the rest of
 * that shard is skipped, and entries recorded before the failure are kept.
 *
 * @param project - Project whose `fileStore` is read.
 * @param connectionId - Connection whose shard directory is scanned.
 * @param snapshot - Current snapshot used to filter tables, joins and columns.
 * @returns Descriptions, preserved joins and usage, each keyed by table name.
 */
async function loadExistingManifestState(
  project: KtxLocalProject,
  connectionId: string,
  snapshot: KtxSchemaSnapshot,
): Promise<ExistingManifestState> {
  const state: ExistingManifestState = {
    descriptions: new Map<string, LiveDatabaseManifestExistingDescriptions>(),
    preservedJoins: new Map<string, LiveDatabaseManifestJoinEntry[]>(),
    usage: new Map<string, TableUsageOutput>(),
  };
  const knownTables = new Set(snapshot.tables.map((table) => table.name));
  const columnsByTable = validColumns(snapshot);

  let shardFiles: string[];
  try {
    const listing = await project.fileStore.listFiles(schemaDir(connectionId));
    shardFiles = listing.files.filter((file) => file.endsWith('.yaml'));
  } catch {
    return state;
  }

  for (const shardFile of shardFiles) {
    try {
      const { content } = await project.fileStore.readFile(shardFile);
      const shard = YAML.parse(content) as LiveDatabaseManifestShard | null;
      if (!shard?.tables) {
        continue;
      }

      for (const [tableName, entry] of Object.entries(shard.tables)) {
        if (!knownTables.has(tableName)) {
          continue;
        }

        state.descriptions.set(tableName, {
          table: entry.descriptions ? { ...entry.descriptions } : undefined,
          columns: new Map(
            (entry.columns ?? []).flatMap((column) =>
              column.descriptions ? ([[column.name, { ...column.descriptions }]] as const) : [],
            ),
          ),
        });

        if (entry.usage) {
          state.usage.set(tableName, { ...entry.usage });
        }

        const keptJoins = (entry.joins ?? []).filter((join) => {
          if (join.source !== 'manual' && join.source !== 'inferred') {
            return false;
          }
          if (!knownTables.has(join.to)) {
            return false;
          }
          return joinReferencesExistingColumns(join, columnsByTable);
        });
        if (keptJoins.length > 0) {
          state.preservedJoins.set(tableName, keptJoins);
        }
      }
    } catch {
      // Best-effort: a shard that cannot be read or processed is skipped.
      continue;
    }
  }

  return state;
}
|
||||
|
||||
/**
 * Writes a value as pretty-printed JSON through the project's file store.
 *
 * The content is indented by two spaces and ends with a newline; the write is
 * attributed to the module's local author.
 *
 * @param project - Project whose `fileStore` performs the write.
 * @param path - Destination path of the artifact.
 * @param value - Value to serialize.
 * @param commitMessage - Message passed along with the write.
 */
async function writeJsonArtifact(
  project: KtxLocalProject,
  path: string,
  value: unknown,
  commitMessage: string,
): Promise<void> {
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await project.fileStore.writeFile(path, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);
}
|
||||
|
||||
/**
 * Builds the manifest shards for a snapshot and writes them as YAML files.
 *
 * State from shards already on disk (descriptions, preserved joins, usage) is
 * loaded first and handed to the shard builder. Shards are written one at a
 * time, ordered by shard key.
 *
 * @param input - Project, connection, snapshot and optional enrichment updates.
 * @returns Paths of the written shards and their count; empty when `dryRun` is set.
 */
export async function writeLocalScanManifestShards(
  input: WriteLocalScanManifestShardsInput,
): Promise<WriteLocalScanManifestShardsResult> {
  if (input.dryRun) {
    return { manifestShards: [], manifestShardsWritten: 0 };
  }

  const { project, connectionId, syncId, snapshot } = input;
  const existing = await loadExistingManifestState(project, connectionId, snapshot);
  const built = buildLiveDatabaseManifestShards({
    connectionType: input.driver.toUpperCase(),
    tables: snapshotTablesToManifestData(snapshot, input.descriptionUpdates),
    joins: relationshipJoins(snapshot, input.relationshipUpdate),
    existingDescriptions: existing.descriptions,
    existingPreservedJoins: existing.preservedJoins,
    existingUsage: existing.usage,
    mapColumnType: (dimensionType) => dimensionType,
  });

  const orderedShards = Array.from(built.shards.entries()).sort(([leftKey], [rightKey]) =>
    leftKey.localeCompare(rightKey),
  );
  const shardDirectory = schemaDir(connectionId);
  const manifestShards: string[] = [];

  for (const [shardKey, shard] of orderedShards) {
    const shardPath = `${shardDirectory}/${shardKey}.yaml`;
    const yamlText = YAML.stringify(shard, { indent: 2, lineWidth: 0, version: '1.1' });
    const message = `scan(${LIVE_DATABASE_ADAPTER}): write manifest shard ${shardKey} syncId=${syncId}`;
    await project.fileStore.writeFile(shardPath, yamlText, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, message);
    manifestShards.push(shardPath);
  }

  return { manifestShards, manifestShardsWritten: manifestShards.length };
}
|
||||
|
||||
/**
 * Writes the enrichment artifacts of a scan and then the manifest shards.
 *
 * Artifacts are written in this order, each with its own write call:
 * descriptions (when table or column descriptions completed), embeddings (when
 * embeddings completed), then always relationships, relationship profile and
 * relationship diagnostics. When the enrichment carries no relationship update
 * or profile, empty stand-ins are written instead.
 *
 * @param input - Project, connection, enrichment result and optional relationship settings.
 * @returns Paths of the written artifacts plus the manifest shard result;
 *   everything is empty when `dryRun` is set.
 */
export async function writeLocalScanEnrichmentArtifacts(
  input: WriteLocalScanEnrichmentArtifactsInput,
): Promise<WriteLocalScanEnrichmentArtifactsResult> {
  if (input.dryRun) {
    return { enrichmentArtifacts: [], manifestShards: [], manifestShardsWritten: 0 };
  }

  const { project, connectionId, syncId, driver, enrichment, relationshipSettings: settings } = input;
  const root = artifactDir(connectionId, syncId);
  const paths = {
    descriptions: `${root}/descriptions.json`,
    embeddings: `${root}/embeddings.json`,
    relationships: `${root}/relationships.json`,
    relationshipProfile: `${root}/relationship-profile.json`,
    relationshipDiagnostics: `${root}/relationship-diagnostics.json`,
  };
  const messageFor = (subject: string): string =>
    `scan(${LIVE_DATABASE_ADAPTER}): write ${subject} syncId=${syncId}`;
  const enrichmentArtifacts: string[] = [];

  const { summary } = enrichment;
  const descriptionsCompleted =
    summary.tableDescriptions === 'completed' || summary.columnDescriptions === 'completed';
  if (descriptionsCompleted) {
    enrichmentArtifacts.push(paths.descriptions);
    await writeJsonArtifact(
      project,
      paths.descriptions,
      enrichment.descriptionUpdates,
      messageFor('enrichment descriptions'),
    );
  }
  if (summary.embeddings === 'completed') {
    enrichmentArtifacts.push(paths.embeddings);
    await writeJsonArtifact(
      project,
      paths.embeddings,
      enrichment.embeddingUpdates,
      messageFor('enrichment embeddings'),
    );
  }

  // The three relationship artifacts are always produced.
  enrichmentArtifacts.push(paths.relationships, paths.relationshipProfile, paths.relationshipDiagnostics);

  const resolved = enrichment.resolvedRelationships;
  const relationshipArtifacts = buildKtxRelationshipArtifacts({
    connectionId,
    // An explicit `null` is forwarded as "not provided".
    resolvedRelationships: resolved === null ? undefined : (resolved ?? []),
    compositeRelationships: enrichment.compositeRelationships ?? undefined,
    relationshipUpdate: enrichment.relationshipUpdate ?? {
      connectionId,
      accepted: [],
      rejected: [],
      skipped: [],
    },
  });
  const relationshipProfile =
    enrichment.relationshipProfile ??
    emptyKtxRelationshipProfileArtifact({
      connectionId,
      driver,
      reason: 'relationship_profiling_not_run',
    });
  const thresholds = settings
    ? {
        acceptThreshold: settings.acceptThreshold,
        reviewThreshold: settings.reviewThreshold,
      }
    : undefined;
  const policy = settings
    ? {
        validationRequiredForManifest: settings.validationRequiredForManifest,
        maxCandidatesPerColumn: settings.maxCandidatesPerColumn,
        profileSampleRows: settings.profileSampleRows,
        validationConcurrency: settings.validationConcurrency,
      }
    : undefined;
  const relationshipDiagnostics = buildKtxRelationshipDiagnostics({
    connectionId,
    artifacts: relationshipArtifacts,
    profile: relationshipProfile,
    warnings: enrichment.warnings,
    thresholds,
    policy,
  });

  await writeJsonArtifact(
    project,
    paths.relationships,
    relationshipArtifacts,
    messageFor('enrichment relationships'),
  );
  await writeJsonArtifact(
    project,
    paths.relationshipProfile,
    relationshipProfile,
    messageFor('relationship profile'),
  );
  await writeJsonArtifact(
    project,
    paths.relationshipDiagnostics,
    relationshipDiagnostics,
    messageFor('relationship diagnostics'),
  );

  const { manifestShards, manifestShardsWritten } = await writeLocalScanManifestShards({
    project,
    connectionId,
    syncId,
    driver,
    snapshot: enrichment.snapshot,
    descriptionUpdates: enrichment.descriptionUpdates,
    relationshipUpdate: enrichment.relationshipUpdate,
    dryRun: false,
  });

  return { enrichmentArtifacts, manifestShards, manifestShardsWritten };
}
|
||||
815
packages/cli/src/context/scan/local-enrichment.test.ts
Normal file
815
packages/cli/src/context/scan/local-enrichment.test.ts
Normal file
|
|
@ -0,0 +1,815 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { buildDefaultKtxProjectConfig } from '../project/config.js';
|
||||
import type {
|
||||
KtxScanEnrichmentCompletedStage,
|
||||
KtxScanEnrichmentFailedStage,
|
||||
KtxScanEnrichmentStageLookup,
|
||||
KtxScanEnrichmentStateStore,
|
||||
} from './enrichment-state.js';
|
||||
import {
|
||||
createDeterministicLocalScanEnrichmentProviders,
|
||||
runLocalScanEnrichment,
|
||||
snapshotToKtxEnrichedSchema,
|
||||
} from './local-enrichment.js';
|
||||
import {
|
||||
createKtxConnectorCapabilities,
|
||||
type KtxQueryResult,
|
||||
type KtxReadOnlyQueryInput,
|
||||
type KtxEmbeddingPort,
|
||||
type KtxScanConnector,
|
||||
type KtxScanContext,
|
||||
type KtxSchemaSnapshot,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Creates a deterministic embedding port for tests.
 *
 * The vector for the text at index `i` is `[i, i + 1, ..., i + dimensions - 1]`;
 * the text content itself is ignored.
 *
 * @param options - Vector size and optional batch limit (64 when omitted).
 * @returns An embedding port producing index-derived vectors.
 */
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
  const vectorAt = (textIndex: number): number[] =>
    Array.from({ length: options.dimensions }, (_unused, offset) => textIndex + offset);

  return {
    dimensions: options.dimensions,
    maxBatchSize: options.maxBatchSize ?? 64,
    async embedBatch(texts) {
      return texts.map((_text, textIndex) => vectorAt(textIndex));
    },
  };
}
|
||||
|
||||
/**
 * Shared postgres fixture: `public.customers` (id) and
 * `public.orders` (id, customer_id), with comments and no declared foreign keys.
 */
const snapshot: KtxSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    // customers: single primary-key column.
    {
      catalog: null,
      db: 'public',
      name: 'customers',
      kind: 'table',
      comment: 'Customer accounts',
      estimatedRows: 2,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Customer id',
        },
      ],
    },
    // orders: primary key plus an undeclared reference to customers.
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: 3,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Order id',
        },
        {
          name: 'customer_id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: 'Customer id',
        },
      ],
    },
  ],
};
|
||||
|
||||
/**
 * Creates a mock postgres scan connector backed by the shared `snapshot` fixture.
 *
 * It advertises table sampling, column sampling, read-only SQL and column
 * stats, but defines no `executeReadOnly` method. Every call returns fresh
 * `vi.fn` mocks.
 */
function connector(): KtxScanConnector {
  const capabilities = createKtxConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    readOnlySql: true,
    columnStats: true,
  });

  return {
    id: 'test:warehouse',
    driver: 'postgres',
    capabilities,
    introspect: vi.fn(async () => snapshot),
    // Fixed one-row table sample.
    sampleTable: vi.fn(async () => ({
      headers: ['id', 'customer_id'],
      rows: [[1, 10]],
      totalRows: 1,
    })),
    // Fixed two-value column sample.
    sampleColumn: vi.fn(async () => ({
      values: ['10', '11'],
      nullCount: 0,
      distinctCount: 2,
    })),
  };
}
|
||||
|
||||
/**
 * Test helper that answers read-only queries from an in-memory SQLite database.
 *
 * Tests seed data through `db` directly and must call `close()` when done.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');

  /**
   * Runs `input.sql` and returns the rows in header/row form.
   *
   * Headers are taken from the keys of the first row, so an empty result has
   * no headers. The method is `async` so that a failure while preparing or
   * running the statement surfaces as a rejected promise instead of a
   * synchronous throw.
   *
   * @param input - Query input; only `sql` is read.
   * @param _ctx - Scan context; unused.
   * @returns Headers, row values in header order, and the row count
   *   (reported as both `totalRows` and `rowCount`).
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const rows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(rows[0] ?? {});
    const result = {
      headers,
      rows: rows.map((row) => headers.map((header) => row[header])),
      totalRows: rows.length,
      rowCount: rows.length,
    };
    return result;
  }

  /** Closes the underlying in-memory database. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Builds a fresh sqlite snapshot with `accounts` (id) and `orders`
 * (id, account_id) that declares no primary keys, foreign keys or comments.
 *
 * @returns A new snapshot object on every call.
 */
function noDeclaredRelationshipSnapshot(): KtxSchemaSnapshot {
  type FixtureTable = KtxSchemaSnapshot['tables'][number];
  type FixtureColumn = FixtureTable['columns'][number];

  const integerColumn = (name: string): FixtureColumn => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const plainTable = (name: string, estimatedRows: number, columnNames: string[]): FixtureTable => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns: columnNames.map(integerColumn),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [plainTable('accounts', 2, ['id']), plainTable('orders', 3, ['id', 'account_id'])],
  };
}
|
||||
|
||||
/**
 * Creates an in-memory enrichment state store for tests.
 *
 * Each run/stage pair holds one record; saving again for the same pair
 * replaces the previous record, whether it was completed or failed.
 */
function memoryEnrichmentStateStore(): KtxScanEnrichmentStateStore {
  type StoredStage = KtxScanEnrichmentCompletedStage | KtxScanEnrichmentFailedStage;

  const stagesByKey = new Map<string, StoredStage>();
  const keyOf = ({ runId, stage }: Pick<KtxScanEnrichmentStageLookup, 'runId' | 'stage'>) =>
    `${runId}:${stage}`;

  return {
    // A hit requires a completed record whose input hash matches the lookup.
    async findCompletedStage<TOutput>(input: KtxScanEnrichmentStageLookup) {
      const stored = stagesByKey.get(keyOf(input));
      if (stored?.status === 'completed' && stored.inputHash === input.inputHash) {
        return stored as KtxScanEnrichmentCompletedStage<TOutput>;
      }
      return null;
    },
    async saveCompletedStage(input) {
      const completed: KtxScanEnrichmentCompletedStage = {
        ...input,
        status: 'completed',
        errorMessage: null,
      };
      stagesByKey.set(keyOf(input), completed);
    },
    async saveFailedStage(input) {
      const failed: KtxScanEnrichmentFailedStage = {
        ...input,
        status: 'failed',
        output: null,
      };
      stagesByKey.set(keyOf(input), failed);
    },
    async listRunStages(runId) {
      const stages: StoredStage[] = [];
      for (const stored of stagesByKey.values()) {
        if (stored.runId === runId) {
          stages.push(stored);
        }
      }
      return stages;
    },
  };
}
|
||||
|
||||
describe('local scan enrichment', () => {
|
||||
it('maps a scan snapshot into relationship detector schema', () => {
|
||||
const schema = snapshotToKtxEnrichedSchema(snapshot);
|
||||
|
||||
expect(schema.connectionId).toBe('warehouse');
|
||||
expect(schema.tables).toHaveLength(2);
|
||||
expect(schema.tables[1]?.columns.map((column) => column.name)).toEqual(['id', 'customer_id']);
|
||||
expect(schema.tables[1]?.columns[1]).toMatchObject({
|
||||
id: 'public.orders.customer_id',
|
||||
tableId: 'public.orders',
|
||||
primaryKey: false,
|
||||
sampleValues: null,
|
||||
embedding: null,
|
||||
});
|
||||
});
|
||||
|
||||
it('maps snapshot foreign keys into formal schema relationships', () => {
|
||||
const source = noDeclaredRelationshipSnapshot();
|
||||
const snapshotWithForeignKey = {
|
||||
...source,
|
||||
tables: source.tables.map((table) =>
|
||||
table.name === 'orders'
|
||||
? {
|
||||
...table,
|
||||
foreignKeys: [
|
||||
{
|
||||
fromColumn: 'account_id',
|
||||
toCatalog: null,
|
||||
toDb: null,
|
||||
toTable: 'accounts',
|
||||
toColumn: 'id',
|
||||
constraintName: 'orders_account_id_fkey',
|
||||
},
|
||||
],
|
||||
}
|
||||
: table.name === 'accounts'
|
||||
? {
|
||||
...table,
|
||||
columns: table.columns.map((column) =>
|
||||
column.name === 'id' ? { ...column, primaryKey: true } : column,
|
||||
),
|
||||
}
|
||||
: table,
|
||||
),
|
||||
};
|
||||
|
||||
const schema = snapshotToKtxEnrichedSchema(snapshotWithForeignKey);
|
||||
|
||||
expect(schema.relationships).toEqual([
|
||||
{
|
||||
id: 'orders:(orders.account_id)->accounts:(accounts.id)',
|
||||
source: 'formal',
|
||||
from: {
|
||||
tableId: 'orders',
|
||||
columnIds: ['orders.account_id'],
|
||||
table: { catalog: null, db: null, name: 'orders' },
|
||||
columns: ['account_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'accounts',
|
||||
columnIds: ['accounts.id'],
|
||||
table: { catalog: null, db: null, name: 'accounts' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 1,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('runs deterministic relationship detection for relationship scans', async () => {
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'relationships',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-1' },
|
||||
providers: null,
|
||||
});
|
||||
|
||||
expect(result.summary).toMatchObject({
|
||||
deterministicRelationships: 'completed',
|
||||
llmRelationshipValidation: 'skipped',
|
||||
embeddings: 'skipped',
|
||||
});
|
||||
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
||||
expect(result.summary.statisticalValidation).toBe('skipped');
|
||||
expect(result.warnings).toContainEqual({
|
||||
code: 'relationship_validation_failed',
|
||||
message: 'KTX scan connector advertises readOnlySql but does not expose executeReadOnly',
|
||||
recoverable: true,
|
||||
metadata: { capability: 'readOnlySql' },
|
||||
});
|
||||
});
|
||||
|
||||
it('runs relationship discovery with connector SQL evidence', async () => {
|
||||
const executor = new InMemorySqliteExecutor();
|
||||
try {
|
||||
executor.db.exec(`
|
||||
CREATE TABLE accounts (id INTEGER NOT NULL);
|
||||
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
|
||||
INSERT INTO accounts (id) VALUES (1), (2);
|
||||
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
|
||||
`);
|
||||
const scanConnector = {
|
||||
...connector(),
|
||||
driver: 'sqlite' as const,
|
||||
capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
|
||||
introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
|
||||
executeReadOnly: executor.executeReadOnly.bind(executor),
|
||||
};
|
||||
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'relationships',
|
||||
detectRelationships: true,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-relationship-discovery' },
|
||||
providers: null,
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
|
||||
expect(result.summary.statisticalValidation).toBe('completed');
|
||||
expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
|
||||
expect(result.resolvedRelationships).toEqual([
|
||||
expect.objectContaining({
|
||||
status: 'accepted',
|
||||
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
|
||||
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
|
||||
}),
|
||||
]);
|
||||
expect(result.relationshipUpdate?.accepted).toHaveLength(1);
|
||||
} finally {
|
||||
executor.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('honors scan relationship config when LLM proposals are disabled', async () => {
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders();
|
||||
const generateObject = vi.fn();
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'relationships',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-llm-disabled' },
|
||||
providers: {
|
||||
...providers,
|
||||
llmRuntime: {
|
||||
...providers.llmRuntime,
|
||||
generateObject: generateObject as never,
|
||||
},
|
||||
},
|
||||
relationshipSettings: {
|
||||
...buildDefaultKtxProjectConfig().scan.relationships,
|
||||
llmProposals: false,
|
||||
maxLlmTablesPerBatch: 40,
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.summary.llmRelationshipValidation).toBe('skipped');
|
||||
expect(generateObject).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('skips relationship detection when scan relationships are disabled', async () => {
|
||||
const settings = {
|
||||
...buildDefaultKtxProjectConfig().scan.relationships,
|
||||
enabled: false,
|
||||
};
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
connector: connector(),
|
||||
context: { runId: 'disabled-relationships' },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
relationshipSettings: settings,
|
||||
});
|
||||
|
||||
expect(result.summary.deterministicRelationships).toBe('skipped');
|
||||
expect(result.summary.statisticalValidation).toBe('skipped');
|
||||
expect(result.summary.llmRelationshipValidation).toBe('skipped');
|
||||
expect(result.relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
|
||||
expect(result.relationshipUpdate).toBeNull();
|
||||
expect(result.relationshipProfile).toBeNull();
|
||||
expect(result.resolvedRelationships).toBeNull();
|
||||
});
|
||||
|
||||
it('forwards context.logger and emits warnings when sampleTable fails repeatedly', async () => {
|
||||
const failingConnector: KtxScanConnector = {
|
||||
...connector(),
|
||||
sampleTable: vi.fn(async () => {
|
||||
throw new Error('pool: ECONNRESET');
|
||||
}),
|
||||
};
|
||||
const logger = {
|
||||
debug: vi.fn(),
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
error: vi.fn(),
|
||||
};
|
||||
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: false,
|
||||
connector: failingConnector,
|
||||
context: { runId: 'scan-run-warnings', logger },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
});
|
||||
|
||||
const codes = result.warnings.map((warning) => warning.code);
|
||||
expect(codes).toContain('sampling_failed');
|
||||
expect(codes).toContain('description_fallback_used');
|
||||
expect(result.warnings.some((warning) => warning.table === 'customers')).toBe(true);
|
||||
expect(logger.warn).toHaveBeenCalled();
|
||||
expect(logger.error).toHaveBeenCalled();
|
||||
// Each of the two tables produced sampling_failed + description_fallback_used, so 2 + 2 = 4 warnings minimum.
|
||||
expect(result.warnings.length).toBeGreaterThanOrEqual(4);
|
||||
// Sampling was retried 3× for each of the 2 tables = 6 calls
|
||||
expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
|
||||
});
|
||||
|
||||
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-2' },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
});
|
||||
|
||||
expect(result.summary).toMatchObject({
|
||||
dataDictionary: 'completed',
|
||||
tableDescriptions: 'completed',
|
||||
columnDescriptions: 'completed',
|
||||
embeddings: 'skipped',
|
||||
deterministicRelationships: 'completed',
|
||||
});
|
||||
expect(result.embeddingUpdates).toEqual([]);
|
||||
expect(result.snapshot).toEqual(snapshot);
|
||||
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
||||
});
|
||||
|
||||
it('generates table descriptions with bounded table-level concurrency', async () => {
|
||||
const concurrentSnapshot: KtxSchemaSnapshot = {
|
||||
...snapshot,
|
||||
tables: Array.from({ length: 8 }, (_, index) => ({
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: `table_${index + 1}`,
|
||||
kind: 'table' as const,
|
||||
comment: null,
|
||||
estimatedRows: 2,
|
||||
foreignKeys: [],
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number' as const,
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: null,
|
||||
},
|
||||
],
|
||||
})),
|
||||
};
|
||||
let activeColumnSamples = 0;
|
||||
let maxActiveColumnSamples = 0;
|
||||
const scanConnector = {
|
||||
...connector(),
|
||||
introspect: vi.fn(async () => concurrentSnapshot),
|
||||
sampleColumn: vi.fn(async () => {
|
||||
activeColumnSamples += 1;
|
||||
maxActiveColumnSamples = Math.max(maxActiveColumnSamples, activeColumnSamples);
|
||||
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||
activeColumnSamples -= 1;
|
||||
return {
|
||||
values: ['1'],
|
||||
nullCount: 0,
|
||||
distinctCount: 1,
|
||||
};
|
||||
}),
|
||||
sampleTable: vi.fn(async () => ({
|
||||
headers: ['id'],
|
||||
rows: [[1]],
|
||||
totalRows: 1,
|
||||
})),
|
||||
};
|
||||
const settings = {
|
||||
...buildDefaultKtxProjectConfig().scan.relationships,
|
||||
enabled: false,
|
||||
};
|
||||
|
||||
await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-concurrent-descriptions' },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
relationshipSettings: settings,
|
||||
});
|
||||
|
||||
expect(maxActiveColumnSamples).toBe(6);
|
||||
});
|
||||
|
||||
it('reports enrichment progress for countable stages', async () => {
  const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
  // Recording progress port: startPhase returns the same recorder, so updates from every phase land in `events`.
  const progress = {
    async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
      events.push({ progress: progressValue, message, transient: options?.transient });
    },
    startPhase() {
      return progress;
    },
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: connector(),
    context: { runId: 'scan-run-progress', progress },
    providers: {
      ...createDeterministicLocalScanEnrichmentProviders(),
      embedding: fakeScanEmbedding({ dimensions: 6 }),
    },
  });

  // Only the presence of these messages is asserted, not their order or the other events around them.
  expect(events).toEqual(
    expect.arrayContaining([
      expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
      expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
      expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
      expect.objectContaining({ message: 'Detecting relationships' }),
    ]),
  );
});
|
||||
|
||||
it('reports progress before enrichment connector introspection starts', async () => {
  const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
  // Recording progress port: startPhase returns the same recorder, so all updates land in `events`.
  const progress = {
    async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
      events.push({ progress: progressValue, message, transient: options?.transient });
    },
    startPhase() {
      return progress;
    },
  };
  const scanConnector = {
    ...connector(),
    introspect: vi.fn(async () => {
      // The ordering assertion runs inside introspect: the loading message must already be recorded by now.
      expect(events).toContainEqual(expect.objectContaining({ message: 'Loading enrichment schema snapshot' }));
      return snapshot;
    }),
  };

  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'relationships',
    detectRelationships: true,
    connector: scanConnector,
    context: { runId: 'scan-run-progress-before-introspection', progress },
    providers: null,
  });

  // Guards against the inner assertion being skipped because introspect was never invoked.
  expect(scanConnector.introspect).toHaveBeenCalled();
});
|
||||
|
||||
it('splits enrichment embedding requests by provider batch size', async () => {
  // One table with five columns: with a batch size of 2 this must produce batches of 2, 2 and 1.
  const manyColumnSnapshot: KtxSchemaSnapshot = {
    ...snapshot,
    tables: [
      {
        catalog: null,
        db: 'public',
        name: 'wide_orders',
        kind: 'table',
        comment: 'Wide order facts',
        estimatedRows: 3,
        foreignKeys: [],
        columns: Array.from({ length: 5 }, (_, index) => ({
          name: `metric_${index + 1}`,
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number' as const,
          nullable: false,
          primaryKey: false,
          comment: `Metric ${index + 1}`,
        })),
      },
    ],
  };
  const scanConnector = {
    ...connector(),
    introspect: vi.fn(async () => manyColumnSnapshot),
  };
  const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
  // The fake provider rejects oversized batches, so an unsplit request fails the test loudly.
  const embedBatch = vi.fn(async (texts: string[]) => {
    if (texts.length > 2) {
      throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
    }
    return texts.map((_, index) => [index, index + 1, index + 2]);
  });

  const result = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: scanConnector,
    context: { runId: 'scan-run-batched-embeddings' },
    providers: {
      llmRuntime: deterministicProviders.llmRuntime,
      embedding: {
        dimensions: 3,
        maxBatchSize: 2,
        embedBatch,
      },
    },
  });

  expect(result.embeddingUpdates).toHaveLength(5);
  expect(embedBatch.mock.calls.map(([texts]) => texts).map((texts) => texts.length)).toEqual([2, 2, 1]);
});
|
||||
|
||||
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const scanConnector = connector();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };

  const first = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: scanConnector,
    context: { runId: 'scan-run-resume-1' },
    providers,
    stateStore,
    syncId: 'sync-resume-1',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  // Spies are installed only after the first run, so any recorded call belongs to the second run.
  const generateText = vi.spyOn(providers.llmRuntime, 'generateText');
  const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');
  const second = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: true,
    connector: scanConnector,
    context: { runId: 'scan-run-resume-1' },
    providers,
    stateStore,
    syncId: 'sync-resume-1',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
  expect(first.state.resumedStages).toEqual([]);
  expect(second.state.resumedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
  expect(second.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
  // A resumed run must not touch the providers and must reproduce the first run's outputs.
  expect(generateText).not.toHaveBeenCalled();
  expect(embedBatch).not.toHaveBeenCalled();
  expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
  expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
  expect(second.relationships).toEqual(first.relationships);
});
|
||||
|
||||
it('does not reuse completed stages when the snapshot changes', async () => {
  const stateStore = memoryEnrichmentStateStore();
  const providers = {
    ...createDeterministicLocalScanEnrichmentProviders(),
    embedding: fakeScanEmbedding({ dimensions: 6 }),
  };
  const scanConnector = connector();

  // First run populates the state store for this run id.
  await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: scanConnector,
    context: { runId: 'scan-run-resume-hash' },
    providers,
    stateStore,
    syncId: 'sync-resume-hash',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  const firstTable = snapshot.tables[0];
  if (!firstTable) {
    throw new Error('Expected test snapshot table');
  }
  // Same run id, but the connector now reports a different snapshot (one renamed table).
  const changedConnector = {
    ...connector(),
    introspect: vi.fn(async () => ({
      ...snapshot,
      tables: [{ ...firstTable, name: 'customers' }],
    })),
  };
  const generateText = vi.spyOn(providers.llmRuntime, 'generateText');

  const result = await runLocalScanEnrichment({
    connectionId: 'warehouse',
    mode: 'enriched',
    detectRelationships: false,
    connector: changedConnector,
    context: { runId: 'scan-run-resume-hash' },
    providers,
    stateStore,
    syncId: 'sync-resume-hash',
    providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
  });

  // Nothing is resumed; every stage is recomputed and the LLM runtime is called again.
  expect(result.state.resumedStages).toEqual([]);
  expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
  expect(generateText).toHaveBeenCalled();
});
|
||||
|
||||
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
  const executor = new InMemorySqliteExecutor();
  try {
    // Real rows back the read-only SQL path: every orders.account_id value exists in accounts.id.
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts (id) VALUES (1), (2);
      INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
    `);
    const scanConnector = {
      ...connector(),
      driver: 'sqlite' as const,
      capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
      introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
      executeReadOnly: executor.executeReadOnly.bind(executor),
    };

    // providers: null with mode 'enriched' is the providerless case under test.
    const result = await runLocalScanEnrichment({
      connectionId: 'warehouse',
      mode: 'enriched',
      detectRelationships: false,
      connector: scanConnector,
      context: { runId: 'scan-run-providerless-enriched' },
      providers: null,
    });

    expect(result.summary).toEqual({
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'completed',
    });
    expect(result.descriptionUpdates).toEqual([]);
    expect(result.embeddingUpdates).toEqual([]);
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate?.accepted).toHaveLength(1);
    expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
    expect(result.resolvedRelationships).toEqual([
      expect.objectContaining({
        status: 'accepted',
        from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
        to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
      }),
    ]);
    expect(result.warnings).toContainEqual({
      code: 'scan_enrichment_backend_not_configured',
      message:
        'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
      recoverable: true,
      metadata: {
        skippedStages: ['descriptions', 'embeddings'],
        relationshipDetection: true,
      },
    });
  } finally {
    // Always release the in-memory database, even when an assertion fails.
    executor.close();
  }
});
|
||||
|
||||
});
|
||||
643
packages/cli/src/context/scan/local-enrichment.ts
Normal file
643
packages/cli/src/context/scan/local-enrichment.ts
Normal file
|
|
@ -0,0 +1,643 @@
|
|||
import pLimit from 'p-limit';
|
||||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||
import { buildDefaultKtxProjectConfig, type KtxScanRelationshipConfig } from '../project/config.js';
|
||||
import { type KtxDescriptionColumnTable, KtxDescriptionGenerator } from './description-generation.js';
|
||||
import { buildKtxColumnEmbeddingText } from './embedding-text.js';
|
||||
import {
|
||||
completedKtxScanEnrichmentStateSummary,
|
||||
computeKtxScanEnrichmentInputHash,
|
||||
type KtxScanEnrichmentStateStore,
|
||||
summarizeKtxScanEnrichmentState,
|
||||
} from './enrichment-state.js';
|
||||
import { skippedKtxScanEnrichmentSummary } from './enrichment-summary.js';
|
||||
import type {
|
||||
KtxEmbeddingUpdate,
|
||||
KtxEnrichedColumn,
|
||||
KtxEnrichedRelationship,
|
||||
KtxEnrichedSchema,
|
||||
KtxEnrichedTable,
|
||||
KtxRelationshipEndpoint,
|
||||
KtxRelationshipUpdate,
|
||||
} from './enrichment-types.js';
|
||||
import type { KtxCompositeRelationshipCandidate } from './relationship-composite-candidates.js';
|
||||
import type { KtxResolvedRelationshipDiscoveryCandidate } from './relationship-graph-resolver.js';
|
||||
import { discoverKtxRelationships } from './relationship-discovery.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type {
|
||||
KtxEmbeddingPort,
|
||||
KtxProgressPort,
|
||||
KtxScanConnector,
|
||||
KtxScanContext,
|
||||
KtxScanEnrichmentStage,
|
||||
KtxScanEnrichmentStateSummary,
|
||||
KtxScanEnrichmentSummary,
|
||||
KtxScanMode,
|
||||
KtxScanRelationshipSummary,
|
||||
KtxScanWarning,
|
||||
KtxSchemaColumn,
|
||||
KtxSchemaForeignKey,
|
||||
KtxSchemaSnapshot,
|
||||
KtxSchemaTable,
|
||||
KtxTableRef,
|
||||
} from './types.js';
|
||||
|
||||
/** Maximum number of tables whose descriptions are generated concurrently (passed to `pLimit`). */
const DESCRIPTION_TABLE_CONCURRENCY = 6;
|
||||
|
||||
/** Backends used by local scan enrichment. */
export interface KtxLocalScanEnrichmentProviders {
  /** LLM runtime used for description generation and handed to relationship discovery. */
  llmRuntime: KtxLlmRuntimePort;
  /** Optional embedding backend; when absent or null the embeddings stage is not run. */
  embedding?: KtxEmbeddingPort | null;
}
|
||||
|
||||
/** Arguments accepted by `runLocalScanEnrichment`. */
export interface KtxLocalScanEnrichmentInput {
  /** Connection identifier forwarded to the connector and recorded with persisted stages. */
  connectionId: string;
  /** Scan mode; descriptions and embeddings are only produced in `'enriched'` mode. */
  mode: KtxScanMode;
  /** Forwarded to introspection and included in the input hash (treated as `false` when omitted). */
  detectRelationships?: boolean;
  /** Connector used for introspection, sampling and relationship discovery. */
  connector: KtxScanConnector;
  /** Scan context; supplies `runId` and, optionally, `progress` and `logger`. */
  context: KtxScanContext;
  /** Enrichment backends; `null` in `'enriched'` mode skips descriptions/embeddings and adds a warning. */
  providers: KtxLocalScanEnrichmentProviders | null;
  /** Optional store used to look up and persist per-stage results. */
  stateStore?: KtxScanEnrichmentStateStore | null;
  /** Sync identifier recorded with persisted stages; defaults to `context.runId`. */
  syncId?: string;
  /** Provider identity folded into the input hash; defaults to `{}`. */
  providerIdentity?: Record<string, unknown>;
  /** Relationship settings; defaults to `buildDefaultKtxProjectConfig().scan.relationships`. */
  relationshipSettings?: KtxScanRelationshipConfig;
  /** Clock used for persisted `updatedAt` timestamps; defaults to `() => new Date()`. */
  now?: () => Date;
}
|
||||
|
||||
/** Outcome of a local scan enrichment run. */
export interface KtxLocalScanEnrichmentResult {
  /** Schema snapshot returned by the connector for this run. */
  snapshot: KtxSchemaSnapshot;
  /** Per-feature status summary of the enrichment. */
  summary: KtxScanEnrichmentSummary;
  /** Counts of relationship candidates by outcome. */
  relationships: KtxScanRelationshipSummary;
  /** Which stages were resumed, completed or failed. */
  state: KtxScanEnrichmentStateSummary;
  /** Warnings collected while enriching. */
  warnings: KtxScanWarning[];
  /** One entry per described table, with its table description and per-column descriptions. */
  descriptionUpdates: Array<{
    table: KtxTableRef;
    tableDescription: string | null;
    /** Column descriptions keyed by column name. */
    columnDescriptions: Record<string, string | null>;
  }>;
  /** One entry per embedded column (column id, embedded text and vector). */
  embeddingUpdates: KtxEmbeddingUpdate[];
  /** Relationship update produced by discovery, or null. */
  relationshipUpdate: KtxRelationshipUpdate | null;
  /** Relationship profiling artifact produced by discovery, or null. */
  relationshipProfile: KtxRelationshipProfileArtifact | null;
  /** Resolved relationship candidates produced by discovery, or null. */
  resolvedRelationships: KtxResolvedRelationshipDiscoveryCandidate[] | null;
  /** Composite relationship candidates produced by discovery, or null. */
  compositeRelationships: KtxCompositeRelationshipCandidate[] | null;
}
|
||||
|
||||
/**
 * Builds the dotted identifier of a table from its catalog, db and name.
 * Null or empty parts are dropped, so a table without a catalog yields `db.name`.
 */
function tableId(table: KtxSchemaTable): string {
  const parts = [table.catalog, table.db, table.name];
  return parts.filter((value): value is string => Boolean(value)).join('.');
}
|
||||
|
||||
/** Builds the identifier of a column: the owning table's id followed by `.` and the column name. */
function columnId(table: KtxSchemaTable, column: KtxSchemaColumn): string {
  const owner = tableId(table);
  return `${owner}.${column.name}`;
}
|
||||
|
||||
/** Extracts the catalog/db/name reference of a snapshot table, dropping every other field. */
function tableRef(table: KtxSchemaTable): KtxTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
|
||||
|
||||
/** Builds a single-column relationship endpoint for the given enriched table and column. */
function endpoint(table: KtxEnrichedTable, column: KtxEnrichedColumn): KtxRelationshipEndpoint {
  return {
    tableId: table.id,
    columnIds: [column.id],
    table: table.ref,
    columns: [column.name],
  };
}
|
||||
|
||||
/**
 * Builds the relationship identifier from both endpoints, in the form
 * `fromTable:(colA,colB)->toTable:(colC,colD)`.
 */
function relationshipId(from: KtxRelationshipEndpoint, to: KtxRelationshipEndpoint): string {
  const side = (end: KtxRelationshipEndpoint): string => `${end.tableId}:(${end.columnIds.join(',')})`;
  return `${side(from)}->${side(to)}`;
}
|
||||
|
||||
/**
 * Tells whether `table` is the target of `foreignKey`.
 * The table name must match; catalog and db are only compared when the foreign key
 * specifies them (a `null` `toCatalog` / `toDb` matches any value).
 */
function targetMatchesForeignKey(table: KtxEnrichedTable, foreignKey: KtxSchemaForeignKey): boolean {
  if (table.ref.name !== foreignKey.toTable) {
    return false;
  }
  if (foreignKey.toCatalog !== null && table.ref.catalog !== foreignKey.toCatalog) {
    return false;
  }
  return foreignKey.toDb === null || table.ref.db === foreignKey.toDb;
}
|
||||
|
||||
/**
 * Converts the foreign keys declared in a snapshot into enriched relationships.
 *
 * Foreign keys whose source table, source column, target table or target column cannot be
 * found among `tables` are skipped. The result is sorted by relationship id.
 *
 * @param snapshot Snapshot whose tables carry the declared foreign keys.
 * @param tables Enriched tables built from the same snapshot.
 * @returns One `'formal'` relationship per resolvable foreign key.
 */
function formalRelationshipsFromSnapshot(
  snapshot: KtxSchemaSnapshot,
  tables: readonly KtxEnrichedTable[],
): KtxEnrichedRelationship[] {
  const tableById = new Map(tables.map((table) => [table.id, table]));
  const relationships: KtxEnrichedRelationship[] = [];

  for (const sourceTableSnapshot of snapshot.tables) {
    const sourceTable = tableById.get(tableId(sourceTableSnapshot));
    if (!sourceTable) {
      continue;
    }

    for (const foreignKey of sourceTableSnapshot.foreignKeys) {
      const sourceColumn = sourceTable.columns.find((column) => column.name === foreignKey.fromColumn);
      // The first table matching the foreign key target wins.
      const targetTable = tables.find((table) => targetMatchesForeignKey(table, foreignKey));
      const targetColumn = targetTable?.columns.find((column) => column.name === foreignKey.toColumn);
      if (!sourceColumn || !targetTable || !targetColumn) {
        continue;
      }

      const from = endpoint(sourceTable, sourceColumn);
      const to = endpoint(targetTable, targetColumn);
      relationships.push({
        id: relationshipId(from, to),
        source: 'formal',
        from,
        to,
        relationshipType: 'many_to_one',
        confidence: 1,
        isPrimaryKeyReference: true,
      });
    }
  }

  // `relationships` is a local array, so sorting it in place is not visible to callers.
  return relationships.sort((left, right) => left.id.localeCompare(right.id));
}
|
||||
|
||||
/**
 * Builds the recoverable warning emitted when an enriched scan runs without providers.
 *
 * @param relationshipDetection Whether relationship detection runs for this scan; recorded in the metadata.
 */
function providerlessEnrichedWarning(relationshipDetection: boolean): KtxScanWarning {
  const metadata = {
    skippedStages: ['descriptions', 'embeddings'],
    relationshipDetection,
  };
  return {
    code: 'scan_enrichment_backend_not_configured',
    message:
      'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
    recoverable: true,
    metadata,
  };
}
|
||||
|
||||
/**
 * Creates enrichment providers backed by the deterministic LLM runtime.
 * No embedding backend is included.
 */
export function createDeterministicLocalScanEnrichmentProviders(): KtxLocalScanEnrichmentProviders {
  const llmRuntime = deterministicLlmRuntime();
  return { llmRuntime };
}
|
||||
|
||||
/**
 * Creates an LLM runtime that returns fixed, input-derived answers:
 * - `generateText` echoes the first 64 characters of the prompt (trimmed), or `data source` when that is empty;
 * - `generateObject` returns empty `pkCandidates` / `fkCandidates` lists;
 * - `runAgentLoop` reports a `'natural'` stop.
 */
function deterministicLlmRuntime(): KtxLlmRuntimePort {
  return {
    async generateText(input) {
      const subject = input.prompt.slice(0, 64).trim() || 'data source';
      return `Deterministic description for ${subject}`;
    },
    async generateObject() {
      // The cast satisfies the port's generic return type; the shape is fixed regardless of the request.
      return { pkCandidates: [], fkCandidates: [] } as never;
    },
    async runAgentLoop() {
      return { stopReason: 'natural' };
    },
  };
}
|
||||
|
||||
/**
 * Converts a raw schema snapshot into the enriched schema shape.
 *
 * Database comments become the `db` description of tables and columns, every table is
 * marked enabled, and declared foreign keys become formal relationships.
 *
 * @param snapshot Snapshot to convert.
 * @param embeddingsByColumnId Optional embeddings keyed by column id; columns without an entry get `null`.
 * @returns The enriched schema for the snapshot's connection.
 */
export function snapshotToKtxEnrichedSchema(
  snapshot: KtxSchemaSnapshot,
  embeddingsByColumnId: ReadonlyMap<string, number[]> = new Map(),
): KtxEnrichedSchema {
  const tables: KtxEnrichedTable[] = snapshot.tables.map((table) => {
    const id = tableId(table);
    const ref = tableRef(table);
    const columns: KtxEnrichedColumn[] = table.columns.map((column) => {
      const idForColumn = columnId(table, column);
      return {
        id: idForColumn,
        tableId: id,
        tableRef: ref,
        name: column.name,
        nativeType: column.nativeType,
        normalizedType: column.normalizedType,
        dimensionType: column.dimensionType,
        nullable: column.nullable,
        primaryKey: column.primaryKey,
        parentColumnId: null,
        // Only a non-empty comment produces a `db` description.
        descriptions: {
          ...(column.comment ? { db: column.comment } : {}),
        },
        embedding: embeddingsByColumnId.get(idForColumn) ?? null,
        sampleValues: null,
        cardinality: null,
      };
    });
    return {
      id,
      ref,
      enabled: true,
      descriptions: {
        ...(table.comment ? { db: table.comment } : {}),
      },
      columns,
    };
  });

  return {
    connectionId: snapshot.connectionId,
    tables,
    relationships: formalRelationshipsFromSnapshot(snapshot, tables),
  };
}
|
||||
|
||||
/**
 * Maps a snapshot table to the input shape used for column description generation.
 * Columns with a non-empty comment carry it as both `sampleValues` and the `db` raw description;
 * columns without one carry only their name.
 */
function descriptionTable(table: KtxSchemaTable): KtxDescriptionColumnTable {
  return {
    catalog: table.catalog,
    db: table.db,
    name: table.name,
    columns: table.columns.map((column) => {
      if (!column.comment) {
        return { name: column.name };
      }
      // NOTE(review): the column comment is passed as the only sample value — confirm this is intended.
      return {
        name: column.name,
        sampleValues: [column.comment],
        rawDescriptions: { db: column.comment },
      };
    }),
  };
}
|
||||
|
||||
/**
 * Lists the name, native type and comment of every column of a table.
 * A missing native type or comment is normalised to `null`.
 */
function tableMetadataColumns(table: KtxSchemaTable): Array<{
  name: string;
  nativeType?: string | null;
  comment?: string | null;
}> {
  return table.columns.map(({ name, nativeType, comment }) => ({
    name,
    nativeType: nativeType ?? null,
    comment: comment ?? null,
  }));
}
|
||||
|
||||
/**
 * Returns the batch size to use for embedding requests.
 *
 * @param maxBatchSize Batch size advertised by the embedding provider.
 * @returns `maxBatchSize` when it is a positive integer, otherwise the fallback of 100.
 */
function embeddingBatchSize(maxBatchSize: number): number {
  const fallbackBatchSize = 100;
  const usable = Number.isInteger(maxBatchSize) && maxBatchSize > 0;
  return usable ? maxBatchSize : fallbackBatchSize;
}
|
||||
|
||||
/**
 * Generates column and table descriptions for every table of the snapshot.
 *
 * Tables are processed concurrently, bounded by `DESCRIPTION_TABLE_CONCURRENCY`; the
 * result keeps the snapshot's table order. A transient progress update is emitted when
 * a table's task starts, and a final update once all tables are done.
 *
 * @param input.snapshot Snapshot whose tables are described.
 * @param input.connector Connector handed to the description generator.
 * @param input.context Scan context; its logger, when present, is passed to the generator.
 * @param input.providers Providers supplying the LLM runtime.
 * @param input.progress Optional progress port for this stage.
 * @param input.warnings Optional sink that receives generator warnings.
 * @returns One description update per table (empty when the snapshot has no tables).
 */
async function generateDescriptions(input: {
  snapshot: KtxSchemaSnapshot;
  connector: KtxScanConnector;
  context: KtxScanContext;
  providers: KtxLocalScanEnrichmentProviders;
  progress?: KtxProgressPort;
  warnings?: KtxScanWarning[];
}): Promise<KtxLocalScanEnrichmentResult['descriptionUpdates']> {
  const warningSink = input.warnings;
  const generator = new KtxDescriptionGenerator({
    llmRuntime: input.providers.llmRuntime,
    ...(input.context.logger ? { logger: input.context.logger } : {}),
    ...(warningSink
      ? {
          onWarning: (warning: KtxScanWarning) => {
            warningSink.push(warning);
          },
        }
      : {}),
    settings: {
      columnMaxWords: 16,
      tableMaxWords: 24,
      dataSourceMaxWords: 32,
      concurrencyLimit: 4,
    },
  });

  const totalTables = input.snapshot.tables.length;
  if (totalTables === 0) {
    await input.progress?.update(1, 'No tables to describe');
    return [];
  }

  const limitTable = pLimit(DESCRIPTION_TABLE_CONCURRENCY);
  const tableUpdates = await Promise.all(
    input.snapshot.tables.map((table, index) =>
      limitTable(async () => {
        await input.progress?.update(
          (index + 1) / totalTables,
          `Generating descriptions ${index + 1}/${totalTables} tables`,
          {
            transient: true,
          },
        );
        const tableInput = descriptionTable(table);
        const columnResult = await generator.generateColumnDescriptions({
          connectionId: input.snapshot.connectionId,
          connector: input.connector,
          context: input.context,
          dataSourceType: input.snapshot.driver,
          supportsNestedAnalysis: input.connector.capabilities.nestedAnalysis,
          table: tableInput,
        });
        const tableDescription = await generator.generateTableDescription({
          connectionId: input.snapshot.connectionId,
          connector: input.connector,
          context: input.context,
          dataSourceType: input.snapshot.driver,
          table: {
            catalog: table.catalog,
            db: table.db,
            name: table.name,
            rawDescriptions: table.comment ? { db: table.comment } : {},
            columns: tableMetadataColumns(table),
          },
        });
        return {
          table: tableRef(table),
          tableDescription,
          columnDescriptions: Object.fromEntries(columnResult.columnDescriptions),
        };
      }),
    ),
  );

  await input.progress?.update(1, `Generated descriptions for ${totalTables} tables`);
  // Return the collected array directly: spreading it into push() passes every element as a
  // call argument and can exceed the engine's argument limit on very large schemas.
  return tableUpdates;
}
|
||||
|
||||
/**
 * Builds one embedding per column of the snapshot.
 *
 * The text embedded for a column combines its name, type, description (generated, falling
 * back to the database comment), table description and outgoing foreign keys. Requests are
 * split into batches no larger than the provider's batch size.
 *
 * @param input.snapshot Snapshot whose columns are embedded.
 * @param input.embedding Embedding backend.
 * @param input.descriptions Generated descriptions; matched to tables by catalog, db and name.
 * @param input.progress Optional progress port for this stage.
 * @returns The embedding updates in snapshot column order, plus the same vectors keyed by column id.
 * @throws Error when the provider returns a different number of vectors than texts in a batch.
 */
async function buildEmbeddings(input: {
  snapshot: KtxSchemaSnapshot;
  embedding: KtxEmbeddingPort;
  descriptions: KtxLocalScanEnrichmentResult['descriptionUpdates'];
  progress?: KtxProgressPort;
}): Promise<{ updates: KtxEmbeddingUpdate[]; byColumnId: Map<string, number[]> }> {
  // Key by the fully qualified table identity rather than the bare name, so same-named tables
  // in different schemas or catalogs keep their own descriptions.
  const qualifiedKey = (ref: { catalog?: string | null; db?: string | null; name: string }): string =>
    JSON.stringify([ref.catalog ?? null, ref.db ?? null, ref.name]);
  const descriptionByTable = new Map(input.descriptions.map((item) => [qualifiedKey(item.table), item]));
  const texts: Array<{ columnId: string; text: string }> = [];

  for (const table of input.snapshot.tables) {
    const tableDescriptions = descriptionByTable.get(qualifiedKey(table));
    for (const column of table.columns) {
      const id = columnId(table, column);
      const text = buildKtxColumnEmbeddingText({
        tableName: table.name,
        columnName: column.name,
        columnType: column.nativeType,
        resolvedDescription: tableDescriptions?.columnDescriptions[column.name] ?? column.comment,
        resolvedTableDescription: tableDescriptions?.tableDescription ?? table.comment,
        sampleValues: column.comment ? [column.comment] : null,
        foreignKeys: {
          outgoing: (table.foreignKeys ?? [])
            .filter((foreignKey) => foreignKey.fromColumn === column.name)
            .map((foreignKey) => ({ toTable: foreignKey.toTable, toColumn: foreignKey.toColumn })),
          incoming: [],
        },
      });
      texts.push({ columnId: id, text });
    }
  }

  const embeddings: number[][] = [];
  const maxBatchSize = embeddingBatchSize(input.embedding.maxBatchSize);
  const embeddingTexts = texts.map((item) => item.text);
  const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize);
  if (batchCount === 0) {
    await input.progress?.update(1, 'No embeddings to build');
  }
  for (let offset = 0; offset < embeddingTexts.length; offset += maxBatchSize) {
    const batchIndex = Math.floor(offset / maxBatchSize) + 1;
    await input.progress?.update(batchIndex / batchCount, `Building embeddings ${batchIndex}/${batchCount} batches`, {
      transient: true,
    });
    const batch = embeddingTexts.slice(offset, offset + maxBatchSize);
    const batchEmbeddings = await input.embedding.embedBatch(batch);
    // A short or long response would misalign vectors with columns, so fail instead of guessing.
    if (batchEmbeddings.length !== batch.length) {
      throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`);
    }
    embeddings.push(...batchEmbeddings);
  }

  const byColumnId = new Map<string, number[]>();
  const updates = texts.map((item, index) => {
    const embedding = embeddings[index] ?? [];
    byColumnId.set(item.columnId, embedding);
    return {
      columnId: item.columnId,
      text: item.text,
      embedding,
    };
  });
  if (batchCount > 0) {
    await input.progress?.update(1, `Built embeddings for ${updates.length} columns`);
  }
  return { updates, byColumnId };
}
|
||||
|
||||
/**
 * Runs one enrichment stage, reusing a stored result when one exists.
 *
 * If the state store holds a completed result for the same run id, stage and input hash,
 * that output is returned and the stage is recorded as both resumed and completed.
 * Otherwise `compute` runs and its output is persisted; the stage is recorded as completed
 * only once persistence succeeded. On any failure the stage is recorded as failed, the
 * failure is persisted on a best-effort basis, and the original error is rethrown.
 *
 * @param input.stateStore Optional store; when null or undefined nothing is looked up or persisted.
 * @param input.resumedStages Mutated: receives the stage when a stored result is reused.
 * @param input.completedStages Mutated: receives the stage when it is resumed or completes.
 * @param input.failedStages Mutated: receives the stage when computing or persisting fails.
 * @param input.compute Produces the stage output when no stored result exists.
 * @returns The stored or freshly computed stage output.
 * @throws Whatever `compute` or `saveCompletedStage` throws.
 */
async function runEnrichmentStage<TOutput>(input: {
  stateStore: KtxScanEnrichmentStateStore | null | undefined;
  runId: string;
  connectionId: string;
  syncId: string;
  mode: KtxScanMode;
  stage: KtxScanEnrichmentStage;
  inputHash: string;
  now: () => Date;
  resumedStages: KtxScanEnrichmentStage[];
  completedStages: KtxScanEnrichmentStage[];
  failedStages: KtxScanEnrichmentStage[];
  compute: () => Promise<TOutput>;
}): Promise<TOutput> {
  const existing = await input.stateStore?.findCompletedStage<TOutput>({
    runId: input.runId,
    stage: input.stage,
    inputHash: input.inputHash,
  });
  if (existing) {
    input.resumedStages.push(input.stage);
    input.completedStages.push(input.stage);
    return existing.output;
  }

  try {
    const output = await input.compute();
    await input.stateStore?.saveCompletedStage({
      runId: input.runId,
      connectionId: input.connectionId,
      syncId: input.syncId,
      mode: input.mode,
      stage: input.stage,
      inputHash: input.inputHash,
      output,
      updatedAt: input.now().toISOString(),
    });
    // Record completion only after persistence succeeded, so a stage is never both completed and failed.
    input.completedStages.push(input.stage);
    return output;
  } catch (error) {
    input.failedStages.push(input.stage);
    try {
      await input.stateStore?.saveFailedStage({
        runId: input.runId,
        connectionId: input.connectionId,
        syncId: input.syncId,
        mode: input.mode,
        stage: input.stage,
        inputHash: input.inputHash,
        errorMessage: error instanceof Error ? error.message : String(error),
        updatedAt: input.now().toISOString(),
      });
    } catch {
      // Persisting the failure is best-effort: a store error here must not replace the root cause below.
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Indexes embedding updates by column id.
 * When a column id appears more than once, the last update wins.
 */
function embeddingsByColumnId(updates: KtxEmbeddingUpdate[]): Map<string, number[]> {
  const index = new Map<string, number[]>();
  for (const update of updates) {
    index.set(update.columnId, update.embedding);
  }
  return index;
}
|
||||
|
||||
export async function runLocalScanEnrichment(
|
||||
input: KtxLocalScanEnrichmentInput,
|
||||
): Promise<KtxLocalScanEnrichmentResult> {
|
||||
const progress = input.context.progress;
|
||||
await progress?.update(0, 'Loading enrichment schema snapshot');
|
||||
const snapshot = await input.connector.introspect(
|
||||
{
|
||||
connectionId: input.connectionId,
|
||||
driver: input.connector.driver,
|
||||
mode: input.mode,
|
||||
detectRelationships: input.detectRelationships,
|
||||
},
|
||||
input.context,
|
||||
);
|
||||
await progress?.update(0.05, `Loaded schema snapshot with ${snapshot.tables.length} tables`);
|
||||
|
||||
const now = input.now ?? (() => new Date());
|
||||
const state = completedKtxScanEnrichmentStateSummary();
|
||||
const syncId = input.syncId ?? input.context.runId;
|
||||
const relationshipSettings = input.relationshipSettings ?? buildDefaultKtxProjectConfig().scan.relationships;
|
||||
const inputHash = computeKtxScanEnrichmentInputHash({
|
||||
snapshot,
|
||||
mode: input.mode,
|
||||
detectRelationships: input.detectRelationships ?? false,
|
||||
providerIdentity: input.providerIdentity ?? {},
|
||||
relationshipSettings,
|
||||
});
|
||||
const warnings: KtxScanWarning[] = [];
|
||||
let descriptions: KtxLocalScanEnrichmentResult['descriptionUpdates'] = [];
|
||||
let embeddingUpdates: KtxEmbeddingUpdate[] = [];
|
||||
let schema = snapshotToKtxEnrichedSchema(snapshot);
|
||||
const summary: KtxScanEnrichmentSummary = { ...skippedKtxScanEnrichmentSummary };
|
||||
const relationshipDetectionEnabled = relationshipSettings.enabled;
|
||||
const shouldDetectRelationships =
|
||||
relationshipDetectionEnabled &&
|
||||
(input.mode === 'relationships' || input.mode === 'enriched' || (input.detectRelationships ?? false));
|
||||
|
||||
if (input.mode === 'enriched' && !input.providers) {
|
||||
warnings.push(providerlessEnrichedWarning(shouldDetectRelationships));
|
||||
}
|
||||
|
||||
if (input.mode === 'enriched' && input.providers) {
|
||||
const providers = input.providers;
|
||||
const descriptionProgress = progress?.startPhase(0.45);
|
||||
descriptions = await runEnrichmentStage({
|
||||
stateStore: input.stateStore,
|
||||
runId: input.context.runId,
|
||||
connectionId: input.connectionId,
|
||||
syncId,
|
||||
mode: input.mode,
|
||||
stage: 'descriptions',
|
||||
inputHash,
|
||||
now,
|
||||
resumedStages: state.resumedStages,
|
||||
completedStages: state.completedStages,
|
||||
failedStages: state.failedStages,
|
||||
compute: () =>
|
||||
generateDescriptions({
|
||||
snapshot,
|
||||
connector: input.connector,
|
||||
context: input.context,
|
||||
providers,
|
||||
progress: descriptionProgress,
|
||||
warnings,
|
||||
}),
|
||||
});
|
||||
summary.dataDictionary = input.connector.sampleColumn ? 'completed' : 'skipped';
|
||||
summary.tableDescriptions = 'completed';
|
||||
summary.columnDescriptions = 'completed';
|
||||
|
||||
const embeddingProgress = progress?.startPhase(0.2);
|
||||
const embedding = providers.embedding;
|
||||
if (embedding) {
|
||||
embeddingUpdates = await runEnrichmentStage({
|
||||
stateStore: input.stateStore,
|
||||
runId: input.context.runId,
|
||||
connectionId: input.connectionId,
|
||||
syncId,
|
||||
mode: input.mode,
|
||||
stage: 'embeddings',
|
||||
inputHash,
|
||||
now,
|
||||
resumedStages: state.resumedStages,
|
||||
completedStages: state.completedStages,
|
||||
failedStages: state.failedStages,
|
||||
compute: async () => {
|
||||
const embeddings = await buildEmbeddings({
|
||||
snapshot,
|
||||
embedding,
|
||||
descriptions,
|
||||
progress: embeddingProgress,
|
||||
});
|
||||
return embeddings.updates;
|
||||
},
|
||||
});
|
||||
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
|
||||
summary.embeddings = 'completed';
|
||||
}
|
||||
}
|
||||
|
||||
let relationshipUpdate: KtxRelationshipUpdate | null = null;
|
||||
let relationshipProfile: KtxRelationshipProfileArtifact | null = null;
|
||||
let resolvedRelationships: KtxResolvedRelationshipDiscoveryCandidate[] | null = null;
|
||||
let compositeRelationships: KtxCompositeRelationshipCandidate[] | null = null;
|
||||
let relationships: KtxScanRelationshipSummary = { accepted: 0, review: 0, rejected: 0, skipped: 0 };
|
||||
if (shouldDetectRelationships) {
|
||||
const relationshipProgress = progress?.startPhase(0.25);
|
||||
const relationshipStage = await runEnrichmentStage({
|
||||
stateStore: input.stateStore,
|
||||
runId: input.context.runId,
|
||||
connectionId: input.connectionId,
|
||||
syncId,
|
||||
mode: input.mode,
|
||||
stage: 'relationships',
|
||||
inputHash,
|
||||
now,
|
||||
resumedStages: state.resumedStages,
|
||||
completedStages: state.completedStages,
|
||||
failedStages: state.failedStages,
|
||||
compute: async () => {
|
||||
await relationshipProgress?.update(0, 'Detecting relationships');
|
||||
const detection = await discoverKtxRelationships({
|
||||
connectionId: input.connectionId,
|
||||
driver: snapshot.driver,
|
||||
connector: input.connector,
|
||||
schema,
|
||||
context: input.context,
|
||||
settings: relationshipSettings,
|
||||
llmRuntime: input.providers?.llmRuntime ?? null,
|
||||
});
|
||||
|
||||
await relationshipProgress?.update(
|
||||
1,
|
||||
`Relationship detection found ${detection.relationships.accepted} accepted, ${detection.relationships.review} review`,
|
||||
);
|
||||
return {
|
||||
relationshipUpdate: detection.relationshipUpdate,
|
||||
relationshipProfile: detection.profile,
|
||||
resolvedRelationships: detection.resolvedRelationships,
|
||||
compositeRelationships: detection.compositeRelationships,
|
||||
relationships: detection.relationships,
|
||||
statisticalValidation: detection.statisticalValidation,
|
||||
llmRelationshipValidation: detection.llmRelationshipValidation,
|
||||
warnings: detection.warnings,
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
summary.deterministicRelationships = 'completed';
|
||||
summary.llmRelationshipValidation = relationshipStage.llmRelationshipValidation;
|
||||
summary.statisticalValidation = relationshipStage.statisticalValidation;
|
||||
relationshipUpdate = relationshipStage.relationshipUpdate;
|
||||
relationshipProfile = relationshipStage.relationshipProfile;
|
||||
resolvedRelationships = relationshipStage.resolvedRelationships;
|
||||
compositeRelationships = relationshipStage.compositeRelationships;
|
||||
relationships = relationshipStage.relationships;
|
||||
warnings.push(...relationshipStage.warnings);
|
||||
}
|
||||
|
||||
await progress?.update(1, 'Enrichment complete');
|
||||
return {
|
||||
snapshot,
|
||||
summary,
|
||||
relationships,
|
||||
state: summarizeKtxScanEnrichmentState(state),
|
||||
warnings,
|
||||
descriptionUpdates: descriptions,
|
||||
embeddingUpdates,
|
||||
relationshipUpdate,
|
||||
relationshipProfile,
|
||||
resolvedRelationships,
|
||||
compositeRelationships,
|
||||
};
|
||||
}
|
||||
1580
packages/cli/src/context/scan/local-scan.test.ts
Normal file
1580
packages/cli/src/context/scan/local-scan.test.ts
Normal file
File diff suppressed because it is too large
Load diff
632
packages/cli/src/context/scan/local-scan.ts
Normal file
632
packages/cli/src/context/scan/local-scan.ts
Normal file
|
|
@ -0,0 +1,632 @@
|
|||
import type { createKtxEmbeddingProvider } from '../../llm/embedding-provider.js';
|
||||
import type { createKtxLlmProvider } from '../../llm/model-provider.js';
|
||||
import type { KtxEmbeddingProvider } from '../../llm/types.js';
|
||||
import { createDefaultLocalIngestAdapters } from '../../context/ingest/local-adapters.js';
|
||||
import { getLocalStageOnlyIngestStatus, type LocalIngestRunRecord, runLocalStageOnlyIngest } from '../../context/ingest/local-stage-ingest.js';
|
||||
import type { SourceAdapter } from '../../context/ingest/types.js';
|
||||
import { createLocalKtxLlmRuntimeFromConfig } from '../../context/llm/local-config.js';
|
||||
import { KtxScanEmbeddingPortAdapter } from '../../context/llm/embedding-port.js';
|
||||
import type { KtxProjectLlmConfig, KtxScanEnrichmentConfig, KtxScanRelationshipConfig } from '../project/config.js';
|
||||
import type { KtxLocalProject } from '../../context/project/project.js';
|
||||
import { ktxLocalStateDbPath } from '../project/local-state-db.js';
|
||||
import { redactKtxScanReport } from './credentials.js';
|
||||
import { filterSnapshotTables, resolveEnabledTables } from './enabled-tables.js';
|
||||
import { completedKtxScanEnrichmentStateSummary } from './enrichment-state.js';
|
||||
import { failedKtxScanEnrichmentSummary, ktxScanErrorMessage } from './enrichment-summary.js';
|
||||
import {
|
||||
createDeterministicLocalScanEnrichmentProviders,
|
||||
type KtxLocalScanEnrichmentProviders,
|
||||
runLocalScanEnrichment,
|
||||
} from './local-enrichment.js';
|
||||
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from './local-enrichment-artifacts.js';
|
||||
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
|
||||
import { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
|
||||
import type {
|
||||
KtxConnectionDriver,
|
||||
KtxProgressPort,
|
||||
KtxScanConnector,
|
||||
KtxScanContext,
|
||||
KtxScanEnrichmentStateSummary,
|
||||
KtxScanInput,
|
||||
KtxScanMode,
|
||||
KtxScanReport,
|
||||
KtxScanTrigger,
|
||||
KtxScanWarning,
|
||||
KtxSchemaSnapshot,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Builds the recoverable scan warning that explains why LLM-mode enrichment
 * providers could not be resolved from project config.
 *
 * @param status - The non-ready resolution status to describe.
 * @returns A warning whose `metadata.reason` echoes `status`.
 */
function enrichmentResolutionWarning(
  status: 'missing-embeddings-config' | 'missing-llm' | 'missing-embeddings-provider',
): KtxScanWarning {
  let code: 'llm_unavailable' | 'embedding_unavailable';
  let message: string;
  switch (status) {
    case 'missing-llm':
      code = 'llm_unavailable';
      message =
        'scan.enrichment.mode is "llm" but the LLM provider could not be resolved from llm.provider config; LLM-driven enrichment was skipped.';
      break;
    case 'missing-embeddings-config':
      code = 'embedding_unavailable';
      message =
        'scan.enrichment.mode is "llm" but scan.enrichment.embeddings is not configured; embedding enrichment was skipped.';
      break;
    default:
      // Remaining status: 'missing-embeddings-provider'.
      code = 'embedding_unavailable';
      message =
        'scan.enrichment.mode is "llm" but the embedding provider could not be resolved from scan.enrichment.embeddings config; embedding enrichment was skipped.';
      break;
  }
  return { code, message, recoverable: true, metadata: { reason: status } };
}
|
||||
|
||||
/** Options accepted by {@link runLocalScan}. */
export interface RunLocalScanOptions {
  /** Local project whose config, file store and project directory the scan uses. */
  project: KtxLocalProject;
  /** Key of the connection to scan; must exist in `project.config.connections`. */
  connectionId: string;
  /** Scan mode; `runLocalScan` falls back to `'structural'` when omitted. */
  mode?: KtxScanMode;
  /** Requests relationship detection, which requires a connector even in structural mode. */
  detectRelationships?: boolean;
  /** When true, no scan report or manifest shards are written and no enrichment state store is created. */
  dryRun?: boolean;
  /** Trigger recorded in the scan report; `runLocalScan` falls back to `'cli'`. */
  trigger?: KtxScanTrigger;
  /** Forwarded to the default ingest adapters when `adapters` is not supplied. */
  databaseIntrospectionUrl?: string;
  /** Ingest adapters to use instead of the project defaults. */
  adapters?: SourceAdapter[];
  /** Job identifier forwarded to the stage-only ingest run. */
  jobId?: string;
  /** Clock override used for the report timestamp and forwarded to ingest and enrichment. */
  now?: () => Date;
  /** Ready-made scan connector; takes precedence over `createConnector`. */
  connector?: KtxScanConnector;
  /** Factory used to build a connector for `connectionId` when `connector` is absent. */
  createConnector?: (connectionId: string) => KtxScanConnector | Promise<KtxScanConnector>;
  /**
   * Explicit enrichment providers. Any value other than `undefined` (including
   * `null`) bypasses provider resolution from project config.
   */
  enrichmentProviders?: KtxLocalScanEnrichmentProviders | null;
  /** Explicit enrichment state store; `null` runs enrichment without one. */
  enrichmentStateStore?: SqliteLocalScanEnrichmentStateStore | null;
  /** Progress sink updated as the scan advances. */
  progress?: KtxProgressPort;
  /** Embedding provider used when LLM-mode providers are resolved from config. */
  embeddingProvider?: KtxEmbeddingProvider | null;
}
|
||||
|
||||
/** Result returned by {@link runLocalScan} once the scan has finished. */
export interface LocalScanRunResult {
  /** Identifier of the underlying ingest run. */
  runId: string;
  /** Always `'done'`: the result is only produced after the scan completes. */
  status: 'done';
  /** Always `true`, mirroring `status`. */
  done: true;
  /** Connection that was scanned. */
  connectionId: string;
  /** Mode the scan ran in. */
  mode: KtxScanMode;
  /** Whether the scan ran as a dry run. */
  dryRun: boolean;
  /** Sync identifier of the ingest run. */
  syncId: string;
  /** Scan report after credential redaction. */
  report: KtxScanReport;
}
|
||||
|
||||
/**
 * Status snapshot of a live-database scan run, as produced by `getLocalScanStatus`.
 *
 * @internal
 */
export interface LocalScanStatusResponse {
  /** Identifier of the ingest run. */
  runId: string;
  /** Status of the underlying ingest run record. */
  status: LocalIngestRunRecord['status'];
  /** Whether the ingest run has finished. */
  done: boolean;
  /** Connection the run belongs to. */
  connectionId: string;
  /** Mode from the stored scan report; `'structural'` when no report is found. */
  mode: KtxScanMode;
  /** Dry-run flag from the stored scan report; `false` when no report is found. */
  dryRun: boolean;
  /** Sync identifier of the ingest run. */
  syncId: string;
  /** Progress value reported by the ingest status. */
  progress: number;
  /** Start timestamp reported by the ingest status. */
  startedAt: string;
  /** Completion timestamp reported by the ingest status. */
  completedAt: string;
  /** Path of the stored scan report, or `null` when none is available. */
  reportPath: string | null;
  /** Warnings from the stored scan report; empty when no report is found. */
  warnings: KtxScanReport['warnings'];
}
|
||||
|
||||
/**
 * Scan overrides for an MCP-facing caller.
 *
 * NOTE(review): the consumer of this interface is not visible in this part of
 * the file; the member descriptions below are inferred from the matching
 * fields on `RunLocalScanOptions` and should be confirmed against the caller.
 */
export interface LocalScanMcpOptions {
  /** Ingest adapters to use instead of the project defaults. */
  adapters?: SourceAdapter[];
  /** Database introspection URL for the default ingest adapters. */
  databaseIntrospectionUrl?: string;
  /** Produces a job identifier (presumably one per scan request). */
  jobIdFactory?: () => string;
  /** Clock override. */
  now?: () => Date;
  /** Factory that builds a scan connector for a connection id. */
  createConnector?: (connectionId: string) => KtxScanConnector | Promise<KtxScanConnector>;
}
|
||||
|
||||
/** Name of the ingest adapter that backs standalone scans; also a raw-sources path segment. */
const LIVE_DATABASE_ADAPTER = 'live-database';

/** File name of the scan report stored inside a sync's raw-sources directory. */
const SCAN_REPORT_FILE = 'scan-report.json';

/** Author name passed to the file store when the scan report is written. */
const LOCAL_AUTHOR = 'ktx';

/** Author email passed to the file store when the scan report is written. */
const LOCAL_AUTHOR_EMAIL = 'ktx@example.com';
|
||||
|
||||
/**
 * Maps a configured driver name to the canonical scan driver identifier.
 *
 * Matching is case-insensitive. `sqlite3` is folded into `sqlite`; every other
 * supported name is returned lower-cased as-is (so `postgresql` stays
 * `postgresql`).
 *
 * @param driver - Driver name from the connection config, if any.
 * @returns The normalized driver identifier.
 * @throws Error when the driver is missing or not supported by standalone scans.
 */
function normalizeDriver(driver: string | undefined): KtxConnectionDriver {
  const lowered = (driver ?? '').toLowerCase();
  switch (lowered) {
    case 'sqlite3':
      return 'sqlite';
    case 'postgres':
    case 'postgresql':
    case 'sqlite':
    case 'mysql':
    case 'clickhouse':
    case 'sqlserver':
    case 'bigquery':
    case 'snowflake':
      return lowered;
    default:
      throw new Error(
        `Standalone ktx scan supports postgres/postgresql/sqlite/mysql/clickhouse/sqlserver/bigquery/snowflake in this phase, received "${driver ?? 'unknown'}"`,
      );
  }
}
|
||||
|
||||
/**
 * Counts the paths that look like table artifacts, i.e. start with `tables/`
 * and end with `.json`.
 */
function tablePathCount(paths: string[]): number {
  let tableFiles = 0;
  for (const candidate of paths) {
    if (candidate.startsWith('tables/') && candidate.endsWith('.json')) {
      tableFiles += 1;
    }
  }
  return tableFiles;
}
|
||||
|
||||
/** Returns the raw-sources directory that holds the live-database artifacts of one sync. */
function rawSourcesDir(connectionId: string, syncId: string): string {
  return ['raw-sources', connectionId, LIVE_DATABASE_ADAPTER, syncId].join('/');
}
|
||||
|
||||
/** Returns the path of the scan report file inside the sync's raw-sources directory. */
function scanReportPath(connectionId: string, syncId: string): string {
  const directory = rawSourcesDir(connectionId, syncId);
  return `${directory}/${SCAN_REPORT_FILE}`;
}
|
||||
|
||||
/**
 * Guards against scan modes this module does not implement.
 *
 * @throws Error when `mode` is not `structural`, `relationships` or `enriched`.
 */
function assertSupportedMode(mode: KtxScanMode): void {
  const supportedModes: readonly string[] = ['structural', 'relationships', 'enriched'];
  if (supportedModes.includes(mode)) {
    return;
  }
  throw new Error(`Unsupported KTX scan mode: ${mode}`);
}
|
||||
|
||||
/**
 * Picks the scan connector for a run.
 *
 * A purely structural scan without relationship detection needs no connector
 * and yields `null`. Otherwise an explicit `options.connector` wins over
 * `options.createConnector`.
 *
 * @throws Error when a connector is required but neither option is supplied.
 */
async function resolveScanConnector(options: RunLocalScanOptions, mode: KtxScanMode): Promise<KtxScanConnector | null> {
  const connectorRequired = mode !== 'structural' || Boolean(options.detectRelationships);
  if (!connectorRequired) {
    return null;
  }
  if (options.connector) {
    return options.connector;
  }
  if (!options.createConnector) {
    throw new Error('ktx scan --enrich and --detect-relationships require a native standalone scan connector');
  }
  return options.createConnector(options.connectionId);
}
|
||||
|
||||
/**
 * Injectable dependencies for `resolveLocalScanEnrichmentProviders`.
 *
 * The whole object is forwarded to `createLocalKtxLlmRuntimeFromConfig`;
 * `embeddingProvider` is additionally read by the resolver itself.
 */
interface LocalScanEnrichmentProviderDeps {
  /** Override for the LLM provider factory. */
  createKtxLlmProvider?: typeof createKtxLlmProvider;
  /** Override for the embedding provider factory. */
  createKtxEmbeddingProvider?: typeof createKtxEmbeddingProvider;
  /** Environment variables to use instead of the process environment (presumably; consumed by the LLM runtime factory). */
  env?: NodeJS.ProcessEnv;
  /** Project directory forwarded to the LLM runtime factory. */
  projectDir?: string;
  /** Embedding provider wrapped for scan enrichment; absent or `null` yields `missing-embeddings-provider`. */
  embeddingProvider?: KtxEmbeddingProvider | null;
}
|
||||
|
||||
/**
 * Outcome of resolving enrichment providers from project config.
 *
 * - `ready`: providers are available.
 * - `disabled`: the enrichment mode is neither `deterministic` nor `llm`.
 * - `missing-embeddings-config`: LLM mode without `scan.enrichment.embeddings`.
 * - `missing-llm`: LLM mode but no LLM runtime could be created.
 * - `missing-embeddings-provider`: LLM mode but no embedding provider was supplied.
 */
type LocalScanEnrichmentProviderResolution =
  | { status: 'ready'; providers: KtxLocalScanEnrichmentProviders }
  | { status: 'disabled' }
  | { status: 'missing-embeddings-config' }
  | { status: 'missing-llm' }
  | { status: 'missing-embeddings-provider' };
|
||||
|
||||
/**
 * Resolves the enrichment providers implied by the project's scan config.
 *
 * - `deterministic` mode always resolves to the deterministic providers.
 * - Any mode other than `llm` is reported as `disabled`.
 * - `llm` mode needs, in this order: an `embeddings` config section, an LLM
 *   runtime built from `llmConfig`, and an embedding provider in `deps`.
 *
 * @param config - `scan.enrichment` section of the project config.
 * @param llmConfig - `llm` section of the project config.
 * @param deps - Injectable factories and the embedding provider.
 * @returns The ready providers or the reason they are unavailable.
 */
function resolveLocalScanEnrichmentProviders(
  config: KtxScanEnrichmentConfig,
  llmConfig: KtxProjectLlmConfig,
  deps: LocalScanEnrichmentProviderDeps = {},
): LocalScanEnrichmentProviderResolution {
  switch (config.mode) {
    case 'deterministic':
      return { status: 'ready', providers: createDeterministicLocalScanEnrichmentProviders() };
    case 'llm':
      break;
    default:
      return { status: 'disabled' };
  }

  if (!config.embeddings) {
    return { status: 'missing-embeddings-config' };
  }

  const runtimeDeps = { ...deps, projectDir: deps.projectDir };
  const llmRuntime = createLocalKtxLlmRuntimeFromConfig(llmConfig, runtimeDeps);
  if (!llmRuntime) {
    return { status: 'missing-llm' };
  }

  const suppliedEmbeddingProvider = deps.embeddingProvider ?? null;
  if (!suppliedEmbeddingProvider) {
    return { status: 'missing-embeddings-provider' };
  }

  const embedding = new KtxScanEmbeddingPortAdapter(suppliedEmbeddingProvider);
  return { status: 'ready', providers: { llmRuntime, embedding } };
}
|
||||
|
||||
/**
 * Chooses the enrichment state store for a scan.
 *
 * Dry runs never get a store. Otherwise an explicitly supplied store
 * (including `null`) is used as-is, and only an omitted option falls back to
 * a SQLite store at the project's local state database path.
 */
function createLocalScanEnrichmentStateStore(options: RunLocalScanOptions): SqliteLocalScanEnrichmentStateStore | null {
  if (options.dryRun) {
    return null;
  }
  const suppliedStore = options.enrichmentStateStore;
  return suppliedStore !== undefined
    ? suppliedStore
    : new SqliteLocalScanEnrichmentStateStore({ dbPath: ktxLocalStateDbPath(options.project) });
}
|
||||
|
||||
/**
 * Summarizes the provider-related configuration of a scan as a plain record.
 *
 * Missing values are reported as `null`; the gateway base URL is reduced to a
 * boolean so the URL itself is not part of the identity.
 */
function localScanProviderIdentity(
  config: KtxScanEnrichmentConfig,
  llmConfig: KtxProjectLlmConfig,
  relationships: KtxScanRelationshipConfig,
): Record<string, unknown> {
  const embeddings = config.embeddings;
  const gatewayBaseUrl = llmConfig.provider.gateway?.base_url;
  // Key order is kept stable on purpose; consumers may serialize this record.
  const identity: Record<string, unknown> = {
    mode: config.mode,
    embeddingDimensions: embeddings?.dimensions ?? null,
    llmModel: llmConfig.models.default ?? null,
    embeddingModel: embeddings?.model ?? null,
    batchSize: embeddings?.batchSize ?? null,
    baseUrlConfigured: Boolean(gatewayBaseUrl),
    relationships,
  };
  return identity;
}
|
||||
|
||||
/**
 * Builds the initial scan report for a finished ingest run.
 *
 * The report starts with every enrichment stage marked `skipped`, zeroed
 * structural-sync stats and no warnings; later scan steps fill those in.
 * Artifact paths are `null` for dry runs because nothing is persisted.
 *
 * @param input.record - Ingest run record the report describes.
 * @param input.driver - Normalized driver of the scanned connection.
 * @param input.mode - Scan mode recorded in the report.
 * @param input.dryRun - Whether the scan is a dry run.
 * @param input.trigger - Trigger recorded in the report.
 * @param input.createdAt - ISO timestamp recorded as the report creation time.
 * @returns A fresh report object.
 */
function reportFromIngest(input: {
  record: LocalIngestRunRecord;
  driver: KtxConnectionDriver;
  mode: KtxScanMode;
  dryRun: boolean;
  trigger: KtxScanTrigger;
  createdAt: string;
}): KtxScanReport {
  const { record, dryRun } = input;
  const reportPath = dryRun ? null : scanReportPath(record.connectionId, record.syncId);
  const rawDir = dryRun ? null : rawSourcesDir(record.connectionId, record.syncId);
  return {
    connectionId: record.connectionId,
    driver: input.driver,
    syncId: record.syncId,
    runId: record.runId,
    trigger: input.trigger,
    mode: input.mode,
    dryRun,
    artifactPaths: {
      rawSourcesDir: rawDir,
      reportPath,
      manifestShards: [],
      enrichmentArtifacts: [],
    },
    // Shared with getLocalScanReport so both derive table counts the same way.
    diffSummary: scanDiffSummaryFromRecord(record),
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'skipped',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
    enrichmentState: completedKtxScanEnrichmentStateSummary(),
    createdAt: input.createdAt,
  };
}
|
||||
|
||||
/**
 * Persists the scan report through the project's file store.
 *
 * Does nothing when the report has no `reportPath` (as is the case for dry
 * runs). The report is written as pretty-printed JSON with a trailing newline.
 */
async function writeScanReport(project: KtxLocalProject, report: KtxScanReport): Promise<void> {
  const destination = report.artifactPaths.reportPath;
  if (destination) {
    const serialized = `${JSON.stringify(report, null, 2)}\n`;
    const commitMessage = `scan(${LIVE_DATABASE_ADAPTER}): ${report.runId} syncId=${report.syncId}`;
    await project.fileStore.writeFile(destination, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);
  }
}
|
||||
|
||||
/**
 * Derives the report's table-level diff summary from an ingest record's diff
 * paths. Column-level counts are not derived here and are reported as zero.
 */
function scanDiffSummaryFromRecord(record: LocalIngestRunRecord): KtxScanReport['diffSummary'] {
  const { added, modified, deleted, unchanged } = record.diffPaths;
  return {
    tablesAdded: tablePathCount(added),
    tablesModified: tablePathCount(modified),
    tablesDeleted: tablePathCount(deleted),
    tablesUnchanged: tablePathCount(unchanged),
    columnsAdded: 0,
    columnsModified: 0,
    columnsDeleted: 0,
  };
}
|
||||
|
||||
/**
 * Tells whether an ingest run followed a previous run and changed nothing:
 * no added, modified or deleted entries. A first run (no previous run id)
 * never qualifies.
 */
function hasNoContentChanges(record: LocalIngestRunRecord): boolean {
  if (record.previousRunId === null) {
    return false;
  }
  const { added, modified, deleted } = record.diffSummary;
  return added === 0 && modified === 0 && deleted === 0;
}
|
||||
|
||||
/**
 * Formats a one-line progress message describing how many tables changed
 * (added + modified + deleted) out of the total number of tables seen.
 */
function scanChangeSummary(diffSummary: KtxScanReport['diffSummary']): string {
  const { tablesAdded, tablesModified, tablesDeleted, tablesUnchanged } = diffSummary;
  const changed = tablesAdded + tablesModified + tablesDeleted;
  const total = changed + tablesUnchanged;
  const pluralize = (count: number, singular: string): string => (count === 1 ? singular : `${singular}s`);
  return `Semantic layer comparison found ${changed} ${pluralize(changed, 'change')} across ${total} ${pluralize(total, 'table')}`;
}
|
||||
|
||||
/**
 * Loads a previously written scan report from the project's file store.
 *
 * @returns The parsed report, or `null` when the file does not exist
 *   (an error whose `code` is `ENOENT`).
 * @throws Error wrapping any other read or JSON parse failure.
 */
async function readScanReport(
  project: KtxLocalProject,
  connectionId: string,
  syncId: string,
): Promise<KtxScanReport | null> {
  const location = scanReportPath(connectionId, syncId);
  try {
    const stored = await project.fileStore.readFile(location);
    // The stored JSON is trusted to have the report shape; it is not validated here.
    return JSON.parse(stored.content) as KtxScanReport;
  } catch (error) {
    const missingFile = error instanceof Error && (error as NodeJS.ErrnoException).code === 'ENOENT';
    if (missingFile) {
      return null;
    }
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read scan report for ${connectionId}/${syncId}: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Wraps a connector so that its introspection result only contains the
 * enabled tables. All other members are copied from the wrapped connector.
 *
 * NOTE(review): object spread copies only own enumerable properties. If a
 * connector is ever a class instance with prototype methods, those methods
 * would be missing on the wrapper — confirm against connector implementations.
 */
function createFilteredConnector(connector: KtxScanConnector, enabledTables: Set<string>): KtxScanConnector {
  const introspect = async (input: KtxScanInput, ctx: KtxScanContext): Promise<KtxSchemaSnapshot> => {
    const fullSnapshot = await connector.introspect(input, ctx);
    return filterSnapshotTables(fullSnapshot, enabledTables);
  };
  return { ...connector, introspect };
}
|
||||
|
||||
|
||||
/**
 * Ensures the live-database adapter is listed in the project's ingest
 * adapters. Returns the project unchanged when it already is; otherwise
 * returns a shallow copy with the adapter appended, leaving the input intact.
 */
function withInternalLiveDatabaseAdapter(project: KtxLocalProject): KtxLocalProject {
  const configuredAdapters = project.config.ingest.adapters;
  if (configuredAdapters.includes(LIVE_DATABASE_ADAPTER)) {
    return project;
  }
  const ingest = { ...project.config.ingest, adapters: [...configuredAdapters, LIVE_DATABASE_ADAPTER] };
  const config = { ...project.config, ingest };
  return { ...project, config };
}
|
||||
|
||||
/**
 * Runs a standalone scan of one configured connection.
 *
 * Steps, in order:
 * 1. Resolve the optional scan connector and validate the connection/driver.
 * 2. Run the stage-only live-database ingest and build the initial report.
 * 3. For connector-less scans with no content changes, reuse the artifacts of
 *    the existing report when its mode and dry-run flag match.
 * 4. Otherwise (and unless dry-run) write manifest shards from the structural
 *    snapshot, restricted to the enabled tables.
 * 5. When a connector is present, run enrichment and write its artifacts. An
 *    enrichment failure is recorded as a recoverable warning, not thrown.
 * 6. Redact the report, persist it (unless existing artifacts were reused),
 *    and return the result.
 *
 * @param options - Scan options; see `RunLocalScanOptions`.
 * @returns The finished scan result including the redacted report.
 * @throws Error when the mode is unsupported, a required connector is
 *   missing, the connection is not configured, or the driver is unsupported.
 *   Errors from ingest and artifact writing outside enrichment also propagate.
 */
export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalScanRunResult> {
  const mode = options.mode ?? 'structural';
  assertSupportedMode(mode);
  const dryRun = options.dryRun ?? false;
  await options.progress?.update(0.05, 'Preparing scan');
  const rawConnector = await resolveScanConnector(options, mode);

  const connection = options.project.config.connections[options.connectionId];
  if (!connection) {
    throw new Error(`Connection "${options.connectionId}" is not configured in ktx.yaml`);
  }
  const driver = normalizeDriver(connection.driver);
  const enabledTables = resolveEnabledTables(connection);
  const connector = rawConnector && enabledTables ? createFilteredConnector(rawConnector, enabledTables) : rawConnector;
  const adapters =
    options.adapters ??
    createDefaultLocalIngestAdapters(options.project, { databaseIntrospectionUrl: options.databaseIntrospectionUrl });

  // Explicit providers (including null) win; config is only consulted when the
  // option is omitted, and the resolution is kept so that a non-ready outcome
  // can be surfaced as a warning after enrichment.
  let enrichmentResolution: LocalScanEnrichmentProviderResolution | null = null;
  let enrichmentProviders: KtxLocalScanEnrichmentProviders | null = null;
  if (connector && (mode !== 'structural' || options.detectRelationships)) {
    if (options.enrichmentProviders !== undefined) {
      enrichmentProviders = options.enrichmentProviders;
    } else {
      enrichmentResolution = resolveLocalScanEnrichmentProviders(
        options.project.config.scan.enrichment,
        options.project.config.llm,
        {
          projectDir: options.project.projectDir,
          embeddingProvider: options.embeddingProvider ?? null,
        },
      );
      enrichmentProviders = enrichmentResolution.status === 'ready' ? enrichmentResolution.providers : null;
    }
  }

  await options.progress?.update(0.15, 'Inspecting database schema');
  const record = await runLocalStageOnlyIngest({
    project: withInternalLiveDatabaseAdapter(options.project),
    adapters,
    adapter: LIVE_DATABASE_ADAPTER,
    connectionId: options.connectionId,
    trigger: 'manual_resync',
    jobId: options.jobId,
    now: options.now,
    dryRun: options.dryRun,
  });
  await options.progress?.update(0.55, scanChangeSummary(scanDiffSummaryFromRecord(record)));
  let report = reportFromIngest({
    record,
    driver,
    mode,
    dryRun,
    trigger: options.trigger ?? 'cli',
    createdAt: (options.now?.() ?? new Date()).toISOString(),
  });

  // Connector-less scans that changed nothing can reuse the previous report's
  // artifacts instead of rewriting them.
  let reusedExistingScanArtifacts = false;
  const existingReport =
    !report.dryRun && !connector && hasNoContentChanges(record)
      ? await readScanReport(options.project, record.connectionId, record.syncId)
      : null;
  if (existingReport && existingReport.mode === mode && existingReport.dryRun === report.dryRun) {
    report.artifactPaths = existingReport.artifactPaths;
    report.capabilityGaps = existingReport.capabilityGaps;
    report.warnings = existingReport.warnings;
    report.relationships = existingReport.relationships;
    report.enrichment = existingReport.enrichment;
    report.enrichmentState = existingReport.enrichmentState;
    reusedExistingScanArtifacts = true;
  }

  const enrichmentStateStore = connector ? createLocalScanEnrichmentStateStore(options) : null;

  if (!reusedExistingScanArtifacts && !report.dryRun && report.artifactPaths.rawSourcesDir) {
    await options.progress?.update(0.7, 'Writing schema artifacts');
    const rawSnapshot = await readLocalScanStructuralSnapshot({
      project: options.project,
      connectionId: options.connectionId,
      driver,
      rawSourcesDir: report.artifactPaths.rawSourcesDir,
      extractedAtFallback: report.createdAt,
    });
    const structuralSnapshot = enabledTables ? filterSnapshotTables(rawSnapshot, enabledTables) : rawSnapshot;
    if (enabledTables && structuralSnapshot.tables.length < rawSnapshot.tables.length) {
      // The ingest diff counted tables that the enabled-table filter removed.
      // Deduct them from the summary, drawing from added, then unchanged,
      // then modified, never below zero.
      let remaining = rawSnapshot.tables.length - structuralSnapshot.tables.length;
      const summary = report.diffSummary;
      const deductFrom = (field: 'tablesAdded' | 'tablesUnchanged' | 'tablesModified'): void => {
        const take = Math.min(remaining, summary[field]);
        summary[field] -= take;
        remaining -= take;
      };
      deductFrom('tablesAdded');
      deductFrom('tablesUnchanged');
      deductFrom('tablesModified');
      // Re-announce the corrected summary at the current 0.7 mark; reporting it
      // at 0.6 moved the progress indicator backwards.
      await options.progress?.update(0.7, scanChangeSummary(report.diffSummary));
    }
    const manifestArtifacts = await writeLocalScanManifestShards({
      project: options.project,
      connectionId: options.connectionId,
      syncId: record.syncId,
      driver,
      snapshot: structuralSnapshot,
      dryRun: false,
    });
    report.artifactPaths.manifestShards = manifestArtifacts.manifestShards;
    report.manifestShardsWritten = manifestArtifacts.manifestShardsWritten;
  }

  if (connector) {
    try {
      await options.progress?.update(
        0.82,
        mode === 'relationships' || options.detectRelationships
          ? 'Detecting relationships'
          : 'Enriching schema metadata',
      );
      const relationshipSettings = options.project.config.scan.relationships;
      const enrichment = await runLocalScanEnrichment({
        connectionId: options.connectionId,
        mode,
        detectRelationships: options.detectRelationships,
        connector,
        context: { runId: record.runId, progress: options.progress?.startPhase(0.18) },
        providers: enrichmentProviders,
        stateStore: enrichmentStateStore,
        syncId: record.syncId,
        providerIdentity: localScanProviderIdentity(
          options.project.config.scan.enrichment,
          options.project.config.llm,
          relationshipSettings,
        ),
        relationshipSettings,
        now: options.now,
      });
      const artifacts = await writeLocalScanEnrichmentArtifacts({
        project: options.project,
        connectionId: options.connectionId,
        syncId: record.syncId,
        driver,
        enrichment,
        dryRun,
        relationshipSettings,
      });
      report.enrichment = enrichment.summary;
      report.relationships = enrichment.relationships;
      report.enrichmentState = enrichment.state;
      report.warnings.push(...enrichment.warnings);
      if (enrichmentResolution && enrichmentResolution.status !== 'ready' && enrichmentResolution.status !== 'disabled') {
        report.warnings.push(enrichmentResolutionWarning(enrichmentResolution.status));
      }
      report.artifactPaths.enrichmentArtifacts = artifacts.enrichmentArtifacts;
      report.artifactPaths.manifestShards = artifacts.manifestShards;
      report.manifestShardsWritten = artifacts.manifestShardsWritten;
    } catch (error) {
      // Enrichment is best-effort: the structural scan already succeeded, so
      // the failure is recorded in the report instead of failing the scan.
      const message = ktxScanErrorMessage(error);
      report.enrichment = failedKtxScanEnrichmentSummary(mode, options.detectRelationships ?? false);
      const stages = await enrichmentStateStore?.listRunStages(record.runId);
      if (stages) {
        // Rebuild the state summary from the persisted stages; every stage
        // that is not completed is reported as failed.
        const failureState: KtxScanEnrichmentStateSummary = completedKtxScanEnrichmentStateSummary();
        for (const stage of stages) {
          if (stage.status === 'completed') {
            failureState.completedStages.push(stage.stage);
          } else {
            failureState.failedStages.push(stage.stage);
          }
        }
        report.enrichmentState = failureState;
      }
      report.warnings.push({
        code: 'enrichment_failed',
        message: `KTX scan enrichment failed after structural scan completed: ${message}`,
        recoverable: true,
        metadata: { mode, detectRelationships: options.detectRelationships ?? false },
      });
    }
  }

  report = redactKtxScanReport(report);
  if (!reusedExistingScanArtifacts) {
    await writeScanReport(options.project, report);
  }
  await options.progress?.update(1, 'Scan completed');
  return {
    runId: record.runId,
    status: 'done',
    done: true,
    connectionId: record.connectionId,
    mode,
    dryRun,
    syncId: record.syncId,
    report,
  };
}
|
||||
|
||||
/**
 * Loads the stored scan report for an ingest run.
 *
 * The run id, sync id and diff summary of the returned report are taken from
 * the current ingest status rather than from the stored file.
 *
 * @returns The report, or `null` when the run is unknown, does not belong to
 *   the live-database adapter, or has no stored report.
 *
 * @internal
 */
export async function getLocalScanReport(project: KtxLocalProject, runId: string): Promise<KtxScanReport | null> {
  const ingestStatus = await getLocalStageOnlyIngestStatus(project, runId);
  const isScanRun = Boolean(ingestStatus) && ingestStatus?.adapter === LIVE_DATABASE_ADAPTER;
  if (!ingestStatus || !isScanRun) {
    return null;
  }
  const storedReport = await readScanReport(project, ingestStatus.connectionId, ingestStatus.syncId);
  return storedReport
    ? {
        ...storedReport,
        runId: ingestStatus.runId,
        syncId: ingestStatus.syncId,
        diffSummary: scanDiffSummaryFromRecord(ingestStatus),
      }
    : null;
}
|
||||
|
||||
/**
 * Returns the status of a live-database scan run.
 *
 * Run-level fields come from the ingest status; `mode`, `dryRun`,
 * `reportPath` and `warnings` come from the stored scan report and fall back
 * to `'structural'`, `false`, `null` and `[]` when no report exists.
 *
 * @returns The status, or `null` when the run is unknown or does not belong
 *   to the live-database adapter.
 *
 * @internal
 */
export async function getLocalScanStatus(
  project: KtxLocalProject,
  runId: string,
): Promise<LocalScanStatusResponse | null> {
  const status = await getLocalStageOnlyIngestStatus(project, runId);
  if (!status || status.adapter !== LIVE_DATABASE_ADAPTER) {
    return null;
  }
  // Read the report straight from the status already in hand. Going through
  // getLocalScanReport would look the status up a second time, and none of the
  // fields it overrides (runId, syncId, diffSummary) are used below.
  const report = await readScanReport(project, status.connectionId, status.syncId);
  return {
    runId: status.runId,
    status: status.status,
    done: status.done,
    connectionId: status.connectionId,
    mode: report?.mode ?? 'structural',
    dryRun: report?.dryRun ?? false,
    syncId: status.syncId,
    progress: status.progress,
    startedAt: status.startedAt,
    completedAt: status.completedAt,
    reportPath: report?.artifactPaths.reportPath ?? null,
    warnings: report?.warnings ?? [],
  };
}
|
||||
195
packages/cli/src/context/scan/local-structural-artifacts.test.ts
Normal file
195
packages/cli/src/context/scan/local-structural-artifacts.test.ts
Normal file
|
|
@ -0,0 +1,195 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../context/project/project.js';
|
||||
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
|
||||
|
||||
describe('readLocalScanStructuralSnapshot', () => {
|
||||
let tempDir: string;
|
||||
let project: KtxLocalProject;
|
||||
|
||||
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'ktx-local-structural-artifacts-'));
|
||||
project = await initKtxProject({
|
||||
projectDir: join(tempDir, 'project'),
|
||||
});
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('rebuilds a canonical snapshot from persisted live-database raw files', async () => {
|
||||
const rawRoot = 'raw-sources/warehouse/live-database/sync-1';
|
||||
await project.fileStore.writeFile(
|
||||
`${rawRoot}/connection.json`,
|
||||
`${JSON.stringify(
|
||||
{
|
||||
connectionId: 'warehouse',
|
||||
extractedAt: '2026-04-29T12:00:00.000Z',
|
||||
metadata: { source: 'sqlite-smoke' },
|
||||
tableCount: 2,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Seed connection artifact',
|
||||
);
|
||||
await project.fileStore.writeFile(
|
||||
`${rawRoot}/tables/customers.json`,
|
||||
`${JSON.stringify(
|
||||
{
|
||||
name: 'customers',
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
kind: 'table',
|
||||
comment: 'Customer table',
|
||||
estimatedRows: 12,
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: 'Customer id',
|
||||
},
|
||||
],
|
||||
foreignKeys: [],
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Seed customers artifact',
|
||||
);
|
||||
await project.fileStore.writeFile(
|
||||
`${rawRoot}/tables/orders.json`,
|
||||
`${JSON.stringify(
|
||||
{
|
||||
name: 'orders',
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
kind: 'table',
|
||||
comment: null,
|
||||
estimatedRows: 20,
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: null,
|
||||
},
|
||||
{
|
||||
name: 'customer_id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
],
|
||||
foreignKeys: [
|
||||
{
|
||||
fromColumn: 'customer_id',
|
||||
toCatalog: null,
|
||||
toDb: 'public',
|
||||
toTable: 'customers',
|
||||
toColumn: 'id',
|
||||
constraintName: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Seed orders artifact',
|
||||
);
|
||||
|
||||
const snapshot = await readLocalScanStructuralSnapshot({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
driver: 'sqlite',
|
||||
rawSourcesDir: rawRoot,
|
||||
extractedAtFallback: '2026-04-29T13:00:00.000Z',
|
||||
});
|
||||
|
||||
expect(snapshot).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
driver: 'sqlite',
|
||||
extractedAt: '2026-04-29T12:00:00.000Z',
|
||||
metadata: { source: 'sqlite-smoke' },
|
||||
tables: [
|
||||
{
|
||||
db: 'public',
|
||||
name: 'customers',
|
||||
comment: 'Customer table',
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: 'Customer id',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
foreignKeys: [
|
||||
{
|
||||
fromColumn: 'customer_id',
|
||||
toCatalog: null,
|
||||
toDb: 'public',
|
||||
toTable: 'customers',
|
||||
toColumn: 'id',
|
||||
constraintName: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
});
|
||||
|
||||
// Fallback path: connection.json carries no `extractedAt`, so the snapshot must
// report the caller-supplied `extractedAtFallback` instead.
it('uses the scan report timestamp when connection.json omits extractedAt', async () => {
  const syncDir = 'raw-sources/warehouse/live-database/sync-2';
  const fallbackTimestamp = '2026-04-29T13:00:00.000Z';
  const authorName = 'ktx';
  const authorEmail = 'ktx@example.com';

  // Connection artifact deliberately written without an extractedAt field.
  await project.fileStore.writeFile(
    `${syncDir}/connection.json`,
    '{"connectionId":"warehouse","metadata":{}}\n',
    authorName,
    authorEmail,
    'Seed connection artifact without extractedAt',
  );
  // A single minimal table artifact so the tables directory is populated.
  await project.fileStore.writeFile(
    `${syncDir}/tables/orders.json`,
    '{"name":"orders","catalog":null,"db":null,"kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n',
    authorName,
    authorEmail,
    'Seed orders artifact',
  );

  const { extractedAt } = await readLocalScanStructuralSnapshot({
    project,
    connectionId: 'warehouse',
    driver: 'postgres',
    rawSourcesDir: syncDir,
    extractedAtFallback: fallbackTimestamp,
  });

  expect(extractedAt).toBe('2026-04-29T13:00:00.000Z');
});
|
||||
});
|
||||
125
packages/cli/src/context/scan/local-structural-artifacts.ts
Normal file
125
packages/cli/src/context/scan/local-structural-artifacts.ts
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
import type { KtxLocalProject } from '../../context/project/project.js';
|
||||
import type {
|
||||
KtxConnectionDriver,
|
||||
KtxSchemaColumn,
|
||||
KtxSchemaForeignKey,
|
||||
KtxSchemaSnapshot,
|
||||
KtxSchemaTable,
|
||||
} from './types.js';
|
||||
|
||||
/** Arguments accepted by `readLocalScanStructuralSnapshot`. */
export interface ReadLocalScanStructuralSnapshotInput {
  /** Project whose `fileStore` is used to read and list the scan artifacts. */
  project: KtxLocalProject;
  /** Connection id reported when `connection.json` has no string `connectionId`. */
  connectionId: string;
  /** Driver copied verbatim onto the returned snapshot. */
  driver: KtxConnectionDriver;
  /** Directory that holds `connection.json` and the `tables/` artifacts. */
  rawSourcesDir: string;
  /** Timestamp reported when `connection.json` has no string `extractedAt`. */
  extractedAtFallback: string;
}
|
||||
|
||||
/**
 * Loosely typed view of a parsed `connection.json` artifact.
 *
 * Every field is `unknown` because the content comes straight from
 * `JSON.parse`; readers must narrow each field before using it.
 */
interface LiveDatabaseConnectionArtifact {
  /** Used as the snapshot connection id when it is a string. */
  connectionId?: unknown;
  /** Used as the snapshot extraction timestamp when it is a string. */
  extractedAt?: unknown;
  /** Used as the snapshot metadata when it is a plain object. */
  metadata?: unknown;
  /** Used as the snapshot scope when it is a plain object. */
  scope?: unknown;
}
|
||||
|
||||
/**
 * Type guard for plain object-like values.
 *
 * @returns `true` for any non-null, non-array value whose `typeof` is `'object'`.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
|
||||
|
||||
/**
 * Coerces an arbitrary value into a metadata record.
 *
 * @returns The value itself when `isRecord` accepts it, otherwise a fresh empty object.
 */
function metadataRecord(value: unknown): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  return {};
}
|
||||
|
||||
/**
 * Normalizes an optional string field read from an untyped artifact.
 *
 * @returns The string itself when `value` is a string, `undefined` when the
 *   field is absent (`undefined`), and `null` for every other value.
 */
function optionalStringOrNull(value: unknown): string | null | undefined {
  if (typeof value === 'string') {
    return value;
  }
  return value === undefined ? undefined : null;
}
|
||||
|
||||
/**
 * Validates one raw column entry from a table artifact and converts it into a
 * `KtxSchemaColumn`.
 *
 * `name`, `nativeType` and `normalizedType` must be strings, and
 * `dimensionType` must be one of `time`, `string`, `number` or `boolean`.
 * `nullable` and `primaryKey` are true only when the artifact holds the
 * literal `true`; `comment` becomes `null` unless it is a string.
 *
 * @param rawColumn Untyped column value taken from the parsed artifact.
 * @param path Artifact path, used only in the error message.
 * @throws Error when the entry is not an object or a required field is invalid.
 */
function parseColumn(rawColumn: unknown, path: string): KtxSchemaColumn {
  const invalidMessage = `Invalid KTX schema column artifact: ${path}`;
  if (!isRecord(rawColumn)) {
    throw new Error(invalidMessage);
  }

  const { name, nativeType, normalizedType, dimensionType } = rawColumn;
  if (typeof name !== 'string' || typeof nativeType !== 'string' || typeof normalizedType !== 'string') {
    throw new Error(invalidMessage);
  }
  if (
    dimensionType !== 'time' &&
    dimensionType !== 'string' &&
    dimensionType !== 'number' &&
    dimensionType !== 'boolean'
  ) {
    throw new Error(invalidMessage);
  }

  const nullable = rawColumn.nullable === true;
  const primaryKey = rawColumn.primaryKey === true;
  const comment = optionalStringOrNull(rawColumn.comment) ?? null;
  return { name, nativeType, normalizedType, dimensionType, nullable, primaryKey, comment };
}
|
||||
|
||||
/**
 * Validates one raw foreign-key entry from a table artifact and converts it
 * into a `KtxSchemaForeignKey`.
 *
 * `fromColumn`, `toTable` and `toColumn` must be strings. `toCatalog`, `toDb`
 * and `constraintName` become `null` unless they are strings.
 *
 * @param rawForeignKey Untyped foreign-key value taken from the parsed artifact.
 * @param path Artifact path, used only in the error message.
 * @throws Error when the entry is not an object or a required field is not a string.
 */
function parseForeignKey(rawForeignKey: unknown, path: string): KtxSchemaForeignKey {
  const invalidMessage = `Invalid KTX schema foreign key artifact: ${path}`;
  if (!isRecord(rawForeignKey)) {
    throw new Error(invalidMessage);
  }

  const { fromColumn, toTable, toColumn } = rawForeignKey;
  if (typeof fromColumn !== 'string' || typeof toTable !== 'string' || typeof toColumn !== 'string') {
    throw new Error(invalidMessage);
  }

  const toCatalog = optionalStringOrNull(rawForeignKey.toCatalog) ?? null;
  const toDb = optionalStringOrNull(rawForeignKey.toDb) ?? null;
  const constraintName = optionalStringOrNull(rawForeignKey.constraintName) ?? null;
  return { fromColumn, toCatalog, toDb, toTable, toColumn, constraintName };
}
|
||||
|
||||
/**
 * Parses the JSON text of one table artifact into a `KtxSchemaTable`.
 *
 * The artifact must be an object with a string `name` and an array `columns`.
 * Unrecognized `kind` values fall back to `'table'`, a non-array `foreignKeys`
 * is treated as empty, and optional string fields become `null` when they are
 * not strings.
 *
 * @param raw JSON text of the artifact.
 * @param path Artifact path, used in error messages.
 * @throws SyntaxError when `raw` is not valid JSON (from `JSON.parse`).
 * @throws Error when the artifact, a column, or a foreign key is malformed.
 */
function parseTable(raw: string, path: string): KtxSchemaTable {
  const artifact: unknown = JSON.parse(raw);
  if (!isRecord(artifact)) {
    throw new Error(`Invalid KTX schema table artifact: ${path}`);
  }

  const { name, columns: rawColumns, foreignKeys: rawForeignKeys, kind: rawKind, estimatedRows: rawRows } = artifact;
  if (typeof name !== 'string' || !Array.isArray(rawColumns)) {
    throw new Error(`Invalid KTX schema table artifact: ${path}`);
  }

  const catalog = optionalStringOrNull(artifact.catalog) ?? null;
  const db = optionalStringOrNull(artifact.db) ?? null;
  const kind = rawKind === 'view' || rawKind === 'external' || rawKind === 'event_stream' ? rawKind : 'table';
  const comment = optionalStringOrNull(artifact.comment) ?? null;
  const estimatedRows = typeof rawRows === 'number' ? rawRows : null;

  // Columns are converted before foreign keys so a malformed column is reported first.
  const columns = rawColumns.map((column) => parseColumn(column, path));
  const foreignKeys = Array.isArray(rawForeignKeys)
    ? rawForeignKeys.map((foreignKey) => parseForeignKey(foreignKey, path))
    : [];

  return { catalog, db, name, kind, comment, estimatedRows, columns, foreignKeys };
}
|
||||
|
||||
/**
 * Rebuilds a structural schema snapshot from the raw scan artifacts stored in
 * the project's file store.
 *
 * Reads `<rawSourcesDir>/connection.json`, then every `*.json` file listed
 * under `<rawSourcesDir>/tables` (in sorted path order), parsing each one with
 * `parseTable`.
 *
 * Field fallbacks:
 * - `connectionId` / `extractedAt` come from the connection artifact when they
 *   are strings, otherwise from `input.connectionId` / `input.extractedAtFallback`.
 * - `scope` and `metadata` come from the artifact when they are plain objects,
 *   otherwise they are `{}`.
 * - `driver` is always `input.driver`.
 *
 * @throws SyntaxError when an artifact is not valid JSON (from `JSON.parse`).
 * @throws Error when `connection.json` is not a JSON object, or when a table
 *   artifact is malformed.
 */
export async function readLocalScanStructuralSnapshot(
  input: ReadLocalScanStructuralSnapshotInput,
): Promise<KtxSchemaSnapshot> {
  const connectionPath = `${input.rawSourcesDir}/connection.json`;
  const connectionRaw = await input.project.fileStore.readFile(connectionPath);
  const parsedConnection: unknown = JSON.parse(connectionRaw.content);
  // Validate instead of casting: a `null` payload would otherwise crash with an
  // opaque TypeError on the first property access, and arrays or primitives
  // would be silently accepted as if they were an empty artifact.
  if (!isRecord(parsedConnection)) {
    throw new Error(`Invalid KTX connection artifact: ${connectionPath}`);
  }
  const connection: LiveDatabaseConnectionArtifact = parsedConnection;

  const listedTables = await input.project.fileStore.listFiles(`${input.rawSourcesDir}/tables`);
  // Sort so the table order in the snapshot does not depend on listing order.
  const tablePaths = listedTables.files.filter((path) => path.endsWith('.json')).sort();

  const tables: KtxSchemaTable[] = [];
  for (const path of tablePaths) {
    const tableRaw = await input.project.fileStore.readFile(path);
    tables.push(parseTable(tableRaw.content, path));
  }

  return {
    connectionId: typeof connection.connectionId === 'string' ? connection.connectionId : input.connectionId,
    driver: input.driver,
    extractedAt: typeof connection.extractedAt === 'string' ? connection.extractedAt : input.extractedAtFallback,
    scope: isRecord(connection.scope) ? connection.scope : {},
    metadata: metadataRecord(connection.metadata),
    tables,
  };
}
|
||||
|
|
@ -0,0 +1,451 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
buildKtxRelationshipBenchmarkReport,
|
||||
formatKtxRelationshipBenchmarkReportMarkdown,
|
||||
} from './relationship-benchmark-report.js';
|
||||
import type {
|
||||
KtxRelationshipBenchmarkCaseResult,
|
||||
KtxRelationshipBenchmarkFixture,
|
||||
KtxRelationshipBenchmarkSuiteResult,
|
||||
} from './relationship-benchmarks.js';
|
||||
|
||||
/**
 * Override bag for the `caseResult` test factory: every top-level field is
 * optional, and `metrics` may itself be supplied partially.
 */
type CaseResultOverrides = Partial<Omit<KtxRelationshipBenchmarkCaseResult, 'metrics'>> & {
  metrics?: Partial<KtxRelationshipBenchmarkCaseResult['metrics']>;
};
|
||||
|
||||
/**
 * Test factory for a benchmark case result.
 *
 * Starts from a baseline case (one missed PK, one accepted FK) and applies
 * `overrides`. Metrics are merged key by key over the baseline; every other
 * field is replaced wholesale when an override is provided.
 */
function caseResult(overrides: CaseResultOverrides = {}): KtxRelationshipBenchmarkCaseResult {
  const baselineMetrics: KtxRelationshipBenchmarkCaseResult['metrics'] = {
    pkPrecision: 1,
    pkRecall: 0.5,
    pkF1: 0.6666666666666666,
    fkPrecision: 1,
    fkRecall: 1,
    fkF1: 1,
    acceptedFalsePositiveCount: 0,
    reviewRecall: 0,
    acceptedOrReviewRecall: 1,
    runtimeSeconds: 0.012345,
    sqlQueries: 14,
    llmCalls: 0,
  };
  const metrics = { ...baselineMetrics, ...(overrides.metrics ?? {}) };

  const fixtureId = overrides.fixtureId ?? 'demo_b2b_no_declared_constraints';
  const mode = overrides.mode ?? 'declared_pks_and_declared_fks_removed';
  const expected = overrides.expected ?? {
    pk: ['accounts.(id)', 'users.(id)'],
    fk: ['users.(account_id)->accounts.(id)'],
  };
  const predicted = overrides.predicted ?? {
    pk: ['accounts.(id)'],
    fk: ['users.(account_id)->accounts.(id)'],
    acceptedFk: ['users.(account_id)->accounts.(id)'],
    reviewFk: [],
  };
  const falsePositives = overrides.falsePositives ?? { pk: [], fk: [] };
  const falseNegatives = overrides.falseNegatives ?? { pk: ['users.(id)'], fk: [] };
  const skippedComposite = overrides.skippedComposite ?? { pk: [], fk: [] };
  const validationBlocked = overrides.validationBlocked ?? false;

  return {
    fixtureId,
    mode,
    metrics,
    expected,
    predicted,
    falsePositives,
    falseNegatives,
    skippedComposite,
    validationBlocked,
  };
}
|
||||
|
||||
/**
 * Test factory for a benchmark fixture.
 *
 * Defaults describe a smoke-tier synthetic fixture with an empty sqlite
 * snapshot. `thresholdEligible` and `validationBudget` have no default and are
 * passed through exactly as given (possibly `undefined`).
 */
function fixture(overrides: Partial<KtxRelationshipBenchmarkFixture> = {}): KtxRelationshipBenchmarkFixture {
  const { thresholdEligible, validationBudget } = overrides;
  const snapshot = overrides.snapshot ?? {
    connectionId: 'demo_b2b',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [],
  };

  return {
    id: overrides.id ?? 'demo_b2b_no_declared_constraints',
    name: overrides.name ?? 'Packaged B2B demo with declared PK and FK metadata masked',
    tier: overrides.tier ?? 'smoke',
    origin: overrides.origin ?? 'synthetic',
    thresholdEligible,
    validationBudget,
    snapshot,
    expected: overrides.expected ?? { expectedPks: [], expectedLinks: [] },
    defaultModes: overrides.defaultModes ?? ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
    dataPath: overrides.dataPath ?? '/tmp/demo.sqlite',
    columnEmbeddings: overrides.columnEmbeddings ?? {},
  };
}
|
||||
|
||||
describe('relationship benchmark report', () => {
|
||||
// Exercises all three report statuses: a plain run, a validation-blocked run,
// and a requested mode that the fixture's defaultModes never selected.
it('classifies run, validation-blocked, and not-run benchmark cases', () => {
  const completedCase = caseResult();
  const blockedCase = caseResult({
    mode: 'validation_disabled',
    validationBlocked: true,
    metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
    predicted: {
      pk: ['accounts.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
      acceptedFk: [],
      reviewFk: ['users.(account_id)->accounts.(id)'],
    },
  });
  const suite: KtxRelationshipBenchmarkSuiteResult = {
    cases: [completedCase, blockedCase],
    validationBlockedCases: ['demo_b2b_no_declared_constraints:validation_disabled'],
    aggregate: {
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.5,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 1,
    },
  };

  const { headline, cases: reportCases } = buildKtxRelationshipBenchmarkReport({
    fixtures: [fixture()],
    suite,
    modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled', 'profiling_disabled'],
  });

  expect(headline).toEqual({
    caseCount: 2,
    headlineCaseCount: 1,
    headlinePkRecall: 0.5,
    headlineFkRecall: 1,
    headlineAcceptedOrReviewRecall: 1,
    acceptedFalsePositiveCount: 0,
    validationBlockedCount: 1,
  });

  const statusKeys = reportCases.map((item) => `${item.fixtureId}:${item.mode}:${item.status}`);
  expect(statusKeys).toEqual([
    'demo_b2b_no_declared_constraints:declared_pks_and_declared_fks_removed:run',
    'demo_b2b_no_declared_constraints:validation_disabled:validation_blocked',
    'demo_b2b_no_declared_constraints:profiling_disabled:not_run',
  ]);
  expect(reportCases[2]?.reason).toBe('mode not selected by fixture defaultModes');
});
|
||||
|
||||
// A fixture with a validation budget and one FK left in review must explain
// the review candidate in the case reason and in the Markdown output.
it('surfaces validation budget review candidates in the report reason', () => {
  const fixtureId = 'scale_stress_no_declared_constraints';
  const acceptedFk = 'fact_activity_000.(entity_00_key)->dim_entity_00.(entity_00_key)';
  const reviewFk = 'fact_activity_001.(entity_00_key)->dim_entity_00.(entity_00_key)';

  const budgetedCase = caseResult({
    fixtureId,
    metrics: { fkRecall: 0.5, acceptedOrReviewRecall: 1 },
    predicted: {
      pk: ['dim_entity_00.(entity_00_key)'],
      fk: [acceptedFk, reviewFk],
      acceptedFk: [acceptedFk],
      reviewFk: [reviewFk],
    },
  });
  const suite: KtxRelationshipBenchmarkSuiteResult = {
    cases: [budgetedCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 0,
      headlinePkRecall: 1,
      headlineFkRecall: 0.5,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 1,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 1,
    },
  };
  const budgetedFixture = fixture({
    id: fixtureId,
    name: 'Scale stress fixture',
    tier: 'row_bearing',
    validationBudget: 800,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [budgetedFixture],
    suite,
    modes: ['declared_pks_and_declared_fks_removed'],
  });

  expect(report.cases[0]?.reason).toBe('review candidate validation reasons: validation_unattempted (1)');
  expect(formatKtxRelationshipBenchmarkReportMarkdown(report)).toContain('validation_unattempted');
});
|
||||
|
||||
// Tuning eligibility is expected only for the unblocked product-tier case;
// blocked, smoke-tier, and not-run rows must all report false.
it('uses benchmark suite eligibility for product and smoke report rows', () => {
  const productId = 'product_curated';
  const smokeId = 'smoke_even_if_marked';

  const productCase = caseResult({ fixtureId: productId });
  const productBlocked = caseResult({
    fixtureId: productId,
    mode: 'validation_disabled',
    validationBlocked: true,
    metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
  });
  const smokeCase = caseResult({ fixtureId: smokeId });

  const suite: KtxRelationshipBenchmarkSuiteResult = {
    cases: [productCase, productBlocked, smokeCase],
    validationBlockedCases: ['product_curated:validation_disabled'],
    aggregate: {
      caseCount: 3,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.5,
      meanFkRecall: 0.6666666666666666,
      meanAcceptedOrReviewRecall: 1,
    },
  };

  const productFixture = fixture({
    id: productId,
    name: 'Curated product fixture',
    tier: 'product',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
  });
  const smokeFixture = fixture({
    id: smokeId,
    name: 'Marked smoke fixture',
    tier: 'smoke',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [productFixture, smokeFixture],
    suite,
    modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
  });

  const eligibilityKeys = report.cases.map((item) => `${item.fixtureId}:${item.mode}:${item.tuningEligible}`);
  expect(eligibilityKeys).toEqual([
    'product_curated:declared_pks_and_declared_fks_removed:true',
    'product_curated:validation_disabled:false',
    'smoke_even_if_marked:declared_pks_and_declared_fks_removed:false',
    'smoke_even_if_marked:validation_disabled:false',
  ]);

  const markdown = formatKtxRelationshipBenchmarkReportMarkdown(report);
  expect(markdown).toContain('| product_curated | product | declared_pks_and_declared_fks_removed | run | yes |');
});
|
||||
|
||||
// Checks the Markdown title, the per-case table row, and the bullet lines
// emitted for a missed PK and a missed FK.
it('formats a compact Markdown report with false negatives and blocked modes', () => {
  const missedEverything = caseResult({
    metrics: { fkRecall: 0, acceptedOrReviewRecall: 0 },
    falseNegatives: { pk: ['users.(id)'], fk: ['users.(account_id)->accounts.(id)'] },
  });
  const suite: KtxRelationshipBenchmarkSuiteResult = {
    cases: [missedEverything],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0.5,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [fixture()],
    suite,
    modes: ['declared_pks_and_declared_fks_removed'],
  });
  const markdown = formatKtxRelationshipBenchmarkReportMarkdown(report);

  const expectedFragments = [
    '# KTX Relationship Discovery Benchmark Evidence',
    '| demo_b2b_no_declared_constraints | smoke | declared_pks_and_declared_fks_removed | run | no | 0.500 | 0.000 | 0.000 | 0 |',
    '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(id)',
    '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(account_id)->accounts.(id)',
  ];
  for (const fragment of expectedFragments) {
    expect(markdown).toContain(fragment);
  }
});
|
||||
|
||||
// The headline case has no misses, so the headline FK section must read
// "- none" while the non-headline (embeddings_disabled) misses are still listed.
it('keeps headline failures separate from non-headline failure details', () => {
  const cleanHeadlineCase = caseResult({
    fixtureId: 'product_curated',
    falseNegatives: { pk: [], fk: [] },
    metrics: { pkRecall: 1, fkRecall: 1, acceptedOrReviewRecall: 1 },
  });
  const ablationCase = caseResult({
    fixtureId: 'product_curated',
    mode: 'embeddings_disabled',
    falseNegatives: {
      pk: ['customers.(id)'],
      fk: ['orders.(buyer_ref)->customers.(id)'],
    },
    metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
  });
  const suite: KtxRelationshipBenchmarkSuiteResult = {
    cases: [cleanHeadlineCase, ablationCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 1,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.75,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 0.5,
    },
  };
  const productFixture = fixture({
    id: 'product_curated',
    name: 'Curated product fixture',
    tier: 'product',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [productFixture],
    suite,
    modes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
  });
  const markdown = formatKtxRelationshipBenchmarkReportMarkdown(report);

  expect(markdown).toContain('## Failure Details');
  expect(markdown).toContain('### Headline False Negative FKs\n\n- none');
  expect(markdown).toContain('- `product_curated` / `embeddings_disabled` / `run`: orders.(buyer_ref)->customers.(id)');
  expect(markdown).toContain('- `product_curated` / `embeddings_disabled` / `run`: customers.(id)');
});
|
||||
|
||||
// A threshold-eligible row-bearing fixture with one missed PK and one missed FK
// must produce the "Headline Failure Context" section with counts and bullets.
it('formats headline failure context from remaining headline false negatives', () => {
  const missedPk = 'parent_table.(opaque_key)';
  const missedFk = 'child_table.(parent_table_id)->parent_table.(opaque_key)';

  const headlineCase = caseResult({
    fixtureId: 'public_headline_fixture',
    metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
    falseNegatives: { pk: [missedPk], fk: [missedFk] },
  });
  const suite: KtxRelationshipBenchmarkSuiteResult = {
    cases: [headlineCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0.5,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };
  const headlineFixture = fixture({
    id: 'public_headline_fixture',
    name: 'Public headline fixture',
    tier: 'row_bearing',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [headlineFixture],
    suite,
    modes: ['declared_pks_and_declared_fks_removed'],
  });
  const markdown = formatKtxRelationshipBenchmarkReportMarkdown(report);

  expect(markdown).toContain('## Headline Failure Context');
  expect(markdown).toContain('- Remaining headline false-negative PKs: 1');
  expect(markdown).toContain('- Remaining headline false-negative FKs: 1');
  expect(markdown).toContain(
    '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: parent_table.(opaque_key)',
  );
  expect(markdown).toContain(
    '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: child_table.(parent_table_id)->parent_table.(opaque_key)',
  );
});
|
||||
|
||||
// Composite keys recorded as skipped must be carried onto the report case and
// rendered in their own Markdown section, alongside the false-negative section.
it('formats skipped composite ground truth separately from false-negative details', () => {
  const compositePk = 'order_lines.(order_id,line_number)';
  const compositeFk = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';
  const compositeFixtureId = 'composite_keys_no_declared_constraints';

  const compositeCase = caseResult({
    fixtureId: compositeFixtureId,
    metrics: { pkRecall: 0, fkRecall: 0, acceptedOrReviewRecall: 0 },
    expected: { pk: [compositePk], fk: [compositeFk] },
    predicted: { pk: [], fk: [], acceptedFk: [], reviewFk: [] },
    falseNegatives: { pk: [compositePk], fk: [compositeFk] },
    skippedComposite: { pk: [compositePk], fk: [compositeFk] },
  });
  const suite: KtxRelationshipBenchmarkSuiteResult = {
    cases: [compositeCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };
  const compositeFixture = fixture({
    id: compositeFixtureId,
    name: 'Composite key fixture with no declared constraints',
    tier: 'row_bearing',
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const report = buildKtxRelationshipBenchmarkReport({
    fixtures: [compositeFixture],
    suite,
    modes: ['declared_pks_and_declared_fks_removed'],
  });

  expect(report.cases[0]?.skippedComposite).toEqual({
    pk: [compositePk],
    fk: [compositeFk],
  });

  const markdown = formatKtxRelationshipBenchmarkReportMarkdown(report);
  expect(markdown).toContain('## Composite Ground Truth Skips');
  expect(markdown).toContain(
    '### Skipped Composite PKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_lines.(order_id,line_number)',
  );
  expect(markdown).toContain(
    '### Skipped Composite FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
  );
  expect(markdown).toContain(
    '### Headline False Negative FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
  );
});
|
||||
});
|
||||
363
packages/cli/src/context/scan/relationship-benchmark-report.ts
Normal file
363
packages/cli/src/context/scan/relationship-benchmark-report.ts
Normal file
|
|
@ -0,0 +1,363 @@
|
|||
import { isKtxRelationshipBenchmarkTuningEligible } from './relationship-benchmarks.js';
|
||||
import type {
|
||||
KtxRelationshipBenchmarkCaseResult,
|
||||
KtxRelationshipBenchmarkFixture,
|
||||
KtxRelationshipBenchmarkMode,
|
||||
KtxRelationshipBenchmarkSuiteResult,
|
||||
} from './relationship-benchmarks.js';
|
||||
|
||||
/**
 * Outcome of one fixture/mode pair in the report: `run` for a result that was
 * not validation-blocked, `validation_blocked` for a blocked result, and
 * `not_run` when the suite holds no result for the pair.
 */
export type KtxRelationshipBenchmarkReportCaseStatus =
  | 'run'
  | 'validation_blocked'
  | 'not_run';
|
||||
|
||||
/** One row of the benchmark report: a single fixture evaluated in a single mode. */
export interface KtxRelationshipBenchmarkReportCase {
  /** Id of the fixture this row belongs to. */
  fixtureId: string;
  /** Name of the fixture. */
  fixtureName: string;
  /** Tier copied from the fixture. */
  tier: string;
  /** Benchmark mode this row describes. */
  mode: KtxRelationshipBenchmarkMode;
  /** Whether the case ran, was validation-blocked, or has no result. */
  status: KtxRelationshipBenchmarkReportCaseStatus;
  /** Explanation attached to the row, or `null` when there is none. */
  reason: string | null;
  /** Whether the row counts as a headline (tuning-eligible) case. */
  tuningEligible: boolean;
  /** Metrics copied from the case result; every value is `null` for `not_run` rows. */
  metrics: {
    pkRecall: number | null;
    fkRecall: number | null;
    acceptedOrReviewRecall: number | null;
    acceptedFalsePositiveCount: number | null;
    sqlQueries: number | null;
    llmCalls: number | null;
    runtimeSeconds: number | null;
  };
  /** False-positive PK and FK keys. */
  falsePositives: {
    pk: string[];
    fk: string[];
  };
  /** False-negative PK and FK keys. */
  falseNegatives: {
    pk: string[];
    fk: string[];
  };
  /** Composite PK and FK keys recorded as skipped. */
  skippedComposite: {
    pk: string[];
    fk: string[];
  };
}
|
||||
|
||||
/** Full benchmark report: headline numbers plus one entry per fixture/mode pair. */
export interface KtxRelationshipBenchmarkReport {
  /** Timestamp string recorded when the report was built. */
  generatedAt: string;
  /** Suite-level summary numbers. */
  headline: {
    caseCount: number;
    headlineCaseCount: number;
    headlinePkRecall: number;
    headlineFkRecall: number;
    headlineAcceptedOrReviewRecall: number;
    /** Sum of accepted false positives over every suite case. */
    acceptedFalsePositiveCount: number;
    /** Number of entries in the suite's validation-blocked list. */
    validationBlockedCount: number;
  };
  /** Report rows, grouped by fixture and then by requested mode. */
  cases: KtxRelationshipBenchmarkReportCase[];
}
|
||||
|
||||
/** Builds the `fixtureId:mode` lookup key used to match suite results to fixtures. */
function key(fixtureId: string, mode: KtxRelationshipBenchmarkMode): string {
  return [fixtureId, mode].join(':');
}
|
||||
|
||||
/** Formats a metric with three decimals, or `-` when the metric is `null`. */
function fixed(value: number | null): string {
  if (value === null) {
    return '-';
  }
  return value.toFixed(3);
}
|
||||
|
||||
/**
 * Derives the explanatory reason attached to a report case that has a result.
 *
 * @returns A fixed message when the result is validation-blocked; a
 *   review-candidate message (with the count of review FKs) when the fixture
 *   declares a validation budget and at least one FK is left in review;
 *   otherwise `null`.
 */
function reportCaseReason(input: {
  fixture: KtxRelationshipBenchmarkFixture;
  result: KtxRelationshipBenchmarkCaseResult;
}): string | null {
  const { fixture, result } = input;

  if (result.validationBlocked) {
    return 'validation unavailable for this benchmark mode';
  }
  // Without a validation budget there is nothing to report about review candidates.
  if (fixture.validationBudget === undefined) {
    return null;
  }

  const reviewCount = result.predicted.reviewFk.length;
  return reviewCount > 0 ? `review candidate validation reasons: validation_unattempted (${reviewCount})` : null;
}
|
||||
|
||||
/**
 * Converts a suite case result into a report case for the given fixture and mode.
 *
 * Status is `validation_blocked` when the result is blocked and `run`
 * otherwise. The reported metrics are a subset of the result metrics, and the
 * false-positive, false-negative and skipped-composite objects are passed
 * through by reference rather than copied.
 */
function reportCaseFromResult(input: {
  fixture: KtxRelationshipBenchmarkFixture;
  mode: KtxRelationshipBenchmarkMode;
  result: KtxRelationshipBenchmarkCaseResult;
}): KtxRelationshipBenchmarkReportCase {
  const { fixture, mode, result } = input;
  const { validationBlocked, metrics } = result;

  const status: KtxRelationshipBenchmarkReportCaseStatus = validationBlocked ? 'validation_blocked' : 'run';
  const reason = reportCaseReason({ fixture, result });
  const tuningEligible = isKtxRelationshipBenchmarkTuningEligible({
    fixture,
    mode,
    validationBlocked,
  });

  return {
    fixtureId: fixture.id,
    fixtureName: fixture.name,
    tier: fixture.tier,
    mode,
    status,
    reason,
    tuningEligible,
    metrics: {
      pkRecall: metrics.pkRecall,
      fkRecall: metrics.fkRecall,
      acceptedOrReviewRecall: metrics.acceptedOrReviewRecall,
      acceptedFalsePositiveCount: metrics.acceptedFalsePositiveCount,
      sqlQueries: metrics.sqlQueries,
      llmCalls: metrics.llmCalls,
      runtimeSeconds: metrics.runtimeSeconds,
    },
    falsePositives: result.falsePositives,
    falseNegatives: result.falseNegatives,
    skippedComposite: result.skippedComposite,
  };
}
|
||||
|
||||
/**
 * Builds the placeholder report case for a fixture/mode pair with no result:
 * status `not_run`, never tuning-eligible, all metrics `null`, and fresh empty
 * key lists.
 */
function notRunCase(input: {
  fixture: KtxRelationshipBenchmarkFixture;
  mode: KtxRelationshipBenchmarkMode;
  reason: string;
}): KtxRelationshipBenchmarkReportCase {
  const { fixture, mode, reason } = input;
  // Called once per field so no two fields share the same arrays.
  const emptyKeys = (): { pk: string[]; fk: string[] } => ({ pk: [], fk: [] });

  return {
    fixtureId: fixture.id,
    fixtureName: fixture.name,
    tier: fixture.tier,
    mode,
    status: 'not_run',
    reason,
    tuningEligible: false,
    metrics: {
      pkRecall: null,
      fkRecall: null,
      acceptedOrReviewRecall: null,
      acceptedFalsePositiveCount: null,
      sqlQueries: null,
      llmCalls: null,
      runtimeSeconds: null,
    },
    falsePositives: emptyKeys(),
    falseNegatives: emptyKeys(),
    skippedComposite: emptyKeys(),
  };
}
|
||||
|
||||
/**
 * Assembles the benchmark report from the fixtures, the suite result, and the
 * requested modes.
 *
 * One report case is produced for every fixture × requested-mode pair, in
 * fixture order and then mode order. Pairs with a matching suite result are
 * converted from it; pairs without one become `not_run` cases whose reason
 * says whether the fixture's `defaultModes` selected that mode.
 *
 * Headline numbers are copied from the suite aggregate, except
 * `acceptedFalsePositiveCount` (summed over all suite cases) and
 * `validationBlockedCount` (length of the suite's blocked list).
 *
 * @param input.generatedAt Optional timestamp; defaults to the current time as an ISO string.
 */
export function buildKtxRelationshipBenchmarkReport(input: {
  fixtures: readonly KtxRelationshipBenchmarkFixture[];
  suite: KtxRelationshipBenchmarkSuiteResult;
  modes: readonly KtxRelationshipBenchmarkMode[];
  generatedAt?: string;
}): KtxRelationshipBenchmarkReport {
  const { suite } = input;

  // Index suite results by fixture/mode; a later duplicate replaces an earlier one.
  const resultsByKey = new Map<string, KtxRelationshipBenchmarkCaseResult>();
  for (const result of suite.cases) {
    resultsByKey.set(key(result.fixtureId, result.mode), result);
  }

  const cases = input.fixtures.flatMap((fixture) => {
    const selectedModes = new Set(fixture.defaultModes);
    return input.modes.map((mode): KtxRelationshipBenchmarkReportCase => {
      const result = resultsByKey.get(key(fixture.id, mode));
      if (result) {
        return reportCaseFromResult({ fixture, mode, result });
      }
      const reason = selectedModes.has(mode)
        ? 'mode produced no benchmark result'
        : 'mode not selected by fixture defaultModes';
      return notRunCase({ fixture, mode, reason });
    });
  });

  let acceptedFalsePositiveCount = 0;
  for (const result of suite.cases) {
    acceptedFalsePositiveCount += result.metrics.acceptedFalsePositiveCount;
  }

  return {
    generatedAt: input.generatedAt ?? new Date().toISOString(),
    headline: {
      caseCount: suite.aggregate.caseCount,
      headlineCaseCount: suite.aggregate.headlineCaseCount,
      headlinePkRecall: suite.aggregate.headlinePkRecall,
      headlineFkRecall: suite.aggregate.headlineFkRecall,
      headlineAcceptedOrReviewRecall: suite.aggregate.headlineAcceptedOrReviewRecall,
      acceptedFalsePositiveCount,
      validationBlockedCount: suite.validationBlockedCases.length,
    },
    cases,
  };
}
|
||||
|
||||
/** Picks the list of failure keys (for example false-negative FKs) to report for one case. */
type KtxRelationshipBenchmarkFailureSelector = (item: KtxRelationshipBenchmarkReportCase) => readonly string[];
|
||||
|
||||
/**
 * Renders one Markdown bullet per selected failure value across all cases.
 *
 * Bullets are ordered by `localeCompare` on the combined
 * `fixtureId:mode:status:value` key.
 */
function sortedFailureLines(input: {
  cases: readonly KtxRelationshipBenchmarkReportCase[];
  select: KtxRelationshipBenchmarkFailureSelector;
}): string[] {
  const entries: { sortKey: string; line: string }[] = [];

  input.cases.forEach((item) => {
    input.select(item).forEach((value) => {
      entries.push({
        sortKey: `${item.fixtureId}:${item.mode}:${item.status}:${value}`,
        line: `- \`${item.fixtureId}\` / \`${item.mode}\` / \`${item.status}\`: ${value}`,
      });
    });
  });

  entries.sort((left, right) => left.sortKey.localeCompare(right.sortKey));
  return entries.map((entry) => entry.line);
}
|
||||
|
||||
/**
 * Builds a `###` markdown subsection listing the selected failures.
 *
 * The section starts with a blank line, then the heading, then another blank
 * line; when no failures are selected the body is the single bullet `- none`.
 */
function failureBlock(input: {
  title: string;
  cases: readonly KtxRelationshipBenchmarkReportCase[];
  select: KtxRelationshipBenchmarkFailureSelector;
}): string[] {
  const { title, cases, select } = input;
  const bullets = sortedFailureLines({ cases, select });
  const body = bullets.length === 0 ? ['- none'] : bullets;
  return ['', `### ${title}`, '', ...body];
}
|
||||
|
||||
/**
 * Builds the "Headline Failure Context" section: counts of the remaining
 * false-negative PKs/FKs across tuning-eligible cases, followed by the full
 * lists (or `- none` when a list is empty).
 */
function headlineFailureContextBlocks(report: KtxRelationshipBenchmarkReport): string[] {
  const tuningEligibleCases = report.cases.filter((item) => item.tuningEligible);
  const missesFor = (select: KtxRelationshipBenchmarkFailureSelector): string[] =>
    sortedFailureLines({ cases: tuningEligibleCases, select });
  const orNone = (bullets: string[]): string[] => (bullets.length > 0 ? bullets : ['- none']);

  const pkMisses = missesFor((item) => item.falseNegatives.pk);
  const fkMisses = missesFor((item) => item.falseNegatives.fk);

  const intro = [
    '',
    '## Headline Failure Context',
    '',
    'Remaining headline misses after this run are listed here so recall gains and still-open algorithmic gaps are visible in the regenerated evidence report.',
    '',
    `- Remaining headline false-negative PKs: ${pkMisses.length}`,
    `- Remaining headline false-negative FKs: ${fkMisses.length}`,
  ];
  const pkSection = ['', '### Remaining Headline False Negative PKs', '', ...orNone(pkMisses)];
  const fkSection = ['', '### Remaining Headline False Negative FKs', '', ...orNone(fkMisses)];

  return [...intro, ...pkSection, ...fkSection];
}
|
||||
|
||||
/**
 * Builds the "Failure Details" section: false positive/negative PK and FK
 * lists, first for tuning-eligible ("Headline") cases and then for all
 * remaining ("Other") cases.
 */
function failureDetailBlocks(report: KtxRelationshipBenchmarkReport): string[] {
  const headlineCases: KtxRelationshipBenchmarkReportCase[] = [];
  const otherCases: KtxRelationshipBenchmarkReportCase[] = [];
  for (const item of report.cases) {
    (item.tuningEligible ? headlineCases : otherCases).push(item);
  }

  const falsePositivePks: KtxRelationshipBenchmarkFailureSelector = (item) => item.falsePositives.pk;
  const falsePositiveFks: KtxRelationshipBenchmarkFailureSelector = (item) => item.falsePositives.fk;
  const falseNegativePks: KtxRelationshipBenchmarkFailureSelector = (item) => item.falseNegatives.pk;
  const falseNegativeFks: KtxRelationshipBenchmarkFailureSelector = (item) => item.falseNegatives.fk;

  // Section order is part of the rendered report; keep it stable.
  const sections = [
    { title: 'Headline False Positive PKs', cases: headlineCases, select: falsePositivePks },
    { title: 'Headline False Positive FKs', cases: headlineCases, select: falsePositiveFks },
    { title: 'Headline False Negative PKs', cases: headlineCases, select: falseNegativePks },
    { title: 'Headline False Negative FKs', cases: headlineCases, select: falseNegativeFks },
    { title: 'Other False Positive PKs', cases: otherCases, select: falsePositivePks },
    { title: 'Other False Positive FKs', cases: otherCases, select: falsePositiveFks },
    { title: 'Other False Negative PKs', cases: otherCases, select: falseNegativePks },
    { title: 'Other False Negative FKs', cases: otherCases, select: falseNegativeFks },
  ];

  const lines = ['', '## Failure Details'];
  for (const section of sections) {
    lines.push(...failureBlock(section));
  }
  return lines;
}
|
||||
|
||||
/**
 * Builds the "Composite Ground Truth Skips" section from the
 * `skippedComposite` PK/FK lists of tuning-eligible cases.
 */
function compositeSkipBlocks(report: KtxRelationshipBenchmarkReport): string[] {
  const cases = report.cases.filter((item) => item.tuningEligible);

  const skippedPks = failureBlock({
    title: 'Skipped Composite PKs',
    cases,
    select: (item) => item.skippedComposite.pk,
  });
  const skippedFks = failureBlock({
    title: 'Skipped Composite FKs',
    cases,
    select: (item) => item.skippedComposite.fk,
  });

  return ['', '## Composite Ground Truth Skips', ...skippedPks, ...skippedFks];
}
|
||||
|
||||
/**
 * Renders a benchmark report as a markdown document.
 *
 * Layout: title and generation timestamp, headline metrics, a per-case table,
 * then the headline failure context, failure detail and composite skip
 * sections. The result ends with a blank line followed by a newline.
 *
 * @param report Report to render.
 * @returns The complete markdown text.
 */
export function formatKtxRelationshipBenchmarkReportMarkdown(report: KtxRelationshipBenchmarkReport): string {
  const { headline } = report;

  const preamble = [
    '# KTX Relationship Discovery Benchmark Evidence',
    '',
    `Generated: ${report.generatedAt}`,
    '',
    '## Headline',
    '',
    `- Cases run: ${headline.caseCount}`,
    `- Headline cases: ${headline.headlineCaseCount}`,
    `- Headline PK recall: ${fixed(headline.headlinePkRecall)}`,
    `- Headline FK recall: ${fixed(headline.headlineFkRecall)}`,
    `- Headline accepted-or-review recall: ${fixed(headline.headlineAcceptedOrReviewRecall)}`,
    `- Accepted false positives: ${headline.acceptedFalsePositiveCount}`,
    `- Validation-blocked cases: ${headline.validationBlockedCount}`,
    '',
    '## Cases',
    '',
    '| Fixture | Tier | Mode | Status | Tuning Eligible | PK Recall | FK Recall | Accepted+Review Recall | Accepted FP | Reason |',
    '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | --- |',
  ];

  const caseRows = report.cases.map((item) => {
    // The first and last cells carry the row's opening and closing pipes.
    const cells = [
      `| ${item.fixtureId}`,
      item.tier,
      item.mode,
      item.status,
      item.tuningEligible ? 'yes' : 'no',
      fixed(item.metrics.pkRecall),
      fixed(item.metrics.fkRecall),
      fixed(item.metrics.acceptedOrReviewRecall),
      String(item.metrics.acceptedFalsePositiveCount ?? '-'),
      `${item.reason ?? ''} |`,
    ];
    return cells.join(' | ');
  });

  const document = [
    ...preamble,
    ...caseRows,
    ...headlineFailureContextBlocks(report),
    ...failureDetailBlocks(report),
    ...compositeSkipBlocks(report),
    '',
  ];

  return `${document.join('\n')}\n`;
}
|
||||
1275
packages/cli/src/context/scan/relationship-benchmarks.test.ts
Normal file
1275
packages/cli/src/context/scan/relationship-benchmarks.test.ts
Normal file
File diff suppressed because it is too large
Load diff
902
packages/cli/src/context/scan/relationship-benchmarks.ts
Normal file
902
packages/cli/src/context/scan/relationship-benchmarks.ts
Normal file
|
|
@ -0,0 +1,902 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { mkdtemp, readdir, readFile, stat, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { gunzipSync } from 'node:zlib';
|
||||
import Database from 'better-sqlite3';
|
||||
import YAML from 'yaml';
|
||||
import { z } from 'zod';
|
||||
import type { KtxLlmRuntimePort } from '../llm/runtime-port.js';
|
||||
import type { KtxEnrichedRelationship, KtxEnrichedSchema, KtxRelationshipType } from './enrichment-types.js';
|
||||
import { snapshotToKtxEnrichedSchema } from './local-enrichment.js';
|
||||
import type { KtxRelationshipDiscoveryCandidate } from './relationship-candidates.js';
|
||||
import {
|
||||
generateKtxRelationshipDiscoveryCandidates,
|
||||
mergeKtxRelationshipDiscoveryCandidates,
|
||||
} from './relationship-candidates.js';
|
||||
import { proposeKtxRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
|
||||
import {
|
||||
discoverKtxCompositeRelationships,
|
||||
type KtxCompositePrimaryKeyCandidate,
|
||||
type KtxCompositeRelationshipCandidate,
|
||||
} from './relationship-composite-candidates.js';
|
||||
import { emptyKtxRelationshipProfileArtifact } from './relationship-diagnostics.js';
|
||||
import { collectKtxFormalMetadataRelationships } from './relationship-formal-metadata.js';
|
||||
import { resolveKtxRelationshipGraph } from './relationship-graph-resolver.js';
|
||||
import { type KtxRelationshipReadOnlyExecutor, profileKtxRelationshipSchema } from './relationship-profiling.js';
|
||||
import type { KtxRelationshipValidationBudget } from './relationship-budget.js';
|
||||
import type { KtxRelationshipFixtureOrigin } from './relationship-scoring.js';
|
||||
import { validateKtxRelationshipDiscoveryCandidates } from './relationship-validation.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext, KtxSchemaSnapshot } from './types.js';
|
||||
|
||||
/** Every benchmark mode a fixture can be run in, in canonical order. */
export const KTX_RELATIONSHIP_BENCHMARK_MODES = [
  'metadata_present',
  'declared_fks_removed',
  'declared_pks_removed',
  'declared_pks_and_declared_fks_removed',
  'llm_disabled',
  'profiling_disabled',
  'validation_disabled',
  'embeddings_disabled',
] as const;

/** Union of the mode names listed in {@link KTX_RELATIONSHIP_BENCHMARK_MODES}. */
export type KtxRelationshipBenchmarkMode = (typeof KTX_RELATIONSHIP_BENCHMARK_MODES)[number];

/** Every tier a benchmark fixture can declare. */
export const KTX_RELATIONSHIP_BENCHMARK_TIERS = ['unit', 'row_bearing', 'schema_only', 'smoke', 'product'] as const;

/** Union of the tier names listed in {@link KTX_RELATIONSHIP_BENCHMARK_TIERS}. */
export type KtxRelationshipBenchmarkTier = (typeof KTX_RELATIONSHIP_BENCHMARK_TIERS)[number];

/** Resolution status attached to a detected primary key or relationship. */
export type KtxRelationshipBenchmarkStatus = 'accepted' | 'review' | 'rejected';
|
||||
|
||||
/** Ground-truth primary key: a table and its key columns (more than one column for a composite key). */
export interface KtxRelationshipBenchmarkExpectedPk {
  table: string;
  columns: string[];
}

/** Ground-truth relationship between a source column tuple and a target column tuple. */
export interface KtxRelationshipBenchmarkExpectedLink {
  fromTable: string;
  fromColumns: string[];
  toTable: string;
  toColumns: string[];
  relationship: KtxRelationshipType;
}

/** Complete ground truth for one fixture: expected primary keys and expected links. */
export interface KtxRelationshipBenchmarkExpectedLinks {
  expectedPks: KtxRelationshipBenchmarkExpectedPk[];
  expectedLinks: KtxRelationshipBenchmarkExpectedLink[];
}
|
||||
|
||||
/** A loaded benchmark fixture: identity, schema snapshot, ground truth and optional data. */
export interface KtxRelationshipBenchmarkFixture {
  id: string;
  name: string;
  tier: KtxRelationshipBenchmarkTier;
  origin: KtxRelationshipFixtureOrigin;
  /** When set, decides tuning eligibility for tiers that are not excluded outright. */
  thresholdEligible?: boolean;
  validationBudget?: KtxRelationshipValidationBudget;
  snapshot: KtxSchemaSnapshot;
  expected: KtxRelationshipBenchmarkExpectedLinks;
  defaultModes: KtxRelationshipBenchmarkMode[];
  /** Path to the fixture's SQLite data file, or `null` when the fixture has none. */
  dataPath: string | null;
  /** Numeric vectors keyed by column id — presumably loaded from `column-embeddings.json`; confirm against the loader. */
  columnEmbeddings: Record<string, number[]>;
}
|
||||
|
||||
/** A primary key reported by a detector, with its score and resolution status. */
export interface KtxRelationshipBenchmarkDetectedPk {
  table: string;
  columns: string[];
  score: number;
  status: KtxRelationshipBenchmarkStatus;
}

/** A relationship reported by a detector, with its score, resolution status and originating source. */
export interface KtxRelationshipBenchmarkDetectedLink {
  fromTable: string;
  fromColumns: string[];
  toTable: string;
  toColumns: string[];
  relationship: KtxRelationshipType;
  score: number;
  status: KtxRelationshipBenchmarkStatus;
  source: string;
}
|
||||
|
||||
/** Everything a detector reports for one fixture/mode run. */
export interface KtxRelationshipBenchmarkDetectorResult {
  pks: KtxRelationshipBenchmarkDetectedPk[];
  links: KtxRelationshipBenchmarkDetectedLink[];
  /** True when the run could not (or was configured not to) validate candidates. */
  validationBlocked: boolean;
  sqlQueries: number;
  llmCalls: number;
  runtimeSeconds: number;
}

/** Input handed to a detector for one fixture/mode run. */
export interface KtxRelationshipBenchmarkDetectorInput {
  fixtureId: string;
  mode: KtxRelationshipBenchmarkMode;
  snapshot: KtxSchemaSnapshot;
  schema: KtxEnrichedSchema;
  dataPath: string | null;
  validationBudget?: KtxRelationshipValidationBudget;
}

/** Strategy interface implemented by every benchmarked relationship detector. */
export interface KtxRelationshipBenchmarkDetector {
  detect(input: KtxRelationshipBenchmarkDetectorInput): Promise<KtxRelationshipBenchmarkDetectorResult>;
}
|
||||
|
||||
/** Numeric scores and cost counters for one scored benchmark case. */
export interface KtxRelationshipBenchmarkMetrics {
  pkPrecision: number;
  pkRecall: number;
  pkF1: number;
  fkPrecision: number;
  fkRecall: number;
  fkF1: number;
  /** Number of accepted relationships that are not part of the expected set. */
  acceptedFalsePositiveCount: number;
  reviewRecall: number;
  acceptedOrReviewRecall: number;
  runtimeSeconds: number;
  sqlQueries: number;
  llmCalls: number;
}
|
||||
|
||||
/**
 * Scored outcome of one fixture/mode run. All `pk`/`fk` lists hold string keys
 * for primary keys and relationships respectively.
 */
export interface KtxRelationshipBenchmarkCaseResult {
  fixtureId: string;
  mode: KtxRelationshipBenchmarkMode;
  metrics: KtxRelationshipBenchmarkMetrics;
  /** Ground-truth keys. */
  expected: {
    pk: string[];
    fk: string[];
  };
  /** Keys reported by the detector; `acceptedFk` and `reviewFk` split `fk` by status. */
  predicted: {
    pk: string[];
    fk: string[];
    acceptedFk: string[];
    reviewFk: string[];
  };
  falsePositives: {
    pk: string[];
    fk: string[];
  };
  falseNegatives: {
    pk: string[];
    fk: string[];
  };
  /** Expected multi-column keys that the detector did not report. */
  skippedComposite: {
    pk: string[];
    fk: string[];
  };
  validationBlocked: boolean;
}
|
||||
|
||||
/** Result of a whole benchmark suite: per-case results plus aggregate recall figures. */
export interface KtxRelationshipBenchmarkSuiteResult {
  cases: KtxRelationshipBenchmarkCaseResult[];
  /** Identifiers of validation-blocked cases — NOTE(review): exact format is set by the suite runner; confirm there. */
  validationBlockedCases: string[];
  aggregate: {
    caseCount: number;
    headlineCaseCount: number;
    headlinePkRecall: number;
    headlineFkRecall: number;
    headlineAcceptedOrReviewRecall: number;
    meanPkRecall: number;
    meanFkRecall: number;
    meanAcceptedOrReviewRecall: number;
  };
}
|
||||
|
||||
/**
 * Read-only SQL executor backed by a fixture's SQLite file.
 *
 * Counts every executed statement in `queryCount`. Callers must invoke
 * `close()` to release the database handle.
 */
class KtxRelationshipBenchmarkSqliteExecutor implements KtxRelationshipReadOnlyExecutor {
  private readonly db: Database.Database;
  /** Number of statements executed through this executor so far. */
  queryCount = 0;

  /** Opens `dataPath` read-only; the file must already exist. */
  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Runs `input.sql` and returns the rows in positional form.
   *
   * Headers are taken from the first row's keys, so a query that returns no
   * rows yields an empty header list.
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;

    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const firstRecord = records[0] ?? {};
    const headers = Object.keys(firstRecord);
    const positionalRows = records.map((record) => headers.map((header) => record[header]));

    return {
      headers,
      rows: positionalRows,
      totalRows: records.length,
      rowCount: records.length,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Reads a fixture file as UTF-8 text, falling back to a gzip-compressed
 * sibling (`<fileName>.gz`) when the plain file does not exist.
 *
 * @throws Any read error other than `ENOENT` on the plain file, and any error
 *   from reading or decompressing the `.gz` fallback.
 */
async function fixtureText(fixtureDir: string, fileName: string): Promise<string> {
  const plainPath = join(fixtureDir, fileName);

  let plainText: string | undefined;
  try {
    plainText = await readFile(plainPath, 'utf-8');
  } catch (error) {
    const missing = (error as NodeJS.ErrnoException).code === 'ENOENT';
    if (!missing) {
      throw error;
    }
  }
  if (plainText !== undefined) {
    return plainText;
  }

  const gzipped = await readFile(`${plainPath}.gz`);
  return gunzipSync(gzipped).toString('utf-8');
}
|
||||
|
||||
/**
 * Resolves the SQLite data file of a fixture.
 *
 * Prefers `data.sqlite` in the fixture directory. When that path does not
 * exist but `data.sqlite.gz` does, the archive is decompressed into a fresh
 * temporary directory and the extracted path is returned; this function does
 * not remove that directory.
 *
 * @returns The usable database path, or `null` when the fixture has no data
 *   file (or the candidate path is not a regular file).
 * @throws Any filesystem or decompression error other than `ENOENT`.
 */
async function fixtureDataPath(fixtureDir: string): Promise<string | null> {
  const isMissing = (error: unknown): boolean => (error as NodeJS.ErrnoException).code === 'ENOENT';
  const plainPath = join(fixtureDir, 'data.sqlite');

  try {
    const plainStat = await stat(plainPath);
    if (plainStat.isFile()) {
      return plainPath;
    }
    return null;
  } catch (error) {
    if (!isMissing(error)) {
      throw error;
    }
  }

  const archivePath = `${plainPath}.gz`;
  try {
    const archiveStat = await stat(archivePath);
    if (!archiveStat.isFile()) {
      return null;
    }

    // The digest only makes the temp directory name recognisable per fixture;
    // mkdtemp still appends its own random suffix.
    const fixtureDigest = createHash('sha256').update(fixtureDir).digest('hex').slice(0, 16);
    const extractionDir = await mkdtemp(join(tmpdir(), `ktx-relationship-benchmark-${fixtureDigest}-`));
    const extractedPath = join(extractionDir, 'data.sqlite');
    const archiveBytes = await readFile(archivePath);
    await writeFile(extractedPath, gunzipSync(archiveBytes));
    return extractedPath;
  } catch (error) {
    if (isMissing(error)) {
      return null;
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Loads `column-embeddings.json` from a fixture directory.
 *
 * Only entries whose value is an array made up entirely of numbers are kept.
 * Any failure (missing file, unreadable file, invalid JSON, non-object JSON)
 * results in an empty record rather than an error.
 */
async function fixtureColumnEmbeddings(fixtureDir: string): Promise<Record<string, number[]>> {
  try {
    const text = await readFile(join(fixtureDir, 'column-embeddings.json'), 'utf-8');
    const decoded = JSON.parse(text) as Record<string, unknown>;

    const kept: [string, number[]][] = [];
    for (const [columnId, candidate] of Object.entries(decoded)) {
      const isNumberVector = Array.isArray(candidate) && candidate.every((item) => typeof item === 'number');
      if (isNumberVector) {
        kept.push([columnId, candidate as number[]]);
      }
    }
    return Object.fromEntries(kept);
  } catch {
    return {};
  }
}
|
||||
|
||||
// Validation schemas for fixture config and ground-truth files.

/** Accepts any name listed in KTX_RELATIONSHIP_BENCHMARK_MODES. */
const modeSchema = z.enum(KTX_RELATIONSHIP_BENCHMARK_MODES);
/** Accepts any name listed in KTX_RELATIONSHIP_BENCHMARK_TIERS. */
const tierSchema = z.enum(KTX_RELATIONSHIP_BENCHMARK_TIERS);
/** Accepts the known fixture origins. */
const originSchema = z.enum(['synthetic', 'public', 'customer']);
/** Accepts either the literal `'all'` or a non-negative integer. */
const validationBudgetSchema = z.union([z.literal('all'), z.number().int().nonnegative()]);

/** Shape of a fixture's config; `tier` falls back to `'unit'` when omitted. */
const fixtureConfigSchema = z.object({
  id: z.string().min(1),
  name: z.string().min(1),
  tier: tierSchema.default('unit'),
  origin: originSchema,
  thresholdEligible: z.boolean().optional(),
  validationBudget: validationBudgetSchema.optional(),
  defaultModes: z.array(modeSchema).min(1),
});

/** Shape of a fixture's ground truth; every table/column name and column list must be non-empty. */
const expectedLinksSchema = z.object({
  expectedPks: z.array(
    z.object({
      table: z.string().min(1),
      columns: z.array(z.string().min(1)).min(1),
    }),
  ),
  expectedLinks: z.array(
    z.object({
      fromTable: z.string().min(1),
      fromColumns: z.array(z.string().min(1)).min(1),
      toTable: z.string().min(1),
      toColumns: z.array(z.string().min(1)).min(1),
      relationship: z.enum(['many_to_one', 'one_to_many', 'one_to_one']),
    }),
  ),
});
|
||||
|
||||
/** Returns the distinct strings of `values`, ordered with `localeCompare`. */
function sortedUnique(values: Iterable<string>): string[] {
  const distinct = [...new Set(values)];
  distinct.sort((a, b) => a.localeCompare(b));
  return distinct;
}
|
||||
|
||||
/** Formats a column list as `(a,b,...)`, preserving the given order. */
function tupleKey(columns: readonly string[]): string {
  const joined = columns.join(',');
  return '(' + joined + ')';
}
|
||||
|
||||
/** Builds the string key of a primary key: `table.(col1,col2,...)`. */
function pkKey(pk: Pick<KtxRelationshipBenchmarkExpectedPk, 'table' | 'columns'>): string {
  const { table, columns } = pk;
  return [table, tupleKey(columns)].join('.');
}
|
||||
|
||||
/** Builds the string key of a relationship: `from.(cols)->to.(cols)`. */
function fkKey(
  link: Pick<KtxRelationshipBenchmarkExpectedLink, 'fromTable' | 'fromColumns' | 'toTable' | 'toColumns'>,
): string {
  const source = `${link.fromTable}.${tupleKey(link.fromColumns)}`;
  const target = `${link.toTable}.${tupleKey(link.toColumns)}`;
  return `${source}->${target}`;
}
|
||||
|
||||
/** Key of a detected link; identical to the key of an expected link with the same endpoints. */
function relationshipKey(link: KtxRelationshipBenchmarkDetectedLink): string {
  const key = fkKey(link);
  return key;
}
|
||||
|
||||
/** Converts an enriched relationship into a benchmark link that is always marked `accepted`. */
function relationshipToBenchmarkLink(candidate: KtxEnrichedRelationship): KtxRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, source } = candidate;
  const link: KtxRelationshipBenchmarkDetectedLink = {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status: 'accepted',
    source,
  };
  return link;
}
|
||||
|
||||
/** Converts a discovery candidate into a benchmark link that is always marked `review`. */
function broadCandidateToBenchmarkLink(
  candidate: Pick<KtxRelationshipDiscoveryCandidate, 'confidence' | 'from' | 'relationshipType' | 'source' | 'to'>,
): KtxRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, source } = candidate;
  const link: KtxRelationshipBenchmarkDetectedLink = {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status: 'review',
    source,
  };
  return link;
}
|
||||
|
||||
/** Converts a composite primary-key candidate into a benchmark PK, keeping its score and status. */
function compositePkToBenchmarkPk(candidate: KtxCompositePrimaryKeyCandidate): KtxRelationshipBenchmarkDetectedPk {
  const { table, columns, score, status } = candidate;
  return { table: table.name, columns, score, status };
}
|
||||
|
||||
/** Converts a composite relationship candidate into a benchmark link, keeping the candidate's own status. */
function compositeRelationshipToBenchmarkLink(
  candidate: KtxCompositeRelationshipCandidate,
): KtxRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, status, source } = candidate;
  const link: KtxRelationshipBenchmarkDetectedLink = {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status,
    source,
  };
  return link;
}
|
||||
|
||||
/** Divides `numerator` by `denominator`; a zero denominator yields 1 instead of dividing. */
function ratio(numerator: number, denominator: number): number {
  if (denominator === 0) {
    return 1;
  }
  return numerator / denominator;
}
|
||||
|
||||
/** Harmonic mean of precision and recall; 0 when their sum is 0. */
function f1(precision: number, recall: number): number {
  const total = precision + recall;
  if (total === 0) {
    return 0;
  }
  return (2 * precision * recall) / total;
}
|
||||
|
||||
/** Returns the items of `left` that do not occur in `right`, in `left`'s order (duplicates kept). */
function difference(left: readonly string[], right: readonly string[]): string[] {
  const excluded = new Set(right);
  const remaining: string[] = [];
  for (const candidate of left) {
    if (!excluded.has(candidate)) {
      remaining.push(candidate);
    }
  }
  return remaining;
}
|
||||
|
||||
/** Counts the items of `left` that also occur in `right` (duplicates in `left` each count). */
function intersectionSize(left: readonly string[], right: readonly string[]): number {
  const allowed = new Set(right);
  let matches = 0;
  for (const candidate of left) {
    if (allowed.has(candidate)) {
      matches += 1;
    }
  }
  return matches;
}
|
||||
|
||||
/** Sorted, de-duplicated keys of the expected primary keys that span more than one column. */
function compositePkKeys(expected: KtxRelationshipBenchmarkExpectedLinks): string[] {
  const multiColumnPks = expected.expectedPks.filter((pk) => pk.columns.length > 1);
  return sortedUnique(multiColumnPks.map(pkKey));
}
|
||||
|
||||
/** Sorted, de-duplicated keys of the expected links whose source or target spans more than one column. */
function compositeFkKeys(expected: KtxRelationshipBenchmarkExpectedLinks): string[] {
  const multiColumnLinks = expected.expectedLinks.filter(
    (link) => link.fromColumns.length > 1 || link.toColumns.length > 1,
  );
  return sortedUnique(multiColumnLinks.map(fkKey));
}
|
||||
|
||||
/** Sorted, de-duplicated keys of every expected primary key (no filtering by column count). */
function scalarExpectedPkKeys(expected: KtxRelationshipBenchmarkExpectedLinks): string[] {
  const keys = expected.expectedPks.map(pkKey);
  return sortedUnique(keys);
}
|
||||
|
||||
/** Sorted, de-duplicated keys of every expected link (no filtering by column count). */
function scalarExpectedFkKeys(expected: KtxRelationshipBenchmarkExpectedLinks): string[] {
  const keys = expected.expectedLinks.map(fkKey);
  return sortedUnique(keys);
}
|
||||
|
||||
/**
 * Scores one detector run against a fixture's ground truth.
 *
 * PK precision/recall compare every reported PK with the expected PKs.
 * FK precision/recall count only `accepted` links; `reviewRecall` and
 * `acceptedOrReviewRecall` additionally credit `review` links. FK false
 * positives are accepted links that are not expected, while FK false negatives
 * are expected links found in neither the accepted nor the review set.
 *
 * @returns The case result with metrics, key lists and the detector's
 *   `validationBlocked` flag and cost counters passed through.
 */
function scoreBenchmarkCase(input: {
  fixtureId: string;
  mode: KtxRelationshipBenchmarkMode;
  expected: KtxRelationshipBenchmarkExpectedLinks;
  detected: KtxRelationshipBenchmarkDetectorResult;
}): KtxRelationshipBenchmarkCaseResult {
  const { expected, detected } = input;
  const linkKeysWithStatus = (status: 'accepted' | 'review'): string[] =>
    sortedUnique(detected.links.filter((link) => link.status === status).map(relationshipKey));

  // Key sets.
  const expectedPk = scalarExpectedPkKeys(expected);
  const expectedFk = scalarExpectedFkKeys(expected);
  const predictedPk = sortedUnique(detected.pks.map(pkKey));
  const predictedFk = sortedUnique(detected.links.map(relationshipKey));
  const acceptedFk = linkKeysWithStatus('accepted');
  const reviewFk = linkKeysWithStatus('review');
  const acceptedOrReviewFk = sortedUnique([...acceptedFk, ...reviewFk]);

  // True-positive counts.
  const pkHits = intersectionSize(predictedPk, expectedPk);
  const acceptedFkHits = intersectionSize(acceptedFk, expectedFk);
  const acceptedOrReviewFkHits = intersectionSize(acceptedOrReviewFk, expectedFk);
  const reviewFkHits = intersectionSize(reviewFk, expectedFk);

  const pkPrecision = ratio(pkHits, predictedPk.length);
  const pkRecall = ratio(pkHits, expectedPk.length);
  const fkPrecision = ratio(acceptedFkHits, acceptedFk.length);
  const fkRecall = ratio(acceptedFkHits, expectedFk.length);
  const unexpectedAcceptedFk = difference(acceptedFk, expectedFk);

  const metrics: KtxRelationshipBenchmarkMetrics = {
    pkPrecision,
    pkRecall,
    pkF1: f1(pkPrecision, pkRecall),
    fkPrecision,
    fkRecall,
    fkF1: f1(fkPrecision, fkRecall),
    acceptedFalsePositiveCount: unexpectedAcceptedFk.length,
    reviewRecall: ratio(reviewFkHits, expectedFk.length),
    acceptedOrReviewRecall: ratio(acceptedOrReviewFkHits, expectedFk.length),
    runtimeSeconds: detected.runtimeSeconds,
    sqlQueries: detected.sqlQueries,
    llmCalls: detected.llmCalls,
  };

  return {
    fixtureId: input.fixtureId,
    mode: input.mode,
    metrics,
    expected: { pk: expectedPk, fk: expectedFk },
    predicted: { pk: predictedPk, fk: predictedFk, acceptedFk, reviewFk },
    falsePositives: {
      pk: difference(predictedPk, expectedPk),
      fk: unexpectedAcceptedFk,
    },
    falseNegatives: {
      pk: difference(expectedPk, predictedPk),
      fk: difference(expectedFk, acceptedOrReviewFk),
    },
    skippedComposite: {
      pk: difference(compositePkKeys(expected), predictedPk),
      fk: difference(compositeFkKeys(expected), acceptedOrReviewFk),
    },
    validationBlocked: detected.validationBlocked,
  };
}
|
||||
|
||||
/**
 * Returns a copy of `snapshot` with declared keys hidden according to `mode`.
 *
 * - `metadata_present`: nothing is removed.
 * - `declared_pks_removed` / `declared_fks_removed`: only that kind is removed.
 * - every other mode: both primary keys and foreign keys are removed.
 *
 * Removing PKs sets every column's `primaryKey` to `false`; removing FKs
 * empties each table's `foreignKeys`. The input snapshot is not modified:
 * `scope`, `metadata`, tables, columns and kept foreign keys are shallow-copied.
 */
export function maskKtxRelationshipBenchmarkSnapshot(
  snapshot: KtxSchemaSnapshot,
  mode: KtxRelationshipBenchmarkMode,
): KtxSchemaSnapshot {
  let removePks = false;
  let removeFks = false;
  switch (mode) {
    case 'declared_pks_removed':
      removePks = true;
      break;
    case 'declared_fks_removed':
      removeFks = true;
      break;
    case 'declared_pks_and_declared_fks_removed':
    case 'llm_disabled':
    case 'profiling_disabled':
    case 'validation_disabled':
    case 'embeddings_disabled':
      removePks = true;
      removeFks = true;
      break;
    default:
      break;
  }

  const maskedTables = snapshot.tables.map((table) => {
    const columns = table.columns.map((column) => ({
      ...column,
      primaryKey: removePks ? false : column.primaryKey,
    }));
    const foreignKeys = removeFks ? [] : table.foreignKeys.map((foreignKey) => ({ ...foreignKey }));
    return { ...table, columns, foreignKeys };
  });

  return {
    ...snapshot,
    scope: { ...snapshot.scope },
    metadata: { ...snapshot.metadata },
    tables: maskedTables,
  };
}
|
||||
|
||||
/**
 * Decides whether a fixture/mode run counts as tuning eligible.
 *
 * A run is eligible only when it is not validation-blocked, the mode is
 * `declared_pks_and_declared_fks_removed`, and the tier is neither `smoke`
 * nor `schema_only`. Beyond that, an explicit `thresholdEligible` on the
 * fixture decides; without one, only `unit` and `row_bearing` tiers qualify.
 */
export function isKtxRelationshipBenchmarkTuningEligible(input: {
  fixture: Pick<KtxRelationshipBenchmarkFixture, 'tier' | 'thresholdEligible'>;
  mode: KtxRelationshipBenchmarkMode;
  validationBlocked: boolean;
}): boolean {
  const { fixture, mode, validationBlocked } = input;
  const { tier, thresholdEligible } = fixture;

  const eligibleMode = mode === 'declared_pks_and_declared_fks_removed';
  if (!eligibleMode || validationBlocked) {
    return false;
  }

  switch (tier) {
    case 'smoke':
    case 'schema_only':
      return false;
    default:
      break;
  }

  if (thresholdEligible !== undefined) {
    return thresholdEligible;
  }
  return tier === 'unit' || tier === 'row_bearing';
}
|
||||
|
||||
/**
 * Creates a benchmark detector that runs the relationship-discovery pipeline
 * with LLM candidate proposals enabled (except in `llm_disabled` mode).
 *
 * Per run the detector: collects formal-metadata relationships, profiles the
 * schema (skipped in `profiling_disabled` mode), generates broad candidates,
 * merges in LLM proposals, validates the candidates, optionally discovers
 * composite keys, and resolves the relationship graph.
 *
 * A SQLite executor is opened only when the fixture has a data path and the
 * snapshot driver is `sqlite`. It is always closed before `detect` settles,
 * including when any pipeline stage rejects.
 *
 * @param llmRuntime Runtime port forwarded to the LLM proposal step.
 * @returns A detector whose `detect` reports PKs, links, cost counters and
 *   whether validation was blocked.
 */
export function ktxRelationshipBenchmarkDetectorWithLlm(
  llmRuntime: KtxLlmRuntimePort,
): KtxRelationshipBenchmarkDetector {
  return {
    async detect(input) {
      const startedAt = performance.now();
      const formalMetadata = collectKtxFormalMetadataRelationships(input.schema);
      const formalLinks = formalMetadata.accepted.map((relationship) => relationshipToBenchmarkLink(relationship));
      const acceptedKeys = new Set(formalLinks.map(fkKey));
      // Narrowed path: non-null only when row data exists AND the driver is SQLite.
      const sqliteDataPath = input.dataPath && input.snapshot.driver === 'sqlite' ? input.dataPath : null;
      const profilingExecutor =
        sqliteDataPath && input.mode !== 'profiling_disabled'
          ? new KtxRelationshipBenchmarkSqliteExecutor(sqliteDataPath)
          : null;

      try {
        const validationExecutor = profilingExecutor && input.mode !== 'validation_disabled' ? profilingExecutor : null;
        const profiles =
          input.mode === 'profiling_disabled'
            ? emptyKtxRelationshipProfileArtifact({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                reason: 'relationship_benchmark_profiling_disabled',
              })
            : await profileKtxRelationshipSchema({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                schema: input.schema,
                executor: profilingExecutor,
                ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:profile` },
              });
        const broadRelationshipCandidates = generateKtxRelationshipDiscoveryCandidates(input.schema, {
          profiles,
          useEmbeddings: input.mode !== 'embeddings_disabled',
        });
        const llmProposalResult =
          input.mode === 'llm_disabled'
            ? { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' as const }
            : await proposeKtxRelationshipCandidatesWithLlm({
                connectionId: input.snapshot.connectionId,
                schema: input.schema,
                profile: profiles,
                llmRuntime,
              });
        const candidates = mergeKtxRelationshipDiscoveryCandidates([
          ...broadRelationshipCandidates,
          ...llmProposalResult.candidates,
        ]);
        // A numeric budget is shared with profiling: queries already spent
        // there are deducted, never going below zero.
        const validationBudget =
          input.validationBudget === undefined || input.validationBudget === 'all'
            ? 'all'
            : Math.max(0, input.validationBudget - profiles.queryCount);
        const validatedBroadCandidates = await validateKtxRelationshipDiscoveryCandidates({
          connectionId: input.snapshot.connectionId,
          driver: input.snapshot.driver,
          candidates,
          profiles,
          executor: validationExecutor,
          ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:validate` },
          tableCount: input.schema.tables.length,
          settings: {
            validationBudget,
          },
        });
        // Composite discovery only runs with an unlimited budget and a live executor.
        const compositeDetection =
          validationBudget === 'all' &&
          validationExecutor &&
          input.mode !== 'profiling_disabled' &&
          input.mode !== 'validation_disabled'
            ? await discoverKtxCompositeRelationships({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                schema: input.schema,
                profiles,
                executor: validationExecutor,
                ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:composite` },
              })
            : { primaryKeys: [], relationships: [], queryCount: 0, warnings: [] };
        const graph = resolveKtxRelationshipGraph({
          schema: input.schema,
          profiles,
          candidates: validatedBroadCandidates,
        });
        // Links already reported from formal metadata are not reported twice.
        const acceptedBroadCandidates = graph.relationships
          .filter((candidate) => candidate.status === 'accepted')
          .map((candidate) => ({
            ...broadCandidateToBenchmarkLink(candidate),
            score: candidate.fkScore,
            status: 'accepted' as const,
          }))
          .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
        const reviewCandidates = graph.relationships
          .filter((candidate) => candidate.status === 'review')
          .map((candidate) => ({
            ...broadCandidateToBenchmarkLink(candidate),
            score: candidate.fkScore,
            status: 'review' as const,
          }))
          .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
        const resolvedPks = graph.pks
          .filter((pk) => pk.status !== 'rejected')
          .map((pk) => ({
            table: pk.table,
            columns: pk.columns,
            score: pk.pkScore,
            status: pk.status,
          }));
        const compositePks = compositeDetection.primaryKeys.map(compositePkToBenchmarkPk);
        // On a key collision the later entry (a composite PK) replaces the resolved one.
        const allPksByKey = new Map([...resolvedPks, ...compositePks].map((candidate) => [pkKey(candidate), candidate]));
        const pks = sortedUnique(allPksByKey.keys()).flatMap((key) => {
          const candidate = allPksByKey.get(key);
          return candidate ? [candidate] : [];
        });

        return {
          pks,
          links: [
            ...formalLinks,
            ...acceptedBroadCandidates,
            ...reviewCandidates,
            ...compositeDetection.relationships
              .map(compositeRelationshipToBenchmarkLink)
              .filter((candidate) => !acceptedKeys.has(fkKey(candidate))),
          ],
          validationBlocked:
            input.mode === 'validation_disabled' ||
            input.mode === 'profiling_disabled' ||
            (input.dataPath !== null && broadRelationshipCandidates.length > 0 && !profiles.sqlAvailable),
          sqlQueries: profilingExecutor?.queryCount ?? profiles.queryCount,
          llmCalls: llmProposalResult.llmCalls,
          runtimeSeconds: Number(((performance.now() - startedAt) / 1000).toFixed(6)),
        };
      } finally {
        // Release the SQLite handle on success and on failure alike.
        profilingExecutor?.close();
      }
    },
  };
}
|
||||
|
||||
/**
 * Creates the benchmark detector that runs the current relationship-discovery pipeline.
 *
 * `detect` collects formal-metadata relationships, profiles the schema, generates and validates
 * broad candidates, optionally runs composite discovery, resolves the relationship graph, and
 * returns the merged primary keys and links together with run statistics. It always reports
 * `llmCalls: 0`.
 *
 * A SQL executor is only created when the fixture has a data path, the snapshot driver is
 * `sqlite`, and the mode is not `profiling_disabled`. The executor is closed as soon as the
 * SQL-backed phases finish, including when one of them throws.
 */
export function currentKtxRelationshipBenchmarkDetector(): KtxRelationshipBenchmarkDetector {
  return {
    async detect(input) {
      const startedAt = performance.now();
      const formalMetadata = collectKtxFormalMetadataRelationships(input.schema);
      const formalLinks = formalMetadata.accepted.map((relationship) => relationshipToBenchmarkLink(relationship));
      // Keys of formal links; discovered links with the same key are filtered out below.
      const acceptedKeys = new Set(formalLinks.map(fkKey));
      const sqliteDataAvailable = Boolean(input.dataPath && input.snapshot.driver === 'sqlite');
      const profilingExecutor =
        sqliteDataAvailable && input.mode !== 'profiling_disabled'
          ? new KtxRelationshipBenchmarkSqliteExecutor(input.dataPath as string)
          : null;
      // Validation reuses the profiling executor unless validation is disabled for this mode.
      const validationExecutor = profilingExecutor && input.mode !== 'validation_disabled' ? profilingExecutor : null;

      // Every step that may touch the executor runs inside one try/finally so the executor is
      // released even when profiling, validation, or composite discovery rejects.
      const runSqlBackedPhases = async () => {
        try {
          const profiles =
            input.mode === 'profiling_disabled'
              ? emptyKtxRelationshipProfileArtifact({
                  connectionId: input.snapshot.connectionId,
                  driver: input.snapshot.driver,
                  reason: 'relationship_benchmark_profiling_disabled',
                })
              : await profileKtxRelationshipSchema({
                  connectionId: input.snapshot.connectionId,
                  driver: input.snapshot.driver,
                  schema: input.schema,
                  executor: profilingExecutor,
                  ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:profile` },
                });
          const broadRelationshipCandidates = generateKtxRelationshipDiscoveryCandidates(input.schema, {
            profiles,
            useEmbeddings: input.mode !== 'embeddings_disabled',
          });
          // An omitted budget means "validate everything"; a numeric budget is reduced by the
          // queries profiling already spent, never dropping below zero.
          const validationBudget =
            input.validationBudget === undefined || input.validationBudget === 'all'
              ? 'all'
              : Math.max(0, input.validationBudget - profiles.queryCount);
          const validatedBroadCandidates = await validateKtxRelationshipDiscoveryCandidates({
            connectionId: input.snapshot.connectionId,
            driver: input.snapshot.driver,
            candidates: broadRelationshipCandidates,
            profiles,
            executor: validationExecutor,
            ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:validate` },
            tableCount: input.schema.tables.length,
            settings: {
              validationBudget,
            },
          });
          // Composite discovery only runs with an unlimited budget and a usable validation executor.
          const compositeDetection =
            validationBudget === 'all' &&
            validationExecutor &&
            input.mode !== 'profiling_disabled' &&
            input.mode !== 'validation_disabled'
              ? await discoverKtxCompositeRelationships({
                  connectionId: input.snapshot.connectionId,
                  driver: input.snapshot.driver,
                  schema: input.schema,
                  profiles,
                  executor: validationExecutor,
                  ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:composite` },
                })
              : { primaryKeys: [], relationships: [], queryCount: 0, warnings: [] };
          return { profiles, broadRelationshipCandidates, validatedBroadCandidates, compositeDetection };
        } finally {
          profilingExecutor?.close();
        }
      };
      const { profiles, broadRelationshipCandidates, validatedBroadCandidates, compositeDetection } =
        await runSqlBackedPhases();

      const graph = resolveKtxRelationshipGraph({
        schema: input.schema,
        profiles,
        candidates: validatedBroadCandidates,
      });
      const acceptedBroadCandidates = graph.relationships
        .filter((candidate) => candidate.status === 'accepted')
        .map((candidate) => ({
          ...broadCandidateToBenchmarkLink(candidate),
          score: candidate.fkScore,
          status: 'accepted' as const,
        }))
        .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
      const reviewCandidates = graph.relationships
        .filter((candidate) => candidate.status === 'review')
        .map((candidate) => ({
          ...broadCandidateToBenchmarkLink(candidate),
          score: candidate.fkScore,
          status: 'review' as const,
        }))
        .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
      const resolvedPks = graph.pks
        .filter((pk) => pk.status !== 'rejected')
        .map((pk) => ({
          table: pk.table,
          columns: pk.columns,
          score: pk.pkScore,
          status: pk.status,
        }));
      const compositePks = compositeDetection.primaryKeys.map(compositePkToBenchmarkPk);
      // Later entries replace earlier ones in the Map, so a composite PK overrides a resolved PK
      // that has the same key.
      const allPksByKey = new Map([...resolvedPks, ...compositePks].map((candidate) => [pkKey(candidate), candidate]));
      const pks = sortedUnique(allPksByKey.keys()).flatMap((key) => {
        const candidate = allPksByKey.get(key);
        return candidate ? [candidate] : [];
      });

      return {
        pks,
        links: [
          ...formalLinks,
          ...acceptedBroadCandidates,
          ...reviewCandidates,
          ...compositeDetection.relationships
            .map(compositeRelationshipToBenchmarkLink)
            .filter((candidate) => !acceptedKeys.has(fkKey(candidate))),
        ],
        validationBlocked:
          input.mode === 'validation_disabled' ||
          input.mode === 'profiling_disabled' ||
          (input.dataPath !== null && broadRelationshipCandidates.length > 0 && !profiles.sqlAvailable),
        sqlQueries: profilingExecutor?.queryCount ?? profiles.queryCount,
        llmCalls: 0,
        runtimeSeconds: Number(((performance.now() - startedAt) / 1000).toFixed(6)),
      };
    },
  };
}
|
||||
|
||||
/**
 * Loads a single benchmark fixture from `fixtureDir`.
 *
 * `fixture.yaml`, `snapshot.json`, and `expected-links.yaml` are read concurrently. The two YAML
 * documents are parsed with `fixtureConfigSchema` and `expectedLinksSchema`; the snapshot JSON is
 * cast to `KtxSchemaSnapshot` without further validation. The fixture's data path and column
 * embeddings are then resolved one after the other and merged into the result.
 */
export async function loadKtxRelationshipBenchmarkFixture(
  fixtureDir: string,
): Promise<KtxRelationshipBenchmarkFixture> {
  const read = (fileName: string) => fixtureText(fixtureDir, fileName);
  const [fixtureRaw, snapshotRaw, expectedRaw] = await Promise.all([
    read('fixture.yaml'),
    read('snapshot.json'),
    read('expected-links.yaml'),
  ]);

  const fixture = fixtureConfigSchema.parse(YAML.parse(fixtureRaw));
  const expected = expectedLinksSchema.parse(YAML.parse(expectedRaw));
  const snapshot = JSON.parse(snapshotRaw) as KtxSchemaSnapshot;

  const dataPath = await fixtureDataPath(fixtureDir);
  const columnEmbeddings = await fixtureColumnEmbeddings(fixtureDir);

  return { ...fixture, snapshot, expected, dataPath, columnEmbeddings };
}
|
||||
|
||||
/**
 * Loads every fixture found directly under `fixtureRoot`.
 *
 * Each sub-directory is treated as one fixture; non-directory entries are ignored. Directories
 * are ordered with `localeCompare` on their joined paths, and the fixtures are loaded
 * concurrently while keeping that order in the returned array.
 */
export async function loadKtxRelationshipBenchmarkFixtures(
  fixtureRoot: string,
): Promise<KtxRelationshipBenchmarkFixture[]> {
  const fixtureDirs: string[] = [];
  for (const entry of await readdir(fixtureRoot, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      fixtureDirs.push(join(fixtureRoot, entry.name));
    }
  }
  fixtureDirs.sort((left, right) => left.localeCompare(right));

  return Promise.all(fixtureDirs.map((fixtureDir) => loadKtxRelationshipBenchmarkFixture(fixtureDir)));
}
|
||||
|
||||
/**
 * Runs one fixture in one benchmark mode and scores the detector output.
 *
 * The fixture snapshot is masked for the mode, converted to an enriched schema (with an empty
 * embedding map when the mode is `embeddings_disabled`), and handed to `input.detector`, or to
 * the current detector when none is supplied. The detection result is scored against the
 * fixture's expected links.
 */
export async function runKtxRelationshipBenchmarkCase(input: {
  fixture: KtxRelationshipBenchmarkFixture;
  mode: KtxRelationshipBenchmarkMode;
  detector?: KtxRelationshipBenchmarkDetector;
}): Promise<KtxRelationshipBenchmarkCaseResult> {
  const { fixture, mode } = input;

  const snapshot = maskKtxRelationshipBenchmarkSnapshot(fixture.snapshot, mode);
  const embeddings =
    mode !== 'embeddings_disabled' ? new Map(Object.entries(fixture.columnEmbeddings)) : new Map<string, number[]>();
  const schema = snapshotToKtxEnrichedSchema(snapshot, embeddings);

  const detector = input.detector ?? currentKtxRelationshipBenchmarkDetector();
  const detected = await detector.detect({
    fixtureId: fixture.id,
    mode,
    snapshot,
    schema,
    dataPath: fixture.dataPath,
    validationBudget: fixture.validationBudget,
  });

  return scoreBenchmarkCase({
    fixtureId: fixture.id,
    mode,
    expected: fixture.expected,
    detected,
  });
}
|
||||
|
||||
/**
 * Runs every fixture in each of its default modes and aggregates the results.
 *
 * Cases run strictly one after another, in fixture order and then mode order. "Headline"
 * aggregates only cover cases that `isKtxRelationshipBenchmarkTuningEligible` accepts; the
 * `mean*` aggregates cover every case. Any average over an empty set is 0.
 */
export async function runKtxRelationshipBenchmarkSuite(input: {
  fixtures: KtxRelationshipBenchmarkFixture[];
  detector?: KtxRelationshipBenchmarkDetector;
}): Promise<KtxRelationshipBenchmarkSuiteResult> {
  const { fixtures, detector } = input;

  const cases: KtxRelationshipBenchmarkCaseResult[] = [];
  for (const fixture of fixtures) {
    for (const mode of fixture.defaultModes) {
      const result = await runKtxRelationshipBenchmarkCase({ fixture, mode, detector });
      cases.push(result);
    }
  }

  const fixtureById = new Map(fixtures.map((fixture) => [fixture.id, fixture]));
  const headlineCases = cases.filter((item) => {
    const fixture = fixtureById.get(item.fixtureId);
    if (!fixture) {
      return false;
    }
    return isKtxRelationshipBenchmarkTuningEligible({
      fixture,
      mode: item.mode,
      validationBlocked: item.validationBlocked,
    });
  });

  // Averages one recall metric over a set of cases.
  const averageOf = (
    items: KtxRelationshipBenchmarkCaseResult[],
    metric: 'pkRecall' | 'fkRecall' | 'acceptedOrReviewRecall',
  ): number => mean(items.map((item) => item.metrics[metric]));

  const validationBlockedCases = cases
    .filter((item) => item.validationBlocked)
    .map((item) => `${item.fixtureId}:${item.mode}`);

  return {
    cases,
    validationBlockedCases,
    aggregate: {
      caseCount: cases.length,
      headlineCaseCount: headlineCases.length,
      headlinePkRecall: averageOf(headlineCases, 'pkRecall'),
      headlineFkRecall: averageOf(headlineCases, 'fkRecall'),
      headlineAcceptedOrReviewRecall: averageOf(headlineCases, 'acceptedOrReviewRecall'),
      meanPkRecall: averageOf(cases, 'pkRecall'),
      meanFkRecall: averageOf(cases, 'fkRecall'),
      meanAcceptedOrReviewRecall: averageOf(cases, 'acceptedOrReviewRecall'),
    },
  };
}
|
||||
|
||||
/** Arithmetic mean of `values`; an empty list yields 0 instead of NaN. */
function mean(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / count;
}
|
||||
86
packages/cli/src/context/scan/relationship-budget.test.ts
Normal file
86
packages/cli/src/context/scan/relationship-budget.test.ts
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { applyKtxRelationshipValidationBudget, defaultKtxRelationshipValidationBudget } from './relationship-budget.js';
|
||||
|
||||
/** Minimal candidate shape used to exercise the budget helpers. */
interface Candidate {
  id: string;
  confidence: number;
}

describe('relationship validation budget', () => {
  // Shared scoring callback: candidates are ranked by their confidence.
  const byConfidence = (candidate: Candidate): number => candidate.confidence;
  // Projects budgeted entries down to candidate ids for compact assertions.
  const idsOf = (entries: ReadonlyArray<{ candidate: Candidate }>): string[] =>
    entries.map((entry) => entry.candidate.id);

  it('computes the default validation budget from table count', () => {
    const expectations: Array<[number, number]> = [
      [0, 0],
      [3, 6],
      [400, 800],
      [900, 1000],
      [-4, 0],
      [3.8, 6],
    ];

    for (const [tableCount, expectedBudget] of expectations) {
      expect(defaultKtxRelationshipValidationBudget(tableCount)).toBe(expectedBudget);
    }
  });

  it('splits candidates by descending score with stable tie ordering', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 0.8 },
      { id: 'second', confidence: 0.9 },
      { id: 'third', confidence: 0.9 },
      { id: 'fourth', confidence: 0.2 },
    ];

    const result = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 100,
      budget: 2,
      score: byConfidence,
    });

    expect(result.effectiveBudget).toBe(2);
    expect(idsOf(result.toValidate)).toEqual(['second', 'third']);
    expect(idsOf(result.deferred)).toEqual(['first', 'fourth']);
    expect(result.toValidate.map((entry) => entry.originalIndex)).toEqual([1, 2]);
  });

  it('uses the default budget when the budget is omitted', () => {
    const candidates: Candidate[] = [];
    for (let index = 0; index < 8; index += 1) {
      candidates.push({ id: `candidate-${index}`, confidence: 1 - index / 10 });
    }

    const result = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 2,
      score: byConfidence,
    });

    expect(result.effectiveBudget).toBe(4);
    expect(result.toValidate).toHaveLength(4);
    expect(result.deferred).toHaveLength(4);
  });

  it('treats budget zero as disabling SQL validation', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 1 },
      { id: 'second', confidence: 0.5 },
    ];

    const result = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 10,
      budget: 0,
      score: byConfidence,
    });

    expect(result.effectiveBudget).toBe(0);
    expect(result.toValidate).toEqual([]);
    expect(idsOf(result.deferred)).toEqual(['first', 'second']);
  });

  it('treats budget all as validating every candidate', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 0.1 },
      { id: 'second', confidence: 0.9 },
    ];

    const result = applyKtxRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 1,
      budget: 'all',
      score: byConfidence,
    });

    expect(result.effectiveBudget).toBe('all');
    expect(idsOf(result.toValidate)).toEqual(['first', 'second']);
    expect(result.deferred).toEqual([]);
  });
});
|
||||
61
packages/cli/src/context/scan/relationship-budget.ts
Normal file
61
packages/cli/src/context/scan/relationship-budget.ts
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
/**
 * Validation budget setting: a maximum number of candidates, `'all'` for no limit, or
 * `undefined` to fall back to the table-count based default.
 */
export type KtxRelationshipValidationBudget = number | 'all' | undefined;

/** A candidate together with its position in the input list and its computed score. */
interface KtxRelationshipBudgetedCandidate<TCandidate> {
  candidate: TCandidate;
  originalIndex: number;
  score: number;
}

/** Outcome of applying a validation budget to a candidate list. */
export interface KtxRelationshipValidationBudgetResult<TCandidate> {
  /** Budget actually applied: a non-negative integer (or Infinity), or `'all'`. */
  effectiveBudget: number | 'all';
  /** Candidates selected for validation. */
  toValidate: KtxRelationshipBudgetedCandidate<TCandidate>[];
  /** Candidates left out because the budget was exhausted. */
  deferred: KtxRelationshipBudgetedCandidate<TCandidate>[];
}

/** Input for {@link applyKtxRelationshipValidationBudget}. */
export interface ApplyKtxRelationshipValidationBudgetInput<TCandidate> {
  candidates: readonly TCandidate[];
  /** Only used to derive the default budget when `budget` is omitted. */
  tableCount: number;
  budget?: KtxRelationshipValidationBudget;
  /** Higher scores are validated first. */
  score: (candidate: TCandidate) => number;
}

/**
 * Default validation budget: two candidates per table, capped at 1000.
 *
 * Fractional table counts are floored; negative or non-finite counts yield 0.
 *
 * @internal
 */
export function defaultKtxRelationshipValidationBudget(tableCount: number): number {
  const safeTableCount = Number.isFinite(tableCount) ? Math.max(0, Math.floor(tableCount)) : 0;
  return Math.min(2 * safeTableCount, 1000);
}

/**
 * Splits candidates into those to validate now and those to defer.
 *
 * - `budget: 'all'` selects every candidate in its original order.
 * - A numeric budget (or the table-count default when omitted) selects the highest-scoring
 *   candidates; ties keep their original relative order. The budget is floored and clamped
 *   to be non-negative, and a `NaN` budget is treated as 0.
 *
 * `input.score` is invoked exactly once per candidate.
 */
export function applyKtxRelationshipValidationBudget<TCandidate>(
  input: ApplyKtxRelationshipValidationBudgetInput<TCandidate>,
): KtxRelationshipValidationBudgetResult<TCandidate> {
  const scored: KtxRelationshipBudgetedCandidate<TCandidate>[] = input.candidates.map((candidate, originalIndex) => ({
    candidate,
    originalIndex,
    score: input.score(candidate),
  }));

  // Unlimited budget: nothing is deferred, so ranking (and re-scoring) would be wasted work.
  if (input.budget === 'all') {
    return {
      effectiveBudget: 'all',
      toValidate: scored,
      deferred: [],
    };
  }

  const requestedBudget = input.budget ?? defaultKtxRelationshipValidationBudget(input.tableCount);
  // `slice(0, NaN)` selects nothing, so report NaN as the 0 it effectively is rather than
  // leaking NaN to callers.
  const safeBudget = Number.isNaN(requestedBudget) ? 0 : Math.max(0, Math.floor(requestedBudget));

  // `scored` is a fresh array, so sorting it in place never touches the caller's input.
  const ranked = scored.sort((left, right) => {
    const scoreDelta = right.score - left.score;
    return scoreDelta === 0 ? left.originalIndex - right.originalIndex : scoreDelta;
  });

  return {
    effectiveBudget: safeBudget,
    toValidate: ranked.slice(0, safeBudget),
    deferred: ranked.slice(safeBudget),
  };
}
|
||||
881
packages/cli/src/context/scan/relationship-candidates.test.ts
Normal file
881
packages/cli/src/context/scan/relationship-candidates.test.ts
Normal file
|
|
@ -0,0 +1,881 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import { normalizeKtxRelationshipName } from './relationship-name-similarity.js';
|
||||
import {
|
||||
generateKtxRelationshipDiscoveryCandidates,
|
||||
inferKtxRelationshipTargetPks,
|
||||
mergeKtxRelationshipDiscoveryCandidates,
|
||||
} from './relationship-candidates.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
|
||||
/**
 * Builds an enriched-column fixture.
 *
 * Any field missing (or nullish) in `options` falls back to a default: a nullable, non-key
 * `INTEGER` number column in `public.<tableId>` with no descriptions, embedding, samples, or
 * cardinality. `id`, `tableId`, and `name` always come from the positional arguments.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KtxEnrichedColumn> = {},
): KtxEnrichedColumn {
  const {
    tableRef,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;

  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
|
||||
|
||||
/**
 * Builds an enabled enriched-table fixture in the `public` database.
 *
 * Every column is copied and re-pointed at this table: its `tableId` and `tableRef` are
 * overwritten with the table's id and ref, whatever the column was created with.
 */
function table(id: string, name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: id, tableRef: ref });
  }
  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/** Wraps `tables` in an enriched schema for the `warehouse` connection with no declared relationships. */
function schema(tables: KtxEnrichedTable[]): KtxEnrichedSchema {
  const enriched: KtxEnrichedSchema = {
    connectionId: 'warehouse',
    tables,
    relationships: [],
  };
  return enriched;
}
|
||||
|
||||
/**
 * Profile artifact fixture for the plan-code scenario.
 *
 * Describes three four-row tables in `public` and eight fully unique, never-null TEXT columns.
 * Five of the columns share the same plan-code sample values; the `stg_plans` columns
 * `created_at`, `email`, and `is_deleted` carry different samples. Every call returns freshly
 * built objects and arrays.
 */
function planCodeProfiles(): KtxRelationshipProfileArtifact {
  const tableRef = (name: string) => ({ catalog: null, db: 'public', name });
  // Fresh array per call so entries never share a mutable sample list.
  const planCodes = (): string[] => ['basic', 'enterprise', 'free', 'pro'];

  // All profiled columns are unique, non-null TEXT columns over four rows; only the names,
  // samples, and text-length bounds differ.
  const textProfile = (
    tableName: string,
    columnName: string,
    sampleValues: string[],
    minTextLength: number,
    maxTextLength: number,
  ): KtxRelationshipProfileArtifact['columns'][string] => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'TEXT',
    normalizedType: 'text',
    rowCount: 4,
    nullCount: 0,
    distinctCount: 4,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues,
    minTextLength,
    maxTextLength,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [
      { table: tableRef('stg_plans'), rowCount: 4 },
      { table: tableRef('mart_account_segments'), rowCount: 4 },
      { table: tableRef('stg_plan_segment_mapping'), rowCount: 4 },
    ],
    warnings: [],
    columns: {
      'stg_plans.plan_code': textProfile('stg_plans', 'plan_code', planCodes(), 4, 10),
      'stg_plans.created_at': textProfile(
        'stg_plans',
        'created_at',
        ['2026-05-01', '2026-05-02', '2026-05-03', '2026-05-04'],
        10,
        10,
      ),
      'stg_plans.email': textProfile(
        'stg_plans',
        'email',
        ['a@example.test', 'b@example.test', 'c@example.test', 'd@example.test'],
        14,
        14,
      ),
      'stg_plans.is_deleted': textProfile(
        'stg_plans',
        'is_deleted',
        ['deleted-a', 'deleted-b', 'deleted-c', 'deleted-d'],
        9,
        9,
      ),
      'mart_account_segments.current_plan_code': textProfile(
        'mart_account_segments',
        'current_plan_code',
        planCodes(),
        4,
        10,
      ),
      'mart_account_segments.normalized_plan_code': textProfile(
        'mart_account_segments',
        'normalized_plan_code',
        planCodes(),
        4,
        10,
      ),
      'stg_plan_segment_mapping.canonical_plan_code': textProfile(
        'stg_plan_segment_mapping',
        'canonical_plan_code',
        planCodes(),
        4,
        10,
      ),
      'stg_plans.canonical_plan_code': textProfile('stg_plans', 'canonical_plan_code', planCodes(), 4, 10),
    },
  };
}
|
||||
|
||||
describe('relationship discovery candidates', () => {
|
||||
it('normalizes warehouse prefixes and emits review candidates without declared primary keys', () => {
|
||||
const accounts = table('accounts-id', 'dim_accounts', [
|
||||
column('accounts-id', 'accounts-id-col', 'id', { primaryKey: false }),
|
||||
column('accounts-id', 'accounts-name-col', 'account_name', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const invoices = table('invoices-id', 'fct_invoices', [
|
||||
column('invoices-id', 'invoice-id-col', 'id', { primaryKey: false }),
|
||||
column('invoices-id', 'account-id-col', 'account_id', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([accounts, invoices]));
|
||||
|
||||
expect(candidates).toHaveLength(1);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
from: { tableId: 'invoices-id', columnIds: ['account-id-col'], columns: ['account_id'] },
|
||||
to: { tableId: 'accounts-id', columnIds: ['accounts-id-col'], columns: ['id'] },
|
||||
relationshipType: 'many_to_one',
|
||||
status: 'review',
|
||||
source: 'normalized_table_match',
|
||||
evidence: {
|
||||
sourceColumnBase: 'account',
|
||||
targetTableBase: 'account',
|
||||
targetKeyScore: 0.92,
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.8);
|
||||
expect(candidates[0]?.evidence.signalVector).toMatchObject({
|
||||
nameSimilarity: 0.92,
|
||||
typeCompatibility: 1,
|
||||
valueOverlap: 0,
|
||||
embeddingSimilarity: 0,
|
||||
profileUniqueness: 0.92,
|
||||
});
|
||||
expect(candidates[0]?.evidence.scoreBreakdown?.score).toBe(candidates[0]?.confidence);
|
||||
expect(candidates[0]?.evidence.scoreBreakdown?.contributions.nameSimilarity).toBeGreaterThan(0);
|
||||
expect(candidates[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
|
||||
);
|
||||
});
|
||||
|
||||
it('generates candidates for PascalCase ID columns without declared keys', () => {
|
||||
const artists = table('artist-id', 'Artist', [
|
||||
column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
|
||||
column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const albums = table('album-id', 'Album', [
|
||||
column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
|
||||
column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([artists, albums]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['Album.ArtistId->Artist.ArtistId']);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'normalized_table_match',
|
||||
evidence: {
|
||||
sourceColumnBase: 'artist',
|
||||
targetTableBase: 'artist',
|
||||
targetColumnBase: 'artist_id',
|
||||
targetKeyScore: 0.9,
|
||||
reasons: expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.9);
|
||||
});
|
||||
|
||||
it('uses the locality cap before scanning parent tables', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const invoices = table('invoices-id', 'invoices', [
|
||||
column('invoices-id', 'invoice-id-col', 'id'),
|
||||
column('invoices-id', 'account-id-col', 'account_id'),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([accounts, invoices]), {
|
||||
maxCandidateParentTables: 0,
|
||||
});
|
||||
|
||||
expect(candidates).toEqual([]);
|
||||
});
|
||||
|
||||
it('keeps the nearest parent when the locality cap is one', () => {
|
||||
const artists = table('artist-id', 'Artist', [
|
||||
column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
|
||||
column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const albums = table('album-id', 'Album', [
|
||||
column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
|
||||
column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
|
||||
]);
|
||||
const fillerTables = Array.from({ length: 25 }, (_, index) =>
|
||||
table(`filler-${index}`, `WarehouseFiller${index}`, [
|
||||
column(`filler-${index}`, `filler-${index}-id`, 'WarehouseFillerId', { primaryKey: false }),
|
||||
]),
|
||||
);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([albums, ...fillerTables, artists]), {
|
||||
maxCandidateParentTables: 1,
|
||||
});
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['Album.ArtistId->Artist.ArtistId']);
|
||||
});
|
||||
|
||||
it('uses final table tokens from dotted parent table names', () => {
|
||||
const customers = table('customer-id', 'SalesLT.Customer', [
|
||||
column('customer-id', 'customer-id-col', 'CustomerID', { primaryKey: false }),
|
||||
column('customer-id', 'customer-name-col', 'CustomerName', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const orders = table('order-id', 'SalesLT.SalesOrderHeader', [
|
||||
column('order-id', 'order-id-col', 'SalesOrderID', { primaryKey: false }),
|
||||
column('order-id', 'customer-id-fk-col', 'CustomerID', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([customers, orders]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['SalesLT.SalesOrderHeader.CustomerID->SalesLT.Customer.CustomerID']);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
evidence: {
|
||||
sourceColumnBase: 'customer',
|
||||
targetTableBase: 'sales_lt_customer',
|
||||
targetColumnBase: 'customer_id',
|
||||
targetKeyScore: 0.9,
|
||||
reasons: expect.arrayContaining(['foreign_key_suffix', 'inflection', 'target_key_like']),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('emits lower-confidence parent-table-name candidates when the target key name differs from the table name', () => {
|
||||
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
|
||||
column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', { primaryKey: true }),
|
||||
column('customer-account-id', 'account-name-col', 'AccountName', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const subscriptions = table('subscriptions-id', 'fct_subscriptions', [
|
||||
column('subscriptions-id', 'subscription-id-col', 'SubscriptionID', { primaryKey: false }),
|
||||
column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([customerAccounts, subscriptions]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID']);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'parent_table_name_match',
|
||||
relationshipType: 'many_to_one',
|
||||
status: 'review',
|
||||
evidence: {
|
||||
sourceColumnBase: 'customer_account',
|
||||
targetTableBase: 'crm_customer_account',
|
||||
targetColumnBase: 'business_entity_id',
|
||||
targetKeyScore: 1,
|
||||
nameScore: 0.82,
|
||||
reasons: expect.arrayContaining(['foreign_key_suffix', 'parent_table_name_match', 'target_key_like']),
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.evidence.signalVector).toMatchObject({
|
||||
nameSimilarity: 0.82,
|
||||
typeCompatibility: 1,
|
||||
});
|
||||
expect(candidates[0]?.evidence.scoreBreakdown?.score).toBe(candidates[0]?.confidence);
|
||||
});
|
||||
|
||||
it('does not emit parent-table-name candidates when the target key type is incompatible', () => {
|
||||
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
|
||||
column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', {
|
||||
primaryKey: true,
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const subscriptions = table('subscriptions-id', 'fct_subscriptions', [
|
||||
column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', {
|
||||
primaryKey: false,
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([customerAccounts, subscriptions]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).not.toContain('fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID');
|
||||
});
|
||||
|
||||
it('does not use parent-table-name matching to create same-table same-column self-links', () => {
|
||||
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
|
||||
column('customer-account-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
|
||||
column('customer-account-id', 'account-name-col', 'AccountName', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([customerAccounts]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).not.toContain('crm.CustomerAccount.CustomerAccountID->crm.CustomerAccount.CustomerAccountID');
|
||||
});
|
||||
|
||||
it('uses profile evidence to generate natural-key candidates without id-like target names', () => {
|
||||
const countries = table('countries-id', 'dim_countries', [
|
||||
column('countries-id', 'countries-code-col', 'iso_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('countries-id', 'countries-name-col', 'name', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const accounts = table('accounts-id', 'fct_accounts', [
|
||||
column('accounts-id', 'account-id-col', 'id', { primaryKey: false }),
|
||||
column('accounts-id', 'country-code-col', 'country_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const profiles = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'sqlite',
|
||||
sqlAvailable: true,
|
||||
queryCount: 0,
|
||||
tables: [],
|
||||
warnings: [],
|
||||
columns: {
|
||||
'dim_countries.iso_code': {
|
||||
table: { catalog: null, db: 'public', name: 'dim_countries' },
|
||||
column: 'iso_code',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['DE', 'FR', 'US'],
|
||||
minTextLength: 2,
|
||||
maxTextLength: 2,
|
||||
},
|
||||
'fct_accounts.country_code': {
|
||||
table: { catalog: null, db: 'public', name: 'fct_accounts' },
|
||||
column: 'country_code',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 4,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 0.75,
|
||||
nullRate: 0,
|
||||
sampleValues: ['FR', 'US'],
|
||||
minTextLength: 2,
|
||||
maxTextLength: 2,
|
||||
},
|
||||
},
|
||||
} satisfies KtxRelationshipProfileArtifact;
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([countries, accounts]), { profiles });
|
||||
|
||||
expect(candidates).toHaveLength(1);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'profile_match',
|
||||
from: { tableId: 'accounts-id', columnIds: ['country-code-col'], columns: ['country_code'] },
|
||||
to: { tableId: 'countries-id', columnIds: ['countries-code-col'], columns: ['iso_code'] },
|
||||
evidence: {
|
||||
sourceColumnBase: 'country',
|
||||
targetTableBase: 'country',
|
||||
targetColumnBase: 'iso_code',
|
||||
targetKeyScore: 0.86,
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.78);
|
||||
expect(candidates[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining([
|
||||
'foreign_key_code_suffix',
|
||||
'normalized_table_name',
|
||||
'profile_unique_target',
|
||||
'profile_sample_overlap',
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('drops same-table same-column self-links using ordered endpoint equality', () => {
|
||||
const accounts = table('accounts-id', 'stg_accounts', [
|
||||
column('accounts-id', 'accounts-account-id-col', 'account_id', { primaryKey: false }),
|
||||
column('accounts-id', 'accounts-name-col', 'account_name', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([accounts]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).not.toContain('stg_accounts.account_id->stg_accounts.account_id');
|
||||
});
|
||||
|
||||
it('keeps legitimate same-table different-column self-references', () => {
|
||||
const employees = table('employees-id', 'employees', [
|
||||
column('employees-id', 'employees-id-col', 'id', { primaryKey: false }),
|
||||
column('employees-id', 'employees-parent-id-col', 'parent_id', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([employees]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toContain('employees.parent_id->employees.id');
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'self_reference',
|
||||
evidence: {
|
||||
reasons: expect.arrayContaining(['self_reference']),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('emits column_suffix_match candidates for relationship-key-shaped trailing target columns', () => {
|
||||
const plans = table('plans-id', 'stg_plans', [
|
||||
column('plans-id', 'plans-plan-code-col', 'plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-canonical-plan-code-col', 'canonical_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-created-at-col', 'created_at', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-email-col', 'email', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-is-deleted-col', 'is_deleted', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const accountSegments = table('account-segments-id', 'mart_account_segments', [
|
||||
column('account-segments-id', 'current-plan-code-col', 'current_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'normalized-plan-code-col', 'normalized_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'source-created-at-col', 'source_created_at', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'billing-email-col', 'billing_email', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'source-is-deleted-col', 'source_is_deleted', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const mapping = table('mapping-id', 'stg_plan_segment_mapping', [
|
||||
column('mapping-id', 'mapping-canonical-plan-code-col', 'canonical_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([plans, accountSegments, mapping]), {
|
||||
profiles: planCodeProfiles(),
|
||||
});
|
||||
const candidateKeys = candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
);
|
||||
|
||||
expect(candidateKeys).toEqual([
|
||||
'mart_account_segments.current_plan_code->stg_plans.plan_code',
|
||||
'mart_account_segments.normalized_plan_code->stg_plans.plan_code',
|
||||
'stg_plan_segment_mapping.canonical_plan_code->stg_plans.plan_code',
|
||||
'stg_plans.canonical_plan_code->stg_plans.plan_code',
|
||||
]);
|
||||
expect(candidates).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
source: 'column_suffix_match',
|
||||
confidence: expect.any(Number),
|
||||
evidence: expect.objectContaining({
|
||||
nameScore: 0.78,
|
||||
targetKeyScore: 0.86,
|
||||
reasons: expect.arrayContaining(['column_suffix_match', 'profile_unique_target']),
|
||||
}),
|
||||
}),
|
||||
]),
|
||||
);
|
||||
expect(candidateKeys).not.toContain('mart_account_segments.source_created_at->stg_plans.created_at');
|
||||
expect(candidateKeys).not.toContain('mart_account_segments.billing_email->stg_plans.email');
|
||||
expect(candidateKeys).not.toContain('mart_account_segments.source_is_deleted->stg_plans.is_deleted');
|
||||
const suffixCandidate = candidates.find(
|
||||
(candidate) => candidate.from.table.name === 'mart_account_segments' && candidate.from.columns[0] === 'current_plan_code',
|
||||
);
|
||||
expect(suffixCandidate?.confidence).toBe(suffixCandidate?.evidence.scoreBreakdown?.score);
|
||||
expect(suffixCandidate?.evidence.signalVector).toMatchObject({
|
||||
nameSimilarity: 0.78,
|
||||
typeCompatibility: 1,
|
||||
valueOverlap: 1,
|
||||
profileUniqueness: 1,
|
||||
profileNullRate: 1,
|
||||
});
|
||||
});
|
||||
|
||||
it('does not suffix-match bare single-token targets or incompatible target types', () => {
|
||||
const users = table('users-id', 'users', [
|
||||
column('users-id', 'users-id-col', 'id', { primaryKey: false }),
|
||||
column('users-id', 'users-account-id-col', 'account_id', { primaryKey: false }),
|
||||
]);
|
||||
const plans = table('plans-id', 'plans', [
|
||||
column('plans-id', 'plans-plan-code-col', 'plan_code', {
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
}),
|
||||
]);
|
||||
const accounts = table('accounts-id', 'accounts', [
|
||||
column('accounts-id', 'current-plan-code-col', 'current_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const profiles = {
|
||||
...planCodeProfiles(),
|
||||
columns: {
|
||||
...planCodeProfiles().columns,
|
||||
'users.id': {
|
||||
table: { catalog: null, db: 'public', name: 'users' },
|
||||
column: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 2,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
'plans.plan_code': {
|
||||
table: { catalog: null, db: 'public', name: 'plans' },
|
||||
column: 'plan_code',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 2,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
},
|
||||
} satisfies KtxRelationshipProfileArtifact;
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([users, plans, accounts]), { profiles });
|
||||
const candidateKeys = candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
);
|
||||
|
||||
expect(candidateKeys).not.toContain('users.account_id->users.id');
|
||||
expect(candidateKeys).not.toContain('accounts.current_plan_code->plans.plan_code');
|
||||
});
|
||||
|
||||
it('uses column embeddings as a recall source for non-standard source names', () => {
|
||||
const customers = table('customers-id', 'customers', [
|
||||
column('customers-id', 'customers-id-col', 'id', {
|
||||
primaryKey: false,
|
||||
embedding: [1, 0, 0],
|
||||
}),
|
||||
column('customers-id', 'customers-name-col', 'name', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
embedding: [0, 1, 0],
|
||||
}),
|
||||
]);
|
||||
const orders = table('orders-id', 'orders', [
|
||||
column('orders-id', 'orders-id-col', 'id', {
|
||||
primaryKey: false,
|
||||
embedding: [0, 0, 1],
|
||||
}),
|
||||
column('orders-id', 'buyer-ref-col', 'buyer_ref', {
|
||||
primaryKey: false,
|
||||
embedding: [0.995, 0.005, 0],
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([customers, orders]), {
|
||||
embeddingSimilarityThreshold: 0.95,
|
||||
});
|
||||
|
||||
expect(candidates).toHaveLength(1);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'embedding_similarity',
|
||||
from: { tableId: 'orders-id', columnIds: ['buyer-ref-col'], columns: ['buyer_ref'] },
|
||||
to: { tableId: 'customers-id', columnIds: ['customers-id-col'], columns: ['id'] },
|
||||
relationshipType: 'many_to_one',
|
||||
status: 'review',
|
||||
evidence: {
|
||||
sourceColumnBase: 'buyer_ref',
|
||||
targetTableBase: 'customer',
|
||||
targetColumnBase: 'id',
|
||||
targetKeyScore: 0.92,
|
||||
embeddingSimilarity: expect.any(Number),
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.9);
|
||||
expect(candidates[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining(['embedding_similarity', 'target_key_like']),
|
||||
);
|
||||
});
|
||||
|
||||
it('singularizes names and caps candidates per source column deterministically', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const archivedAccounts = table('archived-accounts-id', 'accounts_archive', [
|
||||
column('archived-accounts-id', 'archived-accounts-id-col', 'id'),
|
||||
]);
|
||||
const events = table('events-id', 'product_events', [
|
||||
column('events-id', 'event-id-col', 'id'),
|
||||
column('events-id', 'account-id-col', 'account_id'),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([events, archivedAccounts, accounts]), {
|
||||
maxCandidatesPerColumn: 1,
|
||||
});
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['product_events.account_id->accounts.id']);
|
||||
});
|
||||
|
||||
it('infers target primary-key candidates from incoming review links', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const users = table('users-id', 'users', [column('users-id', 'users-id-col', 'id')]);
|
||||
const events = table('events-id', 'product_events', [
|
||||
column('events-id', 'event-id-col', 'id'),
|
||||
column('events-id', 'account-id-col', 'account_id'),
|
||||
column('events-id', 'user-id-col', 'user_id'),
|
||||
]);
|
||||
|
||||
const candidates = generateKtxRelationshipDiscoveryCandidates(schema([accounts, users, events]));
|
||||
const inferredPks = inferKtxRelationshipTargetPks(candidates);
|
||||
|
||||
expect(inferredPks).toEqual([
|
||||
{
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
score: expect.any(Number),
|
||||
status: 'review',
|
||||
incomingCandidateCount: 1,
|
||||
},
|
||||
{
|
||||
table: 'users',
|
||||
columns: ['id'],
|
||||
score: expect.any(Number),
|
||||
status: 'review',
|
||||
incomingCandidateCount: 1,
|
||||
},
|
||||
]);
|
||||
expect(inferredPks.every((pk) => pk.score >= 0.8)).toBe(true);
|
||||
});
|
||||
|
||||
it('does not generate candidates from primary-key source columns or incompatible target types', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [
|
||||
column('accounts-id', 'accounts-id-col', 'id', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const invoices = table('invoices-id', 'invoices', [
|
||||
column('invoices-id', 'invoice-id-col', 'id', { primaryKey: true }),
|
||||
column('invoices-id', 'account-id-col', 'account_id', { nativeType: 'INTEGER', normalizedType: 'integer' }),
|
||||
]);
|
||||
|
||||
expect(generateKtxRelationshipDiscoveryCandidates(schema([accounts, invoices]))).toEqual([]);
|
||||
});
|
||||
|
||||
it('normalizes layer prefixes, punctuation, plural forms, and non-plural trailing s words', () => {
|
||||
expect(normalizeKtxRelationshipName('mart__Sales_Accounts')).toMatchObject({
|
||||
normalized: 'sales_accounts',
|
||||
singular: 'sales_account',
|
||||
tokens: ['sales', 'accounts'],
|
||||
});
|
||||
expect(normalizeKtxRelationshipName('dim_users')).toMatchObject({
|
||||
normalized: 'users',
|
||||
singular: 'user',
|
||||
tokens: ['users'],
|
||||
});
|
||||
expect(normalizeKtxRelationshipName('Address')).toMatchObject({
|
||||
normalized: 'address',
|
||||
singular: 'address',
|
||||
plural: 'addresses',
|
||||
tokens: ['address'],
|
||||
});
|
||||
});
|
||||
|
||||
it('merges duplicate deterministic and LLM proposal candidates without losing LLM rationale', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const invoices = table('invoices-id', 'invoices', [column('invoices-id', 'account-id-col', 'account_id')]);
|
||||
const [deterministic] = generateKtxRelationshipDiscoveryCandidates(schema([accounts, invoices]));
|
||||
if (!deterministic) {
|
||||
throw new Error('Expected deterministic relationship candidate');
|
||||
}
|
||||
const llmCandidate = {
|
||||
...deterministic,
|
||||
confidence: 0.99,
|
||||
source: 'llm_proposal' as const,
|
||||
evidence: {
|
||||
...deterministic.evidence,
|
||||
reasons: ['llm_proposal', 'llm_pk_proposal'],
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Invoices point at the owning account dimension.',
|
||||
},
|
||||
};
|
||||
|
||||
const merged = mergeKtxRelationshipDiscoveryCandidates([deterministic, llmCandidate]);
|
||||
|
||||
expect(merged).toHaveLength(1);
|
||||
expect(merged[0]).toMatchObject({
|
||||
id: deterministic.id,
|
||||
source: 'normalized_table_match',
|
||||
confidence: 0.99,
|
||||
evidence: {
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Invoices point at the owning account dimension.',
|
||||
},
|
||||
});
|
||||
expect(merged[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like', 'llm_proposal']),
|
||||
);
|
||||
});
|
||||
});
|
||||
783
packages/cli/src/context/scan/relationship-candidates.ts
Normal file
783
packages/cli/src/context/scan/relationship-candidates.ts
Normal file
|
|
@ -0,0 +1,783 @@
|
|||
import type {
|
||||
KtxEnrichedColumn,
|
||||
KtxEnrichedSchema,
|
||||
KtxEnrichedTable,
|
||||
KtxRelationshipEndpoint,
|
||||
KtxRelationshipType,
|
||||
} from './enrichment-types.js';
|
||||
import { localCandidateTables } from './relationship-locality.js';
|
||||
import {
|
||||
type KtxRelationshipNormalizedName,
|
||||
normalizeKtxRelationshipName,
|
||||
pluralizeKtxRelationshipToken,
|
||||
singularizeKtxRelationshipToken,
|
||||
} from './relationship-name-similarity.js';
|
||||
export { normalizeKtxRelationshipName } from './relationship-name-similarity.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import {
|
||||
scoreKtxRelationshipCandidate,
|
||||
type KtxRelationshipScoreBreakdown,
|
||||
type KtxRelationshipSignalVector,
|
||||
} from './relationship-scoring.js';
|
||||
|
||||
/**
 * Tag stored in a candidate's `source` field, naming the discovery strategy
 * (or external proposer) the candidate is attributed to.
 */
type KtxRelationshipDiscoveryCandidateSource =
  | 'exact_column_match'
  | 'normalized_table_match'
  | 'parent_table_name_match'
  | 'inflection'
  | 'self_reference'
  | 'profile_match'
  | 'column_suffix_match'
  | 'embedding_similarity'
  | 'llm_proposal';

/** Status carried by a discovered candidate; `'review'` is the only value declared. */
type KtxRelationshipDiscoveryCandidateStatus = 'review';
|
||||
|
||||
/**
 * Supporting evidence attached to a relationship discovery candidate.
 *
 * NOTE(review): the producers of most fields live outside this excerpt; the
 * per-field notes below only state what the declarations themselves show.
 */
interface KtxRelationshipDiscoveryCandidateEvidence {
  /** Base name derived from the source column (presumably its name minus the key suffix — confirm at the producer). */
  sourceColumnBase: string;
  /** Base name associated with the target table. */
  targetTableBase: string;
  /** Base name associated with the target column. */
  targetColumnBase: string;
  /** Numeric score describing how key-like the target column is. */
  targetKeyScore: number;
  /** Numeric name-match score for the source/target pair. */
  nameScore: number;
  /** Reason codes explaining why the candidate was proposed. */
  reasons: string[];
  /** Optional per-signal vector that was scored for this candidate. */
  signalVector?: KtxRelationshipSignalVector;
  /** Optional breakdown of the computed score. */
  scoreBreakdown?: KtxRelationshipScoreBreakdown;
  /** Optional embedding similarity between the two columns. */
  embeddingSimilarity?: number;
  /** Optional confidence reported by an LLM proposal. */
  llmConfidence?: number;
  /** Optional free-text rationale reported by an LLM proposal. */
  llmRationale?: string;
}
|
||||
|
||||
/**
 * A proposed relationship between two endpoints, together with its confidence,
 * originating strategy, status, and supporting evidence.
 */
export interface KtxRelationshipDiscoveryCandidate {
  /** Identifier of the candidate. */
  id: string;
  /** Endpoint the relationship starts from (the referencing side). */
  from: KtxRelationshipEndpoint;
  /** Endpoint the relationship points to (the referenced side). */
  to: KtxRelationshipEndpoint;
  /** Relationship kind between the two endpoints. */
  relationshipType: KtxRelationshipType;
  /** Numeric confidence assigned to the candidate. */
  confidence: number;
  /** Strategy the candidate is attributed to. */
  source: KtxRelationshipDiscoveryCandidateSource;
  /** Current status of the candidate. */
  status: KtxRelationshipDiscoveryCandidateStatus;
  /** Evidence backing the proposal. */
  evidence: KtxRelationshipDiscoveryCandidateEvidence;
}
|
||||
|
||||
/**
 * Optional tuning knobs for relationship candidate discovery.
 *
 * NOTE(review): only `maxCandidateParentTables` and `profiles` are consumed
 * within this excerpt; the remaining options are read elsewhere in the file.
 */
export interface KtxRelationshipDiscoveryCandidateOptions {
  /** Cap on candidates kept per source column (consumer not shown here). */
  maxCandidatesPerColumn?: number;
  /**
   * Cap on parent tables considered for a source column. When omitted,
   * `candidateParentTables` uses 20; a value of 0 or less yields no parent tables.
   */
  maxCandidateParentTables?: number;
  /** Cap on embedding-based candidates per source column (consumer not shown here). */
  maxEmbeddingCandidatesPerColumn?: number;
  /** Minimum confidence threshold (consumer not shown here). */
  minConfidence?: number;
  /** Embedding similarity threshold (consumer not shown here). */
  embeddingSimilarityThreshold?: number;
  /** Toggle for embedding-based discovery (consumer not shown here). */
  useEmbeddings?: boolean;
  /** Profiling artifact consulted for per-column and per-table statistics. */
  profiles?: KtxRelationshipProfileArtifact;
}
|
||||
|
||||
/**
 * A primary-key guess for a target table, summarising the candidates that
 * point at it.
 *
 * @internal
 */
export interface KtxRelationshipInferredTargetPk {
  /** Name of the table the inferred key belongs to. */
  table: string;
  /** Column names making up the inferred key. */
  columns: string[];
  /** Numeric score assigned to the inference. */
  score: number;
  /** Status of the inference; `'review'` is the only declared value. */
  status: 'review';
  /** Number of incoming candidates counted for this key. */
  incomingCandidateCount: number;
}
|
||||
|
||||
/** Result of recognising a source column as a reference: the extracted base name and the reason code. */
interface KtxRelationshipSourceColumnReference {
  /** Column name with its reference suffix removed. */
  base: string;
  /** Reason code naming the suffix rule that matched. */
  reason: string;
}

/** Score and reason codes describing how key-like a target column is. */
interface KtxRelationshipTargetKeyEvidence {
  /** Key-likeness score; 0 means no evidence was found. */
  score: number;
  /** Reason codes supporting the score; empty when the score is 0. */
  reasons: string[];
}
|
||||
|
||||
/** Lower-cased type names treated as one mutually compatible integer family. */
const INTEGER_TYPES = new Set([
  'integer',
  'int',
  'bigint',
  'smallint',
  'tinyint',
  'int4',
  'int8',
  'number',
]);

/** Lower-cased type names treated as one mutually compatible string family. */
const STRING_TYPES = new Set([
  'text',
  'varchar',
  'character varying',
  'char',
  'character',
  'string',
]);

/** Lower-cased type names treated as one mutually compatible UUID family. */
const UUID_TYPES = new Set(['uuid', 'uniqueidentifier']);

/** Normalized column names handled as self-references (they may point back at their own table). */
const SELF_REFERENCE_NAMES = new Set([
  'parent_id',
  'manager_id',
  'reported_to_id',
  'supervisor_id',
  'reports_to_id',
]);

/** Source-column suffixes that mark a reference, each paired with its reason code; checked in this order. */
const REFERENCE_SUFFIXES: { suffix: string; reason: string }[] = [
  { suffix: '_id', reason: 'foreign_key_suffix' },
  { suffix: '_key', reason: 'foreign_key_key_suffix' },
  { suffix: '_code', reason: 'foreign_key_code_suffix' },
  { suffix: '_uuid', reason: 'foreign_key_uuid_suffix' },
];

/** Suffixes a multi-token target column must end with to count as relationship-key shaped. */
const RELATIONSHIP_KEY_TARGET_SUFFIXES = ['_id', '_key', '_code', '_uuid'] as const;
|
||||
// Per-object memoization tables. They are keyed weakly by the table/column
// object itself, so entries do not keep those objects alive.

/** Memoized result of `tableAliases` per table object. */
const tableAliasesCache = new WeakMap<KtxEnrichedTable, Set<string>>();

/** Memoized result of `parentTableNameAliases` per table object. */
const parentTableNameAliasesCache = new WeakMap<KtxEnrichedTable, Set<string>>();

/** Memoized result of `normalizedColumnName` per column object. */
const normalizedColumnNameCache = new WeakMap<KtxEnrichedColumn, KtxRelationshipNormalizedName>();
|
||||
|
||||
/**
 * Returns the normalized form of a column's name, memoized per column object.
 *
 * @param column - Column whose `name` is normalized.
 * @returns The cached normalization if this column object was seen before,
 *   otherwise a freshly computed one (which is then cached).
 */
function normalizedColumnName(column: KtxEnrichedColumn): KtxRelationshipNormalizedName {
  let entry = normalizedColumnNameCache.get(column);
  if (!entry) {
    entry = normalizeKtxRelationshipName(column.name);
    normalizedColumnNameCache.set(column, entry);
  }
  return entry;
}
|
||||
|
||||
/**
 * Tells whether a target column looks like a relationship key: its normalized
 * name has at least two tokens and ends with one of
 * `RELATIONSHIP_KEY_TARGET_SUFFIXES`.
 *
 * @param column - Candidate target column.
 * @returns `true` when both conditions hold.
 */
function isRelationshipKeyShapedTarget(column: KtxEnrichedColumn): boolean {
  const { tokens, normalized: name } = normalizedColumnName(column);
  // Bare single-token names (e.g. just "id") are never key-shaped here.
  if (tokens.length < 2) {
    return false;
  }
  for (const suffix of RELATIONSHIP_KEY_TARGET_SUFFIXES) {
    if (name.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Checks whether the source column's normalized name ends with
 * `_<target normalized name>`, e.g. `current_plan_code` vs. `plan_code`.
 *
 * @param input.fromColumn - Source (referencing) column.
 * @param input.toColumn - Target (referenced) column.
 * @returns `true` for a proper suffix match; `false` when the target name is
 *   empty or the two names are identical.
 */
function columnSuffixMatchesTarget(input: { fromColumn: KtxEnrichedColumn; toColumn: KtxEnrichedColumn }): boolean {
  const sourceName = normalizedColumnName(input.fromColumn).normalized;
  const targetName = normalizedColumnName(input.toColumn).normalized;
  if (sourceName === targetName || targetName.length === 0) {
    return false;
  }
  return sourceName.endsWith(`_${targetName}`);
}
|
||||
|
||||
/**
 * Produces a comparable, lower-cased base type name for a column.
 *
 * Uses `normalizedType` when it is non-empty and falls back to `nativeType`.
 * Everything from the first `(` onward (length/precision arguments) is
 * dropped, and whitespace left in front of the parenthesis is trimmed so that
 * `"varchar (255)"` yields `"varchar"` rather than `"varchar "`.
 *
 * @param column - Column whose type is normalized.
 * @returns The base type name, or `''` when the column has no type information.
 */
function normalizeType(column: KtxEnrichedColumn): string {
  const rawType = (column.normalizedType || column.nativeType || '').toLowerCase().trim();
  const parenIndex = rawType.indexOf('(');
  if (parenIndex === -1) {
    return rawType;
  }
  // Trim again: the outer trim() cannot remove whitespace that sat between
  // the type name and its parameter list.
  return rawType.slice(0, parenIndex).trimEnd();
}
|
||||
|
||||
/**
 * Decides whether two columns have join-compatible types.
 *
 * Columns are compatible when their normalized type names are equal (this
 * includes two columns with no type information) or when both names belong to
 * the same family: integer, string, or UUID.
 *
 * @param left - First column.
 * @param right - Second column.
 * @returns `true` when the types are considered compatible.
 */
function typesCompatible(left: KtxEnrichedColumn, right: KtxEnrichedColumn): boolean {
  const first = normalizeType(left);
  const second = normalizeType(right);
  if (first === second) {
    return true;
  }
  const families = [INTEGER_TYPES, STRING_TYPES, UUID_TYPES];
  return families.some((family) => family.has(first) && family.has(second));
}
|
||||
|
||||
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * @param left - First vector, or `null`.
 * @param right - Second vector, or `null`.
 * @returns `dot / (|left| * |right|)`, or `0` when either vector is missing,
 *   empty, of a different length than the other, or has zero magnitude.
 */
function cosineSimilarity(left: readonly number[] | null, right: readonly number[] | null): number {
  if (!left || !right) {
    return 0;
  }
  const size = left.length;
  if (size === 0 || size !== right.length) {
    return 0;
  }

  let dotProduct = 0;
  let leftNormSquared = 0;
  let rightNormSquared = 0;
  for (const [position, rawLeft] of left.entries()) {
    // Missing slots count as 0.
    const a = rawLeft ?? 0;
    const b = right[position] ?? 0;
    dotProduct += a * b;
    leftNormSquared += a * a;
    rightNormSquared += b * b;
  }

  // A zero vector has no direction; report no similarity instead of dividing by zero.
  if (leftNormSquared === 0 || rightNormSquared === 0) {
    return 0;
  }

  return dotProduct / (Math.sqrt(leftNormSquared) * Math.sqrt(rightNormSquared));
}
|
||||
|
||||
/**
 * Reports whether a column carries a non-empty embedding array.
 *
 * @param column - Column to inspect.
 * @returns `true` only when `column.embedding` is an array with at least one element.
 */
function hasUsableEmbedding(column: KtxEnrichedColumn): boolean {
  const { embedding } = column;
  if (!Array.isArray(embedding)) {
    return false;
  }
  return embedding.length > 0;
}
|
||||
|
||||
/**
 * Interprets a source column name as a reference to another entity.
 *
 * Names listed in `SELF_REFERENCE_NAMES` are handled first: the trailing
 * `_id` is removed and the reason is `'foreign_key_suffix'`. Otherwise the
 * first entry of `REFERENCE_SUFFIXES` whose suffix ends the normalized name
 * and leaves a base longer than one character wins; that base is singularized.
 *
 * @param column - Source column to analyse.
 * @returns The base name and reason code, or `null` when the column does not
 *   look like a reference.
 */
function sourceColumnReference(column: KtxEnrichedColumn): KtxRelationshipSourceColumnReference | null {
  const name = normalizedColumnName(column).normalized;
  if (SELF_REFERENCE_NAMES.has(name)) {
    return { base: name.replace(/_id$/u, ''), reason: 'foreign_key_suffix' };
  }

  const matched = REFERENCE_SUFFIXES.find(
    ({ suffix }) => name.endsWith(suffix) && name.length - suffix.length > 1,
  );
  if (!matched) {
    return null;
  }

  const base = name.slice(0, -matched.suffix.length);
  return { base: singularizeKtxRelationshipToken(base), reason: matched.reason };
}
|
||||
|
||||
/**
 * Normalizes `name` and adds its normalized, singular, and plural forms to
 * `aliases`, skipping any form that is an empty string.
 *
 * @param aliases - Set mutated in place.
 * @param name - Raw name to normalize.
 */
function addNormalizedTableAlias(aliases: Set<string>, name: string): void {
  const { normalized, singular, plural } = normalizeKtxRelationshipName(name);
  for (const form of [normalized, singular, plural]) {
    if (form.length > 0) {
      aliases.add(form);
    }
  }
}
|
||||
|
||||
/**
 * Builds (and memoizes per table object) the set of names a table may be
 * referred to by.
 *
 * The set always holds the normalized, singular, and plural forms of the
 * table name. For multi-token names it also holds the last token together
 * with that token's singular and plural forms.
 *
 * @param table - Table whose `ref.name` is analysed.
 * @returns The alias set; the same instance is returned on later calls for
 *   the same table object.
 */
function tableAliases(table: KtxEnrichedTable): Set<string> {
  const memoized = tableAliasesCache.get(table);
  if (memoized) {
    return memoized;
  }

  const name = normalizeKtxRelationshipName(table.ref.name);
  const aliases = new Set([name.normalized, name.singular, name.plural]);

  const trailingToken = name.tokens.length > 1 ? name.tokens[name.tokens.length - 1] : undefined;
  if (trailingToken) {
    const singularTrailing = singularizeKtxRelationshipToken(trailingToken);
    aliases.add(trailingToken);
    aliases.add(singularTrailing);
    aliases.add(pluralizeKtxRelationshipToken(singularTrailing));
  }

  tableAliasesCache.set(table, aliases);
  return aliases;
}
|
||||
|
||||
/**
 * Extracts the last run of letters/digits from a table name, splitting on any
 * other character (dots, underscores, whitespace, ...).
 *
 * @param table - Table whose `ref.name` is split.
 * @returns The final non-empty segment, or the full name when the name
 *   contains no letters or digits at all.
 */
function finalTableNamePart(table: KtxEnrichedTable): string {
  const fullName = table.ref.name;
  const segments = fullName.split(/[^\p{L}\p{N}]+/u).filter((segment) => segment.length > 0);
  const lastSegment = segments.pop();
  return lastSegment ?? fullName;
}
|
||||
|
||||
/**
 * Returns the table's aliases extended with the aliases of the final segment
 * of its name, memoized per table object.
 *
 * The result is a copy: the set cached by `tableAliases` is not modified.
 *
 * @param table - Table to compute aliases for.
 * @returns The extended alias set.
 */
function parentTableNameAliases(table: KtxEnrichedTable): Set<string> {
  let aliases = parentTableNameAliasesCache.get(table);
  if (!aliases) {
    aliases = new Set(tableAliases(table));
    addNormalizedTableAlias(aliases, finalTableNamePart(table));
    parentTableNameAliasesCache.set(table, aliases);
  }
  return aliases;
}
|
||||
|
||||
/**
 * Scores how strongly a column looks like its table's key, from names alone.
 *
 * Rules are checked in order and the first match wins:
 * declared primary key → 1; name `id` → 0.92; `<table alias>_id` → 0.9;
 * `<table alias>_key` → 0.82; name `key` or `uuid` → 0.74; otherwise 0.
 *
 * @param table - Table owning the column.
 * @param column - Column to score.
 * @returns The score for the first matching rule, or 0.
 */
function targetKeyScore(table: KtxEnrichedTable, column: KtxEnrichedColumn): number {
  const name = normalizedColumnName(column).normalized;
  const tableKeyBases = parentTableNameAliases(table);

  // True when `name` is exactly "<alias><suffix>" for some alias of the table.
  const isAliasWithSuffix = (suffix: string): boolean =>
    name.endsWith(suffix) && tableKeyBases.has(name.slice(0, name.length - suffix.length));

  if (column.primaryKey) {
    return 1;
  }
  if (name === 'id') {
    return 0.92;
  }
  if (isAliasWithSuffix('_id')) {
    return 0.9;
  }
  if (isAliasWithSuffix('_key')) {
    return 0.82;
  }
  return name === 'key' || name === 'uuid' ? 0.74 : 0;
}
|
||||
|
||||
/**
 * Looks up a column's profile under the key `"<tableName>.<columnName>"`.
 *
 * @param profiles - Profiling artifact, if one was supplied.
 * @param tableName - Table name part of the key.
 * @param columnName - Column name part of the key.
 * @returns The profile entry, or `null` when there is no artifact or no entry.
 */
function profileColumn(
  profiles: KtxRelationshipProfileArtifact | undefined,
  tableName: string,
  columnName: string,
) {
  if (!profiles) {
    return null;
  }
  const key = `${tableName}.${columnName}`;
  return profiles.columns[key] ?? null;
}
|
||||
|
||||
/**
 * Measures which share of the source column's sample values also occurs in
 * the target column's samples, comparing case-insensitively.
 *
 * @param input.profiles - Profiling artifact, if any.
 * @param input.fromTable - Table owning the source column.
 * @param input.fromColumn - Source column.
 * @param input.toTable - Table owning the target column.
 * @param input.toColumn - Target column.
 * @returns Matching source samples divided by the number of source samples,
 *   or `0` when either profile is missing or has no samples.
 */
function profileSampleOverlap(input: {
  profiles: KtxRelationshipProfileArtifact | undefined;
  fromTable: KtxEnrichedTable;
  fromColumn: KtxEnrichedColumn;
  toTable: KtxEnrichedTable;
  toColumn: KtxEnrichedColumn;
}): number {
  const { profiles, fromTable, fromColumn, toTable, toColumn } = input;
  const sourceProfile = profileColumn(profiles, fromTable.ref.name, fromColumn.name);
  const targetProfile = profileColumn(profiles, toTable.ref.name, toColumn.name);
  if (!sourceProfile || !targetProfile) {
    return 0;
  }

  const sourceSamples = sourceProfile.sampleValues;
  const targetSamples = targetProfile.sampleValues;
  if (sourceSamples.length === 0 || targetSamples.length === 0) {
    return 0;
  }

  const targetLookup = new Set<string>();
  for (const sample of targetSamples) {
    targetLookup.add(sample.toLowerCase());
  }

  let shared = 0;
  for (const sample of sourceSamples) {
    if (targetLookup.has(sample.toLowerCase())) {
      shared += 1;
    }
  }
  return shared / sourceSamples.length;
}
|
||||
|
||||
/**
 * Reads the profiled row count of the first table profile whose name matches.
 *
 * @param profiles - Profiling artifact, if any.
 * @param tableName - Table name to look for.
 * @returns The row count, or `null` when there is no artifact, no matching
 *   table profile, or the matching profile has no row count.
 */
function tableProfileRowCount(profiles: KtxRelationshipProfileArtifact | undefined, tableName: string): number | null {
  if (!profiles) {
    return null;
  }
  for (const entry of profiles.tables) {
    if (entry.table.name === tableName) {
      // Only the first match is consulted, even if it lacks a row count.
      return entry.rowCount ?? null;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Derives a structural prior for a source/target table pair.
 *
 * - Same table (self-reference): 0.72.
 * - Either row count unavailable or not positive: 0.5.
 * - Target/source row-count ratio within [0.05, 20]: 0.7.
 * - Any other ratio: 0.4.
 *
 * @param input.profiles - Profiling artifact used for table row counts, if any.
 * @param input.fromTable - Source table.
 * @param input.toTable - Target table.
 * @returns The prior as listed above.
 */
function structuralPriorScore(input: {
  profiles: KtxRelationshipProfileArtifact | undefined;
  fromTable: KtxEnrichedTable;
  toTable: KtxEnrichedTable;
}): number {
  const { profiles, fromTable, toTable } = input;
  if (fromTable.id === toTable.id) {
    return 0.72;
  }

  const sourceRows = tableProfileRowCount(profiles, fromTable.ref.name);
  const targetRows = tableProfileRowCount(profiles, toTable.ref.name);
  if (sourceRows === null || targetRows === null) {
    return 0.5;
  }
  if (sourceRows <= 0 || targetRows <= 0) {
    return 0.5;
  }

  const ratio = targetRows / sourceRows;
  return ratio >= 0.05 && ratio <= 20 ? 0.7 : 0.4;
}
|
||||
|
||||
/**
 * Assembles the signal vector that is scored for a candidate relationship.
 *
 * Caller-supplied scores are combined with profile-derived signals:
 * - `profileUniqueness` is the target column's profiled uniqueness ratio, or
 *   `targetKeyScore` when the target has no profile.
 * - `profileNullRate` holds the source column's NON-null share
 *   (`1 - nullRate`), or 0.5 when the source has no profile.
 * - `typeCompatibility` is 1 for compatible column types, else 0.
 * - `embeddingSimilarity` defaults to 0 when not provided.
 *
 * @param input - Endpoints, optional profiles, and the precomputed scores.
 * @returns The populated signal vector.
 */
function candidateSignalVector(input: {
  profiles: KtxRelationshipProfileArtifact | undefined;
  fromTable: KtxEnrichedTable;
  fromColumn: KtxEnrichedColumn;
  toTable: KtxEnrichedTable;
  toColumn: KtxEnrichedColumn;
  targetKeyScore: number;
  nameScore: number;
  valueOverlap: number;
  embeddingSimilarity?: number;
}): KtxRelationshipSignalVector {
  const { profiles, fromTable, fromColumn, toTable, toColumn } = input;
  const sourceProfile = profileColumn(profiles, fromTable.ref.name, fromColumn.name);
  const targetProfile = profileColumn(profiles, toTable.ref.name, toColumn.name);

  const profileUniqueness = targetProfile?.uniquenessRatio ?? input.targetKeyScore;
  const profileNullRate = sourceProfile ? 1 - sourceProfile.nullRate : 0.5;
  const typeCompatibility = typesCompatible(fromColumn, toColumn) ? 1 : 0;
  const structuralPrior = structuralPriorScore({ profiles, fromTable, toTable });

  return {
    nameSimilarity: input.nameScore,
    typeCompatibility,
    valueOverlap: input.valueOverlap,
    embeddingSimilarity: input.embeddingSimilarity ?? 0,
    profileUniqueness,
    profileNullRate,
    structuralPrior,
  };
}
|
||||
|
||||
/**
 * Selects the parent tables to evaluate for one child column.
 *
 * The cap (`options.maxCandidateParentTables`, default 20) is handed to
 * `localCandidateTables`; a non-positive cap yields no tables. When the column
 * name is one of `SELF_REFERENCE_NAMES` and the child table was not ranked,
 * the child table is placed first and the ranked list is trimmed to keep the
 * total within the cap.
 */
function candidateParentTables(input: {
  tables: readonly KtxEnrichedTable[];
  fromTable: KtxEnrichedTable;
  fromColumn: KtxEnrichedColumn;
  options: KtxRelationshipDiscoveryCandidateOptions;
}): KtxEnrichedTable[] {
  const cap = input.options.maxCandidateParentTables ?? 20;
  if (cap <= 0) {
    return [];
  }

  const rankedTables = localCandidateTables({
    childTable: input.fromTable,
    childColumn: input.fromColumn,
    parentTables: input.tables,
    maxParentTables: cap,
  }).map(({ table }) => table);

  const childTableId = input.fromTable.id;
  const columnName = normalizedColumnName(input.fromColumn).normalized;
  const needsSelfTable =
    SELF_REFERENCE_NAMES.has(columnName) && !rankedTables.some((table) => table.id === childTableId);
  if (!needsSelfTable) {
    return rankedTables;
  }

  const others = rankedTables.filter((table) => table.id !== childTableId).slice(0, Math.max(0, cap - 1));
  return [input.fromTable, ...others];
}
|
||||
|
||||
/**
 * Scores how key-like a potential target column is.
 *
 * A positive `targetKeyScore` wins outright (`target_key_like`). Otherwise the
 * column profile must show uniqueness of at least 0.98 and a null rate of at
 * most 0.05 (`profile_unique_target`); columns named `code`/`key` or ending in
 * `_code`/`_key` score 0.86, all others 0.78. With no qualifying evidence the
 * score is 0 and the reasons are empty.
 */
function targetKeyEvidence(
  table: KtxEnrichedTable,
  column: KtxEnrichedColumn,
  profiles: KtxRelationshipProfileArtifact | undefined,
): KtxRelationshipTargetKeyEvidence {
  const declaredScore = targetKeyScore(table, column);
  if (declaredScore > 0) {
    return { score: declaredScore, reasons: ['target_key_like'] };
  }

  const stats = profileColumn(profiles, table.ref.name, column.name);
  if (!stats) {
    return { score: 0, reasons: [] };
  }
  if (stats.uniquenessRatio < 0.98 || stats.nullRate > 0.05) {
    return { score: 0, reasons: [] };
  }

  const name = normalizedColumnName(column).normalized;
  const codeOrKeyNamed =
    name === 'code' || name.endsWith('_code') || name === 'key' || name.endsWith('_key');
  return { score: codeOrKeyNamed ? 0.86 : 0.78, reasons: ['profile_unique_target'] };
}
|
||||
|
||||
/** Wraps a single table/column pair as a one-column relationship endpoint. */
function endpoint(table: KtxEnrichedTable, column: KtxEnrichedColumn): KtxRelationshipEndpoint {
  const { id: tableId, ref } = table;
  const { id: columnId, name: columnName } = column;
  return { tableId, columnIds: [columnId], table: ref, columns: [columnName] };
}
|
||||
|
||||
/** Formats the candidate id as `fromTableId:(colIds)->toTableId:(colIds)`. */
function relationshipId(from: KtxRelationshipEndpoint, to: KtxRelationshipEndpoint): string {
  const render = (side: KtxRelationshipEndpoint): string => `${side.tableId}:(${side.columnIds.join(',')})`;
  return `${render(from)}->${render(to)}`;
}
|
||||
|
||||
/**
 * True when both endpoints list the same column ids and the same column names,
 * position by position.
 */
function endpointsHaveSameOrderedColumns(left: KtxRelationshipEndpoint, right: KtxRelationshipEndpoint): boolean {
  const sameArity =
    left.columnIds.length === right.columnIds.length && left.columns.length === right.columns.length;
  if (!sameArity) {
    return false;
  }
  for (let position = 0; position < left.columnIds.length; position += 1) {
    const idsDiffer = left.columnIds[position] !== right.columnIds[position];
    const namesDiffer = left.columns[position] !== right.columns[position];
    if (idsDiffer || namesDiffer) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/** True when a candidate links a table's column tuple back to that very same tuple. */
function isDegenerateSameColumnSelfLink(candidate: Pick<KtxRelationshipDiscoveryCandidate, 'from' | 'to'>): boolean {
  const { from, to } = candidate;
  if (from.tableId !== to.tableId) {
    return false;
  }
  return endpointsHaveSameOrderedColumns(from, to);
}
|
||||
|
||||
/**
 * Returns the first column name of an endpoint.
 *
 * @throws Error when the endpoint has no (non-empty) first column.
 */
function singleRelationshipColumn(endpointValue: KtxRelationshipEndpoint): string {
  const [firstColumn] = endpointValue.columns;
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpointValue.table.name} to contain one column`);
}
|
||||
|
||||
/** Builds the `table.column->table.column` string used as the sort tie-breaker. */
function candidateSortKey(candidate: KtxRelationshipDiscoveryCandidate): string {
  const { from, to } = candidate;
  const source = `${from.table.name}.${singleRelationshipColumn(from)}`;
  const target = `${to.table.name}.${singleRelationshipColumn(to)}`;
  return `${source}->${target}`;
}
|
||||
|
||||
/** Drops blank entries and duplicates while keeping first-seen order. */
function uniqueReasons(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.trim().length > 0) {
      seen.add(value);
    }
  }
  return [...seen];
}
|
||||
|
||||
/**
 * Merges two candidates that share an id.
 *
 * `left` is kept as the base unless it is an `llm_proposal` and `right` has
 * strictly higher confidence. The merged candidate takes the higher
 * confidence, fills missing LLM confidence/rationale from the other side, and
 * unions the reasons (base reasons first).
 */
function mergeCandidateEvidence(
  left: KtxRelationshipDiscoveryCandidate,
  right: KtxRelationshipDiscoveryCandidate,
): KtxRelationshipDiscoveryCandidate {
  const rightReplacesLeft = left.source === 'llm_proposal' && right.confidence > left.confidence;
  const base = rightReplacesLeft ? right : left;
  const other = rightReplacesLeft ? left : right;

  const evidence = {
    ...base.evidence,
    llmConfidence: base.evidence.llmConfidence ?? other.evidence.llmConfidence,
    llmRationale: base.evidence.llmRationale ?? other.evidence.llmRationale,
    reasons: uniqueReasons([...base.evidence.reasons, ...other.evidence.reasons]),
  };

  return {
    ...base,
    confidence: Math.max(left.confidence, right.confidence),
    evidence,
  };
}
|
||||
|
||||
/**
 * Maps a candidate's reason list to its source label.
 *
 * Rules are checked in priority order and the first rule with any matching
 * reason wins; with no match the label is `normalized_table_match`.
 */
function sourceForEvidence(reasons: string[]): KtxRelationshipDiscoveryCandidateSource {
  const rulesByPriority: ReadonlyArray<{
    triggers: readonly string[];
    source: KtxRelationshipDiscoveryCandidateSource;
  }> = [
    { triggers: ['self_reference'], source: 'self_reference' },
    { triggers: ['embedding_similarity'], source: 'embedding_similarity' },
    { triggers: ['column_suffix_match'], source: 'column_suffix_match' },
    { triggers: ['parent_table_name_match'], source: 'parent_table_name_match' },
    { triggers: ['profile_sample_overlap', 'profile_unique_target'], source: 'profile_match' },
    { triggers: ['normalized_table_name'], source: 'normalized_table_match' },
    { triggers: ['exact_column_name'], source: 'exact_column_match' },
    { triggers: ['inflection'], source: 'inflection' },
  ];

  for (const rule of rulesByPriority) {
    if (rule.triggers.some((trigger) => reasons.includes(trigger))) {
      return rule.source;
    }
  }
  return 'normalized_table_match';
}
|
||||
|
||||
/**
 * Assembles a single-column `many_to_one` candidate in `review` status.
 *
 * Confidence is the score produced by `scoreKtxRelationshipCandidate` for the
 * candidate's signal vector. `embeddingSimilarity` is recorded in the evidence
 * (rounded to 3 decimals) only when one was supplied.
 */
function createCandidate(input: {
  fromTable: KtxEnrichedTable;
  fromColumn: KtxEnrichedColumn;
  toTable: KtxEnrichedTable;
  toColumn: KtxEnrichedColumn;
  sourceBase: string;
  targetBase: string;
  targetKeyScore: number;
  nameScore: number;
  reasons: string[];
  profiles: KtxRelationshipProfileArtifact | undefined;
  valueOverlap: number;
  embeddingSimilarity?: number;
}): KtxRelationshipDiscoveryCandidate {
  const { fromTable, fromColumn, toTable, toColumn, targetKeyScore, nameScore, embeddingSimilarity } = input;

  const from = endpoint(fromTable, fromColumn);
  const to = endpoint(toTable, toColumn);
  const signalVector = candidateSignalVector({
    profiles: input.profiles,
    fromTable,
    fromColumn,
    toTable,
    toColumn,
    targetKeyScore,
    nameScore,
    valueOverlap: input.valueOverlap,
    embeddingSimilarity,
  });
  const scoreBreakdown = scoreKtxRelationshipCandidate(signalVector);

  // Kept as a spreadable fragment so the key is absent (not `undefined`) when no similarity was given.
  const embeddingEvidence =
    embeddingSimilarity === undefined ? {} : { embeddingSimilarity: Number(embeddingSimilarity.toFixed(3)) };

  return {
    id: relationshipId(from, to),
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: scoreBreakdown.score,
    source: sourceForEvidence(input.reasons),
    status: 'review',
    evidence: {
      sourceColumnBase: input.sourceBase,
      targetTableBase: input.targetBase,
      targetColumnBase: normalizedColumnName(toColumn).normalized,
      targetKeyScore,
      nameScore,
      reasons: input.reasons,
      signalVector,
      scoreBreakdown,
      ...embeddingEvidence,
    },
  };
}
|
||||
|
||||
/**
 * Proposes cross-table candidates from column-embedding similarity.
 *
 * Returns nothing when `options.useEmbeddings` is `false`. For each
 * non-primary-key column with a usable embedding, every type-compatible,
 * key-like column of the candidate parent tables (the child table itself is
 * skipped) is compared by cosine similarity; pairs below
 * `embeddingSimilarityThreshold` (default 0.92) or below `minConfidence`
 * (default 0.72) are dropped. Each source column keeps at most
 * `maxEmbeddingCandidatesPerColumn` (falling back to `maxCandidatesPerColumn`,
 * then 25) candidates, highest confidence first.
 */
function generateKtxEmbeddingRelationshipCandidates(
  schema: KtxEnrichedSchema,
  options: KtxRelationshipDiscoveryCandidateOptions,
): KtxRelationshipDiscoveryCandidate[] {
  if (options.useEmbeddings === false) {
    return [];
  }

  const similarityFloor = options.embeddingSimilarityThreshold ?? 0.92;
  const perColumnCap = options.maxEmbeddingCandidatesPerColumn ?? options.maxCandidatesPerColumn ?? 25;
  const confidenceFloor = options.minConfidence ?? 0.72;
  const activeTables = schema.tables.filter((table) => table.enabled);
  const collected: KtxRelationshipDiscoveryCandidate[] = [];

  // Highest confidence first; ties broken by the textual sort key.
  const byConfidenceThenKey = (
    left: KtxRelationshipDiscoveryCandidate,
    right: KtxRelationshipDiscoveryCandidate,
  ): number => right.confidence - left.confidence || candidateSortKey(left).localeCompare(candidateSortKey(right));

  for (const fromTable of activeTables) {
    for (const fromColumn of fromTable.columns) {
      if (fromColumn.primaryKey || !hasUsableEmbedding(fromColumn)) {
        continue;
      }

      const perColumn: KtxRelationshipDiscoveryCandidate[] = [];
      const parentTables = candidateParentTables({ tables: activeTables, fromTable, fromColumn, options });
      for (const toTable of parentTables) {
        if (toTable.id === fromTable.id) {
          continue;
        }

        for (const toColumn of toTable.columns) {
          if (!hasUsableEmbedding(toColumn) || !typesCompatible(fromColumn, toColumn)) {
            continue;
          }

          const keyEvidence = targetKeyEvidence(toTable, toColumn, options.profiles);
          if (keyEvidence.score === 0) {
            continue;
          }

          const similarity = cosineSimilarity(fromColumn.embedding, toColumn.embedding);
          if (similarity < similarityFloor) {
            continue;
          }

          const candidate = createCandidate({
            fromTable,
            fromColumn,
            toTable,
            toColumn,
            sourceBase: normalizedColumnName(fromColumn).normalized,
            targetBase: normalizeKtxRelationshipName(toTable.ref.name).singular,
            targetKeyScore: keyEvidence.score,
            // The embedding similarity doubles as the name score for these candidates.
            nameScore: similarity,
            reasons: ['embedding_similarity', ...keyEvidence.reasons],
            profiles: options.profiles,
            valueOverlap: profileSampleOverlap({
              profiles: options.profiles,
              fromTable,
              fromColumn,
              toTable,
              toColumn,
            }),
            embeddingSimilarity: similarity,
          });
          if (candidate.confidence >= confidenceFloor && !isDegenerateSameColumnSelfLink(candidate)) {
            perColumn.push(candidate);
          }
        }
      }

      perColumn.sort(byConfidenceThenKey);
      collected.push(...perColumn.slice(0, perColumnCap));
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Generates single-column relationship candidates for every enabled table.
 *
 * For each non-primary-key column that yields a source reference, key-like and
 * type-compatible columns of the candidate parent tables are matched by one of
 * three routes: a strict table-name/self-reference match, a parent-table-name
 * alias match, or a column-suffix match. Candidates below `minConfidence`
 * (default 0.72) are dropped and each source column keeps at most
 * `maxCandidatesPerColumn` (default 25). Embedding-based candidates are then
 * appended, duplicates by id are resolved in favour of the higher confidence,
 * and the result is sorted by confidence (descending) then sort key.
 */
export function generateKtxRelationshipDiscoveryCandidates(
  schema: KtxEnrichedSchema,
  options: KtxRelationshipDiscoveryCandidateOptions = {},
): KtxRelationshipDiscoveryCandidate[] {
  const perColumnCap = options.maxCandidatesPerColumn ?? 25;
  const confidenceFloor = options.minConfidence ?? 0.72;
  const activeTables = schema.tables.filter((table) => table.enabled);
  const collected: KtxRelationshipDiscoveryCandidate[] = [];

  const byConfidenceThenKey = (
    left: KtxRelationshipDiscoveryCandidate,
    right: KtxRelationshipDiscoveryCandidate,
  ): number => right.confidence - left.confidence || candidateSortKey(left).localeCompare(candidateSortKey(right));

  for (const fromTable of activeTables) {
    for (const fromColumn of fromTable.columns) {
      if (fromColumn.primaryKey) {
        continue;
      }
      const sourceReference = sourceColumnReference(fromColumn);
      if (!sourceReference) {
        continue;
      }
      const sourceBase = sourceReference.base;

      const perColumn: KtxRelationshipDiscoveryCandidate[] = [];
      const parentTables = candidateParentTables({ tables: activeTables, fromTable, fromColumn, options });
      for (const toTable of parentTables) {
        const strictAliases = tableAliases(toTable);
        const parentAliases = parentTableNameAliases(toTable);
        const targetBase = normalizeKtxRelationshipName(toTable.ref.name).singular;
        const sameTable = fromTable.id === toTable.id;
        const nameMatchesTarget = strictAliases.has(sourceBase);
        const parentTableNameMatcher = !sameTable && !nameMatchesTarget && parentAliases.has(sourceBase);
        const selfReference = sameTable && SELF_REFERENCE_NAMES.has(normalizedColumnName(fromColumn).normalized);
        const strictTableMatcher = (!sameTable && nameMatchesTarget) || selfReference;
        const tableLevelMatch = strictTableMatcher || parentTableNameMatcher;

        for (const toColumn of toTable.columns) {
          const keyEvidence = targetKeyEvidence(toTable, toColumn, options.profiles);
          if (keyEvidence.score === 0 || !typesCompatible(fromColumn, toColumn)) {
            continue;
          }

          // The suffix route is only tried when neither table-level route matched.
          const suffixMatcher =
            !tableLevelMatch &&
            columnSuffixMatchesTarget({ fromColumn, toColumn }) &&
            isRelationshipKeyShapedTarget(toColumn);
          if (!tableLevelMatch && !suffixMatcher) {
            continue;
          }

          const overlap = profileSampleOverlap({
            profiles: options.profiles,
            fromTable,
            fromColumn,
            toTable,
            toColumn,
          });
          // A target that is key-like only by profile needs some sampled value overlap.
          const profileOnlyTargetWithoutOverlap =
            tableLevelMatch && keyEvidence.reasons.includes('profile_unique_target') && overlap === 0;
          if (profileOnlyTargetWithoutOverlap) {
            continue;
          }

          const reasons: string[] = [
            suffixMatcher ? 'column_suffix_match' : sourceReference.reason,
            ...keyEvidence.reasons,
          ];
          if (overlap > 0) {
            reasons.push('profile_sample_overlap');
          }

          let nameScore = suffixMatcher ? 0.78 : 0.88;
          if (parentTableNameMatcher) {
            reasons.push('parent_table_name_match');
            nameScore = 0.82;
          } else if (selfReference) {
            reasons.push('self_reference');
            nameScore = 0.82;
          } else if (!suffixMatcher && targetBase === sourceBase) {
            reasons.push('normalized_table_name');
            nameScore = 0.92;
          } else if (!suffixMatcher && nameMatchesTarget) {
            reasons.push('inflection');
            nameScore = 0.88;
          }

          const sameColumnName =
            normalizedColumnName(fromColumn).normalized === normalizedColumnName(toColumn).normalized;
          if (!suffixMatcher && !parentTableNameMatcher && sameColumnName) {
            reasons.push('exact_column_name');
            nameScore = Math.max(nameScore, 0.9);
          }

          const candidate = createCandidate({
            fromTable,
            fromColumn,
            toTable,
            toColumn,
            sourceBase,
            targetBase,
            targetKeyScore: keyEvidence.score,
            nameScore,
            reasons,
            profiles: options.profiles,
            valueOverlap: overlap,
          });
          if (candidate.confidence >= confidenceFloor && !isDegenerateSameColumnSelfLink(candidate)) {
            perColumn.push(candidate);
          }
        }
      }

      perColumn.sort(byConfidenceThenKey);
      collected.push(...perColumn.slice(0, perColumnCap));
    }
  }

  collected.push(...generateKtxEmbeddingRelationshipCandidates(schema, options));

  // Keep one candidate per id: the first seen, unless a later one is strictly more confident.
  const bestById = new Map<string, KtxRelationshipDiscoveryCandidate>();
  for (const candidate of collected) {
    const current = bestById.get(candidate.id);
    if (!current || candidate.confidence > current.confidence) {
      bestById.set(candidate.id, candidate);
    }
  }
  return [...bestById.values()].sort(byConfidenceThenKey);
}
|
||||
|
||||
/**
 * Collapses candidates that share an id by merging their evidence, then
 * returns the survivors ordered by their sort key.
 */
export function mergeKtxRelationshipDiscoveryCandidates(
  candidates: readonly KtxRelationshipDiscoveryCandidate[],
): KtxRelationshipDiscoveryCandidate[] {
  const mergedById = new Map<string, KtxRelationshipDiscoveryCandidate>();
  for (const candidate of candidates) {
    const previous = mergedById.get(candidate.id);
    if (previous) {
      mergedById.set(candidate.id, mergeCandidateEvidence(previous, candidate));
    } else {
      mergedById.set(candidate.id, candidate);
    }
  }
  const merged = [...mergedById.values()];
  merged.sort((left, right) => candidateSortKey(left).localeCompare(candidateSortKey(right)));
  return merged;
}
|
||||
|
||||
/**
 * Derives review-status primary-key suggestions from the target side of the
 * given candidates: one entry per `table.column`, scored with the highest
 * incoming confidence (capped at 0.95, rounded to 3 decimals) and sorted by
 * table then column.
 *
 * @internal
 */
export function inferKtxRelationshipTargetPks(
  candidates: readonly KtxRelationshipDiscoveryCandidate[],
): KtxRelationshipInferredTargetPk[] {
  const tallies = new Map<string, { table: string; column: string; best: number; count: number }>();
  for (const candidate of candidates) {
    const column = singleRelationshipColumn(candidate.to);
    const table = candidate.to.table.name;
    const key = `${table}.${column}`;
    const tally = tallies.get(key);
    if (tally) {
      tally.best = Math.max(tally.best, candidate.confidence);
      tally.count += 1;
    } else {
      tallies.set(key, { table, column, best: candidate.confidence, count: 1 });
    }
  }

  const inferred: KtxRelationshipInferredTargetPk[] = [];
  for (const tally of tallies.values()) {
    inferred.push({
      table: tally.table,
      columns: [tally.column],
      score: Number(Math.min(0.95, tally.best).toFixed(3)),
      status: 'review',
      incomingCandidateCount: tally.count,
    });
  }
  return inferred.sort(
    (left, right) => left.table.localeCompare(right.table) || left.columns[0]!.localeCompare(right.columns[0]!),
  );
}
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { snapshotToKtxEnrichedSchema } from './local-enrichment.js';
|
||||
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
|
||||
import { discoverKtxCompositeRelationships } from './relationship-composite-candidates.js';
|
||||
import { profileKtxRelationshipSchema, type KtxRelationshipReadOnlyExecutor } from './relationship-profiling.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from './types.js';
|
||||
|
||||
/** Test-only read-only executor backed by a better-sqlite3 database file. */
class TestSqliteExecutor implements KtxRelationshipReadOnlyExecutor {
  private readonly db: Database.Database;

  constructor(dataPath: string) {
    this.db = new Database(dataPath, { fileMustExist: true, readonly: true });
  }

  /** Runs `input.sql` and reshapes the row objects into header/row arrays. */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    // Headers come from the first record; an empty result has no headers.
    const [firstRecord] = records;
    const headers = Object.keys(firstRecord ?? {});
    const rows = records.map((record) => headers.map((header) => record[header]));
    const count = records.length;
    return { headers, rows, totalRows: count, rowCount: count };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
describe('composite relationship discovery detector', () => {
  it('infers composite primary keys and validates composite foreign keys from row evidence', async () => {
    const fixtureRoot = new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url);
    const fixture = await loadKtxRelationshipBenchmarkFixture(
      join(fixtureRoot.pathname, 'composite_keys_no_declared_constraints'),
    );
    const snapshot = maskKtxRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
    const schema = snapshotToKtxEnrichedSchema(snapshot, new Map());
    const executor = new TestSqliteExecutor(fixture.dataPath ?? '');

    // try/finally so the SQLite handle is released even when profiling,
    // discovery, or an assertion throws.
    try {
      const profiles = await profileKtxRelationshipSchema({
        connectionId: snapshot.connectionId,
        driver: snapshot.driver,
        schema,
        executor,
        ctx: { runId: 'test:composite-profile' },
      });

      const result = await discoverKtxCompositeRelationships({
        connectionId: snapshot.connectionId,
        driver: snapshot.driver,
        schema,
        profiles,
        executor,
        ctx: { runId: 'test:composite-detect' },
      });

      expect(result.primaryKeys.map((item) => `${item.table.name}.(${item.columns.join(',')})`)).toEqual([
        'order_line_allocations.(order_id,line_number,warehouse_code)',
        'order_lines.(order_id,line_number)',
      ]);
      expect(
        result.relationships.map(
          (item) =>
            `${item.from.table.name}.(${item.from.columns.join(',')})->${item.to.table.name}.(${item.to.columns.join(',')})`,
        ),
      ).toEqual(['order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)']);
      expect(result.relationships[0]).toMatchObject({
        relationshipType: 'many_to_one',
        status: 'accepted',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          reasons: ['composite_validation_passed'],
        },
      });
      expect(result.queryCount).toBeGreaterThan(0);
    } finally {
      executor.close();
    }
  });
});
|
||||
|
|
@ -0,0 +1,622 @@
|
|||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable, KtxRelationshipType } from './enrichment-types.js';
|
||||
import {
|
||||
formatKtxRelationshipTableRef,
|
||||
quoteKtxRelationshipIdentifier,
|
||||
type KtxRelationshipProfileArtifact,
|
||||
type KtxRelationshipReadOnlyExecutor,
|
||||
} from './relationship-profiling.js';
|
||||
import type { KtxConnectionDriver, KtxQueryResult, KtxScanContext, KtxTableRef } from './types.js';
|
||||
|
||||
/** Outcome assigned to a composite primary-key or relationship candidate. */
type KtxCompositeRelationshipStatus = 'accepted' | 'review' | 'rejected';
|
||||
|
||||
/**
 * One side of a composite relationship: a table plus an ordered column tuple.
 * `columnIds` and `columns` describe the same columns in the same order.
 */
interface KtxCompositeRelationshipTupleEndpoint {
  tableId: string;
  columnIds: string[];
  table: KtxTableRef;
  columns: string[];
}
|
||||
|
||||
/** A column tuple proposed as a composite primary key for one table. */
export interface KtxCompositePrimaryKeyCandidate {
  id: string;
  tableId: string;
  table: KtxTableRef;
  columns: string[];
  columnIds: string[];
  score: number;
  status: KtxCompositeRelationshipStatus;
  // NOTE(review): presumably uniquenessRatio = distinctCount / rowCount; confirm in detectCompositePrimaryKeys.
  evidence: {
    rowCount: number;
    distinctCount: number;
    uniquenessRatio: number;
    nullRate: number;
    reasons: string[];
  };
}
|
||||
|
||||
/**
 * Row-level validation figures recorded for a composite relationship.
 * NOTE(review): `childDistinct`, `parentDistinct`, `overlap` and
 * `violationCount` appear to mirror the column aliases produced by
 * `buildCompositeCoverageSql`; confirm against the code that fills them in.
 */
interface KtxCompositeRelationshipValidationEvidence {
  targetUniqueness: number;
  sourceCoverage: number;
  violationCount: number;
  violationRatio: number;
  childDistinct: number;
  parentDistinct: number;
  overlap: number;
  reasons: string[];
}
|
||||
|
||||
/**
 * A multi-column relationship from one column tuple (`from`) to another (`to`),
 * together with its validation evidence. `source` is always
 * `'composite_profile_match'`.
 */
export interface KtxCompositeRelationshipCandidate {
  id: string;
  from: KtxCompositeRelationshipTupleEndpoint;
  to: KtxCompositeRelationshipTupleEndpoint;
  relationshipType: KtxRelationshipType;
  confidence: number;
  status: KtxCompositeRelationshipStatus;
  source: 'composite_profile_match';
  validation: KtxCompositeRelationshipValidationEvidence;
}
|
||||
|
||||
/**
 * Input for composite relationship discovery.
 * `executor` may be `null`. The trailing optional fields are tuning knobs;
 * NOTE(review): their defaults are presumably the matching `DEFAULT_*`
 * constants in this module; confirm in the discovery entry point.
 */
export interface DiscoverKtxCompositeRelationshipsInput {
  connectionId: string;
  driver: KtxConnectionDriver;
  schema: KtxEnrichedSchema;
  profiles: KtxRelationshipProfileArtifact;
  executor: KtxRelationshipReadOnlyExecutor | null;
  ctx: KtxScanContext;
  maxCompositeWidth?: number;
  maxColumnsPerTable?: number;
  minPrimaryKeyUniqueness?: number;
  minSourceCoverage?: number;
  maxViolationRatio?: number;
}
|
||||
|
||||
/**
 * Result of composite relationship discovery: the primary-key candidates, the
 * relationship candidates, a query counter, and any warnings.
 */
export interface DiscoverKtxCompositeRelationshipsResult {
  primaryKeys: KtxCompositePrimaryKeyCandidate[];
  relationships: KtxCompositeRelationshipCandidate[];
  queryCount: number;
  warnings: string[];
}
|
||||
|
||||
// Underscore-separated name parts that mark a column (or table-name part) as key-like.
const KEY_NAME_PARTS = new Set(['id', 'key', 'code', 'number', 'num', 'line', 'warehouse', 'account', 'order']);
// NOTE(review): the DEFAULT_* values below look like fallbacks for the optional
// fields of DiscoverKtxCompositeRelationshipsInput; confirm at the call site.
const DEFAULT_MAX_COMPOSITE_WIDTH = 3;
const DEFAULT_MAX_COLUMNS_PER_TABLE = 8;
// Also the target-uniqueness bar a relationship must clear to be 'accepted' in relationshipStatus.
const DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS = 0.98;
const DEFAULT_MIN_SOURCE_COVERAGE = 0.9;
const DEFAULT_MAX_VIOLATION_RATIO = 0.01;
|
||||
|
||||
/** Returns the schema's tables whose `enabled` flag is set, in schema order. */
function enabledTables(schema: KtxEnrichedSchema): KtxEnrichedTable[] {
  const active: KtxEnrichedTable[] = [];
  for (const table of schema.tables) {
    if (table.enabled) {
      active.push(table);
    }
  }
  return active;
}
|
||||
|
||||
/** Row count of the first profiled table with the given name, or 0 when absent. */
function tableRowCount(profiles: KtxRelationshipProfileArtifact, tableName: string): number {
  for (const entry of profiles.tables) {
    if (entry.table.name === tableName) {
      return entry.rowCount ?? 0;
    }
  }
  return 0;
}
|
||||
|
||||
/** Builds the `table.column` key used to index `profiles.columns`. */
function profileKey(tableName: string, columnName: string): string {
  return tableName + '.' + columnName;
}
|
||||
|
||||
/** Null rate from the column's profile; an unprofiled column counts as fully null (1). */
function profileNullRate(profiles: KtxRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const columnProfile = profiles.columns[profileKey(tableName, columnName)];
  return columnProfile?.nullRate ?? 1;
}
|
||||
|
||||
/**
 * Lower-cases a name, collapses every run of characters outside `a-z0-9` into
 * a single underscore, and strips leading/trailing underscores.
 */
function normalizedColumnName(name: string): string {
  const lowered = name.toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9]+/gu, '_');
  return underscored.replace(/^_+|_+$/gu, '');
}
|
||||
|
||||
/** 1 when any underscore-separated part of the column name is in `KEY_NAME_PARTS`, else 0. */
function columnNameScore(column: KtxEnrichedColumn): number {
  const parts = normalizedColumnName(column.name)
    .split('_')
    .filter((part) => part.length > 0);
  for (const part of parts) {
    if (KEY_NAME_PARTS.has(part)) {
      return 1;
    }
  }
  return 0;
}
|
||||
|
||||
/** Splits a normalized name into its non-empty underscore-separated parts. */
function nameParts(name: string): string[] {
  const normalized = normalizedColumnName(name);
  return normalized.split('_').filter((part) => part.length > 0);
}
|
||||
|
||||
/** The parts of a table name that are also listed in `KEY_NAME_PARTS`. */
function keyLikeTableNameParts(tableName: string): Set<string> {
  const keyLike = new Set<string>();
  for (const part of nameParts(tableName)) {
    if (KEY_NAME_PARTS.has(part)) {
      keyLike.add(part);
    }
  }
  return keyLike;
}
|
||||
|
||||
/**
 * True when every key-like part of the table name appears among the name parts
 * of the given columns. A table name with no key-like parts is trivially covered.
 */
function tupleCoversTableNameKeyParts(tableName: string, columns: readonly KtxEnrichedColumn[]): boolean {
  const required = keyLikeTableNameParts(tableName);
  if (required.size === 0) {
    return true;
  }

  const available = new Set<string>();
  for (const column of columns) {
    for (const part of nameParts(column.name)) {
      available.add(part);
    }
  }

  for (const part of required) {
    if (!available.has(part)) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Picks the columns eligible to take part in a composite key: not a
 * time/boolean dimension, profiled with a null rate of at most 0.02, and
 * key-like by name. The result is ordered by name score (descending) then
 * original column position, and capped at `maxColumnsPerTable`.
 */
function candidateKeyColumns(input: {
  table: KtxEnrichedTable;
  profiles: KtxRelationshipProfileArtifact;
  maxColumnsPerTable: number;
}): KtxEnrichedColumn[] {
  const tableName = input.table.ref.name;
  const eligible: Array<{ column: KtxEnrichedColumn; index: number }> = [];

  input.table.columns.forEach((column, index) => {
    if (column.dimensionType === 'time' || column.dimensionType === 'boolean') {
      return;
    }
    const profile = input.profiles.columns[profileKey(tableName, column.name)];
    if (profile && profile.nullRate <= 0.02 && columnNameScore(column) > 0) {
      eligible.push({ column, index });
    }
  });

  eligible.sort(
    (left, right) => columnNameScore(right.column) - columnNameScore(left.column) || left.index - right.index,
  );
  return eligible.slice(0, input.maxColumnsPerTable).map((entry) => entry.column);
}
|
||||
|
||||
/**
 * True when some key-like, non-time/boolean column is already a strong key on
 * its own: profiled with a null rate of at most 0.02 and a uniqueness ratio of
 * at least `minPrimaryKeyUniqueness`.
 */
function hasStrongSingleColumnKey(input: {
  table: KtxEnrichedTable;
  profiles: KtxRelationshipProfileArtifact;
  minPrimaryKeyUniqueness: number;
}): boolean {
  const tableName = input.table.ref.name;
  for (const column of input.table.columns) {
    const excluded =
      column.dimensionType === 'time' || column.dimensionType === 'boolean' || columnNameScore(column) === 0;
    if (excluded) {
      continue;
    }
    const profile = input.profiles.columns[profileKey(tableName, column.name)];
    if (profile && profile.nullRate <= 0.02 && profile.uniquenessRatio >= input.minPrimaryKeyUniqueness) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Enumerates all `width`-element combinations of `values`, preserving input
 * order within and across combinations. A non-positive width yields the single
 * empty combination; a width larger than the input yields none.
 */
function combinations<T>(values: readonly T[], width: number): T[][] {
  if (width <= 0) {
    return [[]];
  }
  if (values.length < width) {
    return [];
  }

  const result: T[][] = [];
  for (let start = 0; start < values.length; start += 1) {
    const head = values[start] as T;
    // Only later elements may follow `head`, so each combination is emitted once.
    const tails = combinations(values.slice(start + 1), width - 1);
    for (const tail of tails) {
      result.push([head, ...tail]);
    }
  }
  return result;
}
|
||||
|
||||
/** Formats a column tuple as `table.(col1,col2,...)`. */
function tupleKey(tableName: string, columns: readonly string[]): string {
  const joined = columns.join(',');
  return tableName + '.(' + joined + ')';
}
|
||||
|
||||
/** Formats a composite relationship as `fromTuple->toTuple` using `tupleKey`. */
function relationshipKey(input: {
  fromTable: string;
  fromColumns: readonly string[];
  toTable: string;
  toColumns: readonly string[];
}): string {
  const source = tupleKey(input.fromTable, input.fromColumns);
  const target = tupleKey(input.toTable, input.toColumns);
  return `${source}->${target}`;
}
|
||||
|
||||
/** Wraps a table and an ordered column list as a composite relationship endpoint. */
function tupleEndpoint(table: KtxEnrichedTable, columns: readonly KtxEnrichedColumn[]): KtxCompositeRelationshipTupleEndpoint {
  const columnIds: string[] = [];
  const columnNames: string[] = [];
  for (const column of columns) {
    columnIds.push(column.id);
    columnNames.push(column.name);
  }
  return { tableId: table.id, columnIds, table: table.ref, columns: columnNames };
}
|
||||
|
||||
/** First row of a query result, or an empty row when the result has none. */
function row(result: KtxQueryResult): unknown[] {
  const [first] = result.rows;
  return first ?? [];
}
|
||||
|
||||
/**
 * Reads a numeric cell from the first row of a query result, locating the
 * column by case-insensitive header name.
 *
 * Numbers are returned as-is, bigints are converted, and non-blank strings are
 * parsed. Returns 0 when the header is absent, the result has no rows, or the
 * cell cannot be read as a number (null, blank string, non-numeric text, any
 * other type).
 */
function numberAt(result: KtxQueryResult, header: string): number {
  const wanted = header.toLowerCase();
  const index = result.headers.findIndex((candidate) => candidate.toLowerCase() === wanted);
  if (index < 0) {
    // Unknown header: fall back explicitly rather than relying on `row[-1]` being undefined.
    return 0;
  }

  const value = row(result)[index];
  if (typeof value === 'number') {
    return value;
  }
  if (typeof value === 'bigint') {
    return Number(value);
  }
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    // Non-numeric text parses to NaN; use the same 0 fallback as every other unreadable cell.
    return Number.isNaN(parsed) ? 0 : parsed;
  }
  return 0;
}
|
||||
|
||||
/**
 * Row-limit fragment placed right after `SELECT DISTINCT`: ` TOP (n)` for
 * `sqlserver`, empty for every other driver. `n` is floored and at least 1.
 */
function topSql(driver: KtxConnectionDriver, limit: number): string {
  if (driver !== 'sqlserver') {
    return '';
  }
  const rows = Math.max(1, Math.floor(limit));
  return ` TOP (${rows})`;
}
|
||||
|
||||
/**
 * Trailing row-limit fragment: empty for `sqlserver` (which uses `topSql`
 * instead), ` LIMIT n` for every other driver. `n` is floored and at least 1.
 */
function limitSql(driver: KtxConnectionDriver, limit: number): string {
  const usesTopClause = driver === 'sqlserver';
  if (usesTopClause) {
    return '';
  }
  const rows = Math.max(1, Math.floor(limit));
  return ` LIMIT ${rows}`;
}
|
||||
|
||||
/** Select list that quotes each column and aliases it positionally as `c0`, `c1`, ... */
function aliasedTupleSelect(driver: KtxConnectionDriver, columns: readonly string[]): string {
  const items: string[] = [];
  columns.forEach((column, position) => {
    items.push(`${quoteKtxRelationshipIdentifier(driver, column)} AS c${position}`);
  });
  return items.join(', ');
}
|
||||
|
||||
/** Predicate requiring every listed column to be non-null, joined with `AND`. */
function nonNullPredicate(driver: KtxConnectionDriver, columns: readonly string[]): string {
  const checks: string[] = [];
  for (const column of columns) {
    checks.push(`${quoteKtxRelationshipIdentifier(driver, column)} IS NOT NULL`);
  }
  return checks.join(' AND ');
}
|
||||
|
||||
/** Join condition equating `child_values.cN` with `parent_values.cN` for each tuple position. */
function tupleEquality(columns: number): string {
  const equateAt = (_: unknown, position: number): string =>
    `child_values.c${position} = parent_values.c${position}`;
  const clauses = Array.from({ length: columns }, equateAt);
  return clauses.join(' AND ');
}
|
||||
|
||||
/**
 * Builds a query that counts the distinct, fully non-null value tuples of the
 * given columns; the count is exposed as `distinct_count`.
 */
function buildTupleDistinctSql(input: {
  driver: KtxConnectionDriver;
  table: KtxTableRef;
  columns: readonly string[];
}): string {
  const { driver, table, columns } = input;
  const tableSql = formatKtxRelationshipTableRef(driver, table);
  const selectClause = `SELECT DISTINCT ${aliasedTupleSelect(driver, columns)} FROM ${tableSql}`;
  const whereClause = `WHERE ${nonNullPredicate(driver, columns)}`;

  const fragments = [
    'WITH tuple_values AS (',
    selectClause,
    whereClause,
    ')',
    'SELECT COUNT(*) AS distinct_count FROM tuple_values',
  ];
  return fragments.join(' ');
}
|
||||
|
||||
/**
 * Builds a query that left-joins the distinct non-null child tuples (capped at
 * `maxDistinctSourceValues`) to the distinct non-null parent tuples and
 * reports `child_distinct`, `parent_distinct`, `overlap` (child tuples with a
 * parent match) and `violation_count` (child tuples without one).
 */
function buildCompositeCoverageSql(input: {
  driver: KtxConnectionDriver;
  childTable: KtxTableRef;
  childColumns: readonly string[];
  parentTable: KtxTableRef;
  parentColumns: readonly string[];
  maxDistinctSourceValues: number;
}): string {
  const { driver, childColumns, parentColumns } = input;
  const childTableSql = formatKtxRelationshipTableRef(driver, input.childTable);
  const parentTableSql = formatKtxRelationshipTableRef(driver, input.parentTable);
  // Exactly one of these is non-empty, depending on the driver's row-limit syntax.
  const top = topSql(driver, input.maxDistinctSourceValues);
  const limit = limitSql(driver, input.maxDistinctSourceValues);

  const childSelect = `SELECT DISTINCT${top} ${aliasedTupleSelect(driver, childColumns)} FROM ${childTableSql}`;
  const childWhere = `WHERE ${nonNullPredicate(driver, childColumns)}${limit}`;
  const parentSelect = `SELECT DISTINCT ${aliasedTupleSelect(driver, parentColumns)} FROM ${parentTableSql}`;
  const parentWhere = `WHERE ${nonNullPredicate(driver, parentColumns)}`;
  const joinClause = `LEFT JOIN parent_values ON ${tupleEquality(childColumns.length)}`;

  const fragments = [
    'WITH child_values AS (',
    childSelect,
    childWhere,
    '), parent_values AS (',
    parentSelect,
    parentWhere,
    ')',
    'SELECT',
    '(SELECT COUNT(*) FROM child_values) AS child_distinct,',
    '(SELECT COUNT(*) FROM parent_values) AS parent_distinct,',
    'SUM(CASE WHEN parent_values.c0 IS NOT NULL THEN 1 ELSE 0 END) AS overlap,',
    'SUM(CASE WHEN parent_values.c0 IS NULL THEN 1 ELSE 0 END) AS violation_count',
    'FROM child_values',
    joinClause,
  ];
  return fragments.join(' ');
}
|
||||
|
||||
/**
 * Classifies a validated composite relationship as `accepted`, `review` or `rejected`.
 *
 * - `accepted`: target uniqueness, source coverage and violation ratio all meet their thresholds.
 * - `review`: not accepted, but source coverage reaches the review cut-off.
 * - `rejected`: everything else.
 *
 * @param input.targetUniqueness Uniqueness ratio of the referenced key.
 * @param input.sourceCoverage Share of distinct child tuples found in the parent.
 * @param input.violationRatio Share of distinct child tuples missing from the parent.
 * @param input.minSourceCoverage Minimum coverage required for acceptance.
 * @param input.maxViolationRatio Maximum violation ratio tolerated for acceptance.
 * @param input.minTargetUniqueness Optional uniqueness floor; defaults to `DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS`.
 * @param input.reviewSourceCoverage Optional coverage cut-off for `review`; defaults to `0.55`.
 * @returns The resulting status.
 */
function relationshipStatus(input: {
  targetUniqueness: number;
  sourceCoverage: number;
  violationRatio: number;
  minSourceCoverage: number;
  maxViolationRatio: number;
  minTargetUniqueness?: number;
  reviewSourceCoverage?: number;
}): KtxCompositeRelationshipStatus {
  // Defaults reproduce the previously hard-coded thresholds, so existing callers are unaffected.
  const minTargetUniqueness = input.minTargetUniqueness ?? DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS;
  const reviewSourceCoverage = input.reviewSourceCoverage ?? 0.55;

  const passesAllChecks =
    input.targetUniqueness >= minTargetUniqueness &&
    input.sourceCoverage >= input.minSourceCoverage &&
    input.violationRatio <= input.maxViolationRatio;
  if (passesAllChecks) {
    return 'accepted';
  }
  return input.sourceCoverage >= reviewSourceCoverage ? 'review' : 'rejected';
}
|
||||
|
||||
/**
 * Reports whether an already accepted key on the same table is a strictly
 * narrower subset of `columns`.
 *
 * @param accepted Keys accepted so far.
 * @param tableName Table name the candidate tuple belongs to.
 * @param columns Column names of the candidate tuple.
 * @returns `true` when some accepted key on `tableName` has fewer columns than
 *   `columns` and every one of its columns is contained in `columns`.
 */
function hasAcceptedSubset(
  accepted: readonly KtxCompositePrimaryKeyCandidate[],
  tableName: string,
  columns: readonly string[],
): boolean {
  const wanted = new Set(columns);
  for (const candidate of accepted) {
    if (candidate.table.name !== tableName) {
      continue;
    }
    // Only strictly narrower keys count; an equal-width key is not a subset hit.
    if (candidate.columns.length >= columns.length) {
      continue;
    }
    if (candidate.columns.every((column) => wanted.has(column))) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Searches one table for composite (multi-column) primary key candidates.
 *
 * Returns early without running queries when the profiled row count is `0` or
 * when `hasStrongSingleColumnKey` reports a single-column key. Otherwise every
 * combination of candidate columns, from width 2 up to `maxCompositeWidth`, is
 * probed with one distinct-count query unless it is filtered out by
 * `tupleCoversTableNameKeyParts` or already contains a narrower accepted key.
 *
 * @param input.connectionId Connection the read-only queries run against.
 * @param input.driver Driver used to render SQL.
 * @param input.table Table being inspected.
 * @param input.profiles Profile artifact supplying row counts and null rates.
 * @param input.executor Read-only SQL executor.
 * @param input.ctx Scan context forwarded to the executor.
 * @param input.maxCompositeWidth Largest tuple width to try.
 * @param input.maxColumnsPerTable Forwarded to `candidateKeyColumns`.
 * @param input.minPrimaryKeyUniqueness Minimum distinct/row ratio for acceptance.
 * @returns Accepted keys sorted by their tuple key, plus the number of queries executed.
 */
async function detectCompositePrimaryKeys(input: {
  connectionId: string;
  driver: KtxConnectionDriver;
  table: KtxEnrichedTable;
  profiles: KtxRelationshipProfileArtifact;
  executor: KtxRelationshipReadOnlyExecutor;
  ctx: KtxScanContext;
  maxCompositeWidth: number;
  maxColumnsPerTable: number;
  minPrimaryKeyUniqueness: number;
}): Promise<{ primaryKeys: KtxCompositePrimaryKeyCandidate[]; queryCount: number }> {
  const tableName = input.table.ref.name;
  const rowCount = tableRowCount(input.profiles, tableName);
  if (rowCount === 0) {
    return { primaryKeys: [], queryCount: 0 };
  }

  const singleColumnKeyExists = hasStrongSingleColumnKey({
    table: input.table,
    profiles: input.profiles,
    minPrimaryKeyUniqueness: input.minPrimaryKeyUniqueness,
  });
  if (singleColumnKeyExists) {
    return { primaryKeys: [], queryCount: 0 };
  }

  const columns = candidateKeyColumns({
    table: input.table,
    profiles: input.profiles,
    maxColumnsPerTable: input.maxColumnsPerTable,
  });
  const primaryKeys: KtxCompositePrimaryKeyCandidate[] = [];
  let queryCount = 0;

  // Widths are visited in ascending order so narrower accepted keys suppress their supersets.
  for (let width = 2; width <= input.maxCompositeWidth; width += 1) {
    for (const columnTuple of combinations(columns, width)) {
      const columnNames = columnTuple.map((column) => column.name);
      if (!tupleCoversTableNameKeyParts(tableName, columnTuple)) {
        continue;
      }
      if (hasAcceptedSubset(primaryKeys, tableName, columnNames)) {
        continue;
      }

      const result = await input.executor.executeReadOnly(
        {
          connectionId: input.connectionId,
          sql: buildTupleDistinctSql({
            driver: input.driver,
            table: input.table.ref,
            columns: columnNames,
          }),
          maxRows: 1,
        },
        input.ctx,
      );
      queryCount += 1;

      const distinctCount = numberAt(result, 'distinct_count');
      // rowCount is known to be non-zero here because of the early return above.
      const uniquenessRatio = distinctCount / rowCount;
      if (uniquenessRatio < input.minPrimaryKeyUniqueness) {
        continue;
      }

      // The tuple's null rate is the worst (largest) null rate among its columns.
      const nullRate = Math.max(
        ...columnNames.map((columnName) => profileNullRate(input.profiles, tableName, columnName)),
      );
      const rawScore = Math.min(0.99, 0.72 + uniquenessRatio * 0.22 + (1 - nullRate) * 0.06);

      primaryKeys.push({
        id: tupleKey(tableName, columnNames),
        tableId: input.table.id,
        table: input.table.ref,
        columns: columnNames,
        columnIds: columnTuple.map((column) => column.id),
        score: Number(rawScore.toFixed(3)),
        status: 'accepted',
        evidence: {
          rowCount,
          distinctCount,
          uniquenessRatio,
          nullRate,
          reasons: ['composite_unique_tuple', 'not_null_profile'],
        },
      });
    }
  }

  primaryKeys.sort((left, right) =>
    tupleKey(left.table.name, left.columns).localeCompare(tupleKey(right.table.name, right.columns)),
  );
  return { primaryKeys, queryCount };
}
|
||||
|
||||
/**
 * Indexes a table's columns by column name.
 *
 * @param table Table whose `columns` are indexed.
 * @returns A map from column name to column; when a name repeats, the last column wins.
 */
function columnsByName(table: KtxEnrichedTable): Map<string, KtxEnrichedColumn> {
  const byName = new Map<string, KtxEnrichedColumn>();
  for (const column of table.columns) {
    byName.set(column.name, column);
  }
  return byName;
}
|
||||
|
||||
/**
 * Checks that two column tuples can be compared position by position.
 *
 * @param sourceColumns Columns of the referencing tuple.
 * @param targetColumns Columns of the referenced tuple.
 * @returns `true` when both tuples have the same length and each pair of
 *   columns at the same index shares the same `dimensionType`.
 */
function compatibleTuple(sourceColumns: readonly KtxEnrichedColumn[], targetColumns: readonly KtxEnrichedColumn[]): boolean {
  if (sourceColumns.length !== targetColumns.length) {
    return false;
  }
  return sourceColumns.every((source, index) => {
    const target = targetColumns[index];
    // Explicit null guard (instead of Boolean(target)) so the compiler narrows `target`.
    return target != null && source.dimensionType === target.dimensionType;
  });
}
|
||||
|
||||
/**
 * Validates one candidate composite relationship with a single coverage query
 * and turns the measurements into a relationship candidate.
 *
 * The query is built by `buildCompositeCoverageSql` with the child tuple capped
 * at 10000 distinct values. Coverage and violation ratios are relative to the
 * number of distinct child tuples; when that number is `0`, coverage is `0` and
 * the violation ratio is `1`.
 *
 * @param input.sourceTable Table holding the referencing tuple.
 * @param input.sourceColumns Columns of the referencing tuple.
 * @param input.targetKey Accepted composite key being referenced.
 * @param input.targetTable Table that owns `targetKey`.
 * @param input.targetColumns Columns of `targetKey`, resolved on `targetTable`.
 * @param input.minSourceCoverage Coverage threshold forwarded to `relationshipStatus`.
 * @param input.maxViolationRatio Violation threshold forwarded to `relationshipStatus`.
 * @returns The relationship candidate and a query count of `1`.
 */
async function validateCompositeRelationship(input: {
  connectionId: string;
  driver: KtxConnectionDriver;
  sourceTable: KtxEnrichedTable;
  sourceColumns: readonly KtxEnrichedColumn[];
  targetKey: KtxCompositePrimaryKeyCandidate;
  targetTable: KtxEnrichedTable;
  targetColumns: readonly KtxEnrichedColumn[];
  executor: KtxRelationshipReadOnlyExecutor;
  ctx: KtxScanContext;
  minSourceCoverage: number;
  maxViolationRatio: number;
}): Promise<{ relationship: KtxCompositeRelationshipCandidate; queryCount: number }> {
  const sql = buildCompositeCoverageSql({
    driver: input.driver,
    childTable: input.sourceTable.ref,
    childColumns: input.sourceColumns.map((column) => column.name),
    parentTable: input.targetTable.ref,
    parentColumns: input.targetColumns.map((column) => column.name),
    maxDistinctSourceValues: 10000,
  });
  const result = await input.executor.executeReadOnly(
    { connectionId: input.connectionId, sql, maxRows: 1 },
    input.ctx,
  );

  const childDistinct = numberAt(result, 'child_distinct');
  const parentDistinct = numberAt(result, 'parent_distinct');
  const overlap = numberAt(result, 'overlap');
  const violationCount = numberAt(result, 'violation_count');

  // With no child tuples there is nothing to cover: treat it as zero coverage and full violation.
  const hasChildValues = childDistinct !== 0;
  const sourceCoverage = hasChildValues ? overlap / childDistinct : 0;
  const violationRatio = hasChildValues ? violationCount / childDistinct : 1;
  const targetUniqueness = input.targetKey.evidence.uniquenessRatio;

  const status = relationshipStatus({
    targetUniqueness,
    sourceCoverage,
    violationRatio,
    minSourceCoverage: input.minSourceCoverage,
    maxViolationRatio: input.maxViolationRatio,
  });

  const from = tupleEndpoint(input.sourceTable, input.sourceColumns);
  const to = {
    tableId: input.targetKey.tableId,
    columnIds: input.targetKey.columnIds,
    table: input.targetKey.table,
    columns: input.targetKey.columns,
  };

  const reasons: string[] = [];
  if (status === 'accepted') {
    reasons.push('composite_validation_passed');
  } else {
    reasons.push('composite_validation_failed');
    if (sourceCoverage < input.minSourceCoverage) {
      reasons.push('low_source_coverage');
    }
    if (violationRatio > input.maxViolationRatio) {
      reasons.push('excessive_violations');
    }
  }

  return {
    queryCount: 1,
    relationship: {
      id: relationshipKey({
        fromTable: from.table.name,
        fromColumns: from.columns,
        toTable: to.table.name,
        toColumns: to.columns,
      }),
      from,
      to,
      relationshipType: 'many_to_one',
      confidence: status === 'accepted' ? 0.95 : 0.62,
      status,
      source: 'composite_profile_match',
      validation: {
        targetUniqueness,
        sourceCoverage,
        violationCount,
        violationRatio,
        childDistinct,
        parentDistinct,
        overlap,
        reasons,
      },
    },
  };
}
|
||||
|
||||
/**
 * Resolves every name in `columnNames` against `columnByName`.
 * Returns `null` as soon as one name has no matching column.
 */
function resolveCompositeTupleColumns(
  columnByName: ReadonlyMap<string, KtxEnrichedColumn>,
  columnNames: readonly string[],
): KtxEnrichedColumn[] | null {
  const resolved: KtxEnrichedColumn[] = [];
  for (const columnName of columnNames) {
    const column = columnByName.get(columnName);
    if (!column) {
      return null;
    }
    resolved.push(column);
  }
  return resolved;
}

/**
 * Discovers composite primary keys and the composite relationships that reference them.
 *
 * When no executor is supplied or `input.profiles.sqlAvailable` is falsy, nothing
 * is queried and the result carries the warning
 * `composite_relationship_validation_unavailable`. Otherwise composite keys are
 * detected per enabled table, and every other enabled table that has
 * type-compatible columns with the same names as a key is validated against it.
 * Relationships whose status is `rejected` are dropped.
 *
 * @param input Discovery input; unset tuning options fall back to the module defaults.
 * @returns Keys and relationships sorted by `id`, the total query count, and warnings.
 */
export async function discoverKtxCompositeRelationships(
  input: DiscoverKtxCompositeRelationshipsInput,
): Promise<DiscoverKtxCompositeRelationshipsResult> {
  if (!input.executor || !input.profiles.sqlAvailable) {
    return {
      primaryKeys: [],
      relationships: [],
      queryCount: 0,
      warnings: ['composite_relationship_validation_unavailable'],
    };
  }

  const settings = {
    maxCompositeWidth: input.maxCompositeWidth ?? DEFAULT_MAX_COMPOSITE_WIDTH,
    maxColumnsPerTable: input.maxColumnsPerTable ?? DEFAULT_MAX_COLUMNS_PER_TABLE,
    minPrimaryKeyUniqueness: input.minPrimaryKeyUniqueness ?? DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS,
    minSourceCoverage: input.minSourceCoverage ?? DEFAULT_MIN_SOURCE_COVERAGE,
    maxViolationRatio: input.maxViolationRatio ?? DEFAULT_MAX_VIOLATION_RATIO,
  };
  const tables = enabledTables(input.schema);
  // NOTE(review): tables are keyed by bare name, so equally named tables would overwrite each other here.
  const tableByName = new Map(tables.map((table) => [table.ref.name, table]));
  const primaryKeys: KtxCompositePrimaryKeyCandidate[] = [];
  let queryCount = 0;

  // Phase 1: find composite key candidates table by table.
  for (const table of tables) {
    const detection = await detectCompositePrimaryKeys({
      connectionId: input.connectionId,
      driver: input.driver,
      table,
      profiles: input.profiles,
      executor: input.executor,
      ctx: input.ctx,
      maxCompositeWidth: settings.maxCompositeWidth,
      maxColumnsPerTable: settings.maxColumnsPerTable,
      minPrimaryKeyUniqueness: settings.minPrimaryKeyUniqueness,
    });
    primaryKeys.push(...detection.primaryKeys);
    queryCount += detection.queryCount;
  }

  // Phase 2: validate every other table that carries the key's column names.
  const relationships: KtxCompositeRelationshipCandidate[] = [];
  for (const targetKey of primaryKeys) {
    const targetTable = tableByName.get(targetKey.table.name);
    if (!targetTable) {
      continue;
    }
    const targetColumns = resolveCompositeTupleColumns(columnsByName(targetTable), targetKey.columns);
    if (!targetColumns) {
      continue;
    }

    for (const sourceTable of tables) {
      if (sourceTable.id === targetTable.id) {
        continue;
      }
      const sourceColumns = resolveCompositeTupleColumns(columnsByName(sourceTable), targetKey.columns);
      if (!sourceColumns || !compatibleTuple(sourceColumns, targetColumns)) {
        continue;
      }

      const validation = await validateCompositeRelationship({
        connectionId: input.connectionId,
        driver: input.driver,
        sourceTable,
        sourceColumns,
        targetKey,
        targetTable,
        targetColumns,
        executor: input.executor,
        ctx: input.ctx,
        minSourceCoverage: settings.minSourceCoverage,
        maxViolationRatio: settings.maxViolationRatio,
      });
      queryCount += validation.queryCount;
      if (validation.relationship.status !== 'rejected') {
        relationships.push(validation.relationship);
      }
    }
  }

  primaryKeys.sort((left, right) => left.id.localeCompare(right.id));
  relationships.sort((left, right) => left.id.localeCompare(right.id));
  return {
    primaryKeys,
    relationships,
    queryCount,
    warnings: [],
  };
}
|
||||
373
packages/cli/src/context/scan/relationship-diagnostics.test.ts
Normal file
373
packages/cli/src/context/scan/relationship-diagnostics.test.ts
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedRelationship, KtxRelationshipEndpoint } from './enrichment-types.js';
|
||||
import type { KtxResolvedRelationshipDiscoveryCandidate } from './relationship-graph-resolver.js';
|
||||
import {
|
||||
buildKtxRelationshipArtifacts,
|
||||
buildKtxRelationshipDiagnostics,
|
||||
emptyKtxRelationshipProfileArtifact,
|
||||
} from './relationship-diagnostics.js';
|
||||
|
||||
/**
 * Test fixture: builds a single-column relationship endpoint.
 * The table name doubles as the table id and the column id is `<table>.<column>`.
 */
function endpoint(table: string, column: string): KtxRelationshipEndpoint {
  const tableRef = { catalog: null, db: null, name: table };
  return {
    tableId: table,
    columnIds: [`${table}.${column}`],
    table: tableRef,
    columns: [column],
  };
}
|
||||
|
||||
/**
 * Test fixture: builds an inferred many-to-one relationship between two
 * single-column endpoints. `confidence` defaults to `0.92`.
 */
function enrichedRelationship(input: {
  id: string;
  fromTable: string;
  fromColumn: string;
  toTable: string;
  toColumn: string;
  confidence?: number;
}): KtxEnrichedRelationship {
  const { id, fromTable, fromColumn, toTable, toColumn } = input;
  return {
    id,
    source: 'inferred',
    from: endpoint(fromTable, fromColumn),
    to: endpoint(toTable, toColumn),
    relationshipType: 'many_to_one',
    confidence: input.confidence ?? 0.92,
    isPrimaryKeyReference: true,
  };
}
|
||||
|
||||
/**
 * Test fixture: builds a graph-resolved `orders.customer_id -> customers.id`
 * candidate.
 *
 * - `source` defaults to `normalized_table_match`; `llm_proposal` switches the
 *   evidence block to the LLM-flavoured variant.
 * - A `rejected` status produces failing validation numbers (coverage 0.2,
 *   8 violations, overlap 2); any other status produces passing numbers.
 * - `pkScore` / `fkScore` default to `0.97` / `0.94`.
 */
function resolvedRelationship(input: {
  id: string;
  status: 'accepted' | 'review' | 'rejected';
  source?: 'normalized_table_match' | 'exact_column_match' | 'inflection' | 'self_reference' | 'llm_proposal';
  fkScore?: number;
  pkScore?: number;
  validationReasons?: string[];
  graphReasons?: string[];
}): KtxResolvedRelationshipDiscoveryCandidate {
  const isRejected = input.status === 'rejected';
  const pkScore = input.pkScore ?? 0.97;
  const fkScore = input.fkScore ?? 0.94;

  const evidence =
    input.source === 'llm_proposal'
      ? {
          sourceColumnBase: 'buyer',
          targetTableBase: 'customer',
          targetColumnBase: 'id',
          targetKeyScore: 0.88,
          nameScore: 0.45,
          reasons: ['llm_proposal', 'llm_pk_proposal'],
          llmConfidence: 0.89,
          llmRationale: 'Buyer reference values align with customer identifiers.',
        }
      : {
          sourceColumnBase: 'customer',
          targetTableBase: 'customer',
          targetColumnBase: 'id',
          targetKeyScore: 0.9,
          nameScore: 0.85,
          reasons: ['table_name_matches_source_column'],
        };

  return {
    id: input.id,
    from: endpoint('orders', 'customer_id'),
    to: endpoint('customers', 'id'),
    relationshipType: 'many_to_one',
    confidence: 0.88,
    source: input.source ?? 'normalized_table_match',
    status: input.status,
    evidence,
    score: 0.91,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: isRejected ? 0.2 : 1,
      violationCount: isRejected ? 8 : 0,
      violationRatio: isRejected ? 0.8 : 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 10,
      parentDistinct: 10,
      overlap: isRejected ? 2 : 10,
      checkedValues: 10,
      reasons: input.validationReasons ?? ['validation_passed'],
    },
    pkScore,
    fkScore,
    graph: {
      targetPkScore: pkScore,
      incomingCandidateCount: 1,
      conflictRank: 1,
      reasons: input.graphReasons ?? ['target_pk_score_passed', 'fk_score_passed'],
    },
  };
}
|
||||
|
||||
// Covers buildKtxRelationshipArtifacts (grouping, adaptation, de-duplication,
// composite edges) and buildKtxRelationshipDiagnostics (summary and explanations).
describe('relationship diagnostics artifacts', () => {
  it('groups graph-resolved relationships and preserves evidence reasons', () => {
    const artifacts = buildKtxRelationshipArtifacts({
      connectionId: 'warehouse',
      resolvedRelationships: [
        resolvedRelationship({ id: 'accepted-edge', status: 'accepted', source: 'llm_proposal' }),
        resolvedRelationship({
          id: 'review-edge',
          status: 'review',
          validationReasons: ['validation_unavailable'],
          graphReasons: ['validation_unavailable_review_only', 'fk_score_review'],
        }),
        resolvedRelationship({
          id: 'rejected-edge',
          status: 'rejected',
          validationReasons: ['low_source_coverage'],
          graphReasons: ['fk_score_rejected'],
        }),
      ],
    });

    expect(artifacts.accepted).toHaveLength(1);
    expect(artifacts.accepted[0]).toMatchObject({
      source: 'llm_proposal',
      evidence: {
        llmConfidence: 0.89,
        llmRationale: 'Buyer reference values align with customer identifiers.',
      },
      reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
    });
    expect(artifacts.review).toHaveLength(1);
    expect(artifacts.rejected).toHaveLength(1);
    expect(artifacts.review[0]).toMatchObject({
      id: 'review-edge',
      status: 'review',
      source: 'normalized_table_match',
      fkScore: 0.94,
      reasons: expect.arrayContaining(['validation_unavailable', 'validation_unavailable_review_only']),
    });
    expect(artifacts.rejected[0]?.reasons).toEqual(
      expect.arrayContaining(['table_name_matches_source_column', 'low_source_coverage', 'fk_score_rejected']),
    );
  });

  it('adapts relationship updates into the artifact shape', () => {
    const artifacts = buildKtxRelationshipArtifacts({
      connectionId: 'warehouse',
      relationshipUpdate: {
        connectionId: 'warehouse',
        accepted: [
          enrichedRelationship({
            id: 'orders-customer',
            fromTable: 'orders',
            fromColumn: 'customer_id',
            toTable: 'customers',
            toColumn: 'id',
          }),
        ],
        rejected: [
          enrichedRelationship({
            id: 'orders-account',
            fromTable: 'orders',
            fromColumn: 'account_id',
            toTable: 'accounts',
            toColumn: 'id',
            confidence: 0.4,
          }),
        ],
        skipped: [{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }],
      },
    });

    expect(artifacts.accepted[0]).toMatchObject({
      id: 'orders-customer',
      status: 'accepted',
      source: 'inferred',
      reasons: ['accepted_relationship_update'],
    });
    expect(artifacts.rejected[0]).toMatchObject({
      id: 'orders-account',
      status: 'rejected',
      reasons: ['rejected_relationship_update'],
    });
    expect(artifacts.skipped).toEqual([{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }]);
  });

  it('deduplicates resolved and formal relationship update artifacts by edge id', () => {
    const artifacts = buildKtxRelationshipArtifacts({
      connectionId: 'warehouse',
      resolvedRelationships: [
        {
          id: 'orders:orders.account_id->accounts:accounts.id',
          from: endpoint('orders', 'account_id'),
          to: endpoint('accounts', 'id'),
          relationshipType: 'many_to_one',
          source: 'normalized_table_match',
          status: 'accepted',
          confidence: 0.92,
          score: 0.9,
          pkScore: 0.92,
          fkScore: 0.9,
          evidence: {
            sourceColumnBase: 'account',
            targetTableBase: 'account',
            targetColumnBase: 'id',
            targetKeyScore: 0.92,
            nameScore: 0.92,
            reasons: ['foreign_key_suffix'],
          },
          validation: {
            targetUniqueness: 1,
            sourceCoverage: 1,
            violationCount: 0,
            violationRatio: 0,
            sourceNullRate: 0,
            targetNullRate: 0,
            childDistinct: 2,
            parentDistinct: 2,
            overlap: 2,
            checkedValues: 2,
            reasons: ['validation_passed'],
          },
          graph: {
            targetPkScore: 0.92,
            incomingCandidateCount: 1,
            conflictRank: 1,
            reasons: ['fk_score_passed'],
          },
        },
      ],
      relationshipUpdate: {
        connectionId: 'warehouse',
        accepted: [
          {
            id: 'orders:orders.account_id->accounts:accounts.id',
            source: 'formal',
            from: endpoint('orders', 'account_id'),
            to: endpoint('accounts', 'id'),
            relationshipType: 'many_to_one',
            confidence: 1,
            isPrimaryKeyReference: true,
          },
        ],
        rejected: [],
        skipped: [],
      },
    });

    // The same edge id arrives from both inputs; the resolved edge is the one that is kept.
    expect(artifacts.accepted).toHaveLength(1);
    expect(artifacts.accepted[0]).toMatchObject({
      id: 'orders:orders.account_id->accounts:accounts.id',
      source: 'normalized_table_match',
      reasons: expect.arrayContaining(['foreign_key_suffix', 'validation_passed', 'fk_score_passed']),
    });
  });

  it('explains validation-unavailable review candidates', () => {
    const artifacts = buildKtxRelationshipArtifacts({
      connectionId: 'warehouse',
      resolvedRelationships: [
        resolvedRelationship({
          id: 'review-edge',
          status: 'review',
          validationReasons: ['validation_unavailable'],
          graphReasons: ['validation_unavailable_review_only'],
        }),
      ],
    });
    const profile = emptyKtxRelationshipProfileArtifact({
      connectionId: 'warehouse',
      driver: 'sqlite',
      reason: 'read_only_sql_unavailable',
    });

    const diagnostics = buildKtxRelationshipDiagnostics({
      connectionId: 'warehouse',
      generatedAt: '2026-05-07T12:00:00.000Z',
      artifacts,
      profile,
      warnings: [
        {
          code: 'connector_capability_missing',
          message: 'KTX scan connector cannot run standalone statistical relationship validation',
          recoverable: true,
          metadata: { capability: 'readOnlySql' },
        },
      ],
      thresholds: { acceptThreshold: 0.85, reviewThreshold: 0.55 },
    });

    expect(diagnostics.summary).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
    expect(diagnostics.noAcceptedReason).toBe('validation unavailable; review candidates written');
    expect(diagnostics.candidateCountsBySource).toEqual({ normalized_table_match: 1 });
    expect(diagnostics.validation).toEqual({
      available: false,
      sqlAvailable: false,
      queryCount: 0,
    });
    expect(diagnostics.profileWarnings).toEqual(['read_only_sql_unavailable']);
    expect(diagnostics.warnings[0]).toMatchObject({ code: 'connector_capability_missing' });
  });

  it('explains empty relationship output as a no-candidate outcome', () => {
    const artifacts = buildKtxRelationshipArtifacts({ connectionId: 'warehouse' });
    const diagnostics = buildKtxRelationshipDiagnostics({
      connectionId: 'warehouse',
      generatedAt: '2026-05-07T12:00:00.000Z',
      artifacts,
      profile: emptyKtxRelationshipProfileArtifact({
        connectionId: 'warehouse',
        driver: 'sqlite',
        reason: 'relationship_profiling_not_run',
      }),
    });

    expect(diagnostics.summary).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
    expect(diagnostics.noAcceptedReason).toBe('no candidate pairs passed type compatibility');
    expect(diagnostics.candidateCountsBySource).toEqual({});
  });

  it('records composite relationship endpoints in relationship artifacts', () => {
    const artifacts = buildKtxRelationshipArtifacts({
      connectionId: 'warehouse',
      compositeRelationships: [
        {
          id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
          source: 'composite_profile_match',
          status: 'accepted',
          from: {
            tableId: 'order_line_allocations',
            columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
            table: { catalog: null, db: null, name: 'order_line_allocations' },
            columns: ['order_id', 'line_number'],
          },
          to: {
            tableId: 'order_lines',
            columnIds: ['order_lines.order_id', 'order_lines.line_number'],
            table: { catalog: null, db: null, name: 'order_lines' },
            columns: ['order_id', 'line_number'],
          },
          relationshipType: 'many_to_one',
          confidence: 0.95,
          validation: {
            targetUniqueness: 1,
            sourceCoverage: 1,
            violationCount: 0,
            violationRatio: 0,
            childDistinct: 2,
            parentDistinct: 2,
            overlap: 2,
            reasons: ['composite_validation_passed'],
          },
        },
      ],
    });

    expect(artifacts.accepted).toEqual([
      expect.objectContaining({
        id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
        source: 'composite_profile_match',
        from: expect.objectContaining({
          columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
          columns: ['order_id', 'line_number'],
        }),
        to: expect.objectContaining({
          columnIds: ['order_lines.order_id', 'order_lines.line_number'],
          columns: ['order_id', 'line_number'],
        }),
        reasons: ['composite_validation_passed'],
        validation: expect.objectContaining({ sourceCoverage: 1 }),
      }),
    ]);
  });
});
|
||||
364
packages/cli/src/context/scan/relationship-diagnostics.ts
Normal file
364
packages/cli/src/context/scan/relationship-diagnostics.ts
Normal file
|
|
@ -0,0 +1,364 @@
|
|||
import type {
|
||||
KtxEnrichedRelationship,
|
||||
KtxRelationshipEndpoint,
|
||||
KtxRelationshipType,
|
||||
KtxRelationshipUpdate,
|
||||
} from './enrichment-types.js';
|
||||
import type {
|
||||
KtxResolvedRelationshipDiscoveryCandidate,
|
||||
KtxResolvedRelationshipStatus,
|
||||
} from './relationship-graph-resolver.js';
|
||||
import type { KtxCompositeRelationshipCandidate } from './relationship-composite-candidates.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KtxConnectionDriver, KtxScanWarning } from './types.js';
|
||||
|
||||
/** One side (from/to) of a relationship edge as written into the relationship artifact. */
interface KtxRelationshipArtifactEndpoint {
  tableId: string;
  /** Column ids of the endpoint; composite endpoints carry more than one. */
  columnIds: string[];
  /** Table reference; `catalog` and `db` may be `null`. */
  table: {
    catalog: string | null;
    db: string | null;
    name: string;
  };
  /** Column names of the endpoint. */
  columns: string[];
}
|
||||
|
||||
/**
 * A relationship edge in the artifact, regardless of which producer it came from.
 * Score and detail fields are nullable because not every producer supplies them.
 */
interface KtxRelationshipArtifactEdge {
  id: string;
  status: KtxResolvedRelationshipStatus;
  /** Name of the producer or heuristic that yielded the edge. */
  source: string;
  from: KtxRelationshipArtifactEndpoint;
  to: KtxRelationshipArtifactEndpoint;
  relationshipType: KtxRelationshipType;
  confidence: number;
  pkScore: number | null;
  fkScore: number | null;
  score: number | null;
  /** Producer-specific payloads, stored as-is. */
  evidence: unknown | null;
  validation: unknown | null;
  graph: unknown | null;
  /** Reason codes explaining the edge's status. */
  reasons: string[];
}
|
||||
|
||||
/** Relationship edges for one connection, grouped by status, plus skipped entries. */
export interface KtxRelationshipArtifact {
  connectionId: string;
  accepted: KtxRelationshipArtifactEdge[];
  review: KtxRelationshipArtifactEdge[];
  rejected: KtxRelationshipArtifactEdge[];
  /** Same shape as the `skipped` list of a relationship update. */
  skipped: KtxRelationshipUpdate['skipped'];
}
|
||||
|
||||
/** Per-bucket counts reported in the diagnostics artifact. */
interface KtxRelationshipDiagnosticsSummary {
  accepted: number;
  review: number;
  rejected: number;
  skipped: number;
}
|
||||
|
||||
/** Validation availability and query usage reported in the diagnostics artifact. */
interface KtxRelationshipDiagnosticsValidation {
  available: boolean;
  sqlAvailable: boolean;
  queryCount: number;
}
|
||||
|
||||
/** Accept and review thresholds recorded in the diagnostics artifact. */
interface KtxRelationshipDiagnosticsThresholds {
  acceptThreshold: number;
  reviewThreshold: number;
}
|
||||
|
||||
/** Discovery policy settings recorded in the diagnostics artifact. */
interface KtxRelationshipDiagnosticsPolicy {
  validationRequiredForManifest: boolean;
  maxCandidatesPerColumn: number;
  profileSampleRows: number;
  validationConcurrency: number;
}
|
||||
|
||||
/** Diagnostics document describing the outcome of relationship discovery for one connection. */
export interface KtxRelationshipDiagnosticsArtifact {
  connectionId: string;
  /** Timestamp string of when the diagnostics were generated. */
  generatedAt: string;
  summary: KtxRelationshipDiagnosticsSummary;
  /** Human-readable explanation, or `null` when none applies. */
  noAcceptedReason: string | null;
  /** Number of candidates keyed by their `source`. */
  candidateCountsBySource: Record<string, number>;
  validation: KtxRelationshipDiagnosticsValidation;
  thresholds: KtxRelationshipDiagnosticsThresholds;
  policy: KtxRelationshipDiagnosticsPolicy;
  warnings: KtxScanWarning[];
  profileWarnings: string[];
}
|
||||
|
||||
/** Input of `buildKtxRelationshipArtifacts`; every relationship source is optional. */
export interface BuildKtxRelationshipArtifactsInput {
  connectionId: string;
  relationshipUpdate?: KtxRelationshipUpdate | null;
  resolvedRelationships?: readonly KtxResolvedRelationshipDiscoveryCandidate[];
  compositeRelationships?: readonly KtxCompositeRelationshipCandidate[];
}
|
||||
|
||||
/** Input of `buildKtxRelationshipDiagnostics`. */
export interface BuildKtxRelationshipDiagnosticsInput {
  connectionId: string;
  artifacts: KtxRelationshipArtifact;
  profile: KtxRelationshipProfileArtifact;
  warnings?: readonly KtxScanWarning[];
  /** Partial override of the thresholds. */
  thresholds?: Partial<KtxRelationshipDiagnosticsThresholds>;
  /** Partial override of the policy. */
  policy?: Partial<KtxRelationshipDiagnosticsPolicy>;
  /** Optional explicit timestamp string. */
  generatedAt?: string;
}
|
||||
|
||||
/** Input of `emptyKtxRelationshipProfileArtifact`. */
export interface EmptyKtxRelationshipProfileArtifactInput {
  connectionId: string;
  driver: KtxConnectionDriver;
  /** Reason code explaining why the profile is empty. */
  reason: string;
}
|
||||
|
||||
/** Default accept / review thresholds for relationship diagnostics. */
const DEFAULT_THRESHOLDS: KtxRelationshipDiagnosticsThresholds = {
  acceptThreshold: 0.85,
  reviewThreshold: 0.55,
};
|
||||
|
||||
/** Default policy values for relationship diagnostics. */
const DEFAULT_POLICY: KtxRelationshipDiagnosticsPolicy = {
  validationRequiredForManifest: true,
  maxCandidatesPerColumn: 25,
  profileSampleRows: 10000,
  validationConcurrency: 4,
};
|
||||
|
||||
/**
 * Converts a relationship endpoint into its artifact representation.
 *
 * Only `catalog`, `db` and `name` are taken from the table reference, and the
 * column arrays are copied so the artifact does not alias the input endpoint.
 *
 * @param endpoint Endpoint to convert.
 * @returns A detached artifact endpoint with the same values.
 */
function endpointArtifact(endpoint: KtxRelationshipEndpoint): KtxRelationshipArtifactEndpoint {
  const { catalog, db, name } = endpoint.table;
  return {
    tableId: endpoint.tableId,
    columnIds: [...endpoint.columnIds],
    table: { catalog, db, name },
    columns: [...endpoint.columns],
  };
}
|
||||
|
||||
/**
 * De-duplicates reason codes while keeping first-seen order.
 *
 * Values that are empty or whitespace-only are dropped; kept values are not trimmed.
 *
 * @param values Reason codes, possibly with duplicates or blanks.
 * @returns The distinct non-blank values in order of first appearance.
 */
function uniqueReasons(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.trim().length > 0) {
      seen.add(value);
    }
  }
  return Array.from(seen);
}
|
||||
|
||||
/**
 * Adapts a relationship from a relationship update into an artifact edge.
 *
 * `pkScore`, `fkScore` and `graph` are always `null`; `score` mirrors the
 * relationship's confidence. Relationships whose source is `formal` get
 * placeholder `evidence` / `validation` payloads and, when accepted, the reason
 * `formal_metadata_accepted` instead of `accepted_relationship_update`.
 *
 * @param relationship Relationship taken from the update.
 * @param status Bucket the relationship was listed in.
 * @returns The artifact edge.
 */
function relationshipUpdateEdge(
  relationship: KtxEnrichedRelationship,
  status: 'accepted' | 'rejected',
): KtxRelationshipArtifactEdge {
  const isFormal = relationship.source === 'formal';
  let reason: string;
  if (status !== 'accepted') {
    reason = 'rejected_relationship_update';
  } else if (isFormal) {
    reason = 'formal_metadata_accepted';
  } else {
    reason = 'accepted_relationship_update';
  }

  return {
    id: relationship.id,
    status,
    source: relationship.source,
    from: endpointArtifact(relationship.from),
    to: endpointArtifact(relationship.to),
    relationshipType: relationship.relationshipType,
    confidence: relationship.confidence,
    pkScore: null,
    fkScore: null,
    score: relationship.confidence,
    evidence: isFormal ? { source: 'formal_metadata' } : null,
    validation: isFormal ? { status: 'formal_metadata' } : null,
    graph: null,
    reasons: [reason],
  };
}
|
||||
|
||||
/**
 * Converts a graph-resolved discovery candidate into an artifact edge.
 *
 * Scores and the evidence / validation / graph payloads are carried over
 * unchanged; `reasons` is the de-duplicated concatenation of the evidence,
 * validation and graph reasons, in that order.
 *
 * @param candidate Resolved candidate.
 * @returns The artifact edge.
 */
function resolvedEdge(candidate: KtxResolvedRelationshipDiscoveryCandidate): KtxRelationshipArtifactEdge {
  const { evidence, validation, graph } = candidate;
  const reasons = uniqueReasons([...evidence.reasons, ...validation.reasons, ...graph.reasons]);
  return {
    id: candidate.id,
    status: candidate.status,
    source: candidate.source,
    from: endpointArtifact(candidate.from),
    to: endpointArtifact(candidate.to),
    relationshipType: candidate.relationshipType,
    confidence: candidate.confidence,
    pkScore: candidate.pkScore,
    fkScore: candidate.fkScore,
    score: candidate.score,
    evidence,
    validation,
    graph,
    reasons,
  };
}
|
||||
|
||||
/**
 * Converts a composite relationship endpoint into its artifact representation.
 *
 * Only `catalog`, `db` and `name` are taken from the table reference, and the
 * column arrays are copied so the artifact does not alias the candidate.
 *
 * @param endpoint Endpoint of a composite relationship candidate.
 * @returns A detached artifact endpoint with the same values.
 */
function compositeEndpointArtifact(endpoint: KtxCompositeRelationshipCandidate['from']): KtxRelationshipArtifactEndpoint {
  const { catalog, db, name } = endpoint.table;
  return {
    tableId: endpoint.tableId,
    columnIds: [...endpoint.columnIds],
    table: { catalog, db, name },
    columns: [...endpoint.columns],
  };
}
|
||||
|
||||
/**
 * Converts a composite (multi-column) candidate into an artifact edge.
 *
 * The candidate's single `confidence` value fills `confidence`, `fkScore`
 * and `score`; `pkScore` and `graph` are always `null`. Evidence only records
 * the candidate's source, and `reasons` are the validation reasons passed
 * through `uniqueReasons`.
 */
function compositeEdge(candidate: KtxCompositeRelationshipCandidate): KtxRelationshipArtifactEdge {
  const { confidence, source, validation } = candidate;
  const from = compositeEndpointArtifact(candidate.from);
  const to = compositeEndpointArtifact(candidate.to);

  return {
    id: candidate.id,
    status: candidate.status,
    source,
    from,
    to,
    relationshipType: candidate.relationshipType,
    confidence,
    pkScore: null,
    fkScore: confidence,
    score: confidence,
    evidence: { source },
    validation,
    graph: null,
    reasons: uniqueReasons(validation.reasons),
  };
}
|
||||
|
||||
/** Creates an artifact for `connectionId` with every bucket empty. */
function emptyArtifacts(connectionId: string): KtxRelationshipArtifact {
  const artifact: KtxRelationshipArtifact = {
    connectionId,
    accepted: [],
    review: [],
    rejected: [],
    skipped: [],
  };
  return artifact;
}
|
||||
|
||||
/**
 * Appends `edge` to `edges` unless an edge with the same `id` is already
 * present. The first edge seen for an id wins; `edges` is mutated in place.
 */
function pushUniqueEdge(edges: KtxRelationshipArtifactEdge[], edge: KtxRelationshipArtifactEdge): void {
  const alreadyPresent = edges.some(({ id }) => id === edge.id);
  if (alreadyPresent) {
    return;
  }
  edges.push(edge);
}
|
||||
|
||||
/**
 * Builds the relationship artifact for one connection.
 *
 * Edges are collected in this order: resolved candidates, composite
 * candidates, then the accepted and rejected entries of
 * `input.relationshipUpdate`. Within a bucket the first edge seen for an id
 * wins, so later inputs never replace an earlier edge with the same id.
 * Candidate edges whose status is neither `'accepted'` nor `'review'` are
 * filed as rejected. Every bucket is returned sorted with `localeCompare`
 * (edges by `id`, skipped entries by `relationshipId`).
 */
export function buildKtxRelationshipArtifacts(input: BuildKtxRelationshipArtifactsInput): KtxRelationshipArtifact {
  const artifacts = emptyArtifacts(input.connectionId);

  // Routes a candidate edge to the bucket that matches its status.
  const fileByStatus = (edge: KtxRelationshipArtifactEdge): void => {
    switch (edge.status) {
      case 'accepted':
        pushUniqueEdge(artifacts.accepted, edge);
        break;
      case 'review':
        pushUniqueEdge(artifacts.review, edge);
        break;
      default:
        pushUniqueEdge(artifacts.rejected, edge);
    }
  };

  if (input.resolvedRelationships) {
    for (const candidate of input.resolvedRelationships) {
      fileByStatus(resolvedEdge(candidate));
    }
  }

  for (const candidate of input.compositeRelationships ?? []) {
    fileByStatus(compositeEdge(candidate));
  }

  const { relationshipUpdate } = input;
  if (relationshipUpdate) {
    for (const relationship of relationshipUpdate.accepted) {
      pushUniqueEdge(artifacts.accepted, relationshipUpdateEdge(relationship, 'accepted'));
    }
    for (const relationship of relationshipUpdate.rejected) {
      pushUniqueEdge(artifacts.rejected, relationshipUpdateEdge(relationship, 'rejected'));
    }
    artifacts.skipped.push(...relationshipUpdate.skipped);
  }

  const byEdgeId = (left: KtxRelationshipArtifactEdge, right: KtxRelationshipArtifactEdge): number =>
    left.id.localeCompare(right.id);

  return {
    connectionId: artifacts.connectionId,
    accepted: artifacts.accepted.sort(byEdgeId),
    review: artifacts.review.sort(byEdgeId),
    rejected: artifacts.rejected.sort(byEdgeId),
    skipped: [...artifacts.skipped].sort((left, right) => left.relationshipId.localeCompare(right.relationshipId)),
  };
}
|
||||
|
||||
/** Returns a new array holding the accepted, review and rejected edges, in that order. */
function allEdges(artifacts: KtxRelationshipArtifact): KtxRelationshipArtifactEdge[] {
  const { accepted, review, rejected } = artifacts;
  return accepted.concat(review, rejected);
}
|
||||
|
||||
/**
 * Counts accepted, review and rejected edges per `source` and returns the
 * tallies as an object whose entries were sorted by source name with
 * `localeCompare`. Skipped entries are not counted.
 */
function candidateCountsBySource(artifacts: KtxRelationshipArtifact): Record<string, number> {
  const tally: Record<string, number> = {};
  allEdges(artifacts).forEach(({ source }) => {
    tally[source] = (tally[source] ?? 0) + 1;
  });

  const sortedEntries = Object.entries(tally).sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(sortedEntries);
}
|
||||
|
||||
/** Reports whether any accepted, review or rejected edge lists `reason` among its reasons. */
function hasReason(artifacts: KtxRelationshipArtifact, reason: string): boolean {
  for (const edge of allEdges(artifacts)) {
    if (edge.reasons.includes(reason)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Explains why no relationship was accepted, or returns `null` when at least
 * one was.
 *
 * Checked in order:
 * 1. review candidates exist and validation was unavailable (the profile has
 *    no SQL access, or some edge carries `validation_unavailable` /
 *    `validation_unavailable_review_only`);
 * 2. review candidates exist for any other reason;
 * 3. only rejected candidates exist;
 * 4. there were no candidates at all.
 */
function noAcceptedReason(input: {
  artifacts: KtxRelationshipArtifact;
  profile: KtxRelationshipProfileArtifact;
}): string | null {
  const { artifacts, profile } = input;

  if (artifacts.accepted.length > 0) {
    return null;
  }

  if (artifacts.review.length > 0) {
    const validationUnavailable =
      !profile.sqlAvailable ||
      hasReason(artifacts, 'validation_unavailable') ||
      hasReason(artifacts, 'validation_unavailable_review_only');
    return validationUnavailable
      ? 'validation unavailable; review candidates written'
      : 'relationship candidates require review before manifest writes';
  }

  return artifacts.rejected.length > 0
    ? 'all candidate pairs were rejected'
    : 'no candidate pairs passed type compatibility';
}
|
||||
|
||||
/**
 * Builds a placeholder profile artifact: SQL marked unavailable, zero
 * queries, no tables or columns, and `input.reason` as the only warning.
 */
export function emptyKtxRelationshipProfileArtifact(
  input: EmptyKtxRelationshipProfileArtifactInput,
): KtxRelationshipProfileArtifact {
  const { connectionId, driver, reason } = input;
  return {
    connectionId,
    driver,
    sqlAvailable: false,
    queryCount: 0,
    tables: [],
    columns: {},
    warnings: [reason],
  };
}
|
||||
|
||||
/**
 * Assembles the diagnostics artifact for a relationship-discovery run.
 *
 * - `thresholds` / `policy`: `DEFAULT_THRESHOLDS` / `DEFAULT_POLICY` with the
 *   caller's values spread on top.
 * - `summary`: the size of each artifact bucket.
 * - `generatedAt`: the caller's timestamp, or the current time as an ISO
 *   string when none is given.
 * - `validation.available` and `validation.sqlAvailable` both mirror
 *   `profile.sqlAvailable`.
 * - `warnings` and `profileWarnings` are shallow copies, so the returned
 *   arrays are not shared with the input.
 */
export function buildKtxRelationshipDiagnostics(
  input: BuildKtxRelationshipDiagnosticsInput,
): KtxRelationshipDiagnosticsArtifact {
  const { artifacts, profile } = input;
  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const policy = { ...DEFAULT_POLICY, ...input.policy };

  const summary: KtxRelationshipDiagnosticsSummary = {
    accepted: artifacts.accepted.length,
    review: artifacts.review.length,
    rejected: artifacts.rejected.length,
    skipped: artifacts.skipped.length,
  };

  const generatedAt = input.generatedAt ?? new Date().toISOString();

  return {
    connectionId: input.connectionId,
    generatedAt,
    summary,
    noAcceptedReason: noAcceptedReason({ artifacts, profile }),
    candidateCountsBySource: candidateCountsBySource(artifacts),
    validation: {
      available: profile.sqlAvailable,
      sqlAvailable: profile.sqlAvailable,
      queryCount: profile.queryCount,
    },
    thresholds,
    policy,
    warnings: [...(input.warnings ?? [])],
    profileWarnings: [...profile.warnings],
  };
}
|
||||
678
packages/cli/src/context/scan/relationship-discovery.test.ts
Normal file
678
packages/cli/src/context/scan/relationship-discovery.test.ts
Normal file
|
|
@ -0,0 +1,678 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||
import { buildDefaultKtxProjectConfig } from '../project/config.js';
|
||||
import { snapshotToKtxEnrichedSchema } from './local-enrichment.js';
|
||||
import {
|
||||
loadKtxRelationshipBenchmarkFixture,
|
||||
maskKtxRelationshipBenchmarkSnapshot,
|
||||
} from './relationship-benchmarks.js';
|
||||
import { discoverKtxRelationships } from './relationship-discovery.js';
|
||||
import { createKtxConnectorCapabilities } from './types.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanConnector, KtxScanContext, KtxSchemaSnapshot } from './types.js';
|
||||
|
||||
/**
 * Test double that answers read-only queries from an in-memory SQLite
 * database and counts how many queries were issued.
 */
class InMemorySqliteExecutor {
  /** Backing in-memory database; tests seed it through `db.exec(...)`. */
  readonly db = new Database(':memory:');

  /** Number of `executeReadOnly` calls made so far, including failed ones. */
  queryCount = 0;

  /**
   * Runs `input.sql` and returns the rows in positional form.
   *
   * Declared `async` so that a failing statement (for example invalid SQL)
   * surfaces as a rejected promise instead of a synchronous throw from a
   * Promise-returning method.
   *
   * Headers are taken from the keys of the first row, so an empty result
   * set yields no headers.
   *
   * @param input - Query to execute; only `input.sql` is read.
   * @param _ctx - Scan context; unused.
   * @returns Headers, positional rows and row counts for the result set.
   */
  async executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const rows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(rows[0] ?? {});
    return {
      headers,
      rows: rows.map((row) => headers.map((header) => row[header])),
      totalRows: rows.length,
      rowCount: rows.length,
    };
  }

  /** Closes the underlying database connection. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Baseline schema fixture: `accounts(id, name)` and `orders(id, account_id)`
 * with no declared primary keys or foreign keys. A fresh object graph is
 * built on every call, so callers may mutate the result.
 */
function snapshot(): KtxSchemaSnapshot {
  type Table = KtxSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const textColumn = (name: string): Column => ({
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const table = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('accounts', 2, [integerColumn('id'), textColumn('name')]),
      table('orders', 3, [integerColumn('id'), integerColumn('account_id')]),
    ],
  };
}
|
||||
|
||||
/**
 * Variant of `snapshot()` with declared metadata: `accounts.id` is flagged as
 * a primary key and `orders` declares the foreign key
 * `orders.account_id -> accounts.id`. Other tables are left untouched.
 */
function declaredForeignKeySnapshot(): KtxSchemaSnapshot {
  const source = snapshot();

  const tables = source.tables.map((table) => {
    if (table.name === 'accounts') {
      const columns = table.columns.map((column) => (column.name === 'id' ? { ...column, primaryKey: true } : column));
      return { ...table, columns };
    }
    if (table.name === 'orders') {
      return {
        ...table,
        foreignKeys: [
          {
            fromColumn: 'account_id',
            toCatalog: null,
            toDb: null,
            toTable: 'accounts',
            toColumn: 'id',
            constraintName: 'orders_account_id_fkey',
          },
        ],
      };
    }
    return table;
  });

  return { ...source, tables };
}
|
||||
|
||||
/**
 * Schema fixture for natural-key matching: `dim_countries(iso_code, name)`
 * and `fct_accounts(id, country_code)`, with no declared primary keys or
 * foreign keys. A fresh object graph is built on every call.
 */
function naturalKeySnapshot(): KtxSchemaSnapshot {
  type Table = KtxSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const textColumn = (name: string): Column => ({
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const table = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('dim_countries', 3, [textColumn('iso_code'), textColumn('name')]),
      table('fct_accounts', 4, [integerColumn('id'), textColumn('country_code')]),
    ],
  };
}
|
||||
|
||||
/**
 * Builds a SQLite scan connector for tests.
 *
 * With an executor, the connector advertises read-only SQL and column stats
 * and delegates `executeReadOnly` to it. With `null`, both capabilities are
 * off and `executeReadOnly` is `undefined`. Sampling capabilities are always
 * off, and `introspect` resolves to the baseline `snapshot()`.
 */
function connector(executor: InMemorySqliteExecutor | null): KtxScanConnector {
  const sqlEnabled = executor !== null;
  const capabilities = createKtxConnectorCapabilities({
    readOnlySql: sqlEnabled,
    columnStats: sqlEnabled,
    tableSampling: false,
    columnSampling: false,
  });

  return {
    id: 'sqlite:test',
    driver: 'sqlite',
    capabilities,
    introspect: async () => snapshot(),
    executeReadOnly: executor ? executor.executeReadOnly.bind(executor) : undefined,
  };
}
|
||||
|
||||
/**
 * Builds a mocked LLM runtime whose `generateObject` always resolves to
 * `output`; `generateText` and `runAgentLoop` are bare `vi.fn()` mocks.
 */
function llmRuntime(output: unknown): KtxLlmRuntimePort {
  const generateObject = vi.fn(async () => output) as KtxLlmRuntimePort['generateObject'];
  return {
    generateText: vi.fn(),
    generateObject,
    runAgentLoop: vi.fn(),
  };
}
|
||||
|
||||
/** Returns the relationship-scan settings from the default project config. */
function relationshipSettings() {
  const { scan } = buildDefaultKtxProjectConfig();
  return scan.relationships;
}
|
||||
|
||||
/**
 * Schema fixture whose foreign-key column name gives no hint of its target:
 * `customers(id)` and `orders(id, buyer_ref)`, with no declared primary keys
 * or foreign keys. A fresh object graph is built on every call.
 */
function llmOnlyRelationshipSnapshot(): KtxSchemaSnapshot {
  type Table = KtxSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const table = (name: string, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows: 2,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('customers', [integerColumn('id')]),
      table('orders', [integerColumn('id'), integerColumn('buyer_ref')]),
    ],
  };
}
|
||||
|
||||
describe('production relationship discovery', () => {
|
||||
let executor: InMemorySqliteExecutor | null = null;
|
||||
|
||||
afterEach(() => {
|
||||
executor?.close();
|
||||
executor = null;
|
||||
});
|
||||
|
||||
it('accepts a validated relationship without declared PK or FK metadata', async () => {
  // Registered on the suite-level handle so afterEach closes the database.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
    CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
    INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
  `);

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: connector(sqlite),
    schema: snapshotToKtxEnrichedSchema(snapshot()),
    context: { runId: 'relationship-run-1' },
    settings: relationshipSettings(),
  });

  // Matcher for one relationship endpoint: table name plus exact column list.
  const endpoint = (table: string, columns: string[]) =>
    expect.objectContaining({ table: expect.objectContaining({ name: table }), columns });

  expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
  expect(result.statisticalValidation).toBe('completed');
  expect(result.profile.sqlAvailable).toBe(true);
  expect(result.profile.queryCount).toBeGreaterThan(0);
  expect(result.relationshipUpdate.accepted).toEqual([
    expect.objectContaining({
      from: endpoint('orders', ['account_id']),
      to: endpoint('accounts', ['id']),
      relationshipType: 'many_to_one',
      source: 'inferred',
      isPrimaryKeyReference: true,
    }),
  ]);

  const [firstResolved] = result.resolvedRelationships;
  expect(firstResolved).toMatchObject({
    status: 'accepted',
    validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
    graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
  });
});
|
||||
|
||||
it('accepts a profile-driven natural-key relationship without declared metadata', async () => {
  // Registered on the suite-level handle so afterEach closes the database.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE dim_countries (iso_code TEXT NOT NULL, name TEXT NOT NULL);
    CREATE TABLE fct_accounts (id INTEGER NOT NULL, country_code TEXT NOT NULL);
    INSERT INTO dim_countries (iso_code, name) VALUES ('US', 'United States'), ('FR', 'France'), ('DE', 'Germany');
    INSERT INTO fct_accounts (id, country_code) VALUES (1, 'US'), (2, 'FR'), (3, 'US'), (4, 'DE');
  `);

  const schema = naturalKeySnapshot();
  const scanConnector: KtxScanConnector = {
    ...connector(sqlite),
    introspect: async () => schema,
  };

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: scanConnector,
    schema: snapshotToKtxEnrichedSchema(schema),
    context: { runId: 'natural-key-relationship-run' },
    settings: relationshipSettings(),
  });

  // Matcher for one relationship endpoint: table name plus exact column list.
  const endpoint = (table: string, columns: string[]) =>
    expect.objectContaining({ table: expect.objectContaining({ name: table }), columns });

  expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
  expect(result.relationshipUpdate.accepted).toEqual([
    expect.objectContaining({
      from: endpoint('fct_accounts', ['country_code']),
      to: endpoint('dim_countries', ['iso_code']),
      relationshipType: 'many_to_one',
      source: 'inferred',
      isPrimaryKeyReference: true,
    }),
  ]);

  const [firstResolved] = result.resolvedRelationships;
  expect(firstResolved).toMatchObject({
    source: 'profile_match',
    status: 'accepted',
    validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
    graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
  });
});
|
||||
|
||||
it('accepts an embedding-driven relationship without declared metadata or LLM proposals', async () => {
  // Registered on the suite-level handle so afterEach closes the database.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE customers (id INTEGER NOT NULL, name TEXT NOT NULL);
    CREATE TABLE orders (id INTEGER NOT NULL, buyer_ref INTEGER NOT NULL);
    INSERT INTO customers (id, name) VALUES (1, 'Acme'), (2, 'Orbit'), (3, 'Globex');
    INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2), (12, 2), (13, 3);
  `);

  const sourceSnapshot = llmOnlyRelationshipSnapshot();
  // `orders.buyer_ref` gets a vector almost identical to `customers.id`; the other columns are orthogonal.
  const embeddings = new Map([
    ['customers.id', [1, 0, 0]],
    ['customers.name', [0, 1, 0]],
    ['orders.id', [0, 0, 1]],
    ['orders.buyer_ref', [0.995, 0.005, 0]],
  ]);
  const schema = snapshotToKtxEnrichedSchema(sourceSnapshot, embeddings);

  const scanConnector: KtxScanConnector = {
    ...connector(sqlite),
    introspect: async () => sourceSnapshot,
  };

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: scanConnector,
    schema,
    context: { runId: 'embedding-relationship-run' },
    settings: {
      ...relationshipSettings(),
      llmProposals: false,
    },
  });

  expect(result.llmRelationshipValidation).toBe('skipped');
  expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });

  const [firstAccepted] = result.relationshipUpdate.accepted;
  expect(firstAccepted).toMatchObject({
    from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
    to: { table: { name: 'customers' }, columns: ['id'] },
  });

  const [firstResolved] = result.resolvedRelationships;
  expect(firstResolved).toMatchObject({
    source: 'embedding_similarity',
    status: 'accepted',
    validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['embedding_similarity', 'target_key_like']),
      embeddingSimilarity: expect.any(Number),
    }),
  });
});
|
||||
|
||||
it('keeps candidates review-only when read-only SQL is unavailable', async () => {
  // A connector built without an executor advertises no read-only SQL capability.
  const sqlLessConnector = connector(null);
  const schema = snapshotToKtxEnrichedSchema(snapshot());

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: sqlLessConnector,
    schema,
    context: { runId: 'relationship-run-no-sql' },
    settings: relationshipSettings(),
  });

  expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(result.statisticalValidation).toBe('skipped');
  expect(result.relationshipUpdate.accepted).toEqual([]);

  const [firstResolved] = result.resolvedRelationships;
  expect(firstResolved).toMatchObject({
    status: 'review',
    validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_unavailable']) }),
  });

  const expectedWarning = {
    code: 'connector_capability_missing',
    message: 'KTX scan connector cannot run read-only SQL relationship validation',
    recoverable: true,
    metadata: { capability: 'readOnlySql' },
  };
  expect(result.warnings).toContainEqual(expectedWarning);
});
|
||||
|
||||
it('accepts formal metadata relationships when read-only SQL is unavailable', async () => {
  const sourceSnapshot = declaredForeignKeySnapshot();
  const schema = snapshotToKtxEnrichedSchema(sourceSnapshot);

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: connector(null),
    schema,
    context: { runId: 'formal-metadata-no-sql' },
    settings: relationshipSettings(),
  });

  // Matcher for one relationship endpoint: table name plus exact column list.
  const endpoint = (table: string, columns: string[]) =>
    expect.objectContaining({ table: expect.objectContaining({ name: table }), columns });

  expect(result.statisticalValidation).toBe('skipped');
  expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
  expect(result.resolvedRelationships).toEqual([]);

  const { accepted, rejected, skipped } = result.relationshipUpdate;
  expect(accepted).toEqual([
    expect.objectContaining({
      id: 'orders:(orders.account_id)->accounts:(accounts.id)',
      source: 'formal',
      confidence: 1,
      from: endpoint('orders', ['account_id']),
      to: endpoint('accounts', ['id']),
    }),
  ]);
  expect(rejected).toEqual([]);
  expect(skipped).toEqual([]);
});
|
||||
|
||||
it('accepts LLM-only relationship proposals only after SQL validation and graph resolution pass', async () => {
  // Registered on the suite-level handle so afterEach closes the database.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE customers (id INTEGER);
    CREATE TABLE orders (id INTEGER, buyer_ref INTEGER);
    INSERT INTO customers (id) VALUES (1), (2);
    INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2);
  `);

  const rationale = 'Buyer reference values align with customer identifiers.';
  const llmOutput = {
    pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.91, rationale: 'Unique customer key.' }],
    fkCandidates: [
      {
        fromTable: 'orders',
        fromColumn: 'buyer_ref',
        toTable: 'customers',
        toColumn: 'id',
        confidence: 0.89,
        rationale,
      },
    ],
  };

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: connector(sqlite),
    schema: snapshotToKtxEnrichedSchema(llmOnlyRelationshipSnapshot()),
    context: { runId: 'llm-relationship-orchestrator' },
    settings: relationshipSettings(),
    llmRuntime: llmRuntime(llmOutput),
  });

  expect(result.llmRelationshipValidation).toBe('completed');
  expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });

  const [firstResolved] = result.resolvedRelationships;
  expect(firstResolved).toMatchObject({
    source: 'llm_proposal',
    status: 'accepted',
    evidence: {
      llmRationale: rationale,
    },
  });

  const [firstAccepted] = result.relationshipUpdate.accepted;
  expect(firstAccepted).toMatchObject({
    from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
    to: { table: { name: 'customers' }, columns: ['id'] },
  });
});
|
||||
|
||||
it('uses configured acceptance thresholds when resolving graph relationships', async () => {
  // Register with the suite-level handle instead of shadowing it, so afterEach
  // closes the database even when an assertion below fails.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
    CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
    INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
    INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
  `);

  // Raise the accept threshold to 0.99; the assertions below expect the candidate to land in review.
  const settings = {
    ...relationshipSettings(),
    acceptThreshold: 0.99,
    reviewThreshold: 0.55,
  };

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: connector(sqlite),
    schema: snapshotToKtxEnrichedSchema(snapshot()),
    context: { runId: 'configured-thresholds' },
    settings,
  });

  expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(result.relationshipUpdate.accepted).toEqual([]);
  expect(result.resolvedRelationships[0]).toMatchObject({
    status: 'review',
    graph: { reasons: expect.arrayContaining(['fk_score_review']) },
  });
});
|
||||
|
||||
it('passes maxCandidatesPerColumn into broad deterministic candidate generation', async () => {
  // Register with the suite-level handle instead of shadowing it, so afterEach
  // closes the database even when an assertion below fails.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
    CREATE TABLE account_archive (id INTEGER NOT NULL, name TEXT NOT NULL);
    CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
    INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
    INSERT INTO account_archive VALUES (99, 'Archive');
    INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
  `);

  // Insert a second possible target table (`account_archive`) between `accounts` and `orders`.
  const richSnapshot = snapshot();
  richSnapshot.tables.splice(1, 0, {
    catalog: null,
    db: null,
    name: 'account_archive',
    kind: 'table',
    comment: null,
    estimatedRows: 1,
    foreignKeys: [],
    columns: [
      {
        name: 'id',
        nativeType: 'INTEGER',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: false,
        comment: null,
      },
      {
        name: 'name',
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
        nullable: false,
        primaryKey: false,
        comment: null,
      },
    ],
  });

  const result = await discoverKtxRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: {
      ...connector(sqlite),
      introspect: async () => richSnapshot,
    },
    schema: snapshotToKtxEnrichedSchema(richSnapshot),
    context: { runId: 'candidate-cap' },
    settings: {
      ...relationshipSettings(),
      maxCandidatesPerColumn: 1,
    },
  });

  // With a cap of one candidate per column, `account_id` must resolve to a single target.
  const sourceTargets = result.resolvedRelationships
    .filter((relationship) => relationship.from.columns[0] === 'account_id')
    .map((relationship) => `${relationship.to.table.name}.${relationship.to.columns[0]}`);
  expect(sourceTargets).toHaveLength(1);
  expect(sourceTargets).toEqual(['accounts.id']);
});
|
||||
|
||||
it('accepts SQL-validated composite relationships in production relationship-discovery detection', async () => {
  const fixtureRoot = new URL(
    '../../test/fixtures/relationship-benchmarks/composite_keys_no_declared_constraints',
    import.meta.url,
  );
  const fixture = await loadKtxRelationshipBenchmarkFixture(fixtureRoot.pathname);
  const maskedSnapshot = maskKtxRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
  const database = new Database(fixture.dataPath ?? '', { readonly: true, fileMustExist: true });

  // try/finally guarantees the fixture database is closed even when
  // discovery rejects or an assertion fails.
  try {
    const testConnector: KtxScanConnector = {
      id: 'sqlite:composite',
      driver: 'sqlite',
      capabilities: createKtxConnectorCapabilities({
        readOnlySql: true,
        columnStats: true,
        tableSampling: false,
        columnSampling: false,
      }),
      introspect: async () => maskedSnapshot,
      executeReadOnly: async (input) => {
        const rows = database.prepare(input.sql).all() as Record<string, unknown>[];
        const headers = Object.keys(rows[0] ?? {});
        return {
          headers,
          rows: rows.map((row) => headers.map((header) => row[header])),
          totalRows: rows.length,
          rowCount: rows.length,
        };
      },
    };

    const result = await discoverKtxRelationships({
      connectionId: maskedSnapshot.connectionId,
      driver: maskedSnapshot.driver,
      connector: testConnector,
      schema: snapshotToKtxEnrichedSchema(maskedSnapshot, new Map()),
      context: { runId: 'test:production-composite' },
      settings: relationshipSettings(),
    });

    expect(
      result.relationshipUpdate.accepted.map(
        (relationship) =>
          `${relationship.from.table.name}.(${relationship.from.columns.join(',')})->${relationship.to.table.name}.(${relationship.to.columns.join(',')})`,
      ),
    ).toContain('order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)');
    expect(result.relationships.accepted).toBeGreaterThanOrEqual(1);
    expect(result.compositeRelationships.map((relationship) => relationship.status)).toContain('accepted');
  } finally {
    database.close();
  }
});
|
||||
});
|
||||
333
packages/cli/src/context/scan/relationship-discovery.ts
Normal file
333
packages/cli/src/context/scan/relationship-discovery.ts
Normal file
|
|
@ -0,0 +1,333 @@
|
|||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||
import type { KtxScanRelationshipConfig } from '../project/config.js';
|
||||
import type { KtxEnrichedRelationship, KtxEnrichedSchema, KtxRelationshipUpdate } from './enrichment-types.js';
|
||||
import {
|
||||
generateKtxRelationshipDiscoveryCandidates,
|
||||
type KtxRelationshipDiscoveryCandidate,
|
||||
mergeKtxRelationshipDiscoveryCandidates,
|
||||
} from './relationship-candidates.js';
|
||||
import {
|
||||
discoverKtxCompositeRelationships,
|
||||
type KtxCompositeRelationshipCandidate,
|
||||
} from './relationship-composite-candidates.js';
|
||||
import { collectKtxFormalMetadataRelationships } from './relationship-formal-metadata.js';
|
||||
import {
|
||||
type KtxResolvedRelationshipDiscoveryCandidate,
|
||||
resolveKtxRelationshipGraph,
|
||||
} from './relationship-graph-resolver.js';
|
||||
import { proposeKtxRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
|
||||
import {
|
||||
createKtxRelationshipProfileCache,
|
||||
type KtxRelationshipProfileArtifact,
|
||||
type KtxRelationshipReadOnlyExecutor,
|
||||
profileKtxRelationshipSchema,
|
||||
} from './relationship-profiling.js';
|
||||
import { validateKtxRelationshipDiscoveryCandidates } from './relationship-validation.js';
|
||||
import type {
|
||||
KtxConnectionDriver,
|
||||
KtxScanConnector,
|
||||
KtxScanContext,
|
||||
KtxScanEnrichmentSummary,
|
||||
KtxScanRelationshipSummary,
|
||||
KtxScanWarning,
|
||||
} from './types.js';
|
||||
|
||||
/** Arguments accepted by `discoverKtxRelationships`. */
export interface DiscoverKtxRelationshipsInput {
  /** Identifier of the connection being scanned. */
  connectionId: string;
  /** Driver of the scanned connection. */
  driver: KtxConnectionDriver;
  /** Scan connector for the connection. */
  connector: KtxScanConnector;
  /** Enriched schema in which relationships are discovered. */
  schema: KtxEnrichedSchema;
  /** Scan context for the current run. */
  context: KtxScanContext;
  /** Relationship-scan settings from the project configuration. */
  settings: KtxScanRelationshipConfig;
  /** Optional LLM runtime; may be omitted or `null`. */
  llmRuntime?: KtxLlmRuntimePort | null;
}
|
||||
|
||||
/**
 * Everything `discoverKtxRelationships` reports back to the scan.
 */
export interface DiscoverKtxRelationshipsResult {
  /** Accepted, rejected and skipped relationships for the connection. */
  relationshipUpdate: KtxRelationshipUpdate;
  /** Status counts combining formal metadata, graph-resolved and composite relationships. */
  relationships: KtxScanRelationshipSummary;
  /** Profiling artifact produced for the schema. */
  profile: KtxRelationshipProfileArtifact;
  /** Every candidate returned by the graph resolver, whatever its status. */
  resolvedRelationships: KtxResolvedRelationshipDiscoveryCandidate[];
  /** Composite relationship candidates; empty when composite detection was skipped or failed. */
  compositeRelationships: KtxCompositeRelationshipCandidate[];
  /** `'completed'` when the profile reports SQL as available, otherwise `'skipped'`. */
  statisticalValidation: KtxScanEnrichmentSummary['statisticalValidation'];
  /** Summary of the LLM proposal step, or `'skipped'` when LLM proposals are disabled. */
  llmRelationshipValidation: KtxScanEnrichmentSummary['llmRelationshipValidation'];
  /** Recoverable warnings gathered across all stages. */
  warnings: KtxScanWarning[];
}
|
||||
|
||||
/**
 * Converts a graph-resolved candidate into the enriched-relationship shape.
 *
 * The candidate's `fkScore` becomes the confidence, and the relationship is flagged as a
 * primary-key reference when `pkScore` is at least 0.78.
 */
function relationshipFromResolved(candidate: KtxResolvedRelationshipDiscoveryCandidate): KtxEnrichedRelationship {
  const { id, from, to, relationshipType, fkScore, pkScore } = candidate;
  const referencesPrimaryKey = pkScore >= 0.78;
  return {
    id,
    source: 'inferred',
    from,
    to,
    relationshipType,
    confidence: fkScore,
    isPrimaryKeyReference: referencesPrimaryKey,
  };
}
|
||||
|
||||
/**
 * Converts a composite (multi-column) candidate into the enriched-relationship shape.
 *
 * Only `tableId`, `columnIds`, `table` and `columns` are copied from each endpoint. The
 * relationship is flagged as a primary-key reference when the candidate status is `'accepted'`.
 */
function relationshipFromComposite(candidate: KtxCompositeRelationshipCandidate): KtxEnrichedRelationship {
  const { from: origin, to: destination } = candidate;
  const from = {
    tableId: origin.tableId,
    columnIds: origin.columnIds,
    table: origin.table,
    columns: origin.columns,
  };
  const to = {
    tableId: destination.tableId,
    columnIds: destination.columnIds,
    table: destination.table,
    columns: destination.columns,
  };
  return {
    id: candidate.id,
    source: 'inferred',
    from,
    to,
    relationshipType: candidate.relationshipType,
    confidence: candidate.confidence,
    isPrimaryKeyReference: candidate.status === 'accepted',
  };
}
|
||||
|
||||
/**
 * Builds the `fromTable:(col,col)->toTable:(col,col)` key from a relationship's endpoints.
 * Used as the sort key for accepted relationships.
 */
function relationshipId(input: Pick<KtxEnrichedRelationship, 'from' | 'to'>): string {
  const fromPart = `${input.from.tableId}:(${input.from.columnIds.join(',')})`;
  const toPart = `${input.to.tableId}:(${input.to.columnIds.join(',')})`;
  return [fromPart, toPart].join('->');
}
|
||||
|
||||
/**
 * Returns the accepted graph-resolved candidates whose ids are not already covered by formal
 * metadata, converted to enriched relationships. Input order is preserved.
 */
function nonFormalAcceptedRelationships(input: {
  formalIds: ReadonlySet<string>;
  resolvedRelationships: readonly KtxResolvedRelationshipDiscoveryCandidate[];
}): KtxEnrichedRelationship[] {
  const inferred: KtxEnrichedRelationship[] = [];
  for (const candidate of input.resolvedRelationships) {
    if (candidate.status !== 'accepted') {
      continue;
    }
    if (input.formalIds.has(candidate.id)) {
      continue;
    }
    inferred.push(relationshipFromResolved(candidate));
  }
  return inferred;
}
|
||||
|
||||
/**
 * Counts graph-resolved candidates by status. `skipped` is always 0 here, and candidates with
 * any other status are not counted.
 */
function relationshipSummary(
  resolvedRelationships: readonly KtxResolvedRelationshipDiscoveryCandidate[],
): KtxScanRelationshipSummary {
  let accepted = 0;
  let review = 0;
  let rejected = 0;
  for (const { status } of resolvedRelationships) {
    if (status === 'accepted') {
      accepted += 1;
    } else if (status === 'review') {
      review += 1;
    } else if (status === 'rejected') {
      rejected += 1;
    }
  }
  return { accepted, review, rejected, skipped: 0 };
}
|
||||
|
||||
/**
 * Counts composite candidates by status. `skipped` is always 0 here, and candidates with any
 * other status are not counted.
 */
function compositeSummary(relationships: readonly KtxCompositeRelationshipCandidate[]): KtxScanRelationshipSummary {
  const tally = new Map<string, number>();
  for (const candidate of relationships) {
    tally.set(candidate.status, (tally.get(candidate.status) ?? 0) + 1);
  }
  return {
    accepted: tally.get('accepted') ?? 0,
    review: tally.get('review') ?? 0,
    rejected: tally.get('rejected') ?? 0,
    skipped: 0,
  };
}
|
||||
|
||||
/**
 * Runs composite (multi-column) relationship detection on a best-effort basis.
 *
 * Returns an empty list without calling the detector when there is no executor or the profile
 * reports SQL as unavailable. Detector warnings and any thrown error are appended to
 * `input.warnings` as recoverable `relationship_validation_failed` entries; a thrown error also
 * yields an empty list instead of propagating.
 */
async function detectCompositeRelationships(input: {
  connectionId: string;
  driver: DiscoverKtxRelationshipsInput['driver'];
  schema: KtxEnrichedSchema;
  profile: KtxRelationshipProfileArtifact;
  executor: KtxRelationshipReadOnlyExecutor | null;
  context: DiscoverKtxRelationshipsInput['context'];
  warnings: KtxScanWarning[];
}): Promise<KtxCompositeRelationshipCandidate[]> {
  const { executor, profile, warnings } = input;
  if (!(executor && profile.sqlAvailable)) {
    return [];
  }

  // Every warning raised here shares the same code, recoverability and source tag.
  const compositeWarning = (message: KtxScanWarning['message']): KtxScanWarning => ({
    code: 'relationship_validation_failed',
    message,
    recoverable: true,
    metadata: { source: 'composite_relationship_detection' },
  });

  try {
    const detection = await discoverKtxCompositeRelationships({
      connectionId: input.connectionId,
      driver: input.driver,
      schema: input.schema,
      profiles: profile,
      executor,
      ctx: input.context,
    });
    for (const message of detection.warnings) {
      warnings.push(compositeWarning(message));
    }
    return detection.relationships;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    warnings.push(compositeWarning(`KTX composite relationship detection failed: ${reason}`));
    return [];
  }
}
|
||||
|
||||
/**
 * Merges formal-metadata counts with the graph-resolved status counts: formal accepted
 * relationships add to `accepted`, and formal skipped relationships become `skipped`.
 */
function combinedRelationshipSummary(input: {
  formalAccepted: number;
  formalSkipped: number;
  resolvedRelationships: readonly KtxResolvedRelationshipDiscoveryCandidate[];
}): KtxScanRelationshipSummary {
  const { accepted, review, rejected } = relationshipSummary(input.resolvedRelationships);
  return {
    accepted: input.formalAccepted + accepted,
    review,
    rejected,
    skipped: input.formalSkipped,
  };
}
|
||||
|
||||
/**
 * Derives the read-only SQL executor used by relationship discovery from the connector.
 *
 * Returns a `null` executor plus one recoverable warning when the connector lacks the
 * `readOnlySql` capability, or advertises it without exposing `executeReadOnly`. Otherwise
 * returns an executor whose `executeReadOnly` is bound to the connector, with no warnings.
 */
function sqlExecutor(input: DiscoverKtxRelationshipsInput): {
  executor: KtxRelationshipReadOnlyExecutor | null;
  warnings: KtxScanWarning[];
} {
  const { connector } = input;

  // Both failure modes differ only in warning code and message.
  const unavailable = (code: KtxScanWarning['code'], message: KtxScanWarning['message']) => {
    const warning: KtxScanWarning = {
      code,
      message,
      recoverable: true,
      metadata: { capability: 'readOnlySql' },
    };
    return { executor: null, warnings: [warning] };
  };

  if (!connector.capabilities.readOnlySql) {
    return unavailable(
      'connector_capability_missing',
      'KTX scan connector cannot run read-only SQL relationship validation',
    );
  }

  if (!connector.executeReadOnly) {
    return unavailable(
      'relationship_validation_failed',
      'KTX scan connector advertises readOnlySql but does not expose executeReadOnly',
    );
  }

  // Bind so the method keeps its `this` once detached from the connector.
  const executeReadOnly = connector.executeReadOnly.bind(connector);
  return { executor: { executeReadOnly }, warnings: [] };
}
|
||||
|
||||
/**
 * Runs the relationship discovery pipeline for one connection.
 *
 * Stages, in order: resolve the read-only SQL executor, collect formal-metadata relationships,
 * profile the schema, generate deterministic candidates, optionally ask the LLM for further
 * candidates, merge and drop candidates already covered by formal metadata, validate, resolve
 * the relationship graph, and detect composite relationships.
 *
 * @param input Connection, schema, connector, settings and optional LLM runtime.
 * @returns The relationship update (accepted sorted by endpoint key, rejected, skipped),
 *   combined status counts, the profile, raw resolver and composite results, validation
 *   summaries and all collected warnings.
 */
export async function discoverKtxRelationships(
  input: DiscoverKtxRelationshipsInput,
): Promise<DiscoverKtxRelationshipsResult> {
  const { connectionId, driver, schema, context, settings } = input;
  const { acceptThreshold, reviewThreshold } = settings;

  const { executor, warnings } = sqlExecutor(input);
  const formalMetadata = collectKtxFormalMetadataRelationships(schema);
  const formalIds = formalMetadata.acceptedIds;

  const profileCache = createKtxRelationshipProfileCache();
  const profile = await profileKtxRelationshipSchema({
    connectionId,
    driver,
    schema,
    executor,
    ctx: context,
    profileSampleRows: settings.profileSampleRows,
    cache: profileCache,
  });

  const deterministicCandidates: KtxRelationshipDiscoveryCandidate[] = generateKtxRelationshipDiscoveryCandidates(
    schema,
    {
      maxCandidatesPerColumn: settings.maxCandidatesPerColumn,
      profiles: profile,
    },
  );

  // LLM proposals are opt-in; when disabled an empty "skipped" result stands in.
  const llmProposalResult = settings.llmProposals
    ? await proposeKtxRelationshipCandidatesWithLlm({
        connectionId,
        schema,
        profile,
        llmRuntime: input.llmRuntime ?? null,
        settings: {
          maxTablesPerBatch: settings.maxLlmTablesPerBatch,
        },
      })
    : { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' as const };

  // Formal metadata is already accepted, so matching candidates skip validation.
  const mergedCandidates = mergeKtxRelationshipDiscoveryCandidates([
    ...deterministicCandidates,
    ...llmProposalResult.candidates,
  ]);
  const candidates = mergedCandidates.filter((candidate) => !formalIds.has(candidate.id));
  warnings.push(...llmProposalResult.warnings);

  const validated = await validateKtxRelationshipDiscoveryCandidates({
    connectionId,
    driver,
    candidates,
    profiles: profile,
    executor,
    ctx: context,
    tableCount: schema.tables.length,
    settings: {
      acceptThreshold,
      reviewThreshold,
      maxDistinctSourceValues: settings.profileSampleRows,
      concurrency: settings.validationConcurrency,
      validationBudget: settings.validationBudget,
    },
  });

  const graph = resolveKtxRelationshipGraph({
    schema,
    profiles: profile,
    candidates: validated,
    settings: {
      acceptThreshold,
      reviewThreshold,
      validationRequiredForManifest: settings.validationRequiredForManifest,
    },
  });
  const resolvedRelationships = graph.relationships;

  const compositeRelationships = await detectCompositeRelationships({
    connectionId,
    driver,
    schema,
    profile,
    executor,
    context,
    warnings,
  });

  const inferredAccepted = nonFormalAcceptedRelationships({ formalIds, resolvedRelationships });
  const compositeAccepted = compositeRelationships
    .filter((candidate) => candidate.status === 'accepted')
    .map(relationshipFromComposite);

  // Deduplicate by id: formal first, then inferred, then composite. A later entry with the
  // same id replaces the earlier value.
  const acceptedById = new Map<string, KtxEnrichedRelationship>();
  for (const relationship of [...formalMetadata.accepted, ...inferredAccepted, ...compositeAccepted]) {
    acceptedById.set(relationship.id, relationship);
  }
  const accepted = Array.from(acceptedById.values()).sort((left, right) =>
    relationshipId(left).localeCompare(relationshipId(right)),
  );

  const rejected = resolvedRelationships
    .filter((candidate) => candidate.status === 'rejected')
    .map(relationshipFromResolved);

  const combined = combinedRelationshipSummary({
    formalAccepted: formalMetadata.accepted.length,
    formalSkipped: formalMetadata.skipped.length,
    resolvedRelationships,
  });
  const compositeCounts = compositeSummary(compositeRelationships);
  const relationships: KtxScanRelationshipSummary = {
    accepted: combined.accepted + compositeCounts.accepted,
    review: combined.review + compositeCounts.review,
    rejected: combined.rejected + compositeCounts.rejected,
    skipped: combined.skipped,
  };

  return {
    relationshipUpdate: {
      connectionId,
      accepted,
      rejected,
      skipped: formalMetadata.skipped,
    },
    relationships,
    profile,
    resolvedRelationships,
    compositeRelationships,
    statisticalValidation: profile.sqlAvailable ? 'completed' : 'skipped',
    llmRelationshipValidation: llmProposalResult.summary,
    warnings,
  };
}
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedRelationship, KtxEnrichedSchema } from './enrichment-types.js';
|
||||
import { collectKtxFormalMetadataRelationships } from './relationship-formal-metadata.js';
|
||||
|
||||
/**
 * Builds the two-table fixture schema (`accounts.id` as primary key, `orders.account_id`)
 * for the `warehouse` connection, carrying the supplied relationships.
 */
function schema(relationships: KtxEnrichedRelationship[]): KtxEnrichedSchema {
  type FixtureTable = KtxEnrichedSchema['tables'][number];
  type FixtureColumn = FixtureTable['columns'][number];

  // Every fixture column is a non-nullable INTEGER; only its owner, name and PK flag vary.
  const integerColumn = (tableId: string, name: string, primaryKey: boolean): FixtureColumn => ({
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  });

  const singleColumnTable = (id: string, onlyColumn: FixtureColumn): FixtureTable => ({
    id,
    ref: { catalog: null, db: null, name: id },
    enabled: true,
    descriptions: {},
    columns: [onlyColumn],
  });

  const accounts = singleColumnTable('accounts', integerColumn('accounts', 'id', true));
  const orders = singleColumnTable('orders', integerColumn('orders', 'account_id', false));

  return {
    connectionId: 'warehouse',
    tables: [accounts, orders],
    relationships,
  };
}
|
||||
|
||||
/**
 * Builds the `orders.account_id -> accounts.id` formal relationship fixture. The defaults are
 * deliberately low (`confidence: 0.6`, `isPrimaryKeyReference: false`); `overrides` replace
 * top-level fields wholesale.
 */
function formalRelationship(overrides: Partial<KtxEnrichedRelationship> = {}): KtxEnrichedRelationship {
  const defaults: KtxEnrichedRelationship = {
    id: 'orders:orders.account_id->accounts:accounts.id',
    source: 'formal',
    from: {
      tableId: 'orders',
      columnIds: ['orders.account_id'],
      table: { catalog: null, db: null, name: 'orders' },
      columns: ['account_id'],
    },
    to: {
      tableId: 'accounts',
      columnIds: ['accounts.id'],
      table: { catalog: null, db: null, name: 'accounts' },
      columns: ['id'],
    },
    relationshipType: 'many_to_one',
    confidence: 0.6,
    isPrimaryKeyReference: false,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
// Covers acceptance, de-duplication and endpoint checks in collectKtxFormalMetadataRelationships.
describe('formal metadata relationship collection', () => {
  const ordersToAccountsId = 'orders:orders.account_id->accounts:accounts.id';
  const missingEndpointId = 'orders:orders.missing_account_id->accounts:accounts.id';

  it('accepts valid formal relationships with ground-truth confidence', () => {
    const fixture = schema([formalRelationship()]);

    const { accepted, skipped, acceptedIds } = collectKtxFormalMetadataRelationships(fixture);

    // The fixture's 0.6 confidence and non-PK flag are overwritten on acceptance.
    expect(accepted).toEqual([
      expect.objectContaining({
        id: ordersToAccountsId,
        source: 'formal',
        confidence: 1,
        isPrimaryKeyReference: true,
      }),
    ]);
    expect(skipped).toEqual([]);
    expect(acceptedIds).toEqual(new Set([ordersToAccountsId]));
  });

  it('skips duplicate and invalid formal relationships with reasons', () => {
    const valid = formalRelationship();
    const duplicate = formalRelationship();
    const missingSourceColumn = formalRelationship({
      id: missingEndpointId,
      from: {
        tableId: 'orders',
        columnIds: ['orders.missing_account_id'],
        table: { catalog: null, db: null, name: 'orders' },
        columns: ['missing_account_id'],
      },
    });
    // Non-formal relationships are ignored: neither accepted nor reported as skipped.
    const manual = formalRelationship({
      id: 'manual-edge',
      source: 'manual',
    });

    const result = collectKtxFormalMetadataRelationships(schema([valid, duplicate, missingSourceColumn, manual]));

    expect(result.accepted).toHaveLength(1);
    expect(result.skipped).toEqual([
      {
        relationshipId: ordersToAccountsId,
        reason: 'formal_metadata_duplicate',
      },
      {
        relationshipId: missingEndpointId,
        reason: 'formal_metadata_endpoint_not_found',
      },
    ]);
  });
});
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
import type { KtxEnrichedRelationship, KtxEnrichedSchema, KtxSkippedRelationship } from './enrichment-types.js';
|
||||
|
||||
/**
 * Result of `collectKtxFormalMetadataRelationships`.
 */
export interface KtxFormalMetadataRelationshipCollection {
  /** Formal relationships that passed the duplicate and endpoint checks, sorted by id. */
  accepted: KtxEnrichedRelationship[];
  /** Formal relationships that were dropped, each with the reason it was skipped. */
  skipped: KtxSkippedRelationship[];
  /** Ids of the accepted relationships, for fast membership checks. */
  acceptedIds: Set<string>;
}
|
||||
|
||||
/**
 * Checks that both endpoints of a relationship resolve against the schema.
 *
 * An endpoint resolves when its table is present and enabled, it lists at least one column id,
 * and every listed column id belongs to a column of that table whose name appears in the
 * endpoint's `columns`. Requiring every column, rather than any single one, keeps a
 * multi-column key with a missing column from being treated as present.
 *
 * @param schema Schema whose enabled tables and columns are searched.
 * @param relationship Relationship whose `from` and `to` endpoints are checked.
 * @returns `true` only when both endpoints resolve completely.
 */
function relationshipEndpointExists(schema: KtxEnrichedSchema, relationship: KtxEnrichedRelationship): boolean {
  const endpointResolves = (endpoint: KtxEnrichedRelationship['from']): boolean => {
    const table = schema.tables.find((candidate) => candidate.id === endpoint.tableId && candidate.enabled);
    // `every` is vacuously true for an empty list, so reject endpoints without columns explicitly.
    if (!table || endpoint.columnIds.length === 0) {
      return false;
    }
    return endpoint.columnIds.every((columnId) =>
      table.columns.some((column) => column.id === columnId && endpoint.columns.includes(column.name)),
    );
  };

  return endpointResolves(relationship.from) && endpointResolves(relationship.to);
}
|
||||
|
||||
/**
 * Collects the schema's formal relationships as ground truth.
 *
 * Relationships whose `source` is not `'formal'` are ignored. A formal relationship is skipped
 * as `formal_metadata_duplicate` when its id was already accepted, or as
 * `formal_metadata_endpoint_not_found` when its endpoints do not resolve. Accepted relationships
 * are copied with `confidence: 1` and `isPrimaryKeyReference: true`, then sorted by id.
 *
 * @param schema Enriched schema providing the relationships and the tables to resolve against.
 * @returns Accepted relationships, skipped entries in encounter order, and the accepted id set.
 */
export function collectKtxFormalMetadataRelationships(
  schema: KtxEnrichedSchema,
): KtxFormalMetadataRelationshipCollection {
  const accepted: KtxEnrichedRelationship[] = [];
  const skipped: KtxSkippedRelationship[] = [];
  const acceptedIds = new Set<string>();

  const formalRelationships = schema.relationships.filter((relationship) => relationship.source === 'formal');
  for (const relationship of formalRelationships) {
    // The duplicate check runs first, so a repeated id is never re-validated.
    let skipReason: KtxSkippedRelationship['reason'] | null = null;
    if (acceptedIds.has(relationship.id)) {
      skipReason = 'formal_metadata_duplicate';
    } else if (!relationshipEndpointExists(schema, relationship)) {
      skipReason = 'formal_metadata_endpoint_not_found';
    }

    if (skipReason !== null) {
      skipped.push({ relationshipId: relationship.id, reason: skipReason });
      continue;
    }

    acceptedIds.add(relationship.id);
    accepted.push({
      ...relationship,
      source: 'formal',
      confidence: 1,
      isPrimaryKeyReference: true,
    });
  }

  accepted.sort((left, right) => left.id.localeCompare(right.id));
  return { accepted, skipped, acceptedIds };
}
|
||||
|
|
@ -0,0 +1,649 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type {
|
||||
KtxEnrichedColumn,
|
||||
KtxEnrichedSchema,
|
||||
KtxEnrichedTable,
|
||||
KtxRelationshipEndpoint,
|
||||
} from './enrichment-types.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KtxValidatedRelationshipDiscoveryCandidate } from './relationship-validation.js';
|
||||
import { resolveKtxRelationshipGraph } from './relationship-graph-resolver.js';
|
||||
|
||||
/**
 * Builds a column fixture named `name` owned by `tableId`. Defaults describe a nullable,
 * non-primary-key INTEGER column; every key present in `overrides` replaces its default.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  // Overrides are applied last, so an explicitly supplied key always wins.
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds an enabled table fixture whose id and ref name are both `name`. Each column is copied
 * and re-parented: its `tableId` and `tableRef` are replaced with this table's.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Builds the three-table fixture schema (`accounts`, `account_archive`, `users`) with no
 * relationships. `accountsPrimaryKey` controls whether `accounts.id` is a declared primary key
 * and defaults to `false`.
 */
function schema(overrides: { accountsPrimaryKey?: boolean } = {}): KtxEnrichedSchema {
  const accountsPrimaryKey = overrides.accountsPrimaryKey ?? false;

  const accounts = table('accounts', [
    column('accounts', 'id', { nullable: false, primaryKey: accountsPrimaryKey }),
    column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const accountArchive = table('account_archive', [column('account_archive', 'id', { nullable: false })]);
  const users = table('users', [
    column('users', 'id', { nullable: false }),
    column('users', 'account_id', { nullable: false }),
  ]);

  return {
    connectionId: 'warehouse',
    tables: [accounts, accountArchive, users],
    relationships: [],
  };
}
|
||||
|
||||
/**
 * Builds a single-column relationship endpoint whose table id equals the table name and whose
 * column id is `<table>.<column>`.
 */
function endpoint(tableName: string, columnName: string): KtxRelationshipEndpoint {
  const columnId = [tableName, columnName].join('.');
  const tableRef = { catalog: null, db: null, name: tableName };
  return {
    tableId: tableName,
    columnIds: [columnId],
    table: tableRef,
    columns: [columnName],
  };
}
|
||||
|
||||
/**
 * Builds the profiling fixture for the three-table schema: SQL is available, every table has
 * three rows, and `accounts.id`, `account_archive.id` and `users.account_id` are each fully
 * unique, non-null INTEGER columns sampled as '1', '2', '3'.
 */
function profiles(): KtxRelationshipProfileArtifact {
  type TableProfile = KtxRelationshipProfileArtifact['tables'][number];
  type ColumnProfile = KtxRelationshipProfileArtifact['columns'][string];

  const tableProfile = (name: string): TableProfile => ({
    table: { catalog: null, db: null, name },
    rowCount: 3,
  });

  // All three profiled columns share the same statistics; only table and column differ.
  const uniqueIntegerProfile = (tableName: string, columnName: string): ColumnProfile => ({
    table: { catalog: null, db: null, name: tableName },
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2', '3'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [tableProfile('accounts'), tableProfile('account_archive'), tableProfile('users')],
    columns: {
      'accounts.id': uniqueIntegerProfile('accounts', 'id'),
      'account_archive.id': uniqueIntegerProfile('account_archive', 'id'),
      'users.account_id': uniqueIntegerProfile('users', 'account_id'),
    },
    warnings: [],
  };
}
|
||||
|
||||
/**
 * Builds a validated candidate fixture, by default an accepted, fully validated
 * `users.account_id -> accounts.id` reference.
 *
 * The id and the evidence's target names are derived from the effective `from`/`to` endpoints.
 * `overrides` are applied last at the top level, so a supplied `evidence` or `validation`
 * object replaces the default one wholesale.
 */
function validatedCandidate(
  overrides: Partial<KtxValidatedRelationshipDiscoveryCandidate> = {},
): KtxValidatedRelationshipDiscoveryCandidate {
  const from = overrides.from ?? endpoint('users', 'account_id');
  const to = overrides.to ?? endpoint('accounts', 'id');

  const sourceKey = `${from.tableId}:(${from.columnIds.join(',')})`;
  const targetKey = `${to.tableId}:(${to.columnIds.join(',')})`;

  const base: KtxValidatedRelationshipDiscoveryCandidate = {
    id: `${sourceKey}->${targetKey}`,
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: 0.95,
    source: 'normalized_table_match',
    status: 'accepted',
    score: 0.96,
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: to.table.name,
      targetColumnBase: to.columns[0] ?? '',
      targetKeyScore: 0.92,
      nameScore: 0.92,
      reasons: ['foreign_key_suffix', 'normalized_table_name', 'target_key_like'],
      ...overrides.evidence,
    },
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 3,
      parentDistinct: 3,
      overlap: 3,
      checkedValues: 3,
      reasons: ['validation_passed'],
      ...overrides.validation,
    },
  };

  return { ...base, ...overrides };
}
|
||||
|
||||
describe('relationship graph resolver', () => {
|
||||
it('promotes validated relationship discovery references to accepted relationships and inferred PKs', () => {
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: profiles(),
|
||||
candidates: [validatedCandidate()],
|
||||
});
|
||||
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
pkScore: expect.any(Number),
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 1,
|
||||
evidence: {
|
||||
declaredPrimaryKey: false,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 1,
|
||||
incomingReviewCount: 0,
|
||||
reasons: expect.arrayContaining(['unique_target_column', 'incoming_validated_reference']),
|
||||
},
|
||||
});
|
||||
expect(result.pks.find((pk) => pk.table === 'accounts')?.pkScore).toBeGreaterThanOrEqual(0.85);
|
||||
expect(result.relationships).toHaveLength(1);
|
||||
expect(result.relationships[0]).toMatchObject({
|
||||
from: { table: { name: 'users' }, columns: ['account_id'] },
|
||||
to: { table: { name: 'accounts' }, columns: ['id'] },
|
||||
status: 'accepted',
|
||||
pkScore: expect.any(Number),
|
||||
fkScore: expect.any(Number),
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['target_pk_score_passed', 'fk_score_passed']),
|
||||
},
|
||||
});
|
||||
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.85);
|
||||
});
|
||||
|
||||
it('keeps validation-unavailable candidates in review even when name evidence is strong', () => {
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: { ...profiles(), sqlAvailable: false, columns: {}, warnings: ['read_only_sql_unavailable'] },
|
||||
candidates: [
|
||||
validatedCandidate({
|
||||
status: 'review',
|
||||
score: 0.57,
|
||||
validation: {
|
||||
targetUniqueness: 0,
|
||||
sourceCoverage: 0,
|
||||
violationCount: 0,
|
||||
violationRatio: 1,
|
||||
sourceNullRate: 0,
|
||||
targetNullRate: 0,
|
||||
childDistinct: 0,
|
||||
parentDistinct: 0,
|
||||
overlap: 0,
|
||||
checkedValues: 0,
|
||||
reasons: ['validation_unavailable'],
|
||||
},
|
||||
}),
|
||||
],
|
||||
});
|
||||
|
||||
expect(result.relationships).toHaveLength(1);
|
||||
expect(result.relationships[0]).toMatchObject({
|
||||
status: 'review',
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['validation_unavailable_review_only']),
|
||||
},
|
||||
});
|
||||
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.55);
|
||||
});
|
||||
|
||||
it('accepts at most one target per source column and rejects the lower-scored conflict loser', () => {
|
||||
const winner = validatedCandidate({ confidence: 0.95, score: 0.96 });
|
||||
const loser = validatedCandidate({
|
||||
from: endpoint('users', 'account_id'),
|
||||
to: endpoint('account_archive', 'id'),
|
||||
confidence: 0.85,
|
||||
score: 0.9,
|
||||
evidence: {
|
||||
sourceColumnBase: 'account',
|
||||
targetTableBase: 'account_archive',
|
||||
targetColumnBase: 'id',
|
||||
targetKeyScore: 0.92,
|
||||
nameScore: 0.78,
|
||||
reasons: ['foreign_key_suffix', 'inflection', 'target_key_like'],
|
||||
},
|
||||
});
|
||||
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: profiles(),
|
||||
candidates: [loser, winner],
|
||||
});
|
||||
|
||||
expect(result.relationships.map((relationship) => relationship.status)).toEqual(['accepted', 'rejected']);
|
||||
expect(result.relationships[0]?.to.table.name).toBe('accounts');
|
||||
expect(result.relationships[1]).toMatchObject({
|
||||
to: { table: { name: 'account_archive' }, columns: ['id'] },
|
||||
status: 'rejected',
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['conflict_lost']),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('preserves declared primary keys as accepted even without incoming candidates', () => {
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: schema({ accountsPrimaryKey: true }),
|
||||
profiles: profiles(),
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual([]);
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
pkScore: 1,
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 0,
|
||||
evidence: {
|
||||
declaredPrimaryKey: true,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 0,
|
||||
incomingReviewCount: 0,
|
||||
reasons: ['declared_primary_key'],
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('infers profile-only key-like columns without incoming relationship candidates', () => {
|
||||
const baseSchema = schema();
|
||||
const invoices = table('invoices', [
|
||||
column('invoices', 'id', { nullable: false }),
|
||||
column('invoices', 'invoice_number', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
nullable: false,
|
||||
}),
|
||||
column('invoices', 'amount', {
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
}),
|
||||
]);
|
||||
const baseProfiles = profiles();
|
||||
const result = resolveKtxRelationshipGraph({
|
||||
schema: { ...baseSchema, tables: [...baseSchema.tables, invoices] },
|
||||
profiles: {
|
||||
...baseProfiles,
|
||||
tables: [...baseProfiles.tables, { table: invoices.ref, rowCount: 3 }],
|
||||
columns: {
|
||||
...baseProfiles.columns,
|
||||
'invoices.id': {
|
||||
table: invoices.ref,
|
||||
column: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2', '3'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
'invoices.invoice_number': {
|
||||
table: invoices.ref,
|
||||
column: 'invoice_number',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['INV-1', 'INV-2', 'INV-3'],
|
||||
minTextLength: 5,
|
||||
maxTextLength: 5,
|
||||
},
|
||||
'invoices.amount': {
|
||||
table: invoices.ref,
|
||||
column: 'amount',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 2 / 3,
|
||||
nullRate: 0,
|
||||
sampleValues: ['100', '200'],
|
||||
minTextLength: 3,
|
||||
maxTextLength: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual([]);
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'invoices',
|
||||
columns: ['id'],
|
||||
pkScore: 1,
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 0,
|
||||
evidence: {
|
||||
declaredPrimaryKey: false,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 0,
|
||||
incomingReviewCount: 0,
|
||||
reasons: expect.arrayContaining([
|
||||
'unique_target_column',
|
||||
'profile_key_name',
|
||||
'not_null_profile',
|
||||
'profile_only_primary_key',
|
||||
'no_incoming_references',
|
||||
]),
|
||||
},
|
||||
});
|
||||
expect(result.pks).toContainEqual(
|
||||
expect.objectContaining({
|
||||
table: 'invoices',
|
||||
columns: ['invoice_number'],
|
||||
status: 'review',
|
||||
evidence: expect.objectContaining({
|
||||
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(result.pks.some((pk) => pk.table === 'invoices' && pk.columns[0] === 'amount')).toBe(false);
|
||||
});
|
||||
|
||||
// Regression pin: a single validated `column_suffix_match` candidate pointing at an
// undeclared target column; the exact pkScore/fkScore values are asserted.
it('pins single-incoming column_suffix_match resolver scores', () => {
  // Factories so that every ref in the fixture is its own object.
  const plansRef = () => ({ catalog: null, db: null, name: 'stg_plans' });
  const segmentsRef = () => ({ catalog: null, db: null, name: 'mart_account_segments' });
  const pinnedPkScore = 0.922;
  const pinnedFkScore = 0.953;

  const schema = {
    connectionId: 'warehouse',
    relationships: [],
    tables: [
      {
        id: 'plans-id',
        ref: plansRef(),
        enabled: true,
        descriptions: {},
        columns: [
          {
            id: 'plan-code-col',
            tableId: 'plans-id',
            tableRef: plansRef(),
            name: 'plan_code',
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
            primaryKey: false,
            parentColumnId: null,
            descriptions: {},
            embedding: null,
            sampleValues: null,
            cardinality: null,
          },
        ],
      },
      {
        id: 'segments-id',
        ref: segmentsRef(),
        enabled: true,
        descriptions: {},
        columns: [
          {
            id: 'current-plan-code-col',
            tableId: 'segments-id',
            tableRef: segmentsRef(),
            name: 'current_plan_code',
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
            primaryKey: false,
            parentColumnId: null,
            descriptions: {},
            embedding: null,
            sampleValues: null,
            cardinality: null,
          },
        ],
      },
    ],
  } satisfies KtxEnrichedSchema;

  // Only the target column is profiled: four rows, four distinct values, no nulls.
  const profiles = {
    connectionId: 'warehouse',
    driver: 'sqlite' as const,
    sqlAvailable: true,
    queryCount: 0,
    tables: [],
    warnings: [],
    columns: {
      'stg_plans.plan_code': {
        table: plansRef(),
        column: 'plan_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 4,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['basic', 'enterprise', 'free', 'pro'],
        minTextLength: 4,
        maxTextLength: 10,
      },
    },
  };

  const resolution = resolveKtxRelationshipGraph({
    schema,
    profiles,
    candidates: [
      {
        id: 'segments:(current_plan_code)->plans:(plan_code)',
        from: {
          tableId: 'segments-id',
          columnIds: ['current-plan-code-col'],
          table: segmentsRef(),
          columns: ['current_plan_code'],
        },
        to: {
          tableId: 'plans-id',
          columnIds: ['plan-code-col'],
          table: plansRef(),
          columns: ['plan_code'],
        },
        relationshipType: 'many_to_one',
        confidence: 0.902,
        source: 'column_suffix_match',
        evidence: {
          sourceColumnBase: 'current_plan',
          targetTableBase: 'plan',
          targetColumnBase: 'plan_code',
          targetKeyScore: 0.86,
          nameScore: 0.78,
          reasons: ['column_suffix_match', 'profile_unique_target'],
        },
        status: 'accepted',
        score: 0.98,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          sourceNullRate: 0,
          targetNullRate: 0,
          childDistinct: 4,
          parentDistinct: 4,
          overlap: 4,
          checkedValues: 4,
          reasons: ['validation_passed'],
        },
      },
    ],
  });

  // Exactly one inferred PK and one resolved relationship are expected.
  expect(resolution.pks).toEqual([
    expect.objectContaining({
      table: 'stg_plans',
      columns: ['plan_code'],
      pkScore: pinnedPkScore,
      status: 'accepted',
    }),
  ]);
  expect(resolution.relationships).toEqual([
    expect.objectContaining({
      source: 'column_suffix_match',
      status: 'accepted',
      pkScore: pinnedPkScore,
      fkScore: pinnedFkScore,
    }),
  ]);
});
|
||||
|
||||
// A unique, non-null column whose name scores as weak key evidence must still surface
// as a 'review' PK carrying the profile-only reasons.
it('keeps strong profile-only primary key evidence when name evidence is weak', () => {
  const eventsRef = () => ({ catalog: null, db: null, name: 'events' });

  const enrichedSchema = schema();
  const eventsTable = table('events', [
    column('events', 'warehouse_key', {
      nullable: false,
      primaryKey: false,
      nativeType: 'INTEGER',
      normalizedType: 'integer',
    }),
  ]);
  enrichedSchema.tables.push(eventsTable);

  const profileArtifact = profiles();
  profileArtifact.tables.push({ table: eventsRef(), rowCount: 3 });
  // Three rows, three distinct values, no nulls.
  profileArtifact.columns['events.warehouse_key'] = {
    table: eventsRef(),
    column: 'warehouse_key',
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['100', '101', '102'],
    minTextLength: 3,
    maxTextLength: 3,
  };

  const { pks } = resolveKtxRelationshipGraph({
    schema: enrichedSchema,
    profiles: profileArtifact,
    candidates: [],
  });

  const expectedEventsPk = expect.objectContaining({
    table: 'events',
    columns: ['warehouse_key'],
    status: 'review',
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
    }),
  });
  expect(pks).toEqual(expect.arrayContaining([expectedEventsPk]));
});
|
||||
|
||||
// Same scenario as a weak-name key, but with a column name that has no key-like shape at
// all; the PK must still be reported for review with a score of at least 0.55.
it('keeps strong profile-only primary key evidence when the column is not key-shaped', () => {
  const eventsRef = () => ({ catalog: null, db: null, name: 'events' });

  const enrichedSchema = schema();
  const eventsTable = table('events', [
    column('events', 'opaque_reference', {
      nullable: false,
      primaryKey: false,
      nativeType: 'INTEGER',
      normalizedType: 'integer',
    }),
  ]);
  enrichedSchema.tables.push(eventsTable);

  const profileArtifact = profiles();
  profileArtifact.tables.push({ table: eventsRef(), rowCount: 3 });
  // Three rows, three distinct values, no nulls.
  profileArtifact.columns['events.opaque_reference'] = {
    table: eventsRef(),
    column: 'opaque_reference',
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['100', '101', '102'],
    minTextLength: 3,
    maxTextLength: 3,
  };

  const { pks } = resolveKtxRelationshipGraph({
    schema: enrichedSchema,
    profiles: profileArtifact,
    candidates: [],
  });

  const eventsPk = pks.find((pk) => pk.table === 'events');
  expect(eventsPk).toMatchObject({
    table: 'events',
    columns: ['opaque_reference'],
    status: 'review',
    evidence: expect.objectContaining({
      reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
    }),
  });
  expect(eventsPk?.pkScore).toBeGreaterThanOrEqual(0.55);
});
|
||||
});
|
||||
508
packages/cli/src/context/scan/relationship-graph-resolver.ts
Normal file
508
packages/cli/src/context/scan/relationship-graph-resolver.ts
Normal file
|
|
@ -0,0 +1,508 @@
|
|||
import type {
|
||||
KtxEnrichedColumn,
|
||||
KtxEnrichedSchema,
|
||||
KtxEnrichedTable,
|
||||
KtxRelationshipEndpoint,
|
||||
} from './enrichment-types.js';
|
||||
import { normalizeKtxRelationshipName } from './relationship-candidates.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import { scoreKtxRelationshipCandidate } from './relationship-scoring.js';
|
||||
import type { KtxValidatedRelationshipDiscoveryCandidate } from './relationship-validation.js';
|
||||
|
||||
/** Outcome the graph resolver assigns to an inferred primary key or a relationship candidate. */
export type KtxResolvedRelationshipStatus = 'accepted' | 'review' | 'rejected';
|
||||
|
||||
/** Tunable thresholds for the relationship graph resolver. */
interface KtxRelationshipGraphResolverSettings {
  /** Minimum score for the `'accepted'` status (only applies when acceptance is allowed). */
  acceptThreshold: number;
  /** Minimum score for the `'review'` status; lower scores are `'rejected'`. */
  reviewThreshold: number;
  /** Target PK score a relationship needs before it is eligible for acceptance. */
  minTargetPkScoreForAcceptance: number;
  /** When true, a relationship is only eligible for acceptance if its validation reasons include `'validation_passed'`. */
  validationRequiredForManifest: boolean;
}
|
||||
|
||||
/** Supporting facts recorded alongside a resolved primary key. */
interface KtxResolvedRelationshipPkEvidence {
  /** True when the schema column is flagged `primaryKey`. */
  declaredPrimaryKey: boolean;
  /** Profile uniqueness ratio of the target column (fixed at 1 for declared keys). */
  targetUniqueness: number;
  /** Number of incoming candidates whose status is `'accepted'`. */
  incomingAcceptedCount: number;
  /** Number of incoming candidates whose status is `'review'`. */
  incomingReviewCount: number;
  /** Machine-readable reason codes explaining the score and status. */
  reasons: string[];
}
|
||||
|
||||
/** A declared or inferred primary key produced by the resolver. */
interface KtxResolvedRelationshipPk {
  /** Table name the key belongs to. */
  table: string;
  /** Key column names. */
  columns: string[];
  /** Primary-key score. */
  pkScore: number;
  /** Resolver verdict for this key. */
  status: KtxResolvedRelationshipStatus;
  /** How many relationship candidates point at this key. */
  incomingCandidateCount: number;
  /** Facts that back the score and status. */
  evidence: KtxResolvedRelationshipPkEvidence;
}
|
||||
|
||||
/** Graph-level context attached to each resolved relationship. */
interface KtxResolvedRelationshipGraphEvidence {
  /** PK score of the relationship's target column. */
  targetPkScore: number;
  /** Incoming candidate count recorded on the target PK. */
  incomingCandidateCount: number;
  /** 1-based rank among candidates sharing the same source endpoint. */
  conflictRank: number;
  /** Machine-readable reason codes explaining the resolution. */
  reasons: string[];
}
|
||||
|
||||
/**
 * A validated relationship candidate after graph resolution. The upstream `status`
 * is replaced by the resolver's own verdict.
 */
export interface KtxResolvedRelationshipDiscoveryCandidate
  extends Omit<KtxValidatedRelationshipDiscoveryCandidate, 'status'> {
  /** Resolver verdict for this relationship. */
  status: KtxResolvedRelationshipStatus;
  /** PK score of the target column. */
  pkScore: number;
  /** Combined foreign-key score computed by the resolver. */
  fkScore: number;
  /** Graph-level evidence for the verdict. */
  graph: KtxResolvedRelationshipGraphEvidence;
}
|
||||
|
||||
/** Output of {@link resolveKtxRelationshipGraph}. */
export interface KtxRelationshipGraphResolutionResult {
  /** Resolved primary keys. */
  pks: KtxResolvedRelationshipPk[];
  /** Resolved relationship candidates. */
  relationships: KtxResolvedRelationshipDiscoveryCandidate[];
}
|
||||
|
||||
/** Input of {@link resolveKtxRelationshipGraph}. */
export interface ResolveKtxRelationshipGraphInput {
  /** Enriched schema whose enabled tables are scanned for key columns. */
  schema: KtxEnrichedSchema;
  /** Column profiles, looked up by `"<table name>.<column name>"`. */
  profiles: KtxRelationshipProfileArtifact;
  /** Validated relationship candidates to resolve. */
  candidates: readonly KtxValidatedRelationshipDiscoveryCandidate[];
  /** Optional overrides merged over the default resolver settings. */
  settings?: Partial<KtxRelationshipGraphResolverSettings>;
}
|
||||
|
||||
/** Resolver settings used for every key the caller does not override. */
const DEFAULT_SETTINGS: KtxRelationshipGraphResolverSettings = {
  acceptThreshold: 0.85,
  reviewThreshold: 0.55,
  minTargetPkScoreForAcceptance: 0.78,
  validationRequiredForManifest: true,
};
|
||||
|
||||
// Name tokens that mark a column as a measure; profileOnlyPkTypeCompatibility returns 0
// for any column whose normalized name contains one of them as an underscore-separated token.
const PROFILE_ONLY_PK_MEASURE_NAME_TOKENS = new Set(['amount', 'count', 'price', 'quantity', 'subtotal', 'total']);
|
||||
|
||||
/**
 * Produces the effective resolver settings.
 *
 * @param settings - Caller overrides; may be undefined.
 * @returns A fresh object with the defaults overlaid by every own key of `settings`.
 */
function mergeSettings(
  settings: Partial<KtxRelationshipGraphResolverSettings> | undefined,
): KtxRelationshipGraphResolverSettings {
  const effective: KtxRelationshipGraphResolverSettings = Object.assign({}, DEFAULT_SETTINGS, settings);
  return effective;
}
|
||||
|
||||
/**
 * Clamps a score into [0, 1] and rounds it to three decimal places.
 *
 * @param value - Raw score.
 * @returns The clamped, rounded score.
 */
function roundScore(value: number): number {
  const cappedAtOne = Math.min(1, value);
  const clamped = Math.max(0, cappedAtOne);
  const threeDecimals = clamped.toFixed(3);
  return Number(threeDecimals);
}
|
||||
|
||||
/**
 * Builds the `"<table name>.<column>"` key for an endpoint, using its first column.
 *
 * @throws Error when the endpoint has no first column (see singleRelationshipColumn).
 */
function endpointKey(endpoint: KtxRelationshipEndpoint): string {
  const tableName = endpoint.table.name;
  const columnName = singleRelationshipColumn(endpoint);
  return `${tableName}.${columnName}`;
}
|
||||
|
||||
/** Builds the `"<tableId>:<columnId,columnId,...>"` grouping key for an endpoint. */
function sourceKey(endpoint: KtxRelationshipEndpoint): string {
  const { tableId, columnIds } = endpoint;
  const joinedColumnIds = columnIds.join(',');
  return `${tableId}:${joinedColumnIds}`;
}
|
||||
|
||||
/**
 * Returns the first column name of an endpoint.
 *
 * @throws Error when the first column is missing or an empty string.
 */
function singleRelationshipColumn(endpoint: KtxRelationshipEndpoint): string {
  const [firstColumn] = endpoint.columns;
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpoint.table.name} to contain one column`);
}
|
||||
|
||||
/** Builds the `"<table>.(<col,col,...>)"` map key for a primary key. */
function pkKey(pk: Pick<KtxResolvedRelationshipPk, 'table' | 'columns'>): string {
  const { table, columns } = pk;
  const columnList = columns.join(',');
  return `${table}.(${columnList})`;
}
|
||||
|
||||
/**
 * Builds the `"<from table>.<from column>-><to table>.<to column>"` string used as the
 * final tie-breaker when ranking relationships.
 */
function candidateSortKey(candidate: Pick<KtxValidatedRelationshipDiscoveryCandidate, 'from' | 'to'>): string {
  const fromKey = endpointKey(candidate.from);
  const toKey = endpointKey(candidate.to);
  return `${fromKey}->${toKey}`;
}
|
||||
|
||||
/**
 * Maps a score to a resolver status.
 *
 * @param score - Score to classify.
 * @param settings - Supplies `acceptThreshold` and `reviewThreshold`.
 * @param acceptedAllowed - When false the result is capped at `'review'`.
 * @returns `'accepted'`, `'review'` or `'rejected'`.
 */
function statusForScore(
  score: number,
  settings: KtxRelationshipGraphResolverSettings,
  acceptedAllowed: boolean,
): KtxResolvedRelationshipStatus {
  const meetsAcceptThreshold = score >= settings.acceptThreshold;
  const meetsReviewThreshold = score >= settings.reviewThreshold;

  // The acceptance check comes first, mirroring the threshold precedence.
  if (acceptedAllowed && meetsAcceptThreshold) {
    return 'accepted';
  }
  return meetsReviewThreshold ? 'review' : 'rejected';
}
|
||||
|
||||
/** True when the candidate's validation reasons contain `'validation_passed'`. */
function candidateHasValidationPassed(candidate: KtxValidatedRelationshipDiscoveryCandidate): boolean {
  const { reasons } = candidate.validation;
  return reasons.some((reason) => reason === 'validation_passed');
}
|
||||
|
||||
/**
 * True when the candidate's validation reasons contain `'validation_unavailable'`
 * or `'profile_unavailable'`.
 */
function candidateIsValidationUnavailable(candidate: KtxValidatedRelationshipDiscoveryCandidate): boolean {
  const { reasons } = candidate.validation;
  if (reasons.includes('validation_unavailable')) {
    return true;
  }
  return reasons.includes('profile_unavailable');
}
|
||||
|
||||
/**
 * Collects one accepted PK entry (score 1) per column flagged `primaryKey` in an enabled table.
 * Each flagged column yields its own single-column entry.
 *
 * @param schema - Enriched schema to scan.
 * @returns Entries in table order, then column order.
 */
function declaredPrimaryKeys(schema: KtxEnrichedSchema): KtxResolvedRelationshipPk[] {
  return schema.tables
    .filter((table) => table.enabled)
    .flatMap((table) =>
      table.columns
        .filter((column) => column.primaryKey)
        .map(
          (column): KtxResolvedRelationshipPk => ({
            table: table.ref.name,
            columns: [column.name],
            pkScore: 1,
            status: 'accepted',
            incomingCandidateCount: 0,
            evidence: {
              declaredPrimaryKey: true,
              targetUniqueness: 1,
              incomingAcceptedCount: 0,
              incomingReviewCount: 0,
              reasons: ['declared_primary_key'],
            },
          }),
        ),
    );
}
|
||||
|
||||
/**
 * Lists every column of every enabled table, paired with its table.
 *
 * @returns Pairs in table order, then column order.
 */
function schemaTargetColumns(schema: KtxEnrichedSchema): Array<{ table: KtxEnrichedTable; column: KtxEnrichedColumn }> {
  const pairs: Array<{ table: KtxEnrichedTable; column: KtxEnrichedColumn }> = [];
  for (const table of schema.tables) {
    if (!table.enabled) {
      continue;
    }
    for (const column of table.columns) {
      pairs.push({ table, column });
    }
  }
  return pairs;
}
|
||||
|
||||
/** Returns the profiled uniqueness ratio of `<tableName>.<columnName>`, or 0 when it is not available. */
function profileUniqueness(profiles: KtxRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const profileKey = `${tableName}.${columnName}`;
  const columnProfile = profiles.columns[profileKey];
  return columnProfile?.uniquenessRatio ?? 0;
}
|
||||
|
||||
/** Returns the profiled null rate of `<tableName>.<columnName>`, or 1 when it is not available. */
function profileNullRate(profiles: KtxRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const profileKey = `${tableName}.${columnName}`;
  const columnProfile = profiles.columns[profileKey];
  return columnProfile?.nullRate ?? 1;
}
|
||||
|
||||
/** True when the profile artifact has a truthy entry for `<tableName>.<columnName>`. */
function profileColumnExists(profiles: KtxRelationshipProfileArtifact, tableName: string, columnName: string): boolean {
  const profileKey = `${tableName}.${columnName}`;
  return !!profiles.columns[profileKey];
}
|
||||
|
||||
/**
 * Scores how key-like a column name is for a PK inferred from profiles alone.
 *
 * The normalized column name is compared, in order, against `id` (1),
 * `<singular table>_id` (0.96), `<singular table>_key` (0.88) and `key`/`uuid` (0.76);
 * the first match wins and anything else scores 0.
 */
function profileOnlyPkNameScore(tableName: string, columnName: string): number {
  const singularTable = normalizeKtxRelationshipName(tableName).singular;
  const normalizedColumn = normalizeKtxRelationshipName(columnName).normalized;

  // Ordered rules: earlier entries take precedence.
  const rules: ReadonlyArray<readonly [matches: boolean, score: number]> = [
    [normalizedColumn === 'id', 1],
    [normalizedColumn === `${singularTable}_id`, 0.96],
    [normalizedColumn === `${singularTable}_key`, 0.88],
    [normalizedColumn === 'key' || normalizedColumn === 'uuid', 0.76],
  ];
  for (const [matches, score] of rules) {
    if (matches) {
      return score;
    }
  }
  return 0;
}
|
||||
|
||||
/**
 * Returns 0 when any underscore-separated token of the normalized column name is a
 * measure word (see PROFILE_ONLY_PK_MEASURE_NAME_TOKENS), otherwise 1.
 */
function profileOnlyPkTypeCompatibility(columnName: string): number {
  const normalized = normalizeKtxRelationshipName(columnName).normalized;
  for (const token of normalized.split('_')) {
    // Empty tokens (from leading/trailing/double underscores) are ignored.
    if (token && PROFILE_ONLY_PK_MEASURE_NAME_TOKENS.has(token)) {
      return 0;
    }
  }
  return 1;
}
|
||||
|
||||
/**
 * Evaluates whether a column can be proposed as a primary key from profile data alone.
 *
 * @param input - Profile artifact plus the table and column names to evaluate.
 * @returns The evidence (name score, null rate, uniqueness, combined `pkScore`, and a
 *   `weakName` flag set when the name score is below 0.74), or `null` when the column has
 *   no profile, is less than 98% unique, is more than 5% null, or scores below the review
 *   threshold.
 */
function profileOnlyPkEvidence(input: {
  profiles: KtxRelationshipProfileArtifact;
  tableName: string;
  columnName: string;
}): { nameScore: number; nullRate: number; uniqueness: number; pkScore: number; weakName: boolean } | null {
  const { profiles, tableName, columnName } = input;
  if (!profileColumnExists(profiles, tableName, columnName)) {
    return null;
  }

  const uniqueness = profileUniqueness(profiles, tableName, columnName);
  const nullRate = profileNullRate(profiles, tableName, columnName);
  const nameScore = profileOnlyPkNameScore(tableName, columnName);

  const failsProfileGate = uniqueness < 0.98 || nullRate > 0.05;
  if (failsProfileGate) {
    return null;
  }

  // No value-overlap or embedding signal exists for a profile-only key, so both the
  // signal and its weight are zero.
  const signals = {
    nameSimilarity: nameScore,
    typeCompatibility: profileOnlyPkTypeCompatibility(columnName),
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: uniqueness,
    profileNullRate: 1 - nullRate,
    structuralPrior: 0.65,
  };
  const weights = {
    nameSimilarity: 0.2,
    typeCompatibility: 0.08,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.48,
    profileNullRate: 0.2,
    structuralPrior: 0.04,
  };
  const { score: pkScore } = scoreKtxRelationshipCandidate(signals, weights);

  // NOTE(review): this gate reads DEFAULT_SETTINGS, not caller-supplied settings, so a
  // custom reviewThreshold does not affect it — confirm that is intended.
  if (pkScore < DEFAULT_SETTINGS.reviewThreshold) {
    return null;
  }

  const weakName = nameScore < 0.74;
  return { nameScore, nullRate, uniqueness, pkScore, weakName };
}
|
||||
|
||||
/**
 * Resolves the primary-key verdict for one target column.
 *
 * Precedence:
 * 1. A declared PK is returned as-is.
 * 2. With profile-only evidence, the precomputed profile score is used and a weak name
 *    caps the status at `'review'`.
 * 3. Otherwise the score blends target uniqueness, the best incoming candidate score, the
 *    best incoming key-likeness and incoming volume; acceptance needs at least one
 *    accepted incoming candidate and uniqueness of at least 0.9.
 *
 * @param input - Target table/column, optional declared PK, profiles, incoming
 *   candidates, settings and optional profile-only evidence.
 * @returns The resolved PK with score, status and reason codes.
 */
function resolveTargetPk(input: {
  table: string;
  column: string;
  declared: KtxResolvedRelationshipPk | undefined;
  profiles: KtxRelationshipProfileArtifact;
  incoming: readonly KtxValidatedRelationshipDiscoveryCandidate[];
  settings: KtxRelationshipGraphResolverSettings;
  profileOnly?: { nameScore: number; nullRate: number; uniqueness: number; pkScore: number; weakName: boolean } | null;
}): KtxResolvedRelationshipPk {
  const { declared, incoming, settings, profileOnly } = input;
  if (declared) {
    return declared;
  }

  const targetUniqueness = profileUniqueness(input.profiles, input.table, input.column);
  const acceptedCount = incoming.filter((candidate) => candidate.status === 'accepted').length;
  const reviewCount = incoming.filter((candidate) => candidate.status === 'review').length;
  const keyEvidence = incoming.reduce((best, candidate) => Math.max(best, candidate.evidence.targetKeyScore), 0);

  // Reason codes are emitted in this fixed order.
  const reasonRules: ReadonlyArray<readonly [applies: boolean, reason: string]> = [
    [targetUniqueness >= 0.9, 'unique_target_column'],
    [acceptedCount > 0, 'incoming_validated_reference'],
    [reviewCount > 0, 'incoming_review_reference'],
    [keyEvidence >= 0.8, 'target_key_like'],
    [incoming.length === 0, 'no_incoming_references'],
  ];
  const reasons: string[] = reasonRules.filter(([applies]) => applies).map(([, reason]) => reason);

  if (profileOnly) {
    reasons.push('not_null_profile', 'profile_only_primary_key');
    reasons.push(profileOnly.weakName ? 'weak_name_profile_key' : 'profile_key_name');
    return {
      table: input.table,
      columns: [input.column],
      pkScore: profileOnly.pkScore,
      // A weak name can never be auto-accepted.
      status: statusForScore(profileOnly.pkScore, settings, !profileOnly.weakName),
      incomingCandidateCount: 0,
      evidence: {
        declaredPrimaryKey: false,
        targetUniqueness,
        incomingAcceptedCount: 0,
        incomingReviewCount: 0,
        reasons,
      },
    };
  }

  const incomingQuality = incoming.reduce((best, candidate) => Math.max(best, candidate.score), 0);
  const incomingVolume = Math.min(1, acceptedCount * 0.3 + reviewCount * 0.15);
  const pkScore = roundScore(0.52 * targetUniqueness + 0.28 * incomingQuality + 0.12 * keyEvidence + 0.08 * incomingVolume);
  const acceptedAllowed = acceptedCount > 0 && targetUniqueness >= 0.9;

  // A target with review-level incoming references is kept for review even when its
  // score falls below the review threshold.
  const keptForReview = reviewCount > 0 && pkScore < settings.reviewThreshold;
  const status: KtxResolvedRelationshipStatus = keptForReview
    ? 'review'
    : statusForScore(pkScore, settings, acceptedAllowed);

  return {
    table: input.table,
    columns: [input.column],
    pkScore,
    status,
    incomingCandidateCount: incoming.length,
    evidence: {
      declaredPrimaryKey: false,
      targetUniqueness,
      incomingAcceptedCount: acceptedCount,
      incomingReviewCount: reviewCount,
      reasons,
    },
  };
}
|
||||
|
||||
/**
 * Resolves one relationship candidate against its target PK, before source conflicts
 * are applied.
 *
 * - An upstream `'rejected'` candidate stays rejected.
 * - A candidate whose validation was unavailable is forced to `'review'` and its fkScore
 *   is raised to at least the review threshold.
 * - Otherwise the status follows the fkScore; acceptance additionally requires an
 *   upstream `'accepted'` status, a sufficient target PK score and (when
 *   `validationRequiredForManifest` is set) a passed validation.
 *
 * @returns The candidate with resolver status, scores and graph evidence (conflictRank 1).
 */
function baseRelationshipResolution(input: {
  candidate: KtxValidatedRelationshipDiscoveryCandidate;
  pk: KtxResolvedRelationshipPk;
  settings: KtxRelationshipGraphResolverSettings;
}): KtxResolvedRelationshipDiscoveryCandidate {
  const { candidate, pk, settings } = input;
  const upstreamRejected = candidate.status === 'rejected';
  const validationUnavailable = candidateIsValidationUnavailable(candidate);
  const validationPassed = candidateHasValidationPassed(candidate);
  const targetPkStrongEnough = pk.pkScore >= settings.minTargetPkScoreForAcceptance;

  const reasons: string[] = [];
  if (upstreamRejected) {
    reasons.push('candidate_validation_rejected');
  }
  if (validationUnavailable) {
    reasons.push('validation_unavailable_review_only');
  }
  reasons.push(targetPkStrongEnough ? 'target_pk_score_passed' : 'target_pk_score_low');
  if (validationPassed) {
    reasons.push('validation_passed');
  }

  const validationPassBonus = validationPassed ? 1 : 0;
  const weightedScore = roundScore(
    0.48 * candidate.score + 0.3 * pk.pkScore + 0.14 * candidate.confidence + 0.08 * validationPassBonus,
  );

  let fkScore = weightedScore;
  let status: KtxResolvedRelationshipStatus;
  if (upstreamRejected) {
    status = 'rejected';
  } else if (validationUnavailable) {
    status = 'review';
    fkScore = Math.max(weightedScore, settings.reviewThreshold);
  } else {
    const acceptedAllowed =
      candidate.status === 'accepted' &&
      targetPkStrongEnough &&
      (!settings.validationRequiredForManifest || validationPassed);
    status = statusForScore(weightedScore, settings, acceptedAllowed);
  }

  const outcomeReason: Record<KtxResolvedRelationshipStatus, string> = {
    accepted: 'fk_score_passed',
    review: 'fk_score_review',
    rejected: 'fk_score_rejected',
  };
  reasons.push(outcomeReason[status]);

  return {
    ...candidate,
    status,
    pkScore: pk.pkScore,
    fkScore,
    graph: {
      targetPkScore: pk.pkScore,
      incomingCandidateCount: pk.incomingCandidateCount,
      conflictRank: 1,
      reasons,
    },
  };
}
|
||||
|
||||
/**
 * Sort comparator ranking relationships best-first: higher fkScore, then higher source
 * coverage, then higher pkScore, then ascending candidate sort key.
 */
function relationshipRank(
  left: KtxResolvedRelationshipDiscoveryCandidate,
  right: KtxResolvedRelationshipDiscoveryCandidate,
): number {
  const byFkScore = right.fkScore - left.fkScore;
  if (byFkScore) {
    return byFkScore;
  }
  const byCoverage = right.validation.sourceCoverage - left.validation.sourceCoverage;
  if (byCoverage) {
    return byCoverage;
  }
  const byPkScore = right.pkScore - left.pkScore;
  if (byPkScore) {
    return byPkScore;
  }
  return candidateSortKey(left).localeCompare(candidateSortKey(right));
}
|
||||
|
||||
/**
 * Ensures at most one accepted relationship per source endpoint.
 *
 * Relationships are grouped by source key and ranked best-first within each group. The
 * best accepted relationship keeps its status; every later accepted one is demoted to
 * `'rejected'`, loses its `'fk_score_passed'` reason and gains `'conflict_lost'`. Every
 * relationship gets its 1-based rank within the group as `graph.conflictRank`.
 *
 * @param relationships - Resolved relationships; neither the array nor its items are mutated.
 * @returns New relationship objects, ranked best-first across all groups.
 */
function applySourceConflicts(
  relationships: readonly KtxResolvedRelationshipDiscoveryCandidate[],
): KtxResolvedRelationshipDiscoveryCandidate[] {
  // Append to each bucket in place; rebuilding the bucket array on every insert made
  // grouping quadratic in the group size.
  const bySource = new Map<string, KtxResolvedRelationshipDiscoveryCandidate[]>();
  for (const relationship of relationships) {
    const key = sourceKey(relationship.from);
    const group = bySource.get(key);
    if (group) {
      group.push(relationship);
    } else {
      bySource.set(key, [relationship]);
    }
  }

  const resolved: KtxResolvedRelationshipDiscoveryCandidate[] = [];
  for (const group of bySource.values()) {
    // The bucket is owned by this function, so sorting it in place is safe.
    group.sort(relationshipRank);
    let acceptedSeen = false;
    for (const [index, relationship] of group.entries()) {
      const conflictRank = index + 1;
      const losesConflict = relationship.status === 'accepted' && acceptedSeen;
      if (losesConflict) {
        const remainingReasons = relationship.graph.reasons.filter((reason) => reason !== 'fk_score_passed');
        resolved.push({
          ...relationship,
          status: 'rejected',
          graph: {
            ...relationship.graph,
            conflictRank,
            reasons: [...remainingReasons, 'conflict_lost'],
          },
        });
        continue;
      }
      if (relationship.status === 'accepted') {
        acceptedSeen = true;
      }
      resolved.push({
        ...relationship,
        graph: {
          ...relationship.graph,
          conflictRank,
        },
      });
    }
  }

  return resolved.sort(relationshipRank);
}
|
||||
|
||||
/**
 * Resolves primary keys and relationship candidates into a consistent graph.
 *
 * Steps:
 * 1. Merge caller settings over the defaults and index declared primary keys.
 * 2. Group candidates by target endpoint (`"<table name>.<column>"`).
 * 3. For each column of each enabled table, resolve a PK when the column is declared as
 *    a key, has incoming candidates, or qualifies from profile data alone.
 * 4. Resolve each candidate against its target PK; targets not covered in step 3 are
 *    resolved on demand and recorded.
 * 5. Apply source-conflict resolution so each source endpoint has at most one accepted
 *    relationship.
 *
 * @param input - Schema, profiles, validated candidates and optional setting overrides.
 * @returns PKs sorted by descending score (ties by key) and the resolved relationships.
 * @throws Error when a candidate's target endpoint has no first column.
 */
export function resolveKtxRelationshipGraph(
  input: ResolveKtxRelationshipGraphInput,
): KtxRelationshipGraphResolutionResult {
  const settings = mergeSettings(input.settings);
  const declaredByKey = new Map(declaredPrimaryKeys(input.schema).map((pk) => [pkKey(pk), pk]));

  // Append to each bucket in place; rebuilding the bucket array on every insert made
  // grouping quadratic in the number of candidates per target.
  const incomingByTarget = new Map<string, KtxValidatedRelationshipDiscoveryCandidate[]>();
  for (const candidate of input.candidates) {
    const targetKey = endpointKey(candidate.to);
    const bucket = incomingByTarget.get(targetKey);
    if (bucket) {
      bucket.push(candidate);
    } else {
      incomingByTarget.set(targetKey, [candidate]);
    }
  }

  const pkCandidates = new Map<string, KtxResolvedRelationshipPk>();
  for (const { table, column } of schemaTargetColumns(input.schema)) {
    const tableName = table.ref.name;
    const key = pkKey({ table: tableName, columns: [column.name] });
    const incoming = incomingByTarget.get(`${tableName}.${column.name}`) ?? [];

    // Profile-only inference is attempted only for columns with no other key signal.
    const lacksOtherSignal = incoming.length === 0 && !column.primaryKey;
    const profileOnly = lacksOtherSignal
      ? profileOnlyPkEvidence({ profiles: input.profiles, tableName, columnName: column.name })
      : null;
    if (lacksOtherSignal && !profileOnly) {
      continue;
    }

    pkCandidates.set(
      key,
      resolveTargetPk({
        table: tableName,
        column: column.name,
        declared: declaredByKey.get(key),
        profiles: input.profiles,
        incoming,
        settings,
        profileOnly,
      }),
    );
  }

  const relationships: KtxResolvedRelationshipDiscoveryCandidate[] = [];
  for (const candidate of input.candidates) {
    const toColumn = singleRelationshipColumn(candidate.to);
    const key = pkKey({ table: candidate.to.table.name, columns: [toColumn] });
    let pk = pkCandidates.get(key);
    if (!pk) {
      // The target was not reached by the schema scan above; resolve it from the
      // candidates alone and record it so it shows up in the PK list.
      pk = resolveTargetPk({
        table: candidate.to.table.name,
        column: toColumn,
        declared: undefined,
        profiles: input.profiles,
        incoming: incomingByTarget.get(endpointKey(candidate.to)) ?? [],
        settings,
        profileOnly: null,
      });
      pkCandidates.set(key, pk);
    }
    relationships.push(baseRelationshipResolution({ candidate, pk, settings }));
  }

  return {
    pks: Array.from(pkCandidates.values()).sort(
      (left, right) => right.pkScore - left.pkScore || pkKey(left).localeCompare(pkKey(right)),
    ),
    relationships: applySourceConflicts(relationships),
  };
}
|
||||
214
packages/cli/src/context/scan/relationship-llm-proposal.test.ts
Normal file
214
packages/cli/src/context/scan/relationship-llm-proposal.test.ts
Normal file
|
|
@ -0,0 +1,214 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import { proposeKtxRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
|
||||
|
||||
/**
 * Builds a mocked LLM runtime whose `generateObject` resolves to `output`; the other
 * methods are bare mocks.
 */
function llmRuntime(output?: unknown): KtxLlmRuntimePort {
  const generateObject = vi.fn(async () => output) as KtxLlmRuntimePort['generateObject'];
  const runtime: KtxLlmRuntimePort = {
    generateText: vi.fn(),
    generateObject,
    runAgentLoop: vi.fn(),
  };
  return runtime;
}
|
||||
|
||||
/**
 * Builds an enriched column fixture: a nullable, non-key INTEGER column with id
 * `"<tableId>.<name>"`. Every own key of `overrides` replaces the matching default.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds an enabled table fixture whose id and ref name are both `name`. Columns are
 * copied and re-parented so their `tableId`/`tableRef` point at this table.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Builds the shared schema fixture: `customers(id, email)` and `orders(id, buyer_ref)`,
 * with no declared relationships.
 */
function schema(): KtxEnrichedSchema {
  const customers = table('customers', [
    column('customers', 'id', { nullable: false }),
    column('customers', 'email', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const orders = table('orders', [
    column('orders', 'id', { nullable: false }),
    column('orders', 'buyer_ref'),
  ]);
  return {
    connectionId: 'warehouse',
    relationships: [],
    tables: [customers, orders],
  };
}
|
||||
|
||||
/**
 * Builds the shared profile fixture: two 2-row tables, with fully unique, non-null
 * INTEGER profiles for `customers.id` and `orders.buyer_ref`.
 */
function profile(): KtxRelationshipProfileArtifact {
  const tableRef = (name: string) => ({ catalog: null, db: null, name });
  // Both profiled columns share the same statistics; only the location differs.
  const uniqueIntegerColumn = (tableName: string, columnName: string) => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 2,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 4,
    warnings: [],
    tables: [
      { table: tableRef('customers'), rowCount: 2 },
      { table: tableRef('orders'), rowCount: 2 },
    ],
    columns: {
      'customers.id': uniqueIntegerColumn('customers', 'id'),
      'orders.buyer_ref': uniqueIntegerColumn('orders', 'buyer_ref'),
    },
  };
}
|
||||
|
||||
// Covers the three outcomes of proposeKtxRelationshipCandidatesWithLlm exercised here:
// completed (valid and invalid proposals), skipped (no runtime) and failed (runtime throws).
describe('relationship LLM proposals', () => {
  it('maps valid structured FK proposals into review candidates with rationale evidence', async () => {
    const systemPromptMarker = 'You are helping KTX review possible SQL relationships';
    const fkRationale = 'Buyer reference values match customer identifiers.';
    const runtime = llmRuntime({
      pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.94, rationale: 'Unique customer identifier.' }],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'buyer_ref',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.88,
          rationale: fkRationale,
        },
      ],
    });

    const result = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: runtime,
    });

    expect(result.summary).toBe('completed');
    expect(result.llmCalls).toBe(1);
    expect(result.warnings).toEqual([]);
    expect(result.candidates).toHaveLength(1);
    expect(result.candidates[0]).toMatchObject({
      from: { tableId: 'orders', columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
      to: { tableId: 'customers', columnIds: ['customers.id'], columns: ['id'] },
      source: 'llm_proposal',
      status: 'review',
      evidence: {
        llmConfidence: 0.88,
        llmRationale: fkRationale,
        reasons: ['llm_proposal', 'llm_pk_proposal'],
      },
    });

    // The instructions must travel in `system`, never in the user prompt.
    expect(runtime.generateObject).toHaveBeenCalledWith(
      expect.objectContaining({
        role: 'candidateExtraction',
        system: expect.stringContaining(systemPromptMarker),
        prompt: expect.stringContaining('"tables"'),
      }),
    );
    const firstCallArgs = vi.mocked(runtime.generateObject).mock.calls[0]?.[0];
    expect(firstCallArgs?.prompt).not.toContain(systemPromptMarker);
  });

  it('skips when no runtime is configured', async () => {
    const result = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: null,
    });

    expect(result).toMatchObject({ candidates: [], llmCalls: 0, summary: 'skipped' });
    expect(result.warnings).toEqual([]);
  });

  it('returns recoverable warnings for invalid references and generation failures', async () => {
    // Case 1: the model references a column that is not in the schema.
    const invalidReference = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: llmRuntime({
        pkCandidates: [],
        fkCandidates: [
          {
            fromTable: 'orders',
            fromColumn: 'missing_column',
            toTable: 'customers',
            toColumn: 'id',
            confidence: 0.7,
            rationale: 'Invalid source column.',
          },
        ],
      }),
    });
    expect(invalidReference.candidates).toEqual([]);
    expect(invalidReference.summary).toBe('completed');
    expect(invalidReference.warnings[0]).toMatchObject({
      code: 'relationship_llm_invalid_reference',
      recoverable: true,
    });

    // Case 2: the runtime itself throws.
    const failed = await proposeKtxRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmRuntime: {
        generateText: vi.fn(),
        generateObject: vi.fn(async () => {
          throw new Error('model unavailable');
        }),
        runAgentLoop: vi.fn(),
      },
    });
    expect(failed).toMatchObject({ candidates: [], llmCalls: 1, summary: 'failed' });
    expect(failed.warnings[0]).toMatchObject({
      code: 'relationship_llm_proposal_failed',
      message: 'KTX relationship LLM proposal failed: model unavailable',
      recoverable: true,
    });
  });
});
|
||||
268
packages/cli/src/context/scan/relationship-llm-proposal.ts
Normal file
268
packages/cli/src/context/scan/relationship-llm-proposal.ts
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
import { z } from 'zod';
|
||||
import type { KtxLlmRuntimePort } from '../../context/llm/runtime-port.js';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import {
|
||||
normalizeKtxRelationshipName,
|
||||
type KtxRelationshipDiscoveryCandidate,
|
||||
} from './relationship-candidates.js';
|
||||
import type { KtxRelationshipColumnProfile, KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KtxScanEnrichmentSummary, KtxScanWarning, KtxTableRef } from './types.js';
|
||||
|
||||
/** Shape of a single primary-key suggestion inside the model's structured output. */
const pkCandidateProposalSchema = z.object({
  table: z.string(),
  column: z.string(),
  confidence: z.number(),
  rationale: z.string(),
});

/** Shape of a single foreign-key suggestion inside the model's structured output. */
const fkCandidateProposalSchema = z.object({
  fromTable: z.string(),
  fromColumn: z.string(),
  toTable: z.string(),
  toColumn: z.string(),
  confidence: z.number(),
  rationale: z.string(),
});

/** Structured output requested from the LLM: proposed primary keys plus proposed foreign keys. */
const relationshipLlmProposalSchema = z.object({
  pkCandidates: z.array(pkCandidateProposalSchema),
  fkCandidates: z.array(fkCandidateProposalSchema),
});

/** Parsed value produced by `relationshipLlmProposalSchema`. */
type KtxRelationshipLlmProposalOutput = z.infer<typeof relationshipLlmProposalSchema>;
|
||||
|
||||
/** Limits applied while building the LLM evidence packet and while filtering its proposals. */
interface KtxRelationshipLlmProposalSettings {
  /** Maximum number of enabled tables serialized into the evidence packet. */
  maxTablesPerBatch: number;

  /** Maximum number of columns serialized for each table. */
  maxColumnsPerTable: number;

  /** Maximum number of profiled sample values serialized for each column. */
  maxSampleValuesPerColumn: number;

  /** Foreign-key proposals with a confidence below this value are dropped. */
  minConfidence: number;
}
|
||||
|
||||
/** Arguments accepted by `proposeKtxRelationshipCandidatesWithLlm`. */
export interface ProposeKtxRelationshipCandidatesWithLlmInput {
  /** Connection identifier supplied by the caller. */
  connectionId: string;

  /** Enriched schema offered to the model and used to resolve the tables/columns it names. */
  schema: KtxEnrichedSchema;

  /** Profiling artifact that supplies row counts and per-column statistics for the evidence packet. */
  profile: KtxRelationshipProfileArtifact;

  /** Runtime used for the structured generation call; `null` skips the proposal step entirely. */
  llmRuntime: KtxLlmRuntimePort | null;

  /** Optional overrides layered on top of the default settings. */
  settings?: Partial<KtxRelationshipLlmProposalSettings>;
}
|
||||
|
||||
/** Outcome of one LLM relationship-proposal pass. */
export interface KtxRelationshipLlmProposalResult {
  /** Foreign-key candidates whose tables and columns were resolved against the schema. */
  candidates: KtxRelationshipDiscoveryCandidate[];

  /** Recoverable warnings: unresolved references, or the generation failure itself. */
  warnings: KtxScanWarning[];

  /** Number of generation calls attempted: 0 when no runtime was supplied, otherwise 1. */
  llmCalls: number;

  /** Outcome label; this module produces `'skipped'`, `'completed'`, or `'failed'`. */
  summary: KtxScanEnrichmentSummary['llmRelationshipValidation'];
}
|
||||
|
||||
/** Values used for every setting the caller does not override. */
const DEFAULT_SETTINGS: KtxRelationshipLlmProposalSettings = {
  // Evidence-packet size limits.
  maxTablesPerBatch: 40,
  maxColumnsPerTable: 80,
  maxSampleValuesPerColumn: 5,

  // Proposal filter threshold.
  minConfidence: 0.55,
};
|
||||
|
||||
/**
 * Layers caller overrides on top of `DEFAULT_SETTINGS`.
 *
 * Only overrides that are actually defined are applied. A plain object spread
 * would copy an explicit `undefined` (which `Partial<...>` permits) over the
 * default, which silently removes the slice caps and disables the
 * `minConfidence` filter downstream.
 *
 * @param settings - Optional partial overrides.
 * @returns A complete settings object with no `undefined` members.
 */
function mergeSettings(
  settings: Partial<KtxRelationshipLlmProposalSettings> | undefined,
): KtxRelationshipLlmProposalSettings {
  const merged: KtxRelationshipLlmProposalSettings = { ...DEFAULT_SETTINGS };
  if (!settings) {
    return merged;
  }

  for (const key of Object.keys(DEFAULT_SETTINGS) as (keyof KtxRelationshipLlmProposalSettings)[]) {
    const override = settings[key];
    if (override !== undefined) {
      merged[key] = override;
    }
  }
  return merged;
}
|
||||
|
||||
/** Clamps a confidence value into the range 0..1 and rounds it to three decimal places. */
function clampConfidence(value: number): number {
  const cappedAtOne = Math.min(1, value);
  const withinUnitRange = Math.max(0, cappedAtOne);
  return Number(withinUnitRange.toFixed(3));
}
|
||||
|
||||
/** Returns the first schema table whose `ref.name` equals `name` ignoring case, or `null`. */
function findTable(schema: KtxEnrichedSchema, name: string): KtxEnrichedTable | null {
  const wanted = name.toLowerCase();
  for (const candidate of schema.tables) {
    if (candidate.ref.name.toLowerCase() === wanted) {
      return candidate;
    }
  }
  return null;
}
|
||||
|
||||
/** Returns the first column of `table` whose name equals `name` ignoring case, or `null`. */
function findColumn(table: KtxEnrichedTable, name: string): KtxEnrichedColumn | null {
  const wanted = name.toLowerCase();
  for (const candidate of table.columns) {
    if (candidate.name.toLowerCase() === wanted) {
      return candidate;
    }
  }
  return null;
}
|
||||
|
||||
/** Builds the `<table name>.<column name>` key used to index `profile.columns`. */
function profileKey(table: KtxTableRef, column: KtxEnrichedColumn): string {
  const { name: tableName } = table;
  const { name: columnName } = column;
  return `${tableName}.${columnName}`;
}
|
||||
|
||||
/** Looks up the profiled statistics for one column; `null` when the profile has no entry for it. */
function profileForColumn(
  profile: KtxRelationshipProfileArtifact,
  table: KtxEnrichedTable,
  column: KtxEnrichedColumn,
): KtxRelationshipColumnProfile | null {
  const key = profileKey(table.ref, column);
  const entry = profile.columns[key];
  return entry ?? null;
}
|
||||
|
||||
/** Returns the profiled row count for `table` (matched by case-insensitive name), or `null`. */
function rowCountForTable(profile: KtxRelationshipProfileArtifact, table: KtxEnrichedTable): number | null {
  const wanted = table.ref.name.toLowerCase();
  const entry = profile.tables.find((item) => item.table.name.toLowerCase() === wanted);
  return entry?.rowCount ?? null;
}
|
||||
|
||||
/**
 * Assembles the compact, JSON-serializable description of the schema that is sent to the model.
 *
 * Only enabled tables are included, and tables, columns, and sample values are each
 * truncated to the configured limits. Property order is kept stable because the packet
 * is serialized verbatim into the prompt.
 */
function buildEvidencePacket(
  schema: KtxEnrichedSchema,
  profile: KtxRelationshipProfileArtifact,
  settings: KtxRelationshipLlmProposalSettings,
): Record<string, unknown> {
  const { maxTablesPerBatch, maxColumnsPerTable, maxSampleValuesPerColumn } = settings;

  // One column entry: declared metadata plus profiled statistics when available.
  const describeColumn = (table: KtxEnrichedTable, column: KtxEnrichedColumn) => {
    const stats = profileForColumn(profile, table, column);
    const profileSummary = stats
      ? {
          rowCount: stats.rowCount,
          nullCount: stats.nullCount,
          distinctCount: stats.distinctCount,
          uniquenessRatio: stats.uniquenessRatio,
          nullRate: stats.nullRate,
          sampleValues: stats.sampleValues.slice(0, maxSampleValuesPerColumn),
        }
      : null;

    return {
      name: column.name,
      nativeType: column.nativeType,
      normalizedType: column.normalizedType,
      dimensionType: column.dimensionType,
      nullable: column.nullable,
      declaredPrimaryKey: column.primaryKey,
      profile: profileSummary,
    };
  };

  // One table entry: identity, row count, and its (truncated) columns.
  const describeTable = (table: KtxEnrichedTable) => ({
    name: table.ref.name,
    catalog: table.ref.catalog,
    db: table.ref.db,
    rowCount: rowCountForTable(profile, table),
    columns: table.columns.slice(0, maxColumnsPerTable).map((column) => describeColumn(table, column)),
  });

  const includedTables = schema.tables.filter((table) => table.enabled).slice(0, maxTablesPerBatch);

  return {
    connectionId: schema.connectionId,
    sqlAvailable: profile.sqlAvailable,
    tables: includedTables.map((table) => describeTable(table)),
  };
}
|
||||
|
||||
/** Case-insensitive `<table>.<column>` key used to match FK targets against proposed primary keys. */
function pkProposalKey(table: string, column: string): string {
  return [table, column].map((part) => part.toLowerCase()).join('.');
}
|
||||
|
||||
/** Builds one side (source or target) of a single-column relationship candidate. */
function endpoint(table: KtxEnrichedTable, column: KtxEnrichedColumn) {
  const { id: tableId, ref } = table;
  const { id: columnId, name: columnName } = column;
  return {
    tableId,
    columnIds: [columnId],
    table: ref,
    columns: [columnName],
  };
}
|
||||
|
||||
/** Formats the candidate id as `<fromTableId>:(<fromColumnId>)-><toTableId>:(<toColumnId>)`. */
function relationshipId(
  fromTable: KtxEnrichedTable,
  fromColumn: KtxEnrichedColumn,
  toTable: KtxEnrichedTable,
  toColumn: KtxEnrichedColumn,
): string {
  const side = (table: KtxEnrichedTable, column: KtxEnrichedColumn): string => `${table.id}:(${column.id})`;
  return `${side(fromTable, fromColumn)}->${side(toTable, toColumn)}`;
}
|
||||
|
||||
/** Creates the recoverable warning emitted when a proposal names a table or column that cannot be resolved. */
function invalidReferenceWarning(message: string, metadata: Record<string, unknown>): KtxScanWarning {
  const warning: KtxScanWarning = {
    code: 'relationship_llm_invalid_reference',
    message,
    recoverable: true,
    metadata,
  };
  return warning;
}
|
||||
|
||||
/**
 * Converts the model's foreign-key proposals into review candidates.
 *
 * Proposals below `settings.minConfidence` are dropped silently. Proposals that name a
 * table or column missing from `schema` are dropped with a recoverable warning. A target
 * column that the model also proposed as a primary key gets a higher `targetKeyScore`
 * and an extra `llm_pk_proposal` reason.
 *
 * @returns The accepted candidates and the warnings collected along the way, in proposal order.
 */
function mapValidProposals(
  schema: KtxEnrichedSchema,
  output: KtxRelationshipLlmProposalOutput,
  settings: KtxRelationshipLlmProposalSettings,
): { candidates: KtxRelationshipDiscoveryCandidate[]; warnings: KtxScanWarning[] } {
  const proposedPrimaryKeys = new Set<string>();
  for (const pk of output.pkCandidates) {
    proposedPrimaryKeys.add(pkProposalKey(pk.table, pk.column));
  }

  const accepted: KtxRelationshipDiscoveryCandidate[] = [];
  const problems: KtxScanWarning[] = [];

  output.fkCandidates.forEach((proposal) => {
    if (proposal.confidence < settings.minConfidence) {
      return;
    }

    const sourceTable = findTable(schema, proposal.fromTable);
    const targetTable = findTable(schema, proposal.toTable);
    const sourceColumn = sourceTable ? findColumn(sourceTable, proposal.fromColumn) : null;
    const targetColumn = targetTable ? findColumn(targetTable, proposal.toColumn) : null;

    if (!sourceTable || !targetTable || !sourceColumn || !targetColumn) {
      problems.push(
        invalidReferenceWarning('KTX relationship LLM proposal referenced a table or column that is not in the schema.', {
          proposal,
        }),
      );
      return;
    }

    const confidence = clampConfidence(proposal.confidence);
    const targetIsProposedKey = proposedPrimaryKeys.has(pkProposalKey(targetTable.ref.name, targetColumn.name));

    accepted.push({
      id: relationshipId(sourceTable, sourceColumn, targetTable, targetColumn),
      from: endpoint(sourceTable, sourceColumn),
      to: endpoint(targetTable, targetColumn),
      source: 'llm_proposal',
      status: 'review',
      relationshipType: 'many_to_one',
      confidence,
      evidence: {
        sourceColumnBase: normalizeKtxRelationshipName(sourceColumn.name).singular,
        targetTableBase: normalizeKtxRelationshipName(targetTable.ref.name).singular,
        targetColumnBase: normalizeKtxRelationshipName(targetColumn.name).singular,
        targetKeyScore: targetIsProposedKey ? 0.88 : 0.68,
        nameScore: 0.45,
        reasons: targetIsProposedKey ? ['llm_proposal', 'llm_pk_proposal'] : ['llm_proposal'],
        llmConfidence: confidence,
        llmRationale: proposal.rationale,
      },
    });
  });

  return { candidates: accepted, warnings: problems };
}
|
||||
|
||||
/** Wraps a thrown value from the generation step in a recoverable scan warning. */
function generationFailureWarning(error: unknown): KtxScanWarning {
  let detail: string;
  if (error instanceof Error) {
    detail = error.message;
  } else {
    detail = String(error);
  }

  return {
    code: 'relationship_llm_proposal_failed',
    message: `KTX relationship LLM proposal failed: ${detail}`,
    recoverable: true,
  };
}
|
||||
|
||||
/**
 * Asks the configured LLM runtime to propose primary/foreign keys for the schema and maps
 * the answer into review candidates.
 *
 * - With no runtime, nothing is called and the result is `'skipped'`.
 * - If generation, parsing, or mapping throws, the error is converted into a recoverable
 *   warning and the result is `'failed'` (still counted as one LLM call).
 * - Otherwise the result is `'completed'`, possibly with invalid-reference warnings.
 */
export async function proposeKtxRelationshipCandidatesWithLlm(
  input: ProposeKtxRelationshipCandidatesWithLlmInput,
): Promise<KtxRelationshipLlmProposalResult> {
  const { llmRuntime, schema, profile } = input;
  if (!llmRuntime) {
    return { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' };
  }

  const settings = mergeSettings(input.settings);
  const evidence = buildEvidencePacket(schema, profile, settings);

  // Instructions go in the system message; the prompt carries only the serialized evidence.
  const systemLines = [
    'You are helping KTX review possible SQL relationships before validation.',
    'Use only the compact schema evidence. Propose likely primary keys and foreign keys for later SQL validation.',
    'Return structured output only; never assume a join is accepted.',
  ];
  const system = systemLines.join('\n');
  const prompt = JSON.stringify(evidence);

  try {
    const raw = await llmRuntime.generateObject<KtxRelationshipLlmProposalOutput, typeof relationshipLlmProposalSchema>({
      role: 'candidateExtraction',
      system,
      prompt,
      schema: relationshipLlmProposalSchema,
    });
    // Re-validate the runtime's answer before trusting its shape.
    const parsed = relationshipLlmProposalSchema.parse(raw);
    const { candidates, warnings } = mapValidProposals(schema, parsed, settings);
    return { candidates, warnings, llmCalls: 1, summary: 'completed' };
  } catch (error) {
    return {
      candidates: [],
      warnings: [generationFailureWarning(error)],
      llmCalls: 1,
      summary: 'failed',
    };
  }
}
|
||||
151
packages/cli/src/context/scan/relationship-locality.test.ts
Normal file
151
packages/cli/src/context/scan/relationship-locality.test.ts
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import { localCandidateTables } from './relationship-locality.js';
|
||||
|
||||
/** Test fixture: builds an enriched column, filling every field not given in `options` with a default. */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KtxEnrichedColumn> = {},
): KtxEnrichedColumn {
  const {
    tableRef,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;

  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
|
||||
|
||||
/** Test fixture: builds an enabled table in `public` and re-parents the given columns onto it. */
function table(id: string, name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: id, tableRef: ref }));
  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
describe('relationship locality', () => {
  /** Names of the ranked tables, in ranking order. */
  const tableNames = (items: ReturnType<typeof localCandidateTables>): string[] =>
    items.map((item) => item.table.ref.name);

  it('ranks the referenced parent table ahead of the child table for id-like source columns', () => {
    const artistTable = table('artist-id', 'Artist', [column('artist-id', 'artist-pk', 'ArtistId')]);
    const albumTable = table('album-id', 'Album', [
      column('album-id', 'album-pk', 'AlbumId'),
      column('album-id', 'artist-fk', 'ArtistId'),
    ]);
    const invoiceTable = table('invoice-id', 'Invoice', [column('invoice-id', 'invoice-pk', 'InvoiceId')]);

    const result = localCandidateTables({
      childTable: albumTable,
      childColumn: albumTable.columns[1]!,
      parentTables: [albumTable, invoiceTable, artistTable],
      maxParentTables: 1,
    });

    expect(tableNames(result)).toEqual(['Artist']);
    expect(result[0]).toMatchObject({
      score: expect.any(Number),
      tokenScore: expect.any(Number),
      embeddingScore: 0,
      reasons: expect.arrayContaining(['column_table_token_overlap']),
    });
  });

  it('uses singular and plural variants so plan_code can rank stg_plans', () => {
    const planTable = table('plans-id', 'stg_plans', [column('plans-id', 'plan-code', 'plan_code')]);
    const segmentTable = table('segments-id', 'mart_account_segments', [
      column('segments-id', 'current-plan-code', 'current_plan_code', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
      }),
    ]);
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);

    const result = localCandidateTables({
      childTable: segmentTable,
      childColumn: segmentTable.columns[0]!,
      parentTables: [accountTable, segmentTable, planTable],
      maxParentTables: 1,
    });

    expect(tableNames(result)).toEqual(['stg_plans']);
    expect(result[0]?.tokenScore).toBeGreaterThan(0);
  });

  it('returns all tables when the schema is smaller than the default locality cap', () => {
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoiceTable = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    const result = localCandidateTables({
      childTable: invoiceTable,
      childColumn: invoiceTable.columns[1]!,
      parentTables: [invoiceTable, accountTable],
    });

    expect(tableNames(result).sort()).toEqual(['accounts', 'invoices']);
  });

  it('supports an explicit zero cap for deterministic tests', () => {
    const accountTable = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoiceTable = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    const result = localCandidateTables({
      childTable: invoiceTable,
      childColumn: invoiceTable.columns[1]!,
      parentTables: [invoiceTable, accountTable],
      maxParentTables: 0,
    });

    expect(result).toEqual([]);
  });

  it('uses parent-column embeddings when token locality is weak', () => {
    const customerTable = table('customers-id', 'customers', [
      column('customers-id', 'customers-id-col', 'id', { embedding: [1, 0, 0] }),
      column('customers-id', 'customers-name-col', 'name', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
        embedding: [0, 1, 0],
      }),
    ]);
    const orderTable = table('orders-id', 'orders', [
      column('orders-id', 'orders-id-col', 'id', { embedding: [0, 0, 1] }),
      column('orders-id', 'buyer-ref-col', 'buyer_ref', { embedding: [0.995, 0.005, 0] }),
    ]);
    const invoiceTable = table('invoices-id', 'invoices', [column('invoices-id', 'invoice-id', 'id')]);

    const result = localCandidateTables({
      childTable: orderTable,
      childColumn: orderTable.columns[1]!,
      parentTables: [invoiceTable, customerTable],
      maxParentTables: 1,
    });

    expect(tableNames(result)).toEqual(['customers']);
    expect(result[0]).toMatchObject({
      embeddingScore: expect.any(Number),
      reasons: expect.arrayContaining(['embedding_similarity']),
    });
    expect(result[0]!.embeddingScore).toBeGreaterThan(0.99);
  });
});
|
||||
183
packages/cli/src/context/scan/relationship-locality.ts
Normal file
183
packages/cli/src/context/scan/relationship-locality.ts
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
import type { KtxEnrichedColumn, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import { normalizeKtxRelationshipName, tokenizeKtxRelationshipName } from './relationship-name-similarity.js';
|
||||
|
||||
/** A candidate parent table together with the locality scores that ranked it. */
export interface KtxRelationshipLocalityCandidateTable {
  /** The candidate parent table. */
  table: KtxEnrichedTable;

  /** Combined ranking score in 0..1, rounded to three decimals. */
  score: number;

  /** Name-token overlap component of the score. */
  tokenScore: number;

  /** Best embedding similarity between the child column and any column of the candidate. */
  embeddingScore: number;

  /** Labels naming which signals contributed; never empty. */
  reasons: string[];
}
|
||||
|
||||
/** Arguments accepted by `localCandidateTables`. */
export interface LocalKtxRelationshipCandidateTablesInput {
  /** Table that owns the column being matched. */
  childTable: KtxEnrichedTable;

  /** Column for which likely parent tables are ranked. */
  childColumn: KtxEnrichedColumn;

  /** Tables to rank; may include `childTable` itself. */
  parentTables: readonly KtxEnrichedTable[];

  /** Maximum number of tables returned; a non-finite or non-positive value yields an empty result. */
  maxParentTables?: number;
}
|
||||
|
||||
/** Cap used when the caller does not pass `maxParentTables`. */
const DEFAULT_MAX_PARENT_TABLES = 20;

/** Key-like tokens removed from a child column's tokens before matching, as long as other tokens remain. */
const RELATIONSHIP_SUFFIX_TOKENS = new Set([
  'id',
  'ids',
  'key',
  'keys',
  'code',
  'codes',
  'uuid',
  'uuids',
]);

/** Module-level memo of computed token variants, keyed by the raw name. */
const normalizedTokenVariantsCache = new Map<string, string[]>();
|
||||
|
||||
/** Clamps a score into the range 0..1 and rounds it to three decimal places. */
function roundedScore(value: number): number {
  const cappedAtOne = Math.min(1, value);
  const withinUnitRange = Math.max(0, cappedAtOne);
  return Number(withinUnitRange.toFixed(3));
}
|
||||
|
||||
/**
 * Returns the distinct tokens of `name` together with the tokens of its singular and
 * plural forms. Results are memoized per raw name, so the returned array is shared
 * between calls and should be treated as read-only.
 */
function normalizedTokenVariants(name: string): string[] {
  const memoized = normalizedTokenVariantsCache.get(name);
  if (memoized !== undefined) {
    return memoized;
  }

  const { tokens, singular, plural } = normalizeKtxRelationshipName(name);
  const distinct = new Set<string>([
    ...tokens,
    ...tokenizeKtxRelationshipName(singular),
    ...tokenizeKtxRelationshipName(plural),
  ]);
  const variants = [...distinct].filter(Boolean);

  normalizedTokenVariantsCache.set(name, variants);
  return variants;
}
|
||||
|
||||
/**
 * Tokens of the child column used for matching: key-like suffix tokens are removed,
 * unless that would leave nothing, in which case all tokens are kept.
 */
function childColumnLocalityTokens(column: KtxEnrichedColumn): string[] {
  const allTokens = normalizedTokenVariants(column.name);
  const meaningful = allTokens.filter((token) => !RELATIONSHIP_SUFFIX_TOKENS.has(token));
  if (meaningful.length === 0) {
    return allTokens;
  }
  return meaningful;
}
|
||||
|
||||
/** Returns the non-empty values with duplicates removed, preserving first-seen order. */
function uniqueTokens(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.length > 0) {
      seen.add(value);
    }
  }
  return [...seen];
}
|
||||
|
||||
/** Jaccard index (shared / union) of two token lists treated as sets; 0 when either list is empty. */
function jaccard(left: readonly string[], right: readonly string[]): number {
  if (left.length === 0 || right.length === 0) {
    return 0;
  }

  const leftSet = new Set(left);
  const rightSet = new Set(right);

  let shared = 0;
  for (const token of leftSet) {
    if (rightSet.has(token)) {
      shared += 1;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const total = leftSet.size + rightSet.size - shared;
  return total === 0 ? 0 : shared / total;
}
|
||||
|
||||
/**
 * Cosine similarity of two vectors.
 *
 * Returns 0 when either vector is missing or empty, when the lengths differ, or when
 * either vector has zero magnitude.
 */
function cosineSimilarity(left: readonly number[] | null, right: readonly number[] | null): number {
  if (!left || !right) {
    return 0;
  }
  const size = left.length;
  if (size === 0 || size !== right.length) {
    return 0;
  }

  let dot = 0;
  let leftSquares = 0;
  let rightSquares = 0;
  let index = 0;
  while (index < size) {
    const a = left[index] ?? 0;
    const b = right[index] ?? 0;
    dot += a * b;
    leftSquares += a * a;
    rightSquares += b * b;
    index += 1;
  }

  if (leftSquares === 0 || rightSquares === 0) {
    return 0;
  }

  const denominator = Math.sqrt(leftSquares) * Math.sqrt(rightSquares);
  return dot / denominator;
}
|
||||
|
||||
/**
 * Highest cosine similarity between the child column's embedding and the embedding of any
 * column in `parentTable`, floored at 0. Returns 0 when the child column has no embedding.
 */
function parentEmbeddingScore(childColumn: KtxEnrichedColumn, parentTable: KtxEnrichedTable): number {
  const hasChildEmbedding = Array.isArray(childColumn.embedding) && childColumn.embedding.length > 0;
  if (!hasChildEmbedding) {
    return 0;
  }

  return parentTable.columns.reduce(
    (best, parentColumn) => Math.max(best, cosineSimilarity(childColumn.embedding, parentColumn.embedding)),
    0,
  );
}
|
||||
|
||||
/**
 * Name-token overlap between the child column and a candidate parent table.
 *
 * For the child's own table only the column tokens are compared. For any other table the
 * result is the larger of the column-only overlap and 0.6 × the overlap of the combined
 * child table + column tokens.
 */
function tableTokenScore(input: {
  childTableId: string;
  childTableTokens: readonly string[];
  childColumnTokens: readonly string[];
  parentTable: KtxEnrichedTable;
}): number {
  const { childTableId, childTableTokens, childColumnTokens, parentTable } = input;

  const parentTokens = normalizedTokenVariants(parentTable.ref.name);
  if (parentTokens.length === 0) {
    return 0;
  }

  const columnOnly = jaccard(childColumnTokens, parentTokens);
  if (parentTable.id === childTableId) {
    return columnOnly;
  }

  const combinedTokens = uniqueTokens([...childTableTokens, ...childColumnTokens]);
  const combined = jaccard(combinedTokens, parentTokens);
  return Math.max(columnOnly, combined * 0.6);
}
|
||||
|
||||
/**
 * Scores one candidate parent table for the child column.
 *
 * Without an embedding signal the score is the token score. With one, it is the largest of
 * the token score, an 80/20 token/embedding blend, and 0.65 × the embedding score.
 * `reasons` lists the signals that were non-zero, or `locality_tie_breaker` when none were.
 */
function localityScore(input: {
  childTable: KtxEnrichedTable;
  childTableId: string;
  childTableTokens: readonly string[];
  childColumn: KtxEnrichedColumn;
  childColumnTokens: readonly string[];
  parentTable: KtxEnrichedTable;
}): Omit<KtxRelationshipLocalityCandidateTable, 'table'> {
  const tokenScore = roundedScore(tableTokenScore(input));
  const embeddingScore = roundedScore(parentEmbeddingScore(input.childColumn, input.parentTable));

  let score = tokenScore;
  if (embeddingScore > 0) {
    const blended = tokenScore * 0.8 + embeddingScore * 0.2;
    const embeddingOnly = embeddingScore * 0.65;
    score = roundedScore(Math.max(tokenScore, blended, embeddingOnly));
  }

  const reasons: string[] = [
    ...(tokenScore > 0 ? ['column_table_token_overlap'] : []),
    ...(embeddingScore > 0 ? ['embedding_similarity'] : []),
  ];
  if (reasons.length === 0) {
    reasons.push('locality_tie_breaker');
  }

  return { score, tokenScore, embeddingScore, reasons };
}
|
||||
|
||||
/**
 * Ranks `parentTables` by how likely each is to be the parent of `childColumn` and returns
 * the top `maxParentTables` (default 20).
 *
 * Ordering: combined score, then token score, then embedding score (all descending), then
 * table name and table id (ascending) so the result is deterministic. A non-finite or
 * non-positive limit returns an empty list.
 */
export function localCandidateTables(
  input: LocalKtxRelationshipCandidateTablesInput,
): KtxRelationshipLocalityCandidateTable[] {
  const { childTable, childColumn, parentTables } = input;

  const limit = input.maxParentTables ?? DEFAULT_MAX_PARENT_TABLES;
  if (!(Number.isFinite(limit) && limit > 0)) {
    return [];
  }

  const childTableTokens = normalizedTokenVariants(childTable.ref.name);
  const childColumnTokens = childColumnLocalityTokens(childColumn);

  // Comparators applied in order; the first non-zero result decides.
  type Ranked = KtxRelationshipLocalityCandidateTable;
  const tieBreakers: ReadonlyArray<(left: Ranked, right: Ranked) => number> = [
    (left, right) => right.score - left.score,
    (left, right) => right.tokenScore - left.tokenScore,
    (left, right) => right.embeddingScore - left.embeddingScore,
    (left, right) => left.table.ref.name.localeCompare(right.table.ref.name),
    (left, right) => left.table.id.localeCompare(right.table.id),
  ];
  const byLocality = (left: Ranked, right: Ranked): number => {
    for (const compare of tieBreakers) {
      const delta = compare(left, right);
      if (delta) {
        return delta;
      }
    }
    return 0;
  };

  const scored = parentTables.map((parentTable) => {
    const locality = localityScore({
      childTable,
      childTableId: childTable.id,
      childTableTokens,
      childColumn,
      childColumnTokens,
      parentTable,
    });
    return { table: parentTable, ...locality };
  });

  scored.sort(byLocality);
  return scored.slice(0, Math.floor(limit));
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
normalizeKtxRelationshipName,
|
||||
pluralizeKtxRelationshipToken,
|
||||
singularizeKtxRelationshipToken,
|
||||
tokenSimilarity,
|
||||
tokenizeKtxRelationshipName,
|
||||
} from './relationship-name-similarity.js';
|
||||
|
||||
describe('relationship name similarity', () => {
  /** Asserts that each input normalizes to an object matching the paired expectation. */
  const expectNormalized = (cases: ReadonlyArray<readonly [string, Record<string, unknown>]>): void => {
    for (const [input, expected] of cases) {
      expect(normalizeKtxRelationshipName(input)).toMatchObject(expected);
    }
  };

  it('tokenizes common warehouse naming styles', () => {
    expectNormalized([
      ['AlbumId', { normalized: 'album_id', singular: 'album_id', plural: 'album_ids', tokens: ['album', 'id'] }],
      ['artistID', { normalized: 'artist_id', tokens: ['artist', 'id'] }],
      [
        'SalesLT.CustomerID',
        {
          normalized: 'sales_lt_customer_id',
          singular: 'sales_lt_customer_id',
          tokens: ['sales', 'lt', 'customer', 'id'],
        },
      ],
      [
        'SCREAMING_CUSTOMER_UUID',
        { normalized: 'screaming_customer_uuid', tokens: ['screaming', 'customer', 'uuid'] },
      ],
      ['billing-account-key', { normalized: 'billing_account_key', tokens: ['billing', 'account', 'key'] }],
    ]);
  });

  it('removes only leading warehouse layer prefixes', () => {
    expectNormalized([
      [
        'mart__Sales_Accounts',
        {
          normalized: 'sales_accounts',
          singular: 'sales_account',
          plural: 'sales_accounts',
          tokens: ['sales', 'accounts'],
        },
      ],
      ['dim_users', { normalized: 'users', singular: 'user', plural: 'users', tokens: ['users'] }],
      ['customer_dim_id', { normalized: 'customer_dim_id', tokens: ['customer', 'dim', 'id'] }],
    ]);
  });

  it('folds accents and preserves non-suffix trailing s words', () => {
    expectNormalized([['KundénID', { normalized: 'kunden_id', tokens: ['kunden', 'id'] }]]);

    const singularCases: ReadonlyArray<readonly [string, string]> = [
      ['address', 'address'],
      ['addresses', 'address'],
      ['status', 'status'],
    ];
    for (const [input, expected] of singularCases) {
      expect(singularizeKtxRelationshipToken(input)).toBe(expected);
    }

    const pluralCases: ReadonlyArray<readonly [string, string]> = [
      ['address', 'addresses'],
      ['company', 'companies'],
    ];
    for (const [input, expected] of pluralCases) {
      expect(pluralizeKtxRelationshipToken(input)).toBe(expected);
    }
  });

  it('returns deterministic tokens for direct tokenization calls', () => {
    const httpTokens = tokenizeKtxRelationshipName('HTTPResponseCode');
    const mixedTokens = tokenizeKtxRelationshipName('customer2AddressID');

    expect(httpTokens).toEqual(['http', 'response', 'code']);
    expect(mixedTokens).toEqual(['customer', '2', 'address', 'id']);
  });

  it('scores token overlap and ordered suffix similarity', () => {
    expect(tokenSimilarity('artist_id', 'artist_id')).toBe(1);
    expect(tokenSimilarity('Album.ArtistId', 'ArtistID')).toBeGreaterThanOrEqual(0.74);

    const relatedScore = tokenSimilarity('customer_account_id', 'account_id');
    const unrelatedScore = tokenSimilarity('customer_account_id', 'invoice_id');
    expect(relatedScore).toBeGreaterThan(unrelatedScore);

    expect(tokenSimilarity('', 'artist')).toBe(0);
  });
});
|
||||
153
packages/cli/src/context/scan/relationship-name-similarity.ts
Normal file
153
packages/cli/src/context/scan/relationship-name-similarity.ts
Normal file
|
|
@ -0,0 +1,153 @@
|
|||
/** Result of normalizing a table or column name for relationship matching. */
export interface KtxRelationshipNormalizedName {
  /** The name exactly as it was passed in. */
  raw: string;

  /** The tokens joined with `_`. */
  normalized: string;

  /** `normalized` with its last token singularized. */
  singular: string;

  /** The singular form with its last token pluralized. */
  plural: string;

  /** Lower-cased tokens in order, after any leading warehouse-layer prefix is removed. */
  tokens: string[];
}
|
||||
|
||||
/**
 * Accepted inputs for token comparison: a raw name, a list of tokens, or an
 * already-normalized name.
 *
 * @internal
 */
export type KtxRelationshipTokenInput = string | readonly string[] | KtxRelationshipNormalizedName;

/** Warehouse-layer tokens that are dropped when they appear as the first token of a name. */
const WAREHOUSE_LAYER_PREFIXES = new Set([
  'stg',
  'stage',
  'staging',
  'dim',
  'fct',
  'fact',
  'int',
  'mart',
]);
|
||||
|
||||
/**
 * Inserts `_` at camel-case, acronym, and letter/digit boundaries.
 * The patterns are applied in order, each one to the output of the previous.
 */
function splitCaseBoundaries(value: string): string {
  const boundaryPatterns: readonly RegExp[] = [
    // Acronym followed by a capitalized word: "HTTPResponse" -> "HTTP_Response".
    /([\p{Lu}]+)([\p{Lu}][\p{Ll}])/gu,
    // Lower-case letter or digit followed by an upper-case letter: "artistID" -> "artist_ID".
    /([\p{Ll}\p{N}])([\p{Lu}])/gu,
    // Letter followed by a digit.
    /(\p{L})(\p{N})/gu,
    // Digit followed by a letter.
    /(\p{N})(\p{L})/gu,
  ];

  return boundaryPatterns.reduce((text, pattern) => text.replace(pattern, '$1_$2'), value);
}
|
||||
|
||||
/**
 * Strips combining marks after NFKD decomposition, then spells out three letters that
 * do not decompose: ß -> ss, æ -> ae, œ -> oe (matched case-insensitively).
 */
function foldAccents(value: string): string {
  const withoutMarks = value.normalize('NFKD').replace(/\p{Mark}+/gu, '');

  const spelledOut: ReadonlyArray<readonly [RegExp, string]> = [
    [/ß/giu, 'ss'],
    [/æ/giu, 'ae'],
    [/œ/giu, 'oe'],
  ];

  let folded = withoutMarks;
  for (const [pattern, replacement] of spelledOut) {
    folded = folded.replace(pattern, replacement);
  }
  return folded;
}
|
||||
|
||||
/**
 * Heuristic English singularizer for a single lower-case token.
 *
 * Rules are tried in order: tokens of one or two characters are returned unchanged;
 * `-ies` -> `-y`; `-ches/-shes/-sses/-xes/-zes` drop `es`; `-ves` -> `-f`; a trailing `s`
 * is dropped unless the token ends in `ss`, `us`, or `is`.
 */
export function singularizeKtxRelationshipToken(value: string): string {
  const length = value.length;
  if (length <= 2) {
    return value;
  }

  if (length > 3 && value.endsWith('ies')) {
    return `${value.slice(0, -3)}y`;
  }

  if (/(ches|shes|sses|xes|zes)$/u.test(value)) {
    return value.slice(0, -2);
  }

  if (length > 4 && value.endsWith('ves')) {
    return `${value.slice(0, -3)}f`;
  }

  const keepsTrailingS = /(ss|us|is)$/u.test(value);
  if (value.endsWith('s') && !keepsTrailingS) {
    return value.slice(0, -1);
  }

  return value;
}
|
||||
|
||||
/**
 * Heuristic English pluralizer for a single lower-case token.
 *
 * - consonant + `y` -> `-ies` (`company` -> `companies`)
 * - vowel + `y` -> `-ys` (`key` -> `keys`, `day` -> `days`); rewriting these to `-ies`
 *   produced non-words such as `keies`
 * - `-s/-x/-z/-ch/-sh` -> append `es`
 * - otherwise append `s`
 *
 * @param value - Token to pluralize; expected to be lower-case already.
 * @returns The pluralized token.
 */
export function pluralizeKtxRelationshipToken(value: string): string {
  const endsWithVowelY = /[aeiou]y$/u.test(value);
  if (value.endsWith('y') && !endsWithVowelY) {
    return `${value.slice(0, -1)}ies`;
  }
  if (/(s|x|z|ch|sh)$/u.test(value)) {
    return `${value}es`;
  }
  return `${value}s`;
}
|
||||
|
||||
/** Returns a copy of `tokens` with only the last (non-empty) token singularized. */
function singularizeTokens(tokens: readonly string[]): string[] {
  const lastIndex = tokens.length - 1;
  return tokens.map((token, index) =>
    index === lastIndex && token ? singularizeKtxRelationshipToken(token) : token,
  );
}
|
||||
|
||||
/** Returns a copy of `tokens` with only the last (non-empty) token pluralized. */
function pluralizeTokens(tokens: readonly string[]): string[] {
  const lastIndex = tokens.length - 1;
  return tokens.map((token, index) =>
    index === lastIndex && token ? pluralizeKtxRelationshipToken(token) : token,
  );
}
|
||||
|
||||
/**
 * Splits a table or column name into lower-case tokens.
 *
 * The name is trimmed, accent-folded, split at case and letter/digit boundaries, and every
 * run of non-alphanumeric characters is treated as a separator. A leading warehouse-layer
 * token (`stg`, `dim`, `fct`, `mart`, ...) is removed, but only when other tokens follow
 * it. A name that consists solely of such a word (for example a `stage` column or a `fact`
 * table) keeps its single token instead of collapsing to an empty token list.
 *
 * @param name - Raw identifier in any common naming style.
 * @returns The tokens in order; empty only when the name has no letters or digits.
 */
export function tokenizeKtxRelationshipName(name: string): string[] {
  const boundarySeparated = splitCaseBoundaries(foldAccents(name.trim()));
  const tokens = boundarySeparated
    .toLowerCase()
    .replace(/[^\p{L}\p{N}]+/gu, '_')
    .replace(/^_+|_+$/gu, '')
    .split('_')
    .filter(Boolean);

  const [first, ...rest] = tokens;
  // Strip the layer prefix only when something meaningful remains after it.
  if (first !== undefined && rest.length > 0 && WAREHOUSE_LAYER_PREFIXES.has(first)) {
    return rest;
  }
  return tokens;
}
|
||||
|
||||
/**
 * Tokenizes `name` and derives its `_`-joined normalized, singular, and plural forms.
 * The plural form is built from the singular tokens, so both differ only in the last token.
 */
export function normalizeKtxRelationshipName(name: string): KtxRelationshipNormalizedName {
  const joinTokens = (parts: readonly string[]): string => parts.join('_');

  const tokens = tokenizeKtxRelationshipName(name);
  const singularTokens = singularizeTokens(tokens);
  const pluralTokens = pluralizeTokens(singularTokens);

  return {
    raw: name,
    normalized: joinTokens(tokens),
    singular: joinTokens(singularTokens),
    plural: joinTokens(pluralTokens),
    tokens,
  };
}
|
||||
|
||||
/**
 * Resolves any supported token input to a token list: strings are tokenized, normalized
 * names contribute their `tokens`, and token arrays are normalized element by element
 * with empty results removed.
 */
function tokensFromInput(input: KtxRelationshipTokenInput): string[] {
  if (typeof input === 'string') {
    return tokenizeKtxRelationshipName(input);
  }

  if (!('tokens' in input)) {
    const normalizedTokens = input.map((token) => normalizeKtxRelationshipName(token).normalized);
    return normalizedTokens.filter(Boolean);
  }

  return input.tokens;
}
|
||||
|
||||
/** Counts how many trailing elements the two lists share, comparing from the end backwards. */
function longestCommonSuffixLength(left: readonly string[], right: readonly string[]): number {
  const maxShared = Math.min(left.length, right.length);

  let matched = 0;
  for (let offset = 1; offset <= maxShared; offset += 1) {
    if (left[left.length - offset] !== right[right.length - offset]) {
      break;
    }
    matched = offset;
  }
  return matched;
}
|
||||
|
||||
/** Clamps a score into the range 0..1 and rounds it to three decimal places. */
function roundedScore(value: number): number {
  const capped = Math.min(1, value);
  const clamped = Math.max(0, capped);
  return Number(clamped.toFixed(3));
}
|
||||
|
||||
/**
 * Scores how alike two names are, from 0 to 1 (rounded to three decimals).
 *
 * The score blends the Jaccard overlap of the distinct tokens (weight 0.75)
 * with the shared-suffix length relative to the shorter token list
 * (weight 0.25). If either side has no tokens the score is 0.
 *
 * @internal
 */
export function tokenSimilarity(leftInput: KtxRelationshipTokenInput, rightInput: KtxRelationshipTokenInput): number {
  const leftTokens = tokensFromInput(leftInput);
  const rightTokens = tokensFromInput(rightInput);
  if (leftTokens.length === 0 || rightTokens.length === 0) {
    return 0;
  }

  const uniqueLeft = new Set(leftTokens);
  const uniqueRight = new Set(rightTokens);
  let shared = 0;
  uniqueLeft.forEach((token) => {
    if (uniqueRight.has(token)) {
      shared += 1;
    }
  });
  // |A ∪ B| = |A| + |B| − |A ∩ B|
  const unionSize = uniqueLeft.size + uniqueRight.size - shared;
  const jaccard = unionSize === 0 ? 0 : shared / unionSize;

  const shorterLength = Math.min(leftTokens.length, rightTokens.length);
  const suffixScore = longestCommonSuffixLength(leftTokens, rightTokens) / shorterLength;

  return roundedScore(jaccard * 0.75 + suffixScore * 0.25);
}
|
||||
354
packages/cli/src/context/scan/relationship-profiling.test.ts
Normal file
354
packages/cli/src/context/scan/relationship-profiling.test.ts
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
import { readFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import { snapshotToKtxEnrichedSchema } from './local-enrichment.js';
|
||||
import { loadKtxRelationshipBenchmarkFixture, maskKtxRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
|
||||
import {
|
||||
createKtxRelationshipProfileCache,
|
||||
formatKtxRelationshipTableRef,
|
||||
profileKtxRelationshipSchema,
|
||||
quoteKtxRelationshipIdentifier,
|
||||
} from './relationship-profiling.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from './types.js';
|
||||
|
||||
/**
 * Test executor backed by a throwaway in-memory SQLite database.
 * `queryCount` records how many statements were run through `executeReadOnly`.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Runs `input.sql` and reshapes the row objects into a header list plus
   * positional rows. Headers come from the first row's keys, so an empty
   * result has no headers.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const positionalRows = records.map((record) => headers.map((header) => record[header]));
    return Promise.resolve({
      headers,
      rows: positionalRows,
      totalRows: records.length,
      rowCount: records.length,
    });
  }

  /** Releases the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Test executor over an existing SQLite file, opened read-only.
 * `queryCount` records how many statements were run through `executeReadOnly`.
 */
class FileSqliteExecutor {
  readonly db: Database.Database;
  queryCount = 0;

  /** @param dataPath Path to a database file that must already exist. */
  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Runs `input.sql` and reshapes the row objects into a header list plus
   * positional rows. Headers come from the first row's keys, so an empty
   * result has no headers.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const positionalRows = records.map((record) => headers.map((header) => record[header]));
    return Promise.resolve({
      headers,
      rows: positionalRows,
      totalRows: records.length,
      rowCount: records.length,
    });
  }

  /** Releases the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Builds an enriched-column fixture.
 *
 * Defaults describe a nullable, non-key INTEGER column with no descriptions,
 * embedding, samples or cardinality. Any own property of `overrides` replaces
 * the matching default.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const defaults: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds an enabled enriched-table fixture named `name`.
 * Each column is copied and re-pointed at this table (`tableId` / `tableRef`),
 * whatever table it was originally created for.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/** Wraps tables in a schema fixture for connection `warehouse` with no relationships. */
function schema(tables: KtxEnrichedTable[]): KtxEnrichedSchema {
  const fixture: KtxEnrichedSchema = {
    connectionId: 'warehouse',
    tables,
    relationships: [],
  };
  return fixture;
}
|
||||
|
||||
describe('relationship profiling', () => {
  // Shared handle: every test that opens an in-memory database assigns it here so that
  // afterEach releases it even when an assertion throws part-way through the test.
  let executor: InMemorySqliteExecutor | null = null;

  afterEach(() => {
    executor?.close();
    executor = null;
  });

  // Guards the implementation shape by inspecting the module source text directly.
  it('keeps profiling on the batched table path', async () => {
    const source = await readFile(new URL('relationship-profiling.ts', import.meta.url), 'utf-8');

    expect(source).not.toMatch(new RegExp('queryColumn' + 'Profile'));
    expect(source).not.toMatch(/for \(const column of table\.columns\)[\s\S]*executeReadOnly/);
    expect(source).toMatch(/queryTableProfile/);
    expect(source).toMatch(/UNION ALL/);
  });

  it('quotes identifiers and formats table refs for supported local SQL drivers', () => {
    expect(quoteKtxRelationshipIdentifier('sqlite', 'odd"name')).toBe('"odd""name"');
    expect(quoteKtxRelationshipIdentifier('mysql', 'odd`name')).toBe('`odd``name`');
    expect(quoteKtxRelationshipIdentifier('sqlserver', 'odd]name')).toBe('[odd]]name]');
    expect(formatKtxRelationshipTableRef('sqlite', { catalog: null, db: null, name: 'accounts' })).toBe('"accounts"');
    expect(formatKtxRelationshipTableRef('postgres', { catalog: null, db: 'analytics', name: 'accounts' })).toBe(
      '"analytics"."accounts"',
    );
  });

  it('profiles row count, null rate, uniqueness, sample values, and text lengths', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
      INSERT INTO accounts (id, code, parent_id) VALUES
        (1, 'A-1', NULL),
        (2, 'B-2', 1),
        (3, 'C-3', 1),
        (4, 'C-3', 2);
    `);

    const result = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { primaryKey: false, nullable: false }),
          column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
          column('accounts', 'parent_id'),
        ]),
      ]),
      executor,
      ctx: { runId: 'profile-test' },
      sampleValuesPerColumn: 3,
    });

    expect(result.sqlAvailable).toBe(true);
    expect(result.queryCount).toBe(1);
    expect(executor.queryCount).toBe(1);
    expect(result.tables).toHaveLength(1);
    expect(result.tables[0]).toMatchObject({ table: { name: 'accounts' }, rowCount: 4 });
    expect(result.columns['accounts.id']).toMatchObject({
      table: { name: 'accounts' },
      column: 'id',
      rowCount: 4,
      nullCount: 0,
      distinctCount: 4,
      uniquenessRatio: 1,
      nullRate: 0,
      minTextLength: 1,
      maxTextLength: 1,
    });
    expect(result.columns['accounts.code']).toMatchObject({
      distinctCount: 3,
      uniquenessRatio: 0.75,
      sampleValues: ['C-3', 'A-1', 'B-2'],
      minTextLength: 3,
      maxTextLength: 3,
    });
    expect(result.columns['accounts.parent_id']).toMatchObject({
      nullCount: 1,
      distinctCount: 2,
      uniquenessRatio: 0.5,
      nullRate: 0.25,
    });
  });

  it('profiles each enabled table with one read-only SQL query', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, code, parent_id) VALUES
        (1, 'A-1', NULL),
        (2, 'B-2', 1),
        (3, 'C-3', 1),
        (4, 'C-3', 2);
      INSERT INTO users (id, account_id) VALUES
        (10, 1),
        (11, 1),
        (12, 2);
    `);

    const result = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { nullable: false }),
          column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
          column('accounts', 'parent_id'),
        ]),
        table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id')]),
      ]),
      executor,
      ctx: { runId: 'profile-batched-query-count' },
      sampleValuesPerColumn: 3,
    });

    expect(result.sqlAvailable).toBe(true);
    expect(result.queryCount).toBe(2);
    expect(executor.queryCount).toBe(2);
    expect(result.tables).toEqual([
      { table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 },
      { table: { catalog: null, db: null, name: 'users' }, rowCount: 3 },
    ]);
    expect(result.columns['accounts.code']).toMatchObject({
      distinctCount: 3,
      uniquenessRatio: 0.75,
      sampleValues: ['C-3', 'A-1', 'B-2'],
    });
    expect(result.columns['users.account_id']).toMatchObject({
      rowCount: 3,
      nullCount: 0,
      distinctCount: 2,
      uniquenessRatio: 2 / 3,
    });
  });

  it('bounds column profile statistics with profileSampleRows', async () => {
    // Uses the shared handle (not a shadowing local) so the database is closed by
    // afterEach even if one of the expectations below fails.
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
      INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a3'), (4, 'a4');
    `);

    const profiles = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { nullable: false }),
          column('accounts', 'account_code', {
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
          }),
        ]),
      ]),
      executor,
      ctx: { runId: 'profile-sample-rows' },
      profileSampleRows: 2,
    });

    expect(profiles.queryCount).toBe(1);
    expect(executor.queryCount).toBe(1);
    // The table row count is the full count; column statistics only see the sampled rows.
    expect(profiles.tables).toEqual([{ table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 }]);
    expect(profiles.columns['accounts.id']).toMatchObject({
      rowCount: 2,
      distinctCount: 2,
      uniquenessRatio: 1,
    });
    expect(profiles.columns['accounts.account_code']?.sampleValues).toEqual(['a1', 'a2']);
  });

  it('reuses a profile cache inside one scan run but re-queries with a fresh cache', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
      INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a2');
    `);
    const relationshipSchema = schema([
      table('accounts', [
        column('accounts', 'id', { nullable: false }),
        column('accounts', 'account_code', {
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
        }),
      ]),
    ]);
    const cache = createKtxRelationshipProfileCache();

    const first = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-run' },
      cache,
    });
    const second = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-run' },
      cache,
    });
    const third = await profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-fresh-run' },
      cache: createKtxRelationshipProfileCache(),
    });

    expect(first.queryCount).toBe(1);
    expect(second.queryCount).toBe(0);
    expect(third.queryCount).toBe(1);
    expect(executor.queryCount).toBe(2);
    expect(second.tables).toEqual(first.tables);
    expect(second.columns).toEqual(first.columns);
  });

  it('profiles the checked-in scale stress fixture with one query per table', async () => {
    const fixtureRoot = new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url);
    // NOTE(review): URL#pathname stays percent-encoded and is not a native path on Windows;
    // fileURLToPath from node:url would be the portable conversion — confirm before changing.
    const fixture = await loadKtxRelationshipBenchmarkFixture(join(fixtureRoot.pathname, 'scale_stress_no_declared_constraints'));
    if (!fixture.dataPath) {
      throw new Error('scale_stress_no_declared_constraints is missing data.sqlite');
    }
    const maskedSnapshot = maskKtxRelationshipBenchmarkSnapshot(
      fixture.snapshot,
      'declared_pks_and_declared_fks_removed',
    );
    const scaleExecutor = new FileSqliteExecutor(fixture.dataPath);
    try {
      const result = await profileKtxRelationshipSchema({
        connectionId: fixture.snapshot.connectionId,
        driver: fixture.snapshot.driver,
        schema: snapshotToKtxEnrichedSchema(maskedSnapshot, new Map()),
        executor: scaleExecutor,
        ctx: { runId: 'scale-stress-profile-query-count' },
        profileSampleRows: 3,
      });

      expect(fixture.snapshot.tables).toHaveLength(400);
      expect(result.queryCount).toBe(400);
      expect(result.queryCount).toBeLessThanOrEqual(2 * fixture.snapshot.tables.length);
      expect(scaleExecutor.queryCount).toBe(400);
    } finally {
      scaleExecutor.close();
    }
  });
});
|
||||
468
packages/cli/src/context/scan/relationship-profiling.ts
Normal file
468
packages/cli/src/context/scan/relationship-profiling.ts
Normal file
|
|
@ -0,0 +1,468 @@
|
|||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import type {
|
||||
KtxConnectionDriver,
|
||||
KtxQueryResult,
|
||||
KtxReadOnlyQueryInput,
|
||||
KtxScanContext,
|
||||
KtxTableRef,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Query surface the profiler depends on: run one SQL statement for a
 * connection and resolve to its tabular result.
 */
export interface KtxRelationshipReadOnlyExecutor {
  /**
   * @param input Connection id, SQL text and row cap for the statement.
   * @param ctx Scan context of the current run.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, ctx: KtxScanContext): Promise<KtxQueryResult>;
}
|
||||
|
||||
/** Statistics gathered for one column while profiling a table. */
export interface KtxRelationshipColumnProfile {
  /** Table the column belongs to. */
  table: KtxTableRef;
  /** Column name. */
  column: string;
  /** Column type as reported by the source schema. */
  nativeType: string;
  /** Normalized column type from the enriched schema. */
  normalizedType: string;
  /** Number of rows the statistics were computed over (the profiled sample, which may be smaller than the table). */
  rowCount: number;
  /** Rows in the sample where the column is NULL. */
  nullCount: number;
  /** Distinct non-NULL values in the sample. */
  distinctCount: number;
  /** `distinctCount / rowCount`, or 0 when `rowCount` is 0. */
  uniquenessRatio: number;
  /** `nullCount / rowCount`, or 0 when `rowCount` is 0. */
  nullRate: number;
  /** Sampled values rendered as text. */
  sampleValues: string[];
  /** Shortest text length among the sampled values, or null when unavailable. */
  minTextLength: number | null;
  /** Longest text length among the sampled values, or null when unavailable. */
  maxTextLength: number | null;
}
|
||||
|
||||
/**
 * Table-level profiling result.
 *
 * @internal
 */
export interface KtxRelationshipTableProfile {
  /** Table that was profiled. */
  table: KtxTableRef;
  /** Row count reported for the table. */
  rowCount: number;
}
|
||||
|
||||
/** Result of profiling a schema for relationship detection. */
export interface KtxRelationshipProfileArtifact {
  /** Connection the profile was taken from. */
  connectionId: string;
  /** Driver whose SQL dialect was used. */
  driver: KtxConnectionDriver;
  /** False when no executor was supplied and nothing could be queried. */
  sqlAvailable: boolean;
  /** Number of queries issued by this call; tables served from the cache add nothing. */
  queryCount: number;
  /** One entry per profiled table. */
  tables: KtxRelationshipTableProfile[];
  /** Column profiles keyed by `<table name>.<column name>`. */
  columns: Record<string, KtxRelationshipColumnProfile>;
  /** Non-fatal problems, e.g. `read_only_sql_unavailable` or `profile_failed:<table>:<message>`. */
  warnings: string[];
}
|
||||
|
||||
/** What the profile cache stores for one table: its profile, its column profiles, and any warnings raised. */
interface KtxRelationshipCachedTableProfile {
  /** Table-level profile. */
  table: KtxRelationshipTableProfile;
  /** Column profiles for the table, keyed the same way as in the artifact. */
  columns: Record<string, KtxRelationshipColumnProfile>;
  /** Warnings to replay when the entry is served from the cache. */
  warnings: string[];
}
|
||||
|
||||
/** Cache of per-table profiles that can be shared between profiling calls. */
export interface KtxRelationshipProfileCache {
  /** Cached entries by cache key. */
  readonly tableProfiles: Map<string, KtxRelationshipCachedTableProfile>;
}
|
||||
|
||||
/** Arguments for `profileKtxRelationshipSchema`. */
export interface ProfileKtxRelationshipSchemaInput {
  /** Connection to run profiling queries against. */
  connectionId: string;
  /** Driver that selects the SQL dialect. */
  driver: KtxConnectionDriver;
  /** Schema whose enabled tables are profiled. */
  schema: KtxEnrichedSchema;
  /** Query runner; pass null when SQL access is unavailable. */
  executor: KtxRelationshipReadOnlyExecutor | null;
  /** Scan context; its `runId` is part of the cache key. */
  ctx: KtxScanContext;
  /** Maximum sample values collected per column (defaults to 5). */
  sampleValuesPerColumn?: number;
  /** Maximum rows per table that column statistics are computed over (defaults to 10000). */
  profileSampleRows?: number;
  /** Optional cache used to skip tables that were already profiled. */
  cache?: KtxRelationshipProfileCache;
}
|
||||
|
||||
/** Creates an empty profile cache. */
export function createKtxRelationshipProfileCache(): KtxRelationshipProfileCache {
  const tableProfiles = new Map<string, KtxRelationshipCachedTableProfile>();
  return { tableProfiles };
}
|
||||
|
||||
const SAMPLE_VALUE_DELIMITER = '\u001f';
|
||||
|
||||
/** Identifier delimiter family used by a SQL dialect. */
type QuoteStyle = 'double' | 'backtick' | 'bracket';

/**
 * Picks the identifier delimiter for a driver.
 *
 * MySQL, ClickHouse and BigQuery use backticks, SQL Server uses brackets, and
 * every other driver falls back to standard double quotes. BigQuery must not
 * use the fallback: it parses double-quoted text as a string literal.
 */
function quoteStyle(driver: KtxConnectionDriver): QuoteStyle {
  if (driver === 'mysql' || driver === 'clickhouse' || driver === 'bigquery') {
    return 'backtick';
  }
  if (driver === 'sqlserver') {
    return 'bracket';
  }
  return 'double';
}

/**
 * Quotes a single identifier (one path segment) for the driver's dialect.
 *
 * @param driver Driver that selects the quoting rules.
 * @param identifier Raw, unquoted identifier.
 * @returns The identifier wrapped in the dialect's delimiters, with any
 * embedded delimiter escaped.
 */
export function quoteKtxRelationshipIdentifier(driver: KtxConnectionDriver, identifier: string): string {
  if (driver === 'bigquery') {
    // BigQuery quoted identifiers take string-literal escapes, so a backtick or
    // backslash is escaped with a backslash rather than by doubling.
    const escaped = identifier.replace(/[\\`]/g, '\\$&');
    return '`' + escaped + '`';
  }
  switch (quoteStyle(driver)) {
    case 'backtick':
      return `\`${identifier.replace(/`/g, '``')}\``;
    case 'bracket':
      // Only the closing bracket needs doubling inside [...].
      return `[${identifier.replace(/\]/g, ']]')}]`;
    case 'double':
      return `"${identifier.replace(/"/g, '""')}"`;
  }
}
|
||||
|
||||
/**
 * Renders a table reference as a quoted, dot-separated SQL path.
 *
 * SQLite uses the bare table name only. Other drivers emit
 * `catalog.db.name`, skipping any segment that is null or empty.
 */
export function formatKtxRelationshipTableRef(driver: KtxConnectionDriver, table: KtxTableRef): string {
  const qualifiedSegments = (): string[] => {
    const kept: string[] = [];
    for (const segment of [table.catalog, table.db, table.name]) {
      if (segment) {
        kept.push(segment);
      }
    }
    return kept;
  };

  const segments = driver === 'sqlite' ? [table.name] : qualifiedSegments();
  const quoted: string[] = [];
  for (const segment of segments) {
    quoted.push(quoteKtxRelationshipIdentifier(driver, segment));
  }
  return quoted.join('.');
}
|
||||
|
||||
/**
 * Builds the dialect-specific SQL expression for the character length of a
 * column after casting it to text.
 *
 * @param columnSql Already-quoted column expression.
 */
function textLengthExpression(driver: KtxConnectionDriver, columnSql: string): string {
  switch (driver) {
    case 'mysql':
      return `CHAR_LENGTH(CAST(${columnSql} AS CHAR))`;
    case 'sqlserver':
      return `LEN(CAST(${columnSql} AS NVARCHAR(MAX)))`;
    case 'bigquery':
      return `LENGTH(CAST(${columnSql} AS STRING))`;
    case 'clickhouse':
      return `length(toString(${columnSql}))`;
    default:
      return `LENGTH(CAST(${columnSql} AS TEXT))`;
  }
}
|
||||
|
||||
/**
 * Returns a trailing ` LIMIT n` clause (with leading space), where `n` is
 * `limit` floored and raised to at least 1. SQL Server gets an empty string.
 */
function limitSql(driver: KtxConnectionDriver, limit: number): string {
  return driver === 'sqlserver' ? '' : ` LIMIT ${Math.max(1, Math.floor(limit))}`;
}
|
||||
|
||||
/**
 * Returns a ` TOP (n)` clause (with leading space) for SQL Server, where `n`
 * is `limit` floored and raised to at least 1. Every other driver gets an
 * empty string.
 */
function topSql(driver: KtxConnectionDriver, limit: number): string {
  return driver === 'sqlserver' ? ` TOP (${Math.max(1, Math.floor(limit))})` : '';
}
|
||||
|
||||
/**
 * Wraps a table in a derived table, aliased `relationship_profile_sample`,
 * that yields at most `limit` rows (floored, minimum 1).
 */
function sampledTableSql(driver: KtxConnectionDriver, tableSql: string, limit: number): string {
  const safeLimit = Math.max(1, Math.floor(limit));
  const innerSelect =
    driver === 'sqlserver'
      ? `SELECT TOP (${safeLimit}) * FROM ${tableSql}`
      : `SELECT * FROM ${tableSql}${limitSql(driver, safeLimit)}`;
  return `(${innerSelect}) AS relationship_profile_sample`;
}
|
||||
|
||||
/** Returns the first row of a result, or an empty row when there is none. */
function firstRow(result: KtxQueryResult): unknown[] {
  const [head] = result.rows;
  return head ?? [];
}
|
||||
|
||||
/** Finds the position of a header, ignoring case. Returns -1 when it is absent. */
function headerIndex(result: KtxQueryResult, header: string): number {
  const wanted = header.toLowerCase();
  return result.headers.findIndex((candidate) => wanted === candidate.toLowerCase());
}
|
||||
|
||||
/**
 * Reads the cell under `header` from a positional row.
 * Yields `undefined` when the header is not part of the result.
 */
function valueAt(result: KtxQueryResult, row: unknown[], header: string): unknown {
  const position = headerIndex(result, header);
  return row[position];
}
|
||||
|
||||
/**
 * Coerces a driver-returned cell into a finite number.
 *
 * Accepts numbers, bigints and non-blank numeric strings. Anything else,
 * including text that does not parse and non-finite values, yields 0 so
 * that NaN or Infinity can never leak into counts and the ratios derived from them.
 */
function numberFromValue(value: unknown): number {
  let parsed: number;
  if (typeof value === 'number') {
    parsed = value;
  } else if (typeof value === 'bigint') {
    parsed = Number(value);
  } else if (typeof value === 'string' && value.trim() !== '') {
    parsed = Number(value);
  } else {
    return 0;
  }
  return Number.isFinite(parsed) ? parsed : 0;
}
|
||||
|
||||
/**
 * Coerces a driver-returned cell into a finite number, keeping "no value" as null.
 *
 * Accepts numbers, bigints and non-blank numeric strings. null, undefined,
 * unsupported types, text that does not parse and non-finite values all
 * yield null instead of NaN.
 */
function nullableNumberFromValue(value: unknown): number | null {
  if (value === null || value === undefined) {
    return null;
  }
  let parsed: number;
  if (typeof value === 'number') {
    parsed = value;
  } else if (typeof value === 'bigint') {
    parsed = Number(value);
  } else if (typeof value === 'string' && value.trim() !== '') {
    parsed = Number(value);
  } else {
    return null;
  }
  return Number.isFinite(parsed) ? parsed : null;
}
|
||||
|
||||
/** Reads the cell under `header` from the first row of a result and coerces it to a number. */
function numberAt(result: KtxQueryResult, header: string): number {
  const row = firstRow(result);
  const cell = valueAt(result, row, header);
  return numberFromValue(cell);
}
|
||||
|
||||
/**
 * Builds the `<table name>.<column name>` key used in the column-profile map.
 * Only the table name is used; catalog and db are not part of the key.
 */
function columnKey(table: KtxEnrichedTable, column: KtxEnrichedColumn): string {
  return [table.ref.name, column.name].join('.');
}
|
||||
|
||||
/**
 * Builds the cache key for one table profile.
 *
 * The key combines run id, connection, driver, the full table reference and
 * both sampling limits, separated by U+001E. A different run or different
 * limits therefore never share an entry. A missing catalog or db contributes
 * an empty segment.
 */
function tableProfileCacheKey(input: {
  connectionId: string;
  driver: KtxConnectionDriver;
  ctx: KtxScanContext;
  table: KtxTableRef;
  sampleValuesPerColumn: number;
  profileSampleRows: number;
}): string {
  const { connectionId, driver, ctx, table, sampleValuesPerColumn, profileSampleRows } = input;
  const segments = [
    ctx.runId,
    connectionId,
    driver,
    table.catalog ?? '',
    table.db ?? '',
    table.name,
    `${sampleValuesPerColumn}`,
    `${profileSampleRows}`,
  ];
  return segments.join('\u001e');
}
|
||||
|
||||
/** Wraps text in single quotes as a SQL string literal, doubling any embedded single quote. */
function sqlStringLiteral(value: string): string {
  const escaped = value.split("'").join("''");
  return "'" + escaped + "'";
}
|
||||
|
||||
/**
 * Wraps a value-producing query in a scalar subquery that concatenates its
 * `value` column into one string, separated by the unit separator (U+001F).
 *
 * @param driver Driver that selects the aggregate function and cast type.
 * @param innerSql Query exposing a single column aliased `value`.
 */
function sampleAggregateSql(driver: KtxConnectionDriver, innerSql: string): string {
  const source = `FROM (${innerSql}) AS relationship_profile_values`;
  switch (driver) {
    case 'postgres':
      return `(SELECT STRING_AGG(CAST(value AS TEXT), CHR(31)) ${source})`;
    case 'bigquery':
      return `(SELECT STRING_AGG(CAST(value AS STRING), '\\u001F') ${source})`;
    case 'mysql':
      // MySQL's SEPARATOR clause takes a string literal, not an expression such as
      // CHAR(31), so the separator character itself is embedded in the literal.
      return `(SELECT GROUP_CONCAT(CAST(value AS CHAR) SEPARATOR '\u001f') ${source})`;
    case 'sqlserver':
      return `(SELECT STRING_AGG(CAST(value AS NVARCHAR(MAX)), CHAR(31)) ${source})`;
    case 'clickhouse':
      return `(SELECT arrayStringConcat(groupArray(toString(value)), '\\x1F') ${source})`;
    default:
      return `(SELECT GROUP_CONCAT(CAST(value AS TEXT), char(31)) ${source})`;
  }
}
|
||||
|
||||
/**
 * Builds the query that selects sample values for one column: distinct
 * non-NULL values, most frequent first (ties broken by value ascending),
 * capped at `limit` using TOP or LIMIT depending on the driver.
 */
function sampleValuesSql(input: {
  driver: KtxConnectionDriver;
  tableSql: string;
  columnSql: string;
  limit: number;
}): string {
  const { driver, tableSql, columnSql, limit } = input;
  const clauses = [
    `SELECT${topSql(driver, limit)} ${columnSql} AS value`,
    `FROM ${tableSql}`,
    `WHERE ${columnSql} IS NOT NULL`,
    `GROUP BY ${columnSql}`,
    `ORDER BY COUNT(*) DESC, ${columnSql} ASC`,
    limitSql(driver, limit),
  ];
  return clauses.join(' ');
}
|
||||
|
||||
/**
 * Builds the SELECT that profiles a single column.
 *
 * The statement returns one row with the column name, the row count of the
 * full table (`tableSql`), and row / null / distinct counts, text-length
 * bounds and concatenated sample values computed over `profileTableSql`.
 */
function columnProfileSelectSql(input: {
  connectionDriver: KtxConnectionDriver;
  tableSql: string;
  profileTableSql: string;
  column: KtxEnrichedColumn;
  sampleValuesPerColumn: number;
}): string {
  const { connectionDriver: driver, tableSql, profileTableSql, column } = input;
  const columnSql = quoteKtxRelationshipIdentifier(driver, column.name);
  const textLengthSql = textLengthExpression(driver, columnSql);
  const innerSamplesSql = sampleValuesSql({
    driver,
    tableSql: profileTableSql,
    columnSql,
    limit: input.sampleValuesPerColumn,
  });
  const samplesSql = sampleAggregateSql(driver, innerSamplesSql);

  const fragments = [
    'SELECT',
    `${sqlStringLiteral(column.name)} AS column_name,`,
    `(SELECT COUNT(*) FROM ${tableSql}) AS table_row_count,`,
    'COUNT(*) AS row_count,',
    `SUM(CASE WHEN ${columnSql} IS NULL THEN 1 ELSE 0 END) AS null_count,`,
    `COUNT(DISTINCT ${columnSql}) AS distinct_count,`,
    `MIN(${textLengthSql}) AS min_text_length,`,
    `MAX(${textLengthSql}) AS max_text_length,`,
    `${samplesSql} AS sample_values`,
    `FROM ${profileTableSql}`,
  ];
  return fragments.join(' ');
}
|
||||
|
||||
/**
 * Splits the concatenated sample-value cell back into individual values.
 * null, undefined and empty text yield an empty list, and empty segments are dropped.
 */
function splitSampleValues(value: unknown): string[] {
  if (value == null) {
    return [];
  }
  const text = String(value);
  return text === '' ? [] : text.split(SAMPLE_VALUE_DELIMITER).filter((item) => item.length > 0);
}
|
||||
|
||||
/**
 * Counts the rows of a table with a single `SELECT COUNT(*)` query.
 *
 * @returns The row count together with the number of queries issued (always 1).
 */
async function queryCount(input: {
  connectionId: string;
  driver: KtxConnectionDriver;
  table: KtxTableRef;
  executor: KtxRelationshipReadOnlyExecutor;
  ctx: KtxScanContext;
}): Promise<{ rowCount: number; queryCount: number }> {
  const { connectionId, driver, table, executor, ctx } = input;
  const tableSql = formatKtxRelationshipTableRef(driver, table);
  const sql = `SELECT COUNT(*) AS row_count FROM ${tableSql}`;
  const result = await executor.executeReadOnly({ connectionId, sql, maxRows: 1 }, ctx);
  return { rowCount: numberAt(result, 'row_count'), queryCount: 1 };
}
|
||||
|
||||
/**
 * Profiles one table with a single query.
 *
 * A table without columns is only counted. Otherwise one SELECT per column is
 * combined with UNION ALL, so the statement returns one row per column, and
 * those rows are turned into column profiles. Rows naming an unknown column
 * are ignored.
 *
 * @returns The table profile, its column profiles by key, and the number of
 * queries issued.
 */
async function queryTableProfile(input: {
  connectionId: string;
  driver: KtxConnectionDriver;
  table: KtxEnrichedTable;
  executor: KtxRelationshipReadOnlyExecutor;
  ctx: KtxScanContext;
  sampleValuesPerColumn: number;
  profileSampleRows: number;
}): Promise<{
  table: KtxRelationshipTableProfile;
  columns: Record<string, KtxRelationshipColumnProfile>;
  queryCount: number;
}> {
  const { connectionId, driver, executor, ctx } = input;
  const tableRef = input.table.ref;
  const tableColumns = input.table.columns;

  if (tableColumns.length === 0) {
    const counted = await queryCount({ connectionId, driver, table: tableRef, executor, ctx });
    return {
      table: { table: tableRef, rowCount: counted.rowCount },
      columns: {},
      queryCount: counted.queryCount,
    };
  }

  const tableSql = formatKtxRelationshipTableRef(driver, tableRef);
  const profileTableSql = sampledTableSql(driver, tableSql, input.profileSampleRows);
  const perColumnSelects = tableColumns.map((column) =>
    columnProfileSelectSql({
      connectionDriver: driver,
      tableSql,
      profileTableSql,
      column,
      sampleValuesPerColumn: input.sampleValuesPerColumn,
    }),
  );
  const batchedSql = perColumnSelects.join(' UNION ALL ');
  const result = await executor.executeReadOnly(
    { connectionId, sql: batchedSql, maxRows: tableColumns.length },
    ctx,
  );

  const columnsByName = new Map(tableColumns.map((column) => [column.name, column]));
  const cell = (row: unknown[], header: string): unknown => valueAt(result, row, header);
  const profiles: Record<string, KtxRelationshipColumnProfile> = {};
  let tableRowCount = 0;

  for (const row of result.rows) {
    const column = columnsByName.get(String(cell(row, 'column_name')));
    if (!column) {
      continue;
    }

    const rowCount = numberFromValue(cell(row, 'row_count'));
    const nullCount = numberFromValue(cell(row, 'null_count'));
    const distinctCount = numberFromValue(cell(row, 'distinct_count'));
    // Every row carries the full-table count; keep the largest one seen.
    tableRowCount = Math.max(tableRowCount, numberFromValue(cell(row, 'table_row_count')));

    const hasRows = rowCount !== 0;
    profiles[columnKey(input.table, column)] = {
      table: tableRef,
      column: column.name,
      nativeType: column.nativeType,
      normalizedType: column.normalizedType,
      rowCount,
      nullCount,
      distinctCount,
      uniquenessRatio: hasRows ? distinctCount / rowCount : 0,
      nullRate: hasRows ? nullCount / rowCount : 0,
      sampleValues: splitSampleValues(cell(row, 'sample_values')),
      minTextLength: nullableNumberFromValue(cell(row, 'min_text_length')),
      maxTextLength: nullableNumberFromValue(cell(row, 'max_text_length')),
    };
  }

  return {
    table: { table: tableRef, rowCount: tableRowCount },
    columns: profiles,
    queryCount: 1,
  };
}
|
||||
|
||||
/**
 * Profiles every enabled table of a schema for relationship detection.
 *
 * Tables are profiled one after another. When a cache is supplied, a table
 * already profiled under the same cache key is served from it without
 * issuing a query. A table whose profiling throws is reported through a
 * `profile_failed:<table>:<message>` warning and contributes no table or
 * column entries, both on the failing call and on later cache hits.
 *
 * @param input Connection, schema, executor, limits and optional cache.
 * @returns The collected profiles. With a null executor nothing is queried
 * and the result carries `sqlAvailable: false` plus the
 * `read_only_sql_unavailable` warning.
 */
export async function profileKtxRelationshipSchema(
  input: ProfileKtxRelationshipSchemaInput,
): Promise<KtxRelationshipProfileArtifact> {
  const { connectionId, driver, ctx, cache, executor } = input;

  if (!executor) {
    return {
      connectionId,
      driver,
      sqlAvailable: false,
      queryCount: 0,
      tables: [],
      columns: {},
      warnings: ['read_only_sql_unavailable'],
    };
  }

  // Limits are the same for every table, so resolve the defaults once.
  const sampleValuesPerColumn = input.sampleValuesPerColumn ?? 5;
  const profileSampleRows = input.profileSampleRows ?? 10000;

  let queryTotal = 0;
  const tables: KtxRelationshipTableProfile[] = [];
  const columns: Record<string, KtxRelationshipColumnProfile> = {};
  const warnings: string[] = [];

  for (const table of input.schema.tables.filter((candidate) => candidate.enabled)) {
    const cacheKey = tableProfileCacheKey({
      connectionId,
      driver,
      ctx,
      table: table.ref,
      sampleValuesPerColumn,
      profileSampleRows,
    });
    const cached = cache?.tableProfiles.get(cacheKey);
    if (cached) {
      // Successful entries are stored with no warnings; failed entries hold only a
      // placeholder zero-row table plus the failure warning. Replaying that placeholder
      // would report a table the failing run itself never reported, so a cached failure
      // contributes just its warning.
      if (cached.warnings.length === 0) {
        tables.push(cached.table);
        Object.assign(columns, cached.columns);
      }
      warnings.push(...cached.warnings);
      continue;
    }

    try {
      const tableProfile = await queryTableProfile({
        connectionId,
        driver,
        table,
        executor,
        ctx,
        sampleValuesPerColumn,
        profileSampleRows,
      });
      queryTotal += tableProfile.queryCount;
      tables.push(tableProfile.table);
      Object.assign(columns, tableProfile.columns);
      cache?.tableProfiles.set(cacheKey, {
        table: tableProfile.table,
        columns: tableProfile.columns,
        warnings: [],
      });
    } catch (error) {
      // Best effort: one failing table must not abort the rest of the schema. The failure
      // is cached as well, so the same table is not retried under this cache key.
      const failureWarning = `profile_failed:${table.ref.name}:${error instanceof Error ? error.message : String(error)}`;
      warnings.push(failureWarning);
      cache?.tableProfiles.set(cacheKey, {
        table: { table: table.ref, rowCount: 0 },
        columns: {},
        warnings: [failureWarning],
      });
    }
  }

  return {
    connectionId,
    driver,
    sqlAvailable: true,
    queryCount: queryTotal,
    tables,
    columns,
    warnings,
  };
}
|
||||
108
packages/cli/src/context/scan/relationship-scoring.test.ts
Normal file
108
packages/cli/src/context/scan/relationship-scoring.test.ts
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
calibrateWeightsFromSyntheticFixtures,
|
||||
defaultKtxRelationshipScoreWeights,
|
||||
normalizeKtxRelationshipScoreWeights,
|
||||
scoreKtxRelationshipCandidate,
|
||||
type KtxRelationshipSignalVector,
|
||||
} from './relationship-scoring.js';
|
||||
|
||||
/**
 * Test helper: returns a complete signal vector made of fixed baseline values,
 * with any fields in `overrides` replacing the corresponding baseline entry.
 */
function signals(overrides: Partial<KtxRelationshipSignalVector> = {}): KtxRelationshipSignalVector {
  const baseline: KtxRelationshipSignalVector = {
    nameSimilarity: 0.5,
    typeCompatibility: 1,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.5,
    profileNullRate: 0.5,
    structuralPrior: 0.5,
  };
  return { ...baseline, ...overrides };
}
|
||||
|
||||
describe('relationship scoring', () => {
  /** Sums every weight of a vector, left to right, so tests can check that it is normalized. */
  const weightTotal = (weights: ReturnType<typeof defaultKtxRelationshipScoreWeights>): number => {
    let total = 0;
    for (const weight of Object.values(weights)) {
      total += weight;
    }
    return total;
  };

  it('scores stronger evidence higher without hard-gating on names', () => {
    // Poor name match, but strong value/profile evidence.
    const profileDriven = scoreKtxRelationshipCandidate(
      signals({
        nameSimilarity: 0.05,
        typeCompatibility: 1,
        valueOverlap: 0.7,
        profileUniqueness: 1,
        profileNullRate: 1,
        structuralPrior: 0.7,
      }),
    );
    // Excellent name match, but weak value/profile evidence.
    const nameDriven = scoreKtxRelationshipCandidate(
      signals({
        nameSimilarity: 0.95,
        typeCompatibility: 1,
        valueOverlap: 0,
        profileUniqueness: 0.3,
        profileNullRate: 0.4,
        structuralPrior: 0.5,
      }),
    );

    expect(profileDriven.score).toBeGreaterThan(nameDriven.score);
    expect(profileDriven.contributions.profileUniqueness).toBeGreaterThan(0);
    expect(profileDriven.contributions.nameSimilarity).toBeLessThan(0.02);
  });

  it('normalizes partial and invalid weights into a usable vector', () => {
    const normalized = normalizeKtxRelationshipScoreWeights({
      nameSimilarity: 3,
      typeCompatibility: -1,
      valueOverlap: Number.POSITIVE_INFINITY,
      profileUniqueness: 1,
    });

    expect(weightTotal(normalized)).toBeCloseTo(1, 6);
    expect(normalized.nameSimilarity).toBeGreaterThan(normalized.profileUniqueness);
    // Negative and non-finite inputs are dropped to zero weight.
    expect(normalized.typeCompatibility).toBe(0);
    expect(normalized.valueOverlap).toBe(0);
  });

  it('returns deterministic defaults as a defensive copy', () => {
    const firstCall = defaultKtxRelationshipScoreWeights();
    const secondCall = defaultKtxRelationshipScoreWeights();

    expect(firstCall).toEqual(secondCall);
    expect(firstCall).not.toBe(secondCall);
    expect(weightTotal(firstCall)).toBeCloseTo(1, 6);
  });

  it('calibrates only from synthetic observations', () => {
    const calibrateFromPublicFixture = (): unknown =>
      calibrateWeightsFromSyntheticFixtures([
        {
          fixtureId: 'chinook_with_declared_metadata',
          origin: 'public',
          expectedRelationship: true,
          signals: signals({ nameSimilarity: 1 }),
        },
      ]);

    expect(calibrateFromPublicFixture).toThrow(/synthetic/i);
  });

  it('calibrates deterministic weights from positive and negative synthetic observations', () => {
    const calibrated = calibrateWeightsFromSyntheticFixtures([
      {
        fixtureId: 'synthetic_positive',
        origin: 'synthetic',
        expectedRelationship: true,
        signals: signals({ nameSimilarity: 0.8, valueOverlap: 0.9, profileUniqueness: 1, profileNullRate: 1 }),
      },
      {
        fixtureId: 'synthetic_negative',
        origin: 'synthetic',
        expectedRelationship: false,
        signals: signals({ nameSimilarity: 0.2, valueOverlap: 0.1, profileUniqueness: 0.4, profileNullRate: 0.5 }),
      },
    ]);

    expect(weightTotal(calibrated)).toBeCloseTo(1, 6);
    expect(calibrated.valueOverlap).toBeGreaterThan(calibrated.structuralPrior);
    expect(calibrated.profileUniqueness).toBeGreaterThan(calibrated.embeddingSimilarity);
  });
});
|
||||
159
packages/cli/src/context/scan/relationship-scoring.ts
Normal file
159
packages/cli/src/context/scan/relationship-scoring.ts
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
/**
 * Names of the signals that feed the relationship score, in the order in
 * which weights and contributions are computed and summed.
 */
const KTX_RELATIONSHIP_SCORE_SIGNAL_KEYS = [
  'nameSimilarity',
  'typeCompatibility',
  'valueOverlap',
  'embeddingSimilarity',
  'profileUniqueness',
  'profileNullRate',
  'structuralPrior',
] as const;

/** Union of the signal names listed in `KTX_RELATIONSHIP_SCORE_SIGNAL_KEYS`. */
type KtxRelationshipScoreSignal = (typeof KTX_RELATIONSHIP_SCORE_SIGNAL_KEYS)[number];
|
||||
|
||||
/** Provenance of a calibration fixture; calibration accepts only `'synthetic'`. */
export type KtxRelationshipFixtureOrigin = 'synthetic' | 'public' | 'customer';

/**
 * One numeric value per scoring signal. Scoring clamps every value into
 * [0, 1] (non-finite values become 0) before applying weights.
 */
export interface KtxRelationshipSignalVector {
  nameSimilarity: number;
  typeCompatibility: number;
  valueOverlap: number;
  embeddingSimilarity: number;
  profileUniqueness: number;
  profileNullRate: number;
  structuralPrior: number;
}
|
||||
|
||||
/** A number for every scoring signal; used both for weights and for per-signal contributions. */
export type KtxRelationshipScoreWeights = Record<KtxRelationshipScoreSignal, number>;

/** Result of scoring one candidate, including the inputs that produced the score. */
export interface KtxRelationshipScoreBreakdown {
  /** Final score, clamped to [0, 1] and rounded to three decimals. */
  score: number;
  /** The input signals after clamping and rounding. */
  signals: KtxRelationshipSignalVector;
  /** The weights after normalization. */
  weights: KtxRelationshipScoreWeights;
  /** Per-signal product of sanitized signal and normalized weight, rounded to six decimals. */
  contributions: KtxRelationshipScoreWeights;
}
|
||||
|
||||
/**
 * A labelled example used to calibrate score weights.
 *
 * @internal
 */
export interface KtxRelationshipScoringCalibrationObservation {
  /** Identifier of the fixture; quoted in the error raised for non-synthetic fixtures. */
  fixtureId: string;
  /** Where the fixture comes from; calibration rejects anything other than `'synthetic'`. */
  origin: KtxRelationshipFixtureOrigin;
  /** Whether the observed pair is a true relationship (positive example) or not (negative example). */
  expectedRelationship: boolean;
  /** Signal values measured for the observed pair. */
  signals: KtxRelationshipSignalVector;
}
|
||||
|
||||
/**
 * Built-in weight per signal. The values add up to 1, and they are also
 * the fallback whenever a supplied weight vector has no positive entries.
 */
const DEFAULT_WEIGHTS: KtxRelationshipScoreWeights = {
  nameSimilarity: 0.24,
  typeCompatibility: 0.1,
  valueOverlap: 0.22,
  embeddingSimilarity: 0.1,
  profileUniqueness: 0.22,
  profileNullRate: 0.08,
  structuralPrior: 0.04,
};
|
||||
|
||||
/**
 * Restricts `value` to the closed interval [0, 1].
 *
 * NaN and both infinities map to 0 (positive infinity is NOT clamped to 1).
 */
function clampScore(value: number): number {
  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
}
|
||||
|
||||
/** Clamps `value` into [0, 1] and rounds the result to three decimal places. */
function roundScore(value: number): number {
  const bounded = clampScore(value);
  const threeDecimals = bounded.toFixed(3);
  return Number(threeDecimals);
}
|
||||
|
||||
/**
 * Returns a new signal vector in which every signal has been clamped into
 * [0, 1] and rounded to three decimals. The input object is not modified.
 */
function sanitizeSignalVector(signals: KtxRelationshipSignalVector): KtxRelationshipSignalVector {
  const sanitizedEntries = KTX_RELATIONSHIP_SCORE_SIGNAL_KEYS.map((key) => [key, roundScore(signals[key])] as const);
  return Object.fromEntries(sanitizedEntries) as unknown as KtxRelationshipSignalVector;
}
|
||||
|
||||
/**
 * Returns a fresh copy of the built-in weights, so callers may mutate the
 * result without affecting later calls.
 *
 * @internal
 */
export function defaultKtxRelationshipScoreWeights(): KtxRelationshipScoreWeights {
  const copy: KtxRelationshipScoreWeights = Object.assign({}, DEFAULT_WEIGHTS);
  return copy;
}
|
||||
|
||||
/**
 * Turns a partial, possibly invalid weight vector into a complete vector of
 * non-negative weights that sum to 1.
 *
 * - Missing, negative, and non-finite weights count as 0.
 * - If no weight is positive, a copy of the default weights is returned.
 * - If the finite weights are so large that their sum overflows to Infinity,
 *   they are first rescaled by the largest weight so that the result still
 *   sums to 1 instead of collapsing to all zeros.
 *
 * @param weights Weights to normalize; defaults to the built-in weights.
 * @returns A new, fully populated weight vector.
 * @internal
 */
export function normalizeKtxRelationshipScoreWeights(
  weights: Partial<KtxRelationshipScoreWeights> = DEFAULT_WEIGHTS,
): KtxRelationshipScoreWeights {
  let rawEntries = KTX_RELATIONSHIP_SCORE_SIGNAL_KEYS.map((key) => {
    const value = weights[key] ?? 0;
    return [key, Number.isFinite(value) ? Math.max(0, value) : 0] as const;
  });
  let total = rawEntries.reduce((sum, [, value]) => sum + value, 0);

  if (!Number.isFinite(total)) {
    // Each entry is finite, so an infinite total means the sum overflowed.
    // Dividing by Infinity would zero out every weight; rescale by the
    // largest entry instead (ratios are preserved) and sum again.
    const largest = Math.max(...rawEntries.map(([, value]) => value));
    rawEntries = rawEntries.map(([key, value]) => [key, value / largest] as const);
    total = rawEntries.reduce((sum, [, value]) => sum + value, 0);
  }

  if (total <= 0) {
    return defaultKtxRelationshipScoreWeights();
  }

  return Object.fromEntries(rawEntries.map(([key, value]) => [key, value / total])) as KtxRelationshipScoreWeights;
}
|
||||
|
||||
/**
 * Scores a relationship candidate from its signal vector.
 *
 * Signals are sanitized (clamped to [0, 1], rounded) and weights are
 * normalized. Each contribution is `signal * weight` rounded to six decimals.
 * The final score is 0 when type compatibility is 0; otherwise it is
 * `0.56 + 0.65 * sum(contributions)`, clamped to [0, 1] and rounded to three
 * decimals.
 *
 * @param signals Raw signal values for the candidate.
 * @param weights Optional weights; defaults to the built-in weights.
 * @returns The score together with the sanitized signals, normalized weights, and contributions.
 */
export function scoreKtxRelationshipCandidate(
  signals: KtxRelationshipSignalVector,
  weights: Partial<KtxRelationshipScoreWeights> = DEFAULT_WEIGHTS,
): KtxRelationshipScoreBreakdown {
  const cleanSignals = sanitizeSignalVector(signals);
  const unitWeights = normalizeKtxRelationshipScoreWeights(weights);

  const contributionEntries: Array<readonly [KtxRelationshipScoreSignal, number]> = [];
  let weightedSum = 0;
  for (const key of KTX_RELATIONSHIP_SCORE_SIGNAL_KEYS) {
    const contribution = Number((cleanSignals[key] * unitWeights[key]).toFixed(6));
    contributionEntries.push([key, contribution]);
    weightedSum += contribution;
  }
  const contributions = Object.fromEntries(contributionEntries) as KtxRelationshipScoreWeights;

  // Incompatible types veto the candidate regardless of every other signal.
  const confidence = cleanSignals.typeCompatibility > 0 ? 0.56 + weightedSum * 0.65 : 0;

  return {
    score: roundScore(confidence),
    signals: cleanSignals,
    weights: unitWeights,
    contributions,
  };
}
|
||||
|
||||
/**
 * Mean of one signal across `observations`, with each value clamped into
 * [0, 1] first. Returns 0 for an empty list.
 */
function averageSignal(
  observations: readonly KtxRelationshipScoringCalibrationObservation[],
  key: KtxRelationshipScoreSignal,
): number {
  const count = observations.length;
  if (count === 0) {
    return 0;
  }

  let sum = 0;
  for (const observation of observations) {
    sum += clampScore(observation.signals[key]);
  }
  return sum / count;
}
|
||||
|
||||
/**
 * Derives score weights from labelled synthetic observations.
 *
 * For every signal, the raw weight is the amount by which the positive
 * examples' average exceeds the negative examples' average (never below 0),
 * plus a quarter of that signal's default weight. The raw weights are then
 * normalized. When there are no positive or no negative examples (including
 * an empty list), a copy of the default weights is returned.
 *
 * @param observations Labelled examples; every one must have origin `'synthetic'`.
 * @returns A normalized weight vector.
 * @throws Error if any observation is not synthetic.
 * @internal
 */
export function calibrateWeightsFromSyntheticFixtures(
  observations: readonly KtxRelationshipScoringCalibrationObservation[],
): KtxRelationshipScoreWeights {
  for (const observation of observations) {
    if (observation.origin !== 'synthetic') {
      throw new Error(
        `Relationship score calibration accepts only synthetic fixtures; ${observation.fixtureId} is ${observation.origin}`,
      );
    }
  }

  const positives: KtxRelationshipScoringCalibrationObservation[] = [];
  const negatives: KtxRelationshipScoringCalibrationObservation[] = [];
  for (const observation of observations) {
    (observation.expectedRelationship ? positives : negatives).push(observation);
  }

  // Both classes are needed to measure separation; otherwise keep the defaults.
  if (positives.length === 0 || negatives.length === 0) {
    return defaultKtxRelationshipScoreWeights();
  }

  const rawWeights = Object.fromEntries(
    KTX_RELATIONSHIP_SCORE_SIGNAL_KEYS.map((key) => {
      const gap = averageSignal(positives, key) - averageSignal(negatives, key);
      const separation = Math.max(0, gap);
      return [key, separation + DEFAULT_WEIGHTS[key] * 0.25];
    }),
  ) as KtxRelationshipScoreWeights;

  return normalizeKtxRelationshipScoreWeights(rawWeights);
}
|
||||
497
packages/cli/src/context/scan/relationship-validation.test.ts
Normal file
497
packages/cli/src/context/scan/relationship-validation.test.ts
Normal file
|
|
@ -0,0 +1,497 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it } from 'vitest';
|
||||
import type { KtxEnrichedColumn, KtxEnrichedSchema, KtxEnrichedTable } from './enrichment-types.js';
|
||||
import { generateKtxRelationshipDiscoveryCandidates } from './relationship-candidates.js';
|
||||
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import { profileKtxRelationshipSchema } from './relationship-profiling.js';
|
||||
import { validateKtxRelationshipDiscoveryCandidates } from './relationship-validation.js';
|
||||
import type { KtxQueryResult, KtxReadOnlyQueryInput, KtxScanContext } from './types.js';
|
||||
|
||||
/**
 * Read-only executor for tests, backed by an in-memory better-sqlite3
 * database. Counts every query it runs so tests can assert on query volume.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  /** Number of `executeReadOnly` calls so far; tests reset this between phases. */
  queryCount = 0;

  /**
   * Runs `input.sql` and converts the row objects into a header list plus
   * positional rows. Headers are taken from the first row, so an empty
   * result has no headers.
   */
  executeReadOnly(input: KtxReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const [firstRecord] = records;
    const headers = firstRecord ? Object.keys(firstRecord) : [];
    const positionalRows = records.map((record) => headers.map((header) => record[header]));
    const count = records.length;
    return Promise.resolve({
      headers,
      rows: positionalRows,
      totalRows: count,
      rowCount: count,
    });
  }

  /** Closes the underlying database. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Test helper: builds an enriched column named `name` on table `tableId`.
 * The baseline is a nullable, non-primary-key INTEGER column; every field
 * present in `overrides` replaces the baseline value.
 */
function column(tableId: string, name: string, overrides: Partial<KtxEnrichedColumn> = {}): KtxEnrichedColumn {
  const baseline: KtxEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...baseline, ...overrides };
}
|
||||
|
||||
/**
 * Test helper: builds an enabled enriched table called `name`. Each column
 * is copied and re-pointed at this table (its `tableId` and `tableRef` are
 * overwritten); the input column objects are left untouched.
 */
function table(name: string, columns: KtxEnrichedColumn[]): KtxEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KtxEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Test helper: wraps `tables` in an enriched schema for the `warehouse`
 * connection with no declared relationships. Without an argument it uses
 * the default accounts / users / invoices fixture, where both `users` and
 * `invoices` carry a non-nullable `account_id` column.
 */
function schema(tables?: KtxEnrichedTable[]): KtxEnrichedSchema {
  const resolvedTables = tables ?? [
    table('accounts', [
      column('accounts', 'id', { nullable: false }),
      column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    ]),
    table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id', { nullable: false })]),
    table('invoices', [
      column('invoices', 'id', { nullable: false }),
      column('invoices', 'account_id', { nullable: false }),
    ]),
  ];

  return {
    connectionId: 'warehouse',
    tables: resolvedTables,
    relationships: [],
  };
}
|
||||
|
||||
describe('relationship validation', () => {
  // Shared with afterEach so the database is closed even when a test fails part-way.
  let executor: InMemorySqliteExecutor | null = null;

  afterEach(() => {
    executor?.close();
    executor = null;
  });

  type ValidationInput = Parameters<typeof validateKtxRelationshipDiscoveryCandidates>[0];

  /** Opens an in-memory database, registers it for afterEach cleanup, then applies the seed SQL. */
  function openSeededExecutor(seedSql: string): InMemorySqliteExecutor {
    const opened = new InMemorySqliteExecutor();
    executor = opened;
    opened.db.exec(seedSql);
    return opened;
  }

  /** Profiles `target` through `source` using this suite's fixed connection id and driver. */
  function profileWith(source: InMemorySqliteExecutor, target: KtxEnrichedSchema, runId: string) {
    return profileKtxRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: target,
      executor: source,
      ctx: { runId },
    });
  }

  /** Runs validation with this suite's fixed connection id and driver; other fields pass through unchanged. */
  function validateWith({
    runId,
    ...rest
  }: Omit<ValidationInput, 'connectionId' | 'driver' | 'ctx'> & { runId: string }) {
    return validateKtxRelationshipDiscoveryCandidates({
      connectionId: 'warehouse',
      driver: 'sqlite',
      ctx: { runId },
      ...rest,
    });
  }

  it('accepts a relationship-discovery candidate with unique parent values and full source coverage', async () => {
    const db = openSeededExecutor(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      CREATE TABLE invoices (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
      INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 999);
    `);
    const testSchema = schema();
    const profiles = await profileWith(db, testSchema, 'validate-test');
    const usersOnly = generateKtxRelationshipDiscoveryCandidates(testSchema).filter(
      ({ from }) => from.table.name === 'users',
    );

    const validated = await validateWith({
      runId: 'validate-test',
      candidates: usersOnly,
      profiles,
      executor: db,
      tableCount: testSchema.tables.length,
    });

    expect(validated).toHaveLength(1);
    expect(validated[0]).toMatchObject({
      from: { table: { name: 'users' }, columns: ['account_id'] },
      to: { table: { name: 'accounts' }, columns: ['id'] },
      status: 'accepted',
      score: expect.any(Number),
      validation: {
        targetUniqueness: 1,
        sourceCoverage: 1,
        violationCount: 0,
        violationRatio: 0,
        reasons: expect.arrayContaining(['validation_passed']),
      },
    });
    expect(validated[0]?.score).toBeGreaterThanOrEqual(0.85);
  });

  it('rejects a candidate with missing parent values and records the deterministic reason', async () => {
    // Two of the three invoice account ids (999, 1000) have no matching account.
    const db = openSeededExecutor(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      CREATE TABLE invoices (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
      INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 999), (22, 1000);
    `);
    const testSchema = schema();
    const profiles = await profileWith(db, testSchema, 'validate-test');
    const invoicesOnly = generateKtxRelationshipDiscoveryCandidates(testSchema).filter(
      ({ from }) => from.table.name === 'invoices',
    );

    const validated = await validateWith({
      runId: 'validate-test',
      candidates: invoicesOnly,
      profiles,
      executor: db,
      tableCount: testSchema.tables.length,
      settings: {
        minSourceCoverage: 0.9,
        maxViolationRatio: 0.01,
      },
    });

    expect(validated).toHaveLength(1);
    expect(validated[0]).toMatchObject({
      from: { table: { name: 'invoices' }, columns: ['account_id'] },
      to: { table: { name: 'accounts' }, columns: ['id'] },
      status: 'rejected',
      validation: {
        sourceCoverage: 1 / 3,
        violationCount: 2,
        violationRatio: 2 / 3,
        reasons: expect.arrayContaining(['low_source_coverage', 'excessive_violations']),
      },
    });
  });

  it('keeps over-budget candidates review-only without executing coverage SQL for them', async () => {
    const db = openSeededExecutor(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      CREATE TABLE invoices (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
      INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 3);
    `);
    const testSchema = schema();
    const profiles = await profileWith(db, testSchema, 'validate-budget-profile');
    // Count only the queries issued by validation, not by profiling.
    db.queryCount = 0;
    // Give the users candidate the highest confidence and the rest a lower one.
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema).map((candidate) => ({
      ...candidate,
      confidence: candidate.from.table.name === 'users' ? 0.99 : 0.5,
    }));

    const validated = await validateWith({
      runId: 'validate-budget',
      candidates,
      profiles,
      executor: db,
      tableCount: testSchema.tables.length,
      settings: {
        validationBudget: 1,
      },
    });
    const fromTable = (name: string) => validated.find((candidate) => candidate.from.table.name === name);

    expect(db.queryCount).toBe(1);
    expect(validated).toHaveLength(2);
    expect(fromTable('users')).toMatchObject({
      status: 'accepted',
      validation: { reasons: expect.arrayContaining(['validation_passed']) },
    });
    expect(fromTable('invoices')).toMatchObject({
      status: 'review',
      validation: {
        reasons: ['validation_unattempted'],
      },
    });
  });

  it('treats validation budget zero as review-only validation without coverage SQL', async () => {
    const db = openSeededExecutor(`
      CREATE TABLE accounts (id INTEGER, name TEXT);
      CREATE TABLE users (id INTEGER, account_id INTEGER);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
      INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
    `);
    const testSchema = schema([
      table('accounts', [
        column('accounts', 'id', { nullable: false }),
        column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
      ]),
      table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id', { nullable: false })]),
    ]);
    const profiles = await profileWith(db, testSchema, 'validate-zero-budget-profile');
    // Count only the queries issued by validation, not by profiling.
    db.queryCount = 0;
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema);

    const validated = await validateWith({
      runId: 'validate-zero-budget',
      candidates,
      profiles,
      executor: db,
      tableCount: testSchema.tables.length,
      settings: {
        validationBudget: 0,
      },
    });

    expect(db.queryCount).toBe(0);
    expect(validated).toHaveLength(1);
    expect(validated[0]).toMatchObject({
      status: 'review',
      score: expect.any(Number),
      validation: {
        checkedValues: 0,
        reasons: ['validation_unattempted'],
      },
    });
  });

  it('marks rejected LLM proposals with the spec rejection reason', async () => {
    // No order references an existing customer, so the proposal cannot validate.
    const db = openSeededExecutor(`
      CREATE TABLE customers (id INTEGER);
      CREATE TABLE orders (buyer_ref INTEGER);
      INSERT INTO customers (id) VALUES (1), (2);
      INSERT INTO orders (buyer_ref) VALUES (98), (99);
    `);
    const testSchema = schema([
      table('customers', [column('customers', 'id', { nullable: false })]),
      table('orders', [column('orders', 'buyer_ref')]),
    ]);
    const profiles = await profileWith(db, testSchema, 'llm-rejected-validation');

    // Generate a base candidate from a schema with a `customer_id` column,
    // then re-point it at the real `buyer_ref` column as an LLM proposal.
    const namedSchema = schema([
      table('customers', [column('customers', 'id', { nullable: false })]),
      table('orders', [column('orders', 'customer_id')]),
    ]);
    const [baseCandidate] = generateKtxRelationshipDiscoveryCandidates(namedSchema);
    if (!baseCandidate) {
      throw new Error('Expected base candidate');
    }
    const llmCandidate = {
      ...baseCandidate,
      id: 'orders:(orders.buyer_ref)->customers:(customers.id)',
      from: { ...baseCandidate.from, columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
      source: 'llm_proposal' as const,
      evidence: {
        ...baseCandidate.evidence,
        reasons: ['llm_proposal'],
        llmConfidence: 0.84,
        llmRationale: 'Buyer references should map to customers.',
      },
    };

    const [validated] = await validateWith({
      runId: 'llm-rejected-validation',
      candidates: [llmCandidate],
      profiles,
      executor: db,
      tableCount: testSchema.tables.length,
    });

    expect(validated?.status).toBe('rejected');
    expect(validated?.validation.reasons).toEqual(
      expect.arrayContaining(['low_source_coverage', 'llm_proposed_but_validation_failed']),
    );
  });

  it('limits validation query concurrency', async () => {
    // This database is managed locally (closed at the end of the test) and
    // is not registered with the shared afterEach cleanup.
    const db = new InMemorySqliteExecutor();
    db.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      CREATE TABLE invoices (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts VALUES (1), (2);
      INSERT INTO orders VALUES (10, 1), (11, 2);
      INSERT INTO invoices VALUES (20, 1), (21, 2);
    `);

    let inFlight = 0;
    let peakInFlight = 0;
    // Wrapper that tracks how many queries overlap; coverage queries are
    // delayed so that concurrent execution would be observable.
    const throttled = {
      executeReadOnly: async (input: KtxReadOnlyQueryInput, ctx: KtxScanContext) => {
        inFlight += 1;
        peakInFlight = Math.max(peakInFlight, inFlight);
        const delayMs = input.sql.includes('WITH child_values') ? 10 : 0;
        await new Promise((resolve) => setTimeout(resolve, delayMs));
        const result = await db.executeReadOnly(input, ctx);
        inFlight -= 1;
        return result;
      },
    };

    const testSchema = schema([
      table('accounts', [column('accounts', 'id', { nullable: false })]),
      table('orders', [column('orders', 'id', { nullable: false }), column('orders', 'account_id')]),
      table('invoices', [column('invoices', 'id', { nullable: false }), column('invoices', 'account_id')]),
    ]);
    const profiles = await profileWith(db, testSchema, 'validation-concurrency-profile');
    const candidates = generateKtxRelationshipDiscoveryCandidates(testSchema);

    await validateWith({
      runId: 'validation-concurrency',
      candidates,
      profiles,
      executor: throttled,
      tableCount: testSchema.tables.length,
      settings: { concurrency: 1 },
    });

    expect(peakInFlight).toBe(1);
    db.close();
  });

  it('pins column_suffix_match validation scoring for plan-code suffix candidates', async () => {
    const martTable = { catalog: null, db: null, name: 'mart_account_segments' };
    const plansTable = { catalog: null, db: null, name: 'stg_plans' };
    const planCodes = ['basic', 'enterprise', 'free', 'pro'];

    const candidate = {
      id: 'mart:(current_plan_code)->plans:(plan_code)',
      from: {
        tableId: 'mart-account-segments-id',
        columnIds: ['current-plan-code-col'],
        table: { ...martTable },
        columns: ['current_plan_code'],
      },
      to: {
        tableId: 'plans-id',
        columnIds: ['plan-code-col'],
        table: { ...plansTable },
        columns: ['plan_code'],
      },
      relationshipType: 'many_to_one' as const,
      confidence: 0.902,
      source: 'column_suffix_match' as const,
      status: 'review' as const,
      evidence: {
        sourceColumnBase: 'current_plan',
        targetTableBase: 'plan',
        targetColumnBase: 'plan_code',
        targetKeyScore: 0.86,
        nameScore: 0.78,
        reasons: ['column_suffix_match', 'profile_unique_target'],
      },
    };
    // Hand-written profile: both columns are fully unique, non-null text columns.
    const profiles = {
      connectionId: 'warehouse',
      driver: 'sqlite',
      sqlAvailable: true,
      queryCount: 0,
      tables: [],
      warnings: [],
      columns: {
        'mart_account_segments.current_plan_code': {
          table: { ...martTable },
          column: 'current_plan_code',
          nativeType: 'TEXT',
          normalizedType: 'text',
          rowCount: 4,
          nullCount: 0,
          distinctCount: 4,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: [...planCodes],
          minTextLength: 4,
          maxTextLength: 10,
        },
        'stg_plans.plan_code': {
          table: { ...plansTable },
          column: 'plan_code',
          nativeType: 'TEXT',
          normalizedType: 'text',
          rowCount: 4,
          nullCount: 0,
          distinctCount: 4,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: [...planCodes],
          minTextLength: 4,
          maxTextLength: 10,
        },
      },
    } satisfies KtxRelationshipProfileArtifact;
    // Stub executor: reports full overlap (4 of 4 values) and no violations.
    const stubExecutor = {
      async executeReadOnly() {
        return {
          headers: ['child_distinct', 'parent_distinct', 'overlap', 'violation_count'],
          rows: [[4, 4, 4, 0]],
          rowCount: 1,
          totalRows: 1,
        };
      },
    };

    const [validated] = await validateWith({
      runId: 'rule-b-validation-score',
      candidates: [candidate],
      profiles,
      executor: stubExecutor,
      tableCount: 2,
    });

    expect(validated).toMatchObject({
      status: 'accepted',
      score: 0.98,
      validation: {
        targetUniqueness: 1,
        sourceCoverage: 1,
        violationRatio: 0,
        reasons: ['validation_passed'],
      },
    });
  });
});
|
||||
370
packages/cli/src/context/scan/relationship-validation.ts
Normal file
370
packages/cli/src/context/scan/relationship-validation.ts
Normal file
|
|
@ -0,0 +1,370 @@
|
|||
import type { KtxRelationshipEndpoint } from './enrichment-types.js';
|
||||
import { applyKtxRelationshipValidationBudget, type KtxRelationshipValidationBudget } from './relationship-budget.js';
|
||||
import type { KtxRelationshipDiscoveryCandidate } from './relationship-candidates.js';
|
||||
import {
|
||||
formatKtxRelationshipTableRef,
|
||||
type KtxRelationshipProfileArtifact,
|
||||
type KtxRelationshipReadOnlyExecutor,
|
||||
quoteKtxRelationshipIdentifier,
|
||||
} from './relationship-profiling.js';
|
||||
import type { KtxConnectionDriver, KtxQueryResult, KtxScanContext } from './types.js';
|
||||
|
||||
/** Status assigned to a relationship candidate after validation. */
type KtxValidatedRelationshipStatus = 'accepted' | 'review' | 'rejected';

/**
 * Thresholds and limits for relationship validation. Every field except
 * `validationBudget` has a value in `DEFAULT_SETTINGS`.
 */
interface KtxRelationshipValidationSettings {
  acceptThreshold: number;
  reviewThreshold: number;
  minTargetUniqueness: number;
  minSourceCoverage: number;
  maxViolationRatio: number;
  /** Upper bound on the distinct child values sampled by the coverage query. */
  maxDistinctSourceValues: number;
  concurrency: number;
  validationBudget?: KtxRelationshipValidationBudget;
}
|
||||
|
||||
/** Measurements and reason codes recorded for a candidate during validation. */
interface KtxRelationshipValidationEvidence {
  targetUniqueness: number;
  sourceCoverage: number;
  violationCount: number;
  violationRatio: number;
  sourceNullRate: number;
  targetNullRate: number;
  childDistinct: number;
  parentDistinct: number;
  overlap: number;
  checkedValues: number;
  /** Reason codes explaining the outcome, e.g. `validation_passed` or `low_source_coverage`. */
  reasons: string[];
}
|
||||
|
||||
/**
 * A discovery candidate after validation: the candidate's own fields, with
 * `status` replaced by the validation status, plus a score and the
 * validation evidence.
 */
export interface KtxValidatedRelationshipDiscoveryCandidate
  extends Omit<KtxRelationshipDiscoveryCandidate, 'status'> {
  status: KtxValidatedRelationshipStatus;
  score: number;
  validation: KtxRelationshipValidationEvidence;
}
|
||||
|
||||
/** Arguments accepted by `validateKtxRelationshipDiscoveryCandidates`. */
export interface ValidateKtxRelationshipDiscoveryCandidatesInput {
  connectionId: string;
  driver: KtxConnectionDriver;
  candidates: readonly KtxRelationshipDiscoveryCandidate[];
  profiles: KtxRelationshipProfileArtifact;
  /** Executor for read-only SQL; the type also allows `null`. */
  executor: KtxRelationshipReadOnlyExecutor | null;
  ctx: KtxScanContext;
  tableCount?: number;
  /** Overrides applied on top of `DEFAULT_SETTINGS`. */
  settings?: Partial<KtxRelationshipValidationSettings>;
}
|
||||
|
||||
/**
 * Validation settings used when the caller supplies none. `validationBudget`
 * is deliberately left unset here.
 */
const DEFAULT_SETTINGS: KtxRelationshipValidationSettings = {
  acceptThreshold: 0.85,
  reviewThreshold: 0.55,
  minTargetUniqueness: 0.9,
  minSourceCoverage: 0.9,
  maxViolationRatio: 0.01,
  maxDistinctSourceValues: 10000,
  concurrency: 4,
};
|
||||
|
||||
/**
 * Combines caller-supplied settings with `DEFAULT_SETTINGS`.
 *
 * Properties that are present but `undefined` are ignored, so a call such as
 * `mergeSettings({ concurrency: undefined })` keeps the default concurrency
 * instead of overwriting it with `undefined` (which a plain object spread
 * would do).
 *
 * @param settings Optional overrides; may be `undefined`.
 * @returns A new settings object; neither the input nor the defaults are mutated.
 */
function mergeSettings(
  settings: Partial<KtxRelationshipValidationSettings> | undefined,
): KtxRelationshipValidationSettings {
  const provided = Object.fromEntries(
    Object.entries(settings ?? {}).filter(([, value]) => value !== undefined),
  ) as Partial<KtxRelationshipValidationSettings>;
  return { ...DEFAULT_SETTINGS, ...provided };
}
|
||||
|
||||
/** Builds the `table.column` key for a column. */
function profileKey(table: string, column: string): string {
  return [table, column].join('.');
}
|
||||
|
||||
/**
 * Returns the first column name of a relationship endpoint.
 *
 * @throws Error when the endpoint has no first column (or it is an empty string).
 */
function singleRelationshipColumn(endpointValue: KtxRelationshipEndpoint): string {
  const [firstColumn] = endpointValue.columns;
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpointValue.table.name} to contain one column`);
}
|
||||
|
||||
/**
 * Finds the position of `header` among the result's headers, ignoring case.
 * Returns -1 when no header matches.
 */
function headerIndex(result: KtxQueryResult, header: string): number {
  const wanted = header.toLowerCase();
  for (const [position, name] of result.headers.entries()) {
    if (name.toLowerCase() === wanted) {
      return position;
    }
  }
  return -1;
}
|
||||
|
||||
/** Returns the first row of a query result, or an empty array when there are no rows. */
function firstRow(result: KtxQueryResult): unknown[] {
  const [head] = result.rows;
  return head ?? [];
}
|
||||
|
||||
/**
 * Reads the cell under `header` in the first result row as a number.
 *
 * Numbers are used as-is, bigints and non-blank strings are converted with
 * `Number(...)`. Anything that does not yield a finite number (missing
 * column, `null`, blank or non-numeric strings, `NaN`, infinities) is
 * reported as 0 so a malformed cell cannot leak `NaN` into the coverage
 * ratios and score computed from it.
 *
 * @param result Query result to read from.
 * @param header Column name, matched case-insensitively.
 * @returns The finite numeric value of the cell, or 0.
 */
function numberAt(result: KtxQueryResult, header: string): number {
  // A missing header gives index -1, which reads `undefined` and falls to 0.
  const value = firstRow(result)[headerIndex(result, header)];

  let parsed: number;
  switch (typeof value) {
    case 'number':
      parsed = value;
      break;
    case 'bigint':
      parsed = Number(value);
      break;
    case 'string':
      parsed = value.trim() === '' ? 0 : Number(value);
      break;
    default:
      return 0;
  }
  return Number.isFinite(parsed) ? parsed : 0;
}
|
||||
|
||||
/**
 * Trailing ` LIMIT n` clause for drivers other than SQL Server.
 *
 * SQL Server gets an empty string here; its row cap is emitted by `topSql`.
 * The limit is floored and raised to at least 1.
 */
function limitSql(driver: KtxConnectionDriver, limit: number): string {
  const rowCap = Math.max(1, Math.floor(limit));
  return driver === 'sqlserver' ? '' : ` LIMIT ${rowCap}`;
}
|
||||
|
||||
/**
 * Leading ` TOP (n)` clause for SQL Server; empty for every other driver,
 * which use `limitSql` instead. The limit is floored and raised to at least 1.
 */
function topSql(driver: KtxConnectionDriver, limit: number): string {
  const rowCap = Math.max(1, Math.floor(limit));
  return driver === 'sqlserver' ? ` TOP (${rowCap})` : '';
}
|
||||
|
||||
/**
 * Builds the single-row coverage query for one candidate relationship.
 *
 * The query left-joins a capped set of distinct non-null child values
 * against the distinct non-null parent values and returns `child_distinct`,
 * `parent_distinct`, `overlap` and `violation_count`.
 *
 * Table references are formatted from the bare table name only
 * (`catalog` and `db` are passed as `null`).
 */
function buildCoverageSql(input: {
  driver: KtxConnectionDriver;
  childTable: string;
  childColumn: string;
  parentTable: string;
  parentColumn: string;
  maxDistinctSourceValues: number;
}): string {
  const { driver } = input;
  const childTable = formatKtxRelationshipTableRef(driver, { catalog: null, db: null, name: input.childTable });
  const parentTable = formatKtxRelationshipTableRef(driver, { catalog: null, db: null, name: input.parentTable });
  const childColumn = quoteKtxRelationshipIdentifier(driver, input.childColumn);
  const parentColumn = quoteKtxRelationshipIdentifier(driver, input.parentColumn);
  // Exactly one of these is non-empty, depending on the driver.
  const limit = limitSql(driver, input.maxDistinctSourceValues);
  const top = topSql(driver, input.maxDistinctSourceValues);

  const childSelect = `SELECT DISTINCT${top} ${childColumn} AS value FROM ${childTable} WHERE ${childColumn} IS NOT NULL${limit}`;
  const parentSelect = `SELECT DISTINCT ${parentColumn} AS value FROM ${parentTable} WHERE ${parentColumn} IS NOT NULL`;
  const ctes = `WITH child_values AS ( ${childSelect} ), parent_values AS ( ${parentSelect} )`;
  const projection = [
    '(SELECT COUNT(*) FROM child_values) AS child_distinct,',
    '(SELECT COUNT(*) FROM parent_values) AS parent_distinct,',
    'SUM(CASE WHEN parent_values.value IS NOT NULL THEN 1 ELSE 0 END) AS overlap,',
    'SUM(CASE WHEN parent_values.value IS NULL THEN 1 ELSE 0 END) AS violation_count',
  ].join(' ');

  return `${ctes} SELECT ${projection} FROM child_values LEFT JOIN parent_values ON child_values.value = parent_values.value`;
}
|
||||
|
||||
/**
 * Combines the validation signals into one score, capped at 1 and rounded
 * to three decimals.
 *
 * Weights: 0.2 candidate confidence, 0.3 target uniqueness, 0.4 source
 * coverage, 0.1 for the share of values that are not violations
 * (`1 - violationRatio`, floored at 0).
 */
function score(input: {
  candidateConfidence: number;
  targetUniqueness: number;
  sourceCoverage: number;
  violationRatio: number;
}): number {
  const violationScore = Math.max(0, 1 - input.violationRatio);
  const confidenceTerm = 0.2 * input.candidateConfidence;
  const uniquenessTerm = 0.3 * input.targetUniqueness;
  const coverageTerm = 0.4 * input.sourceCoverage;
  const violationTerm = 0.1 * violationScore;

  const weighted = confidenceTerm + uniquenessTerm + coverageTerm + violationTerm;
  const capped = Math.min(1, weighted);
  return Number(capped.toFixed(3));
}
|
||||
|
||||
/** Reason codes that reject a candidate regardless of its score. */
const HARD_REJECTION_REASONS: readonly string[] = [
  'low_target_uniqueness',
  'low_source_coverage',
  'excessive_violations',
];

/**
 * Maps a score and its validation reasons to a final status.
 *
 * Any hard rejection reason wins outright. Otherwise the score is compared
 * against `acceptThreshold`, then `reviewThreshold`; anything lower is
 * rejected. (The accept branch no longer re-tests the rejection reasons:
 * they have already returned by that point.)
 *
 * @returns `'accepted'`, `'review'` or `'rejected'`.
 */
function statusFor(input: {
  score: number;
  reasons: readonly string[];
  settings: KtxRelationshipValidationSettings;
}): KtxValidatedRelationshipStatus {
  if (HARD_REJECTION_REASONS.some((reason) => input.reasons.includes(reason))) {
    return 'rejected';
  }
  if (input.score >= input.settings.acceptThreshold) {
    return 'accepted';
  }
  if (input.score >= input.settings.reviewThreshold) {
    return 'review';
  }
  return 'rejected';
}
|
||||
|
||||
/**
 * Maps `inputs` through an async function with at most `concurrency` calls
 * in flight, returning outputs in input order.
 *
 * Workers pull the next index from a shared cursor. Once any call rejects,
 * the remaining workers stop picking up new inputs (calls already in flight
 * are left to settle), so a failed batch does not keep issuing work after
 * the returned promise has rejected.
 *
 * @param inputs Items to map.
 * @param concurrency Desired parallelism; floored and raised to at least 1.
 * @param mapOne Async mapper applied to each item.
 * @returns Outputs positioned at the index of their input.
 * @throws Whatever `mapOne` rejects with first.
 */
async function mapWithConcurrency<TInput, TOutput>(
  inputs: readonly TInput[],
  concurrency: number,
  mapOne: (input: TInput) => Promise<TOutput>,
): Promise<TOutput[]> {
  const safeConcurrency = Math.max(1, Math.floor(concurrency));
  const outputs: TOutput[] = new Array(inputs.length);
  let nextIndex = 0;
  let failed = false;

  async function worker(): Promise<void> {
    while (!failed && nextIndex < inputs.length) {
      // Claim the index synchronously, before the first await.
      const index = nextIndex;
      nextIndex += 1;
      try {
        outputs[index] = await mapOne(inputs[index] as TInput);
      } catch (error) {
        failed = true;
        throw error;
      }
    }
  }

  await Promise.all(Array.from({ length: Math.min(safeConcurrency, inputs.length) }, () => worker()));
  return outputs;
}
|
||||
|
||||
/**
 * Produces a `'review'` result for a candidate that was not validated by SQL.
 *
 * The score is 60% of the candidate's confidence (three decimals). Evidence
 * is filled from the column profiles when they exist and zeroed otherwise;
 * coverage counters are 0 and `violationRatio` is 1. The single reason code
 * records why validation did not run.
 */
function reviewWithoutValidation(
  candidate: KtxRelationshipDiscoveryCandidate,
  profiles: KtxRelationshipProfileArtifact,
  reason: 'validation_unavailable' | 'profile_unavailable' | 'validation_unattempted',
): KtxValidatedRelationshipDiscoveryCandidate {
  const childColumn = singleRelationshipColumn(candidate.from);
  const parentColumn = singleRelationshipColumn(candidate.to);
  const childProfile = profiles.columns[profileKey(candidate.from.table.name, childColumn)];
  const parentProfile = profiles.columns[profileKey(candidate.to.table.name, parentColumn)];

  const discountedScore = Number((candidate.confidence * 0.6).toFixed(3));
  const evidence = {
    targetUniqueness: parentProfile?.uniquenessRatio ?? 0,
    sourceCoverage: 0,
    violationCount: 0,
    violationRatio: 1,
    sourceNullRate: childProfile?.nullRate ?? 0,
    targetNullRate: parentProfile?.nullRate ?? 0,
    childDistinct: childProfile?.distinctCount ?? 0,
    parentDistinct: parentProfile?.distinctCount ?? 0,
    overlap: 0,
    checkedValues: 0,
    reasons: [reason],
  };

  return { ...candidate, status: 'review', score: discountedScore, validation: evidence };
}
|
||||
|
||||
/**
 * Validates relationship candidates against the database and returns one
 * result per input candidate, in input order.
 *
 * - Without an executor, or when the profiles report SQL as unavailable,
 *   every candidate is returned for review (`validation_unavailable`).
 * - Otherwise the validation budget splits candidates into those to validate
 *   and those deferred. Deferred candidates are returned for review
 *   (`validation_unattempted`); the rest run the coverage query with the
 *   configured concurrency.
 * - A candidate missing either column profile is returned for review
 *   (`profile_unavailable`) without running SQL.
 *
 * @throws Error when a candidate ends up without a result, and whatever the
 *   executor rejects with.
 */
export async function validateKtxRelationshipDiscoveryCandidates(
  input: ValidateKtxRelationshipDiscoveryCandidatesInput,
): Promise<KtxValidatedRelationshipDiscoveryCandidate[]> {
  const settings = mergeSettings(input.settings);
  const { executor, profiles } = input;

  if (!executor || !profiles.sqlAvailable) {
    return input.candidates.map((candidate) =>
      reviewWithoutValidation(candidate, profiles, 'validation_unavailable'),
    );
  }

  const validateOne = async (
    candidate: KtxRelationshipDiscoveryCandidate,
  ): Promise<KtxValidatedRelationshipDiscoveryCandidate> => {
    const childColumn = singleRelationshipColumn(candidate.from);
    const parentColumn = singleRelationshipColumn(candidate.to);
    const childTable = candidate.from.table.name;
    const parentTable = candidate.to.table.name;
    const childProfile = profiles.columns[profileKey(childTable, childColumn)];
    const parentProfile = profiles.columns[profileKey(parentTable, parentColumn)];
    if (!(childProfile && parentProfile)) {
      return reviewWithoutValidation(candidate, profiles, 'profile_unavailable');
    }

    const sql = buildCoverageSql({
      driver: input.driver,
      childTable,
      childColumn,
      parentTable,
      parentColumn,
      maxDistinctSourceValues: settings.maxDistinctSourceValues,
    });
    const result = await executor.executeReadOnly({ connectionId: input.connectionId, sql, maxRows: 1 }, input.ctx);

    const childDistinct = numberAt(result, 'child_distinct');
    const parentDistinct = numberAt(result, 'parent_distinct');
    const overlap = numberAt(result, 'overlap');
    const violationCount = numberAt(result, 'violation_count');

    // With no child values there is nothing to cover: coverage 0, violations 1.
    const sourceCoverage = childDistinct === 0 ? 0 : overlap / childDistinct;
    const violationRatio = childDistinct === 0 ? 1 : violationCount / childDistinct;
    const targetUniqueness = parentProfile.uniquenessRatio;

    const checks: ReadonlyArray<readonly [boolean, string]> = [
      [targetUniqueness < settings.minTargetUniqueness, 'low_target_uniqueness'],
      [sourceCoverage < settings.minSourceCoverage, 'low_source_coverage'],
      [violationRatio > settings.maxViolationRatio, 'excessive_violations'],
    ];
    const reasons: string[] = checks.filter(([failed]) => failed).map(([, code]) => code);
    if (reasons.length === 0) {
      reasons.push('validation_passed');
    }

    const candidateScore = score({
      candidateConfidence: candidate.confidence,
      targetUniqueness,
      sourceCoverage,
      violationRatio,
    });
    const candidateStatus = statusFor({ score: candidateScore, reasons, settings });
    // Appended after the status is decided, so it never influences it.
    if (candidate.source === 'llm_proposal' && candidateStatus === 'rejected') {
      reasons.push('llm_proposed_but_validation_failed');
    }

    return {
      ...candidate,
      status: candidateStatus,
      score: candidateScore,
      validation: {
        targetUniqueness,
        sourceCoverage,
        violationCount,
        violationRatio,
        sourceNullRate: childProfile.nullRate,
        targetNullRate: parentProfile.nullRate,
        childDistinct,
        parentDistinct,
        overlap,
        checkedValues: childDistinct,
        reasons,
      },
    };
  };

  const budgeted = applyKtxRelationshipValidationBudget({
    candidates: input.candidates,
    tableCount: input.tableCount ?? 0,
    budget: settings.validationBudget,
    score: (candidate) => candidate.confidence,
  });
  const validated = await mapWithConcurrency(
    budgeted.toValidate.map((entry) => entry.candidate),
    settings.concurrency,
    validateOne,
  );

  // Re-key both groups by their position in the original candidate list.
  const resultsByIndex = new Map<number, KtxValidatedRelationshipDiscoveryCandidate>();
  budgeted.toValidate.forEach((entry, position) => {
    const outcome = validated[position];
    if (entry.originalIndex !== undefined && outcome) {
      resultsByIndex.set(entry.originalIndex, outcome);
    }
  });
  for (const deferred of budgeted.deferred) {
    resultsByIndex.set(
      deferred.originalIndex,
      reviewWithoutValidation(deferred.candidate, profiles, 'validation_unattempted'),
    );
  }

  return input.candidates.map((_, index) => {
    const outcome = resultsByIndex.get(index);
    if (!outcome) {
      throw new Error(`Missing relationship validation result for candidate at index ${index}`);
    }
    return outcome;
  });
}
|
||||
|
|
@ -0,0 +1,237 @@
|
|||
import { mkdirSync } from 'node:fs';
|
||||
import { dirname } from 'node:path';
|
||||
import Database from 'better-sqlite3';
|
||||
import type {
|
||||
KtxScanEnrichmentCompletedStage,
|
||||
KtxScanEnrichmentFailedStage,
|
||||
KtxScanEnrichmentStageLookup,
|
||||
KtxScanEnrichmentStageRecord,
|
||||
KtxScanEnrichmentStateStore,
|
||||
} from './enrichment-state.js';
|
||||
import type { KtxScanEnrichmentStage, KtxScanMode } from './types.js';
|
||||
|
||||
/** Construction options for `SqliteLocalScanEnrichmentStateStore`. */
export interface SqliteLocalScanEnrichmentStateStoreOptions {
  /** Path of the SQLite database file; its parent directory is created if missing. */
  dbPath: string;
}
|
||||
|
||||
/** Raw row shape of the `local_scan_enrichment_stages` table. */
interface StageRow {
  run_id: string;
  connection_id: string;
  sync_id: string;
  mode: KtxScanMode;
  stage: KtxScanEnrichmentStage;
  input_hash: string;
  status: 'completed' | 'failed';
  /** JSON-encoded stage output; written as NULL for failed stages. */
  output_json: string | null;
  /** Failure message; written as NULL for completed stages. */
  error_message: string | null;
  updated_at: string;
}
|
||||
|
||||
/**
 * Converts a database row into a stage record.
 *
 * Completed rows get their `output_json` parsed (a NULL column parses to
 * `null`). Any other status is reported as failed, with a generic message
 * when the row carries none.
 *
 * @throws SyntaxError when a completed row holds invalid JSON.
 */
function parseStageRow<TOutput = unknown>(row: StageRow): KtxScanEnrichmentStageRecord<TOutput> {
  const identity = {
    runId: row.run_id,
    connectionId: row.connection_id,
    syncId: row.sync_id,
    mode: row.mode,
    stage: row.stage,
    inputHash: row.input_hash,
  };

  if (row.status !== 'completed') {
    return {
      ...identity,
      status: 'failed',
      output: null,
      errorMessage: row.error_message ?? 'Unknown enrichment stage failure',
      updatedAt: row.updated_at,
    };
  }

  return {
    ...identity,
    status: 'completed',
    output: JSON.parse(row.output_json ?? 'null') as TOutput,
    errorMessage: null,
    updatedAt: row.updated_at,
  };
}
|
||||
|
||||
/** Starts with an ASCII letter or digit, followed by letters, digits, `_`, `.` or `-`. */
const SAFE_RUN_ID_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/;

/** Tells whether `runId` is non-empty and made only of the allowed characters. */
function isSafeRunId(runId: string): boolean {
  return SAFE_RUN_ID_PATTERN.test(runId);
}
|
||||
|
||||
/**
 * SQLite-backed enrichment state store.
 *
 * Keeps one row per `(run_id, stage)` in `local_scan_enrichment_stages`;
 * saving a stage again for the same run overwrites the previous row.
 * Lookups by run id return nothing for ids rejected by `isSafeRunId`.
 */
export class SqliteLocalScanEnrichmentStateStore implements KtxScanEnrichmentStateStore {
  private readonly db: Database.Database;

  /**
   * Opens (creating if needed) the database at `options.dbPath`, switches it
   * to WAL journaling and ensures the stage table and its index exist.
   */
  constructor(options: SqliteLocalScanEnrichmentStateStoreOptions) {
    mkdirSync(dirname(options.dbPath), { recursive: true });
    this.db = new Database(options.dbPath);
    this.db.pragma('journal_mode = WAL');
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS local_scan_enrichment_stages (
        run_id TEXT NOT NULL,
        stage TEXT NOT NULL,
        input_hash TEXT NOT NULL,
        connection_id TEXT NOT NULL,
        sync_id TEXT NOT NULL,
        mode TEXT NOT NULL,
        status TEXT NOT NULL,
        output_json TEXT,
        error_message TEXT,
        updated_at TEXT NOT NULL,
        PRIMARY KEY (run_id, stage)
      );

      CREATE INDEX IF NOT EXISTS local_scan_enrichment_stages_run_idx
        ON local_scan_enrichment_stages (run_id, updated_at, stage);
    `);
  }

  /**
   * Returns the completed record for a run's stage when its stored input
   * hash matches, otherwise `null`.
   */
  async findCompletedStage<TOutput = unknown>(
    input: KtxScanEnrichmentStageLookup,
  ): Promise<KtxScanEnrichmentCompletedStage<TOutput> | null> {
    if (!isSafeRunId(input.runId)) {
      return null;
    }
    const row = this.db
      .prepare(
        `
          SELECT *
          FROM local_scan_enrichment_stages
          WHERE run_id = ?
            AND stage = ?
            AND input_hash = ?
            AND status = 'completed'
        `,
      )
      .get(input.runId, input.stage, input.inputHash) as StageRow | undefined;

    if (!row) {
      return null;
    }
    const parsed = parseStageRow<TOutput>(row);
    return parsed.status === 'completed' ? parsed : null;
  }

  /**
   * Stores a completed stage, replacing any existing row for the same run
   * and stage.
   *
   * An output that `JSON.stringify` cannot represent (it returns `undefined`
   * for `undefined`, functions and symbols) is stored as JSON `null` rather
   * than handed to the driver as an unbindable `undefined`; reading it back
   * yields `null`.
   */
  async saveCompletedStage<TOutput = unknown>(
    input: Omit<KtxScanEnrichmentCompletedStage<TOutput>, 'status' | 'errorMessage'>,
  ): Promise<void> {
    // Typed as possibly undefined: the lib signature claims `string`, the runtime does not.
    const serialized: string | undefined = JSON.stringify(input.output);
    this.upsertStage({
      runId: input.runId,
      stage: input.stage,
      inputHash: input.inputHash,
      connectionId: input.connectionId,
      syncId: input.syncId,
      mode: input.mode,
      status: 'completed',
      outputJson: serialized ?? 'null',
      errorMessage: null,
      updatedAt: input.updatedAt,
    });
  }

  /** Stores a failed stage, replacing any existing row for the same run and stage. */
  async saveFailedStage(input: Omit<KtxScanEnrichmentFailedStage, 'status' | 'output'>): Promise<void> {
    this.upsertStage({
      runId: input.runId,
      stage: input.stage,
      inputHash: input.inputHash,
      connectionId: input.connectionId,
      syncId: input.syncId,
      mode: input.mode,
      status: 'failed',
      outputJson: null,
      errorMessage: input.errorMessage,
      updatedAt: input.updatedAt,
    });
  }

  /** Lists every stored stage of a run, ordered by `updated_at` then `stage`. */
  async listRunStages(runId: string): Promise<KtxScanEnrichmentStageRecord[]> {
    if (!isSafeRunId(runId)) {
      return [];
    }
    const rows = this.db
      .prepare(
        `
          SELECT *
          FROM local_scan_enrichment_stages
          WHERE run_id = ?
          ORDER BY updated_at ASC, stage ASC
        `,
      )
      .all(runId) as StageRow[];
    return rows.map((row) => parseStageRow(row));
  }

  /** Inserts a stage row, or overwrites the row with the same `(run_id, stage)`. */
  private upsertStage(row: {
    runId: string;
    stage: string;
    inputHash: string;
    connectionId: string;
    syncId: string;
    mode: string;
    status: StageRow['status'];
    outputJson: string | null;
    errorMessage: string | null;
    updatedAt: string;
  }): void {
    this.db
      .prepare(
        `
          INSERT INTO local_scan_enrichment_stages (
            run_id,
            stage,
            input_hash,
            connection_id,
            sync_id,
            mode,
            status,
            output_json,
            error_message,
            updated_at
          )
          VALUES (
            @runId,
            @stage,
            @inputHash,
            @connectionId,
            @syncId,
            @mode,
            @status,
            @outputJson,
            @errorMessage,
            @updatedAt
          )
          ON CONFLICT(run_id, stage) DO UPDATE SET
            input_hash = excluded.input_hash,
            connection_id = excluded.connection_id,
            sync_id = excluded.sync_id,
            mode = excluded.mode,
            status = excluded.status,
            output_json = excluded.output_json,
            error_message = excluded.error_message,
            updated_at = excluded.updated_at
        `,
      )
      .run(row);
  }
}
|
||||
24
packages/cli/src/context/scan/type-normalization.test.ts
Normal file
24
packages/cli/src/context/scan/type-normalization.test.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { inferKtxDimensionType, ktxColumnTypeMappingFromNative, normalizeKtxNativeType } from './type-normalization.js';
|
||||
|
||||
// Table-driven checks for the native-type helpers; test names are unchanged.
describe('KTX scan type normalization', () => {
  it('normalizes native database type strings', () => {
    const cases: ReadonlyArray<readonly [nativeType: string, normalized: string]> = [
      [' NUMERIC(12, 2) ', 'numeric'],
      ['TIMESTAMP WITH TIME ZONE', 'timestamp with time zone'],
      ['', 'unknown'],
    ];

    for (const [nativeType, normalized] of cases) {
      expect(normalizeKtxNativeType(nativeType)).toBe(normalized);
    }
  });

  it('infers dimension types from native types', () => {
    const cases: ReadonlyArray<readonly [nativeType: string, dimensionType: string]> = [
      ['BOOLEAN', 'boolean'],
      ['timestamp with time zone', 'time'],
      ['decimal(10,2)', 'number'],
      ['varchar(255)', 'string'],
    ];

    for (const [nativeType, dimensionType] of cases) {
      expect(inferKtxDimensionType(nativeType)).toBe(dimensionType);
    }
  });

  it('builds a complete column type mapping', () => {
    const mapping = ktxColumnTypeMappingFromNative('BIGINT');

    expect(mapping).toEqual({
      normalizedType: 'bigint',
      dimensionType: 'number',
    });
  });
});
|
||||
34
packages/cli/src/context/scan/type-normalization.ts
Normal file
34
packages/cli/src/context/scan/type-normalization.ts
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import type { KtxSchemaDimensionType } from './types.js';
|
||||
|
||||
/**
 * Pair of derived type facts for one native column type.
 *
 * @internal
 */
export interface KtxColumnTypeMapping {
  /** Lower-cased native type with parenthesized arguments removed. */
  normalizedType: string;
  /** Coarse dimension category inferred from the native type. */
  dimensionType: KtxSchemaDimensionType;
}
|
||||
|
||||
/** Boolean native types. */
const KTX_BOOLEAN_TYPE_PATTERN = /\b(bool|boolean)\b/;

/**
 * Date/time native types, including sized and suffixed spellings such as
 * `datetime2`, `datetime64`, `date32`, `timestamptz` and `timestamp_ntz`.
 */
const KTX_TIME_TYPE_PATTERN =
  /\b(date\d*|datetime\d*|smalldatetime|datetimeoffset|time|timetz|timestamp|timestamptz|timestamp_(?:ntz|ltz|tz))\b/;

/**
 * Numeric native types, including sized spellings such as `int4`, `int64`,
 * `uint32`, `float8`, `float64` and `decimal128`.
 */
const KTX_NUMBER_TYPE_PATTERN =
  /\b(u?int\d*|integer|bigint|smallint|tinyint|mediumint|decimal\d*|numeric|bignumeric|bigdecimal|number|float\d*|double|real|money|smallmoney)\b/;

/**
 * Normalizes a native database type string: lower-cases it, removes every
 * parenthesized argument list (nested ones included, so
 * `Array(Tuple(String, UInt8))` becomes `array` without a stray `)`),
 * collapses whitespace and trims.
 *
 * @param nativeType Type string as reported by the database.
 * @returns The normalized type, or `'unknown'` when nothing is left.
 */
export function normalizeKtxNativeType(nativeType: string): string {
  let stripped = nativeType.toLowerCase();
  let previous: string;
  // Peel innermost groups repeatedly until no balanced group remains.
  do {
    previous = stripped;
    stripped = stripped.replace(/\([^()]*\)/g, '');
  } while (stripped !== previous);

  const normalized = stripped.replace(/\s+/g, ' ').trim();
  return normalized.length > 0 ? normalized : 'unknown';
}

/**
 * Infers the coarse dimension type of a native database type.
 *
 * Checks run in order boolean, time, number; anything else is `'string'`.
 *
 * @param nativeType Type string as reported by the database.
 * @returns `'boolean'`, `'time'`, `'number'` or `'string'`.
 */
export function inferKtxDimensionType(nativeType: string): KtxSchemaDimensionType {
  const normalized = normalizeKtxNativeType(nativeType);
  if (KTX_BOOLEAN_TYPE_PATTERN.test(normalized)) {
    return 'boolean';
  }
  if (KTX_TIME_TYPE_PATTERN.test(normalized)) {
    return 'time';
  }
  if (KTX_NUMBER_TYPE_PATTERN.test(normalized)) {
    return 'number';
  }
  return 'string';
}
|
||||
|
||||
/**
 * Derives both the normalized type and the dimension type of a native type.
 *
 * @internal
 */
export function ktxColumnTypeMappingFromNative(nativeType: string): KtxColumnTypeMapping {
  const normalizedType = normalizeKtxNativeType(nativeType);
  const dimensionType = inferKtxDimensionType(nativeType);
  return { normalizedType, dimensionType };
}
|
||||
258
packages/cli/src/context/scan/types.test.ts
Normal file
258
packages/cli/src/context/scan/types.test.ts
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
createKtxConnectorCapabilities,
|
||||
type KtxEventPropertyDiscovery,
|
||||
type KtxEventPropertyDiscoveryInput,
|
||||
type KtxEventPropertyValuesInput,
|
||||
type KtxEventPropertyValuesResult,
|
||||
type KtxEventStreamDiscoveryPort,
|
||||
type KtxEventTypeDiscovery,
|
||||
type KtxEventTypeDiscoveryInput,
|
||||
type KtxNetworkEndpoint,
|
||||
type KtxNetworkTunnelPort,
|
||||
type KtxQueryResult,
|
||||
type KtxScanConnector,
|
||||
type KtxScanContext,
|
||||
type KtxScanInput,
|
||||
type KtxSchemaSnapshot,
|
||||
} from './types.js';
|
||||
|
||||
// Contract tests for the scan types: mostly compile-time shape checks backed
// by small runtime assertions.
describe('KTX scan contract types', () => {
  const makeScanContext = (): KtxScanContext => ({ runId: 'scan-run-1' });
  const makeEventsTable = () => ({ catalog: '157881', db: null, name: 'events' });

  it('defaults to structural-only connector capabilities', () => {
    const capabilities = createKtxConnectorCapabilities();

    expect(capabilities).toEqual({
      structuralIntrospection: true,
      tableSampling: false,
      columnSampling: false,
      columnStats: false,
      readOnlySql: false,
      nestedAnalysis: false,
      eventStreamDiscovery: false,
      formalForeignKeys: false,
      estimatedRowCounts: false,
    });
  });

  it('keeps structural introspection mandatory when optional capabilities are enabled', () => {
    const capabilities = createKtxConnectorCapabilities({
      tableSampling: true,
      readOnlySql: true,
      eventStreamDiscovery: true,
      estimatedRowCounts: true,
    });

    expect(capabilities).toEqual({
      structuralIntrospection: true,
      tableSampling: true,
      columnSampling: false,
      columnStats: false,
      readOnlySql: true,
      nestedAnalysis: false,
      eventStreamDiscovery: true,
      formalForeignKeys: false,
      estimatedRowCounts: true,
    });
  });

  it('describes the connector surface without requiring enrichment methods', async () => {
    const ordersTable: KtxSchemaSnapshot['tables'][number] = {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'Customer orders',
      estimatedRows: 42,
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'Primary key',
        },
      ],
      foreignKeys: [],
    };
    const snapshot: KtxSchemaSnapshot = {
      connectionId: 'warehouse',
      driver: 'postgres',
      extractedAt: '2026-04-29T00:00:00.000Z',
      scope: { schemas: ['public'] },
      metadata: { source: 'unit-test' },
      tables: [ordersTable],
    };
    const scanInput: KtxScanInput = {
      connectionId: 'warehouse',
      driver: 'postgres',
      scope: { schemas: ['public'] },
      mode: 'structural',
    };

    const connector: KtxScanConnector = {
      id: 'test-postgres',
      driver: 'postgres',
      capabilities: createKtxConnectorCapabilities({ estimatedRowCounts: true }),
      async introspect(input: KtxScanInput, ctx: KtxScanContext) {
        expect(input.connectionId).toBe('warehouse');
        expect(ctx.runId).toBe('scan-run-1');
        return snapshot;
      },
    };

    const introspected = connector.introspect(scanInput, makeScanContext());
    await expect(introspected).resolves.toEqual(snapshot);
  });

  it('models optional event-stream discovery as a connector capability and port', async () => {
    const makeEventTypesInput = (): KtxEventTypeDiscoveryInput => ({
      connectionId: 'product',
      table: makeEventsTable(),
      eventColumn: 'event',
      limit: 2,
      minCount: 30,
      lookbackDays: 14,
    });
    const makePropertyKeysInput = (): KtxEventPropertyDiscoveryInput => ({
      connectionId: 'product',
      table: makeEventsTable(),
      jsonColumn: 'properties',
      sampleSize: 1000,
      limit: 5,
      lookbackDays: 7,
    });
    const makePropertyValuesInput = (): KtxEventPropertyValuesInput => ({
      connectionId: 'product',
      table: makeEventsTable(),
      jsonColumn: 'properties',
      propertyKey: '$browser',
      limit: 3,
      maxCardinality: 1000,
      lookbackDays: 30,
    });

    const eventTypes: KtxEventTypeDiscovery[] = [{ value: '$pageview', count: 42 }];
    const propertyKeys: KtxEventPropertyDiscovery[] = [{ key: '$browser', count: 31 }];
    const propertyValues: KtxEventPropertyValuesResult = { values: ['Chrome', 'Safari'], cardinality: 2 };

    const discovery: KtxEventStreamDiscoveryPort = {
      async listEventTypes(input: KtxEventTypeDiscoveryInput) {
        expect(input).toEqual(makeEventTypesInput());
        return eventTypes;
      },
      async listPropertyKeys(input: KtxEventPropertyDiscoveryInput) {
        expect(input).toEqual(makePropertyKeysInput());
        return propertyKeys;
      },
      async listPropertyValues(input: KtxEventPropertyValuesInput) {
        expect(input).toEqual(makePropertyValuesInput());
        return propertyValues;
      },
    };

    const connector: KtxScanConnector = {
      id: 'clickhouse:product',
      driver: 'clickhouse',
      capabilities: createKtxConnectorCapabilities({ eventStreamDiscovery: true }),
      eventStreamDiscovery: discovery,
      async introspect() {
        return {
          connectionId: 'product',
          driver: 'clickhouse',
          extractedAt: '2026-04-29T00:00:00.000Z',
          scope: { catalogs: ['157881'] },
          metadata: {},
          tables: [],
        };
      },
    };

    const port = connector.eventStreamDiscovery;
    await expect(port?.listEventTypes(makeEventTypesInput(), makeScanContext())).resolves.toEqual([
      { value: '$pageview', count: 42 },
    ]);
    await expect(port?.listPropertyKeys(makePropertyKeysInput(), makeScanContext())).resolves.toEqual([
      { key: '$browser', count: 31 },
    ]);
    await expect(port?.listPropertyValues(makePropertyValuesInput(), makeScanContext())).resolves.toEqual({
      values: ['Chrome', 'Safari'],
      cardinality: 2,
    });
  });

  it('keeps read-only query results separate from schema snapshots', () => {
    const result: KtxQueryResult = {
      headers: ['id', 'amount'],
      headerTypes: ['integer', 'numeric'],
      rows: [[1, 10.5]],
      totalRows: 1,
      rowCount: 1,
    };

    expect(result).toEqual({
      headers: ['id', 'amount'],
      headerTypes: ['integer', 'numeric'],
      rows: [[1, 10.5]],
      totalRows: 1,
      rowCount: 1,
    });
  });

  it('models host-provided network tunnel endpoint resolution without app imports', async () => {
    type ProxiedConnection = { networkProxy?: { type: 'ssh_tunnel' } };
    type TunnelRequest = Parameters<KtxNetworkTunnelPort<ProxiedConnection>['resolveEndpoint']>[0];

    const makeRequest = (): TunnelRequest => ({
      connectionId: 'warehouse',
      driver: 'postgres',
      host: 'db.internal',
      port: 5432,
      connection: { networkProxy: { type: 'ssh_tunnel' } },
    });
    const endpoint: KtxNetworkEndpoint = {
      host: '127.0.0.1',
      port: 15432,
      close: async () => undefined,
    };
    const tunnelPort: KtxNetworkTunnelPort<ProxiedConnection> = {
      async resolveEndpoint(input) {
        expect(input).toEqual(makeRequest());
        return endpoint;
      },
    };

    await expect(tunnelPort.resolveEndpoint(makeRequest())).resolves.toBe(endpoint);
  });
});
|
||||
412
packages/cli/src/context/scan/types.ts
Normal file
412
packages/cli/src/context/scan/types.ts
Normal file
|
|
@ -0,0 +1,412 @@
|
|||
/**
 * Identifiers of the database drivers the scan contract recognizes.
 * `'postgres'` and `'postgresql'` are both listed as distinct members.
 */
export type KtxConnectionDriver =
  | 'sqlite'
  | 'postgres'
  | 'postgresql'
  | 'sqlserver'
  | 'bigquery'
  | 'snowflake'
  | 'mysql'
  | 'clickhouse';

/** Depth of a scan. */
export type KtxScanMode =
  | 'structural'
  | 'relationships'
  | 'enriched';

/** What initiated a scan. */
export type KtxScanTrigger =
  | 'cli'
  | 'mcp'
  | 'schema_scan'
  | 'scheduled'
  | 'manual';
|
||||
|
||||
/**
 * Feature flags advertised by a scan connector.
 *
 * `structuralIntrospection` is typed as the literal `true`, so every
 * connector must provide it; all other flags are plain booleans.
 */
export interface KtxConnectorCapabilities {
  structuralIntrospection: true;
  tableSampling: boolean;
  columnSampling: boolean;
  columnStats: boolean;
  readOnlySql: boolean;
  nestedAnalysis: boolean;
  eventStreamDiscovery: boolean;
  formalForeignKeys: boolean;
  estimatedRowCounts: boolean;
}

/** Every capability except the mandatory `structuralIntrospection`, each optional. */
export type KtxOptionalConnectorCapabilities = Partial<
  Omit<KtxConnectorCapabilities, 'structuralIntrospection'>
>;
|
||||
|
||||
/**
 * Builds a complete capability set from the optional flags a connector
 * declares.
 *
 * `structuralIntrospection` is always `true`; every flag left unset (or set
 * to `null`/`undefined`) defaults to `false`.
 */
export function createKtxConnectorCapabilities(
  capabilities: KtxOptionalConnectorCapabilities = {},
): KtxConnectorCapabilities {
  const flag = (name: keyof KtxOptionalConnectorCapabilities): boolean => capabilities[name] ?? false;

  return {
    structuralIntrospection: true,
    tableSampling: flag('tableSampling'),
    columnSampling: flag('columnSampling'),
    columnStats: flag('columnStats'),
    readOnlySql: flag('readOnlySql'),
    nestedAnalysis: flag('nestedAnalysis'),
    eventStreamDiscovery: flag('eventStreamDiscovery'),
    formalForeignKeys: flag('formalForeignKeys'),
    estimatedRowCounts: flag('estimatedRowCounts'),
  };
}
|
||||
|
||||
/** Optional lists of catalogs, schemas and datasets describing a scan's scope. */
interface KtxSchemaScope {
  catalogs?: string[];
  schemas?: string[];
  datasets?: string[];
}

/** Kind of relation a schema table entry represents. */
type KtxSchemaTableKind =
  | 'table'
  | 'view'
  | 'external'
  | 'event_stream';

/** Coarse category assigned to a column's type. */
export type KtxSchemaDimensionType =
  | 'time'
  | 'string'
  | 'number'
  | 'boolean';

/** One column of a scanned table. */
export interface KtxSchemaColumn {
  name: string;
  /** Type string as reported by the database. */
  nativeType: string;
  normalizedType: string;
  dimensionType: KtxSchemaDimensionType;
  nullable: boolean;
  primaryKey: boolean;
  comment: string | null;
}

/** A foreign key from one column of a table to a column of another table. */
export interface KtxSchemaForeignKey {
  fromColumn: string;
  toCatalog: string | null;
  toDb: string | null;
  toTable: string;
  toColumn: string;
  constraintName: string | null;
}

/** One scanned table with its columns and foreign keys. */
export interface KtxSchemaTable {
  catalog: string | null;
  db: string | null;
  name: string;
  kind: KtxSchemaTableKind;
  comment: string | null;
  /** Estimated row count, or `null` when not available. */
  estimatedRows: number | null;
  columns: KtxSchemaColumn[];
  foreignKeys: KtxSchemaForeignKey[];
}

/** Result of introspecting one connection. */
export interface KtxSchemaSnapshot {
  connectionId: string;
  driver: KtxConnectionDriver;
  /** Extraction timestamp, carried as a string. */
  extractedAt: string;
  scope: KtxSchemaScope;
  tables: KtxSchemaTable[];
  metadata: Record<string, unknown>;
}
|
||||
|
||||
/** Credential reference of kind `'env'`, identified by `name`. */
interface KtxCredentialEnvReference {
  kind: 'env';
  name: string;
}

/** Credential reference of kind `'file'`, identified by `path`. */
interface KtxCredentialFileReference {
  kind: 'file';
  path: string;
}

/** Credentials that have already been resolved to concrete values. */
interface KtxResolvedCredentialEnvelope {
  kind: 'resolved';
  source: 'standalone' | 'host';
  values: Record<string, unknown>;
  redacted?: boolean;
}

/** Any supported way of supplying credentials, discriminated by `kind`. */
export type KtxCredentialEnvelope =
  | KtxCredentialEnvReference
  | KtxCredentialFileReference
  | KtxResolvedCredentialEnvelope;
|
||||
|
||||
/**
 * Host and port to connect to, with an optional async `close` hook.
 *
 * @internal
 */
export interface KtxNetworkEndpoint {
  host: string;
  port: number;
  close?: () => Promise<void>;
}

/** Request passed to a tunnel port: the target host/port plus the connection it belongs to. */
interface KtxNetworkTunnelRequest<TConnection = Record<string, unknown>> {
  connectionId: string;
  driver: KtxConnectionDriver;
  host: string;
  port: number;
  connection: TConnection;
}

/**
 * Port that resolves the endpoint to use for a connection; may resolve to
 * `null`.
 *
 * @internal
 */
export interface KtxNetworkTunnelPort<TConnection = Record<string, unknown>> {
  resolveEndpoint(input: KtxNetworkTunnelRequest<TConnection>): Promise<KtxNetworkEndpoint | null>;
}
|
||||
|
||||
export interface KtxScanInput {
|
||||
connectionId: string;
|
||||
driver: KtxConnectionDriver;
|
||||
scope?: KtxSchemaScope;
|
||||
mode?: KtxScanMode;
|
||||
dryRun?: boolean;
|
||||
detectRelationships?: boolean;
|
||||
credentials?: KtxCredentialEnvelope;
|
||||
metadata?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
export interface KtxProgressUpdateOptions {
|
||||
transient?: boolean;
|
||||
}
|
||||
|
||||
export interface KtxProgressPort {
|
||||
update(progress: number, message?: string, options?: KtxProgressUpdateOptions): Promise<void>;
|
||||
startPhase(weight: number): KtxProgressPort;
|
||||
}
|
||||
|
||||
export interface KtxScanLoggerPort {
|
||||
debug(message: string, metadata?: Record<string, unknown>): void;
|
||||
info(message: string, metadata?: Record<string, unknown>): void;
|
||||
warn(message: string, metadata?: Record<string, unknown>): void;
|
||||
error(message: string, metadata?: Record<string, unknown>): void;
|
||||
}
|
||||
|
||||
export interface KtxScanContext {
|
||||
runId: string;
|
||||
signal?: AbortSignal;
|
||||
progress?: KtxProgressPort;
|
||||
logger?: KtxScanLoggerPort;
|
||||
}
|
||||
|
||||
export interface KtxTableRef {
|
||||
catalog: string | null;
|
||||
db: string | null;
|
||||
name: string;
|
||||
}
|
||||
|
||||
export interface KtxTableSampleInput {
|
||||
connectionId: string;
|
||||
table: KtxTableRef;
|
||||
columns?: string[];
|
||||
limit: number;
|
||||
}
|
||||
|
||||
export interface KtxTableSampleResult {
|
||||
headers: string[];
|
||||
rows: unknown[][];
|
||||
totalRows: number;
|
||||
}
|
||||
|
||||
export interface KtxColumnSampleInput {
|
||||
connectionId: string;
|
||||
table: KtxTableRef;
|
||||
column: string;
|
||||
limit: number;
|
||||
}
|
||||
|
||||
export interface KtxColumnSampleResult {
|
||||
values: unknown[];
|
||||
nullCount: number | null;
|
||||
distinctCount: number | null;
|
||||
}
|
||||
|
||||
export interface KtxColumnStatsInput {
|
||||
connectionId: string;
|
||||
table: KtxTableRef;
|
||||
column: string;
|
||||
}
|
||||
|
||||
export interface KtxColumnStatsResult {
|
||||
min: unknown;
|
||||
max: unknown;
|
||||
average: number | null;
|
||||
nullCount: number | null;
|
||||
distinctCount: number | null;
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export interface KtxEventTypeDiscoveryInput {
|
||||
connectionId: string;
|
||||
table: KtxTableRef;
|
||||
eventColumn: string;
|
||||
limit: number;
|
||||
minCount?: number;
|
||||
lookbackDays?: number;
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export interface KtxEventTypeDiscovery {
|
||||
value: string;
|
||||
count: number;
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export interface KtxEventPropertyDiscoveryInput {
|
||||
connectionId: string;
|
||||
table: KtxTableRef;
|
||||
jsonColumn: string;
|
||||
sampleSize: number;
|
||||
limit: number;
|
||||
lookbackDays?: number;
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export interface KtxEventPropertyDiscovery {
|
||||
key: string;
|
||||
count: number;
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export interface KtxEventPropertyValuesInput {
|
||||
connectionId: string;
|
||||
table: KtxTableRef;
|
||||
jsonColumn: string;
|
||||
propertyKey: string;
|
||||
limit: number;
|
||||
maxCardinality?: number;
|
||||
lookbackDays?: number;
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export interface KtxEventPropertyValuesResult {
|
||||
values: string[];
|
||||
cardinality: number;
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export interface KtxEventStreamDiscoveryPort {
|
||||
listEventTypes(input: KtxEventTypeDiscoveryInput, ctx: KtxScanContext): Promise<KtxEventTypeDiscovery[]>;
|
||||
listPropertyKeys(input: KtxEventPropertyDiscoveryInput, ctx: KtxScanContext): Promise<KtxEventPropertyDiscovery[]>;
|
||||
listPropertyValues(
|
||||
input: KtxEventPropertyValuesInput,
|
||||
ctx: KtxScanContext,
|
||||
): Promise<KtxEventPropertyValuesResult | null>;
|
||||
}
|
||||
|
||||
export interface KtxReadOnlyQueryInput {
|
||||
connectionId: string;
|
||||
sql: string;
|
||||
maxRows?: number;
|
||||
}
|
||||
|
||||
export interface KtxQueryResult {
|
||||
headers: string[];
|
||||
headerTypes?: string[];
|
||||
rows: unknown[][];
|
||||
totalRows: number;
|
||||
rowCount: number | null;
|
||||
}
|
||||
|
||||
export interface KtxTableListEntry {
|
||||
schema: string;
|
||||
name: string;
|
||||
kind: 'table' | 'view';
|
||||
}
|
||||
|
||||
interface KtxConnectorTestResult {
|
||||
success: boolean;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
export interface KtxScanConnector {
|
||||
id: string;
|
||||
driver: KtxConnectionDriver;
|
||||
capabilities: KtxConnectorCapabilities;
|
||||
eventStreamDiscovery?: KtxEventStreamDiscoveryPort;
|
||||
introspect(input: KtxScanInput, ctx: KtxScanContext): Promise<KtxSchemaSnapshot>;
|
||||
testConnection?(): Promise<KtxConnectorTestResult>;
|
||||
sampleColumn?(input: KtxColumnSampleInput, ctx: KtxScanContext): Promise<KtxColumnSampleResult>;
|
||||
sampleTable?(input: KtxTableSampleInput, ctx: KtxScanContext): Promise<KtxTableSampleResult>;
|
||||
columnStats?(input: KtxColumnStatsInput, ctx: KtxScanContext): Promise<KtxColumnStatsResult | null>;
|
||||
executeReadOnly?(input: KtxReadOnlyQueryInput, ctx: KtxScanContext): Promise<KtxQueryResult>;
|
||||
cleanup?(): Promise<void>;
|
||||
}
|
||||
|
||||
export interface KtxEmbeddingPort {
|
||||
dimensions: number;
|
||||
maxBatchSize: number;
|
||||
embedBatch(texts: string[]): Promise<number[][]>;
|
||||
}
|
||||
|
||||
interface KtxStructuralSyncStats {
|
||||
tablesCreated: number;
|
||||
tablesUpdated: number;
|
||||
tablesDeleted: number;
|
||||
columnsCreated: number;
|
||||
columnsUpdated: number;
|
||||
columnsDeleted: number;
|
||||
}
|
||||
|
||||
interface KtxScanDiffSummary {
|
||||
tablesAdded: number;
|
||||
tablesModified: number;
|
||||
tablesDeleted: number;
|
||||
tablesUnchanged: number;
|
||||
columnsAdded: number;
|
||||
columnsModified: number;
|
||||
columnsDeleted: number;
|
||||
}
|
||||
|
||||
interface KtxScanArtifactPaths {
|
||||
rawSourcesDir: string | null;
|
||||
reportPath: string | null;
|
||||
manifestShards: string[];
|
||||
enrichmentArtifacts: string[];
|
||||
}
|
||||
|
||||
type KtxScanWarningCode =
|
||||
| 'connector_capability_missing'
|
||||
| 'sampling_failed'
|
||||
| 'statistics_failed'
|
||||
| 'llm_unavailable'
|
||||
| 'embedding_unavailable'
|
||||
| 'scan_enrichment_backend_not_configured'
|
||||
| 'relationship_validation_failed'
|
||||
| 'relationship_llm_invalid_reference'
|
||||
| 'relationship_llm_proposal_failed'
|
||||
| 'credential_redacted'
|
||||
| 'enrichment_failed'
|
||||
| 'description_fallback_used';
|
||||
|
||||
export interface KtxScanWarning {
|
||||
code: KtxScanWarningCode;
|
||||
message: string;
|
||||
table?: string;
|
||||
column?: string;
|
||||
recoverable: boolean;
|
||||
metadata?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
export interface KtxScanEnrichmentSummary {
|
||||
dataDictionary: 'skipped' | 'completed' | 'failed';
|
||||
tableDescriptions: 'skipped' | 'completed' | 'failed';
|
||||
columnDescriptions: 'skipped' | 'completed' | 'failed';
|
||||
embeddings: 'skipped' | 'completed' | 'failed';
|
||||
deterministicRelationships: 'skipped' | 'completed' | 'failed';
|
||||
llmRelationshipValidation: 'skipped' | 'completed' | 'failed';
|
||||
statisticalValidation: 'skipped' | 'completed' | 'failed';
|
||||
}
|
||||
|
||||
export interface KtxScanRelationshipSummary {
|
||||
accepted: number;
|
||||
review: number;
|
||||
rejected: number;
|
||||
skipped: number;
|
||||
}
|
||||
|
||||
export type KtxScanEnrichmentStage = 'descriptions' | 'embeddings' | 'relationships';
|
||||
|
||||
export interface KtxScanEnrichmentStateSummary {
|
||||
resumedStages: KtxScanEnrichmentStage[];
|
||||
completedStages: KtxScanEnrichmentStage[];
|
||||
failedStages: KtxScanEnrichmentStage[];
|
||||
}
|
||||
|
||||
export interface KtxScanReport {
|
||||
connectionId: string;
|
||||
driver: KtxConnectionDriver;
|
||||
syncId: string;
|
||||
runId: string;
|
||||
trigger: KtxScanTrigger;
|
||||
mode: KtxScanMode;
|
||||
dryRun: boolean;
|
||||
artifactPaths: KtxScanArtifactPaths;
|
||||
diffSummary: KtxScanDiffSummary;
|
||||
manifestShardsWritten: number;
|
||||
structuralSyncStats: KtxStructuralSyncStats;
|
||||
enrichment: KtxScanEnrichmentSummary;
|
||||
capabilityGaps: Array<keyof Omit<KtxConnectorCapabilities, 'structuralIntrospection'>>;
|
||||
warnings: KtxScanWarning[];
|
||||
relationships: KtxScanRelationshipSummary;
|
||||
enrichmentState: KtxScanEnrichmentStateSummary;
|
||||
createdAt: string;
|
||||
}
|
||||
205
packages/cli/src/context/scan/warehouse-catalog.test.ts
Normal file
205
packages/cli/src/context/scan/warehouse-catalog.test.ts
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKtxProject, type KtxLocalProject } from '../../context/project/project.js';
|
||||
import { WarehouseCatalogService } from './warehouse-catalog.js';
|
||||
|
||||
describe('WarehouseCatalogService', () => {
|
||||
let tempDir: string;
|
||||
let project: KtxLocalProject;
|
||||
|
||||
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'ktx-warehouse-catalog-'));
|
||||
project = await initKtxProject({ projectDir: join(tempDir, 'project') });
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
async function seedLiveDatabaseScan(connectionId = 'warehouse', syncId = 'sync-2', driver = 'postgres') {
|
||||
const root = `raw-sources/${connectionId}/live-database/${syncId}`;
|
||||
const tableRef = {
|
||||
catalog: driver === 'bigquery' ? 'analytics' : null,
|
||||
db: driver === 'sqlite' ? null : 'public',
|
||||
name: 'orders',
|
||||
};
|
||||
await project.fileStore.writeFile(
|
||||
`${root}/connection.json`,
|
||||
JSON.stringify({ connectionId, driver, extractedAt: '2026-05-12T00:00:00.000Z' }, null, 2),
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'seed connection',
|
||||
);
|
||||
await project.fileStore.writeFile(
|
||||
`${root}/tables/orders.json`,
|
||||
JSON.stringify(
|
||||
{
|
||||
catalog: tableRef.catalog,
|
||||
db: tableRef.db,
|
||||
name: tableRef.name,
|
||||
kind: 'table',
|
||||
comment: 'Customer orders',
|
||||
estimatedRows: 12,
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: 'Order id',
|
||||
},
|
||||
{
|
||||
name: 'status',
|
||||
nativeType: 'text',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: 'Order status',
|
||||
},
|
||||
],
|
||||
foreignKeys: [],
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'seed orders',
|
||||
);
|
||||
await project.fileStore.writeFile(
|
||||
`${root}/enrichment/relationship-profile.json`,
|
||||
JSON.stringify(
|
||||
{
|
||||
connectionId,
|
||||
driver,
|
||||
sqlAvailable: true,
|
||||
queryCount: 3,
|
||||
tables: [{ table: { catalog: tableRef.catalog, db: tableRef.db, name: tableRef.name }, rowCount: 12 }],
|
||||
columns: {
|
||||
'orders.status': {
|
||||
table: { catalog: tableRef.catalog, db: tableRef.db, name: tableRef.name },
|
||||
column: 'status',
|
||||
nativeType: 'text',
|
||||
normalizedType: 'text',
|
||||
rowCount: 12,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 0.1667,
|
||||
nullRate: 0,
|
||||
sampleValues: ['paid', 'refunded'],
|
||||
minTextLength: 4,
|
||||
maxTextLength: 8,
|
||||
},
|
||||
},
|
||||
warnings: [],
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'seed profile',
|
||||
);
|
||||
}
|
||||
|
||||
it('finds the latest sync and merges table schema with relationship profile values', async () => {
|
||||
await seedLiveDatabaseScan('warehouse', 'sync-1');
|
||||
await seedLiveDatabaseScan('warehouse', 'sync-2');
|
||||
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
|
||||
|
||||
await expect(catalog.getLatestSyncId('warehouse')).resolves.toBe('sync-2');
|
||||
const detail = await catalog.getTable({ connectionId: 'warehouse', catalog: null, db: 'public', name: 'orders' });
|
||||
|
||||
expect(detail).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
display: 'public.orders',
|
||||
rowCount: 12,
|
||||
columns: [
|
||||
{ name: 'id', nativeType: 'integer', primaryKey: true },
|
||||
{ name: 'status', nativeType: 'text', sampleValues: ['paid', 'refunded'], distinctCount: 2 },
|
||||
],
|
||||
});
|
||||
expect(detail).not.toHaveProperty(['connection', 'Name'].join(''));
|
||||
|
||||
const hits = await catalog.searchByName('warehouse', 'orders', 5);
|
||||
expect(hits[0]).toMatchObject({
|
||||
kind: 'table',
|
||||
connectionId: 'warehouse',
|
||||
display: 'public.orders',
|
||||
});
|
||||
expect(hits[0]).not.toHaveProperty(['connection', 'Name'].join(''));
|
||||
});
|
||||
|
||||
it('returns scanAvailable=false when no live-database scan exists', async () => {
|
||||
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
|
||||
await expect(catalog.getTable({ connectionId: 'missing', catalog: null, db: 'public', name: 'orders' })).resolves.toBeNull();
|
||||
await expect(catalog.hasScan('missing')).resolves.toBe(false);
|
||||
});
|
||||
|
||||
it('resolves postgres display strings and returns closest candidates for missing tables', async () => {
|
||||
await seedLiveDatabaseScan();
|
||||
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
|
||||
|
||||
await expect(catalog.resolveDisplay('warehouse', 'public.orders')).resolves.toMatchObject({
|
||||
resolved: { catalog: null, db: 'public', name: 'orders' },
|
||||
candidates: [],
|
||||
dialect: 'postgres',
|
||||
});
|
||||
await expect(catalog.resolveDisplay('warehouse', 'public.orderz')).resolves.toMatchObject({
|
||||
resolved: null,
|
||||
candidates: [{ name: 'orders' }],
|
||||
});
|
||||
});
|
||||
|
||||
it('treats two-part BigQuery identifiers as ambiguous instead of guessing', async () => {
|
||||
await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
|
||||
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
|
||||
|
||||
await expect(catalog.resolveDisplay('warehouse', 'public.orders')).resolves.toMatchObject({
|
||||
resolved: null,
|
||||
dialect: 'bigquery',
|
||||
});
|
||||
});
|
||||
|
||||
it('resolves postgres column display strings without treating the column as a table', async () => {
|
||||
await seedLiveDatabaseScan();
|
||||
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
|
||||
|
||||
await expect(catalog.resolveDisplayTarget('warehouse', 'public.orders.status')).resolves.toMatchObject({
|
||||
resolved: { catalog: null, db: 'public', name: 'orders', column: 'status' },
|
||||
candidates: [],
|
||||
dialect: 'postgres',
|
||||
});
|
||||
});
|
||||
|
||||
it('resolves BigQuery column display strings with four parts', async () => {
|
||||
await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
|
||||
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
|
||||
|
||||
await expect(catalog.resolveDisplayTarget('warehouse', 'analytics.public.orders.status')).resolves.toMatchObject({
|
||||
resolved: { catalog: 'analytics', db: 'public', name: 'orders', column: 'status' },
|
||||
candidates: [],
|
||||
dialect: 'bigquery',
|
||||
});
|
||||
});
|
||||
|
||||
it('searches table names, column names, comments, and descriptions', async () => {
|
||||
await seedLiveDatabaseScan();
|
||||
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
|
||||
|
||||
await expect(catalog.searchByName('warehouse', 'status', 10)).resolves.toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
kind: 'column',
|
||||
ref: expect.objectContaining({ db: 'public', name: 'orders', column: 'status' }),
|
||||
matchedOn: 'name',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
});
|
||||
448
packages/cli/src/context/scan/warehouse-catalog.ts
Normal file
448
packages/cli/src/context/scan/warehouse-catalog.ts
Normal file
|
|
@ -0,0 +1,448 @@
|
|||
import { getDialectForDriver } from '../../context/connections/dialects.js';
|
||||
import type { KtxFileStorePort } from '../../context/core/file-store.js';
|
||||
import type {
|
||||
KtxConnectionDriver,
|
||||
KtxSchemaColumn,
|
||||
KtxSchemaForeignKey,
|
||||
KtxSchemaTable,
|
||||
KtxTableRef,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Driver identifiers accepted by the catalog helpers: every `KtxConnectionDriver`
 * plus the `'sqlite3'` spelling, which the display helpers treat exactly like `'sqlite'`.
 */
type CatalogDriver = KtxConnectionDriver | 'sqlite3';
|
||||
|
||||
/** Dependencies injected into `WarehouseCatalogService`. */
export interface WarehouseCatalogServiceDeps {
  /** File store the scan artifacts are listed and read from. */
  fileStore: KtxFileStorePort;
}
|
||||
|
||||
/**
 * A schema column combined with its description map and the statistics found
 * for it in the relationship profile. Statistics are `null` (and
 * `sampleValues` empty) when the profile has no entry for the column.
 */
interface WarehouseColumnDetail extends KtxSchemaColumn {
  /** Description map stored with the column; `{}` when the artifact has none. */
  descriptions: Record<string, string>;
  rowCount: number | null;
  nullCount: number | null;
  distinctCount: number | null;
  nullRate: number | null;
  /** Profile sample values, each converted with `String(...)`. */
  sampleValues: string[];
}
|
||||
|
||||
/** Table description returned by `WarehouseCatalogService.getTable`. */
export interface TableDetail {
  connectionId: string;
  catalog: string | null;
  db: string | null;
  name: string;
  /** Dotted, driver-specific display name of the table. */
  display: string;
  kind: string;
  comment: string | null;
  /** First non-blank entry of the table's description map, if any. */
  description: string | null;
  /** Row count from the relationship profile, else the schema's estimated rows. */
  rowCount: number | null;
  columns: WarehouseColumnDetail[];
  foreignKeys: KtxSchemaForeignKey[];
}
|
||||
|
||||
/**
 * One result of `WarehouseCatalogService.searchByName`: either a table or a
 * column. `matchedOn` names the first field whose text contained the query.
 */
export type RawSchemaHit =
  | {
      kind: 'table';
      connectionId: string;
      ref: KtxTableRef;
      display: string;
      matchedOn: 'name' | 'db' | 'comment' | 'description';
    }
  | {
      kind: 'column';
      connectionId: string;
      ref: KtxTableRef & { column: string };
      display: string;
      matchedOn: 'name' | 'comment' | 'description';
    };
|
||||
|
||||
/** Outcome of resolving a dotted display string to a table or a table column. */
export interface DisplayTargetResolution {
  /** The matched table (plus `column` when the display addressed a column), or `null`. */
  resolved: (KtxTableRef & { column?: string }) | null;
  /** Closest table suggestions; empty when `resolved` is set or no scan exists. */
  candidates: KtxTableRef[];
  /** Dialect type of the connection's driver, or `'unknown'` when no scan exists. */
  dialect: string;
}
|
||||
|
||||
/** The part of a scan's `connection.json` that the catalog reads. */
interface ConnectionArtifact {
  driver?: CatalogDriver;
}
|
||||
|
||||
/**
 * Per-column entry of the relationship profile artifact. Every field is
 * optional because the artifact is parsed without validation.
 */
interface RelationshipProfileColumn {
  table?: KtxTableRef;
  column?: string;
  rowCount?: number;
  nullCount?: number;
  distinctCount?: number;
  nullRate?: number;
  sampleValues?: unknown[];
}
|
||||
|
||||
/** The part of `enrichment/relationship-profile.json` that the catalog reads. */
interface RelationshipProfileArtifact {
  /** Used as the driver fallback when `connection.json` does not name one. */
  driver?: CatalogDriver;
  tables?: Array<{ table?: KtxTableRef; rowCount?: number }>;
  /** Column profiles keyed by an artifact-defined string key. */
  columns?: Record<string, RelationshipProfileColumn>;
}
|
||||
|
||||
/** In-memory snapshot of the latest scan of one connection. */
interface ConnectionCatalog {
  connectionId: string;
  /** Last path segment of the scan directory the snapshot was read from. */
  syncId: string;
  driver: CatalogDriver;
  tables: KtxSchemaTable[];
  /** Relationship profile of the scan, or `null` when it could not be read. */
  profile: RelationshipProfileArtifact | null;
}
|
||||
|
||||
/**
 * A schema table as the catalog uses it: table and columns may each carry an
 * optional `descriptions` map in addition to the `KtxSchemaTable` fields.
 */
type TableWithDescriptions = KtxSchemaTable & {
  descriptions?: Record<string, string>;
  columns: Array<KtxSchemaColumn & { descriptions?: Record<string, string> }>;
};
|
||||
|
||||
/**
 * Lower-cases an identifier for case-insensitive comparison.
 *
 * @param value - Identifier text; `null` and `undefined` are accepted.
 * @returns The lower-cased text, or `''` for `null`/`undefined`.
 */
function normalize(value: string | null | undefined): string {
  if (value === null || value === undefined) {
    return '';
  }
  return value.toLowerCase();
}
|
||||
|
||||
/**
 * Compares two table references case-insensitively.
 * A missing (`null`) catalog or db compares equal to an empty one.
 */
function refsEqual(left: KtxTableRef, right: KtxTableRef): boolean {
  if (normalize(left.catalog) !== normalize(right.catalog)) {
    return false;
  }
  if (normalize(left.db) !== normalize(right.db)) {
    return false;
  }
  return normalize(left.name) === normalize(right.name);
}
|
||||
|
||||
/**
 * Builds the lower-cased `catalog.db.name` key of a table reference.
 * `null` parts become empty segments, so the key always has three segments.
 */
function refKey(ref: KtxTableRef): string {
  const { catalog, db, name } = ref;
  return `${normalize(catalog)}.${normalize(db)}.${normalize(name)}`;
}
|
||||
|
||||
/** Builds the lower-cased `catalog.db.name.column` key of a table column. */
function columnKey(ref: KtxTableRef, column: string): string {
  return [refKey(ref), normalize(column)].join('.');
}
|
||||
|
||||
/**
 * Parses JSON text and returns it typed as `T`.
 * The shape is asserted, not validated; malformed JSON makes `JSON.parse` throw.
 */
function readJson<T>(content: string): T {
  const parsed: unknown = JSON.parse(content);
  return parsed as T;
}
|
||||
|
||||
/**
 * Trims one identifier segment and removes a single leading quote character
 * (`"`, `'`, a backtick or `[`) and a single trailing one (`"`, `'`, a backtick or `]`).
 */
function cleanIdentifierPart(part: string): string {
  const trimmed = part.trim();
  const withoutOpening = trimmed.replace(/^["'`\[]/, '');
  return withoutOpening.replace(/["'`\]]$/, '');
}
|
||||
|
||||
/**
 * Splits a dotted display string into cleaned identifier segments.
 * Segments that are empty after cleaning are dropped.
 */
function splitDisplay(display: string): string[] {
  const segments: string[] = [];
  for (const rawSegment of display.trim().split('.')) {
    const cleaned = cleanIdentifierPart(rawSegment);
    if (cleaned) {
      segments.push(cleaned);
    }
  }
  return segments;
}
|
||||
|
||||
/**
 * Renders a table reference as a dotted display name.
 * SQLite drivers show the bare table name; every other driver joins the
 * non-empty `catalog`, `db` and `name` parts with `.`.
 */
function formatDisplay(driver: CatalogDriver, table: KtxTableRef): string {
  const isSqlite = driver === 'sqlite' || driver === 'sqlite3';
  if (isSqlite) {
    return table.name;
  }
  const segments: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    if (part) {
      segments.push(part);
    }
  }
  return segments.join('.');
}
|
||||
|
||||
/**
 * Parses a dotted display string into a table reference for the given driver.
 *
 * - SQLite drivers accept exactly one segment (`name`).
 * - BigQuery, Snowflake and SQL Server accept exactly three (`catalog.db.name`).
 * - Every other driver accepts one (`name`), two (`db.name`) or three segments.
 *
 * @returns The parsed reference, or `null` when the segment count does not fit.
 */
function parseDisplay(driver: CatalogDriver, display: string): KtxTableRef | null {
  const segments = splitDisplay(display);
  const [first, second, third] = segments;

  switch (driver) {
    case 'sqlite':
    case 'sqlite3':
      return segments.length === 1 ? { catalog: null, db: null, name: first! } : null;
    case 'bigquery':
    case 'snowflake':
    case 'sqlserver':
      return segments.length === 3 ? { catalog: first!, db: second!, name: third! } : null;
    default:
      break;
  }

  switch (segments.length) {
    case 1:
      return { catalog: null, db: null, name: first! };
    case 2:
      return { catalog: null, db: first!, name: second! };
    case 3:
      return { catalog: first!, db: second!, name: third! };
    default:
      return null;
  }
}
|
||||
|
||||
/**
 * Number of dotted segments a fully qualified table name has for a driver:
 * 1 for SQLite, 3 for BigQuery/Snowflake/SQL Server, 2 for everything else.
 */
function expectedDisplayPartCount(driver: CatalogDriver): number {
  switch (driver) {
    case 'sqlite':
    case 'sqlite3':
      return 1;
    case 'bigquery':
    case 'snowflake':
    case 'sqlserver':
      return 3;
    default:
      return 2;
  }
}
|
||||
|
||||
/**
 * Parses a display string that addresses a column (`<qualified table>.<column>`).
 * The string must have exactly one segment more than a fully qualified table
 * name has for the driver; the last segment is the column.
 *
 * @returns The table reference plus `column`, or `null` when the shape does not fit.
 */
function parseColumnDisplay(driver: CatalogDriver, display: string): (KtxTableRef & { column: string }) | null {
  const segments = splitDisplay(display);
  if (segments.length !== expectedDisplayPartCount(driver) + 1) {
    return null;
  }

  const column = segments[segments.length - 1];
  if (!column) {
    return null;
  }

  const tableDisplay = segments.slice(0, segments.length - 1).join('.');
  const table = parseDisplay(driver, tableDisplay);
  if (!table) {
    return null;
  }
  return { ...table, column };
}
|
||||
|
||||
/**
 * Suggests the tables whose names are closest to the last segment of `display`.
 *
 * Scoring per table name (case-insensitive):
 * - 100 for an exact match,
 * - 80 when one name contains the other,
 * - otherwise the share of positions holding the same character in both names.
 *
 * Tables scoring 0 are dropped; the rest are ordered by score (descending),
 * then by table name, and cut to `limit`.
 */
function bestCandidates(tables: KtxSchemaTable[], display: string, limit = 5): KtxTableRef[] {
  const needle = normalize(splitDisplay(display).at(-1) ?? display);

  const scoreOf = (tableName: string): number => {
    const name = normalize(tableName);
    if (name === needle) {
      return 100;
    }
    if (name.includes(needle) || needle.includes(name)) {
      return 80;
    }
    let positionalMatches = 0;
    [...name].forEach((char, index) => {
      if (needle[index] === char) {
        positionalMatches += 1;
      }
    });
    return positionalMatches / Math.max(name.length, needle.length, 1);
  };

  const ranked: Array<{ table: KtxSchemaTable; score: number }> = [];
  for (const table of tables) {
    const score = scoreOf(table.name);
    if (score > 0) {
      ranked.push({ table, score });
    }
  }
  ranked.sort((left, right) => right.score - left.score || left.table.name.localeCompare(right.table.name));

  return ranked.slice(0, limit).map(({ table }) => ({ catalog: table.catalog, db: table.db, name: table.name }));
}
|
||||
|
||||
/**
 * Returns the first non-blank description of a description map.
 *
 * The map comes from scan artifacts that are parsed without validation, so
 * entries that are not strings are skipped instead of crashing on `.trim()`.
 *
 * @param descriptions - Description map; may be missing.
 * @returns The first string value with non-whitespace content (returned
 *   untrimmed), or `null` when there is none.
 */
function firstDescription(descriptions: Record<string, string> | undefined): string | null {
  if (!descriptions || typeof descriptions !== 'object') {
    return null;
  }
  for (const value of Object.values(descriptions) as unknown[]) {
    if (typeof value === 'string' && value.trim().length > 0) {
      return value;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Reports which field of a table contains the query (case-insensitive substring).
 * Fields are checked in order: name, db, comment, first non-blank description.
 *
 * @returns The first matching field, or `null` for an empty query or no match.
 */
function matchedOnTable(table: TableWithDescriptions, query: string): RawSchemaHit['matchedOn'] | null {
  const needle = normalize(query);
  if (needle.length === 0) {
    return null;
  }
  const contains = (text: string | null | undefined): boolean => normalize(text).includes(needle);

  if (contains(table.name)) {
    return 'name';
  }
  if (contains(table.db)) {
    return 'db';
  }
  if (contains(table.comment)) {
    return 'comment';
  }
  return contains(firstDescription(table.descriptions)) ? 'description' : null;
}
|
||||
|
||||
/**
 * Reports which field of a column contains the query (case-insensitive substring).
 * Fields are checked in order: name, comment, first non-blank description.
 *
 * @returns The first matching field, or `null` for an empty query or no match.
 */
function matchedOnColumn(
  column: KtxSchemaColumn & { descriptions?: Record<string, string> },
  query: string,
): 'name' | 'comment' | 'description' | null {
  const needle = normalize(query);
  if (needle.length === 0) {
    return null;
  }
  const contains = (text: string | null | undefined): boolean => normalize(text).includes(needle);

  if (contains(column.name)) {
    return 'name';
  }
  if (contains(column.comment)) {
    return 'comment';
  }
  return contains(firstDescription(column.descriptions)) ? 'description' : null;
}
|
||||
|
||||
/**
 * Read-only lookup service over the scan artifacts stored under
 * `raw-sources/<connectionId>/live-database/<syncId>/`.
 *
 * For each connection the service reads the scan directory that sorts last,
 * parses `connection.json`, every `tables/*.json` file and (best effort)
 * `enrichment/relationship-profile.json`, and memoises the result for the
 * lifetime of the instance. A failed load is not memoised, so a later call
 * reads the artifacts again.
 */
export class WarehouseCatalogService {
  /** Pending or settled catalog load per connection id; shared by concurrent callers. */
  private readonly catalogs = new Map<string, Promise<ConnectionCatalog | null>>();

  constructor(private readonly deps: WarehouseCatalogServiceDeps) {}

  /** Whether a scan (a `connection.json` artifact) exists for the connection. */
  async hasScan(connectionId: string): Promise<boolean> {
    return (await this.loadCatalog(connectionId)) !== null;
  }

  /** Id of the scan the catalog was read from, or `null` when no scan exists. */
  async getLatestSyncId(connectionId: string): Promise<string | null> {
    return (await this.loadCatalog(connectionId))?.syncId ?? null;
  }

  /** References of all tables in the latest scan; empty when no scan exists. */
  async listTables(connectionId: string): Promise<KtxTableRef[]> {
    const catalog = await this.loadCatalog(connectionId);
    return catalog?.tables.map((table) => ({ catalog: table.catalog, db: table.db, name: table.name })) ?? [];
  }

  /**
   * Returns the schema of one table merged with its relationship-profile statistics.
   *
   * @param ref - Connection id plus table reference (matched case-insensitively).
   * @returns The table detail, or `null` when the scan or the table is missing.
   */
  async getTable(ref: { connectionId: string } & KtxTableRef): Promise<TableDetail | null> {
    const catalog = await this.loadCatalog(ref.connectionId);
    if (!catalog) {
      return null;
    }
    const table = catalog.tables.find((candidate) => refsEqual(candidate, ref)) as TableWithDescriptions | undefined;
    if (!table) {
      return null;
    }
    const profileTables = catalog.profile?.tables ?? [];
    const profileTable = profileTables.find((candidate) => candidate.table && refsEqual(candidate.table, table));
    const profileColumns = catalog.profile?.columns ?? {};

    return {
      connectionId: ref.connectionId,
      catalog: table.catalog,
      db: table.db,
      name: table.name,
      display: formatDisplay(catalog.driver, table),
      kind: table.kind,
      comment: table.comment,
      description: firstDescription(table.descriptions),
      rowCount: profileTable?.rowCount ?? table.estimatedRows ?? null,
      columns: table.columns.map((rawColumn) => {
        const column = rawColumn as KtxSchemaColumn & { descriptions?: Record<string, string> };
        // Try the fully qualified key first, then fall back to a `table.column`
        // key or to an entry whose embedded table/column reference matches.
        const profileColumn =
          profileColumns[columnKey(table, column.name)] ??
          Object.entries(profileColumns).find(
            ([key, value]) =>
              normalize(key) === `${normalize(table.name)}.${normalize(column.name)}` ||
              (value.table && refsEqual(value.table, table) && normalize(value.column) === normalize(column.name)),
          )?.[1];
        return {
          ...column,
          descriptions: column.descriptions ?? {},
          rowCount: profileColumn?.rowCount ?? null,
          nullCount: profileColumn?.nullCount ?? null,
          distinctCount: profileColumn?.distinctCount ?? null,
          nullRate: profileColumn?.nullRate ?? null,
          sampleValues: (profileColumn?.sampleValues ?? []).map((value) => String(value)),
        };
      }),
      foreignKeys: table.foreignKeys,
    };
  }

  /**
   * Resolves a dotted display string to a table of the latest scan.
   *
   * @returns `resolved` with the stored table reference on a match; otherwise
   *   `resolved: null` with the closest table suggestions. When no scan
   *   exists, `candidates` is empty and `dialect` is `'unknown'`.
   */
  async resolveDisplay(
    connectionId: string,
    display: string,
  ): Promise<{
    resolved: KtxTableRef | null;
    candidates: KtxTableRef[];
    dialect: string;
  }> {
    const catalog = await this.loadCatalog(connectionId);
    if (!catalog) {
      return { resolved: null, candidates: [], dialect: 'unknown' };
    }
    const dialect = getDialectForDriver(catalog.driver).type;
    const parsed = parseDisplay(catalog.driver, display);
    const table = parsed ? catalog.tables.find((candidate) => refsEqual(candidate, parsed)) : undefined;
    if (!table) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }
    return { resolved: { catalog: table.catalog, db: table.db, name: table.name }, candidates: [], dialect };
  }

  /**
   * Resolves a display string that names either a table or a table column.
   *
   * The string is first tried as a table; only when that fails is the last
   * segment treated as a column of the table named by the leading segments.
   * The column name is returned as given and is not checked against the
   * table's columns.
   */
  async resolveDisplayTarget(connectionId: string, display: string): Promise<DisplayTargetResolution> {
    const catalog = await this.loadCatalog(connectionId);
    if (!catalog) {
      return { resolved: null, candidates: [], dialect: 'unknown' };
    }

    const dialect = getDialectForDriver(catalog.driver).type;
    const tableResolution = await this.resolveDisplay(connectionId, display);
    if (tableResolution.resolved) {
      return tableResolution;
    }

    const parsedColumn = parseColumnDisplay(catalog.driver, display);
    const table = parsedColumn ? catalog.tables.find((candidate) => refsEqual(candidate, parsedColumn)) : undefined;
    if (!parsedColumn || !table) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }

    return {
      resolved: {
        catalog: table.catalog,
        db: table.db,
        name: table.name,
        column: parsedColumn.column,
      },
      candidates: [],
      dialect,
    };
  }

  /**
   * Finds tables and columns whose name, db (tables only), comment or
   * description contains the query, case-insensitively.
   *
   * Hits are produced in catalog order (each table followed by its columns)
   * and capped at `limit`; a non-positive limit yields no hits.
   */
  async searchByName(connectionId: string, query: string, limit: number): Promise<RawSchemaHit[]> {
    const catalog = await this.loadCatalog(connectionId);
    if (!catalog) {
      return [];
    }
    const hits: RawSchemaHit[] = [];
    for (const table of catalog.tables as TableWithDescriptions[]) {
      // Hits are never reordered, so scanning can stop once the cap is reached.
      if (hits.length >= limit) {
        break;
      }
      const tableMatch = matchedOnTable(table, query);
      if (tableMatch) {
        hits.push({
          kind: 'table',
          connectionId,
          ref: { catalog: table.catalog, db: table.db, name: table.name },
          display: formatDisplay(catalog.driver, table),
          matchedOn: tableMatch,
        });
      }
      for (const column of table.columns) {
        if (hits.length >= limit) {
          break;
        }
        const columnMatch = matchedOnColumn(column, query);
        if (!columnMatch) {
          continue;
        }
        hits.push({
          kind: 'column',
          connectionId,
          ref: { catalog: table.catalog, db: table.db, name: table.name, column: column.name },
          display: `${formatDisplay(catalog.driver, table)}.${column.name}`,
          matchedOn: columnMatch,
        });
      }
    }
    return hits.slice(0, Math.max(0, limit));
  }

  /**
   * Returns the memoised catalog load for a connection, starting one if needed.
   * A load that rejects is removed from the memo so the next call retries
   * instead of replaying the same failure forever.
   */
  private loadCatalog(connectionId: string): Promise<ConnectionCatalog | null> {
    const existing = this.catalogs.get(connectionId);
    if (existing) {
      return existing;
    }
    const pending = this.readCatalog(connectionId);
    this.catalogs.set(connectionId, pending);
    // The caller still receives the rejection through `pending`; this handler
    // only evicts the failed entry (unless a newer load already replaced it).
    void pending.catch(() => {
      if (this.catalogs.get(connectionId) === pending) {
        this.catalogs.delete(connectionId);
      }
    });
    return pending;
  }

  /**
   * Reads the latest scan of a connection from the file store.
   *
   * @returns The parsed catalog, or `null` when no `connection.json` exists
   *   under the connection's live-database directory.
   */
  private async readCatalog(connectionId: string): Promise<ConnectionCatalog | null> {
    const root = `raw-sources/${connectionId}/live-database`;
    const listed = await this.deps.fileStore.listFiles(root);
    // The path that sorts last is taken as the latest scan.
    // NOTE(review): assumes sync ids sort chronologically as plain strings — confirm.
    const connectionFiles = listed.files.filter((file) => file.endsWith('/connection.json')).sort();
    const latestConnectionPath = connectionFiles.at(-1);
    if (!latestConnectionPath) {
      return null;
    }
    const latestRoot = latestConnectionPath.slice(0, -'/connection.json'.length);
    const syncId = latestRoot.split('/').at(-1) ?? '';
    const connection = readJson<ConnectionArtifact>((await this.deps.fileStore.readFile(latestConnectionPath)).content);
    const tablesListing = await this.deps.fileStore.listFiles(`${latestRoot}/tables`);
    const tables: KtxSchemaTable[] = [];
    for (const tablePath of tablesListing.files.filter((file) => file.endsWith('.json')).sort()) {
      tables.push(readJson<KtxSchemaTable>((await this.deps.fileStore.readFile(tablePath)).content));
    }

    // The relationship profile is optional enrichment: any failure to read or
    // parse it is deliberately ignored and the catalog is served without it.
    let profile: RelationshipProfileArtifact | null = null;
    try {
      profile = readJson<RelationshipProfileArtifact>(
        (await this.deps.fileStore.readFile(`${latestRoot}/enrichment/relationship-profile.json`)).content,
      );
    } catch {
      profile = null;
    }

    return {
      connectionId,
      syncId,
      // Prefer the driver recorded with the connection, then the profile's; default to postgres.
      driver: connection.driver ?? profile?.driver ?? 'postgres',
      tables,
      profile,
    };
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue