diff --git a/packages/context/package.json b/packages/context/package.json index b85947e5..84ea92bf 100644 --- a/packages/context/package.json +++ b/packages/context/package.json @@ -120,6 +120,7 @@ "scripts": { "build": "tsc -p tsconfig.json", "relationships:benchmarks": "pnpm --silent run build && node scripts/relationship-benchmark-report.mjs", + "relationships:benchmarks:test": "KTX_RUN_RELATIONSHIP_BENCHMARKS=1 vitest run src/scan/relationship-benchmarks.test.ts", "search:pglite-spike": "node scripts/pglite-hybrid-search-spike.mjs", "search:pglite-owner-prototype": "node scripts/pglite-owner-process-prototype.mjs", "search:pglite-sl-prototype": "node scripts/pglite-sl-search-prototype.mjs", diff --git a/packages/context/src/scan/relationship-benchmarks.test.ts b/packages/context/src/scan/relationship-benchmarks.test.ts index ecc9e4a9..b4e5c782 100644 --- a/packages/context/src/scan/relationship-benchmarks.test.ts +++ b/packages/context/src/scan/relationship-benchmarks.test.ts @@ -53,6 +53,12 @@ const CHECKED_IN_FIXTURE_ORIGINS = { semantic_embedding_aliases_no_declared_constraints: 'synthetic', } as const; +function runAdHocRelationshipBenchmarks(): boolean { + return process.env.KTX_RUN_RELATIONSHIP_BENCHMARKS === '1'; +} + +const adHocRelationshipBenchmarkIt = runAdHocRelationshipBenchmarks() ? it : it.skip; + function snapshot(): KtxSchemaSnapshot { return { connectionId: 'warehouse', @@ -644,7 +650,7 @@ describe('relationship benchmarks', () => { expect(fixture.expected.expectedLinks).toHaveLength(1900); }); - it('runs the scale stress fixture inside the benchmark validation budget', async () => { + adHocRelationshipBenchmarkIt('runs the scale stress fixture inside the benchmark validation budget', async () => { const fixtureRoot = new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url); const fixture = await loadKtxRelationshipBenchmarkFixture( join(fixtureRoot.pathname, 'scale_stress_no_declared_constraints'), diff --git a/packages/context/src/scan/relationship-candidates.ts b/packages/context/src/scan/relationship-candidates.ts index b10aa069..cd3b7767 100644 --- a/packages/context/src/scan/relationship-candidates.ts +++ b/packages/context/src/scan/relationship-candidates.ts @@ -7,6 +7,7 @@ import type { } from './enrichment-types.js'; import { localCandidateTables } from './relationship-locality.js'; import { + type KtxRelationshipNormalizedName, normalizeKtxRelationshipName, pluralizeKtxRelationshipToken, singularizeKtxRelationshipToken, @@ -97,9 +98,22 @@ const REFERENCE_SUFFIXES: Array<{ suffix: string; reason: string }> = [ { suffix: '_uuid', reason: 'foreign_key_uuid_suffix' }, ]; const RELATIONSHIP_KEY_TARGET_SUFFIXES = ['_id', '_key', '_code', '_uuid'] as const; +const tableAliasesCache = new WeakMap>(); +const parentTableNameAliasesCache = new WeakMap>(); +const normalizedColumnNameCache = new WeakMap(); + +function normalizedColumnName(column: KtxEnrichedColumn): KtxRelationshipNormalizedName { + const cached = normalizedColumnNameCache.get(column); + if (cached) { + return cached; + } + const normalized = normalizeKtxRelationshipName(column.name); + normalizedColumnNameCache.set(column, normalized); + return normalized; +} function isRelationshipKeyShapedTarget(column: KtxEnrichedColumn): boolean { - const normalized = normalizeKtxRelationshipName(column.name); + const normalized = normalizedColumnName(column); return ( normalized.tokens.length >= 2 && RELATIONSHIP_KEY_TARGET_SUFFIXES.some((suffix) => normalized.normalized.endsWith(suffix)) @@ -107,8 +121,8 @@ function isRelationshipKeyShapedTarget(column: KtxEnrichedColumn): boolean { } function columnSuffixMatchesTarget(input: { fromColumn: KtxEnrichedColumn; toColumn: KtxEnrichedColumn }): boolean { - const source = normalizeKtxRelationshipName(input.fromColumn.name).normalized; - const target = normalizeKtxRelationshipName(input.toColumn.name).normalized; + const source = normalizedColumnName(input.fromColumn).normalized; + const target = normalizedColumnName(input.toColumn).normalized; return source !== target && target.length > 0 && source.endsWith(`_${target}`); } @@ -160,7 +174,7 @@ function hasUsableEmbedding(column: KtxEnrichedColumn): boolean { } function sourceColumnReference(column: KtxEnrichedColumn): KtxRelationshipSourceColumnReference | null { - const normalized = normalizeKtxRelationshipName(column.name); + const normalized = normalizedColumnName(column); if (SELF_REFERENCE_NAMES.has(normalized.normalized)) { return { base: normalized.normalized.replace(/_id$/u, ''), reason: 'foreign_key_suffix' }; } @@ -192,6 +206,11 @@ function addNormalizedTableAlias(aliases: Set, name: string): void { } function tableAliases(table: KtxEnrichedTable): Set { + const cached = tableAliasesCache.get(table); + if (cached) { + return cached; + } + const normalized = normalizeKtxRelationshipName(table.ref.name); const aliases = new Set([normalized.normalized, normalized.singular, normalized.plural]); if (normalized.tokens.length > 1) { @@ -203,6 +222,7 @@ function tableAliases(table: KtxEnrichedTable): Set { aliases.add(pluralizeKtxRelationshipToken(singularLastToken)); } } + tableAliasesCache.set(table, aliases); return aliases; } @@ -212,13 +232,19 @@ function finalTableNamePart(table: KtxEnrichedTable): string { } function parentTableNameAliases(table: KtxEnrichedTable): Set { - const aliases = tableAliases(table); + const cached = parentTableNameAliasesCache.get(table); + if (cached) { + return cached; + } + + const aliases = new Set(tableAliases(table)); addNormalizedTableAlias(aliases, finalTableNamePart(table)); + parentTableNameAliasesCache.set(table, aliases); return aliases; } function targetKeyScore(table: KtxEnrichedTable, column: KtxEnrichedColumn): number { - const columnName = normalizeKtxRelationshipName(column.name).normalized; + const columnName = normalizedColumnName(column).normalized; const tableKeyBases = parentTableNameAliases(table); if (column.primaryKey) { return 1; @@ -338,7 +364,7 @@ function candidateParentTables(input: { maxParentTables, }).map((item) => item.table); - const normalizedColumn = normalizeKtxRelationshipName(input.fromColumn.name).normalized; + const normalizedColumn = normalizedColumnName(input.fromColumn).normalized; if (!SELF_REFERENCE_NAMES.has(normalizedColumn) || ranked.some((table) => table.id === input.fromTable.id)) { return ranked; } @@ -364,7 +390,7 @@ function targetKeyEvidence( return { score: 0, reasons: [] }; } - const columnName = normalizeKtxRelationshipName(column.name).normalized; + const columnName = normalizedColumnName(column).normalized; if (columnName === 'code' || columnName.endsWith('_code') || columnName === 'key' || columnName.endsWith('_key')) { return { score: 0.86, reasons: ['profile_unique_target'] }; } @@ -500,7 +526,7 @@ function createCandidate(input: { evidence: { sourceColumnBase: input.sourceBase, targetTableBase: input.targetBase, - targetColumnBase: normalizeKtxRelationshipName(input.toColumn.name).normalized, + targetColumnBase: normalizedColumnName(input.toColumn).normalized, targetKeyScore: input.targetKeyScore, nameScore: input.nameScore, reasons: input.reasons, @@ -553,7 +579,7 @@ function generateKtxEmbeddingRelationshipCandidates( continue; } - const sourceBase = normalizeKtxRelationshipName(fromColumn.name).normalized; + const sourceBase = normalizedColumnName(fromColumn).normalized; const targetBase = normalizeKtxRelationshipName(toTable.ref.name).singular; const reasons = ['embedding_similarity', ...keyEvidence.reasons]; const candidate = createCandidate({ @@ -620,7 +646,7 @@ export function generateKtxRelationshipDiscoveryCandidates( const sameTable = fromTable.id === toTable.id; const nameMatchesTarget = strictAliases.has(sourceBase); const parentTableNameMatcher = !sameTable && !nameMatchesTarget && parentAliases.has(sourceBase); - const selfReference = sameTable && SELF_REFERENCE_NAMES.has(normalizeKtxRelationshipName(fromColumn.name).normalized); + const selfReference = sameTable && SELF_REFERENCE_NAMES.has(normalizedColumnName(fromColumn).normalized); const strictTableMatcher = (!sameTable && nameMatchesTarget) || selfReference; for (const toColumn of toTable.columns) { @@ -675,7 +701,7 @@ export function generateKtxRelationshipDiscoveryCandidates( if ( !suffixMatcher && !parentTableNameMatcher && - normalizeKtxRelationshipName(fromColumn.name).normalized === normalizeKtxRelationshipName(toColumn.name).normalized + normalizedColumnName(fromColumn).normalized === normalizedColumnName(toColumn).normalized ) { reasons.push('exact_column_name'); nameScore = Math.max(nameScore, 0.9); diff --git a/packages/context/src/scan/relationship-locality.ts b/packages/context/src/scan/relationship-locality.ts index 5b180430..246ce84d 100644 --- a/packages/context/src/scan/relationship-locality.ts +++ b/packages/context/src/scan/relationship-locality.ts @@ -18,20 +18,28 @@ export interface LocalKtxRelationshipCandidateTablesInput { const DEFAULT_MAX_PARENT_TABLES = 20; const RELATIONSHIP_SUFFIX_TOKENS = new Set(['id', 'ids', 'key', 'keys', 'code', 'codes', 'uuid', 'uuids']); +const normalizedTokenVariantsCache = new Map(); function roundedScore(value: number): number { return Number(Math.max(0, Math.min(1, value)).toFixed(3)); } function normalizedTokenVariants(name: string): string[] { + const cached = normalizedTokenVariantsCache.get(name); + if (cached) { + return cached; + } + const normalized = normalizeKtxRelationshipName(name); - return Array.from( + const variants = Array.from( new Set([ ...normalized.tokens, ...tokenizeKtxRelationshipName(normalized.singular), ...tokenizeKtxRelationshipName(normalized.plural), ]), ).filter(Boolean); + normalizedTokenVariantsCache.set(name, variants); + return variants; } function childColumnLocalityTokens(column: KtxEnrichedColumn): string[] { @@ -91,24 +99,29 @@ function parentEmbeddingScore(childColumn: KtxEnrichedColumn, parentTable: KtxEn } function tableTokenScore(input: { - childTable: KtxEnrichedTable; - childColumn: KtxEnrichedColumn; + childTableId: string; + childTableTokens: readonly string[]; + childColumnTokens: readonly string[]; parentTable: KtxEnrichedTable; }): number { - const childTableTokens = normalizedTokenVariants(input.childTable.ref.name); - const childColumnTokens = childColumnLocalityTokens(input.childColumn); const parentTokens = normalizedTokenVariants(input.parentTable.ref.name); - const columnOnlyScore = jaccard(childColumnTokens, parentTokens); - if (input.parentTable.id === input.childTable.id) { + const columnOnlyScore = jaccard(input.childColumnTokens, parentTokens); + if (parentTokens.length === 0) { + return 0; + } + if (input.parentTable.id === input.childTableId) { return columnOnlyScore; } - const columnAndTableScore = jaccard(uniqueTokens([...childTableTokens, ...childColumnTokens]), parentTokens); + const columnAndTableScore = jaccard(uniqueTokens([...input.childTableTokens, ...input.childColumnTokens]), parentTokens); return Math.max(columnOnlyScore, columnAndTableScore * 0.6); } function localityScore(input: { childTable: KtxEnrichedTable; + childTableId: string; + childTableTokens: readonly string[]; childColumn: KtxEnrichedColumn; + childColumnTokens: readonly string[]; parentTable: KtxEnrichedTable; }): Omit { const tokenScore = roundedScore(tableTokenScore(input)); @@ -143,12 +156,18 @@ export function localCandidateTables( return []; } + const childTableTokens = normalizedTokenVariants(input.childTable.ref.name); + const childColumnTokens = childColumnLocalityTokens(input.childColumn); + return input.parentTables .map((table) => ({ table, ...localityScore({ childTable: input.childTable, + childTableId: input.childTable.id, + childTableTokens, childColumn: input.childColumn, + childColumnTokens, parentTable: table, }), })) diff --git a/scripts/build-benchmark-snapshot.test.mjs b/scripts/build-benchmark-snapshot.test.mjs index adc30173..26ac6419 100644 --- a/scripts/build-benchmark-snapshot.test.mjs +++ b/scripts/build-benchmark-snapshot.test.mjs @@ -1,4 +1,5 @@ import assert from 'node:assert/strict'; +import { readFile } from 'node:fs/promises'; import { createRequire } from 'node:module'; import { describe, it } from 'node:test'; import { buildBenchmarkSnapshot } from './build-benchmark-snapshot.mjs'; @@ -250,4 +251,13 @@ describe('buildBenchmarkSnapshot', () => { }, ]); }); + + it('exposes relationship benchmarks as an explicit context package script', async () => { + const packageJson = JSON.parse(await readFile(new URL('../packages/context/package.json', import.meta.url), 'utf8')); + + assert.equal( + packageJson.scripts['relationships:benchmarks:test'], + 'KTX_RUN_RELATIONSHIP_BENCHMARKS=1 vitest run src/scan/relationship-benchmarks.test.ts', + ); + }); });