mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-07 07:55:13 +02:00
Optimize relationship discovery benchmarks
This commit is contained in:
parent
9530d33c00
commit
eee46896f1
5 changed files with 83 additions and 21 deletions
|
|
@ -120,6 +120,7 @@
|
|||
"scripts": {
|
||||
"build": "tsc -p tsconfig.json",
|
||||
"relationships:benchmarks": "pnpm --silent run build && node scripts/relationship-benchmark-report.mjs",
|
||||
"relationships:benchmarks:test": "KTX_RUN_RELATIONSHIP_BENCHMARKS=1 vitest run src/scan/relationship-benchmarks.test.ts",
|
||||
"search:pglite-spike": "node scripts/pglite-hybrid-search-spike.mjs",
|
||||
"search:pglite-owner-prototype": "node scripts/pglite-owner-process-prototype.mjs",
|
||||
"search:pglite-sl-prototype": "node scripts/pglite-sl-search-prototype.mjs",
|
||||
|
|
|
|||
|
|
@ -53,6 +53,12 @@ const CHECKED_IN_FIXTURE_ORIGINS = {
|
|||
semantic_embedding_aliases_no_declared_constraints: 'synthetic',
|
||||
} as const;
|
||||
|
||||
function runAdHocRelationshipBenchmarks(): boolean {
|
||||
return process.env.KTX_RUN_RELATIONSHIP_BENCHMARKS === '1';
|
||||
}
|
||||
|
||||
const adHocRelationshipBenchmarkIt = runAdHocRelationshipBenchmarks() ? it : it.skip;
|
||||
|
||||
function snapshot(): KtxSchemaSnapshot {
|
||||
return {
|
||||
connectionId: 'warehouse',
|
||||
|
|
@ -644,7 +650,7 @@ describe('relationship benchmarks', () => {
|
|||
expect(fixture.expected.expectedLinks).toHaveLength(1900);
|
||||
});
|
||||
|
||||
it('runs the scale stress fixture inside the benchmark validation budget', async () => {
|
||||
adHocRelationshipBenchmarkIt('runs the scale stress fixture inside the benchmark validation budget', async () => {
|
||||
const fixtureRoot = new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url);
|
||||
const fixture = await loadKtxRelationshipBenchmarkFixture(
|
||||
join(fixtureRoot.pathname, 'scale_stress_no_declared_constraints'),
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import type {
|
|||
} from './enrichment-types.js';
|
||||
import { localCandidateTables } from './relationship-locality.js';
|
||||
import {
|
||||
type KtxRelationshipNormalizedName,
|
||||
normalizeKtxRelationshipName,
|
||||
pluralizeKtxRelationshipToken,
|
||||
singularizeKtxRelationshipToken,
|
||||
|
|
@ -97,9 +98,22 @@ const REFERENCE_SUFFIXES: Array<{ suffix: string; reason: string }> = [
|
|||
{ suffix: '_uuid', reason: 'foreign_key_uuid_suffix' },
|
||||
];
|
||||
const RELATIONSHIP_KEY_TARGET_SUFFIXES = ['_id', '_key', '_code', '_uuid'] as const;
|
||||
const tableAliasesCache = new WeakMap<KtxEnrichedTable, Set<string>>();
|
||||
const parentTableNameAliasesCache = new WeakMap<KtxEnrichedTable, Set<string>>();
|
||||
const normalizedColumnNameCache = new WeakMap<KtxEnrichedColumn, KtxRelationshipNormalizedName>();
|
||||
|
||||
function normalizedColumnName(column: KtxEnrichedColumn): KtxRelationshipNormalizedName {
|
||||
const cached = normalizedColumnNameCache.get(column);
|
||||
if (cached) {
|
||||
return cached;
|
||||
}
|
||||
const normalized = normalizeKtxRelationshipName(column.name);
|
||||
normalizedColumnNameCache.set(column, normalized);
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function isRelationshipKeyShapedTarget(column: KtxEnrichedColumn): boolean {
|
||||
const normalized = normalizeKtxRelationshipName(column.name);
|
||||
const normalized = normalizedColumnName(column);
|
||||
return (
|
||||
normalized.tokens.length >= 2 &&
|
||||
RELATIONSHIP_KEY_TARGET_SUFFIXES.some((suffix) => normalized.normalized.endsWith(suffix))
|
||||
|
|
@ -107,8 +121,8 @@ function isRelationshipKeyShapedTarget(column: KtxEnrichedColumn): boolean {
|
|||
}
|
||||
|
||||
function columnSuffixMatchesTarget(input: { fromColumn: KtxEnrichedColumn; toColumn: KtxEnrichedColumn }): boolean {
|
||||
const source = normalizeKtxRelationshipName(input.fromColumn.name).normalized;
|
||||
const target = normalizeKtxRelationshipName(input.toColumn.name).normalized;
|
||||
const source = normalizedColumnName(input.fromColumn).normalized;
|
||||
const target = normalizedColumnName(input.toColumn).normalized;
|
||||
return source !== target && target.length > 0 && source.endsWith(`_${target}`);
|
||||
}
|
||||
|
||||
|
|
@ -160,7 +174,7 @@ function hasUsableEmbedding(column: KtxEnrichedColumn): boolean {
|
|||
}
|
||||
|
||||
function sourceColumnReference(column: KtxEnrichedColumn): KtxRelationshipSourceColumnReference | null {
|
||||
const normalized = normalizeKtxRelationshipName(column.name);
|
||||
const normalized = normalizedColumnName(column);
|
||||
if (SELF_REFERENCE_NAMES.has(normalized.normalized)) {
|
||||
return { base: normalized.normalized.replace(/_id$/u, ''), reason: 'foreign_key_suffix' };
|
||||
}
|
||||
|
|
@ -192,6 +206,11 @@ function addNormalizedTableAlias(aliases: Set<string>, name: string): void {
|
|||
}
|
||||
|
||||
function tableAliases(table: KtxEnrichedTable): Set<string> {
|
||||
const cached = tableAliasesCache.get(table);
|
||||
if (cached) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
const normalized = normalizeKtxRelationshipName(table.ref.name);
|
||||
const aliases = new Set([normalized.normalized, normalized.singular, normalized.plural]);
|
||||
if (normalized.tokens.length > 1) {
|
||||
|
|
@ -203,6 +222,7 @@ function tableAliases(table: KtxEnrichedTable): Set<string> {
|
|||
aliases.add(pluralizeKtxRelationshipToken(singularLastToken));
|
||||
}
|
||||
}
|
||||
tableAliasesCache.set(table, aliases);
|
||||
return aliases;
|
||||
}
|
||||
|
||||
|
|
@ -212,13 +232,19 @@ function finalTableNamePart(table: KtxEnrichedTable): string {
|
|||
}
|
||||
|
||||
function parentTableNameAliases(table: KtxEnrichedTable): Set<string> {
|
||||
const aliases = tableAliases(table);
|
||||
const cached = parentTableNameAliasesCache.get(table);
|
||||
if (cached) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
const aliases = new Set(tableAliases(table));
|
||||
addNormalizedTableAlias(aliases, finalTableNamePart(table));
|
||||
parentTableNameAliasesCache.set(table, aliases);
|
||||
return aliases;
|
||||
}
|
||||
|
||||
function targetKeyScore(table: KtxEnrichedTable, column: KtxEnrichedColumn): number {
|
||||
const columnName = normalizeKtxRelationshipName(column.name).normalized;
|
||||
const columnName = normalizedColumnName(column).normalized;
|
||||
const tableKeyBases = parentTableNameAliases(table);
|
||||
if (column.primaryKey) {
|
||||
return 1;
|
||||
|
|
@ -338,7 +364,7 @@ function candidateParentTables(input: {
|
|||
maxParentTables,
|
||||
}).map((item) => item.table);
|
||||
|
||||
const normalizedColumn = normalizeKtxRelationshipName(input.fromColumn.name).normalized;
|
||||
const normalizedColumn = normalizedColumnName(input.fromColumn).normalized;
|
||||
if (!SELF_REFERENCE_NAMES.has(normalizedColumn) || ranked.some((table) => table.id === input.fromTable.id)) {
|
||||
return ranked;
|
||||
}
|
||||
|
|
@ -364,7 +390,7 @@ function targetKeyEvidence(
|
|||
return { score: 0, reasons: [] };
|
||||
}
|
||||
|
||||
const columnName = normalizeKtxRelationshipName(column.name).normalized;
|
||||
const columnName = normalizedColumnName(column).normalized;
|
||||
if (columnName === 'code' || columnName.endsWith('_code') || columnName === 'key' || columnName.endsWith('_key')) {
|
||||
return { score: 0.86, reasons: ['profile_unique_target'] };
|
||||
}
|
||||
|
|
@ -500,7 +526,7 @@ function createCandidate(input: {
|
|||
evidence: {
|
||||
sourceColumnBase: input.sourceBase,
|
||||
targetTableBase: input.targetBase,
|
||||
targetColumnBase: normalizeKtxRelationshipName(input.toColumn.name).normalized,
|
||||
targetColumnBase: normalizedColumnName(input.toColumn).normalized,
|
||||
targetKeyScore: input.targetKeyScore,
|
||||
nameScore: input.nameScore,
|
||||
reasons: input.reasons,
|
||||
|
|
@ -553,7 +579,7 @@ function generateKtxEmbeddingRelationshipCandidates(
|
|||
continue;
|
||||
}
|
||||
|
||||
const sourceBase = normalizeKtxRelationshipName(fromColumn.name).normalized;
|
||||
const sourceBase = normalizedColumnName(fromColumn).normalized;
|
||||
const targetBase = normalizeKtxRelationshipName(toTable.ref.name).singular;
|
||||
const reasons = ['embedding_similarity', ...keyEvidence.reasons];
|
||||
const candidate = createCandidate({
|
||||
|
|
@ -620,7 +646,7 @@ export function generateKtxRelationshipDiscoveryCandidates(
|
|||
const sameTable = fromTable.id === toTable.id;
|
||||
const nameMatchesTarget = strictAliases.has(sourceBase);
|
||||
const parentTableNameMatcher = !sameTable && !nameMatchesTarget && parentAliases.has(sourceBase);
|
||||
const selfReference = sameTable && SELF_REFERENCE_NAMES.has(normalizeKtxRelationshipName(fromColumn.name).normalized);
|
||||
const selfReference = sameTable && SELF_REFERENCE_NAMES.has(normalizedColumnName(fromColumn).normalized);
|
||||
const strictTableMatcher = (!sameTable && nameMatchesTarget) || selfReference;
|
||||
|
||||
for (const toColumn of toTable.columns) {
|
||||
|
|
@ -675,7 +701,7 @@ export function generateKtxRelationshipDiscoveryCandidates(
|
|||
if (
|
||||
!suffixMatcher &&
|
||||
!parentTableNameMatcher &&
|
||||
normalizeKtxRelationshipName(fromColumn.name).normalized === normalizeKtxRelationshipName(toColumn.name).normalized
|
||||
normalizedColumnName(fromColumn).normalized === normalizedColumnName(toColumn).normalized
|
||||
) {
|
||||
reasons.push('exact_column_name');
|
||||
nameScore = Math.max(nameScore, 0.9);
|
||||
|
|
|
|||
|
|
@ -18,20 +18,28 @@ export interface LocalKtxRelationshipCandidateTablesInput {
|
|||
|
||||
const DEFAULT_MAX_PARENT_TABLES = 20;
|
||||
const RELATIONSHIP_SUFFIX_TOKENS = new Set(['id', 'ids', 'key', 'keys', 'code', 'codes', 'uuid', 'uuids']);
|
||||
const normalizedTokenVariantsCache = new Map<string, string[]>();
|
||||
|
||||
function roundedScore(value: number): number {
|
||||
return Number(Math.max(0, Math.min(1, value)).toFixed(3));
|
||||
}
|
||||
|
||||
function normalizedTokenVariants(name: string): string[] {
|
||||
const cached = normalizedTokenVariantsCache.get(name);
|
||||
if (cached) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
const normalized = normalizeKtxRelationshipName(name);
|
||||
return Array.from(
|
||||
const variants = Array.from(
|
||||
new Set([
|
||||
...normalized.tokens,
|
||||
...tokenizeKtxRelationshipName(normalized.singular),
|
||||
...tokenizeKtxRelationshipName(normalized.plural),
|
||||
]),
|
||||
).filter(Boolean);
|
||||
normalizedTokenVariantsCache.set(name, variants);
|
||||
return variants;
|
||||
}
|
||||
|
||||
function childColumnLocalityTokens(column: KtxEnrichedColumn): string[] {
|
||||
|
|
@ -91,24 +99,29 @@ function parentEmbeddingScore(childColumn: KtxEnrichedColumn, parentTable: KtxEn
|
|||
}
|
||||
|
||||
function tableTokenScore(input: {
|
||||
childTable: KtxEnrichedTable;
|
||||
childColumn: KtxEnrichedColumn;
|
||||
childTableId: string;
|
||||
childTableTokens: readonly string[];
|
||||
childColumnTokens: readonly string[];
|
||||
parentTable: KtxEnrichedTable;
|
||||
}): number {
|
||||
const childTableTokens = normalizedTokenVariants(input.childTable.ref.name);
|
||||
const childColumnTokens = childColumnLocalityTokens(input.childColumn);
|
||||
const parentTokens = normalizedTokenVariants(input.parentTable.ref.name);
|
||||
const columnOnlyScore = jaccard(childColumnTokens, parentTokens);
|
||||
if (input.parentTable.id === input.childTable.id) {
|
||||
const columnOnlyScore = jaccard(input.childColumnTokens, parentTokens);
|
||||
if (parentTokens.length === 0) {
|
||||
return 0;
|
||||
}
|
||||
if (input.parentTable.id === input.childTableId) {
|
||||
return columnOnlyScore;
|
||||
}
|
||||
const columnAndTableScore = jaccard(uniqueTokens([...childTableTokens, ...childColumnTokens]), parentTokens);
|
||||
const columnAndTableScore = jaccard(uniqueTokens([...input.childTableTokens, ...input.childColumnTokens]), parentTokens);
|
||||
return Math.max(columnOnlyScore, columnAndTableScore * 0.6);
|
||||
}
|
||||
|
||||
function localityScore(input: {
|
||||
childTable: KtxEnrichedTable;
|
||||
childTableId: string;
|
||||
childTableTokens: readonly string[];
|
||||
childColumn: KtxEnrichedColumn;
|
||||
childColumnTokens: readonly string[];
|
||||
parentTable: KtxEnrichedTable;
|
||||
}): Omit<KtxRelationshipLocalityCandidateTable, 'table'> {
|
||||
const tokenScore = roundedScore(tableTokenScore(input));
|
||||
|
|
@ -143,12 +156,18 @@ export function localCandidateTables(
|
|||
return [];
|
||||
}
|
||||
|
||||
const childTableTokens = normalizedTokenVariants(input.childTable.ref.name);
|
||||
const childColumnTokens = childColumnLocalityTokens(input.childColumn);
|
||||
|
||||
return input.parentTables
|
||||
.map((table) => ({
|
||||
table,
|
||||
...localityScore({
|
||||
childTable: input.childTable,
|
||||
childTableId: input.childTable.id,
|
||||
childTableTokens,
|
||||
childColumn: input.childColumn,
|
||||
childColumnTokens,
|
||||
parentTable: table,
|
||||
}),
|
||||
}))
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import assert from 'node:assert/strict';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { createRequire } from 'node:module';
|
||||
import { describe, it } from 'node:test';
|
||||
import { buildBenchmarkSnapshot } from './build-benchmark-snapshot.mjs';
|
||||
|
|
@ -250,4 +251,13 @@ describe('buildBenchmarkSnapshot', () => {
|
|||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('exposes relationship benchmarks as an explicit context package script', async () => {
|
||||
const packageJson = JSON.parse(await readFile(new URL('../packages/context/package.json', import.meta.url), 'utf8'));
|
||||
|
||||
assert.equal(
|
||||
packageJson.scripts['relationships:benchmarks:test'],
|
||||
'KTX_RUN_RELATIONSHIP_BENCHMARKS=1 vitest run src/scan/relationship-benchmarks.test.ts',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue