mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-16 08:25:14 +02:00
* refactor(workspace): relocate @ktx/llm source into packages/cli/src/llm * refactor(workspace): rewrite @ktx/llm imports to relative paths * refactor(workspace): fold internal packages into cli * chore(workspace): gate dead-code with knip production mode Turn on production-mode knip plus an autofix run in pre-commit and the `pnpm dead-code` script, document the `/** @internal */` convention for test-only exports in AGENTS.md, annotate test-only exports across the CLI with that JSDoc, and drop dead exports/wrappers the new gate surfaced (e.g. `cli-project.ts`, `lookerRuntimeSourceToFileAdapterSource`, `createLocalScanEnrichmentProvidersFromConfig`, `PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES`, stale type re-exports). Replace the loose `ignoreIssues` allowlist in `knip.json` with explicit production entries so cross-package barrel leaks are caught. * refactor(cli): delete internal barrel index.ts files The 34 `index.ts` re-export barrels inside `packages/cli/src/` were holdovers from the pre-fold multi-workspace structure. Post-fold-in they served no production purpose: external consumers go through the single package main entry, and in-repo callers mostly imported through them only because the path was short. Internally, knip flagged most barrel re-exports as production-dead (only reached via tests). This change: - Deletes every internal barrel except `packages/cli/src/index.ts` (the published package entry). - Rewrites ~270 source/test files to import each name directly from the file that defines it. - Moves `tools/warehouse-verification/index.ts` to `create-warehouse-verification-tools.ts` (the function it defined locally) and updates its single consumer. - Renames `search/backend-conformance.ts` → `.test-utils.ts` to match the existing test-helper file convention. 
- Deletes 13 dead test-only chains (dbt-descriptions/*, live-database/extracted-schema, live-database/structural-sync, relationship-* feedback/review chain) plus their tests and a cascading orphan integration test. - Updates test mocks that pointed at deleted barrel paths (notion-client, connector barrels in scan/local-scan-connectors tests) to mock the source files instead. - Points the maintainer benchmark script (`scripts/relationship-benchmark-report.mjs`) at source files instead of `dist/context/scan/index.js`. - Drops the barrel `!` entries from `knip.json`; adds explicit production entries only for the benchmark code reached via dist by the maintainer script. Net: 413 files changed, ~1.2k insertions, ~9.4k deletions. `pnpm run dead-code` (Biome + knip default + knip production) and `pnpm run type-check` are clean; 2277 tests pass. * refactor(workspace): rename @ktx/cli to @kaelio/ktx and pack it directly Promote the CLI workspace package to the public name `@kaelio/ktx` and drop the separate `scripts/build-public-npm-package.mjs` wrapper. The CLI package is now publishable in place (`publishConfig.access: public`, `provenance: true`), so artifact packing uses `pnpm pack` against `packages/cli/` instead of assembling a parallel package tree. Updates all workspace filter invocations, docs, tests, and release readiness checks to reference the new package name, and folds the tarball-name helper into `scripts/public-npm-release-metadata.mjs`. * docs: align "agent clients" and "data agents" terminology Replace "client agents" with "agent clients" and "database agents" with "data agents" across AGENTS.md, README.md, the docs-site copy, and the matching setup-agents test description, matching the canonical vocabulary in docs/terminology.md. Also moves packages/cli/tsconfig.json's tsBuildInfoFile from node_modules/.cache/ to dist/.tsbuildinfo so incremental builds survive node_modules reinstalls. 
* refactor(release): single source of truth for package version Make packages/cli/package.json the single source of truth for the @kaelio/ktx version. publicNpmPackageVersion() now reads it directly, so artifact filenames, release-readiness checks, and the Python wheel version all derive from one field. The duplicate release-policy.json.publicNpmPackageVersion is removed. Previously the two fields could drift: tarballs were named kaelio-ktx-0.4.1.tgz while internally containing @kaelio/ktx@0.0.0-private. - update-public-release-version.mjs rewrites both Python pyproject.toml files (ktx-daemon, ktx-sl) alongside the npm package.jsons, normalizing the version for PEP 440 (e.g. 0.1.0-rc.2 -> 0.1.0rc2). - semantic-release-config.cjs adds the two pyproject.toml files to @semantic-release/git assets so the release commit back to main carries every version source in lockstep. - The six "?? '0.0.0-private'" fallback literals across the CLI are replaced with "?? getKtxCliPackageInfo().version", and createDefaultKtxMcpServer makes its version arg required. - docs/release.md describes the actual commit-back model: the dev tree always reflects the most recent release; no sentinel pin to maintain. Verified: pnpm run artifacts:build now produces kaelio-ktx-0.4.1.tgz and kaelio_ktx-0.4.1-py3-none-any.whl with @kaelio/ktx@0.4.1 inside. Full type-check, dead-code, and 2287 vitests + 173 script tests pass. * refactor(cli): inject embedding provider resolution and detect sentence-transformers runtime Make resolveProjectEmbeddingProvider and runtimeIo injectable in ingest and scan command entrypoints so tests can stub them, and teach resolvePublicIngestRuntimeRequirements to flag the local-embeddings runtime feature when ktx.yaml selects sentence-transformers. * chore(cli): mark buildLocalStatsStatus and LocalStatsStatus as @internal Both symbols are consumed only by status-project.test.ts. 
Annotating with /** @internal */ keeps knip's production-mode check clean without changing runtime behavior. * fix(cli): use real package metadata in print-command-tree The stubbed package name embedded a forbidden product identifier that tripped the boundary check in CI. Read the metadata from package.json instead — keeps the rendered tree unchanged and removes a duplicate source of truth. * feat(cli): show embedding coverage in `ktx status`, drop duplicate disk counts Inline `(N embedded)` next to the Wiki scope counts and Semantic-layer source counts, computed with `SUM(embedding_json IS NOT NULL)` over `knowledge_pages` and `local_sl_sources`. Rename the "Knowledge" label to "Wiki" (canonical per `docs/terminology.md`) and rename the matching `localStats.knowledgePages` field to `localStats.wikiPages`. Drop `wiki=N md` and `semantic-layer=N yaml` from the Disk row — those duplicated the per-surface rows above. Disk now reports only actual byte usage (db, cache, raw-sources). The unused `wikiGlobalMarkdownCount` / `semanticLayerYamlCount` fields, the `isMarkdownEntry` / `isYamlEntry` helpers, and the `filter` arg on `summarizeDir` are removed.
183 lines
5.9 KiB
TypeScript
183 lines
5.9 KiB
TypeScript
import type { KtxEnrichedColumn, KtxEnrichedTable } from './enrichment-types.js';
|
|
import { normalizeKtxRelationshipName, tokenizeKtxRelationshipName } from './relationship-name-similarity.js';
|
|
|
|
/**
 * One parent table ranked as a possible relationship target for a child column.
 *
 * The three numeric fields are produced by `localityScore`, which passes each of
 * them through `roundedScore` (clamped to [0, 1], three decimals).
 */
export interface KtxRelationshipLocalityCandidateTable {
  /** The candidate parent table. */
  table: KtxEnrichedTable;
  /** Combined ranking score; the primary sort key in `localCandidateTables`. */
  score: number;
  /** Name-token overlap component of the score. */
  tokenScore: number;
  /** Embedding-similarity component; 0 when the child column carries no embedding. */
  embeddingScore: number;
  /**
   * Which signals contributed: `'column_table_token_overlap'`,
   * `'embedding_similarity'`, or `'locality_tie_breaker'` when neither did.
   */
  reasons: string[];
}
|
|
|
|
/** Arguments accepted by `localCandidateTables`. */
export interface LocalKtxRelationshipCandidateTablesInput {
  /** Table that owns `childColumn`; its name tokens and id feed the token score. */
  childTable: KtxEnrichedTable;
  /** Column for which candidate parent tables are being ranked. */
  childColumn: KtxEnrichedColumn;
  /** Tables to rank. The array is not mutated. */
  parentTables: readonly KtxEnrichedTable[];
  /**
   * Maximum number of candidates to return. Defaults to `DEFAULT_MAX_PARENT_TABLES`.
   * Fractional values are floored; non-finite or non-positive values produce an
   * empty result.
   */
  maxParentTables?: number;
}
|
|
|
|
/** Number of candidates returned when the caller does not set `maxParentTables`. */
const DEFAULT_MAX_PARENT_TABLES = 20;

/**
 * Column-name tokens that `childColumnLocalityTokens` strips before matching
 * against parent table names.
 */
const RELATIONSHIP_SUFFIX_TOKENS = new Set<string>([
  'id',
  'ids',
  'key',
  'keys',
  'code',
  'codes',
  'uuid',
  'uuids',
]);

/**
 * Memo for `normalizedTokenVariants`, keyed by the raw name.
 * Entries live for the lifetime of the module and are never evicted.
 */
const normalizedTokenVariantsCache: Map<string, string[]> = new Map();
|
|
|
|
/**
 * Clamps a raw score into [0, 1] and rounds it to three decimal places.
 *
 * NaN is mapped to 0. `Math.min` and `Math.max` both propagate NaN, so without
 * this guard a NaN similarity would surface in the emitted scores and make the
 * subtraction-based sort comparator in `localCandidateTables` return NaN.
 *
 * @param value Raw score; may be out of range or non-finite.
 * @returns A number in [0, 1] with at most three decimal places.
 */
function roundedScore(value: number): number {
  if (Number.isNaN(value)) {
    return 0;
  }
  // +/-Infinity need no special case: the clamp pins them to 1 and 0.
  const clamped = Math.max(0, Math.min(1, value));
  return Number(clamped.toFixed(3));
}
|
|
|
|
/**
 * Returns the distinct, non-empty tokens for `name`: the tokens reported by
 * `normalizeKtxRelationshipName` plus the tokens of its `singular` and `plural`
 * forms, in first-seen order.
 *
 * Results are memoized per raw name in `normalizedTokenVariantsCache`; the
 * returned array is the cached instance, so callers must not mutate it.
 */
function normalizedTokenVariants(name: string): string[] {
  const memoized = normalizedTokenVariantsCache.get(name);
  if (memoized !== undefined) {
    return memoized;
  }

  const { tokens, singular, plural } = normalizeKtxRelationshipName(name);
  const candidates = [
    ...tokens,
    ...tokenizeKtxRelationshipName(singular),
    ...tokenizeKtxRelationshipName(plural),
  ];

  // A Set keeps insertion order, so the first occurrence of each token wins.
  const distinct = new Set<string>();
  for (const token of candidates) {
    if (token) {
      distinct.add(token);
    }
  }

  const variants = [...distinct];
  normalizedTokenVariantsCache.set(name, variants);
  return variants;
}
|
|
|
|
/**
 * Tokens of the child column's name with relationship suffixes (`id`, `key`,
 * `code`, `uuid` and their plurals) removed.
 *
 * If every token is a suffix, the unfiltered tokens are returned instead so the
 * column still has something to match on.
 */
function childColumnLocalityTokens(column: KtxEnrichedColumn): string[] {
  const allTokens = normalizedTokenVariants(column.name);

  const meaningful: string[] = [];
  for (const token of allTokens) {
    if (!RELATIONSHIP_SUFFIX_TOKENS.has(token)) {
      meaningful.push(token);
    }
  }

  if (meaningful.length === 0) {
    return allTokens;
  }
  return meaningful;
}
|
|
|
|
/**
 * Removes empty strings and duplicates from `values`, keeping the first
 * occurrence of each remaining token in its original position.
 */
function uniqueTokens(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.length > 0) {
      seen.add(value);
    }
  }
  return [...seen];
}
|
|
|
|
/**
 * Jaccard similarity of two token lists treated as sets:
 * |left ∩ right| / |left ∪ right|.
 *
 * @returns A value in [0, 1]; 0 when either list is empty.
 */
function jaccard(left: readonly string[], right: readonly string[]): number {
  if (left.length === 0 || right.length === 0) {
    return 0;
  }

  const leftSet = new Set(left);
  const rightSet = new Set(right);

  let shared = 0;
  for (const token of leftSet) {
    if (rightSet.has(token)) {
      shared += 1;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = leftSet.size + rightSet.size - shared;
  return unionSize === 0 ? 0 : shared / unionSize;
}
|
|
|
|
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param left First vector, or null when unavailable.
 * @param right Second vector, or null when unavailable.
 * @returns A finite value in [-1, 1] (up to floating-point error). Returns 0 when
 *   either vector is missing or empty, the lengths differ, either vector has zero
 *   magnitude, or the computation is not finite.
 */
function cosineSimilarity(left: readonly number[] | null, right: readonly number[] | null): number {
  if (!left || !right || left.length === 0 || left.length !== right.length) {
    return 0;
  }

  let dot = 0;
  let leftMagnitude = 0;
  let rightMagnitude = 0;
  for (let index = 0; index < left.length; index += 1) {
    // `?? 0` treats holes / undefined slots as zero components.
    const leftValue = left[index] ?? 0;
    const rightValue = right[index] ?? 0;
    dot += leftValue * rightValue;
    leftMagnitude += leftValue * leftValue;
    rightMagnitude += rightValue * rightValue;
  }

  if (leftMagnitude === 0 || rightMagnitude === 0) {
    return 0;
  }

  const similarity = dot / (Math.sqrt(leftMagnitude) * Math.sqrt(rightMagnitude));
  // NaN components, or components whose squares overflow to Infinity, make the
  // ratio NaN. Report "no similarity" rather than let NaN poison the caller's
  // Math.max fold over a table's columns.
  return Number.isFinite(similarity) ? similarity : 0;
}
|
|
|
|
/**
 * Highest cosine similarity between the child column's embedding and the
 * embedding of any column in `parentTable`.
 *
 * The running maximum starts at 0, so negative similarities never lower the
 * result. Returns 0 when the child column has no non-empty array embedding.
 */
function parentEmbeddingScore(childColumn: KtxEnrichedColumn, parentTable: KtxEnrichedTable): number {
  const childEmbedding = childColumn.embedding;
  const hasChildEmbedding = Array.isArray(childEmbedding) && childEmbedding.length > 0;
  if (!hasChildEmbedding) {
    return 0;
  }

  let highest = 0;
  for (const { embedding: parentEmbedding } of parentTable.columns) {
    const similarity = cosineSimilarity(childEmbedding, parentEmbedding);
    highest = Math.max(highest, similarity);
  }
  return highest;
}
|
|
|
|
/**
 * Name-token overlap between a child column (optionally combined with its
 * table's name) and a candidate parent table's name.
 *
 * - Parent name yields no tokens: 0.
 * - Parent is the child table itself: overlap of the column tokens only.
 * - Otherwise: the larger of the column-only overlap and 0.6 times the overlap of
 *   the combined table + column tokens.
 */
function tableTokenScore(input: {
  childTableId: string;
  childTableTokens: readonly string[];
  childColumnTokens: readonly string[];
  parentTable: KtxEnrichedTable;
}): number {
  const { childTableId, childTableTokens, childColumnTokens, parentTable } = input;

  const parentTokens = normalizedTokenVariants(parentTable.ref.name);
  if (parentTokens.length === 0) {
    return 0;
  }

  const columnOnlyScore = jaccard(childColumnTokens, parentTokens);
  const isSelfReference = parentTable.id === childTableId;
  if (isSelfReference) {
    // The child table's own name would trivially match itself, so leave it out.
    return columnOnlyScore;
  }

  const combinedTokens = uniqueTokens([...childTableTokens, ...childColumnTokens]);
  const columnAndTableScore = jaccard(combinedTokens, parentTokens);
  return Math.max(columnOnlyScore, columnAndTableScore * 0.6);
}
|
|
|
|
/**
 * Scores one candidate parent table for a child column.
 *
 * `tokenScore` and `embeddingScore` are each clamped and rounded by
 * `roundedScore`. When the embedding score is positive, the combined `score` is
 * the largest of the token score, an 80/20 token/embedding blend, and 0.65 times
 * the embedding score; otherwise it is the token score alone.
 *
 * @returns The scores plus `reasons` naming which signals were positive, or
 *   `'locality_tie_breaker'` when neither was.
 */
function localityScore(input: {
  childTable: KtxEnrichedTable;
  childTableId: string;
  childTableTokens: readonly string[];
  childColumn: KtxEnrichedColumn;
  childColumnTokens: readonly string[];
  parentTable: KtxEnrichedTable;
}): Omit<KtxRelationshipLocalityCandidateTable, 'table'> {
  const tokenScore = roundedScore(tableTokenScore(input));
  const embeddingScore = roundedScore(parentEmbeddingScore(input.childColumn, input.parentTable));

  let score = tokenScore;
  if (embeddingScore > 0) {
    const blended = tokenScore * 0.8 + embeddingScore * 0.2;
    const embeddingOnly = embeddingScore * 0.65;
    score = roundedScore(Math.max(tokenScore, blended, embeddingOnly));
  }

  const signals: string[] = [];
  if (tokenScore > 0) {
    signals.push('column_table_token_overlap');
  }
  if (embeddingScore > 0) {
    signals.push('embedding_similarity');
  }
  const reasons = signals.length > 0 ? signals : ['locality_tie_breaker'];

  return { score, tokenScore, embeddingScore, reasons };
}
|
|
|
|
/**
 * Ranks `input.parentTables` as relationship targets for `input.childColumn` and
 * returns the best ones.
 *
 * Ordering: combined score, then token score, then embedding score (all
 * descending), then table name and table id (ascending, via `localeCompare`) so
 * that ties resolve deterministically.
 *
 * @param input Child table/column, the tables to rank, and an optional limit.
 * @returns At most `floor(maxParentTables)` candidates (default 20), best first.
 *   Empty when the limit is non-finite or not positive.
 */
export function localCandidateTables(
  input: LocalKtxRelationshipCandidateTablesInput,
): KtxRelationshipLocalityCandidateTable[] {
  const { childTable, childColumn, parentTables } = input;

  const limit = input.maxParentTables ?? DEFAULT_MAX_PARENT_TABLES;
  const hasUsableLimit = Number.isFinite(limit) && limit > 0;
  if (!hasUsableLimit) {
    return [];
  }

  const childTableTokens = normalizedTokenVariants(childTable.ref.name);
  const childColumnTokens = childColumnLocalityTokens(childColumn);

  // `||` moves on to the next key whenever the current difference is 0.
  const byLocality = (
    left: KtxRelationshipLocalityCandidateTable,
    right: KtxRelationshipLocalityCandidateTable,
  ): number =>
    right.score - left.score ||
    right.tokenScore - left.tokenScore ||
    right.embeddingScore - left.embeddingScore ||
    left.table.ref.name.localeCompare(right.table.ref.name) ||
    left.table.id.localeCompare(right.table.id);

  // `map` builds a fresh array, so the in-place sort never touches the caller's list.
  const ranked: KtxRelationshipLocalityCandidateTable[] = parentTables.map((parentTable) => {
    const scored = localityScore({
      childTable,
      childTableId: childTable.id,
      childTableTokens,
      childColumn,
      childColumnTokens,
      parentTable,
    });
    return { table: parentTable, ...scored };
  });
  ranked.sort(byLocality);

  return ranked.slice(0, Math.floor(limit));
}
|