fix: remove deterministic embedding backend

This commit is contained in:
Andrey Avtomonov 2026-05-19 16:09:05 +02:00
parent b4c77c0563
commit 6ee15018ad
26 changed files with 144 additions and 218 deletions

View file

@ -676,8 +676,7 @@ describe('runKtxDoctor', () => {
' adapters:',
' - live-database',
' embeddings:',
' backend: deterministic',
' model: deterministic',
' backend: none',
' dimensions: 8',
'',
].join('\n'),
@ -694,8 +693,8 @@ describe('runKtxDoctor', () => {
).resolves.toBe(0);
expect(testIo.stdout()).toContain('Embeddings');
expect(testIo.stdout()).toContain('deterministic');
expect(testIo.stdout()).toContain('semantic search degraded');
expect(testIo.stdout()).toContain('none');
expect(testIo.stdout()).toContain('semantic search will be skipped');
delete process.env.ANTHROPIC_API_KEY;
});

View file

@ -62,7 +62,6 @@ export function deepReadinessGaps(config: KtxProjectConfig): string[] {
if (
!embeddings ||
embeddings.backend === 'none' ||
embeddings.backend === 'deterministic' ||
!embeddings.model ||
embeddings.dimensions <= 0
) {

View file

@ -133,7 +133,7 @@ export async function writeMetabaseConfig(projectDir: string): Promise<void> {
' adapters:',
' - metabase',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -502,7 +502,7 @@ export async function runPublicMetabaseSyncModeCase(tempDir: string, input: Sync
' adapters:',
' - metabase',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',

View file

@ -777,7 +777,7 @@ describe('runKtxIngest', () => {
' adapters:',
' - metabase',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -1845,7 +1845,7 @@ describe('runKtxIngest', () => {
' adapters:',
' - looker',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',

View file

@ -324,7 +324,7 @@ describe('setup embeddings step', () => {
expect(result.status).toBe('failed');
const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'));
expect(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')).not.toContain('completed_steps:');
expect(config.ingest.embeddings.backend).toBe('deterministic');
expect(config.ingest.embeddings.backend).toBe('none');
expect(io.stderr()).toContain('Local embedding health check failed: 401 invalid api key [redacted]');
expect(io.stderr()).toContain('Prepare the runtime with: ktx dev runtime start --feature local-embeddings');
expect(io.stderr()).not.toContain('skip for now');
@ -436,7 +436,7 @@ describe('setup embeddings step', () => {
expect(result.status).toBe('skipped');
const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'));
expect(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')).not.toContain('completed_steps:');
expect(config.ingest.embeddings.backend).toBe('deterministic');
expect(config.ingest.embeddings.backend).toBe('none');
});
it('returns back without writing config when the local health check fails and Back is selected', async () => {
@ -460,7 +460,7 @@ describe('setup embeddings step', () => {
expect(result.status).toBe('back');
const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'));
expect(config.ingest.embeddings.backend).toBe('deterministic');
expect(config.ingest.embeddings.backend).toBe('none');
});
it('preserves already completed embeddings setup when no embedding args request changes', async () => {

View file

@ -94,7 +94,6 @@ async function hasCompletedEmbeddings(projectDir: string, config: KtxProjectConf
return (
(await readKtxSetupState(projectDir)).completed_steps.includes('embeddings') &&
config.ingest.embeddings.backend !== 'none' &&
config.ingest.embeddings.backend !== 'deterministic' &&
typeof config.ingest.embeddings.model === 'string' &&
config.ingest.embeddings.model.length > 0 &&
config.ingest.embeddings.dimensions > 0

View file

@ -1166,8 +1166,7 @@ describe('setup Anthropic model step', () => {
' default: claude-sonnet-4-6',
'ingest:',
' embeddings:',
' backend: deterministic',
' model: deterministic',
' backend: none',
' dimensions: 8',
].join('\n'),
'utf-8',
@ -1209,8 +1208,7 @@ describe('setup Anthropic model step', () => {
` default: ${fixture.model}`,
'ingest:',
' embeddings:',
' backend: deterministic',
' model: deterministic',
' backend: none',
' dimensions: 8',
].join('\n'),
'utf-8',

View file

@ -108,7 +108,7 @@ describe('setup status', () => {
});
});
it('reports deterministic default embeddings as not setup-ready', async () => {
it('reports disabled default embeddings as not setup-ready', async () => {
await mkdir(tempDir, { recursive: true });
await writeFile(
join(tempDir, 'ktx.yaml'),
@ -122,8 +122,7 @@ describe('setup status', () => {
' default: claude-sonnet-4-6',
'ingest:',
' embeddings:',
' backend: deterministic',
' model: deterministic',
' backend: none',
' dimensions: 8',
'connections: {}',
].join('\n'),
@ -133,7 +132,7 @@ describe('setup status', () => {
await expect(readKtxSetupStatus(tempDir)).resolves.toMatchObject({
project: { path: tempDir, ready: true },
llm: { backend: 'anthropic', ready: true, model: 'claude-sonnet-4-6' },
embeddings: { backend: 'deterministic', ready: false, model: 'deterministic', dimensions: 8 },
embeddings: { backend: 'none', ready: false, dimensions: 8 },
});
});
@ -373,8 +372,7 @@ describe('setup status', () => {
' default: claude-sonnet-4-6',
'ingest:',
' embeddings:',
' backend: deterministic',
' model: deterministic',
' backend: none',
' dimensions: 8',
'',
].join('\n'),

View file

@ -238,7 +238,6 @@ function embeddingsReady(status: KtxSetupStatus['embeddings']): boolean {
return (
status.backend !== undefined &&
status.backend !== 'none' &&
status.backend !== 'deterministic' &&
typeof status.model === 'string' &&
status.model.length > 0 &&
typeof status.dimensions === 'number' &&

View file

@ -93,7 +93,7 @@ async function writeSqliteScanConfig(projectDir: string, dbPath: string, enrich
' enrichment:',
' mode: deterministic',
' embeddings:',
' backend: deterministic',
' backend: none',
' dimensions: 6',
]
: []),
@ -166,7 +166,7 @@ describe('standalone built ktx CLI smoke', () => {
});
it('runs status setup checks through the built binary', async () => {
const result = await runBuiltCli(['status', '--verbose', '--no-input']);
const result = await runBuiltCli(['status', '--verbose', '--no-input'], { cwd: tempDir });
expect(result.stdout).toMatch(/KTX status/);
if (result.stdout.includes('No project here yet.')) {

View file

@ -242,15 +242,6 @@ function buildEmbeddingsStatus(config: KtxProjectEmbeddingConfig, env: NodeJS.Pr
detail: 'disabled — semantic search will be skipped',
};
}
if (backend === 'deterministic') {
return {
backend,
model,
dimensions,
status: 'warn',
detail: 'deterministic — semantic search degraded (lexical/dictionary lanes still work)',
};
}
if (backend === 'openai') {
const ref = config.openai?.api_key;
const resolved = resolveRef(ref, env);
@ -645,7 +636,7 @@ function buildVerdict(
const reasons: string[] = [];
if (llm.status === 'warn') reasons.push('LLM credentials missing');
if (embeddings.status === 'warn') {
if (embeddings.backend === 'deterministic' || embeddings.backend === 'none') {
if (embeddings.backend === 'none') {
reasons.push('semantic search disabled');
} else {
reasons.push('embedding credentials missing');

View file

@ -165,7 +165,7 @@ async function writeHistoricSqlProject(project: KtxLocalProject): Promise<KtxLoc
' adapters:',
' - historic-sql',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',

View file

@ -284,7 +284,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - fake',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -389,9 +389,11 @@ describe('canonical local ingest', () => {
expect(result.result.failedWorkUnits).toEqual([]);
const db = new Database(join(project.projectDir, '.ktx', 'db.sqlite'), { readonly: true });
try {
expect(db.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key').all()).toEqual([
{ key: 'orders_context', summary: 'Orders source context', has_embedding: 1 },
]);
expect(
db
.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key')
.all(),
).toEqual([{ key: 'orders_context', summary: 'Orders source context', has_embedding: 0 }]);
} finally {
db.close();
}
@ -489,7 +491,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - historic-sql',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',
@ -572,7 +574,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - metabase',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -650,7 +652,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - metricflow',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',
@ -778,7 +780,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - looker',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',

View file

@ -57,7 +57,7 @@ describe('createLocalBundleIngestRuntime', () => {
' adapters:',
' - fake',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -303,7 +303,7 @@ describe('createLocalBundleIngestRuntime', () => {
' adapters:',
' - fake',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',

View file

@ -2,7 +2,7 @@ import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { createLocalKtxEmbeddingProviderFromConfig, KtxIngestEmbeddingPortAdapter } from '../llm/index.js';
import type { KtxEmbeddingPort } from '../core/embedding.js';
import { CandidateDedupService } from './context-candidates/candidate-dedup.service.js';
import { ContextEvidenceIndexService } from './context-evidence/context-evidence-index.service.js';
import { SqliteContextEvidenceStore } from './context-evidence/sqlite-context-evidence-store.js';
@ -43,16 +43,16 @@ describe('local ingest embedding providers with SQLite ingest stores', () => {
await rm(tempDir, { recursive: true, force: true });
});
function embeddings() {
const provider = createLocalKtxEmbeddingProviderFromConfig({
backend: 'deterministic',
dimensions: 8,
batchSize: 4,
});
if (!provider) {
throw new Error('deterministic local embedding provider was not created');
}
return new KtxIngestEmbeddingPortAdapter(provider);
function embeddings(): KtxEmbeddingPort {
return {
maxBatchSize: 4,
async computeEmbedding() {
return [1, 0, 0];
},
async computeEmbeddingsBulk(texts) {
return texts.map(() => [1, 0, 0]);
},
};
}
it('indexes and searches context evidence using a package-owned local embedding provider', async () => {

View file

@ -221,21 +221,15 @@ describe('local KTX embedding config', () => {
});
});
it('constructs deterministic embeddings from the default project config', () => {
it('returns null for the default disabled project embedding config', () => {
const createKtxEmbeddingProvider = vi.fn(() => ({}) as never);
const provider = createLocalKtxEmbeddingProviderFromConfig(
buildDefaultKtxProjectConfig().ingest.embeddings,
{ createKtxEmbeddingProvider },
);
expect(provider).not.toBeNull();
expect(createKtxEmbeddingProvider).toHaveBeenCalledWith(
expect.objectContaining({
backend: 'deterministic',
model: 'deterministic',
dimensions: 8,
}),
);
expect(provider).toBeNull();
expect(createKtxEmbeddingProvider).not.toHaveBeenCalled();
});
it('returns null when embeddings are disabled', () => {

View file

@ -184,26 +184,13 @@ export function resolveLocalKtxEmbeddingConfig(
}
return {
backend: config.backend,
model: config.model ?? 'deterministic',
model: config.model ?? 'text-embedding-3-small',
dimensions: config.dimensions,
openai,
batchSize: config.batchSize,
};
}
return {
backend: config.backend,
model: config.model ?? 'deterministic',
dimensions: config.dimensions,
...(config.sentenceTransformers
? {
sentenceTransformers: {
baseURL: config.sentenceTransformers.base_url,
pathPrefix: config.sentenceTransformers.pathPrefix,
},
}
: {}),
batchSize: config.batchSize,
};
throw new Error(`Unsupported KTX embedding backend: ${String((config as { backend?: string }).backend)}`);
}
export function createLocalKtxEmbeddingProviderFromConfig(

View file

@ -42,8 +42,7 @@ connections:
ingest: {
adapters: [],
embeddings: {
backend: 'deterministic',
model: 'deterministic',
backend: 'none',
dimensions: 8,
},
workUnits: {
@ -87,13 +86,10 @@ connections:
expect(serialized).not.toContain('project:');
expect(serialized).not.toContain('live-database');
expect(serialized).toContain(
' embeddings:\n backend: deterministic\n model: deterministic\n dimensions: 8',
);
expect(serialized).toContain(' embeddings:\n backend: none\n dimensions: 8');
expect(parsed.ingest.adapters).toEqual([]);
expect(parsed.ingest.embeddings).toEqual({
backend: 'deterministic',
model: 'deterministic',
backend: 'none',
dimensions: 8,
});
});
@ -404,8 +400,7 @@ scan:
expect(config).toEqual(buildDefaultKtxProjectConfig());
expect(config.ingest.embeddings).toEqual({
backend: 'deterministic',
model: 'deterministic',
backend: 'none',
dimensions: 8,
});
});

View file

@ -4,7 +4,7 @@ import * as z from 'zod';
import { connectionConfigSchema } from './driver-schemas.js';
const KTX_LLM_BACKENDS = ['none', 'anthropic', 'vertex', 'gateway', 'claude-code'] as const;
const KTX_EMBEDDING_BACKENDS = ['none', 'deterministic', 'openai', 'sentence-transformers'] as const;
const KTX_EMBEDDING_BACKENDS = ['none', 'openai', 'sentence-transformers'] as const;
const KTX_PROMPT_CACHE_TTLS = ['5m', '1h'] as const;
const KTX_ENRICHMENT_MODES = ['none', 'deterministic', 'llm'] as const;
const KTX_WORK_UNIT_FAILURE_MODES = ['abort', 'continue'] as const;
@ -80,9 +80,9 @@ const embeddingSchema = z
.strictObject({
backend: z
.enum(KTX_EMBEDDING_BACKENDS)
.default('deterministic')
.describe('Embedding backend. "deterministic" is a built-in hash-based vector for offline use; "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small"). Ignored by the "deterministic" backend.'),
.default('none')
.describe('Embedding backend. "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small").'),
dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'),
openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'),
sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'),
@ -108,7 +108,7 @@ const ingestSchema = z
.default([])
.describe('Ingest adapter identifiers to run (e.g. "metabase", "looker", "historic-sql"). Empty array means no adapters are run.'),
embeddings: embeddingSchema
.prefault({ backend: 'deterministic', model: 'deterministic' })
.prefault({ backend: 'none' })
.describe('Embedding configuration used when ingest adapters need to embed documents.'),
workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'),
})

View file

@ -58,13 +58,13 @@ describe('scan enrichment state', () => {
snapshot,
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
});
const second = computeKtxScanEnrichmentInputHash({
snapshot: { ...snapshot, metadata: {} },
mode: 'enriched',
detectRelationships: true,
providerIdentity: { llmModel: 'a', embeddingDimensions: 8, provider: 'deterministic' },
providerIdentity: { llmModel: 'a', provider: 'local-heuristic' },
});
const firstTable = snapshot.tables[0];
if (!firstTable) {
@ -74,7 +74,7 @@ describe('scan enrichment state', () => {
snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
});
expect(first).toMatch(/^[a-f0-9]{64}$/);
@ -87,7 +87,7 @@ describe('scan enrichment state', () => {
snapshot,
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8 },
providerIdentity: { provider: 'local-heuristic' },
});
await store.saveCompletedStage({

View file

@ -96,7 +96,6 @@ export type {
KtxStructuralSyncPlan,
} from './enrichment-types.js';
export type {
DeterministicLocalScanEnrichmentProviderOptions,
KtxLocalScanEnrichmentInput,
KtxLocalScanEnrichmentProviders,
KtxLocalScanEnrichmentResult,

View file

@ -17,11 +17,24 @@ import {
createKtxConnectorCapabilities,
type KtxQueryResult,
type KtxReadOnlyQueryInput,
type KtxEmbeddingPort,
type KtxScanConnector,
type KtxScanContext,
type KtxSchemaSnapshot,
} from './types.js';
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
return {
dimensions: options.dimensions,
maxBatchSize: options.maxBatchSize ?? 64,
async embedBatch(texts) {
return texts.map((_, textIndex) =>
Array.from({ length: options.dimensions }, (__, dimensionIndex) => textIndex + dimensionIndex),
);
},
};
}
const snapshot: KtxSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
@ -355,7 +368,7 @@ describe('local scan enrichment', () => {
});
it('honors scan relationship config when LLM proposals are disabled', async () => {
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
const providers = createDeterministicLocalScanEnrichmentProviders();
const generateObject = vi.fn();
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
@ -424,7 +437,7 @@ describe('local scan enrichment', () => {
detectRelationships: false,
connector: failingConnector,
context: { runId: 'scan-run-warnings', logger },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
providers: createDeterministicLocalScanEnrichmentProviders(),
});
const codes = result.warnings.map((warning) => warning.code);
@ -439,25 +452,24 @@ describe('local scan enrichment', () => {
expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
});
it('runs configured deterministic enrichment with descriptions and embeddings', async () => {
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-2' },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
providers: createDeterministicLocalScanEnrichmentProviders(),
});
expect(result.summary).toMatchObject({
dataDictionary: 'completed',
tableDescriptions: 'completed',
columnDescriptions: 'completed',
embeddings: 'completed',
embeddings: 'skipped',
deterministicRelationships: 'completed',
});
expect(result.embeddingUpdates).toHaveLength(3);
expect(result.embeddingUpdates[0]?.embedding).toHaveLength(6);
expect(result.embeddingUpdates).toEqual([]);
expect(result.snapshot).toEqual(snapshot);
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
});
@ -518,7 +530,7 @@ describe('local scan enrichment', () => {
mode: 'enriched',
connector: scanConnector,
context: { runId: 'scan-run-concurrent-descriptions' },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 }),
providers: createDeterministicLocalScanEnrichmentProviders(),
relationshipSettings: settings,
});
@ -542,7 +554,10 @@ describe('local scan enrichment', () => {
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-progress', progress },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
providers: {
...createDeterministicLocalScanEnrichmentProviders(),
embedding: fakeScanEmbedding({ dimensions: 6 }),
},
});
expect(events).toEqual(
@ -613,7 +628,7 @@ describe('local scan enrichment', () => {
...connector(),
introspect: vi.fn(async () => manyColumnSnapshot),
};
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
const embedBatch = vi.fn(async (texts: string[]) => {
if (texts.length > 2) {
throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
@ -644,7 +659,10 @@ describe('local scan enrichment', () => {
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
const stateStore = memoryEnrichmentStateStore();
const scanConnector = connector();
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
const providers = {
...createDeterministicLocalScanEnrichmentProviders(),
embedding: fakeScanEmbedding({ dimensions: 6 }),
};
const first = await runLocalScanEnrichment({
connectionId: 'warehouse',
@ -655,7 +673,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-1',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
const generateText = vi.spyOn(providers.llmRuntime, 'generateText');
@ -669,7 +687,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-1',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
@ -685,7 +703,10 @@ describe('local scan enrichment', () => {
it('does not reuse completed stages when the snapshot changes', async () => {
const stateStore = memoryEnrichmentStateStore();
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
const providers = {
...createDeterministicLocalScanEnrichmentProviders(),
embedding: fakeScanEmbedding({ dimensions: 6 }),
};
const scanConnector = connector();
await runLocalScanEnrichment({
@ -697,7 +718,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-hash',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
const firstTable = snapshot.tables[0];
@ -722,7 +743,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-hash',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
expect(result.state.resumedStages).toEqual([]);
@ -828,8 +849,8 @@ describe('local scan enrichment', () => {
},
);
expect(providers?.embedding.dimensions).toBe(1536);
expect(providers?.embedding.maxBatchSize).toBe(8);
expect(providers?.embedding?.dimensions).toBe(1536);
expect(providers?.embedding?.maxBatchSize).toBe(8);
expect(createKtxLlmProvider).toHaveBeenCalledWith(
expect.objectContaining({ backend: 'gateway', modelSlots: { default: 'provider/language-model' } }),
);

View file

@ -43,14 +43,9 @@ import type {
const DESCRIPTION_TABLE_CONCURRENCY = 6;
export interface DeterministicLocalScanEnrichmentProviderOptions {
embeddingDimensions?: number;
maxBatchSize?: number;
}
export interface KtxLocalScanEnrichmentProviders {
llmRuntime: KtxLlmRuntimePort;
embedding: KtxEmbeddingPort;
embedding?: KtxEmbeddingPort | null;
}
export interface KtxLocalScanEnrichmentInput {
@ -173,31 +168,9 @@ function providerlessEnrichedWarning(relationshipDetection: boolean): KtxScanWar
};
}
function hashEmbedding(text: string, dimensions: number): number[] {
const values = Array.from({ length: dimensions }, (_, index) => {
let hash = index + 17;
for (const char of text) {
hash = (hash * 31 + char.charCodeAt(0) + index) % 1009;
}
return Number(((hash % 200) / 100 - 1).toFixed(4));
});
return values;
}
export function createDeterministicLocalScanEnrichmentProviders(
options: DeterministicLocalScanEnrichmentProviderOptions = {},
): KtxLocalScanEnrichmentProviders {
const dimensions = options.embeddingDimensions ?? 8;
const maxBatchSize = options.maxBatchSize ?? 64;
export function createDeterministicLocalScanEnrichmentProviders(): KtxLocalScanEnrichmentProviders {
return {
llmRuntime: deterministicLlmRuntime(),
embedding: {
dimensions,
maxBatchSize,
async embedBatch(texts) {
return texts.map((text) => hashEmbedding(text, dimensions));
},
},
};
}
@ -370,7 +343,7 @@ async function generateDescriptions(input: {
async function buildEmbeddings(input: {
snapshot: KtxSchemaSnapshot;
providers: KtxLocalScanEnrichmentProviders;
embedding: KtxEmbeddingPort;
descriptions: KtxLocalScanEnrichmentResult['descriptionUpdates'];
progress?: KtxProgressPort;
}): Promise<{ updates: KtxEmbeddingUpdate[]; byColumnId: Map<string, number[]> }> {
@ -400,7 +373,7 @@ async function buildEmbeddings(input: {
}
const embeddings: number[][] = [];
const maxBatchSize = embeddingBatchSize(input.providers.embedding.maxBatchSize);
const maxBatchSize = embeddingBatchSize(input.embedding.maxBatchSize);
const embeddingTexts = texts.map((item) => item.text);
const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize);
if (batchCount === 0) {
@ -412,7 +385,7 @@ async function buildEmbeddings(input: {
transient: true,
});
const batch = embeddingTexts.slice(offset, offset + maxBatchSize);
const batchEmbeddings = await input.providers.embedding.embedBatch(batch);
const batchEmbeddings = await input.embedding.embedBatch(batch);
if (batchEmbeddings.length !== batch.length) {
throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`);
}
@ -560,34 +533,38 @@ export async function runLocalScanEnrichment(
warnings,
}),
});
const embeddingProgress = progress?.startPhase(0.2);
embeddingUpdates = await runEnrichmentStage({
stateStore: input.stateStore,
runId: input.context.runId,
connectionId: input.connectionId,
syncId,
mode: input.mode,
stage: 'embeddings',
inputHash,
now,
resumedStages: state.resumedStages,
completedStages: state.completedStages,
failedStages: state.failedStages,
compute: async () => {
const embeddings = await buildEmbeddings({
snapshot,
providers,
descriptions,
progress: embeddingProgress,
});
return embeddings.updates;
},
});
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
summary.dataDictionary = input.connector.sampleColumn ? 'completed' : 'skipped';
summary.tableDescriptions = 'completed';
summary.columnDescriptions = 'completed';
summary.embeddings = 'completed';
const embeddingProgress = progress?.startPhase(0.2);
const embedding = providers.embedding;
if (embedding) {
embeddingUpdates = await runEnrichmentStage({
stateStore: input.stateStore,
runId: input.context.runId,
connectionId: input.connectionId,
syncId,
mode: input.mode,
stage: 'embeddings',
inputHash,
now,
resumedStages: state.resumedStages,
completedStages: state.completedStages,
failedStages: state.failedStages,
compute: async () => {
const embeddings = await buildEmbeddings({
snapshot,
embedding,
descriptions,
progress: embeddingProgress,
});
return embeddings.updates;
},
});
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
summary.embeddings = 'completed';
}
}
let relationshipUpdate: KtxRelationshipUpdate | null = null;

View file

@ -3,19 +3,16 @@ import { createKtxEmbeddingProvider } from './embedding-provider.js';
import type { KtxEmbeddingConfig } from './types.js';
describe('createKtxEmbeddingProvider', () => {
it('creates deterministic embeddings with stable dimensions', async () => {
const provider = createKtxEmbeddingProvider({
backend: 'deterministic',
model: 'sha256',
dimensions: 6,
batchSize: 4,
});
it('rejects deterministic embeddings', () => {
const config = JSON.parse(
JSON.stringify({
backend: 'deterministic',
model: 'sha256',
dimensions: 6,
}),
) as KtxEmbeddingConfig;
await expect(provider.embed('Revenue policy')).resolves.toHaveLength(6);
await expect(provider.embed('Revenue policy')).resolves.toEqual(await provider.embed('Revenue policy'));
await expect(provider.embed('Revenue policy')).resolves.not.toEqual(await provider.embed('Approval policy'));
await expect(provider.embedMany(['a', 'b'])).resolves.toHaveLength(2);
expect(provider.maxBatchSize).toBe(4);
expect(() => createKtxEmbeddingProvider(config)).toThrow('Unsupported KTX embedding backend: deterministic');
});
it('rejects gateway embeddings', () => {

View file

@ -1,4 +1,3 @@
import { createHash } from 'node:crypto';
import { spawn } from 'node:child_process';
import { join } from 'node:path';
import OpenAI from 'openai';
@ -33,14 +32,6 @@ export interface KtxEmbeddingProviderDeps {
const DEFAULT_BATCH_SIZE = 100;
function deterministicVector(text: string, dimensions: number): number[] {
const digest = createHash('sha256').update(text).digest();
return Array.from({ length: dimensions }, (_, index) => {
const byte = digest[index % digest.length];
return Number(((byte / 255) * 2 - 1).toFixed(6));
});
}
function assertNonEmptyText(text: string): void {
if (!text.trim()) {
throw new Error('Embedding text must be non-empty');
@ -184,24 +175,6 @@ function runSentenceTransformersProcessJson(options: {
};
}
class DeterministicEmbeddingProvider implements KtxEmbeddingProvider {
readonly maxBatchSize: number;
constructor(readonly dimensions: number, batchSize = DEFAULT_BATCH_SIZE) {
this.maxBatchSize = batchSize;
}
async embed(text: string): Promise<number[]> {
assertNonEmptyText(text);
return deterministicVector(text, this.dimensions);
}
async embedMany(texts: string[]): Promise<number[][]> {
assertBatchSize(texts, this.maxBatchSize);
return texts.map((text) => deterministicVector(text, this.dimensions));
}
}
class OpenAIEmbeddingProvider implements KtxEmbeddingProvider {
readonly dimensions: number;
readonly maxBatchSize: number;
@ -367,8 +340,6 @@ export function createKtxEmbeddingProvider(
deps: KtxEmbeddingProviderDeps = {},
): KtxEmbeddingProvider {
switch (config.backend) {
case 'deterministic':
return new DeterministicEmbeddingProvider(config.dimensions, config.batchSize);
case 'openai':
return new OpenAIEmbeddingProvider(config, deps);
case 'sentence-transformers':

View file

@ -62,7 +62,7 @@ export interface KtxLlmProvider {
activeBackend(): KtxLlmBackend;
}
export type KtxEmbeddingBackend = 'openai' | 'deterministic' | 'sentence-transformers';
export type KtxEmbeddingBackend = 'openai' | 'sentence-transformers';
export interface KtxEmbeddingTokenUsageEvent {
backend: KtxEmbeddingBackend;