fix: remove deterministic embedding backend (#146)

* fix: remove deterministic embedding backend

* test: update slow tests for disabled embeddings
This commit is contained in:
Andrey Avtomonov 2026-05-19 16:40:01 +02:00 committed by GitHub
parent e80f755a6c
commit 06aeb56f39
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 148 additions and 222 deletions

View file

@ -165,7 +165,7 @@ async function writeHistoricSqlProject(project: KtxLocalProject): Promise<KtxLoc
' adapters:',
' - historic-sql',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',

View file

@ -284,7 +284,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - fake',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -389,9 +389,11 @@ describe('canonical local ingest', () => {
expect(result.result.failedWorkUnits).toEqual([]);
const db = new Database(join(project.projectDir, '.ktx', 'db.sqlite'), { readonly: true });
try {
expect(db.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key').all()).toEqual([
{ key: 'orders_context', summary: 'Orders source context', has_embedding: 1 },
]);
expect(
db
.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key')
.all(),
).toEqual([{ key: 'orders_context', summary: 'Orders source context', has_embedding: 0 }]);
} finally {
db.close();
}
@ -489,7 +491,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - historic-sql',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',
@ -572,7 +574,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - metabase',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -650,7 +652,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - metricflow',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',
@ -778,7 +780,7 @@ describe('canonical local ingest', () => {
' adapters:',
' - looker',
' embeddings:',
' backend: deterministic',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',

View file

@ -57,7 +57,7 @@ describe('createLocalBundleIngestRuntime', () => {
' adapters:',
' - fake',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',
@ -303,7 +303,7 @@ describe('createLocalBundleIngestRuntime', () => {
' adapters:',
' - fake',
' embeddings:',
' backend: deterministic',
' backend: none',
'',
].join('\n'),
'utf-8',

View file

@ -2,7 +2,7 @@ import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { createLocalKtxEmbeddingProviderFromConfig, KtxIngestEmbeddingPortAdapter } from '../llm/index.js';
import type { KtxEmbeddingPort } from '../core/embedding.js';
import { CandidateDedupService } from './context-candidates/candidate-dedup.service.js';
import { ContextEvidenceIndexService } from './context-evidence/context-evidence-index.service.js';
import { SqliteContextEvidenceStore } from './context-evidence/sqlite-context-evidence-store.js';
@ -43,16 +43,16 @@ describe('local ingest embedding providers with SQLite ingest stores', () => {
await rm(tempDir, { recursive: true, force: true });
});
function embeddings() {
const provider = createLocalKtxEmbeddingProviderFromConfig({
backend: 'deterministic',
dimensions: 8,
batchSize: 4,
});
if (!provider) {
throw new Error('deterministic local embedding provider was not created');
}
return new KtxIngestEmbeddingPortAdapter(provider);
function embeddings(): KtxEmbeddingPort {
return {
maxBatchSize: 4,
async computeEmbedding() {
return [1, 0, 0];
},
async computeEmbeddingsBulk(texts) {
return texts.map(() => [1, 0, 0]);
},
};
}
it('indexes and searches context evidence using a package-owned local embedding provider', async () => {

View file

@ -221,21 +221,15 @@ describe('local KTX embedding config', () => {
});
});
it('constructs deterministic embeddings from the default project config', () => {
it('returns null for the default disabled project embedding config', () => {
const createKtxEmbeddingProvider = vi.fn(() => ({}) as never);
const provider = createLocalKtxEmbeddingProviderFromConfig(
buildDefaultKtxProjectConfig().ingest.embeddings,
{ createKtxEmbeddingProvider },
);
expect(provider).not.toBeNull();
expect(createKtxEmbeddingProvider).toHaveBeenCalledWith(
expect.objectContaining({
backend: 'deterministic',
model: 'deterministic',
dimensions: 8,
}),
);
expect(provider).toBeNull();
expect(createKtxEmbeddingProvider).not.toHaveBeenCalled();
});
it('returns null when embeddings are disabled', () => {

View file

@ -184,26 +184,13 @@ export function resolveLocalKtxEmbeddingConfig(
}
return {
backend: config.backend,
model: config.model ?? 'deterministic',
model: config.model ?? 'text-embedding-3-small',
dimensions: config.dimensions,
openai,
batchSize: config.batchSize,
};
}
return {
backend: config.backend,
model: config.model ?? 'deterministic',
dimensions: config.dimensions,
...(config.sentenceTransformers
? {
sentenceTransformers: {
baseURL: config.sentenceTransformers.base_url,
pathPrefix: config.sentenceTransformers.pathPrefix,
},
}
: {}),
batchSize: config.batchSize,
};
throw new Error(`Unsupported KTX embedding backend: ${String((config as { backend?: string }).backend)}`);
}
export function createLocalKtxEmbeddingProviderFromConfig(

View file

@ -42,8 +42,7 @@ connections:
ingest: {
adapters: [],
embeddings: {
backend: 'deterministic',
model: 'deterministic',
backend: 'none',
dimensions: 8,
},
workUnits: {
@ -87,13 +86,10 @@ connections:
expect(serialized).not.toContain('project:');
expect(serialized).not.toContain('live-database');
expect(serialized).toContain(
' embeddings:\n backend: deterministic\n model: deterministic\n dimensions: 8',
);
expect(serialized).toContain(' embeddings:\n backend: none\n dimensions: 8');
expect(parsed.ingest.adapters).toEqual([]);
expect(parsed.ingest.embeddings).toEqual({
backend: 'deterministic',
model: 'deterministic',
backend: 'none',
dimensions: 8,
});
});
@ -404,8 +400,7 @@ scan:
expect(config).toEqual(buildDefaultKtxProjectConfig());
expect(config.ingest.embeddings).toEqual({
backend: 'deterministic',
model: 'deterministic',
backend: 'none',
dimensions: 8,
});
});

View file

@ -4,7 +4,7 @@ import * as z from 'zod';
import { connectionConfigSchema } from './driver-schemas.js';
const KTX_LLM_BACKENDS = ['none', 'anthropic', 'vertex', 'gateway', 'claude-code'] as const;
const KTX_EMBEDDING_BACKENDS = ['none', 'deterministic', 'openai', 'sentence-transformers'] as const;
const KTX_EMBEDDING_BACKENDS = ['none', 'openai', 'sentence-transformers'] as const;
const KTX_PROMPT_CACHE_TTLS = ['5m', '1h'] as const;
const KTX_ENRICHMENT_MODES = ['none', 'deterministic', 'llm'] as const;
const KTX_WORK_UNIT_FAILURE_MODES = ['abort', 'continue'] as const;
@ -80,9 +80,9 @@ const embeddingSchema = z
.strictObject({
backend: z
.enum(KTX_EMBEDDING_BACKENDS)
.default('deterministic')
.describe('Embedding backend. "deterministic" is a built-in hash-based vector for offline use; "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small"). Ignored by the "deterministic" backend.'),
.default('none')
.describe('Embedding backend. "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small").'),
dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'),
openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'),
sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'),
@ -108,7 +108,7 @@ const ingestSchema = z
.default([])
.describe('Ingest adapter identifiers to run (e.g. "metabase", "looker", "historic-sql"). Empty array means no adapters are run.'),
embeddings: embeddingSchema
.prefault({ backend: 'deterministic', model: 'deterministic' })
.prefault({ backend: 'none' })
.describe('Embedding configuration used when ingest adapters need to embed documents.'),
workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'),
})

View file

@ -58,13 +58,13 @@ describe('scan enrichment state', () => {
snapshot,
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
});
const second = computeKtxScanEnrichmentInputHash({
snapshot: { ...snapshot, metadata: {} },
mode: 'enriched',
detectRelationships: true,
providerIdentity: { llmModel: 'a', embeddingDimensions: 8, provider: 'deterministic' },
providerIdentity: { llmModel: 'a', provider: 'local-heuristic' },
});
const firstTable = snapshot.tables[0];
if (!firstTable) {
@ -74,7 +74,7 @@ describe('scan enrichment state', () => {
snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
});
expect(first).toMatch(/^[a-f0-9]{64}$/);
@ -87,7 +87,7 @@ describe('scan enrichment state', () => {
snapshot,
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8 },
providerIdentity: { provider: 'local-heuristic' },
});
await store.saveCompletedStage({

View file

@ -96,7 +96,6 @@ export type {
KtxStructuralSyncPlan,
} from './enrichment-types.js';
export type {
DeterministicLocalScanEnrichmentProviderOptions,
KtxLocalScanEnrichmentInput,
KtxLocalScanEnrichmentProviders,
KtxLocalScanEnrichmentResult,

View file

@ -17,11 +17,24 @@ import {
createKtxConnectorCapabilities,
type KtxQueryResult,
type KtxReadOnlyQueryInput,
type KtxEmbeddingPort,
type KtxScanConnector,
type KtxScanContext,
type KtxSchemaSnapshot,
} from './types.js';
/**
 * Test double for the scan embedding port.
 *
 * Produces predictable vectors without calling any provider: the vector for
 * the text at position `i` of a batch is `[i, i + 1, ..., i + dimensions - 1]`.
 * The text content itself is ignored.
 *
 * @param options.dimensions Length of every returned vector; also exposed as `dimensions`.
 * @param options.maxBatchSize Advertised batch limit; falls back to 64 when omitted.
 * @returns An embedding port whose `embedBatch` resolves to one vector per input text.
 */
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
  // Vector for a given batch position: consecutive integers starting at that position.
  const vectorAt = (position: number): number[] =>
    Array.from({ length: options.dimensions }, (_unused, offset) => position + offset);

  return {
    dimensions: options.dimensions,
    maxBatchSize: options.maxBatchSize ?? 64,
    async embedBatch(texts) {
      return texts.map((_text, position) => vectorAt(position));
    },
  };
}
const snapshot: KtxSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
@ -355,7 +368,7 @@ describe('local scan enrichment', () => {
});
it('honors scan relationship config when LLM proposals are disabled', async () => {
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
const providers = createDeterministicLocalScanEnrichmentProviders();
const generateObject = vi.fn();
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
@ -424,7 +437,7 @@ describe('local scan enrichment', () => {
detectRelationships: false,
connector: failingConnector,
context: { runId: 'scan-run-warnings', logger },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
providers: createDeterministicLocalScanEnrichmentProviders(),
});
const codes = result.warnings.map((warning) => warning.code);
@ -439,25 +452,24 @@ describe('local scan enrichment', () => {
expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
});
it('runs configured deterministic enrichment with descriptions and embeddings', async () => {
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-2' },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
providers: createDeterministicLocalScanEnrichmentProviders(),
});
expect(result.summary).toMatchObject({
dataDictionary: 'completed',
tableDescriptions: 'completed',
columnDescriptions: 'completed',
embeddings: 'completed',
embeddings: 'skipped',
deterministicRelationships: 'completed',
});
expect(result.embeddingUpdates).toHaveLength(3);
expect(result.embeddingUpdates[0]?.embedding).toHaveLength(6);
expect(result.embeddingUpdates).toEqual([]);
expect(result.snapshot).toEqual(snapshot);
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
});
@ -518,7 +530,7 @@ describe('local scan enrichment', () => {
mode: 'enriched',
connector: scanConnector,
context: { runId: 'scan-run-concurrent-descriptions' },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 }),
providers: createDeterministicLocalScanEnrichmentProviders(),
relationshipSettings: settings,
});
@ -542,7 +554,10 @@ describe('local scan enrichment', () => {
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-progress', progress },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
providers: {
...createDeterministicLocalScanEnrichmentProviders(),
embedding: fakeScanEmbedding({ dimensions: 6 }),
},
});
expect(events).toEqual(
@ -613,7 +628,7 @@ describe('local scan enrichment', () => {
...connector(),
introspect: vi.fn(async () => manyColumnSnapshot),
};
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
const embedBatch = vi.fn(async (texts: string[]) => {
if (texts.length > 2) {
throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
@ -644,7 +659,10 @@ describe('local scan enrichment', () => {
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
const stateStore = memoryEnrichmentStateStore();
const scanConnector = connector();
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
const providers = {
...createDeterministicLocalScanEnrichmentProviders(),
embedding: fakeScanEmbedding({ dimensions: 6 }),
};
const first = await runLocalScanEnrichment({
connectionId: 'warehouse',
@ -655,7 +673,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-1',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
const generateText = vi.spyOn(providers.llmRuntime, 'generateText');
@ -669,7 +687,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-1',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
@ -685,7 +703,10 @@ describe('local scan enrichment', () => {
it('does not reuse completed stages when the snapshot changes', async () => {
const stateStore = memoryEnrichmentStateStore();
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
const providers = {
...createDeterministicLocalScanEnrichmentProviders(),
embedding: fakeScanEmbedding({ dimensions: 6 }),
};
const scanConnector = connector();
await runLocalScanEnrichment({
@ -697,7 +718,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-hash',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
const firstTable = snapshot.tables[0];
@ -722,7 +743,7 @@ describe('local scan enrichment', () => {
providers,
stateStore,
syncId: 'sync-resume-hash',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
});
expect(result.state.resumedStages).toEqual([]);
@ -828,8 +849,8 @@ describe('local scan enrichment', () => {
},
);
expect(providers?.embedding.dimensions).toBe(1536);
expect(providers?.embedding.maxBatchSize).toBe(8);
expect(providers?.embedding?.dimensions).toBe(1536);
expect(providers?.embedding?.maxBatchSize).toBe(8);
expect(createKtxLlmProvider).toHaveBeenCalledWith(
expect.objectContaining({ backend: 'gateway', modelSlots: { default: 'provider/language-model' } }),
);

View file

@ -43,14 +43,9 @@ import type {
const DESCRIPTION_TABLE_CONCURRENCY = 6;
/**
 * Options accepted by `createDeterministicLocalScanEnrichmentProviders` for
 * its built-in hash-based embedding port.
 */
export interface DeterministicLocalScanEnrichmentProviderOptions {
/** Length of each generated embedding vector; the factory falls back to 8 when omitted. */
embeddingDimensions?: number;
/** Batch limit advertised by the embedding port; the factory falls back to 64 when omitted. */
maxBatchSize?: number;
}
export interface KtxLocalScanEnrichmentProviders {
llmRuntime: KtxLlmRuntimePort;
embedding: KtxEmbeddingPort;
embedding?: KtxEmbeddingPort | null;
}
export interface KtxLocalScanEnrichmentInput {
@ -173,31 +168,9 @@ function providerlessEnrichedWarning(relationshipDetection: boolean): KtxScanWar
};
}
/**
 * Derives a repeatable pseudo-embedding from `text` with a small rolling hash.
 *
 * Each component `index` starts from the seed `index + 17` and folds in every
 * character of `text` (iterated by code point, using the first UTF-16 unit of
 * each) modulo 1009. The final hash is mapped to two-decimal steps in
 * [-1, 0.99].
 *
 * @param text Input string to hash; an empty string yields seed-only values.
 * @param dimensions Number of components to generate.
 * @returns A vector with `dimensions` entries.
 */
function hashEmbedding(text: string, dimensions: number): number[] {
  // Spread iterates by code point, matching a `for...of` over the string.
  const symbols = [...text];
  return Array.from({ length: dimensions }, (_unused, index) => {
    const rolled = symbols.reduce(
      (acc, symbol) => (acc * 31 + symbol.charCodeAt(0) + index) % 1009,
      index + 17,
    );
    // toFixed(4) strips floating-point noise left by the subtraction.
    return Number(((rolled % 200) / 100 - 1).toFixed(4));
  });
}
export function createDeterministicLocalScanEnrichmentProviders(
options: DeterministicLocalScanEnrichmentProviderOptions = {},
): KtxLocalScanEnrichmentProviders {
const dimensions = options.embeddingDimensions ?? 8;
const maxBatchSize = options.maxBatchSize ?? 64;
export function createDeterministicLocalScanEnrichmentProviders(): KtxLocalScanEnrichmentProviders {
return {
llmRuntime: deterministicLlmRuntime(),
embedding: {
dimensions,
maxBatchSize,
async embedBatch(texts) {
return texts.map((text) => hashEmbedding(text, dimensions));
},
},
};
}
@ -370,7 +343,7 @@ async function generateDescriptions(input: {
async function buildEmbeddings(input: {
snapshot: KtxSchemaSnapshot;
providers: KtxLocalScanEnrichmentProviders;
embedding: KtxEmbeddingPort;
descriptions: KtxLocalScanEnrichmentResult['descriptionUpdates'];
progress?: KtxProgressPort;
}): Promise<{ updates: KtxEmbeddingUpdate[]; byColumnId: Map<string, number[]> }> {
@ -400,7 +373,7 @@ async function buildEmbeddings(input: {
}
const embeddings: number[][] = [];
const maxBatchSize = embeddingBatchSize(input.providers.embedding.maxBatchSize);
const maxBatchSize = embeddingBatchSize(input.embedding.maxBatchSize);
const embeddingTexts = texts.map((item) => item.text);
const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize);
if (batchCount === 0) {
@ -412,7 +385,7 @@ async function buildEmbeddings(input: {
transient: true,
});
const batch = embeddingTexts.slice(offset, offset + maxBatchSize);
const batchEmbeddings = await input.providers.embedding.embedBatch(batch);
const batchEmbeddings = await input.embedding.embedBatch(batch);
if (batchEmbeddings.length !== batch.length) {
throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`);
}
@ -560,34 +533,38 @@ export async function runLocalScanEnrichment(
warnings,
}),
});
const embeddingProgress = progress?.startPhase(0.2);
embeddingUpdates = await runEnrichmentStage({
stateStore: input.stateStore,
runId: input.context.runId,
connectionId: input.connectionId,
syncId,
mode: input.mode,
stage: 'embeddings',
inputHash,
now,
resumedStages: state.resumedStages,
completedStages: state.completedStages,
failedStages: state.failedStages,
compute: async () => {
const embeddings = await buildEmbeddings({
snapshot,
providers,
descriptions,
progress: embeddingProgress,
});
return embeddings.updates;
},
});
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
summary.dataDictionary = input.connector.sampleColumn ? 'completed' : 'skipped';
summary.tableDescriptions = 'completed';
summary.columnDescriptions = 'completed';
summary.embeddings = 'completed';
const embeddingProgress = progress?.startPhase(0.2);
const embedding = providers.embedding;
if (embedding) {
embeddingUpdates = await runEnrichmentStage({
stateStore: input.stateStore,
runId: input.context.runId,
connectionId: input.connectionId,
syncId,
mode: input.mode,
stage: 'embeddings',
inputHash,
now,
resumedStages: state.resumedStages,
completedStages: state.completedStages,
failedStages: state.failedStages,
compute: async () => {
const embeddings = await buildEmbeddings({
snapshot,
embedding,
descriptions,
progress: embeddingProgress,
});
return embeddings.updates;
},
});
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
summary.embeddings = 'completed';
}
}
let relationshipUpdate: KtxRelationshipUpdate | null = null;

View file

@ -1017,7 +1017,7 @@ describe('local scan', () => {
expect(persistedReport).not.toContain('postgres://reader:secret@example.test/db'); // pragma: allowlist secret
});
it('runs enriched scans when deterministic standalone enrichment is configured', async () => {
it('runs enriched scans when deterministic standalone enrichment is configured without embeddings', async () => {
await writeFile(
join(project.projectDir, 'ktx.yaml'),
[
@ -1103,10 +1103,9 @@ describe('local scan', () => {
expect(result.report.mode).toBe('enriched');
expect(result.report.enrichment.tableDescriptions).toBe('completed');
expect(result.report.enrichment.columnDescriptions).toBe('completed');
expect(result.report.enrichment.embeddings).toBe('completed');
expect(result.report.enrichment.embeddings).toBe('skipped');
expect(result.report.artifactPaths.enrichmentArtifacts).toEqual([
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/descriptions.json',
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/embeddings.json',
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/relationships.json',
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/relationship-profile.json',
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/relationship-diagnostics.json',