From 6ee15018ad30075c21aa0c83795b26fedbccf2f8 Mon Sep 17 00:00:00 2001 From: Andrey Avtomonov Date: Tue, 19 May 2026 16:09:05 +0200 Subject: [PATCH] fix: remove deterministic embedding backend --- packages/cli/src/doctor.test.ts | 7 +- packages/cli/src/ingest-depth.ts | 1 - packages/cli/src/ingest.test-utils.ts | 4 +- packages/cli/src/ingest.test.ts | 4 +- packages/cli/src/setup-embeddings.test.ts | 6 +- packages/cli/src/setup-embeddings.ts | 1 - packages/cli/src/setup-models.test.ts | 6 +- packages/cli/src/setup.test.ts | 10 +- packages/cli/src/setup.ts | 1 - packages/cli/src/standalone-smoke.test.ts | 4 +- packages/cli/src/status-project.ts | 11 +-- .../local-ingest-acceptance.test.ts | 2 +- .../src/ingest/local-bundle-ingest.test.ts | 18 ++-- .../src/ingest/local-bundle-runtime.test.ts | 4 +- ...cal-embedding-provider.integration.test.ts | 22 ++--- packages/context/src/llm/local-config.test.ts | 12 +-- packages/context/src/llm/local-config.ts | 17 +--- packages/context/src/project/config.test.ts | 13 +-- packages/context/src/project/config.ts | 10 +- .../context/src/scan/enrichment-state.test.ts | 8 +- packages/context/src/scan/index.ts | 1 - .../context/src/scan/local-enrichment.test.ts | 57 ++++++++---- packages/context/src/scan/local-enrichment.ts | 91 +++++++------------ packages/llm/src/embedding-provider.test.ts | 21 ++--- packages/llm/src/embedding-provider.ts | 29 ------ packages/llm/src/types.ts | 2 +- 26 files changed, 144 insertions(+), 218 deletions(-) diff --git a/packages/cli/src/doctor.test.ts b/packages/cli/src/doctor.test.ts index f08e9d2d..2a397653 100644 --- a/packages/cli/src/doctor.test.ts +++ b/packages/cli/src/doctor.test.ts @@ -676,8 +676,7 @@ describe('runKtxDoctor', () => { ' adapters:', ' - live-database', ' embeddings:', - ' backend: deterministic', - ' model: deterministic', + ' backend: none', ' dimensions: 8', '', ].join('\n'), @@ -694,8 +693,8 @@ describe('runKtxDoctor', () => { ).resolves.toBe(0); expect(testIo.stdout()).toContain('Embeddings'); - expect(testIo.stdout()).toContain('deterministic'); - expect(testIo.stdout()).toContain('semantic search degraded'); + expect(testIo.stdout()).toContain('none'); + expect(testIo.stdout()).toContain('semantic search will be skipped'); delete process.env.ANTHROPIC_API_KEY; }); diff --git a/packages/cli/src/ingest-depth.ts b/packages/cli/src/ingest-depth.ts index f5706d8d..f8e5d06e 100644 --- a/packages/cli/src/ingest-depth.ts +++ b/packages/cli/src/ingest-depth.ts @@ -62,7 +62,6 @@ export function deepReadinessGaps(config: KtxProjectConfig): string[] { if ( !embeddings || embeddings.backend === 'none' || - embeddings.backend === 'deterministic' || !embeddings.model || embeddings.dimensions <= 0 ) { diff --git a/packages/cli/src/ingest.test-utils.ts b/packages/cli/src/ingest.test-utils.ts index 8b18716f..d64dd7e1 100644 --- a/packages/cli/src/ingest.test-utils.ts +++ b/packages/cli/src/ingest.test-utils.ts @@ -133,7 +133,7 @@ export async function writeMetabaseConfig(projectDir: string): Promise { ' adapters:', ' - metabase', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', @@ -502,7 +502,7 @@ export async function runPublicMetabaseSyncModeCase(tempDir: string, input: Sync ' adapters:', ' - metabase', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', diff --git a/packages/cli/src/ingest.test.ts b/packages/cli/src/ingest.test.ts index ab7c717b..d2620caa 100644 --- a/packages/cli/src/ingest.test.ts +++ b/packages/cli/src/ingest.test.ts @@ -777,7 +777,7 @@ describe('runKtxIngest', () => { ' adapters:', ' - metabase', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', @@ -1845,7 +1845,7 @@ describe('runKtxIngest', () => { ' adapters:', ' - looker', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', diff --git a/packages/cli/src/setup-embeddings.test.ts b/packages/cli/src/setup-embeddings.test.ts index 12ab947f..8d9ca0bc 100644 --- a/packages/cli/src/setup-embeddings.test.ts +++ b/packages/cli/src/setup-embeddings.test.ts @@ -324,7 +324,7 @@ describe('setup embeddings step', () => { expect(result.status).toBe('failed'); const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')); expect(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')).not.toContain('completed_steps:'); - expect(config.ingest.embeddings.backend).toBe('deterministic'); + expect(config.ingest.embeddings.backend).toBe('none'); expect(io.stderr()).toContain('Local embedding health check failed: 401 invalid api key [redacted]'); expect(io.stderr()).toContain('Prepare the runtime with: ktx dev runtime start --feature local-embeddings'); expect(io.stderr()).not.toContain('skip for now'); @@ -436,7 +436,7 @@ describe('setup embeddings step', () => { expect(result.status).toBe('skipped'); const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')); expect(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')).not.toContain('completed_steps:'); - expect(config.ingest.embeddings.backend).toBe('deterministic'); + expect(config.ingest.embeddings.backend).toBe('none'); }); it('returns back without writing config when the local health check fails and Back is selected', async () => { @@ -460,7 +460,7 @@ describe('setup embeddings step', () => { expect(result.status).toBe('back'); const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')); - expect(config.ingest.embeddings.backend).toBe('deterministic'); + expect(config.ingest.embeddings.backend).toBe('none'); }); it('preserves already completed embeddings setup when no embedding args request changes', async () => { diff --git a/packages/cli/src/setup-embeddings.ts b/packages/cli/src/setup-embeddings.ts index 3e2e47db..1f3c73ae 100644 --- a/packages/cli/src/setup-embeddings.ts +++ b/packages/cli/src/setup-embeddings.ts @@ -94,7 +94,6 @@ async function hasCompletedEmbeddings(projectDir: string, config: KtxProjectConf return ( (await readKtxSetupState(projectDir)).completed_steps.includes('embeddings') && config.ingest.embeddings.backend !== 'none' && - config.ingest.embeddings.backend !== 'deterministic' && typeof config.ingest.embeddings.model === 'string' && config.ingest.embeddings.model.length > 0 && config.ingest.embeddings.dimensions > 0 diff --git a/packages/cli/src/setup-models.test.ts b/packages/cli/src/setup-models.test.ts index 4c25d092..dafbbe72 100644 --- a/packages/cli/src/setup-models.test.ts +++ b/packages/cli/src/setup-models.test.ts @@ -1166,8 +1166,7 @@ describe('setup Anthropic model step', () => { ' default: claude-sonnet-4-6', 'ingest:', ' embeddings:', - ' backend: deterministic', - ' model: deterministic', + ' backend: none', ' dimensions: 8', ].join('\n'), 'utf-8', @@ -1209,8 +1208,7 @@ describe('setup Anthropic model step', () => { ` default: ${fixture.model}`, 'ingest:', ' embeddings:', - ' backend: deterministic', - ' model: deterministic', + ' backend: none', ' dimensions: 8', ].join('\n'), 'utf-8', diff --git a/packages/cli/src/setup.test.ts b/packages/cli/src/setup.test.ts index 3d0a44a5..60118207 100644 --- a/packages/cli/src/setup.test.ts +++ b/packages/cli/src/setup.test.ts @@ -108,7 +108,7 @@ describe('setup status', () => { }); }); - it('reports deterministic default embeddings as not setup-ready', async () => { + it('reports disabled default embeddings as not setup-ready', async () => { await mkdir(tempDir, { recursive: true }); await writeFile( join(tempDir, 'ktx.yaml'), @@ -122,8 +122,7 @@ describe('setup status', () => { ' default: claude-sonnet-4-6', 'ingest:', ' embeddings:', - ' backend: deterministic', - ' model: deterministic', + ' backend: none', ' dimensions: 8', 'connections: {}', ].join('\n'), @@ -133,7 +132,7 @@ describe('setup status', () => { await expect(readKtxSetupStatus(tempDir)).resolves.toMatchObject({ project: { path: tempDir, ready: true }, llm: { backend: 'anthropic', ready: true, model: 'claude-sonnet-4-6' }, - embeddings: { backend: 'deterministic', ready: false, model: 'deterministic', dimensions: 8 }, + embeddings: { backend: 'none', ready: false, dimensions: 8 }, }); }); @@ -373,8 +372,7 @@ describe('setup status', () => { ' default: claude-sonnet-4-6', 'ingest:', ' embeddings:', - ' backend: deterministic', - ' model: deterministic', + ' backend: none', ' dimensions: 8', '', ].join('\n'), diff --git a/packages/cli/src/setup.ts b/packages/cli/src/setup.ts index 02a81771..b06c9198 100644 --- a/packages/cli/src/setup.ts +++ b/packages/cli/src/setup.ts @@ -238,7 +238,6 @@ function embeddingsReady(status: KtxSetupStatus['embeddings']): boolean { return ( status.backend !== undefined && status.backend !== 'none' && - status.backend !== 'deterministic' && typeof status.model === 'string' && status.model.length > 0 && typeof status.dimensions === 'number' && diff --git a/packages/cli/src/standalone-smoke.test.ts b/packages/cli/src/standalone-smoke.test.ts index d63be434..817879ca 100644 --- a/packages/cli/src/standalone-smoke.test.ts +++ b/packages/cli/src/standalone-smoke.test.ts @@ -93,7 +93,7 @@ async function writeSqliteScanConfig(projectDir: string, dbPath: string, enrich ' enrichment:', ' mode: deterministic', ' embeddings:', - ' backend: deterministic', + ' backend: none', ' dimensions: 6', ] : []), @@ -166,7 +166,7 @@ describe('standalone built ktx CLI smoke', () => { }); it('runs status setup checks through the built binary', async () => { - const result = await runBuiltCli(['status', '--verbose', '--no-input']); + const result = await runBuiltCli(['status', '--verbose', '--no-input'], { cwd: tempDir }); expect(result.stdout).toMatch(/KTX status/); if (result.stdout.includes('No project here yet.')) { diff --git a/packages/cli/src/status-project.ts b/packages/cli/src/status-project.ts index 76d55851..297229de 100644 --- a/packages/cli/src/status-project.ts +++ b/packages/cli/src/status-project.ts @@ -242,15 +242,6 @@ function buildEmbeddingsStatus(config: KtxProjectEmbeddingConfig, env: NodeJS.Pr detail: 'disabled — semantic search will be skipped', }; } - if (backend === 'deterministic') { - return { - backend, - model, - dimensions, - status: 'warn', - detail: 'deterministic — semantic search degraded (lexical/dictionary lanes still work)', - }; - } if (backend === 'openai') { const ref = config.openai?.api_key; const resolved = resolveRef(ref, env); @@ -645,7 +636,7 @@ function buildVerdict( const reasons: string[] = []; if (llm.status === 'warn') reasons.push('LLM credentials missing'); if (embeddings.status === 'warn') { - if (embeddings.backend === 'deterministic' || embeddings.backend === 'none') { + if (embeddings.backend === 'none') { reasons.push('semantic search disabled'); } else { reasons.push('embedding credentials missing'); diff --git a/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts b/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts index 81fa9d30..19bcd6be 100644 --- a/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts @@ -165,7 +165,7 @@ async function writeHistoricSqlProject(project: KtxLocalProject): Promise { ' adapters:', ' - fake', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', @@ -389,9 +389,11 @@ describe('canonical local ingest', () => { expect(result.result.failedWorkUnits).toEqual([]); const db = new Database(join(project.projectDir, '.ktx', 'db.sqlite'), { readonly: true }); try { - expect(db.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key').all()).toEqual([ - { key: 'orders_context', summary: 'Orders source context', has_embedding: 1 }, - ]); + expect( + db + .prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key') + .all(), + ).toEqual([{ key: 'orders_context', summary: 'Orders source context', has_embedding: 0 }]); } finally { db.close(); } @@ -489,7 +491,7 @@ describe('canonical local ingest', () => { ' adapters:', ' - historic-sql', ' embeddings:', - ' backend: deterministic', + ' backend: none', 'storage:', ' state: sqlite', ' search: sqlite-fts5', @@ -572,7 +574,7 @@ describe('canonical local ingest', () => { ' adapters:', ' - metabase', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', @@ -650,7 +652,7 @@ describe('canonical local ingest', () => { ' adapters:', ' - metricflow', ' embeddings:', - ' backend: deterministic', + ' backend: none', 'storage:', ' state: sqlite', ' search: sqlite-fts5', @@ -778,7 +780,7 @@ describe('canonical local ingest', () => { ' adapters:', ' - looker', ' embeddings:', - ' backend: deterministic', + ' backend: none', 'storage:', ' state: sqlite', ' search: sqlite-fts5', diff --git a/packages/context/src/ingest/local-bundle-runtime.test.ts b/packages/context/src/ingest/local-bundle-runtime.test.ts index a8ec8c20..4997948b 100644 --- a/packages/context/src/ingest/local-bundle-runtime.test.ts +++ b/packages/context/src/ingest/local-bundle-runtime.test.ts @@ -57,7 +57,7 @@ describe('createLocalBundleIngestRuntime', () => { ' adapters:', ' - fake', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', @@ -303,7 +303,7 @@ describe('createLocalBundleIngestRuntime', () => { ' adapters:', ' - fake', ' embeddings:', - ' backend: deterministic', + ' backend: none', '', ].join('\n'), 'utf-8', diff --git a/packages/context/src/ingest/local-embedding-provider.integration.test.ts b/packages/context/src/ingest/local-embedding-provider.integration.test.ts index af14fea9..34114e88 100644 --- a/packages/context/src/ingest/local-embedding-provider.integration.test.ts +++ b/packages/context/src/ingest/local-embedding-provider.integration.test.ts @@ -2,7 +2,7 @@ import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { createLocalKtxEmbeddingProviderFromConfig, KtxIngestEmbeddingPortAdapter } from '../llm/index.js'; +import type { KtxEmbeddingPort } from '../core/embedding.js'; import { CandidateDedupService } from './context-candidates/candidate-dedup.service.js'; import { ContextEvidenceIndexService } from './context-evidence/context-evidence-index.service.js'; import { SqliteContextEvidenceStore } from './context-evidence/sqlite-context-evidence-store.js'; @@ -43,16 +43,16 @@ describe('local ingest embedding providers with SQLite ingest stores', () => { await rm(tempDir, { recursive: true, force: true }); }); - function embeddings() { - const provider = createLocalKtxEmbeddingProviderFromConfig({ - backend: 'deterministic', - dimensions: 8, - batchSize: 4, - }); - if (!provider) { - throw new Error('deterministic local embedding provider was not created'); - } - return new KtxIngestEmbeddingPortAdapter(provider); + function embeddings(): KtxEmbeddingPort { + return { + maxBatchSize: 4, + async computeEmbedding() { + return [1, 0, 0]; + }, + async computeEmbeddingsBulk(texts) { + return texts.map(() => [1, 0, 0]); + }, + }; } it('indexes and searches context evidence using a package-owned local embedding provider', async () => { diff --git a/packages/context/src/llm/local-config.test.ts b/packages/context/src/llm/local-config.test.ts index 539afe45..23487e36 100644 --- a/packages/context/src/llm/local-config.test.ts +++ b/packages/context/src/llm/local-config.test.ts @@ -221,21 +221,15 @@ describe('local KTX embedding config', () => { }); }); - it('constructs deterministic embeddings from the default project config', () => { + it('returns null for the default disabled project embedding config', () => { const createKtxEmbeddingProvider = vi.fn(() => ({}) as never); const provider = createLocalKtxEmbeddingProviderFromConfig( buildDefaultKtxProjectConfig().ingest.embeddings, { createKtxEmbeddingProvider }, ); - expect(provider).not.toBeNull(); - expect(createKtxEmbeddingProvider).toHaveBeenCalledWith( - expect.objectContaining({ - backend: 'deterministic', - model: 'deterministic', - dimensions: 8, - }), - ); + expect(provider).toBeNull(); + expect(createKtxEmbeddingProvider).not.toHaveBeenCalled(); }); it('returns null when embeddings are disabled', () => { diff --git a/packages/context/src/llm/local-config.ts b/packages/context/src/llm/local-config.ts index 4b04e99b..e63a5ed1 100644 --- a/packages/context/src/llm/local-config.ts +++ b/packages/context/src/llm/local-config.ts @@ -184,26 +184,13 @@ export function resolveLocalKtxEmbeddingConfig( } return { backend: config.backend, - model: config.model ?? 'deterministic', + model: config.model ?? 'text-embedding-3-small', dimensions: config.dimensions, openai, batchSize: config.batchSize, }; } - return { - backend: config.backend, - model: config.model ?? 'deterministic', - dimensions: config.dimensions, - ...(config.sentenceTransformers - ? { - sentenceTransformers: { - baseURL: config.sentenceTransformers.base_url, - pathPrefix: config.sentenceTransformers.pathPrefix, - }, - } - : {}), - batchSize: config.batchSize, - }; + throw new Error(`Unsupported KTX embedding backend: ${String((config as { backend?: string }).backend)}`); } export function createLocalKtxEmbeddingProviderFromConfig( diff --git a/packages/context/src/project/config.test.ts b/packages/context/src/project/config.test.ts index 3967b363..f8faad58 100644 --- a/packages/context/src/project/config.test.ts +++ b/packages/context/src/project/config.test.ts @@ -42,8 +42,7 @@ connections: ingest: { adapters: [], embeddings: { - backend: 'deterministic', - model: 'deterministic', + backend: 'none', dimensions: 8, }, workUnits: { @@ -87,13 +86,10 @@ connections: expect(serialized).not.toContain('project:'); expect(serialized).not.toContain('live-database'); - expect(serialized).toContain( - ' embeddings:\n backend: deterministic\n model: deterministic\n dimensions: 8', - ); + expect(serialized).toContain(' embeddings:\n backend: none\n dimensions: 8'); expect(parsed.ingest.adapters).toEqual([]); expect(parsed.ingest.embeddings).toEqual({ - backend: 'deterministic', - model: 'deterministic', + backend: 'none', dimensions: 8, }); }); @@ -404,8 +400,7 @@ scan: expect(config).toEqual(buildDefaultKtxProjectConfig()); expect(config.ingest.embeddings).toEqual({ - backend: 'deterministic', - model: 'deterministic', + backend: 'none', dimensions: 8, }); }); diff --git a/packages/context/src/project/config.ts b/packages/context/src/project/config.ts index 912c31de..b639c922 100644 --- a/packages/context/src/project/config.ts +++ b/packages/context/src/project/config.ts @@ -4,7 +4,7 @@ import * as z from 'zod'; import { connectionConfigSchema } from './driver-schemas.js'; const KTX_LLM_BACKENDS = ['none', 'anthropic', 'vertex', 'gateway', 'claude-code'] as const; -const KTX_EMBEDDING_BACKENDS = ['none', 'deterministic', 'openai', 'sentence-transformers'] as const; +const KTX_EMBEDDING_BACKENDS = ['none', 'openai', 'sentence-transformers'] as const; const KTX_PROMPT_CACHE_TTLS = ['5m', '1h'] as const; const KTX_ENRICHMENT_MODES = ['none', 'deterministic', 'llm'] as const; const KTX_WORK_UNIT_FAILURE_MODES = ['abort', 'continue'] as const; @@ -80,9 +80,9 @@ const embeddingSchema = z .strictObject({ backend: z .enum(KTX_EMBEDDING_BACKENDS) - .default('deterministic') - .describe('Embedding backend. "deterministic" is a built-in hash-based vector for offline use; "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'), - model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small"). Ignored by the "deterministic" backend.'), + .default('none') + .describe('Embedding backend. "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'), + model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small").'), dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'), openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'), sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'), @@ -108,7 +108,7 @@ const ingestSchema = z .default([]) .describe('Ingest adapter identifiers to run (e.g. "metabase", "looker", "historic-sql"). Empty array means no adapters are run.'), embeddings: embeddingSchema - .prefault({ backend: 'deterministic', model: 'deterministic' }) + .prefault({ backend: 'none' }) .describe('Embedding configuration used when ingest adapters need to embed documents.'), workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'), }) diff --git a/packages/context/src/scan/enrichment-state.test.ts b/packages/context/src/scan/enrichment-state.test.ts index 036db607..4ae597c6 100644 --- a/packages/context/src/scan/enrichment-state.test.ts +++ b/packages/context/src/scan/enrichment-state.test.ts @@ -58,13 +58,13 @@ describe('scan enrichment state', () => { snapshot, mode: 'enriched', detectRelationships: true, - providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' }, + providerIdentity: { provider: 'local-heuristic', llmModel: 'a' }, }); const second = computeKtxScanEnrichmentInputHash({ snapshot: { ...snapshot, metadata: {} }, mode: 'enriched', detectRelationships: true, - providerIdentity: { llmModel: 'a', embeddingDimensions: 8, provider: 'deterministic' }, + providerIdentity: { llmModel: 'a', provider: 'local-heuristic' }, }); const firstTable = snapshot.tables[0]; if (!firstTable) { @@ -74,7 +74,7 @@ describe('scan enrichment state', () => { snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] }, mode: 'enriched', detectRelationships: true, - providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' }, + providerIdentity: { provider: 'local-heuristic', llmModel: 'a' }, }); expect(first).toMatch(/^[a-f0-9]{64}$/); @@ -87,7 +87,7 @@ describe('scan enrichment state', () => { snapshot, mode: 'enriched', detectRelationships: true, - providerIdentity: { provider: 'deterministic', embeddingDimensions: 8 }, + providerIdentity: { provider: 'local-heuristic' }, }); await store.saveCompletedStage({ diff --git a/packages/context/src/scan/index.ts b/packages/context/src/scan/index.ts index 1eecdfeb..75e3b27a 100644 --- a/packages/context/src/scan/index.ts +++ b/packages/context/src/scan/index.ts @@ -96,7 +96,6 @@ export type { KtxStructuralSyncPlan, } from './enrichment-types.js'; export type { - DeterministicLocalScanEnrichmentProviderOptions, KtxLocalScanEnrichmentInput, KtxLocalScanEnrichmentProviders, KtxLocalScanEnrichmentResult, diff --git a/packages/context/src/scan/local-enrichment.test.ts b/packages/context/src/scan/local-enrichment.test.ts index 72307bc4..c45589bd 100644 --- a/packages/context/src/scan/local-enrichment.test.ts +++ b/packages/context/src/scan/local-enrichment.test.ts @@ -17,11 +17,24 @@ import { createKtxConnectorCapabilities, type KtxQueryResult, type KtxReadOnlyQueryInput, + type KtxEmbeddingPort, type KtxScanConnector, type KtxScanContext, type KtxSchemaSnapshot, } from './types.js'; +function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort { + return { + dimensions: options.dimensions, + maxBatchSize: options.maxBatchSize ?? 64, + async embedBatch(texts) { + return texts.map((_, textIndex) => + Array.from({ length: options.dimensions }, (__, dimensionIndex) => textIndex + dimensionIndex), + ); + }, + }; +} + const snapshot: KtxSchemaSnapshot = { connectionId: 'warehouse', driver: 'postgres', @@ -355,7 +368,7 @@ describe('local scan enrichment', () => { }); it('honors scan relationship config when LLM proposals are disabled', async () => { - const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 }); + const providers = createDeterministicLocalScanEnrichmentProviders(); const generateObject = vi.fn(); const result = await runLocalScanEnrichment({ connectionId: 'warehouse', @@ -424,7 +437,7 @@ describe('local scan enrichment', () => { detectRelationships: false, connector: failingConnector, context: { runId: 'scan-run-warnings', logger }, - providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }), + providers: createDeterministicLocalScanEnrichmentProviders(), }); const codes = result.warnings.map((warning) => warning.code); @@ -439,25 +452,24 @@ describe('local scan enrichment', () => { expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6); }); - it('runs configured deterministic enrichment with descriptions and embeddings', async () => { + it('runs configured deterministic enrichment with descriptions and no embeddings', async () => { const result = await runLocalScanEnrichment({ connectionId: 'warehouse', mode: 'enriched', detectRelationships: true, connector: connector(), context: { runId: 'scan-run-2' }, - providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }), + providers: createDeterministicLocalScanEnrichmentProviders(), }); expect(result.summary).toMatchObject({ dataDictionary: 'completed', tableDescriptions: 'completed', columnDescriptions: 'completed', - embeddings: 'completed', + embeddings: 'skipped', deterministicRelationships: 'completed', }); - expect(result.embeddingUpdates).toHaveLength(3); - expect(result.embeddingUpdates[0]?.embedding).toHaveLength(6); + expect(result.embeddingUpdates).toEqual([]); expect(result.snapshot).toEqual(snapshot); expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 }); }); @@ -518,7 +530,7 @@ describe('local scan enrichment', () => { mode: 'enriched', connector: scanConnector, context: { runId: 'scan-run-concurrent-descriptions' }, - providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 }), + providers: createDeterministicLocalScanEnrichmentProviders(), relationshipSettings: settings, }); @@ -542,7 +554,10 @@ describe('local scan enrichment', () => { detectRelationships: true, connector: connector(), context: { runId: 'scan-run-progress', progress }, - providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }), + providers: { + ...createDeterministicLocalScanEnrichmentProviders(), + embedding: fakeScanEmbedding({ dimensions: 6 }), + }, }); expect(events).toEqual( @@ -613,7 +628,7 @@ describe('local scan enrichment', () => { ...connector(), introspect: vi.fn(async () => manyColumnSnapshot), }; - const deterministicProviders = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 }); + const deterministicProviders = createDeterministicLocalScanEnrichmentProviders(); const embedBatch = vi.fn(async (texts: string[]) => { if (texts.length > 2) { throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`); @@ -644,7 +659,10 @@ describe('local scan enrichment', () => { it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => { const stateStore = memoryEnrichmentStateStore(); const scanConnector = connector(); - const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }); + const providers = { + ...createDeterministicLocalScanEnrichmentProviders(), + embedding: fakeScanEmbedding({ dimensions: 6 }), + }; const first = await runLocalScanEnrichment({ connectionId: 'warehouse', @@ -655,7 +673,7 @@ describe('local scan enrichment', () => { providers, stateStore, syncId: 'sync-resume-1', - providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 }, + providerIdentity: { provider: 'fake', embeddingDimensions: 6 }, }); const generateText = vi.spyOn(providers.llmRuntime, 'generateText'); @@ -669,7 +687,7 @@ describe('local scan enrichment', () => { providers, stateStore, syncId: 'sync-resume-1', - providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 }, + providerIdentity: { provider: 'fake', embeddingDimensions: 6 }, }); expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']); @@ -685,7 +703,10 @@ describe('local scan enrichment', () => { it('does not reuse completed stages when the snapshot changes', async () => { const stateStore = memoryEnrichmentStateStore(); - const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }); + const providers = { + ...createDeterministicLocalScanEnrichmentProviders(), + embedding: fakeScanEmbedding({ dimensions: 6 }), + }; const scanConnector = connector(); await runLocalScanEnrichment({ @@ -697,7 +718,7 @@ describe('local scan enrichment', () => { providers, stateStore, syncId: 'sync-resume-hash', - providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 }, + providerIdentity: { provider: 'fake', embeddingDimensions: 6 }, }); const firstTable = snapshot.tables[0]; @@ -722,7 +743,7 @@ describe('local scan enrichment', () => { providers, stateStore, syncId: 'sync-resume-hash', - providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 }, + providerIdentity: { provider: 'fake', embeddingDimensions: 6 }, }); expect(result.state.resumedStages).toEqual([]); @@ -828,8 +849,8 @@ describe('local scan enrichment', () => { }, ); - expect(providers?.embedding.dimensions).toBe(1536); - expect(providers?.embedding.maxBatchSize).toBe(8); + expect(providers?.embedding?.dimensions).toBe(1536); + expect(providers?.embedding?.maxBatchSize).toBe(8); expect(createKtxLlmProvider).toHaveBeenCalledWith( expect.objectContaining({ backend: 'gateway', modelSlots: { default: 'provider/language-model' } }), ); diff --git a/packages/context/src/scan/local-enrichment.ts b/packages/context/src/scan/local-enrichment.ts index 839b6fc9..381f3cb0 100644 --- a/packages/context/src/scan/local-enrichment.ts +++ b/packages/context/src/scan/local-enrichment.ts @@ -43,14 +43,9 @@ import type { const DESCRIPTION_TABLE_CONCURRENCY = 6; -export interface DeterministicLocalScanEnrichmentProviderOptions { - embeddingDimensions?: number; - maxBatchSize?: number; -} - export interface KtxLocalScanEnrichmentProviders { llmRuntime: KtxLlmRuntimePort; - embedding: KtxEmbeddingPort; + embedding?: KtxEmbeddingPort | null; } export interface KtxLocalScanEnrichmentInput { @@ -173,31 +168,9 @@ function providerlessEnrichedWarning(relationshipDetection: boolean): KtxScanWar }; } -function hashEmbedding(text: string, dimensions: number): number[] { - const values = Array.from({ length: dimensions }, (_, index) => { - let hash = index + 17; - for (const char of text) { - hash = (hash * 31 + char.charCodeAt(0) + index) % 1009; - } - return Number(((hash % 200) / 100 - 1).toFixed(4)); - }); - return values; -} - -export function createDeterministicLocalScanEnrichmentProviders( - options: DeterministicLocalScanEnrichmentProviderOptions = {}, -): KtxLocalScanEnrichmentProviders { - const dimensions = options.embeddingDimensions ?? 8; - const maxBatchSize = options.maxBatchSize ?? 64; +export function createDeterministicLocalScanEnrichmentProviders(): KtxLocalScanEnrichmentProviders { return { llmRuntime: deterministicLlmRuntime(), - embedding: { - dimensions, - maxBatchSize, - async embedBatch(texts) { - return texts.map((text) => hashEmbedding(text, dimensions)); - }, - }, }; } @@ -370,7 +343,7 @@ async function generateDescriptions(input: { async function buildEmbeddings(input: { snapshot: KtxSchemaSnapshot; - providers: KtxLocalScanEnrichmentProviders; + embedding: KtxEmbeddingPort; descriptions: KtxLocalScanEnrichmentResult['descriptionUpdates']; progress?: KtxProgressPort; }): Promise<{ updates: KtxEmbeddingUpdate[]; byColumnId: Map }> { @@ -400,7 +373,7 @@ async function buildEmbeddings(input: { } const embeddings: number[][] = []; - const maxBatchSize = embeddingBatchSize(input.providers.embedding.maxBatchSize); + const maxBatchSize = embeddingBatchSize(input.embedding.maxBatchSize); const embeddingTexts = texts.map((item) => item.text); const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize); if (batchCount === 0) { @@ -412,7 +385,7 @@ async function buildEmbeddings(input: { transient: true, }); const batch = embeddingTexts.slice(offset, offset + maxBatchSize); - const batchEmbeddings = await input.providers.embedding.embedBatch(batch); + const batchEmbeddings = await input.embedding.embedBatch(batch); if (batchEmbeddings.length !== batch.length) { throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`); } @@ -560,34 +533,38 @@ export async function runLocalScanEnrichment( warnings, }), }); - const embeddingProgress = progress?.startPhase(0.2); - embeddingUpdates = await runEnrichmentStage({ - stateStore: input.stateStore, - runId: input.context.runId, - connectionId: input.connectionId, - syncId, - mode: input.mode, - stage: 'embeddings', - inputHash, - now, - resumedStages: state.resumedStages, - completedStages: state.completedStages, - failedStages: state.failedStages, - compute: async () => { - const embeddings = await buildEmbeddings({ - snapshot, - providers, - descriptions, - progress: embeddingProgress, - }); - return embeddings.updates; - }, - }); - schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates)); summary.dataDictionary = input.connector.sampleColumn ? 'completed' : 'skipped'; summary.tableDescriptions = 'completed'; summary.columnDescriptions = 'completed'; - summary.embeddings = 'completed'; + + const embeddingProgress = progress?.startPhase(0.2); + const embedding = providers.embedding; + if (embedding) { + embeddingUpdates = await runEnrichmentStage({ + stateStore: input.stateStore, + runId: input.context.runId, + connectionId: input.connectionId, + syncId, + mode: input.mode, + stage: 'embeddings', + inputHash, + now, + resumedStages: state.resumedStages, + completedStages: state.completedStages, + failedStages: state.failedStages, + compute: async () => { + const embeddings = await buildEmbeddings({ + snapshot, + embedding, + descriptions, + progress: embeddingProgress, + }); + return embeddings.updates; + }, + }); + schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates)); + summary.embeddings = 'completed'; + } } let relationshipUpdate: KtxRelationshipUpdate | null = null; diff --git a/packages/llm/src/embedding-provider.test.ts b/packages/llm/src/embedding-provider.test.ts index 3a0a6f9a..41d11b1a 100644 --- a/packages/llm/src/embedding-provider.test.ts +++ b/packages/llm/src/embedding-provider.test.ts @@ -3,19 +3,16 @@ import { createKtxEmbeddingProvider } from './embedding-provider.js'; import type { KtxEmbeddingConfig } from './types.js'; describe('createKtxEmbeddingProvider', () => { - it('creates deterministic embeddings with stable dimensions', async () => { - const provider = createKtxEmbeddingProvider({ - backend: 'deterministic', - model: 'sha256', - dimensions: 6, - batchSize: 4, - }); + it('rejects deterministic embeddings', () => { + const config = JSON.parse( + JSON.stringify({ + backend: 'deterministic', + model: 'sha256', + dimensions: 6, + }), + ) as KtxEmbeddingConfig; - await expect(provider.embed('Revenue policy')).resolves.toHaveLength(6); - await expect(provider.embed('Revenue policy')).resolves.toEqual(await provider.embed('Revenue policy')); - await expect(provider.embed('Revenue policy')).resolves.not.toEqual(await provider.embed('Approval policy')); - await expect(provider.embedMany(['a', 'b'])).resolves.toHaveLength(2); - expect(provider.maxBatchSize).toBe(4); + expect(() => createKtxEmbeddingProvider(config)).toThrow('Unsupported KTX embedding backend: deterministic'); }); it('rejects gateway embeddings', () => { diff --git a/packages/llm/src/embedding-provider.ts b/packages/llm/src/embedding-provider.ts index fa16b561..d24e3749 100644 --- a/packages/llm/src/embedding-provider.ts +++ b/packages/llm/src/embedding-provider.ts @@ -1,4 +1,3 @@ -import { createHash } from 'node:crypto'; import { spawn } from 'node:child_process'; import { join } from 'node:path'; import OpenAI from 'openai'; @@ -33,14 +32,6 @@ export interface KtxEmbeddingProviderDeps { const DEFAULT_BATCH_SIZE = 100; -function deterministicVector(text: string, dimensions: number): number[] { - const digest = createHash('sha256').update(text).digest(); - return Array.from({ length: dimensions }, (_, index) => { - const byte = digest[index % digest.length]; - return Number(((byte / 255) * 2 - 1).toFixed(6)); - }); -} - function assertNonEmptyText(text: string): void { if (!text.trim()) { throw new Error('Embedding text must be non-empty'); @@ -184,24 +175,6 @@ function runSentenceTransformersProcessJson(options: { }; } -class DeterministicEmbeddingProvider implements KtxEmbeddingProvider { - readonly maxBatchSize: number; - - constructor(readonly dimensions: number, batchSize = DEFAULT_BATCH_SIZE) { - this.maxBatchSize = batchSize; - } - - async embed(text: string): Promise { - assertNonEmptyText(text); - return deterministicVector(text, this.dimensions); - } - - async embedMany(texts: string[]): Promise { - assertBatchSize(texts, this.maxBatchSize); - return texts.map((text) => deterministicVector(text, this.dimensions)); - } -} - class OpenAIEmbeddingProvider implements KtxEmbeddingProvider { readonly dimensions: number; readonly maxBatchSize: number; @@ -367,8 +340,6 @@ export function createKtxEmbeddingProvider( deps: KtxEmbeddingProviderDeps = {}, ): KtxEmbeddingProvider { switch (config.backend) { - case 'deterministic': - return new DeterministicEmbeddingProvider(config.dimensions, config.batchSize); case 'openai': return new OpenAIEmbeddingProvider(config, deps); case 'sentence-transformers': diff --git a/packages/llm/src/types.ts b/packages/llm/src/types.ts index b91aec25..bc928e08 100644 --- a/packages/llm/src/types.ts +++ b/packages/llm/src/types.ts @@ -62,7 +62,7 @@ export interface KtxLlmProvider { activeBackend(): KtxLlmBackend; } -export type KtxEmbeddingBackend = 'openai' | 'deterministic' | 'sentence-transformers'; +export type KtxEmbeddingBackend = 'openai' | 'sentence-transformers'; export interface KtxEmbeddingTokenUsageEvent { backend: KtxEmbeddingBackend;