mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-07 07:55:13 +02:00
fix: remove deterministic embedding backend
This commit is contained in:
parent
b4c77c0563
commit
6ee15018ad
26 changed files with 144 additions and 218 deletions
|
|
@ -676,8 +676,7 @@ describe('runKtxDoctor', () => {
|
|||
' adapters:',
|
||||
' - live-database',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' model: deterministic',
|
||||
' backend: none',
|
||||
' dimensions: 8',
|
||||
'',
|
||||
].join('\n'),
|
||||
|
|
@ -694,8 +693,8 @@ describe('runKtxDoctor', () => {
|
|||
).resolves.toBe(0);
|
||||
|
||||
expect(testIo.stdout()).toContain('Embeddings');
|
||||
expect(testIo.stdout()).toContain('deterministic');
|
||||
expect(testIo.stdout()).toContain('semantic search degraded');
|
||||
expect(testIo.stdout()).toContain('none');
|
||||
expect(testIo.stdout()).toContain('semantic search will be skipped');
|
||||
delete process.env.ANTHROPIC_API_KEY;
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -62,7 +62,6 @@ export function deepReadinessGaps(config: KtxProjectConfig): string[] {
|
|||
if (
|
||||
!embeddings ||
|
||||
embeddings.backend === 'none' ||
|
||||
embeddings.backend === 'deterministic' ||
|
||||
!embeddings.model ||
|
||||
embeddings.dimensions <= 0
|
||||
) {
|
||||
|
|
|
|||
|
|
@ -133,7 +133,7 @@ export async function writeMetabaseConfig(projectDir: string): Promise<void> {
|
|||
' adapters:',
|
||||
' - metabase',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
@ -502,7 +502,7 @@ export async function runPublicMetabaseSyncModeCase(tempDir: string, input: Sync
|
|||
' adapters:',
|
||||
' - metabase',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
|
|||
|
|
@ -777,7 +777,7 @@ describe('runKtxIngest', () => {
|
|||
' adapters:',
|
||||
' - metabase',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
@ -1845,7 +1845,7 @@ describe('runKtxIngest', () => {
|
|||
' adapters:',
|
||||
' - looker',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
|
|||
|
|
@ -324,7 +324,7 @@ describe('setup embeddings step', () => {
|
|||
expect(result.status).toBe('failed');
|
||||
const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'));
|
||||
expect(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')).not.toContain('completed_steps:');
|
||||
expect(config.ingest.embeddings.backend).toBe('deterministic');
|
||||
expect(config.ingest.embeddings.backend).toBe('none');
|
||||
expect(io.stderr()).toContain('Local embedding health check failed: 401 invalid api key [redacted]');
|
||||
expect(io.stderr()).toContain('Prepare the runtime with: ktx dev runtime start --feature local-embeddings');
|
||||
expect(io.stderr()).not.toContain('skip for now');
|
||||
|
|
@ -436,7 +436,7 @@ describe('setup embeddings step', () => {
|
|||
expect(result.status).toBe('skipped');
|
||||
const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'));
|
||||
expect(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')).not.toContain('completed_steps:');
|
||||
expect(config.ingest.embeddings.backend).toBe('deterministic');
|
||||
expect(config.ingest.embeddings.backend).toBe('none');
|
||||
});
|
||||
|
||||
it('returns back without writing config when the local health check fails and Back is selected', async () => {
|
||||
|
|
@ -460,7 +460,7 @@ describe('setup embeddings step', () => {
|
|||
|
||||
expect(result.status).toBe('back');
|
||||
const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8'));
|
||||
expect(config.ingest.embeddings.backend).toBe('deterministic');
|
||||
expect(config.ingest.embeddings.backend).toBe('none');
|
||||
});
|
||||
|
||||
it('preserves already completed embeddings setup when no embedding args request changes', async () => {
|
||||
|
|
|
|||
|
|
@ -94,7 +94,6 @@ async function hasCompletedEmbeddings(projectDir: string, config: KtxProjectConf
|
|||
return (
|
||||
(await readKtxSetupState(projectDir)).completed_steps.includes('embeddings') &&
|
||||
config.ingest.embeddings.backend !== 'none' &&
|
||||
config.ingest.embeddings.backend !== 'deterministic' &&
|
||||
typeof config.ingest.embeddings.model === 'string' &&
|
||||
config.ingest.embeddings.model.length > 0 &&
|
||||
config.ingest.embeddings.dimensions > 0
|
||||
|
|
|
|||
|
|
@ -1166,8 +1166,7 @@ describe('setup Anthropic model step', () => {
|
|||
' default: claude-sonnet-4-6',
|
||||
'ingest:',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' model: deterministic',
|
||||
' backend: none',
|
||||
' dimensions: 8',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
@ -1209,8 +1208,7 @@ describe('setup Anthropic model step', () => {
|
|||
` default: ${fixture.model}`,
|
||||
'ingest:',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' model: deterministic',
|
||||
' backend: none',
|
||||
' dimensions: 8',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ describe('setup status', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('reports deterministic default embeddings as not setup-ready', async () => {
|
||||
it('reports disabled default embeddings as not setup-ready', async () => {
|
||||
await mkdir(tempDir, { recursive: true });
|
||||
await writeFile(
|
||||
join(tempDir, 'ktx.yaml'),
|
||||
|
|
@ -122,8 +122,7 @@ describe('setup status', () => {
|
|||
' default: claude-sonnet-4-6',
|
||||
'ingest:',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' model: deterministic',
|
||||
' backend: none',
|
||||
' dimensions: 8',
|
||||
'connections: {}',
|
||||
].join('\n'),
|
||||
|
|
@ -133,7 +132,7 @@ describe('setup status', () => {
|
|||
await expect(readKtxSetupStatus(tempDir)).resolves.toMatchObject({
|
||||
project: { path: tempDir, ready: true },
|
||||
llm: { backend: 'anthropic', ready: true, model: 'claude-sonnet-4-6' },
|
||||
embeddings: { backend: 'deterministic', ready: false, model: 'deterministic', dimensions: 8 },
|
||||
embeddings: { backend: 'none', ready: false, dimensions: 8 },
|
||||
});
|
||||
});
|
||||
|
||||
|
|
@ -373,8 +372,7 @@ describe('setup status', () => {
|
|||
' default: claude-sonnet-4-6',
|
||||
'ingest:',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' model: deterministic',
|
||||
' backend: none',
|
||||
' dimensions: 8',
|
||||
'',
|
||||
].join('\n'),
|
||||
|
|
|
|||
|
|
@ -238,7 +238,6 @@ function embeddingsReady(status: KtxSetupStatus['embeddings']): boolean {
|
|||
return (
|
||||
status.backend !== undefined &&
|
||||
status.backend !== 'none' &&
|
||||
status.backend !== 'deterministic' &&
|
||||
typeof status.model === 'string' &&
|
||||
status.model.length > 0 &&
|
||||
typeof status.dimensions === 'number' &&
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ async function writeSqliteScanConfig(projectDir: string, dbPath: string, enrich
|
|||
' enrichment:',
|
||||
' mode: deterministic',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
' dimensions: 6',
|
||||
]
|
||||
: []),
|
||||
|
|
@ -166,7 +166,7 @@ describe('standalone built ktx CLI smoke', () => {
|
|||
});
|
||||
|
||||
it('runs status setup checks through the built binary', async () => {
|
||||
const result = await runBuiltCli(['status', '--verbose', '--no-input']);
|
||||
const result = await runBuiltCli(['status', '--verbose', '--no-input'], { cwd: tempDir });
|
||||
|
||||
expect(result.stdout).toMatch(/KTX status/);
|
||||
if (result.stdout.includes('No project here yet.')) {
|
||||
|
|
|
|||
|
|
@ -242,15 +242,6 @@ function buildEmbeddingsStatus(config: KtxProjectEmbeddingConfig, env: NodeJS.Pr
|
|||
detail: 'disabled — semantic search will be skipped',
|
||||
};
|
||||
}
|
||||
if (backend === 'deterministic') {
|
||||
return {
|
||||
backend,
|
||||
model,
|
||||
dimensions,
|
||||
status: 'warn',
|
||||
detail: 'deterministic — semantic search degraded (lexical/dictionary lanes still work)',
|
||||
};
|
||||
}
|
||||
if (backend === 'openai') {
|
||||
const ref = config.openai?.api_key;
|
||||
const resolved = resolveRef(ref, env);
|
||||
|
|
@ -645,7 +636,7 @@ function buildVerdict(
|
|||
const reasons: string[] = [];
|
||||
if (llm.status === 'warn') reasons.push('LLM credentials missing');
|
||||
if (embeddings.status === 'warn') {
|
||||
if (embeddings.backend === 'deterministic' || embeddings.backend === 'none') {
|
||||
if (embeddings.backend === 'none') {
|
||||
reasons.push('semantic search disabled');
|
||||
} else {
|
||||
reasons.push('embedding credentials missing');
|
||||
|
|
|
|||
|
|
@ -165,7 +165,7 @@ async function writeHistoricSqlProject(project: KtxLocalProject): Promise<KtxLoc
|
|||
' adapters:',
|
||||
' - historic-sql',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'storage:',
|
||||
' state: sqlite',
|
||||
' search: sqlite-fts5',
|
||||
|
|
|
|||
|
|
@ -284,7 +284,7 @@ describe('canonical local ingest', () => {
|
|||
' adapters:',
|
||||
' - fake',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
@ -389,9 +389,11 @@ describe('canonical local ingest', () => {
|
|||
expect(result.result.failedWorkUnits).toEqual([]);
|
||||
const db = new Database(join(project.projectDir, '.ktx', 'db.sqlite'), { readonly: true });
|
||||
try {
|
||||
expect(db.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key').all()).toEqual([
|
||||
{ key: 'orders_context', summary: 'Orders source context', has_embedding: 1 },
|
||||
]);
|
||||
expect(
|
||||
db
|
||||
.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key')
|
||||
.all(),
|
||||
).toEqual([{ key: 'orders_context', summary: 'Orders source context', has_embedding: 0 }]);
|
||||
} finally {
|
||||
db.close();
|
||||
}
|
||||
|
|
@ -489,7 +491,7 @@ describe('canonical local ingest', () => {
|
|||
' adapters:',
|
||||
' - historic-sql',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'storage:',
|
||||
' state: sqlite',
|
||||
' search: sqlite-fts5',
|
||||
|
|
@ -572,7 +574,7 @@ describe('canonical local ingest', () => {
|
|||
' adapters:',
|
||||
' - metabase',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
@ -650,7 +652,7 @@ describe('canonical local ingest', () => {
|
|||
' adapters:',
|
||||
' - metricflow',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'storage:',
|
||||
' state: sqlite',
|
||||
' search: sqlite-fts5',
|
||||
|
|
@ -778,7 +780,7 @@ describe('canonical local ingest', () => {
|
|||
' adapters:',
|
||||
' - looker',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'storage:',
|
||||
' state: sqlite',
|
||||
' search: sqlite-fts5',
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ describe('createLocalBundleIngestRuntime', () => {
|
|||
' adapters:',
|
||||
' - fake',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
@ -303,7 +303,7 @@ describe('createLocalBundleIngestRuntime', () => {
|
|||
' adapters:',
|
||||
' - fake',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
' backend: none',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
|
|||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { createLocalKtxEmbeddingProviderFromConfig, KtxIngestEmbeddingPortAdapter } from '../llm/index.js';
|
||||
import type { KtxEmbeddingPort } from '../core/embedding.js';
|
||||
import { CandidateDedupService } from './context-candidates/candidate-dedup.service.js';
|
||||
import { ContextEvidenceIndexService } from './context-evidence/context-evidence-index.service.js';
|
||||
import { SqliteContextEvidenceStore } from './context-evidence/sqlite-context-evidence-store.js';
|
||||
|
|
@ -43,16 +43,16 @@ describe('local ingest embedding providers with SQLite ingest stores', () => {
|
|||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
function embeddings() {
|
||||
const provider = createLocalKtxEmbeddingProviderFromConfig({
|
||||
backend: 'deterministic',
|
||||
dimensions: 8,
|
||||
batchSize: 4,
|
||||
});
|
||||
if (!provider) {
|
||||
throw new Error('deterministic local embedding provider was not created');
|
||||
}
|
||||
return new KtxIngestEmbeddingPortAdapter(provider);
|
||||
/**
 * Test stub that builds a fixed-output embedding port.
 *
 * Every call yields the same 3-element vector `[1, 0, 0]`, regardless of the
 * text passed in, and the port advertises a maximum batch size of 4.
 *
 * @returns An object typed as `KtxEmbeddingPort` whose single and bulk
 *   embedding methods both resolve to constant vectors.
 */
function embeddings(): KtxEmbeddingPort {
|
||||
return {
|
||||
maxBatchSize: 4,
|
||||
// Single-text variant: the input is ignored and a constant vector is returned.
async computeEmbedding() {
|
||||
return [1, 0, 0];
|
||||
},
|
||||
// Bulk variant: one constant vector per input text, so output length matches input length.
async computeEmbeddingsBulk(texts) {
|
||||
return texts.map(() => [1, 0, 0]);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
it('indexes and searches context evidence using a package-owned local embedding provider', async () => {
|
||||
|
|
|
|||
|
|
@ -221,21 +221,15 @@ describe('local KTX embedding config', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('constructs deterministic embeddings from the default project config', () => {
|
||||
it('returns null for the default disabled project embedding config', () => {
|
||||
const createKtxEmbeddingProvider = vi.fn(() => ({}) as never);
|
||||
const provider = createLocalKtxEmbeddingProviderFromConfig(
|
||||
buildDefaultKtxProjectConfig().ingest.embeddings,
|
||||
{ createKtxEmbeddingProvider },
|
||||
);
|
||||
|
||||
expect(provider).not.toBeNull();
|
||||
expect(createKtxEmbeddingProvider).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
backend: 'deterministic',
|
||||
model: 'deterministic',
|
||||
dimensions: 8,
|
||||
}),
|
||||
);
|
||||
expect(provider).toBeNull();
|
||||
expect(createKtxEmbeddingProvider).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('returns null when embeddings are disabled', () => {
|
||||
|
|
|
|||
|
|
@ -184,26 +184,13 @@ export function resolveLocalKtxEmbeddingConfig(
|
|||
}
|
||||
return {
|
||||
backend: config.backend,
|
||||
model: config.model ?? 'deterministic',
|
||||
model: config.model ?? 'text-embedding-3-small',
|
||||
dimensions: config.dimensions,
|
||||
openai,
|
||||
batchSize: config.batchSize,
|
||||
};
|
||||
}
|
||||
return {
|
||||
backend: config.backend,
|
||||
model: config.model ?? 'deterministic',
|
||||
dimensions: config.dimensions,
|
||||
...(config.sentenceTransformers
|
||||
? {
|
||||
sentenceTransformers: {
|
||||
baseURL: config.sentenceTransformers.base_url,
|
||||
pathPrefix: config.sentenceTransformers.pathPrefix,
|
||||
},
|
||||
}
|
||||
: {}),
|
||||
batchSize: config.batchSize,
|
||||
};
|
||||
throw new Error(`Unsupported KTX embedding backend: ${String((config as { backend?: string }).backend)}`);
|
||||
}
|
||||
|
||||
export function createLocalKtxEmbeddingProviderFromConfig(
|
||||
|
|
|
|||
|
|
@ -42,8 +42,7 @@ connections:
|
|||
ingest: {
|
||||
adapters: [],
|
||||
embeddings: {
|
||||
backend: 'deterministic',
|
||||
model: 'deterministic',
|
||||
backend: 'none',
|
||||
dimensions: 8,
|
||||
},
|
||||
workUnits: {
|
||||
|
|
@ -87,13 +86,10 @@ connections:
|
|||
|
||||
expect(serialized).not.toContain('project:');
|
||||
expect(serialized).not.toContain('live-database');
|
||||
expect(serialized).toContain(
|
||||
' embeddings:\n backend: deterministic\n model: deterministic\n dimensions: 8',
|
||||
);
|
||||
expect(serialized).toContain(' embeddings:\n backend: none\n dimensions: 8');
|
||||
expect(parsed.ingest.adapters).toEqual([]);
|
||||
expect(parsed.ingest.embeddings).toEqual({
|
||||
backend: 'deterministic',
|
||||
model: 'deterministic',
|
||||
backend: 'none',
|
||||
dimensions: 8,
|
||||
});
|
||||
});
|
||||
|
|
@ -404,8 +400,7 @@ scan:
|
|||
|
||||
expect(config).toEqual(buildDefaultKtxProjectConfig());
|
||||
expect(config.ingest.embeddings).toEqual({
|
||||
backend: 'deterministic',
|
||||
model: 'deterministic',
|
||||
backend: 'none',
|
||||
dimensions: 8,
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import * as z from 'zod';
|
|||
import { connectionConfigSchema } from './driver-schemas.js';
|
||||
|
||||
const KTX_LLM_BACKENDS = ['none', 'anthropic', 'vertex', 'gateway', 'claude-code'] as const;
|
||||
const KTX_EMBEDDING_BACKENDS = ['none', 'deterministic', 'openai', 'sentence-transformers'] as const;
|
||||
const KTX_EMBEDDING_BACKENDS = ['none', 'openai', 'sentence-transformers'] as const;
|
||||
const KTX_PROMPT_CACHE_TTLS = ['5m', '1h'] as const;
|
||||
const KTX_ENRICHMENT_MODES = ['none', 'deterministic', 'llm'] as const;
|
||||
const KTX_WORK_UNIT_FAILURE_MODES = ['abort', 'continue'] as const;
|
||||
|
|
@ -80,9 +80,9 @@ const embeddingSchema = z
|
|||
.strictObject({
|
||||
backend: z
|
||||
.enum(KTX_EMBEDDING_BACKENDS)
|
||||
.default('deterministic')
|
||||
.describe('Embedding backend. "deterministic" is a built-in hash-based vector for offline use; "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
|
||||
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small"). Ignored by the "deterministic" backend.'),
|
||||
.default('none')
|
||||
.describe('Embedding backend. "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
|
||||
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small").'),
|
||||
dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'),
|
||||
openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'),
|
||||
sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'),
|
||||
|
|
@ -108,7 +108,7 @@ const ingestSchema = z
|
|||
.default([])
|
||||
.describe('Ingest adapter identifiers to run (e.g. "metabase", "looker", "historic-sql"). Empty array means no adapters are run.'),
|
||||
embeddings: embeddingSchema
|
||||
.prefault({ backend: 'deterministic', model: 'deterministic' })
|
||||
.prefault({ backend: 'none' })
|
||||
.describe('Embedding configuration used when ingest adapters need to embed documents.'),
|
||||
workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'),
|
||||
})
|
||||
|
|
|
|||
|
|
@ -58,13 +58,13 @@ describe('scan enrichment state', () => {
|
|||
snapshot,
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
|
||||
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
|
||||
});
|
||||
const second = computeKtxScanEnrichmentInputHash({
|
||||
snapshot: { ...snapshot, metadata: {} },
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { llmModel: 'a', embeddingDimensions: 8, provider: 'deterministic' },
|
||||
providerIdentity: { llmModel: 'a', provider: 'local-heuristic' },
|
||||
});
|
||||
const firstTable = snapshot.tables[0];
|
||||
if (!firstTable) {
|
||||
|
|
@ -74,7 +74,7 @@ describe('scan enrichment state', () => {
|
|||
snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
|
||||
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
|
||||
});
|
||||
|
||||
expect(first).toMatch(/^[a-f0-9]{64}$/);
|
||||
|
|
@ -87,7 +87,7 @@ describe('scan enrichment state', () => {
|
|||
snapshot,
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8 },
|
||||
providerIdentity: { provider: 'local-heuristic' },
|
||||
});
|
||||
|
||||
await store.saveCompletedStage({
|
||||
|
|
|
|||
|
|
@ -96,7 +96,6 @@ export type {
|
|||
KtxStructuralSyncPlan,
|
||||
} from './enrichment-types.js';
|
||||
export type {
|
||||
DeterministicLocalScanEnrichmentProviderOptions,
|
||||
KtxLocalScanEnrichmentInput,
|
||||
KtxLocalScanEnrichmentProviders,
|
||||
KtxLocalScanEnrichmentResult,
|
||||
|
|
|
|||
|
|
@ -17,11 +17,24 @@ import {
|
|||
createKtxConnectorCapabilities,
|
||||
type KtxQueryResult,
|
||||
type KtxReadOnlyQueryInput,
|
||||
type KtxEmbeddingPort,
|
||||
type KtxScanConnector,
|
||||
type KtxScanContext,
|
||||
type KtxSchemaSnapshot,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Builds a fake scan embedding port for tests.
 *
 * The vectors depend only on each text's position in the batch, not on its
 * content: element `d` of the vector for the text at index `t` is `t + d`.
 *
 * @param options.dimensions Length of every generated vector; also reported
 *   as the port's `dimensions`.
 * @param options.maxBatchSize Batch size reported by the port; defaults to 64
 *   when omitted.
 * @returns An object typed as `KtxEmbeddingPort` exposing `dimensions`,
 *   `maxBatchSize` and `embedBatch`.
 */
function fakeScanEmbedding(options: { dimensions: number; maxBatchSize?: number }): KtxEmbeddingPort {
|
||||
return {
|
||||
dimensions: options.dimensions,
|
||||
maxBatchSize: options.maxBatchSize ?? 64,
|
||||
// Returns one vector per input text; the text itself is never read.
async embedBatch(texts) {
|
||||
return texts.map((_, textIndex) =>
|
||||
Array.from({ length: options.dimensions }, (__, dimensionIndex) => textIndex + dimensionIndex),
|
||||
);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const snapshot: KtxSchemaSnapshot = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
|
|
@ -355,7 +368,7 @@ describe('local scan enrichment', () => {
|
|||
});
|
||||
|
||||
it('honors scan relationship config when LLM proposals are disabled', async () => {
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders();
|
||||
const generateObject = vi.fn();
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
|
|
@ -424,7 +437,7 @@ describe('local scan enrichment', () => {
|
|||
detectRelationships: false,
|
||||
connector: failingConnector,
|
||||
context: { runId: 'scan-run-warnings', logger },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
});
|
||||
|
||||
const codes = result.warnings.map((warning) => warning.code);
|
||||
|
|
@ -439,25 +452,24 @@ describe('local scan enrichment', () => {
|
|||
expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
|
||||
});
|
||||
|
||||
it('runs configured deterministic enrichment with descriptions and embeddings', async () => {
|
||||
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-2' },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
});
|
||||
|
||||
expect(result.summary).toMatchObject({
|
||||
dataDictionary: 'completed',
|
||||
tableDescriptions: 'completed',
|
||||
columnDescriptions: 'completed',
|
||||
embeddings: 'completed',
|
||||
embeddings: 'skipped',
|
||||
deterministicRelationships: 'completed',
|
||||
});
|
||||
expect(result.embeddingUpdates).toHaveLength(3);
|
||||
expect(result.embeddingUpdates[0]?.embedding).toHaveLength(6);
|
||||
expect(result.embeddingUpdates).toEqual([]);
|
||||
expect(result.snapshot).toEqual(snapshot);
|
||||
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
||||
});
|
||||
|
|
@ -518,7 +530,7 @@ describe('local scan enrichment', () => {
|
|||
mode: 'enriched',
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-concurrent-descriptions' },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 }),
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
relationshipSettings: settings,
|
||||
});
|
||||
|
||||
|
|
@ -542,7 +554,10 @@ describe('local scan enrichment', () => {
|
|||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-progress', progress },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
|
||||
providers: {
|
||||
...createDeterministicLocalScanEnrichmentProviders(),
|
||||
embedding: fakeScanEmbedding({ dimensions: 6 }),
|
||||
},
|
||||
});
|
||||
|
||||
expect(events).toEqual(
|
||||
|
|
@ -613,7 +628,7 @@ describe('local scan enrichment', () => {
|
|||
...connector(),
|
||||
introspect: vi.fn(async () => manyColumnSnapshot),
|
||||
};
|
||||
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
|
||||
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
|
||||
const embedBatch = vi.fn(async (texts: string[]) => {
|
||||
if (texts.length > 2) {
|
||||
throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
|
||||
|
|
@ -644,7 +659,10 @@ describe('local scan enrichment', () => {
|
|||
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
|
||||
const stateStore = memoryEnrichmentStateStore();
|
||||
const scanConnector = connector();
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
|
||||
const providers = {
|
||||
...createDeterministicLocalScanEnrichmentProviders(),
|
||||
embedding: fakeScanEmbedding({ dimensions: 6 }),
|
||||
};
|
||||
|
||||
const first = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
|
|
@ -655,7 +673,7 @@ describe('local scan enrichment', () => {
|
|||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-1',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
const generateText = vi.spyOn(providers.llmRuntime, 'generateText');
|
||||
|
|
@ -669,7 +687,7 @@ describe('local scan enrichment', () => {
|
|||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-1',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||
|
|
@ -685,7 +703,10 @@ describe('local scan enrichment', () => {
|
|||
|
||||
it('does not reuse completed stages when the snapshot changes', async () => {
|
||||
const stateStore = memoryEnrichmentStateStore();
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
|
||||
const providers = {
|
||||
...createDeterministicLocalScanEnrichmentProviders(),
|
||||
embedding: fakeScanEmbedding({ dimensions: 6 }),
|
||||
};
|
||||
const scanConnector = connector();
|
||||
|
||||
await runLocalScanEnrichment({
|
||||
|
|
@ -697,7 +718,7 @@ describe('local scan enrichment', () => {
|
|||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-hash',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
const firstTable = snapshot.tables[0];
|
||||
|
|
@ -722,7 +743,7 @@ describe('local scan enrichment', () => {
|
|||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-hash',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
expect(result.state.resumedStages).toEqual([]);
|
||||
|
|
@ -828,8 +849,8 @@ describe('local scan enrichment', () => {
|
|||
},
|
||||
);
|
||||
|
||||
expect(providers?.embedding.dimensions).toBe(1536);
|
||||
expect(providers?.embedding.maxBatchSize).toBe(8);
|
||||
expect(providers?.embedding?.dimensions).toBe(1536);
|
||||
expect(providers?.embedding?.maxBatchSize).toBe(8);
|
||||
expect(createKtxLlmProvider).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ backend: 'gateway', modelSlots: { default: 'provider/language-model' } }),
|
||||
);
|
||||
|
|
|
|||
|
|
@ -43,14 +43,9 @@ import type {
|
|||
|
||||
const DESCRIPTION_TABLE_CONCURRENCY = 6;
|
||||
|
||||
export interface DeterministicLocalScanEnrichmentProviderOptions {
|
||||
embeddingDimensions?: number;
|
||||
maxBatchSize?: number;
|
||||
}
|
||||
|
||||
export interface KtxLocalScanEnrichmentProviders {
|
||||
llmRuntime: KtxLlmRuntimePort;
|
||||
embedding: KtxEmbeddingPort;
|
||||
embedding?: KtxEmbeddingPort | null;
|
||||
}
|
||||
|
||||
export interface KtxLocalScanEnrichmentInput {
|
||||
|
|
@ -173,31 +168,9 @@ function providerlessEnrichedWarning(relationshipDetection: boolean): KtxScanWar
|
|||
};
|
||||
}
|
||||
|
||||
/**
 * Derives a pseudo-embedding from `text` with a rolling hash.
 *
 * The same `text` and `dimensions` always produce the same vector; no
 * randomness or external state is involved.
 *
 * @param text Input string; each iterated character contributes
 *   `charCodeAt(0)` to the hash.
 * @param dimensions Number of components in the returned vector.
 * @returns An array of `dimensions` numbers, each in the range [-1, 0.99]
 *   and rounded to four decimal places.
 */
function hashEmbedding(text: string, dimensions: number): number[] {
|
||||
const values = Array.from({ length: dimensions }, (_, index) => {
|
||||
// Seed differs per component so each dimension gets its own hash stream.
let hash = index + 17;
|
||||
// NOTE(review): `for...of` iterates by code point while `charCodeAt(0)` reads a single
// UTF-16 unit, so for characters outside the BMP only the leading surrogate is hashed.
for (const char of text) {
|
||||
hash = (hash * 31 + char.charCodeAt(0) + index) % 1009;
|
||||
}
|
||||
// Fold the hash into 0..199, then rescale to -1..0.99 with 4-decimal precision.
return Number(((hash % 200) / 100 - 1).toFixed(4));
|
||||
});
|
||||
return values;
|
||||
}
|
||||
|
||||
export function createDeterministicLocalScanEnrichmentProviders(
|
||||
options: DeterministicLocalScanEnrichmentProviderOptions = {},
|
||||
): KtxLocalScanEnrichmentProviders {
|
||||
const dimensions = options.embeddingDimensions ?? 8;
|
||||
const maxBatchSize = options.maxBatchSize ?? 64;
|
||||
export function createDeterministicLocalScanEnrichmentProviders(): KtxLocalScanEnrichmentProviders {
|
||||
return {
|
||||
llmRuntime: deterministicLlmRuntime(),
|
||||
embedding: {
|
||||
dimensions,
|
||||
maxBatchSize,
|
||||
async embedBatch(texts) {
|
||||
return texts.map((text) => hashEmbedding(text, dimensions));
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -370,7 +343,7 @@ async function generateDescriptions(input: {
|
|||
|
||||
async function buildEmbeddings(input: {
|
||||
snapshot: KtxSchemaSnapshot;
|
||||
providers: KtxLocalScanEnrichmentProviders;
|
||||
embedding: KtxEmbeddingPort;
|
||||
descriptions: KtxLocalScanEnrichmentResult['descriptionUpdates'];
|
||||
progress?: KtxProgressPort;
|
||||
}): Promise<{ updates: KtxEmbeddingUpdate[]; byColumnId: Map<string, number[]> }> {
|
||||
|
|
@ -400,7 +373,7 @@ async function buildEmbeddings(input: {
|
|||
}
|
||||
|
||||
const embeddings: number[][] = [];
|
||||
const maxBatchSize = embeddingBatchSize(input.providers.embedding.maxBatchSize);
|
||||
const maxBatchSize = embeddingBatchSize(input.embedding.maxBatchSize);
|
||||
const embeddingTexts = texts.map((item) => item.text);
|
||||
const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize);
|
||||
if (batchCount === 0) {
|
||||
|
|
@ -412,7 +385,7 @@ async function buildEmbeddings(input: {
|
|||
transient: true,
|
||||
});
|
||||
const batch = embeddingTexts.slice(offset, offset + maxBatchSize);
|
||||
const batchEmbeddings = await input.providers.embedding.embedBatch(batch);
|
||||
const batchEmbeddings = await input.embedding.embedBatch(batch);
|
||||
if (batchEmbeddings.length !== batch.length) {
|
||||
throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`);
|
||||
}
|
||||
|
|
@ -560,34 +533,38 @@ export async function runLocalScanEnrichment(
|
|||
warnings,
|
||||
}),
|
||||
});
|
||||
const embeddingProgress = progress?.startPhase(0.2);
|
||||
embeddingUpdates = await runEnrichmentStage({
|
||||
stateStore: input.stateStore,
|
||||
runId: input.context.runId,
|
||||
connectionId: input.connectionId,
|
||||
syncId,
|
||||
mode: input.mode,
|
||||
stage: 'embeddings',
|
||||
inputHash,
|
||||
now,
|
||||
resumedStages: state.resumedStages,
|
||||
completedStages: state.completedStages,
|
||||
failedStages: state.failedStages,
|
||||
compute: async () => {
|
||||
const embeddings = await buildEmbeddings({
|
||||
snapshot,
|
||||
providers,
|
||||
descriptions,
|
||||
progress: embeddingProgress,
|
||||
});
|
||||
return embeddings.updates;
|
||||
},
|
||||
});
|
||||
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
|
||||
summary.dataDictionary = input.connector.sampleColumn ? 'completed' : 'skipped';
|
||||
summary.tableDescriptions = 'completed';
|
||||
summary.columnDescriptions = 'completed';
|
||||
summary.embeddings = 'completed';
|
||||
|
||||
const embeddingProgress = progress?.startPhase(0.2);
|
||||
const embedding = providers.embedding;
|
||||
if (embedding) {
|
||||
embeddingUpdates = await runEnrichmentStage({
|
||||
stateStore: input.stateStore,
|
||||
runId: input.context.runId,
|
||||
connectionId: input.connectionId,
|
||||
syncId,
|
||||
mode: input.mode,
|
||||
stage: 'embeddings',
|
||||
inputHash,
|
||||
now,
|
||||
resumedStages: state.resumedStages,
|
||||
completedStages: state.completedStages,
|
||||
failedStages: state.failedStages,
|
||||
compute: async () => {
|
||||
const embeddings = await buildEmbeddings({
|
||||
snapshot,
|
||||
embedding,
|
||||
descriptions,
|
||||
progress: embeddingProgress,
|
||||
});
|
||||
return embeddings.updates;
|
||||
},
|
||||
});
|
||||
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
|
||||
summary.embeddings = 'completed';
|
||||
}
|
||||
}
|
||||
|
||||
let relationshipUpdate: KtxRelationshipUpdate | null = null;
|
||||
|
|
|
|||
|
|
@ -3,19 +3,16 @@ import { createKtxEmbeddingProvider } from './embedding-provider.js';
|
|||
import type { KtxEmbeddingConfig } from './types.js';
|
||||
|
||||
describe('createKtxEmbeddingProvider', () => {
|
||||
it('creates deterministic embeddings with stable dimensions', async () => {
|
||||
const provider = createKtxEmbeddingProvider({
|
||||
backend: 'deterministic',
|
||||
model: 'sha256',
|
||||
dimensions: 6,
|
||||
batchSize: 4,
|
||||
});
|
||||
it('rejects deterministic embeddings', () => {
|
||||
const config = JSON.parse(
|
||||
JSON.stringify({
|
||||
backend: 'deterministic',
|
||||
model: 'sha256',
|
||||
dimensions: 6,
|
||||
}),
|
||||
) as KtxEmbeddingConfig;
|
||||
|
||||
await expect(provider.embed('Revenue policy')).resolves.toHaveLength(6);
|
||||
await expect(provider.embed('Revenue policy')).resolves.toEqual(await provider.embed('Revenue policy'));
|
||||
await expect(provider.embed('Revenue policy')).resolves.not.toEqual(await provider.embed('Approval policy'));
|
||||
await expect(provider.embedMany(['a', 'b'])).resolves.toHaveLength(2);
|
||||
expect(provider.maxBatchSize).toBe(4);
|
||||
expect(() => createKtxEmbeddingProvider(config)).toThrow('Unsupported KTX embedding backend: deterministic');
|
||||
});
|
||||
|
||||
it('rejects gateway embeddings', () => {
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { spawn } from 'node:child_process';
|
||||
import { join } from 'node:path';
|
||||
import OpenAI from 'openai';
|
||||
|
|
@ -33,14 +32,6 @@ export interface KtxEmbeddingProviderDeps {
|
|||
|
||||
const DEFAULT_BATCH_SIZE = 100;
|
||||
|
||||
function deterministicVector(text: string, dimensions: number): number[] {
|
||||
const digest = createHash('sha256').update(text).digest();
|
||||
return Array.from({ length: dimensions }, (_, index) => {
|
||||
const byte = digest[index % digest.length];
|
||||
return Number(((byte / 255) * 2 - 1).toFixed(6));
|
||||
});
|
||||
}
|
||||
|
||||
function assertNonEmptyText(text: string): void {
|
||||
if (!text.trim()) {
|
||||
throw new Error('Embedding text must be non-empty');
|
||||
|
|
@ -184,24 +175,6 @@ function runSentenceTransformersProcessJson(options: {
|
|||
};
|
||||
}
|
||||
|
||||
class DeterministicEmbeddingProvider implements KtxEmbeddingProvider {
|
||||
readonly maxBatchSize: number;
|
||||
|
||||
constructor(readonly dimensions: number, batchSize = DEFAULT_BATCH_SIZE) {
|
||||
this.maxBatchSize = batchSize;
|
||||
}
|
||||
|
||||
async embed(text: string): Promise<number[]> {
|
||||
assertNonEmptyText(text);
|
||||
return deterministicVector(text, this.dimensions);
|
||||
}
|
||||
|
||||
async embedMany(texts: string[]): Promise<number[][]> {
|
||||
assertBatchSize(texts, this.maxBatchSize);
|
||||
return texts.map((text) => deterministicVector(text, this.dimensions));
|
||||
}
|
||||
}
|
||||
|
||||
class OpenAIEmbeddingProvider implements KtxEmbeddingProvider {
|
||||
readonly dimensions: number;
|
||||
readonly maxBatchSize: number;
|
||||
|
|
@ -367,8 +340,6 @@ export function createKtxEmbeddingProvider(
|
|||
deps: KtxEmbeddingProviderDeps = {},
|
||||
): KtxEmbeddingProvider {
|
||||
switch (config.backend) {
|
||||
case 'deterministic':
|
||||
return new DeterministicEmbeddingProvider(config.dimensions, config.batchSize);
|
||||
case 'openai':
|
||||
return new OpenAIEmbeddingProvider(config, deps);
|
||||
case 'sentence-transformers':
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ export interface KtxLlmProvider {
|
|||
activeBackend(): KtxLlmBackend;
|
||||
}
|
||||
|
||||
export type KtxEmbeddingBackend = 'openai' | 'deterministic' | 'sentence-transformers';
|
||||
export type KtxEmbeddingBackend = 'openai' | 'sentence-transformers';
|
||||
|
||||
export interface KtxEmbeddingTokenUsageEvent {
|
||||
backend: KtxEmbeddingBackend;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue