ktx/packages/cli/test/context/ingest/local-embedding-provider.integration.test.ts

import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { KtxEmbeddingPort } from '../../../src/context/core/embedding.js';
import { CandidateDedupService } from '../../../src/context/ingest/context-candidates/candidate-dedup.service.js';
import { ContextEvidenceIndexService } from '../../../src/context/ingest/context-evidence/context-evidence-index.service.js';
import { SqliteContextEvidenceStore } from '../../../src/context/ingest/context-evidence/sqlite-context-evidence-store.js';
import type { DiffSet } from '../../../src/context/ingest/types.js';

describe('local ingest embedding providers with SQLite ingest stores', () => {
  let tempDir: string;
  let dbPath: string;
  let stagedDir: string;

  // Build a fresh sandbox for every test: a temp root, a database path under
  // `.ktx/`, and a staged tree holding one page (metadata.json + page.md).
  beforeEach(async () => {
    const sandboxRoot = await mkdtemp(join(tmpdir(), 'ktx-local-ingest-embedding-'));
    tempDir = sandboxRoot;
    // Only the path is computed here; the `.ktx` directory itself is not created.
    dbPath = join(sandboxRoot, '.ktx', 'db.sqlite');
    stagedDir = join(sandboxRoot, 'staged');

    const pageDir = join(stagedDir, 'pages', 'revenue');
    await mkdir(pageDir, { recursive: true });

    const pageMetadata = {
      objectType: 'page',
      id: 'page-revenue',
      title: 'Revenue Policy',
      path: 'Revenue Policy',
      url: 'https://notion.test/revenue',
      parentId: null,
      lastEditedAt: '2026-04-30T12:00:00.000Z',
      properties: {},
    };
    // Serialized JSON followed by a single trailing newline.
    const metadataJson = `${JSON.stringify(pageMetadata)}\n`;
    await writeFile(join(pageDir, 'metadata.json'), metadataJson, 'utf8');

    // Heading, blank line, one sentence of body text, trailing newline.
    const pageLines = [
      '# Approval',
      '',
      'Owner approval is required before enterprise discounts are granted.',
      '',
    ];
    await writeFile(join(pageDir, 'page.md'), pageLines.join('\n'), 'utf8');
  });

  // Tear down the per-test sandbox. `force` keeps removal from failing when the
  // path is already gone; `recursive` removes the whole tree under it.
  afterEach(async () => {
    const removeTree = { recursive: true, force: true };
    await rm(tempDir, removeTree);
  });

  /**
   * Creates a stub embedding port for these tests.
   *
   * Every input, single or bulk, is mapped to the same three-dimensional
   * vector `[1, 0, 0]`, and the advertised batch size is 4. Each call returns
   * a newly allocated array.
   *
   * @returns A stub implementation of the embedding port.
   */
  function embeddings(): KtxEmbeddingPort {
    // Factory rather than a shared constant so callers never share one array.
    const unitVector = (): number[] => [1, 0, 0];

    return {
      maxBatchSize: 4,
      computeEmbedding: async () => unitVector(),
      computeEmbeddingsBulk: async (texts) => texts.map(() => unitVector()),
    };
  }

  // End-to-end check: index the staged page into a SQLite-backed evidence store
  // using the stub embedding port, then query it back through `searchRRF`.
  it('indexes and searches context evidence using a package-owned local embedding provider', async () => {
    const evidenceStore = new SqliteContextEvidenceStore({ dbPath });
    const port = embeddings();
    const indexService = new ContextEvidenceIndexService({ store: evidenceStore, embeddings: port });

    // Both staged files are reported as added; the other change lists are empty.
    const stagedChanges: DiffSet = {
      added: ['pages/revenue/metadata.json', 'pages/revenue/page.md'],
      modified: [],
      deleted: [],
      unchanged: [],
    };
    const fileHashes = new Map([
      ['pages/revenue/metadata.json', 'metadata-hash'],
      ['pages/revenue/page.md', 'page-hash'],
    ]);

    const indexSummary = await indexService.indexStagedDir({
      stagedDir,
      runId: 'run-1',
      connectionId: 'docs',
      sourceKey: 'notion',
      syncId: 'sync-1',
      diffSet: stagedChanges,
      currentHashes: fileHashes,
    });

    // Two staged files are expected to yield one indexed document with at
    // least one chunk and no embedding failures.
    expect(indexSummary).toMatchObject({
      documentsIndexed: 1,
      embeddingFailures: 0,
    });
    expect(indexSummary.chunksIndexed).toBeGreaterThan(0);

    // Query text: the title twice, the heading, and the body sentence.
    const queryLines = [
      'Revenue Policy',
      'Revenue Policy',
      'Approval',
      'Owner approval is required before enterprise discounts are granted.',
    ];
    const queryText = queryLines.join('\n');
    const queryVector = await port.computeEmbedding(queryText);

    const hits = await evidenceStore.searchRRF({
      connectionId: 'docs',
      sourceKey: 'notion',
      queryEmbedding: queryVector,
      queryText,
      limit: 5,
      includeDeleted: false,
      currentRunId: 'run-1',
    });

    // The top hit must be the staged page, report a semantic match, and show
    // the semantic, lexical and token lanes as available.
    const topHit = hits[0];
    expect(topHit?.title).toBe('Revenue Policy');
    expect(topHit?.stableCitationKey).toContain('notion:page-revenue');

    const requiredLanes = [
      expect.objectContaining({ lane: 'semantic', status: 'available' }),
      expect.objectContaining({ lane: 'lexical', status: 'available' }),
      expect.objectContaining({ lane: 'token', status: 'available' }),
    ];
    expect(topHit).toMatchObject({
      matchReasons: expect.arrayContaining(['semantic']),
      lanes: expect.arrayContaining(requiredLanes),
    });
  });

  // Inserts two candidates that differ only by `candidateKey`, runs dedup for
  // the run, and expects them to collapse into a single cluster.
  it('deduplicates candidates using package-owned local embeddings and SQLite persistence', async () => {
    const evidenceStore = new SqliteContextEvidenceStore({ dbPath });
    const port = embeddings();

    // Fields shared by both candidates; neither carries a stored embedding.
    const sharedFields = {
      runId: 'run-1',
      connectionId: 'docs',
      sourceKey: 'notion',
      topic: 'Enterprise discount approval',
      assertion: 'Owner approval is required before enterprise discounts are granted.',
      rationale: 'The source policy states that approval is required.',
      evidenceChunkIds: [],
      evidenceRefs: [],
      suggestedPageKey: 'revenue-policy',
      actionHint: 'create' as const,
      durabilityScore: 3,
      authorityScore: 3,
      reuseScore: 3,
      noveltyScore: 2,
      riskScore: 0,
      promotionScore: 11,
      status: 'pending' as const,
      rejectionReason: null,
      lane: 'full' as const,
      embedding: null,
    };

    // Insert one after the other, in the same order as the keys are listed.
    for (const candidateKey of ['discount-policy-a', 'discount-policy-b']) {
      await evidenceStore.insertCandidate({ ...sharedFields, candidateKey });
    }

    const dedupService = new CandidateDedupService({
      store: evidenceStore,
      embeddings: port,
      settings: {
        enabled: true,
        // NOTE(review): presumably -1 lets any pair of topics count as similar — confirm.
        topicSimilarityThreshold: -1,
        scoreAggregation: 'max',
      },
    });
    const outcome = await dedupService.deduplicateRun('run-1');

    // Two candidates in, one cluster out, one merge, no embedding failures.
    expect(outcome.enabled).toBe(true);
    expect(outcome.embeddingFailures).toBe(0);
    expect(outcome.candidatesIn).toBe(2);
    expect(outcome.clustersOut).toBe(1);
    expect(outcome.mergedCount).toBe(1);
  });
});
Initial open-source release 2026-05-10 23:12:26 +02:00			`import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';`
			`import { tmpdir } from 'node:os';`
			`import { join } from 'node:path';`
			`import { afterEach, beforeEach, describe, expect, it } from 'vitest';`
test: split cli tests from source tree (#216) * feat(cli): define full warehouse dialect contract * test(cli): keep dialect edge tests focused * fix(cli): stabilize dialect contract foundation * refactor(connectors): own read-only query preparation * refactor(connectors): resolve dialects through registry * refactor(connectors): keep concrete dialect classes internal * chore(workspace): enforce dialect import boundary * refactor(cli): resolve relationship dialect at scan boundary * refactor(cli): use dialect display parsing for entity details * refactor(cli): use dialect display parsing for warehouse catalog * refactor(cli): use dialect SQL in relationship workflows * test(cli): verify solid dialect scan workflow closure * test: split cli tests from source tree * refactor(cli): standardize BigQuery scope listing * feat(sqlite): implement connector scope listing * test(connectors): cover required table listing * feat(cli): add warehouse driver registry * refactor(setup): route scope discovery through driver registry * refactor(cli): route local query execution through driver registry * refactor(historic-sql): route dialect support through driver registry * refactor(cli): test warehouse connections through driver registry * fix(cli): close driver registry type export gaps * Improve setup daemon diagnostics * refactor(setup): centralize rail-prefixed diagnostics + query-history fallback Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput into clack.ts so the setup wizard, managed daemons, and embedding/agent steps share one rail-formatted writer. setup-databases.ts also adds a "disable query history and retry" option when the schema-context build fails and query history is the likely culprit, surfaced via a new failed-query-history-unavailable status. 
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match The setup picker's KtxTableListEntry was a 2-level { schema, name }, so qualifiedTableId always wrote db.name into enabled_tables. When BigQuery, Snowflake, or SQL Server later ran fast ingest, their introspect step filtered the scope set with scopedTableNames(scope, { catalog: projectId\|database, db }) — catalog was non-null on the introspect side but null in the scope refs, so every entry was rejected, the live-database adapter staged zero table files, and detect() failed with 'Adapter "live-database" did not recognize fetched source output'. Align the picker boundary with the canonical 3-level KtxTableRef: - Add catalog: string \| null to KtxTableListEntry. - BigQuery/Snowflake/SQL Server listTables populate catalog from the resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null. - qualifiedTableId emits catalog.schema.name when catalog is non-null (resolveEnabledTables already accepts the 3-part shape) and schemasFromEnabledTables now goes through parseDottedTableEntry so it recovers the schema correctly from both 2-part and 3-part entries. - Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker reuse. Update listTables expectations in all seven connector tests and the setup / picker test fixtures. Add a picker regression test that covers the catalog-bearing round-trip (save + refine). * fix(cli): allow debug telemetry under opt-out env 2026-05-26 08:49:05 +02:00			`import type { KtxEmbeddingPort } from '../../../src/context/core/embedding.js';`
			`import { CandidateDedupService } from '../../../src/context/ingest/context-candidates/candidate-dedup.service.js';`
			`import { ContextEvidenceIndexService } from '../../../src/context/ingest/context-evidence/context-evidence-index.service.js';`
			`import { SqliteContextEvidenceStore } from '../../../src/context/ingest/context-evidence/sqlite-context-evidence-store.js';`
			`import type { DiffSet } from '../../../src/context/ingest/types.js';`
Initial open-source release 2026-05-10 23:12:26 +02:00
			`describe('local ingest embedding providers with SQLite ingest stores', () => {`
			`let tempDir: string;`
			`let dbPath: string;`
			`let stagedDir: string;`

			`beforeEach(async () => {`
rename klo to ktx 2026-05-10 23:51:24 +02:00			`tempDir = await mkdtemp(join(tmpdir(), 'ktx-local-ingest-embedding-'));`
			`dbPath = join(tempDir, '.ktx', 'db.sqlite');`
Initial open-source release 2026-05-10 23:12:26 +02:00			`stagedDir = join(tempDir, 'staged');`
			`await mkdir(join(stagedDir, 'pages', 'revenue'), { recursive: true });`
			`await writeFile(`
			`join(stagedDir, 'pages', 'revenue', 'metadata.json'),`
			`${JSON.stringify({
			`objectType: 'page',`
			`id: 'page-revenue',`
			`title: 'Revenue Policy',`
			`path: 'Revenue Policy',`
			`url: 'https://notion.test/revenue',`
			`parentId: null,`
			`lastEditedAt: '2026-04-30T12:00:00.000Z',`
			`properties: {},`
			})}\n`,
			`'utf8',`
			`);`
			`await writeFile(`
			`join(stagedDir, 'pages', 'revenue', 'page.md'),`
			`['# Approval', '', 'Owner approval is required before enterprise discounts are granted.', ''].join('\n'),`
			`'utf8',`
			`);`
			`});`

			`afterEach(async () => {`
			`await rm(tempDir, { recursive: true, force: true });`
			`});`

fix: remove deterministic embedding backend (#146) * fix: remove deterministic embedding backend * test: update slow tests for disabled embeddings 2026-05-19 16:40:01 +02:00			`function embeddings(): KtxEmbeddingPort {`
			`return {`
			`maxBatchSize: 4,`
			`async computeEmbedding() {`
			`return [1, 0, 0];`
			`},`
			`async computeEmbeddingsBulk(texts) {`
			`return texts.map(() => [1, 0, 0]);`
			`},`
			`};`
Initial open-source release 2026-05-10 23:12:26 +02:00			`}`

			`it('indexes and searches context evidence using a package-owned local embedding provider', async () => {`
			`const store = new SqliteContextEvidenceStore({ dbPath });`
			`const embeddingPort = embeddings();`
			`const indexer = new ContextEvidenceIndexService({ store, embeddings: embeddingPort });`
			`const diffSet: DiffSet = {`
			`added: ['pages/revenue/metadata.json', 'pages/revenue/page.md'],`
			`modified: [],`
			`deleted: [],`
			`unchanged: [],`
			`};`

			`const summary = await indexer.indexStagedDir({`
			`stagedDir,`
			`runId: 'run-1',`
			`connectionId: 'docs',`
			`sourceKey: 'notion',`
			`syncId: 'sync-1',`
			`diffSet,`
			`currentHashes: new Map([`
			`['pages/revenue/metadata.json', 'metadata-hash'],`
			`['pages/revenue/page.md', 'page-hash'],`
			`]),`
			`});`

			`expect(summary).toMatchObject({`
			`documentsIndexed: 1,`
			`embeddingFailures: 0,`
			`});`
			`expect(summary.chunksIndexed).toBeGreaterThan(0);`

			`const queryText = [`
			`'Revenue Policy',`
			`'Revenue Policy',`
			`'Approval',`
			`'Owner approval is required before enterprise discounts are granted.',`
			`].join('\n');`
			`const queryEmbedding = await embeddingPort.computeEmbedding(queryText);`
			`const results = await store.searchRRF({`
			`connectionId: 'docs',`
			`sourceKey: 'notion',`
			`queryEmbedding,`
			`queryText,`
			`limit: 5,`
			`includeDeleted: false,`
			`currentRunId: 'run-1',`
			`});`

			`expect(results[0]?.title).toBe('Revenue Policy');`
			`expect(results[0]?.stableCitationKey).toContain('notion:page-revenue');`
			`expect(results[0]).toMatchObject({`
			`matchReasons: expect.arrayContaining(['semantic']),`
			`lanes: expect.arrayContaining([`
			`expect.objectContaining({ lane: 'semantic', status: 'available' }),`
			`expect.objectContaining({ lane: 'lexical', status: 'available' }),`
			`expect.objectContaining({ lane: 'token', status: 'available' }),`
			`]),`
			`});`
			`});`

			`it('deduplicates candidates using package-owned local embeddings and SQLite persistence', async () => {`
			`const store = new SqliteContextEvidenceStore({ dbPath });`
			`const embeddingPort = embeddings();`
			`const candidateBase = {`
			`runId: 'run-1',`
			`connectionId: 'docs',`
			`sourceKey: 'notion',`
			`topic: 'Enterprise discount approval',`
			`assertion: 'Owner approval is required before enterprise discounts are granted.',`
			`rationale: 'The source policy states that approval is required.',`
			`evidenceChunkIds: [],`
			`evidenceRefs: [],`
			`suggestedPageKey: 'revenue-policy',`
			`actionHint: 'create' as const,`
			`durabilityScore: 3,`
			`authorityScore: 3,`
			`reuseScore: 3,`
			`noveltyScore: 2,`
			`riskScore: 0,`
			`promotionScore: 11,`
			`status: 'pending' as const,`
			`rejectionReason: null,`
			`lane: 'full' as const,`
			`embedding: null,`
			`};`

			`await store.insertCandidate({ ...candidateBase, candidateKey: 'discount-policy-a' });`
			`await store.insertCandidate({ ...candidateBase, candidateKey: 'discount-policy-b' });`

			`const result = await new CandidateDedupService({`
			`store,`
			`embeddings: embeddingPort,`
			`settings: {`
			`enabled: true,`
			`topicSimilarityThreshold: -1,`
			`scoreAggregation: 'max',`
			`},`
			`}).deduplicateRun('run-1');`

			`expect(result.enabled).toBe(true);`
			`expect(result.embeddingFailures).toBe(0);`
			`expect(result.candidatesIn).toBe(2);`
			`expect(result.clustersOut).toBe(1);`
			`expect(result.mergedCount).toBe(1);`
			`});`
			`});`