mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-28 08:49:38 +02:00
Initial open-source release
This commit is contained in:
commit
1a42152e6f
1199 changed files with 257054 additions and 0 deletions
472
packages/context/src/search/backend-conformance.test.ts
Normal file
472
packages/context/src/search/backend-conformance.test.ts
Normal file
|
|
@ -0,0 +1,472 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, it } from 'vitest';
|
||||
import { SqliteContextEvidenceStore } from '../ingest/context-evidence/index.js';
|
||||
import type { JsonValue } from '../ingest/ports.js';
|
||||
import { initKloProject, type KloLocalProject } from '../project/index.js';
|
||||
import { type LocalSlSourceSearchResult, searchLocalSlSources, writeLocalSlSource } from '../sl/local-sl.js';
|
||||
import type { ContextEvidenceSearchResult } from '../tools/context-evidence-tool-store.js';
|
||||
import {
|
||||
type LocalKnowledgeSearchResult,
|
||||
searchLocalKnowledgePages,
|
||||
writeLocalKnowledgePage,
|
||||
} from '../wiki/local-knowledge.js';
|
||||
import {
|
||||
assertSearchBackendCapabilities,
|
||||
assertSearchBackendConformanceCase,
|
||||
type SearchBackendConformanceResult,
|
||||
} from './backend-conformance.js';
|
||||
import type { SearchBackendCapabilities } from './types.js';
|
||||
|
||||
/**
 * Capability flags asserted for the SQLite search backend in this suite.
 * `satisfies` keeps the literal types while checking the shape against `SearchBackendCapabilities`.
 */
const SQLITE_SEARCH_CAPABILITIES = {
  fts: true,
  vector: false,
  fuzzy: false,
  jsonSearch: true,
  arraySearch: false,
} satisfies SearchBackendCapabilities;
|
||||
|
||||
/**
 * Semantic-layer source definition for the `warehouse` connection's `orders` table.
 *
 * List items are indented two spaces and their attributes four, so that `type` / `expr`
 * belong to the preceding `- name:` entry when the document is parsed as YAML.
 */
const ORDERS_YAML = [
  'name: orders',
  'table: public.orders',
  'grain:',
  '  - order_id',
  'columns:',
  '  - name: order_id',
  '    type: string',
  '  - name: revenue',
  '    type: number',
  'measures:',
  '  - name: total_revenue',
  '    expr: sum(revenue)',
  '',
].join('\n');
|
||||
|
||||
/**
 * Semantic-layer source definition for the `finance` connection's `orders` table.
 *
 * Uses the same two-space list / four-space attribute indentation as a regular YAML block
 * sequence so each `type` stays attached to its `- name:` column entry.
 */
const FINANCE_ORDERS_YAML = [
  'name: orders',
  'description: Finance orders used for invoice reconciliation.',
  'table: finance.orders',
  'grain:',
  '  - order_id',
  'columns:',
  '  - name: order_id',
  '    type: string',
  '  - name: invoice_status',
  '    type: string',
  '',
].join('\n');
|
||||
|
||||
/**
 * Deterministic embedding stub for tests.
 *
 * Text containing "semantic revenue" (case-insensitive) maps to `[1, 0]`; everything else maps to `[0, 1]`.
 */
class FakeEmbeddingPort {
  readonly maxBatchSize = 16;

  /** Returns the two-dimensional stub vector for a single text. */
  async computeEmbedding(text: string): Promise<number[]> {
    if (text.toLowerCase().includes('semantic revenue')) {
      return [1, 0];
    }
    return [0, 1];
  }

  /** Embeds every text through `computeEmbedding`, preserving input order. */
  async computeEmbeddingsBulk(texts: string[]): Promise<number[][]> {
    const pending = texts.map((entry) => this.computeEmbedding(entry));
    return Promise.all(pending);
  }
}
|
||||
|
||||
/**
 * Adapts a semantic-layer search hit to the backend-neutral conformance shape.
 * The id is `<connectionId>/<name>`; a missing score becomes 0 and missing match reasons become `[]`.
 */
function toSlConformanceResult(result: LocalSlSourceSearchResult): SearchBackendConformanceResult {
  const { connectionId, name, score, matchReasons, lanes, dictionaryMatches } = result;
  return {
    id: `${connectionId}/${name}`,
    score: score ?? 0,
    matchReasons: matchReasons ?? [],
    lanes,
    dictionaryMatches,
  };
}
|
||||
|
||||
/** Adapts a wiki page search hit to the conformance shape, using the page key as the id. */
function toWikiConformanceResult(result: LocalKnowledgeSearchResult): SearchBackendConformanceResult {
  const { key, score, matchReasons, lanes } = result;
  return { id: key, score, matchReasons, lanes };
}
|
||||
|
||||
/**
 * Adapts a context-evidence chunk hit to the conformance shape.
 * The id is `<externalId>:<stableCitationKey>`; missing match reasons become `[]`.
 */
function toContextConformanceResult(result: ContextEvidenceSearchResult): SearchBackendConformanceResult {
  const { externalId, stableCitationKey, score, matchReasons, lanes } = result;
  return {
    id: `${externalId}:${stableCitationKey}`,
    score,
    matchReasons: matchReasons ?? [],
    lanes,
  };
}
|
||||
|
||||
/**
 * Seeds the semantic-layer fixtures: an `orders` source for the `warehouse` and `finance`
 * connections, plus a relationship-profile JSON file for `warehouse` whose `orders.status`
 * column carries the sample values `paid` and `refunded`.
 */
async function seedSemanticLayerProject(project: KloLocalProject): Promise<void> {
  const sources = [
    { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML },
    { connectionId: 'finance', sourceName: 'orders', yaml: FINANCE_ORDERS_YAML },
  ];
  // Written one after the other, warehouse first.
  for (const source of sources) {
    await writeLocalSlSource(project, source);
  }

  const statusColumnProfile = {
    table: { catalog: null, db: 'public', name: 'orders' },
    column: 'status',
    nativeType: 'text',
    normalizedType: 'string',
    rowCount: 10,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 0.2,
    nullRate: 0,
    sampleValues: ['paid', 'refunded'],
    minTextLength: 4,
    maxTextLength: 8,
  };
  const relationshipProfile = {
    connectionId: 'warehouse',
    driver: 'postgres',
    sqlAvailable: true,
    queryCount: 2,
    tables: [],
    columns: {
      'orders.status': statusColumnProfile,
    },
    warnings: [],
  };
  // Pretty-printed with a trailing newline.
  const serializedProfile = `${JSON.stringify(relationshipProfile, null, 2)}\n`;

  await project.fileStore.writeFile(
    'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
    serializedProfile,
    'klo',
    'klo@example.com',
    'Seed dictionary profile',
  );
}
|
||||
|
||||
/**
 * Seeds two GLOBAL wiki pages: `metrics/revenue` (tagged `finance`, referencing the warehouse
 * orders source) and `support/escalations` (tagged `operations`).
 */
async function seedWikiProject(project: KloLocalProject): Promise<void> {
  const pages: Array<Parameters<typeof writeLocalKnowledgePage>[1]> = [
    {
      key: 'metrics/revenue',
      scope: 'GLOBAL',
      summary: 'Semantic revenue definition',
      content: 'Revenue is recognized when an order is paid.',
      tags: ['finance'],
      refs: ['semantic-layer/warehouse/orders.yaml'],
      slRefs: ['orders'],
    },
    {
      key: 'support/escalations',
      scope: 'GLOBAL',
      summary: 'Support escalation process',
      content: 'Escalations move urgent support tickets to the operations queue.',
      tags: ['operations'],
    },
  ];

  // Pages are written sequentially in declaration order.
  for (const page of pages) {
    await writeLocalKnowledgePage(project, page);
  }
}
|
||||
|
||||
/**
 * Upserts one Notion-style page document with a single `intro` chunk and returns both ids.
 *
 * Every field of `input` is optional; omitted fields fall back to fixed defaults
 * (`run-1`, `sync-1`, `page-1`, "Revenue Policy", ...). `embedding: null` is passed through
 * to the store unchanged, and only an omitted `embedding` uses the default `[1, 0, 0]`.
 *
 * @param subject Store that receives the document and its chunk.
 * @param input Optional overrides for the seeded document and chunk.
 * @returns The id returned by `upsertDocument` and the id of the first chunk read back.
 * @throws Error when the document, or its first chunk, cannot be read back after seeding.
 */
async function seedContextDocument(
  subject: SqliteContextEvidenceStore,
  input: {
    runId?: string;
    syncId?: string;
    externalId?: string;
    title?: string;
    rawPath?: string;
    metadata?: JsonValue;
    publishState?: 'pending' | 'published';
    embedding?: number[] | null;
    content?: string;
    searchText?: string;
  } = {},
): Promise<{ documentId: string; chunkId: string }> {
  const runId = input.runId ?? 'run-1';
  const syncId = input.syncId ?? 'sync-1';
  const externalId = input.externalId ?? 'page-1';
  const title = input.title ?? 'Revenue Policy';
  const rawPath = input.rawPath ?? `pages/${externalId}/page.md`;
  // `??` would also replace an explicit `null`, so compare against `undefined` instead.
  const embedding = input.embedding === undefined ? [1, 0, 0] : input.embedding;

  const doc = await subject.upsertDocument({
    runId,
    connectionId: 'conn-1',
    sourceKey: 'notion',
    externalId,
    externalParentId: null,
    databaseId: null,
    dataSourceId: null,
    title,
    path: `Company Handbook / ${title}`,
    url: `https://notion.test/${externalId}`,
    objectType: 'page',
    lastEditedAt: new Date('2026-04-30T10:00:00.000Z'),
    lastEditedBy: 'user-1',
    rawPath,
    syncId,
    contentHash: `hash-${externalId}`,
    publishState: input.publishState ?? 'published',
    metadata: input.metadata ?? {},
  });

  await subject.replaceChunks(doc.id, [
    {
      chunkKey: 'intro',
      headingPath: ['Policy'],
      ordinal: 0,
      content: input.content ?? `${title} requires approval from the accountable owner.`,
      searchText: input.searchText ?? `${title} approval accountable owner`,
      embedding,
      tokenCount: 8,
      citation: {
        source: 'notion',
        pageId: externalId,
        title,
        syncId,
        rawPath,
      },
      stableCitationKey: `notion:${externalId}:intro`,
      syncId,
      contentHash: `chunk-${externalId}`,
    },
  ]);

  const read = await subject.readDocumentByExternalId('conn-1', 'notion', externalId, runId);
  if (!read) {
    throw new Error(`seeded document ${externalId} was not readable`);
  }

  // Fail with a descriptive error instead of a TypeError when no chunk comes back.
  const firstChunk = read.chunks[0];
  if (!firstChunk) {
    throw new Error(`seeded document ${externalId} has no readable chunks`);
  }

  return { documentId: doc.id, chunkId: firstChunk.id };
}
|
||||
|
||||
/**
 * Conformance suite pinning the observable behavior of the SQLite-backed search surfaces
 * (semantic layer, wiki, context evidence): top-ranked ids, match reasons, lane statuses,
 * and dictionary evidence.
 */
describe('SQLite hybrid search backend conformance', () => {
  const BACKEND_NAME = 'sqlite';
  const LANE_AVAILABLE = { status: 'available' } as const;
  const SEMANTIC_UNCONFIGURED = { status: 'skipped', reason: 'embedding_unconfigured' } as const;

  let tempDir: string;
  let project: KloLocalProject;
  let dbPath: string;

  // Every test gets a fresh temp directory holding its own project and database path.
  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-search-conformance-'));
    project = await initKloProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
    dbPath = join(tempDir, '.klo', 'db.sqlite');
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('documents SQLite search backend capabilities', () => {
    assertSearchBackendCapabilities({
      backendName: BACKEND_NAME,
      capabilities: SQLITE_SEARCH_CAPABILITIES,
      expected: {
        fts: true,
        vector: false,
        fuzzy: false,
        jsonSearch: true,
        arraySearch: false,
      },
    });
  });

  it('keeps semantic-layer global ranking, dictionary evidence, and token fallback stable', async () => {
    await seedSemanticLayerProject(project);

    // No connection filter: both `orders` sources are expected, finance first.
    const globalHits = await searchLocalSlSources(project, { query: 'orders', limit: 5 });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'semantic-layer',
      caseName: 'global source ranking',
      results: globalHits.map(toSlConformanceResult),
      expectedTopIds: ['finance/orders', 'warehouse/orders'],
      expectedReasonsById: {
        'finance/orders': ['lexical'],
        'warehouse/orders': ['lexical'],
      },
      expectedLanes: {
        lexical: LANE_AVAILABLE,
        semantic: SEMANTIC_UNCONFIGURED,
      },
    });

    // "refunded" only appears as a sample value in the seeded relationship profile.
    const dictionaryHits = await searchLocalSlSources(project, {
      connectionId: 'warehouse',
      query: 'refunded',
      limit: 5,
    });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'semantic-layer',
      caseName: 'dictionary source evidence',
      results: dictionaryHits.map(toSlConformanceResult),
      expectedTopIds: ['warehouse/orders'],
      expectedReasonsById: {
        'warehouse/orders': ['dictionary'],
      },
      expectedLanes: {
        dictionary: LANE_AVAILABLE,
        semantic: SEMANTIC_UNCONFIGURED,
      },
      expectedDictionaryMatchesById: {
        'warehouse/orders': [{ column: 'status', values: ['refunded'] }],
      },
    });

    const tokenHits = await searchLocalSlSources(project, {
      connectionId: 'warehouse',
      query: 'orders---',
      limit: 5,
    });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'semantic-layer',
      caseName: 'token fallback reason',
      results: tokenHits.map(toSlConformanceResult),
      expectedTopIds: ['warehouse/orders'],
      expectedReasonsById: {
        'warehouse/orders': ['token'],
      },
      expectedLanes: {
        token: LANE_AVAILABLE,
      },
    });
  });

  it('keeps wiki lexical, semantic, and token behavior stable', async () => {
    await seedWikiProject(project);

    const lexicalHits = await searchLocalKnowledgePages(project, {
      query: 'paid order',
      userId: 'local',
      limit: 5,
    });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'wiki',
      caseName: 'lexical page ranking',
      results: lexicalHits.map(toWikiConformanceResult),
      expectedTopIds: ['metrics/revenue'],
      expectedReasonsById: {
        'metrics/revenue': ['lexical'],
      },
      expectedLanes: {
        lexical: LANE_AVAILABLE,
        semantic: SEMANTIC_UNCONFIGURED,
      },
    });

    // Supplying an embedding service is what makes the semantic lane available here.
    const semanticHits = await searchLocalKnowledgePages(project, {
      query: 'semantic revenue',
      userId: 'local',
      limit: 5,
      embeddingService: new FakeEmbeddingPort(),
    });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'wiki',
      caseName: 'semantic page ranking',
      results: semanticHits.map(toWikiConformanceResult),
      expectedTopIds: ['metrics/revenue'],
      expectedReasonsById: {
        'metrics/revenue': ['semantic'],
      },
      expectedLanes: {
        semantic: LANE_AVAILABLE,
      },
    });

    const tokenHits = await searchLocalKnowledgePages(project, {
      query: 'paid---',
      userId: 'local',
      limit: 5,
    });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'wiki',
      caseName: 'token page fallback',
      results: tokenHits.map(toWikiConformanceResult),
      expectedTopIds: ['metrics/revenue'],
      expectedReasonsById: {
        'metrics/revenue': ['token'],
      },
      expectedLanes: {
        token: LANE_AVAILABLE,
      },
    });
  });

  it('keeps context-evidence lane fusion and token fallback stable', async () => {
    const fusionStore = new SqliteContextEvidenceStore({ dbPath });
    await seedContextDocument(fusionStore, {
      externalId: 'page-discount',
      title: 'Enterprise Discount Policy',
      content: 'Enterprise discounts require finance approval before quote approval.',
      searchText: 'enterprise discount finance approval quote',
      embedding: [1, 0, 0],
    });
    await seedContextDocument(fusionStore, {
      externalId: 'page-owner',
      title: 'Accountable Owner Policy',
      content: 'Every policy has an accountable owner and review date.',
      searchText: 'accountable owner review date',
      embedding: [0.95, 0.05, 0],
    });
    await seedContextDocument(fusionStore, {
      externalId: 'page-expense',
      title: 'Expense Policy',
      content: 'Expense reimbursement requires receipt review.',
      searchText: 'expense reimbursement receipt review',
      embedding: [0, 1, 0],
    });

    // Query text and query embedding both point at the discount page.
    const fusedHits = await fusionStore.searchRRF({
      connectionId: 'conn-1',
      sourceKey: 'notion',
      queryEmbedding: [1, 0, 0],
      queryText: 'enterprise discount approval',
      limit: 2,
      includeDeleted: false,
    });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'context-evidence',
      caseName: 'chunk lane fusion',
      results: fusedHits.map(toContextConformanceResult),
      expectedTopIds: ['page-discount:notion:page-discount:intro'],
      expectedReasonsById: {
        'page-discount:notion:page-discount:intro': ['lexical', 'semantic', 'token'],
      },
      expectedLanes: {
        lexical: LANE_AVAILABLE,
        semantic: LANE_AVAILABLE,
        token: LANE_AVAILABLE,
      },
    });

    // A separate database file keeps the token-fallback case isolated from the fusion fixtures.
    const tokenStore = new SqliteContextEvidenceStore({ dbPath: join(tempDir, 'token.sqlite') });
    await seedContextDocument(tokenStore, {
      externalId: 'page-cpp',
      title: 'C++ Warehouse Notes',
      content: 'C++ parser notes for warehouse extraction.',
      searchText: 'C++ parser warehouse extraction',
      embedding: null,
    });

    const tokenHits = await tokenStore.searchRRF({
      connectionId: 'conn-1',
      sourceKey: 'notion',
      queryEmbedding: null,
      queryText: '++',
      limit: 5,
      includeDeleted: false,
    });
    assertSearchBackendConformanceCase({
      backendName: BACKEND_NAME,
      surface: 'context-evidence',
      caseName: 'fts-empty token fallback',
      results: tokenHits.map(toContextConformanceResult),
      expectedTopIds: ['page-cpp:notion:page-cpp:intro'],
      expectedReasonsById: {
        'page-cpp:notion:page-cpp:intro': ['token'],
      },
      expectedLanes: {
        lexical: { status: 'skipped', reason: 'fts_query_empty' },
        semantic: SEMANTIC_UNCONFIGURED,
        token: LANE_AVAILABLE,
      },
    });
  });
});
|
||||
151
packages/context/src/search/backend-conformance.ts
Normal file
151
packages/context/src/search/backend-conformance.ts
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
import type { SearchBackendCapabilities, SearchLaneStatus } from './types.js';
|
||||
|
||||
/** Status of one search lane as reported alongside a result. */
export interface SearchBackendConformanceLane {
  /** Lane name, e.g. `lexical`, `semantic`, `token`, or `dictionary`. */
  lane: string;
  /** Reported lane status. */
  status: SearchLaneStatus;
  /** Optional explanation accompanying the status. */
  reason?: string;
}
|
||||
|
||||
/** Dictionary evidence attached to a result: the matched column and the values that matched. */
export interface SearchBackendConformanceDictionaryMatch {
  /** Column the values belong to. */
  column: string;
  /** Matched values; their order is not significant when matches are compared. */
  values: ReadonlyArray<string>;
  /** Optional overflow counter; a missing value is compared as 0. */
  overflowCount?: number;
}
|
||||
|
||||
/** Backend-neutral view of one search hit, as consumed by the conformance assertions. */
export interface SearchBackendConformanceResult {
  /** Identifier the expectations are keyed by. */
  id: string;
  /** Ranking score; expected top results must have a positive finite score. */
  score: number;
  /** Reasons this hit matched. */
  matchReasons: ReadonlyArray<string>;
  /** Lane statuses reported with this hit, if any. */
  lanes?: ReadonlyArray<SearchBackendConformanceLane>;
  /** Dictionary evidence reported with this hit, if any. */
  dictionaryMatches?: ReadonlyArray<SearchBackendConformanceDictionaryMatch>;
}
|
||||
|
||||
/** Expectation for a single lane in a conformance case. */
export interface ExpectedSearchBackendConformanceLane {
  /** Status the lane must report. */
  status: SearchLaneStatus;
  /** When provided, the reported reason must equal this value; when omitted, the reason is not checked. */
  reason?: string;
}
|
||||
|
||||
/** Input for `assertSearchBackendConformanceCase`: the observed results plus the expectations to check. */
export interface AssertSearchBackendConformanceCaseInput {
  /** Backend under test; used in the failure label. */
  backendName: string;
  /** Search surface under test; used in the failure label. */
  surface: string;
  /** Human-readable case name; used in the failure label. */
  caseName: string;
  /** Observed results in ranked order. */
  results: ReadonlyArray<SearchBackendConformanceResult>;
  /** Ids expected at the head of `results`, position by position. */
  expectedTopIds: ReadonlyArray<string>;
  /** Match reasons each listed result must include; extra reasons are allowed. */
  expectedReasonsById?: Record<string, ReadonlyArray<string>>;
  /** Lane expectations keyed by lane name. */
  expectedLanes?: Record<string, ExpectedSearchBackendConformanceLane>;
  /** Dictionary evidence each listed result must include; extra evidence is allowed. */
  expectedDictionaryMatchesById?: Record<string, ReadonlyArray<SearchBackendConformanceDictionaryMatch>>;
}
|
||||
|
||||
/** Input for `assertSearchBackendCapabilities`. */
export interface AssertSearchBackendCapabilitiesInput {
  /** Backend under test; used in the failure label. */
  backendName: string;
  /** Capabilities the backend actually declares. */
  capabilities: SearchBackendCapabilities;
  /** Subset of capability flags to verify against `capabilities`. */
  expected: Partial<SearchBackendCapabilities>;
}
|
||||
|
||||
/** Builds the label that prefixes failure reports: `<backend> <surface> conformance case "<name>"`. */
function caseLabel(
  input: Pick<AssertSearchBackendConformanceCaseInput, 'backendName' | 'surface' | 'caseName'>,
): string {
  const { backendName, surface, caseName } = input;
  return [backendName, surface, 'conformance case', `"${caseName}"`].join(' ');
}
|
||||
|
||||
/**
 * Throws a single Error whose message is `<label> failed:` followed by one `- <failure>` line per entry.
 *
 * @throws Error always.
 */
function fail(label: string, failures: string[]): never {
  const messageLines = [`${label} failed:`];
  for (const failure of failures) {
    messageLines.push(`- ${failure}`);
  }
  throw new Error(messageLines.join('\n'));
}
|
||||
|
||||
/**
 * Serializes a dictionary match as `<column>:<sorted,values>:<overflowCount>` so matches can be
 * compared as strings. Values are sorted with `localeCompare` on a copy; a missing overflow count is 0.
 */
function dictionaryMatchKey(match: SearchBackendConformanceDictionaryMatch): string {
  const sortedValues = Array.from(match.values);
  sortedValues.sort((a, b) => a.localeCompare(b));
  const overflow = match.overflowCount ?? 0;
  return [match.column, sortedValues.join(','), overflow].join(':');
}
|
||||
|
||||
/** Returns the sorted comparison keys for a list of dictionary matches; a missing list yields `[]`. */
function dictionaryMatchKeys(matches: readonly SearchBackendConformanceDictionaryMatch[] | undefined): string[] {
  const keys = (matches ?? []).map(dictionaryMatchKey);
  keys.sort((a, b) => a.localeCompare(b));
  return keys;
}
|
||||
|
||||
/**
 * Checks one conformance case and throws a single aggregated error listing every violated expectation.
 *
 * Checks run in this order: top-result ids by position, positive finite scores for expected ids
 * that are present, required match reasons, lane status/reason, and required dictionary evidence.
 * Lane expectations are compared against the first reported entry with that lane name across all results.
 *
 * @throws Error when at least one expectation is not met.
 */
export function assertSearchBackendConformanceCase(input: AssertSearchBackendConformanceCaseInput): void {
  const { results, expectedTopIds } = input;
  const problems: string[] = [];

  // Ranking: the first N results must carry the expected ids in order.
  for (let position = 0; position < expectedTopIds.length; position += 1) {
    const wantedId = expectedTopIds[position];
    const foundId = results[position]?.id;
    if (foundId !== wantedId) {
      problems.push(`expected result ${position + 1} to be ${wantedId}, got ${foundId ?? '<missing>'}`);
    }
  }

  const resultsById = new Map<string, SearchBackendConformanceResult>();
  for (const result of results) {
    resultsById.set(result.id, result);
  }

  // Scores: only expected ids that are present anywhere in the results are inspected.
  for (const wantedId of expectedTopIds) {
    const hit = resultsById.get(wantedId);
    if (hit && !(Number.isFinite(hit.score) && hit.score > 0)) {
      problems.push(`expected ${wantedId} to have a positive finite score, got ${hit.score}`);
    }
  }

  // Match reasons: expected reasons must be a subset of the reported ones.
  for (const [id, wantedReasons] of Object.entries(input.expectedReasonsById ?? {})) {
    const hit = resultsById.get(id);
    if (!hit) {
      problems.push(`expected reasons for ${id}, but the result was missing`);
      continue;
    }
    const absentReasons = wantedReasons.filter((reason) => !hit.matchReasons.includes(reason));
    for (const reason of absentReasons) {
      problems.push(`expected ${id} to include match reason ${reason}, got [${hit.matchReasons.join(', ')}]`);
    }
  }

  // Lanes: look each expected lane up in the flattened lane reports of all results.
  const reportedLanes = results.flatMap((result) => result.lanes ?? []);
  for (const [laneName, wanted] of Object.entries(input.expectedLanes ?? {})) {
    const reported = reportedLanes.find((entry) => entry.lane === laneName);
    if (!reported) {
      problems.push(`expected lane ${laneName} to be reported`);
      continue;
    }
    if (reported.status !== wanted.status) {
      problems.push(`expected lane ${laneName} status ${wanted.status}, got ${reported.status}`);
    }
    if (wanted.reason !== undefined && reported.reason !== wanted.reason) {
      problems.push(`expected lane ${laneName} reason ${wanted.reason}, got ${reported.reason ?? '<missing>'}`);
    }
  }

  // Dictionary evidence: every expected match key must be present on the result.
  for (const [id, wantedMatches] of Object.entries(input.expectedDictionaryMatchesById ?? {})) {
    const hit = resultsById.get(id);
    if (!hit) {
      problems.push(`expected dictionary matches for ${id}, but the result was missing`);
      continue;
    }
    const reportedKeys = dictionaryMatchKeys(hit.dictionaryMatches);
    for (const wantedKey of dictionaryMatchKeys(wantedMatches)) {
      if (!reportedKeys.includes(wantedKey)) {
        problems.push(`expected ${id} dictionary evidence ${wantedKey}, got [${reportedKeys.join(', ')}]`);
      }
    }
  }

  if (problems.length > 0) {
    fail(caseLabel(input), problems);
  }
}
|
||||
|
||||
/**
 * Verifies that a backend's declared capabilities match the expected flags.
 *
 * Only capabilities listed in `input.expected` are compared. A key whose expected value is
 * `undefined` (which `Partial<>` permits) is treated as "no expectation" and skipped.
 *
 * @param input Backend name, declared capabilities, and the flags to verify.
 * @throws Error listing every mismatching capability.
 */
export function assertSearchBackendCapabilities(input: AssertSearchBackendCapabilitiesInput): void {
  const failures: string[] = [];
  const capabilityNames = Object.keys(input.expected) as Array<keyof SearchBackendCapabilities>;

  for (const capability of capabilityNames) {
    const expected = input.expected[capability];
    // An explicit `undefined` is not a boolean expectation; comparing it would always fail.
    if (expected === undefined) {
      continue;
    }
    const actual = input.capabilities[capability];
    if (actual !== expected) {
      failures.push(`expected ${capability}=${expected}, got ${actual}`);
    }
  }

  if (failures.length > 0) {
    fail(`${input.backendName} search backend capabilities`, failures);
  }
}
|
||||
127
packages/context/src/search/hybrid-search-core.test.ts
Normal file
127
packages/context/src/search/hybrid-search-core.test.ts
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { HybridSearchCore } from './hybrid-search-core.js';
|
||||
import type { SearchCandidateGenerator } from './types.js';
|
||||
|
||||
/**
 * Builds a stub lane generator that always resolves with the given candidates.
 *
 * @param lane Lane name the generator reports.
 * @param candidates Candidates returned unchanged from every `generate` call.
 * @param weight Optional per-generator weight.
 */
function generator(
  lane: string,
  candidates: Array<{ id: string; rank: number; rawScore?: number; matchReason?: string; evidence?: unknown }>,
  weight?: number,
): SearchCandidateGenerator {
  const generate = async () => ({ candidates });
  return { lane, weight, generate };
}
|
||||
|
||||
/** Unit tests for `HybridSearchCore`: pool sizing, lane failure isolation, per-lane dedup, and tie ordering. */
describe('HybridSearchCore', () => {
  it('runs lane generators with the shared pool size and applies final limit after RRF fusion', async () => {
    const calls: Array<{ lane: string; laneCandidatePoolLimit: number; finalLimit: number }> = [];
    const subject = new HybridSearchCore();

    const outcome = await subject.search({
      queryText: 'gross revenue',
      limit: 1,
      generators: [
        {
          lane: 'lexical',
          async generate(args) {
            // Record the arguments so the pool size handed to each lane can be asserted below.
            calls.push({ lane: 'lexical', ...args });
            return {
              candidates: [
                { id: 'orders', rank: 1, rawScore: 0.8 },
                { id: 'customers', rank: 2, rawScore: 0.7 },
              ],
            };
          },
        },
        {
          lane: 'semantic',
          async generate(args) {
            calls.push({ lane: 'semantic', ...args });
            return { candidates: [{ id: 'customers', rank: 1, rawScore: 0.91 }] };
          },
        },
      ],
    });

    expect(calls).toEqual([
      expect.objectContaining({ lane: 'lexical', laneCandidatePoolLimit: 25, finalLimit: 1 }),
      expect.objectContaining({ lane: 'semantic', laneCandidatePoolLimit: 25, finalLimit: 1 }),
    ]);

    const fusedIds = outcome.results.map((candidate) => candidate.id);
    expect(fusedIds).toEqual(['customers']);
    expect(outcome.results[0]).toMatchObject({
      matchReasons: ['lexical', 'semantic'],
      ranksByLane: { lexical: 2, semantic: 1 },
      rawScoresByLane: { lexical: 0.7, semantic: 0.91 },
    });
    expect(outcome.lanes).toEqual([
      expect.objectContaining({ lane: 'lexical', status: 'available', returnedCandidateCount: 2, weight: 1.5 }),
      expect.objectContaining({ lane: 'semantic', status: 'available', returnedCandidateCount: 1, weight: 2 }),
    ]);
  });

  it('keeps available lane results when another lane is skipped or fails', async () => {
    const subject = new HybridSearchCore();

    const outcome = await subject.search({
      queryText: 'paid',
      limit: 5,
      generators: [
        generator('lexical', [{ id: 'orders', rank: 1 }]),
        {
          lane: 'semantic',
          async generate() {
            return { status: 'skipped', candidates: [], reason: 'embedding_unconfigured' };
          },
        },
        {
          lane: 'dictionary',
          async generate() {
            throw new Error('dictionary index unavailable');
          },
        },
      ],
    });

    const fusedIds = outcome.results.map((candidate) => candidate.id);
    expect(fusedIds).toEqual(['orders']);
    expect(outcome.lanes).toEqual([
      expect.objectContaining({ lane: 'lexical', status: 'available', reason: undefined }),
      expect.objectContaining({ lane: 'semantic', status: 'skipped', reason: 'embedding_unconfigured' }),
      expect.objectContaining({ lane: 'dictionary', status: 'failed', reason: 'dictionary index unavailable' }),
    ]);
  });

  it('deduplicates one lane by best rank before fusion', async () => {
    const subject = new HybridSearchCore();

    // Same id twice in one lane: only the rank-1 entry (and its evidence) should survive.
    const dictionaryLane = generator('dictionary', [
      { id: 'orders', rank: 4, rawScore: 0.4, evidence: { column: 'state', values: ['paid'] } },
      { id: 'orders', rank: 1, rawScore: 0.9, evidence: { column: 'status', values: ['paid'] } },
    ]);
    const outcome = await subject.search({
      queryText: 'paid status',
      limit: 10,
      generators: [dictionaryLane],
    });

    expect(outcome.results).toHaveLength(1);
    expect(outcome.results[0]).toMatchObject({
      id: 'orders',
      ranksByLane: { dictionary: 1 },
      rawScoresByLane: { dictionary: 0.9 },
      evidenceByLane: { dictionary: [{ column: 'status', values: ['paid'] }] },
    });
  });

  it('uses deterministic id ordering when scores and lane counts tie', async () => {
    const subject = new HybridSearchCore();

    const tiedLane = generator('lexical', [
      { id: 'zebra', rank: 1 },
      { id: 'alpha', rank: 1 },
    ]);
    const outcome = await subject.search({
      queryText: 'revenue',
      limit: 10,
      generators: [tiedLane],
    });

    const fusedIds = outcome.results.map((candidate) => candidate.id);
    expect(fusedIds).toEqual(['alpha', 'zebra']);
  });
});
|
||||
141
packages/context/src/search/hybrid-search-core.ts
Normal file
141
packages/context/src/search/hybrid-search-core.ts
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
import { defaultLaneCandidatePoolLimit, normalizeSearchQuery } from './query.js';
|
||||
import { compareFusedSearchCandidates, DEFAULT_RRF_K, DEFAULT_SEARCH_LANE_WEIGHTS, rrfContribution } from './rrf.js';
|
||||
import type {
|
||||
FusedSearchCandidate,
|
||||
HybridSearchOptions,
|
||||
HybridSearchResult,
|
||||
SearchCandidate,
|
||||
SearchCandidateGenerator,
|
||||
SearchLaneBreakdown,
|
||||
SearchLaneName,
|
||||
SearchLaneResult,
|
||||
} from './types.js';
|
||||
|
||||
/** Pairs a lane generator with the result it produced, or with a synthesized failed result if it threw. */
interface ExecutedLane {
  /** Generator that was run. */
  generator: SearchCandidateGenerator;
  /** Outcome of running that generator. */
  result: SearchLaneResult;
}
|
||||
|
||||
/**
 * Resolves the weight for a lane. Precedence: the generator's own weight, then
 * `options.laneWeights[lane]`, then `DEFAULT_SEARCH_LANE_WEIGHTS[lane]`, then 1.
 */
function laneWeight(options: HybridSearchOptions, lane: SearchLaneName, generatorWeight?: number): number {
  if (generatorWeight != null) {
    return generatorWeight;
  }
  const configuredWeight = options.laneWeights?.[lane];
  if (configuredWeight != null) {
    return configuredWeight;
  }
  return DEFAULT_SEARCH_LANE_WEIGHTS[lane] ?? 1;
}
|
||||
|
||||
/**
 * Returns a copy of the candidate whose rank is a positive whole number.
 *
 * The rank is floored first and validated afterwards, so a fractional rank below 1
 * (for example 0.5) falls back instead of becoming rank 0. Non-finite, zero, or negative
 * ranks also fall back.
 *
 * @param candidate Candidate as produced by a lane generator.
 * @param fallbackRank Rank to use when the candidate's own rank is unusable.
 */
function normalizeCandidate(candidate: SearchCandidate, fallbackRank: number): SearchCandidate {
  const wholeRank = Math.floor(candidate.rank);
  const rank = Number.isFinite(wholeRank) && wholeRank >= 1 ? wholeRank : fallbackRank;
  return { ...candidate, rank };
}
|
||||
|
||||
/**
 * Collapses a lane's candidates to one entry per id and orders them for fusion.
 *
 * Each candidate is normalized with its 1-based position as the fallback rank. When an id
 * appears more than once, the entry with the lowest rank wins; on equal rank the earlier
 * entry is kept. The result is sorted by rank, then by id (`localeCompare`).
 */
function bestCandidatesForLane(candidates: SearchCandidate[]): SearchCandidate[] {
  const bestById = new Map<string, SearchCandidate>();

  candidates.forEach((candidate, index) => {
    const normalized = normalizeCandidate(candidate, index + 1);
    const existing = bestById.get(normalized.id);
    // `existing` shares the candidate's id by construction, so rank is the only meaningful comparison.
    if (!existing || normalized.rank < existing.rank) {
      bestById.set(normalized.id, normalized);
    }
  });

  return [...bestById.values()].sort((left, right) => left.rank - right.rank || left.id.localeCompare(right.id));
}
|
||||
|
||||
/**
 * Converts a thrown value into a `failed` lane result with no candidates.
 * The reason is the Error's message, or `String(error)` for non-Error values.
 */
function failedLaneResult(error: unknown): SearchLaneResult {
  let reason: string;
  if (error instanceof Error) {
    reason = error.message;
  } else {
    reason = String(error);
  }
  return { status: 'failed', candidates: [], reason };
}
|
||||
|
||||
/**
 * Runs every configured lane generator and fuses their candidates into one ranked list.
 */
export class HybridSearchCore {
  /**
   * Executes all generators concurrently, records a breakdown per lane, accumulates each
   * candidate's `rrfContribution` across the lanes that returned it, sorts with
   * `compareFusedSearchCandidates`, and truncates to the final limit.
   *
   * A generator that throws is reported as a `failed` lane; lanes that are not `available`
   * contribute no candidates. The final limit is at least 1.
   *
   * @param options Query text, limits, generators, and optional weights / RRF constant.
   * @returns The normalized query, effective limits, fused results, and per-lane breakdown.
   */
  async search(options: HybridSearchOptions): Promise<HybridSearchResult> {
    const finalLimit = Math.max(1, options.limit);
    const requestedCandidatePoolLimit = options.candidatePoolLimit ?? defaultLaneCandidatePoolLimit(finalLimit);
    const normalizedQuery = normalizeSearchQuery(options.queryText);

    // A failing lane must not reject the whole search, so each lane handles its own error.
    const runLane = async (generator: SearchCandidateGenerator): Promise<ExecutedLane> => {
      try {
        const result = await generator.generate({
          queryText: options.queryText,
          normalizedQuery,
          finalLimit,
          laneCandidatePoolLimit: requestedCandidatePoolLimit,
        });
        return { generator, result };
      } catch (error) {
        return { generator, result: failedLaneResult(error) };
      }
    };
    const executedLanes = await Promise.all(options.generators.map((generator) => runLane(generator)));

    const fusedById = new Map<string, FusedSearchCandidate>();
    const lanes: SearchLaneBreakdown[] = [];
    const rrfK = options.rrfK ?? DEFAULT_RRF_K;

    for (const { generator, result } of executedLanes) {
      const { lane } = generator;
      const weight = laneWeight(options, lane, generator.weight);
      const status = result.status ?? 'available';
      const effectiveCandidatePoolLimit = result.effectiveCandidatePoolLimit ?? requestedCandidatePoolLimit;
      // Skipped and failed lanes are reported in the breakdown but never contribute candidates.
      const laneCandidates = status === 'available' ? bestCandidatesForLane(result.candidates) : [];

      lanes.push({
        lane,
        status,
        requestedCandidatePoolLimit,
        effectiveCandidatePoolLimit,
        returnedCandidateCount: laneCandidates.length,
        weight,
        reason: result.reason,
      });

      for (const candidate of laneCandidates) {
        let fused = fusedById.get(candidate.id);
        if (!fused) {
          const created: FusedSearchCandidate = {
            id: candidate.id,
            score: 0,
            matchReasons: [],
            ranksByLane: {},
            rawScoresByLane: {},
            evidenceByLane: {},
          };
          fused = created;
          fusedById.set(candidate.id, fused);
        }

        fused.score += rrfContribution(weight, candidate.rank, rrfK);
        fused.ranksByLane[lane] = candidate.rank;
        if (candidate.rawScore !== undefined) {
          fused.rawScoresByLane[lane] = candidate.rawScore;
        }

        // The lane name doubles as the match reason when the candidate does not supply one.
        const reason = candidate.matchReason ?? lane;
        if (!fused.matchReasons.includes(reason)) {
          fused.matchReasons.push(reason);
        }

        if (candidate.evidence !== undefined) {
          const priorEvidence = fused.evidenceByLane[lane] ?? [];
          fused.evidenceByLane[lane] = [...priorEvidence, candidate.evidence];
        }
      }
    }

    const ranked = [...fusedById.values()].sort(compareFusedSearchCandidates);

    return {
      query: normalizedQuery,
      requestedLimit: finalLimit,
      requestedCandidatePoolLimit,
      results: ranked.slice(0, finalLimit),
      lanes,
    };
  }
}
|
||||
35
packages/context/src/search/index.ts
Normal file
35
packages/context/src/search/index.ts
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
export type {
|
||||
AssertSearchBackendCapabilitiesInput,
|
||||
AssertSearchBackendConformanceCaseInput,
|
||||
ExpectedSearchBackendConformanceLane,
|
||||
SearchBackendConformanceDictionaryMatch,
|
||||
SearchBackendConformanceLane,
|
||||
SearchBackendConformanceResult,
|
||||
} from './backend-conformance.js';
|
||||
export {
|
||||
assertSearchBackendCapabilities,
|
||||
assertSearchBackendConformanceCase,
|
||||
} from './backend-conformance.js';
|
||||
export { HybridSearchCore } from './hybrid-search-core.js';
|
||||
export { defaultLaneCandidatePoolLimit, normalizeSearchQuery } from './query.js';
|
||||
export {
|
||||
compareFusedSearchCandidates,
|
||||
DEFAULT_RRF_K,
|
||||
DEFAULT_SEARCH_LANE_WEIGHTS,
|
||||
rrfContribution,
|
||||
} from './rrf.js';
|
||||
export type {
|
||||
FusedSearchCandidate,
|
||||
HybridSearchOptions,
|
||||
HybridSearchResult,
|
||||
NormalizedSearchQuery,
|
||||
SearchBackendCapabilities,
|
||||
SearchCandidate,
|
||||
SearchCandidateGenerator,
|
||||
SearchCandidateGeneratorArgs,
|
||||
SearchLaneBreakdown,
|
||||
SearchLaneName,
|
||||
SearchLaneResult,
|
||||
SearchLaneStatus,
|
||||
SearchResultHydrator,
|
||||
} from './types.js';
|
||||
331
packages/context/src/search/pglite-owner-process.test.ts
Normal file
331
packages/context/src/search/pglite-owner-process.test.ts
Normal file
|
|
@ -0,0 +1,331 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { createServer } from 'node:net';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { Client } from 'pg';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { assertSearchBackendCapabilities, assertSearchBackendConformanceCase } from './index.js';
|
||||
import { KloPGliteOwnerProcess, PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES } from './pglite-owner-process.js';
|
||||
|
||||
/**
 * Reserves a free loopback TCP port by binding a throwaway listener to port 0
 * and releasing it again.
 *
 * The port is only known to be free at the moment the probe closes; callers
 * should bind it promptly because another process may claim it in between.
 *
 * @returns The port number the operating system assigned to the probe listener.
 * @throws If the probe cannot listen, reports a non-TCP address, or fails to close.
 */
async function allocatePort(): Promise<number> {
  const server = createServer();

  await new Promise<void>((resolve, reject) => {
    // Without an 'error' listener a failed bind would leave this promise pending forever.
    server.once('error', reject);
    server.listen(0, '127.0.0.1', () => {
      server.off('error', reject);
      resolve();
    });
  });

  const address = server.address();
  if (typeof address !== 'object' || address === null) {
    // Release the listener before failing so the probe does not keep the process alive.
    server.close();
    throw new Error('Expected TCP server address while allocating a PGlite owner-process port.');
  }

  await new Promise<void>((resolve, reject) => {
    server.close((error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve();
    });
  });

  return address.port;
}
|
||||
|
||||
/**
 * Creates the prototype search schema through the owner process.
 *
 * Issues one multi-statement query that creates `prototype_documents` (with a
 * GIN full-text index and an ivfflat cosine index on its 3-dimensional
 * embedding) and `prototype_dictionary_values` (with a trigram GIN index).
 * The statements are plain `CREATE`s, so this must run once per data directory.
 *
 * @param owner Running owner process that executes the DDL.
 */
async function createHybridSearchFixture(owner: KloPGliteOwnerProcess): Promise<void> {
  await owner.query(`
    CREATE TABLE prototype_documents (
      id TEXT PRIMARY KEY,
      search_text TEXT NOT NULL,
      metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
      embedding vector(3) NOT NULL
    );

    CREATE INDEX prototype_documents_fts_idx
      ON prototype_documents
      USING GIN (to_tsvector('english', search_text));

    CREATE INDEX prototype_documents_vector_idx
      ON prototype_documents
      USING ivfflat (embedding vector_cosine_ops)
      WITH (lists = 1);

    CREATE TABLE prototype_dictionary_values (
      connection_id TEXT NOT NULL,
      source_name TEXT NOT NULL,
      column_name TEXT NOT NULL,
      value TEXT NOT NULL,
      PRIMARY KEY (connection_id, source_name, column_name, value)
    );

    CREATE INDEX prototype_dictionary_values_trgm_idx
      ON prototype_dictionary_values
      USING GIN (value gin_trgm_ops);
  `);
}
|
||||
|
||||
/**
 * Inserts the fixed fixture rows used by the owner-process tests.
 *
 * Three documents go into `prototype_documents` via one parameterized insert
 * (four placeholders per row: id, search text, JSON metadata, JSON-encoded
 * embedding), followed by three literal rows in `prototype_dictionary_values`.
 *
 * @param owner Running owner process whose schema was already created.
 */
async function seedHybridSearchFixture(owner: KloPGliteOwnerProcess): Promise<void> {
  await owner.query(
    `
      INSERT INTO prototype_documents (id, search_text, metadata, embedding)
      VALUES
        ($1, $2, $3::jsonb, $4::vector),
        ($5, $6, $7::jsonb, $8::vector),
        ($9, $10, $11::jsonb, $12::vector)
    `,
    [
      // Row 1: $1-$4
      'warehouse/orders',
      'orders paid revenue refund status customer',
      JSON.stringify({ connectionId: 'warehouse', sourceName: 'orders' }),
      JSON.stringify([1, 0, 0]),
      // Row 2: $5-$8
      'finance/orders',
      'orders finance bookings gross margin',
      JSON.stringify({ connectionId: 'finance', sourceName: 'orders' }),
      JSON.stringify([0.72, 0.28, 0]),
      // Row 3: $9-$12
      'warehouse/customers',
      'customers accounts lifecycle region',
      JSON.stringify({ connectionId: 'warehouse', sourceName: 'customers' }),
      JSON.stringify([0, 1, 0]),
    ],
  );

  await owner.query(`
    INSERT INTO prototype_dictionary_values (connection_id, source_name, column_name, value)
    VALUES
      ('warehouse', 'orders', 'status', 'refunded'),
      ('warehouse', 'orders', 'status', 'paid'),
      ('warehouse', 'customers', 'region', 'emea')
  `);
}
|
||||
|
||||
// Exercises the PGlite owner process end to end: capability declaration, socket
// access from `pg` clients, the three search lanes, persistence across restarts,
// and several concurrent clients against a single owner.
describe('KloPGliteOwnerProcess', () => {
  let tempDir: string;
  let dataDir: string;
  let port: number;

  // Every test gets its own data directory and a freshly probed port.
  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-owner-process-'));
    dataDir = join(tempDir, 'pgdata');
    port = await allocatePort();
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('declares the advanced PGlite search capabilities observed by the spike', () => {
    assertSearchBackendCapabilities({
      backendName: 'pglite-owner-process',
      capabilities: PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES,
      expected: {
        fts: true,
        vector: true,
        fuzzy: true,
        jsonSearch: true,
        arraySearch: false,
      },
    });
  });

  it('starts a socket owner process and serves PostgreSQL clients', async () => {
    const owner = await KloPGliteOwnerProcess.start({
      dataDir,
      host: '127.0.0.1',
      port,
    });

    try {
      await owner.query(`
        CREATE TABLE owner_process_smoke (
          id TEXT PRIMARY KEY,
          search_text TEXT NOT NULL,
          embedding vector(3) NOT NULL
        );

        INSERT INTO owner_process_smoke (id, search_text, embedding)
        VALUES
          ('orders', 'orders paid revenue', '[1,0,0]'::vector),
          ('customers', 'customers region lifecycle', '[0,1,0]'::vector);
      `);

      // Connect with a plain `pg` client to prove the socket speaks the PostgreSQL protocol.
      const client = new Client(owner.connectionConfig());
      await client.connect();

      try {
        const result = await client.query<{ id: string }>(`
          SELECT id
          FROM owner_process_smoke
          ORDER BY embedding <=> '[1,0,0]'::vector, id ASC
          LIMIT 1
        `);

        expect(result.rows).toEqual([{ id: 'orders' }]);
      } finally {
        await client.end();
      }
    } finally {
      await owner.stop();
    }
  });

  it('runs lexical, semantic, and dictionary conformance probes through socket clients', async () => {
    const owner = await KloPGliteOwnerProcess.start({
      dataDir,
      host: '127.0.0.1',
      port,
    });

    try {
      await createHybridSearchFixture(owner);
      await seedHybridSearchFixture(owner);

      // Lexical lane: PostgreSQL full-text search ranked by ts_rank_cd.
      const lexical = await owner.query<{ id: string; score: number }>(
        `
          SELECT
            id,
            ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) AS score
          FROM prototype_documents
          WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
          ORDER BY score DESC, id ASC
          LIMIT 2
        `,
        ['paid orders'],
      );

      assertSearchBackendConformanceCase({
        backendName: 'pglite-owner-process',
        surface: 'semantic-layer',
        caseName: 'socket postgres fts lexical ranking',
        results: lexical.rows.map((row) => ({
          id: row.id,
          score: row.score,
          matchReasons: ['lexical'],
        })),
        expectedTopIds: ['warehouse/orders'],
        expectedReasonsById: {
          'warehouse/orders': ['lexical'],
        },
      });

      // Semantic lane: pgvector cosine distance against a fixed query embedding.
      const semantic = await owner.query<{ id: string; similarity: number }>(
        `
          SELECT
            id,
            1 - (embedding <=> $1::vector) AS similarity
          FROM prototype_documents
          ORDER BY embedding <=> $1::vector, id ASC
          LIMIT 2
        `,
        [JSON.stringify([1, 0, 0])],
      );

      assertSearchBackendConformanceCase({
        backendName: 'pglite-owner-process',
        surface: 'semantic-layer',
        caseName: 'socket pgvector semantic ranking',
        results: semantic.rows.map((row) => ({
          id: row.id,
          score: row.similarity,
          matchReasons: ['semantic'],
        })),
        expectedTopIds: ['warehouse/orders'],
        expectedReasonsById: {
          'warehouse/orders': ['semantic'],
        },
      });

      // Dictionary lane: pg_trgm similarity over stored dictionary values.
      const dictionary = await owner.query<{ id: string; value: string; score: number }>(
        `
          SELECT
            connection_id || '/' || source_name AS id,
            value,
            similarity(value, $1) AS score
          FROM prototype_dictionary_values
          WHERE similarity(value, $1) > 0
          ORDER BY score DESC, id ASC, value ASC
          LIMIT 2
        `,
        ['refund'],
      );

      assertSearchBackendConformanceCase({
        backendName: 'pglite-owner-process',
        surface: 'semantic-layer',
        caseName: 'socket pg_trgm dictionary ranking',
        results: dictionary.rows.map((row) => ({
          id: row.id,
          score: row.score,
          matchReasons: ['dictionary'],
          // NOTE(review): the column is hard-coded rather than selected from column_name.
          dictionaryMatches: [{ column: 'status', values: [row.value] }],
        })),
        expectedTopIds: ['warehouse/orders'],
        expectedReasonsById: {
          'warehouse/orders': ['dictionary'],
        },
        expectedDictionaryMatchesById: {
          'warehouse/orders': [{ column: 'status', values: ['refunded'] }],
        },
      });
    } finally {
      await owner.stop();
    }
  });

  it('persists indexed rows after stopping and restarting the owner process', async () => {
    const firstOwner = await KloPGliteOwnerProcess.start({
      dataDir,
      host: '127.0.0.1',
      port,
    });

    try {
      await createHybridSearchFixture(firstOwner);
      await seedHybridSearchFixture(firstOwner);
    } finally {
      await firstOwner.stop();
    }

    // A second owner over the same data directory must see the rows written by the first.
    const secondOwner = await KloPGliteOwnerProcess.start({
      dataDir,
      host: '127.0.0.1',
      port,
    });

    try {
      const persisted = await secondOwner.query<{ count: number }>(
        "SELECT COUNT(*)::int AS count FROM prototype_documents WHERE metadata->>'connectionId' = $1",
        ['warehouse'],
      );

      expect(persisted.rows).toEqual([{ count: 2 }]);
    } finally {
      await secondOwner.stop();
    }
  });

  it('serves concurrent PostgreSQL clients through one owner process', async () => {
    const owner = await KloPGliteOwnerProcess.start({
      dataDir,
      host: '127.0.0.1',
      port,
    });

    const clients: Client[] = [];

    try {
      await createHybridSearchFixture(owner);
      await seedHybridSearchFixture(owner);

      for (let index = 0; index < 4; index += 1) {
        const client = new Client(owner.connectionConfig());
        await client.connect();
        clients.push(client);
      }

      const results = await Promise.all(
        clients.map((client) =>
          client.query<{ count: number }>('SELECT COUNT(*)::int AS count FROM prototype_documents'),
        ),
      );

      expect(results.map((result) => result.rows[0]?.count)).toEqual([3, 3, 3, 3]);
    } finally {
      // Best-effort cleanup: a client that fails to end must not prevent stopping the owner.
      await Promise.all(clients.map((client) => client.end().catch(() => undefined)));
      await owner.stop();
    }
  });
});
|
||||
114
packages/context/src/search/pglite-owner-process.ts
Normal file
114
packages/context/src/search/pglite-owner-process.ts
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import { PGlite, type PGliteInterface } from '@electric-sql/pglite';
|
||||
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
|
||||
import { vector } from '@electric-sql/pglite/vector';
|
||||
import { PGLiteSocketServer } from '@electric-sql/pglite-socket';
|
||||
import { Client, type ClientConfig, type QueryResult, type QueryResultRow } from 'pg';
|
||||
import type { SearchBackendCapabilities } from './types.js';
|
||||
|
||||
/**
 * Search capabilities declared for the PGlite owner-process backend.
 *
 * `satisfies` validates the literal against `SearchBackendCapabilities` while
 * keeping each flag's literal boolean type.
 */
export const PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES = {
  fts: true,
  vector: true,
  fuzzy: true,
  jsonSearch: true,
  arraySearch: false,
} satisfies SearchBackendCapabilities;
|
||||
|
||||
/** Options accepted by `KloPGliteOwnerProcess.start`. */
export interface KloPGliteOwnerProcessOptions {
  /** Data directory handed to `PGlite.create`. */
  dataDir: string;
  /** Host the socket server listens on; also reported by `connectionConfig()`. */
  host: string;
  /** Port the socket server listens on; also reported by `connectionConfig()`. */
  port: number;
  /** Forwarded to the socket server's `inspect` option; defaults to `false`. */
  inspect?: boolean;
  /** Forwarded to the socket server's `maxConnections` option; defaults to `100`. */
  maxConnections?: number;
}
|
||||
|
||||
/**
 * Owns one PGlite database and exposes it to PostgreSQL wire-protocol clients
 * through a `PGLiteSocketServer`.
 *
 * Instances are created with {@link KloPGliteOwnerProcess.start} and must be
 * released with {@link KloPGliteOwnerProcess.stop}.
 */
export class KloPGliteOwnerProcess {
  readonly dataDir: string;
  readonly host: string;
  readonly port: number;

  #db: PGliteInterface;
  #server: PGLiteSocketServer;
  // Shared shutdown promise so repeated or concurrent stop() calls observe the same outcome.
  #stopping: Promise<void> | undefined;

  private constructor(options: KloPGliteOwnerProcessOptions, db: PGliteInterface, server: PGLiteSocketServer) {
    this.dataDir = options.dataDir;
    this.host = options.host;
    this.port = options.port;
    this.#db = db;
    this.#server = server;
  }

  /**
   * Opens the database, ensures the `vector` and `pg_trgm` extensions exist,
   * and starts the socket server.
   *
   * @param options Data directory, listen address, and optional server tuning.
   * @returns A started owner process.
   * @throws Rethrows any setup failure after best-effort cleanup of the server and database.
   */
  static async start(options: KloPGliteOwnerProcessOptions): Promise<KloPGliteOwnerProcess> {
    const db = await PGlite.create({
      dataDir: options.dataDir,
      extensions: {
        vector,
        pg_trgm,
      },
    });

    let server: PGLiteSocketServer | undefined;

    try {
      await db.exec(`
        CREATE EXTENSION IF NOT EXISTS vector;
        CREATE EXTENSION IF NOT EXISTS pg_trgm;
      `);

      server = new PGLiteSocketServer({
        db,
        host: options.host,
        port: options.port,
        inspect: options.inspect ?? false,
        maxConnections: options.maxConnections ?? 100,
      });

      await server.start();

      return new KloPGliteOwnerProcess(options, db, server);
    } catch (error) {
      // Cleanup is best effort so the original setup failure is the error callers see.
      await server?.stop().catch(() => undefined);
      await db.close().catch(() => undefined);
      throw error;
    }
  }

  /** Returns the `pg` client configuration for connecting to this owner's socket server. */
  connectionConfig(): ClientConfig {
    return {
      host: this.host,
      port: this.port,
      user: 'postgres',
      database: 'postgres',
      application_name: 'klo-pglite-owner-prototype',
      connectionTimeoutMillis: 5_000,
    };
  }

  /**
   * Opens a new `pg` client connected to this owner.
   *
   * @returns A connected client; the caller is responsible for calling `end()`.
   */
  async connect(): Promise<Client> {
    const client = new Client(this.connectionConfig());
    await client.connect();
    return client;
  }

  /**
   * Runs one query on a dedicated short-lived client connection.
   *
   * @param text SQL text to execute.
   * @param values Optional positional parameters; copied before being handed to `pg`.
   * @returns The `pg` query result.
   * @throws The query error if the query fails, otherwise any error raised while closing the client.
   */
  async query<T extends QueryResultRow = QueryResultRow>(
    text: string,
    values?: readonly unknown[],
  ): Promise<QueryResult<T>> {
    const client = await this.connect();

    let result: QueryResult<T>;
    try {
      result = await client.query<T>(text, values ? [...values] : undefined);
    } catch (error) {
      // Keep the query failure as the reported error even if closing the client also fails.
      await client.end().catch(() => undefined);
      throw error;
    }

    await client.end();
    return result;
  }

  /**
   * Stops the socket server and closes the database.
   *
   * Safe to call more than once: every call returns the same shutdown promise.
   * The database is closed even when stopping the server fails.
   */
  async stop(): Promise<void> {
    if (this.#stopping === undefined) {
      this.#stopping = this.#shutdown();
    }

    return this.#stopping;
  }

  /** Stops the server first, then closes the database regardless of the server outcome. */
  async #shutdown(): Promise<void> {
    try {
      await this.#server.stop();
    } finally {
      await this.#db.close();
    }
  }
}
|
||||
66
packages/context/src/search/pglite-runtime-boundary.test.ts
Normal file
66
packages/context/src/search/pglite-runtime-boundary.test.ts
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
import { readFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
// Directory four levels above this module's directory, resolved from the module URL.
const kloRoot = fileURLToPath(new URL('../../../../', import.meta.url));

/**
 * Reads a file as UTF-8 text.
 *
 * @param relativePath Path relative to `kloRoot`.
 * @returns The file contents.
 */
function readKloFile(relativePath: string): string {
  return readFileSync(join(kloRoot, relativePath), 'utf8');
}
|
||||
|
||||
/** Subset of `packages/context/package.json` that the boundary tests inspect. */
interface ContextPackageManifest {
  dependencies?: Record<string, string>;
  devDependencies?: Record<string, string>;
  exports?: Record<string, unknown>;
  files?: string[];
}

/**
 * Loads and parses `packages/context/package.json`.
 *
 * @returns The parsed manifest, typed as the fields the tests read.
 * @throws If the file does not contain a JSON object.
 */
function readContextPackageJson(): ContextPackageManifest {
  const parsed: unknown = JSON.parse(readKloFile('packages/context/package.json'));

  // Guard the top-level shape so a malformed manifest fails here, not in an unrelated expectation.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error('Expected packages/context/package.json to contain a JSON object.');
  }

  return parsed as ContextPackageManifest;
}
|
||||
|
||||
// Guards the packaging boundary: PGlite stays a dev-only prototype dependency and
// is not referenced from public export files or production routing files.
describe('PGlite hybrid search runtime boundary', () => {
  it('keeps PGlite packages as dev-only prototype dependencies', () => {
    const pkg = readContextPackageJson();

    expect(pkg.dependencies?.['@electric-sql/pglite']).toBeUndefined();
    expect(pkg.dependencies?.['@electric-sql/pglite-socket']).toBeUndefined();
    expect(pkg.devDependencies?.['@electric-sql/pglite']).toBeDefined();
    expect(pkg.devDependencies?.['@electric-sql/pglite-socket']).toBeDefined();
    expect(pkg.files).toEqual(['dist', 'prompts', 'skills']);
  });

  it('keeps PGlite prototypes out of public exports and production routing', () => {
    const pkg = readContextPackageJson();
    const packageExportKeys = Object.keys(pkg.exports ?? {});

    expect(packageExportKeys.filter((key) => key.toLowerCase().includes('pglite'))).toEqual([]);

    // Public barrel files must not mention PGlite at all.
    const publicExportFiles = [
      'packages/context/src/index.ts',
      'packages/context/src/search/index.ts',
      'packages/context/src/sl/index.ts',
    ];

    for (const relativePath of publicExportFiles) {
      expect(readKloFile(relativePath), relativePath).not.toMatch(/pglite/i);
    }

    // Production routing files must not reference the prototype backends or the PGlite packages.
    const productionRoutingFiles = [
      'packages/cli/src/agent.ts',
      'packages/context/src/mcp/local-project-ports.ts',
      'packages/context/src/wiki/local-knowledge.ts',
      'packages/context/src/ingest/context-evidence/sqlite-context-evidence-store.ts',
    ];

    for (const relativePath of productionRoutingFiles) {
      expect(readKloFile(relativePath), relativePath).not.toMatch(
        /pglite-owner-prototype|pglite-sl-search-prototype|@electric-sql\/pglite/i,
      );
    }

    // local-sl.ts is expected to contain the backend check, its error message, and the dynamic import.
    const localSlSource = readKloFile('packages/context/src/sl/local-sl.ts');
    expect(localSlSource).toContain("input.backend === 'pglite-owner-prototype'");
    expect(localSlSource).toContain('PGlite semantic-layer search prototype requires pglite owner-process options.');
    expect(localSlSource).toContain("await import('./pglite-sl-search-prototype.js')");
  });
});
|
||||
302
packages/context/src/search/pglite-spike.test.ts
Normal file
302
packages/context/src/search/pglite-spike.test.ts
Normal file
|
|
@ -0,0 +1,302 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { PGlite, type PGliteInterface } from '@electric-sql/pglite';
|
||||
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
|
||||
import { vector } from '@electric-sql/pglite/vector';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import {
|
||||
assertSearchBackendCapabilities,
|
||||
assertSearchBackendConformanceCase,
|
||||
type SearchBackendCapabilities,
|
||||
} from './index.js';
|
||||
|
||||
/** Local alias for the PGlite handle type used by the spike helpers. */
type PGliteDb = PGliteInterface;

/** Capabilities the spike declares for an in-process PGlite database. */
const PGLITE_SPIKE_CAPABILITIES = {
  fts: true,
  vector: true,
  fuzzy: true,
  jsonSearch: true,
  arraySearch: false,
} satisfies SearchBackendCapabilities;
|
||||
|
||||
/**
 * Opens a PGlite database at `dataDir` with the `vector` and `pg_trgm`
 * extensions loaded and created.
 *
 * @param dataDir Directory passed to `PGlite.create` as the data directory.
 * @returns The opened database; the caller must close it.
 * @throws Rethrows extension setup failures after closing the database handle.
 */
async function createSpikeDb(dataDir: string): Promise<PGliteDb> {
  const db = await PGlite.create({
    dataDir,
    extensions: {
      vector,
      pg_trgm,
    },
  });

  try {
    await db.exec(`
      CREATE EXTENSION IF NOT EXISTS vector;
      CREATE EXTENSION IF NOT EXISTS pg_trgm;
    `);
  } catch (error) {
    // Do not leak the open handle when setup fails; cleanup errors must not mask the cause.
    await db.close().catch(() => undefined);
    throw error;
  }

  return db;
}
|
||||
|
||||
/**
 * Creates the spike tables and indexes if they do not exist yet.
 *
 * Every statement uses `IF NOT EXISTS`, so calling this again on an existing
 * database does not fail.
 *
 * @param db Open PGlite database with `vector` and `pg_trgm` available.
 */
async function createSchema(db: PGliteDb): Promise<void> {
  await db.exec(`
    CREATE TABLE IF NOT EXISTS spike_documents (
      id TEXT PRIMARY KEY,
      search_text TEXT NOT NULL,
      metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
      embedding vector(3) NOT NULL
    );

    CREATE INDEX IF NOT EXISTS spike_documents_fts_idx
      ON spike_documents
      USING GIN (to_tsvector('english', search_text));

    CREATE INDEX IF NOT EXISTS spike_documents_vector_idx
      ON spike_documents
      USING ivfflat (embedding vector_cosine_ops)
      WITH (lists = 1);

    CREATE TABLE IF NOT EXISTS spike_dictionary_values (
      connection_id TEXT NOT NULL,
      source_name TEXT NOT NULL,
      column_name TEXT NOT NULL,
      value TEXT NOT NULL,
      PRIMARY KEY (connection_id, source_name, column_name, value)
    );

    CREATE INDEX IF NOT EXISTS spike_dictionary_values_trgm_idx
      ON spike_dictionary_values
      USING GIN (value gin_trgm_ops);
  `);
}
|
||||
|
||||
/**
 * Inserts or refreshes the fixed spike fixture rows.
 *
 * Documents are upserted on `id`; dictionary rows use `ON CONFLICT DO NOTHING`,
 * so seeding the same database twice does not fail.
 *
 * @param db Open PGlite database whose schema was already created.
 */
async function seedSearchFixture(db: PGliteDb): Promise<void> {
  await db.query(
    `
      INSERT INTO spike_documents (id, search_text, metadata, embedding)
      VALUES
        ($1, $2, $3::jsonb, $4::vector),
        ($5, $6, $7::jsonb, $8::vector),
        ($9, $10, $11::jsonb, $12::vector)
      ON CONFLICT (id) DO UPDATE
      SET search_text = EXCLUDED.search_text,
          metadata = EXCLUDED.metadata,
          embedding = EXCLUDED.embedding
    `,
    [
      // Row 1: $1-$4
      'warehouse/orders',
      'orders paid revenue refund status customer',
      JSON.stringify({ connectionId: 'warehouse', sourceName: 'orders' }),
      JSON.stringify([1, 0, 0]),
      // Row 2: $5-$8
      'finance/orders',
      'orders finance bookings gross margin',
      JSON.stringify({ connectionId: 'finance', sourceName: 'orders' }),
      JSON.stringify([0.72, 0.28, 0]),
      // Row 3: $9-$12
      'warehouse/customers',
      'customers accounts lifecycle region',
      JSON.stringify({ connectionId: 'warehouse', sourceName: 'customers' }),
      JSON.stringify([0, 1, 0]),
    ],
  );

  await db.query(
    `
      INSERT INTO spike_dictionary_values (connection_id, source_name, column_name, value)
      VALUES
        ('warehouse', 'orders', 'status', 'refunded'),
        ('warehouse', 'orders', 'status', 'paid'),
        ('warehouse', 'customers', 'region', 'emea')
      ON CONFLICT DO NOTHING
    `,
  );
}
|
||||
|
||||
/**
 * Closes a spike database handle.
 *
 * @param db Database to close.
 */
async function closeDb(db: PGliteDb): Promise<void> {
  await db.close();
}
|
||||
|
||||
// Spike suite for an in-process PGlite database: capability declaration, the three
// search lanes, persistence across reopen, and observed concurrency behavior.
describe('PGlite hybrid search spike', () => {
  let tempDir: string;
  let dataDir: string;

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-search-spike-'));
    dataDir = join(tempDir, 'pgdata');
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('documents PGlite search backend capabilities', () => {
    assertSearchBackendCapabilities({
      backendName: 'pglite-spike',
      capabilities: PGLITE_SPIKE_CAPABILITIES,
      expected: {
        fts: true,
        vector: true,
        fuzzy: true,
        jsonSearch: true,
        arraySearch: false,
      },
    });
  });

  it('supports FTS, pgvector ordering, and pg_trgm dictionary lookup', async () => {
    const db = await createSpikeDb(dataDir);

    try {
      await createSchema(db);
      await seedSearchFixture(db);

      // Lexical lane: PostgreSQL full-text search ranked by ts_rank_cd.
      const lexical = await db.query<{ id: string; score: number }>(
        `
          SELECT
            id,
            ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) AS score
          FROM spike_documents
          WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
          ORDER BY score DESC, id ASC
          LIMIT 2
        `,
        ['paid orders'],
      );

      assertSearchBackendConformanceCase({
        backendName: 'pglite-spike',
        surface: 'semantic-layer',
        caseName: 'postgres fts lexical ranking',
        results: lexical.rows.map((row) => ({
          id: row.id,
          score: row.score,
          matchReasons: ['lexical'],
        })),
        expectedTopIds: ['warehouse/orders'],
        expectedReasonsById: {
          'warehouse/orders': ['lexical'],
        },
      });

      // Semantic lane: pgvector cosine distance against a fixed query embedding.
      const semantic = await db.query<{ id: string; similarity: number }>(
        `
          SELECT
            id,
            1 - (embedding <=> $1::vector) AS similarity
          FROM spike_documents
          ORDER BY embedding <=> $1::vector, id ASC
          LIMIT 2
        `,
        [JSON.stringify([1, 0, 0])],
      );

      assertSearchBackendConformanceCase({
        backendName: 'pglite-spike',
        surface: 'semantic-layer',
        caseName: 'pgvector cosine ranking',
        results: semantic.rows.map((row) => ({
          id: row.id,
          score: row.similarity,
          matchReasons: ['semantic'],
        })),
        expectedTopIds: ['warehouse/orders'],
        expectedReasonsById: {
          'warehouse/orders': ['semantic'],
        },
      });

      // Dictionary lane: pg_trgm similarity over stored dictionary values.
      const dictionary = await db.query<{ id: string; value: string; score: number }>(
        `
          SELECT
            connection_id || '/' || source_name AS id,
            value,
            similarity(value, $1) AS score
          FROM spike_dictionary_values
          WHERE similarity(value, $1) > 0
          ORDER BY score DESC, id ASC, value ASC
          LIMIT 2
        `,
        ['refund'],
      );

      assertSearchBackendConformanceCase({
        backendName: 'pglite-spike',
        surface: 'semantic-layer',
        caseName: 'pg_trgm dictionary ranking',
        results: dictionary.rows.map((row) => ({
          id: row.id,
          score: row.score,
          matchReasons: ['dictionary'],
          // NOTE(review): the column is hard-coded rather than selected from column_name.
          dictionaryMatches: [{ column: 'status', values: [row.value] }],
        })),
        expectedTopIds: ['warehouse/orders'],
        expectedReasonsById: {
          'warehouse/orders': ['dictionary'],
        },
        expectedDictionaryMatchesById: {
          'warehouse/orders': [{ column: 'status', values: ['refunded'] }],
        },
      });
    } finally {
      await closeDb(db);
    }
  });

  it('persists indexed rows after reopening the filesystem database', async () => {
    const first = await createSpikeDb(dataDir);

    try {
      await createSchema(first);
      await seedSearchFixture(first);
    } finally {
      await closeDb(first);
    }

    // Reopen the same data directory and confirm the seeded rows are still there.
    const second = await createSpikeDb(dataDir);

    try {
      const persisted = await second.query<{ count: number }>(
        "SELECT COUNT(*)::int AS count FROM spike_documents WHERE metadata->>'connectionId' = $1",
        ['warehouse'],
      );

      expect(persisted.rows[0]).toEqual({ count: 2 });
    } finally {
      await closeDb(second);
    }
  });

  it('records direct concurrency behavior without assuming Postgres server parity', async () => {
    const db = await createSpikeDb(dataDir);

    try {
      await createSchema(db);
      await seedSearchFixture(db);

      // Several reads issued at once against the single in-process handle.
      const reads = await Promise.all(
        Array.from({ length: 4 }, () =>
          db.query<{ count: number }>('SELECT COUNT(*)::int AS count FROM spike_documents'),
        ),
      );

      expect(reads.map((result) => result.rows[0]?.count)).toEqual([3, 3, 3, 3]);

      let secondOpenStatus: 'opened' | 'blocked' = 'opened';
      let second: PGliteDb | undefined;

      // Opening the same data directory a second time may succeed or fail; either outcome is recorded.
      try {
        second = await createSpikeDb(dataDir);
        await second.query('SELECT 1');
      } catch {
        secondOpenStatus = 'blocked';
      } finally {
        if (second) {
          await closeDb(second);
        }
      }

      // Deliberately accepts both outcomes: the test documents behavior instead of pinning it.
      expect(['opened', 'blocked']).toContain(secondOpenStatus);
    } finally {
      await closeDb(db);
    }
  });
});
|
||||
26
packages/context/src/search/query.test.ts
Normal file
26
packages/context/src/search/query.test.ts
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { defaultLaneCandidatePoolLimit, normalizeSearchQuery } from './query.js';
|
||||
|
||||
// Covers query tokenization and the per-lane candidate pool sizing helper.
describe('search query helpers', () => {
  it('normalizes punctuation and duplicate terms into stable lowercase tokens', () => {
    expect(normalizeSearchQuery(' Gross-Revenue, gross_revenue! Paid orders ')).toEqual({
      raw: ' Gross-Revenue, gross_revenue! Paid orders ',
      normalized: 'gross revenue gross_revenue paid orders',
      terms: ['gross', 'revenue', 'gross_revenue', 'paid', 'orders'],
    });
  });

  it('returns an empty normalized query for punctuation-only input', () => {
    expect(normalizeSearchQuery('--- ///')).toEqual({
      raw: '--- ///',
      normalized: '',
      terms: [],
    });
  });

  it('sizes per-lane candidate pools before final limiting', () => {
    // Small limits hit the floor of 25; larger limits are tripled.
    expect(defaultLaneCandidatePoolLimit(1)).toBe(25);
    expect(defaultLaneCandidatePoolLimit(8)).toBe(25);
    expect(defaultLaneCandidatePoolLimit(10)).toBe(30);
  });
});
|
||||
19
packages/context/src/search/query.ts
Normal file
19
packages/context/src/search/query.ts
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
import type { NormalizedSearchQuery } from './types.js';
|
||||
|
||||
/**
 * Tokenizes a free-text query into lowercase terms.
 *
 * The text is lowercased and split on every run of characters outside
 * `[a-z0-9_]`; empty fragments are dropped. Repeated terms are kept, in order.
 *
 * @param queryText Query exactly as supplied by the caller.
 * @returns The untouched input (`raw`), the terms joined by single spaces
 *   (`normalized`), and the term list (`terms`).
 */
export function normalizeSearchQuery(queryText: string): NormalizedSearchQuery {
  // The separator class already consumes whitespace, so fragments never need trimming.
  const terms = queryText
    .toLowerCase()
    .split(/[^a-z0-9_]+/u)
    .filter((term) => term.length > 0);

  return {
    raw: queryText,
    normalized: terms.join(' '),
    terms,
  };
}
|
||||
|
||||
/**
 * Computes how many candidates each lane should fetch before fusion.
 *
 * The pool is three times the final limit with a floor of 25. Limits below 1
 * and `NaN` are treated as 1, and fractional limits are rounded up, so the
 * result is never `NaN` and is an integer for every finite input.
 *
 * @param finalLimit Number of results the caller ultimately wants.
 * @returns The per-lane candidate pool size.
 */
export function defaultLaneCandidatePoolLimit(finalLimit: number): number {
  const minimumPoolSize = 25;
  const poolMultiplier = 3;

  // Math.max propagates NaN, so it has to be handled before clamping.
  const requestedLimit = Number.isNaN(finalLimit) ? 1 : Math.max(1, Math.ceil(finalLimit));

  return Math.max(minimumPoolSize, requestedLimit * poolMultiplier);
}
|
||||
52
packages/context/src/search/rrf.test.ts
Normal file
52
packages/context/src/search/rrf.test.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { compareFusedSearchCandidates, DEFAULT_SEARCH_LANE_WEIGHTS, rrfContribution } from './rrf.js';
|
||||
import type { FusedSearchCandidate } from './types.js';
|
||||
|
||||
// Pins the default lane weights, the weighted RRF formula, and the fused-result ordering.
describe('RRF scoring', () => {
  it('uses the shared lane weights from the hybrid search spec', () => {
    expect(DEFAULT_SEARCH_LANE_WEIGHTS).toEqual({
      semantic: 2,
      dictionary: 2,
      lexical: 1.5,
      token: 0.75,
    });
  });

  it('calculates a weighted RRF contribution with k=60 by default', () => {
    expect(rrfContribution(2, 1)).toBeCloseTo(2 / 61, 12);
    expect(rrfContribution(1.5, 2)).toBeCloseTo(1.5 / 62, 12);
  });

  it('sorts fused candidates by score, lane count, and stable id', () => {
    const first: FusedSearchCandidate = {
      id: 'orders',
      score: 0.05,
      matchReasons: ['lexical'],
      ranksByLane: { lexical: 1 },
      rawScoresByLane: {},
      evidenceByLane: {},
    };
    // Same score as `first` but matched by two lanes, so it must sort ahead.
    const second: FusedSearchCandidate = {
      id: 'customers',
      score: 0.05,
      matchReasons: ['lexical', 'semantic'],
      ranksByLane: { lexical: 2, semantic: 1 },
      rawScoresByLane: {},
      evidenceByLane: {},
    };
    const third: FusedSearchCandidate = {
      id: 'accounts',
      score: 0.04,
      matchReasons: ['semantic'],
      ranksByLane: { semantic: 1 },
      rawScoresByLane: {},
      evidenceByLane: {},
    };

    expect([first, second, third].sort(compareFusedSearchCandidates).map((candidate) => candidate.id)).toEqual([
      'customers',
      'orders',
      'accounts',
    ]);
  });
});
|
||||
18
packages/context/src/search/rrf.ts
Normal file
18
packages/context/src/search/rrf.ts
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import type { FusedSearchCandidate, SearchLaneName } from './types.js';
|
||||
|
||||
/** Default `rrfK` constant added to a rank in the denominator of `rrfContribution`. */
export const DEFAULT_RRF_K = 60;

/**
 * Default weight for each of the four built-in search lanes; the weight is the
 * numerator of that lane's `rrfContribution`.
 */
export const DEFAULT_SEARCH_LANE_WEIGHTS: Record<SearchLaneName, number> = {
  semantic: 2,
  dictionary: 2,
  lexical: 1.5,
  token: 0.75,
};
|
||||
|
||||
/**
 * Computes one lane's weighted reciprocal-rank-fusion contribution for a
 * candidate: `weight / (rrfK + rank)`.
 *
 * @param weight - Weight of the lane that produced the rank.
 * @param rank - The candidate's rank within that lane.
 * @param rrfK - Constant added to the rank; defaults to `DEFAULT_RRF_K`.
 * @returns The contribution to add to the candidate's fused score.
 */
export function rrfContribution(weight: number, rank: number, rrfK = DEFAULT_RRF_K): number {
  const denominator = rrfK + rank;
  return weight / denominator;
}
|
||||
|
||||
/**
 * Sort comparator for fused candidates.
 *
 * Ordering, in priority order:
 * 1. higher `score` first;
 * 2. on a score tie, the candidate with more `matchReasons` first;
 * 3. otherwise by `id` using `String.prototype.localeCompare`.
 *
 * @returns A negative number when `left` sorts before `right`, a positive
 *   number when it sorts after, and 0 when all three keys tie.
 */
export function compareFusedSearchCandidates(left: FusedSearchCandidate, right: FusedSearchCandidate): number {
  // Truthiness checks are deliberate: both 0 and NaN fall through to the next key.
  const scoreDelta = right.score - left.score;
  if (scoreDelta) {
    return scoreDelta;
  }

  const laneCountDelta = right.matchReasons.length - left.matchReasons.length;
  if (laneCountDelta) {
    return laneCountDelta;
  }

  return left.id.localeCompare(right.id);
}
|
||||
85
packages/context/src/search/types.ts
Normal file
85
packages/context/src/search/types.ts
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
/**
 * Name of a search lane.
 *
 * The four literals are the lanes that have an entry in
 * `DEFAULT_SEARCH_LANE_WEIGHTS`. Because the union also includes `string`, the
 * type checker treats this alias as plain `string`, so any lane name is
 * accepted and `Record<SearchLaneName, T>` behaves like `Record<string, T>`.
 */
export type SearchLaneName =
  | 'lexical'
  | 'semantic'
  | 'dictionary'
  | 'token'
  | string;
|
||||
|
||||
/** Outcome reported for a single search lane. */
export type SearchLaneStatus =
  | 'available'
  | 'skipped'
  | 'failed';
|
||||
|
||||
/**
 * A search query together with its tokenized form, as produced by
 * `normalizeSearchQuery` in `query.ts`.
 */
export interface NormalizedSearchQuery {
  /** The query text exactly as it was supplied. */
  raw: string;
  /** The terms joined by single spaces. */
  normalized: string;
  /** Lowercase terms extracted from the raw text, in order of appearance. */
  terms: string[];
}
|
||||
|
||||
/** One candidate emitted by a single search lane, before fusion. */
export interface SearchCandidate {
  /** Identifier of the matched item. */
  id: string;
  /** Position of the candidate within its lane (presumably 1-based — confirm against the fusion code). */
  rank: number;
  /** Optional lane-specific score. */
  rawScore?: number;
  /** Optional description of why the candidate matched. */
  matchReason?: string;
  /** Optional lane-specific supporting data; the shape is not constrained here. */
  evidence?: unknown;
}
|
||||
|
||||
/** Arguments handed to `SearchCandidateGenerator.generate`. */
export interface SearchCandidateGeneratorArgs {
  /** The query text (presumably the caller's raw input — confirm against the caller). */
  queryText: string;
  /** Tokenized form of the query. */
  normalizedQuery: NormalizedSearchQuery;
  /** Number of results requested after fusion. */
  finalLimit: number;
  /** Number of candidates the lane is asked to produce. */
  laneCandidatePoolLimit: number;
}
|
||||
|
||||
/** What a lane returns from `SearchCandidateGenerator.generate`. */
export interface SearchLaneResult {
  /** Optional lane outcome (how an omitted status is interpreted is decided by the consumer — not visible here). */
  status?: SearchLaneStatus;
  /** Candidates produced by the lane. */
  candidates: SearchCandidate[];
  /** Optional pool limit the lane actually applied. */
  effectiveCandidatePoolLimit?: number;
  /** Optional free-form explanation accompanying the status. */
  reason?: string;
}
|
||||
|
||||
/** A single search lane: a named, optionally weighted source of candidates. */
export interface SearchCandidateGenerator {
  /** Name of the lane this generator feeds. */
  lane: SearchLaneName;
  /** Optional weight for this lane (precedence relative to other weight sources is decided by the consumer — not visible here). */
  weight?: number;
  /** Asynchronously produces the lane's candidates for the given query arguments. */
  generate(args: SearchCandidateGeneratorArgs): Promise<SearchLaneResult>;
}
|
||||
|
||||
/** Options describing one hybrid (multi-lane) search request. */
export interface HybridSearchOptions {
  /** The query text to search for. */
  queryText: string;
  /** Number of fused results requested. */
  limit: number;
  /** Optional per-lane candidate pool size (the fallback when omitted is chosen by the consumer — presumably `defaultLaneCandidatePoolLimit`; confirm). */
  candidatePoolLimit?: number;
  /** Optional RRF constant (presumably falls back to `DEFAULT_RRF_K` — confirm against the consumer). */
  rrfK?: number;
  /** Optional per-lane weight overrides; lanes may be omitted. */
  laneWeights?: Partial<Record<SearchLaneName, number>>;
  /** The lanes to query. */
  generators: SearchCandidateGenerator[];
}
|
||||
|
||||
/** Per-lane diagnostics reported alongside fused results. */
export interface SearchLaneBreakdown {
  /** Name of the lane being described. */
  lane: SearchLaneName;
  /** Outcome of the lane. */
  status: SearchLaneStatus;
  /** Candidate pool size that was requested from the lane. */
  requestedCandidatePoolLimit: number;
  /** Candidate pool size that was actually in effect for the lane. */
  effectiveCandidatePoolLimit: number;
  /** Number of candidates the lane returned. */
  returnedCandidateCount: number;
  /** Weight applied to the lane. */
  weight: number;
  /** Optional free-form explanation accompanying the status. */
  reason?: string;
}
|
||||
|
||||
/**
 * A candidate after fusion across lanes. `compareFusedSearchCandidates` in
 * `rrf.ts` orders these by `score`, then `matchReasons.length`, then `id`.
 */
export interface FusedSearchCandidate {
  /** Identifier of the matched item. */
  id: string;
  /** Fused score; higher sorts first. */
  score: number;
  /** Lane names associated with this candidate (presumably the lanes that matched it — confirm). */
  matchReasons: SearchLaneName[];
  /** Rank of the candidate keyed by lane name. */
  ranksByLane: Record<SearchLaneName, number>;
  /** Lane-specific raw score keyed by lane name. */
  rawScoresByLane: Record<SearchLaneName, number>;
  /** Lane-specific evidence items keyed by lane name. */
  evidenceByLane: Record<SearchLaneName, unknown[]>;
}
|
||||
|
||||
/**
 * Converts fused candidates into caller-facing results.
 *
 * @typeParam TResult - The result type produced by the hydrator.
 */
export interface SearchResultHydrator<TResult> {
  /** Asynchronously maps the given fused candidates to `TResult` values. */
  hydrate(candidates: FusedSearchCandidate[]): Promise<TResult[]>;
}
|
||||
|
||||
/** Outcome of a hybrid search: the fused candidates plus per-lane diagnostics. */
export interface HybridSearchResult {
  /** The query in raw and tokenized form. */
  query: NormalizedSearchQuery;
  /** Number of results that was requested. */
  requestedLimit: number;
  /** Per-lane candidate pool size that was requested. */
  requestedCandidatePoolLimit: number;
  /** The fused candidates. */
  results: FusedSearchCandidate[];
  /** One diagnostics entry per lane. */
  lanes: SearchLaneBreakdown[];
}
|
||||
|
||||
/** Feature flags declaring which search features a backend provides. */
export interface SearchBackendCapabilities {
  /** Whether the backend provides full-text search. */
  fts: boolean;
  /** Whether the backend provides vector search. */
  vector: boolean;
  /** Whether the backend provides fuzzy matching. */
  fuzzy: boolean;
  /** Whether the backend can search JSON values. */
  jsonSearch: boolean;
  /** Whether the backend can search array values. */
  arraySearch: boolean;
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue