Initial open-source release

This commit is contained in:
Andrey Avtomonov 2026-05-10 23:12:26 +02:00
commit 1a42152e6f
1199 changed files with 257054 additions and 0 deletions

View file

@ -0,0 +1,472 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, it } from 'vitest';
import { SqliteContextEvidenceStore } from '../ingest/context-evidence/index.js';
import type { JsonValue } from '../ingest/ports.js';
import { initKloProject, type KloLocalProject } from '../project/index.js';
import { type LocalSlSourceSearchResult, searchLocalSlSources, writeLocalSlSource } from '../sl/local-sl.js';
import type { ContextEvidenceSearchResult } from '../tools/context-evidence-tool-store.js';
import {
type LocalKnowledgeSearchResult,
searchLocalKnowledgePages,
writeLocalKnowledgePage,
} from '../wiki/local-knowledge.js';
import {
assertSearchBackendCapabilities,
assertSearchBackendConformanceCase,
type SearchBackendConformanceResult,
} from './backend-conformance.js';
import type { SearchBackendCapabilities } from './types.js';
/**
 * Capability flags this suite asserts for the SQLite search backend.
 * `satisfies` validates the literal against SearchBackendCapabilities without widening
 * the inferred literal types.
 */
const SQLITE_SEARCH_CAPABILITIES = {
fts: true,
vector: false,
fuzzy: false,
jsonSearch: true,
arraySearch: false,
} satisfies SearchBackendCapabilities;
/**
 * Semantic-layer YAML for the `orders` source backed by `public.orders`.
 * seedSemanticLayerProject writes it under the `warehouse` connection.
 * The trailing empty entry makes the joined text end with a newline.
 */
const ORDERS_YAML = [
'name: orders',
'table: public.orders',
'grain:',
' - order_id',
'columns:',
' - name: order_id',
' type: string',
' - name: revenue',
' type: number',
'measures:',
' - name: total_revenue',
' expr: sum(revenue)',
'',
].join('\n');
/**
 * Semantic-layer YAML for a second `orders` source, backed by `finance.orders`.
 * seedSemanticLayerProject writes it under the `finance` connection, so two sources
 * share the name `orders` across connections.
 * The trailing empty entry makes the joined text end with a newline.
 */
const FINANCE_ORDERS_YAML = [
'name: orders',
'description: Finance orders used for invoice reconciliation.',
'table: finance.orders',
'grain:',
' - order_id',
'columns:',
' - name: order_id',
' type: string',
' - name: invoice_status',
' type: string',
'',
].join('\n');
/**
 * Deterministic stand-in for an embedding service.
 *
 * Any text containing "semantic revenue" (case-insensitive) maps to `[1, 0]`;
 * every other text maps to `[0, 1]`.
 */
class FakeEmbeddingPort {
  readonly maxBatchSize = 16;

  /** Returns the two-dimensional fake vector for a single text. */
  async computeEmbedding(text: string): Promise<number[]> {
    const mentionsSemanticRevenue = text.toLowerCase().includes('semantic revenue');
    if (mentionsSemanticRevenue) {
      return [1, 0];
    }
    return [0, 1];
  }

  /** Embeds every text via `computeEmbedding`, preserving input order. */
  async computeEmbeddingsBulk(texts: string[]): Promise<number[][]> {
    const pending: Array<Promise<number[]>> = [];
    for (const text of texts) {
      pending.push(this.computeEmbedding(text));
    }
    return Promise.all(pending);
  }
}
/**
 * Adapts a semantic-layer search hit to the backend-agnostic conformance shape.
 * The id is `<connectionId>/<name>`; a missing score becomes 0 and missing match
 * reasons become an empty list.
 */
function toSlConformanceResult(result: LocalSlSourceSearchResult): SearchBackendConformanceResult {
  const { connectionId, name, lanes, dictionaryMatches } = result;
  const score = result.score ?? 0;
  const matchReasons = result.matchReasons ?? [];
  return { id: `${connectionId}/${name}`, score, matchReasons, lanes, dictionaryMatches };
}
/**
 * Adapts a wiki (knowledge page) search hit to the conformance shape,
 * using the page key as the result id.
 */
function toWikiConformanceResult(result: LocalKnowledgeSearchResult): SearchBackendConformanceResult {
  const { key, score, matchReasons, lanes } = result;
  return { id: key, score, matchReasons, lanes };
}
/**
 * Adapts a context-evidence chunk hit to the conformance shape.
 * The id is `<externalId>:<stableCitationKey>`; missing match reasons become an empty list.
 */
function toContextConformanceResult(result: ContextEvidenceSearchResult): SearchBackendConformanceResult {
  const { externalId, stableCitationKey, score, lanes } = result;
  const matchReasons = result.matchReasons ?? [];
  return { id: `${externalId}:${stableCitationKey}`, score, matchReasons, lanes };
}
/**
 * Seeds the project with two semantic-layer sources that are both named `orders`
 * (connections `warehouse` and `finance`), then writes a relationship-profile JSON
 * file for the `warehouse` connection whose `orders.status` column carries the
 * sample values `paid` and `refunded`.
 */
async function seedSemanticLayerProject(project: KloLocalProject): Promise<void> {
  // Written sequentially, warehouse first, to keep the write order stable.
  const sources = [
    ['warehouse', ORDERS_YAML],
    ['finance', FINANCE_ORDERS_YAML],
  ] as const;
  for (const [connectionId, yaml] of sources) {
    await writeLocalSlSource(project, { connectionId, sourceName: 'orders', yaml });
  }

  const statusColumnProfile = {
    table: { catalog: null, db: 'public', name: 'orders' },
    column: 'status',
    nativeType: 'text',
    normalizedType: 'string',
    rowCount: 10,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 0.2,
    nullRate: 0,
    sampleValues: ['paid', 'refunded'],
    minTextLength: 4,
    maxTextLength: 8,
  };
  const relationshipProfile = {
    connectionId: 'warehouse',
    driver: 'postgres',
    sqlAvailable: true,
    queryCount: 2,
    tables: [],
    columns: { 'orders.status': statusColumnProfile },
    warnings: [],
  };
  // Pretty-printed with a two-space indent and a trailing newline.
  const serializedProfile = `${JSON.stringify(relationshipProfile, null, 2)}\n`;

  await project.fileStore.writeFile(
    'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
    serializedProfile,
    'klo',
    'klo@example.com',
    'Seed dictionary profile',
  );
}
/**
 * Seeds two GLOBAL knowledge pages: `metrics/revenue`, which references the
 * `orders` semantic-layer source, and `support/escalations`.
 */
async function seedWikiProject(project: KloLocalProject): Promise<void> {
  // Annotated with the writer's own input type so literal fields keep their contextual types.
  const pages: Array<Parameters<typeof writeLocalKnowledgePage>[1]> = [
    {
      key: 'metrics/revenue',
      scope: 'GLOBAL',
      summary: 'Semantic revenue definition',
      content: 'Revenue is recognized when an order is paid.',
      tags: ['finance'],
      refs: ['semantic-layer/warehouse/orders.yaml'],
      slRefs: ['orders'],
    },
    {
      key: 'support/escalations',
      scope: 'GLOBAL',
      summary: 'Support escalation process',
      content: 'Escalations move urgent support tickets to the operations queue.',
      tags: ['operations'],
    },
  ];
  // Written one at a time, in declaration order.
  for (const page of pages) {
    await writeLocalKnowledgePage(project, page);
  }
}
/**
 * Upserts one Notion-style page document with a single `intro` chunk into the given
 * context-evidence store and returns the ids needed by assertions.
 *
 * Every field of `input` is optional; omitted fields fall back to a "Revenue Policy"
 * page (`page-1`, `run-1`, `sync-1`).
 *
 * @param subject Store that receives the document and its chunk.
 * @param input Overrides for the seeded document. `embedding: null` is passed through
 *   to the chunk as `null`; only an omitted `embedding` falls back to `[1, 0, 0]`.
 * @returns The id of the upserted document and the id of its first chunk as read back.
 * @throws Error when the document cannot be read back or comes back without chunks.
 */
async function seedContextDocument(
  subject: SqliteContextEvidenceStore,
  input: {
    runId?: string;
    syncId?: string;
    externalId?: string;
    title?: string;
    rawPath?: string;
    metadata?: JsonValue;
    publishState?: 'pending' | 'published';
    embedding?: number[] | null;
    content?: string;
    searchText?: string;
  } = {},
): Promise<{ documentId: string; chunkId: string }> {
  const runId = input.runId ?? 'run-1';
  const syncId = input.syncId ?? 'sync-1';
  const externalId = input.externalId ?? 'page-1';
  const title = input.title ?? 'Revenue Policy';
  const rawPath = input.rawPath ?? `pages/${externalId}/page.md`;
  // `??` would also replace an explicit null, making "no embedding" impossible to seed.
  const embedding = input.embedding === undefined ? [1, 0, 0] : input.embedding;

  const doc = await subject.upsertDocument({
    runId,
    connectionId: 'conn-1',
    sourceKey: 'notion',
    externalId,
    externalParentId: null,
    databaseId: null,
    dataSourceId: null,
    title,
    path: `Company Handbook / ${title}`,
    url: `https://notion.test/${externalId}`,
    objectType: 'page',
    lastEditedAt: new Date('2026-04-30T10:00:00.000Z'),
    lastEditedBy: 'user-1',
    rawPath,
    syncId,
    contentHash: `hash-${externalId}`,
    publishState: input.publishState ?? 'published',
    metadata: input.metadata ?? {},
  });

  await subject.replaceChunks(doc.id, [
    {
      chunkKey: 'intro',
      headingPath: ['Policy'],
      ordinal: 0,
      content: input.content ?? `${title} requires approval from the accountable owner.`,
      searchText: input.searchText ?? `${title} approval accountable owner`,
      embedding,
      tokenCount: 8,
      citation: {
        source: 'notion',
        pageId: externalId,
        title,
        syncId,
        rawPath,
      },
      stableCitationKey: `notion:${externalId}:intro`,
      syncId,
      contentHash: `chunk-${externalId}`,
    },
  ]);

  // Read back through the store so the returned chunk id is the persisted one.
  const read = await subject.readDocumentByExternalId('conn-1', 'notion', externalId, runId);
  if (!read) {
    throw new Error(`seeded document ${externalId} was not readable`);
  }
  const firstChunk = read.chunks[0];
  if (!firstChunk) {
    throw new Error(`seeded document ${externalId} was read back without chunks`);
  }
  return { documentId: doc.id, chunkId: firstChunk.id };
}
// Conformance suite for the SQLite-backed search surfaces: semantic layer, wiki, and
// context evidence. Each case runs a real search against seeded fixtures and checks
// ranking, match reasons, and per-lane status through assertSearchBackendConformanceCase.
describe('SQLite hybrid search backend conformance', () => {
let tempDir: string;
let project: KloLocalProject;
let dbPath: string;
// Every test gets a fresh temp directory, a project under <tempDir>/project, and a
// database path at <tempDir>/.klo/db.sqlite.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-search-conformance-'));
project = await initKloProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
dbPath = join(tempDir, '.klo', 'db.sqlite');
});
// Remove the whole temp tree; `force` keeps cleanup from failing if it is already gone.
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
// Pins the declared capability flags so a change to them has to be made deliberately.
it('documents SQLite search backend capabilities', () => {
assertSearchBackendCapabilities({
backendName: 'sqlite',
capabilities: SQLITE_SEARCH_CAPABILITIES,
expected: {
fts: true,
vector: false,
fuzzy: false,
jsonSearch: true,
arraySearch: false,
},
});
});
it('keeps semantic-layer global ranking, dictionary evidence, and token fallback stable', async () => {
await seedSemanticLayerProject(project);
// Case 1: no connection filter. Both `orders` sources must match lexically, with
// finance ranked ahead of warehouse; no embedding service is passed, so the semantic
// lane is expected to be skipped.
const global = await searchLocalSlSources(project, { query: 'orders', limit: 5 });
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'semantic-layer',
caseName: 'global source ranking',
results: global.map(toSlConformanceResult),
expectedTopIds: ['finance/orders', 'warehouse/orders'],
expectedReasonsById: {
'finance/orders': ['lexical'],
'warehouse/orders': ['lexical'],
},
expectedLanes: {
lexical: { status: 'available' },
semantic: { status: 'skipped', reason: 'embedding_unconfigured' },
},
});
// Case 2: 'refunded' appears only as a sample value of orders.status in the seeded
// relationship profile, so the hit must come from the dictionary lane and carry that
// column/value as evidence.
const dictionary = await searchLocalSlSources(project, {
connectionId: 'warehouse',
query: 'refunded',
limit: 5,
});
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'semantic-layer',
caseName: 'dictionary source evidence',
results: dictionary.map(toSlConformanceResult),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['dictionary'],
},
expectedLanes: {
dictionary: { status: 'available' },
semantic: { status: 'skipped', reason: 'embedding_unconfigured' },
},
expectedDictionaryMatchesById: {
'warehouse/orders': [{ column: 'status', values: ['refunded'] }],
},
});
// Case 3: a query with trailing punctuation must still find the source, with the
// token lane reported as the match reason.
const token = await searchLocalSlSources(project, {
connectionId: 'warehouse',
query: 'orders---',
limit: 5,
});
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'semantic-layer',
caseName: 'token fallback reason',
results: token.map(toSlConformanceResult),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['token'],
},
expectedLanes: {
token: { status: 'available' },
},
});
});
it('keeps wiki lexical, semantic, and token behavior stable', async () => {
await seedWikiProject(project);
// Lexical: 'paid order' matches the revenue page content; no embedding service is
// passed, so the semantic lane is expected to be skipped.
const lexical = await searchLocalKnowledgePages(project, {
query: 'paid order',
userId: 'local',
limit: 5,
});
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'wiki',
caseName: 'lexical page ranking',
results: lexical.map(toWikiConformanceResult),
expectedTopIds: ['metrics/revenue'],
expectedReasonsById: {
'metrics/revenue': ['lexical'],
},
expectedLanes: {
lexical: { status: 'available' },
semantic: { status: 'skipped', reason: 'embedding_unconfigured' },
},
});
// Semantic: with the fake embedding service supplied, the semantic lane must be
// available and contribute a match reason for the revenue page.
const semantic = await searchLocalKnowledgePages(project, {
query: 'semantic revenue',
userId: 'local',
limit: 5,
embeddingService: new FakeEmbeddingPort(),
});
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'wiki',
caseName: 'semantic page ranking',
results: semantic.map(toWikiConformanceResult),
expectedTopIds: ['metrics/revenue'],
expectedReasonsById: {
'metrics/revenue': ['semantic'],
},
expectedLanes: {
semantic: { status: 'available' },
},
});
// Token: a query with trailing punctuation must still find the page via the token lane.
const token = await searchLocalKnowledgePages(project, {
query: 'paid---',
userId: 'local',
limit: 5,
});
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'wiki',
caseName: 'token page fallback',
results: token.map(toWikiConformanceResult),
expectedTopIds: ['metrics/revenue'],
expectedReasonsById: {
'metrics/revenue': ['token'],
},
expectedLanes: {
token: { status: 'available' },
},
});
});
it('keeps context-evidence lane fusion and token fallback stable', async () => {
const subject = new SqliteContextEvidenceStore({ dbPath });
// Three documents: the discount page shares the query embedding [1, 0, 0] and the
// query terms; the owner page is close in embedding only; the expense page is neither.
await seedContextDocument(subject, {
externalId: 'page-discount',
title: 'Enterprise Discount Policy',
content: 'Enterprise discounts require finance approval before quote approval.',
searchText: 'enterprise discount finance approval quote',
embedding: [1, 0, 0],
});
await seedContextDocument(subject, {
externalId: 'page-owner',
title: 'Accountable Owner Policy',
content: 'Every policy has an accountable owner and review date.',
searchText: 'accountable owner review date',
embedding: [0.95, 0.05, 0],
});
await seedContextDocument(subject, {
externalId: 'page-expense',
title: 'Expense Policy',
content: 'Expense reimbursement requires receipt review.',
searchText: 'expense reimbursement receipt review',
embedding: [0, 1, 0],
});
// Fused search with both query text and query embedding: the discount chunk must rank
// first and be credited to the lexical, semantic, and token lanes.
const fused = await subject.searchRRF({
connectionId: 'conn-1',
sourceKey: 'notion',
queryEmbedding: [1, 0, 0],
queryText: 'enterprise discount approval',
limit: 2,
includeDeleted: false,
});
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'context-evidence',
caseName: 'chunk lane fusion',
results: fused.map(toContextConformanceResult),
expectedTopIds: ['page-discount:notion:page-discount:intro'],
expectedReasonsById: {
'page-discount:notion:page-discount:intro': ['lexical', 'semantic', 'token'],
},
expectedLanes: {
lexical: { status: 'available' },
semantic: { status: 'available' },
token: { status: 'available' },
},
});
// Separate database file so the fallback case only sees the C++ document.
const tokenSubject = new SqliteContextEvidenceStore({ dbPath: join(tempDir, 'token.sqlite') });
await seedContextDocument(tokenSubject, {
externalId: 'page-cpp',
title: 'C++ Warehouse Notes',
content: 'C++ parser notes for warehouse extraction.',
searchText: 'C++ parser warehouse extraction',
embedding: null,
});
// Query '++' with no query embedding: the lexical lane is expected to be skipped with
// reason 'fts_query_empty', the semantic lane with 'embedding_unconfigured', leaving
// the token lane to produce the hit.
const token = await tokenSubject.searchRRF({
connectionId: 'conn-1',
sourceKey: 'notion',
queryEmbedding: null,
queryText: '++',
limit: 5,
includeDeleted: false,
});
assertSearchBackendConformanceCase({
backendName: 'sqlite',
surface: 'context-evidence',
caseName: 'fts-empty token fallback',
results: token.map(toContextConformanceResult),
expectedTopIds: ['page-cpp:notion:page-cpp:intro'],
expectedReasonsById: {
'page-cpp:notion:page-cpp:intro': ['token'],
},
expectedLanes: {
lexical: { status: 'skipped', reason: 'fts_query_empty' },
semantic: { status: 'skipped', reason: 'embedding_unconfigured' },
token: { status: 'available' },
},
});
});
});

View file

@ -0,0 +1,151 @@
import type { SearchBackendCapabilities, SearchLaneStatus } from './types.js';
/** One lane entry reported on a search result: the lane's name, its status, and an optional reason. */
export interface SearchBackendConformanceLane {
lane: string;
status: SearchLaneStatus;
reason?: string;
}
/**
 * Dictionary evidence attached to a result: a column and the values that matched in it.
 * When compared, values are order-insensitive and a missing `overflowCount` counts as 0.
 */
export interface SearchBackendConformanceDictionaryMatch {
column: string;
values: readonly string[];
overflowCount?: number;
}
/**
 * Backend-agnostic view of one search result, as consumed by
 * assertSearchBackendConformanceCase. Results are expected in ranked order.
 */
export interface SearchBackendConformanceResult {
id: string;
/** Must be a positive finite number for any result named in `expectedTopIds`. */
score: number;
matchReasons: readonly string[];
lanes?: readonly SearchBackendConformanceLane[];
dictionaryMatches?: readonly SearchBackendConformanceDictionaryMatch[];
}
/**
 * Expectation for one lane. `status` is always compared; `reason` is compared only
 * when it is provided.
 */
export interface ExpectedSearchBackendConformanceLane {
status: SearchLaneStatus;
reason?: string;
}
/** Input to assertSearchBackendConformanceCase: the actual results plus the expectations to check. */
export interface AssertSearchBackendConformanceCaseInput {
/** `backendName`, `surface`, and `caseName` are only used to label failure messages. */
backendName: string;
surface: string;
caseName: string;
/** Actual results, in ranked order. */
results: readonly SearchBackendConformanceResult[];
/** Ids expected at the top of `results`, compared position by position. */
expectedTopIds: readonly string[];
/** Per result id, match reasons that must be included (extra reasons are allowed). */
expectedReasonsById?: Record<string, readonly string[]>;
/** Per lane name, the status (and optionally reason) expected on the first reported entry for that lane. */
expectedLanes?: Record<string, ExpectedSearchBackendConformanceLane>;
/** Per result id, dictionary matches that must be present (extra matches are allowed). */
expectedDictionaryMatchesById?: Record<string, readonly SearchBackendConformanceDictionaryMatch[]>;
}
/**
 * Input to assertSearchBackendCapabilities. Only the capabilities listed in
 * `expected` are compared against `capabilities`.
 */
export interface AssertSearchBackendCapabilitiesInput {
/** Used to label the failure message. */
backendName: string;
capabilities: SearchBackendCapabilities;
expected: Partial<SearchBackendCapabilities>;
}
/** Builds the human-readable prefix used in failure messages for one conformance case. */
function caseLabel(
  input: Pick<AssertSearchBackendConformanceCaseInput, 'backendName' | 'surface' | 'caseName'>,
): string {
  const { backendName, surface, caseName } = input;
  return `${backendName} ${surface} conformance case "${caseName}"`;
}
/**
 * Throws a single Error whose message is `<label> failed:` followed by one
 * `- <failure>` bullet per line.
 */
function fail(label: string, failures: string[]): never {
  const bullets = failures.map((failure) => `- ${failure}`);
  const message = [`${label} failed:`].concat(bullets).join('\n');
  throw new Error(message);
}
/**
 * Serializes a dictionary match to a comparable key: `<column>:<sorted values>:<overflowCount>`.
 * Values are sorted on a copy, so the key is order-insensitive and the input is not mutated.
 */
function dictionaryMatchKey(match: SearchBackendConformanceDictionaryMatch): string {
  const sortedValues = Array.from(match.values);
  sortedValues.sort((a, b) => a.localeCompare(b));
  const overflow = match.overflowCount ?? 0;
  return `${match.column}:${sortedValues.join(',')}:${overflow}`;
}
/** Returns the sorted comparison keys for a list of dictionary matches; a missing list yields `[]`. */
function dictionaryMatchKeys(matches: readonly SearchBackendConformanceDictionaryMatch[] | undefined): string[] {
  if (matches == null) {
    return [];
  }
  const keys = matches.map((match) => dictionaryMatchKey(match));
  keys.sort((a, b) => a.localeCompare(b));
  return keys;
}
/**
 * Checks one search conformance case and throws a single Error listing every violation.
 *
 * Checks run in this order, and all violations are collected before throwing:
 * 1. the leading results match `expectedTopIds` position by position;
 * 2. every expected top id that is present has a positive finite score;
 * 3. every id in `expectedReasonsById` is present and includes each listed reason;
 * 4. every lane in `expectedLanes` is reported by some result with the expected status
 *    (and reason, when one is given);
 * 5. every id in `expectedDictionaryMatchesById` is present and carries each listed match.
 *
 * @throws Error when at least one check fails.
 */
export function assertSearchBackendConformanceCase(input: AssertSearchBackendConformanceCaseInput): void {
  const label = caseLabel(input);
  const problems: string[] = [];
  const { results, expectedTopIds } = input;

  // 1. Ranking: compare position by position against the head of the result list.
  for (let position = 0; position < expectedTopIds.length; position += 1) {
    const wantedId = expectedTopIds[position];
    const foundId = results[position]?.id;
    if (foundId !== wantedId) {
      problems.push(`expected result ${position + 1} to be ${wantedId}, got ${foundId ?? '<missing>'}`);
    }
  }

  const resultsById = new Map(results.map((result) => [result.id, result]));

  // 2. Scores: a missing result was already reported by the ranking check.
  for (const wantedId of expectedTopIds) {
    const hit = resultsById.get(wantedId);
    if (hit !== undefined && !(Number.isFinite(hit.score) && hit.score > 0)) {
      problems.push(`expected ${wantedId} to have a positive finite score, got ${hit.score}`);
    }
  }

  // 3. Match reasons: expected reasons must be a subset of the actual ones.
  for (const [id, wantedReasons] of Object.entries(input.expectedReasonsById ?? {})) {
    const hit = resultsById.get(id);
    if (hit === undefined) {
      problems.push(`expected reasons for ${id}, but the result was missing`);
    } else {
      const missingReasons = wantedReasons.filter((reason) => !hit.matchReasons.includes(reason));
      for (const reason of missingReasons) {
        problems.push(`expected ${id} to include match reason ${reason}, got [${hit.matchReasons.join(', ')}]`);
      }
    }
  }

  // 4. Lanes: the first entry reported for a lane, across all results, is the one compared.
  const reportedLanes = results.flatMap((result) => result.lanes ?? []);
  for (const [laneName, wanted] of Object.entries(input.expectedLanes ?? {})) {
    const reported = reportedLanes.find((entry) => entry.lane === laneName);
    if (reported === undefined) {
      problems.push(`expected lane ${laneName} to be reported`);
    } else {
      if (reported.status !== wanted.status) {
        problems.push(`expected lane ${laneName} status ${wanted.status}, got ${reported.status}`);
      }
      if (wanted.reason !== undefined && reported.reason !== wanted.reason) {
        problems.push(`expected lane ${laneName} reason ${wanted.reason}, got ${reported.reason ?? '<missing>'}`);
      }
    }
  }

  // 5. Dictionary evidence: expected matches must be a subset of the actual ones.
  for (const [id, wantedMatches] of Object.entries(input.expectedDictionaryMatchesById ?? {})) {
    const hit = resultsById.get(id);
    if (hit === undefined) {
      problems.push(`expected dictionary matches for ${id}, but the result was missing`);
    } else {
      const actualKeys = dictionaryMatchKeys(hit.dictionaryMatches);
      const missingKeys = dictionaryMatchKeys(wantedMatches).filter((key) => !actualKeys.includes(key));
      for (const key of missingKeys) {
        problems.push(`expected ${id} dictionary evidence ${key}, got [${actualKeys.join(', ')}]`);
      }
    }
  }

  if (problems.length > 0) {
    fail(label, problems);
  }
}
/**
 * Verifies that a backend's declared capabilities match the expected flags.
 * Only the keys present in `input.expected` are compared; all mismatches are
 * reported together.
 *
 * @throws Error when at least one listed capability differs.
 */
export function assertSearchBackendCapabilities(input: AssertSearchBackendCapabilitiesInput): void {
  const expectedEntries = Object.entries(input.expected) as Array<[keyof SearchBackendCapabilities, boolean]>;
  const mismatches = expectedEntries
    .filter(([capability, expected]) => input.capabilities[capability] !== expected)
    .map(([capability, expected]) => `expected ${capability}=${expected}, got ${input.capabilities[capability]}`);
  if (mismatches.length === 0) {
    return;
  }
  fail(`${input.backendName} search backend capabilities`, mismatches);
}

View file

@ -0,0 +1,127 @@
import { describe, expect, it } from 'vitest';
import { HybridSearchCore } from './hybrid-search-core.js';
import type { SearchCandidateGenerator } from './types.js';
/**
 * Builds a stub lane generator that always yields the given candidates.
 *
 * @param lane Lane name reported by the generator.
 * @param candidates Candidates returned, as the same array, on every `generate()` call.
 * @param weight Optional per-generator weight.
 */
function generator(
  lane: string,
  candidates: Array<{ id: string; rank: number; rawScore?: number; matchReason?: string; evidence?: unknown }>,
  weight?: number,
): SearchCandidateGenerator {
  const generate = async () => ({ candidates });
  return { lane, weight, generate };
}
// Unit tests for HybridSearchCore using in-memory lane generators only.
describe('HybridSearchCore', () => {
// With limit 1, every generator must still receive the shared pool size (25) and the
// final limit (1). 'customers' is expected to win: it is only second in the lexical
// lane but also first in the semantic lane, while 'orders' appears in one lane only.
it('runs lane generators with the shared pool size and applies final limit after RRF fusion', async () => {
const calls: Array<{ lane: string; laneCandidatePoolLimit: number; finalLimit: number }> = [];
const core = new HybridSearchCore();
const result = await core.search({
queryText: 'gross revenue',
limit: 1,
generators: [
{
lane: 'lexical',
async generate(args) {
calls.push({ lane: 'lexical', ...args });
return {
candidates: [
{ id: 'orders', rank: 1, rawScore: 0.8 },
{ id: 'customers', rank: 2, rawScore: 0.7 },
],
};
},
},
{
lane: 'semantic',
async generate(args) {
calls.push({ lane: 'semantic', ...args });
return { candidates: [{ id: 'customers', rank: 1, rawScore: 0.91 }] };
},
},
],
});
expect(calls).toEqual([
expect.objectContaining({ lane: 'lexical', laneCandidatePoolLimit: 25, finalLimit: 1 }),
expect.objectContaining({ lane: 'semantic', laneCandidatePoolLimit: 25, finalLimit: 1 }),
]);
expect(result.results.map((candidate) => candidate.id)).toEqual(['customers']);
expect(result.results[0]).toMatchObject({
matchReasons: ['lexical', 'semantic'],
ranksByLane: { lexical: 2, semantic: 1 },
rawScoresByLane: { lexical: 0.7, semantic: 0.91 },
});
// No weights are passed here, so 1.5 and 2 are the expected fallback weights per lane.
expect(result.lanes).toEqual([
expect.objectContaining({ lane: 'lexical', status: 'available', returnedCandidateCount: 2, weight: 1.5 }),
expect.objectContaining({ lane: 'semantic', status: 'available', returnedCandidateCount: 1, weight: 2 }),
]);
});
// A skipped lane and a throwing lane must not discard results from the healthy lane;
// the thrown error's message is expected as the failed lane's reason.
it('keeps available lane results when another lane is skipped or fails', async () => {
const core = new HybridSearchCore();
const result = await core.search({
queryText: 'paid',
limit: 5,
generators: [
generator('lexical', [{ id: 'orders', rank: 1 }]),
{
lane: 'semantic',
async generate() {
return { status: 'skipped', candidates: [], reason: 'embedding_unconfigured' };
},
},
{
lane: 'dictionary',
async generate() {
throw new Error('dictionary index unavailable');
},
},
],
});
expect(result.results.map((candidate) => candidate.id)).toEqual(['orders']);
expect(result.lanes).toEqual([
expect.objectContaining({ lane: 'lexical', status: 'available', reason: undefined }),
expect.objectContaining({ lane: 'semantic', status: 'skipped', reason: 'embedding_unconfigured' }),
expect.objectContaining({ lane: 'dictionary', status: 'failed', reason: 'dictionary index unavailable' }),
]);
});
// The same id appears twice in one lane; only the better-ranked entry (rank 1) may
// survive, together with its own raw score and evidence.
it('deduplicates one lane by best rank before fusion', async () => {
const core = new HybridSearchCore();
const result = await core.search({
queryText: 'paid status',
limit: 10,
generators: [
generator('dictionary', [
{ id: 'orders', rank: 4, rawScore: 0.4, evidence: { column: 'state', values: ['paid'] } },
{ id: 'orders', rank: 1, rawScore: 0.9, evidence: { column: 'status', values: ['paid'] } },
]),
],
});
expect(result.results).toHaveLength(1);
expect(result.results[0]).toMatchObject({
id: 'orders',
ranksByLane: { dictionary: 1 },
rawScoresByLane: { dictionary: 0.9 },
evidenceByLane: { dictionary: [{ column: 'status', values: ['paid'] }] },
});
});
// Two candidates with identical rank in the same lane: output order must fall back to
// ascending id regardless of input order.
it('uses deterministic id ordering when scores and lane counts tie', async () => {
const core = new HybridSearchCore();
const result = await core.search({
queryText: 'revenue',
limit: 10,
generators: [generator('lexical', [{ id: 'zebra', rank: 1 }, { id: 'alpha', rank: 1 }])],
});
expect(result.results.map((candidate) => candidate.id)).toEqual(['alpha', 'zebra']);
});
});

View file

@ -0,0 +1,141 @@
import { defaultLaneCandidatePoolLimit, normalizeSearchQuery } from './query.js';
import { compareFusedSearchCandidates, DEFAULT_RRF_K, DEFAULT_SEARCH_LANE_WEIGHTS, rrfContribution } from './rrf.js';
import type {
FusedSearchCandidate,
HybridSearchOptions,
HybridSearchResult,
SearchCandidate,
SearchCandidateGenerator,
SearchLaneBreakdown,
SearchLaneName,
SearchLaneResult,
} from './types.js';
/**
 * A generator paired with the lane result it produced, or with the failure result
 * built from the error it threw.
 */
interface ExecutedLane {
generator: SearchCandidateGenerator;
result: SearchLaneResult;
}
/**
 * Resolves the fusion weight for a lane. Precedence: the generator's own weight,
 * then `options.laneWeights`, then the built-in default for the lane, then 1.
 */
function laneWeight(options: HybridSearchOptions, lane: SearchLaneName, generatorWeight?: number): number {
  if (generatorWeight != null) {
    return generatorWeight;
  }
  const configuredWeight = options.laneWeights?.[lane];
  if (configuredWeight != null) {
    return configuredWeight;
  }
  return DEFAULT_SEARCH_LANE_WEIGHTS[lane] ?? 1;
}
/**
 * Returns a copy of the candidate whose rank is a whole number of at least 1.
 *
 * The rank is floored first and validated afterwards. Validating before flooring let a
 * fractional rank in (0, 1) pass the "positive" check and then collapse to 0, which is
 * not a valid rank here (callers pass a 1-based position as the fallback).
 *
 * @param candidate Candidate as returned by a lane generator.
 * @param fallbackRank Rank used when the candidate's own rank is NaN, infinite, or
 *   floors to less than 1.
 */
function normalizeCandidate(candidate: SearchCandidate, fallbackRank: number): SearchCandidate {
  const flooredRank = Math.floor(candidate.rank);
  const rank = Number.isFinite(flooredRank) && flooredRank >= 1 ? flooredRank : fallbackRank;
  return { ...candidate, rank };
}
/**
 * Collapses a lane's candidates to one entry per id, keeping the best (lowest) rank,
 * and returns them ordered by rank and then by id.
 *
 * Each candidate's rank is normalized first, with its 1-based position in the input as
 * the fallback. When the same id appears more than once with the same rank, the first
 * occurrence is kept. The previous extra tie-break compared the two entries' ids, which
 * are always equal for the same map key, so it could never apply and has been removed.
 */
function bestCandidatesForLane(candidates: SearchCandidate[]): SearchCandidate[] {
  const byId = new Map<string, SearchCandidate>();
  candidates.forEach((candidate, index) => {
    const normalized = normalizeCandidate(candidate, index + 1);
    const existing = byId.get(normalized.id);
    // Strictly-better rank only: ties keep the entry seen first.
    if (!existing || normalized.rank < existing.rank) {
      byId.set(normalized.id, normalized);
    }
  });
  return [...byId.values()].sort((left, right) => left.rank - right.rank || left.id.localeCompare(right.id));
}
/**
 * Converts a thrown value into a `failed` lane result with no candidates.
 * The reason is the Error's message, or the stringified value for non-Error throws.
 */
function failedLaneResult(error: unknown): SearchLaneResult {
  let reason: string;
  if (error instanceof Error) {
    reason = error.message;
  } else {
    reason = String(error);
  }
  return { status: 'failed', candidates: [], reason };
}
/**
 * Runs every lane generator for a query and fuses their ranked candidates into a
 * single list using weighted reciprocal-rank contributions.
 */
export class HybridSearchCore {
  /**
   * Executes all generators concurrently, then fuses the candidates of the lanes that
   * are `available`.
   *
   * - A generator that throws is recorded as a `failed` lane and does not abort the search.
   * - Lanes that are not `available` appear in the lane breakdown but contribute no candidates.
   * - The final limit (at least 1) is applied only after fusion and sorting.
   *
   * @param options Query text, generators, limit, and optional pool size, lane weights, and RRF constant.
   * @returns The fused results together with one breakdown entry per generator, in generator order.
   */
  async search(options: HybridSearchOptions): Promise<HybridSearchResult> {
    const finalLimit = Math.max(1, options.limit);
    const requestedCandidatePoolLimit = options.candidatePoolLimit ?? defaultLaneCandidatePoolLimit(finalLimit);
    const normalizedQuery = normalizeSearchQuery(options.queryText);

    // Each generator gets its own args object; a throw becomes a failed lane result.
    const runLane = async (generator: SearchCandidateGenerator): Promise<ExecutedLane> => {
      try {
        const result = await generator.generate({
          queryText: options.queryText,
          normalizedQuery,
          finalLimit,
          laneCandidatePoolLimit: requestedCandidatePoolLimit,
        });
        return { generator, result };
      } catch (error) {
        return { generator, result: failedLaneResult(error) };
      }
    };
    const executed = await Promise.all(options.generators.map((generator) => runLane(generator)));

    const fusedById = new Map<string, FusedSearchCandidate>();
    const lanes: SearchLaneBreakdown[] = [];
    const rrfK = options.rrfK ?? DEFAULT_RRF_K;

    for (const executedLane of executed) {
      const laneName = executedLane.generator.lane;
      const laneResult = executedLane.result;
      const weight = laneWeight(options, laneName, executedLane.generator.weight);
      const status = laneResult.status ?? 'available';
      const isAvailable = status === 'available';
      const effectiveCandidatePoolLimit = laneResult.effectiveCandidatePoolLimit ?? requestedCandidatePoolLimit;
      const laneCandidates = isAvailable ? bestCandidatesForLane(laneResult.candidates) : [];

      lanes.push({
        lane: laneName,
        status,
        requestedCandidatePoolLimit,
        effectiveCandidatePoolLimit,
        returnedCandidateCount: laneCandidates.length,
        weight,
        reason: laneResult.reason,
      });

      if (!isAvailable) {
        continue;
      }

      for (const candidate of laneCandidates) {
        let fused = fusedById.get(candidate.id);
        if (fused === undefined) {
          // First lane to surface this id: start an empty accumulator for it.
          fused = {
            id: candidate.id,
            score: 0,
            matchReasons: [],
            ranksByLane: {},
            rawScoresByLane: {},
            evidenceByLane: {},
          };
          fusedById.set(candidate.id, fused);
        }

        fused.score += rrfContribution(weight, candidate.rank, rrfK);
        fused.ranksByLane[laneName] = candidate.rank;
        if (candidate.rawScore !== undefined) {
          fused.rawScoresByLane[laneName] = candidate.rawScore;
        }

        // A candidate may override the reason; otherwise the lane name is the reason.
        const reason = candidate.matchReason ?? laneName;
        if (!fused.matchReasons.includes(reason)) {
          fused.matchReasons.push(reason);
        }

        if (candidate.evidence !== undefined) {
          const previousEvidence = fused.evidenceByLane[laneName] ?? [];
          fused.evidenceByLane[laneName] = [...previousEvidence, candidate.evidence];
        }
      }
    }

    const ranked = Array.from(fusedById.values()).sort(compareFusedSearchCandidates);
    return {
      query: normalizedQuery,
      requestedLimit: finalLimit,
      requestedCandidatePoolLimit,
      results: ranked.slice(0, finalLimit),
      lanes,
    };
  }
}

View file

@ -0,0 +1,35 @@
export type {
AssertSearchBackendCapabilitiesInput,
AssertSearchBackendConformanceCaseInput,
ExpectedSearchBackendConformanceLane,
SearchBackendConformanceDictionaryMatch,
SearchBackendConformanceLane,
SearchBackendConformanceResult,
} from './backend-conformance.js';
export {
assertSearchBackendCapabilities,
assertSearchBackendConformanceCase,
} from './backend-conformance.js';
export { HybridSearchCore } from './hybrid-search-core.js';
export { defaultLaneCandidatePoolLimit, normalizeSearchQuery } from './query.js';
export {
compareFusedSearchCandidates,
DEFAULT_RRF_K,
DEFAULT_SEARCH_LANE_WEIGHTS,
rrfContribution,
} from './rrf.js';
export type {
FusedSearchCandidate,
HybridSearchOptions,
HybridSearchResult,
NormalizedSearchQuery,
SearchBackendCapabilities,
SearchCandidate,
SearchCandidateGenerator,
SearchCandidateGeneratorArgs,
SearchLaneBreakdown,
SearchLaneName,
SearchLaneResult,
SearchLaneStatus,
SearchResultHydrator,
} from './types.js';

View file

@ -0,0 +1,331 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { createServer } from 'node:net';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { Client } from 'pg';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { assertSearchBackendCapabilities, assertSearchBackendConformanceCase } from './index.js';
import { KloPGliteOwnerProcess, PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES } from './pglite-owner-process.js';
/**
 * Asks the OS for a free TCP port on 127.0.0.1 by binding to port 0, reading the
 * assigned port, and closing the probe server again.
 *
 * The port is released before it is returned, so another process could in principle
 * claim it before the caller binds to it.
 *
 * @returns The port number that was assigned to the probe server.
 * @throws Error when the server cannot listen, reports no TCP address, or fails to close.
 */
async function allocatePort(): Promise<number> {
  const server = createServer();
  await new Promise<void>((resolve, reject) => {
    // Without an 'error' listener a failed bind would leave this promise pending forever.
    server.once('error', reject);
    server.listen(0, '127.0.0.1', () => {
      server.off('error', reject);
      resolve();
    });
  });
  try {
    const address = server.address();
    if (typeof address !== 'object' || address === null) {
      throw new Error('Expected TCP server address while allocating a PGlite owner-process port.');
    }
    return address.port;
  } finally {
    // Always release the probe socket, including when the address check throws.
    await new Promise<void>((resolve, reject) => {
      server.close((error) => {
        if (error) {
          reject(error);
          return;
        }
        resolve();
      });
    });
  }
}
/**
 * Creates the prototype schema in a single multi-statement query: a documents table
 * with a full-text GIN index and an ivfflat cosine vector index, and a dictionary-values
 * table with a trigram GIN index.
 */
async function createHybridSearchFixture(owner: KloPGliteOwnerProcess): Promise<void> {
  const schemaSql = `
CREATE TABLE prototype_documents (
id TEXT PRIMARY KEY,
search_text TEXT NOT NULL,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
embedding vector(3) NOT NULL
);
CREATE INDEX prototype_documents_fts_idx
ON prototype_documents
USING GIN (to_tsvector('english', search_text));
CREATE INDEX prototype_documents_vector_idx
ON prototype_documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 1);
CREATE TABLE prototype_dictionary_values (
connection_id TEXT NOT NULL,
source_name TEXT NOT NULL,
column_name TEXT NOT NULL,
value TEXT NOT NULL,
PRIMARY KEY (connection_id, source_name, column_name, value)
);
CREATE INDEX prototype_dictionary_values_trgm_idx
ON prototype_dictionary_values
USING GIN (value gin_trgm_ops);
`;
  await owner.query(schemaSql);
}
/**
 * Inserts three prototype documents (parameterized) and three dictionary values
 * (literal SQL) into the tables created by createHybridSearchFixture.
 */
async function seedHybridSearchFixture(owner: KloPGliteOwnerProcess): Promise<void> {
  const documents = [
    {
      id: 'warehouse/orders',
      searchText: 'orders paid revenue refund status customer',
      metadata: { connectionId: 'warehouse', sourceName: 'orders' },
      embedding: [1, 0, 0],
    },
    {
      id: 'finance/orders',
      searchText: 'orders finance bookings gross margin',
      metadata: { connectionId: 'finance', sourceName: 'orders' },
      embedding: [0.72, 0.28, 0],
    },
    {
      id: 'warehouse/customers',
      searchText: 'customers accounts lifecycle region',
      metadata: { connectionId: 'warehouse', sourceName: 'customers' },
      embedding: [0, 1, 0],
    },
  ];
  // Four positional parameters per row, in placeholder order: id, text, metadata JSON, embedding JSON.
  const documentParams = documents.flatMap((document) => [
    document.id,
    document.searchText,
    JSON.stringify(document.metadata),
    JSON.stringify(document.embedding),
  ]);
  const insertDocumentsSql = `
INSERT INTO prototype_documents (id, search_text, metadata, embedding)
VALUES
($1, $2, $3::jsonb, $4::vector),
($5, $6, $7::jsonb, $8::vector),
($9, $10, $11::jsonb, $12::vector)
`;
  await owner.query(insertDocumentsSql, documentParams);

  const insertDictionarySql = `
INSERT INTO prototype_dictionary_values (connection_id, source_name, column_name, value)
VALUES
('warehouse', 'orders', 'status', 'refunded'),
('warehouse', 'orders', 'status', 'paid'),
('warehouse', 'customers', 'region', 'emea')
`;
  await owner.query(insertDictionarySql);
}
describe('KloPGliteOwnerProcess', () => {
let tempDir: string;
let dataDir: string;
let port: number;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-owner-process-'));
dataDir = join(tempDir, 'pgdata');
port = await allocatePort();
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('declares the advanced PGlite search capabilities observed by the spike', () => {
assertSearchBackendCapabilities({
backendName: 'pglite-owner-process',
capabilities: PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES,
expected: {
fts: true,
vector: true,
fuzzy: true,
jsonSearch: true,
arraySearch: false,
},
});
});
it('starts a socket owner process and serves PostgreSQL clients', async () => {
const owner = await KloPGliteOwnerProcess.start({
dataDir,
host: '127.0.0.1',
port,
});
try {
await owner.query(`
CREATE TABLE owner_process_smoke (
id TEXT PRIMARY KEY,
search_text TEXT NOT NULL,
embedding vector(3) NOT NULL
);
INSERT INTO owner_process_smoke (id, search_text, embedding)
VALUES
('orders', 'orders paid revenue', '[1,0,0]'::vector),
('customers', 'customers region lifecycle', '[0,1,0]'::vector);
`);
const client = new Client(owner.connectionConfig());
await client.connect();
try {
const result = await client.query<{ id: string }>(`
SELECT id
FROM owner_process_smoke
ORDER BY embedding <=> '[1,0,0]'::vector, id ASC
LIMIT 1
`);
expect(result.rows).toEqual([{ id: 'orders' }]);
} finally {
await client.end();
}
} finally {
await owner.stop();
}
});
// Runs three independent search probes (full-text, vector, trigram) through the owner's socket-backed query helper
// and feeds each result set to the shared backend conformance assertion.
it('runs lexical, semantic, and dictionary conformance probes through socket clients', async () => {
const owner = await KloPGliteOwnerProcess.start({
dataDir,
host: '127.0.0.1',
port,
});
try {
// NOTE(review): both fixture helpers are defined outside this excerpt; the queries below rely on the
// prototype_documents and prototype_dictionary_values tables they presumably create and seed.
await createHybridSearchFixture(owner);
await seedHybridSearchFixture(owner);
// Lexical probe: PostgreSQL full-text match on search_text, ranked by ts_rank_cd with id as the tie-breaker.
const lexical = await owner.query<{ id: string; score: number }>(
`
SELECT
id,
ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) AS score
FROM prototype_documents
WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
ORDER BY score DESC, id ASC
LIMIT 2
`,
['paid orders'],
);
assertSearchBackendConformanceCase({
backendName: 'pglite-owner-process',
surface: 'semantic-layer',
caseName: 'socket postgres fts lexical ranking',
results: lexical.rows.map((row) => ({
id: row.id,
score: row.score,
matchReasons: ['lexical'],
})),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['lexical'],
},
});
// Semantic probe: order by pgvector `<=>` distance to the query vector; the reported score is 1 - distance.
// The vector parameter is sent as a JSON array string and cast to `vector` in SQL.
const semantic = await owner.query<{ id: string; similarity: number }>(
`
SELECT
id,
1 - (embedding <=> $1::vector) AS similarity
FROM prototype_documents
ORDER BY embedding <=> $1::vector, id ASC
LIMIT 2
`,
[JSON.stringify([1, 0, 0])],
);
assertSearchBackendConformanceCase({
backendName: 'pglite-owner-process',
surface: 'semantic-layer',
caseName: 'socket pgvector semantic ranking',
results: semantic.rows.map((row) => ({
id: row.id,
score: row.similarity,
matchReasons: ['semantic'],
})),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['semantic'],
},
});
// Dictionary probe: pg_trgm similarity() between stored dictionary values and the query term,
// keeping only rows with a non-zero similarity.
const dictionary = await owner.query<{ id: string; value: string; score: number }>(
`
SELECT
connection_id || '/' || source_name AS id,
value,
similarity(value, $1) AS score
FROM prototype_dictionary_values
WHERE similarity(value, $1) > 0
ORDER BY score DESC, id ASC, value ASC
LIMIT 2
`,
['refund'],
);
assertSearchBackendConformanceCase({
backendName: 'pglite-owner-process',
surface: 'semantic-layer',
caseName: 'socket pg_trgm dictionary ranking',
results: dictionary.rows.map((row) => ({
id: row.id,
score: row.score,
matchReasons: ['dictionary'],
// The column name is hard-coded to 'status' for every row; the query does not select column_name.
dictionaryMatches: [{ column: 'status', values: [row.value] }],
})),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['dictionary'],
},
expectedDictionaryMatchesById: {
'warehouse/orders': [{ column: 'status', values: ['refunded'] }],
},
});
} finally {
// Always stop the owner, even when a probe or assertion above fails.
await owner.stop();
}
});
// Durability check: seed data with one owner, stop it, then start a second owner on the same dataDir
// and confirm the rows are still there.
it('persists indexed rows after stopping and restarting the owner process', async () => {
const firstOwner = await KloPGliteOwnerProcess.start({
dataDir,
host: '127.0.0.1',
port,
});
try {
await createHybridSearchFixture(firstOwner);
await seedHybridSearchFixture(firstOwner);
} finally {
// The first owner is fully stopped before the second one is started on the same dataDir and port.
await firstOwner.stop();
}
const secondOwner = await KloPGliteOwnerProcess.start({
dataDir,
host: '127.0.0.1',
port,
});
try {
// Count documents whose JSON metadata carries connectionId = 'warehouse'; the test expects two of them.
const persisted = await secondOwner.query<{ count: number }>(
"SELECT COUNT(*)::int AS count FROM prototype_documents WHERE metadata->>'connectionId' = $1",
['warehouse'],
);
expect(persisted.rows).toEqual([{ count: 2 }]);
} finally {
await secondOwner.stop();
}
});
// Concurrency check: four pg clients stay connected to one owner at the same time and each runs the same count query.
it('serves concurrent PostgreSQL clients through one owner process', async () => {
const owner = await KloPGliteOwnerProcess.start({
dataDir,
host: '127.0.0.1',
port,
});
// Declared outside the try so the finally block can end every client that connected.
const clients: Client[] = [];
try {
await createHybridSearchFixture(owner);
await seedHybridSearchFixture(owner);
// Connections are opened one after another; a client is only tracked once its connect() has resolved.
for (let index = 0; index < 4; index += 1) {
const client = new Client(owner.connectionConfig());
await client.connect();
clients.push(client);
}
// The queries themselves are issued in parallel across the open connections.
const results = await Promise.all(
clients.map((client) =>
client.query<{ count: number }>('SELECT COUNT(*)::int AS count FROM prototype_documents'),
),
);
expect(results.map((result) => result.rows[0]?.count)).toEqual([3, 3, 3, 3]);
} finally {
// Errors from ending a client are swallowed so that one failing client cannot prevent the owner from stopping.
await Promise.all(clients.map((client) => client.end().catch(() => undefined)));
await owner.stop();
}
});
});

View file

@ -0,0 +1,114 @@
import { PGlite, type PGliteInterface } from '@electric-sql/pglite';
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
import { vector } from '@electric-sql/pglite/vector';
import { PGLiteSocketServer } from '@electric-sql/pglite-socket';
import { Client, type ClientConfig, type QueryResult, type QueryResultRow } from 'pg';
import type { SearchBackendCapabilities } from './types.js';
/**
 * Search capabilities declared for the PGlite owner-process backend.
 * Every capability flag is enabled except `arraySearch`.
 * `satisfies` checks the shape against `SearchBackendCapabilities` while keeping the literal boolean types.
 */
export const PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES = {
fts: true,
vector: true,
fuzzy: true,
jsonSearch: true,
arraySearch: false,
} satisfies SearchBackendCapabilities;
/** Options accepted by `KloPGliteOwnerProcess.start`. */
export interface KloPGliteOwnerProcessOptions {
/** Data directory handed to `PGlite.create`. */
dataDir: string;
/** Host the socket server is configured with; also reported by `connectionConfig()`. */
host: string;
/** Port the socket server is configured with; also reported by `connectionConfig()`. */
port: number;
/** Forwarded to the socket server's `inspect` option; `start` uses `false` when omitted. */
inspect?: boolean;
/** Forwarded to the socket server's `maxConnections` option; `start` uses `100` when omitted. */
maxConnections?: number;
}
/**
 * Owns one PGlite database and exposes it to PostgreSQL clients through a
 * `PGLiteSocketServer`.
 *
 * Instances are created with {@link KloPGliteOwnerProcess.start}; the
 * constructor is private so an instance always wraps a started server.
 */
export class KloPGliteOwnerProcess {
  /** Data directory the database was opened with. */
  readonly dataDir: string;
  /** Host the socket server was configured with. */
  readonly host: string;
  /** Port the socket server was configured with. */
  readonly port: number;
  #db: PGliteInterface;
  #server: PGLiteSocketServer;
  #stopped = false;

  private constructor(options: KloPGliteOwnerProcessOptions, db: PGliteInterface, server: PGLiteSocketServer) {
    this.dataDir = options.dataDir;
    this.host = options.host;
    this.port = options.port;
    this.#db = db;
    this.#server = server;
  }

  /**
   * Opens the database with the `vector` and `pg_trgm` extensions, makes sure
   * both extensions exist, and starts the socket server.
   *
   * @param options Data directory, listen address, and optional server tuning.
   * @returns A started owner process.
   * @throws Rethrows any failure from extension setup or server start, after
   *   a best-effort stop of the server and close of the database.
   */
  static async start(options: KloPGliteOwnerProcessOptions): Promise<KloPGliteOwnerProcess> {
    const db = await PGlite.create({
      dataDir: options.dataDir,
      extensions: {
        vector,
        pg_trgm,
      },
    });
    let server: PGLiteSocketServer | undefined;
    try {
      await db.exec(`
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
`);
      server = new PGLiteSocketServer({
        db,
        host: options.host,
        port: options.port,
        inspect: options.inspect ?? false,
        maxConnections: options.maxConnections ?? 100,
      });
      await server.start();
      return new KloPGliteOwnerProcess(options, db, server);
    } catch (error) {
      // Cleanup is best-effort so the original failure is the one callers see.
      await server?.stop().catch(() => undefined);
      await db.close().catch(() => undefined);
      throw error;
    }
  }

  /**
   * Builds the `pg` client configuration for this owner's host and port.
   *
   * @returns A fresh config object with fixed user and database names and a
   *   5 second connection timeout.
   */
  connectionConfig(): ClientConfig {
    return {
      host: this.host,
      port: this.port,
      user: 'postgres',
      database: 'postgres',
      application_name: 'klo-pglite-owner-prototype',
      connectionTimeoutMillis: 5_000,
    };
  }

  /**
   * Creates and connects a new `pg` client.
   *
   * @returns The connected client; the caller is responsible for ending it.
   */
  async connect(): Promise<Client> {
    const client = new Client(this.connectionConfig());
    await client.connect();
    return client;
  }

  /**
   * Runs one query on a dedicated client that is ended afterwards, whether
   * the query succeeds or fails.
   *
   * @param text SQL text.
   * @param values Optional positional parameters; copied before being handed
   *   to the client so the caller's readonly array is never passed on directly.
   * @returns The query result typed with row shape `T`.
   */
  async query<T extends QueryResultRow = QueryResultRow>(
    text: string,
    values?: readonly unknown[],
  ): Promise<QueryResult<T>> {
    const client = await this.connect();
    try {
      return await client.query<T>(text, values ? [...values] : undefined);
    } finally {
      await client.end();
    }
  }

  /**
   * Stops the socket server and closes the database.
   *
   * Only the first call does any work; later calls return immediately. The
   * database is closed even when stopping the server rejects, so a failing
   * server shutdown cannot leave the database open.
   *
   * @throws The server stop error, or the database close error if closing
   *   fails as well.
   */
  async stop(): Promise<void> {
    if (this.#stopped) {
      return;
    }
    this.#stopped = true;
    try {
      await this.#server.stop();
    } finally {
      // Runs on both paths: previously a rejected server stop skipped this
      // close, and the #stopped flag made a retry a no-op.
      await this.#db.close();
    }
  }
}

View file

@ -0,0 +1,66 @@
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { describe, expect, it } from 'vitest';
/** Base directory for `readKloFile`, resolved four levels above this module's directory. */
const kloRoot = fileURLToPath(new URL('../../../../', import.meta.url));

/**
 * Reads a text file addressed relative to `kloRoot`.
 *
 * @param relativePath Path below `kloRoot`, e.g. `packages/context/package.json`.
 * @returns The file contents decoded as UTF-8.
 */
function readKloFile(relativePath: string): string {
  const absolutePath = join(kloRoot, relativePath);
  return readFileSync(absolutePath, { encoding: 'utf8' });
}
/**
 * Loads and parses `packages/context/package.json`.
 *
 * @returns The parsed manifest, typed with only the fields these tests inspect.
 *   The parsed value is not validated against that shape.
 */
function readContextPackageJson(): {
  dependencies?: Record<string, string>;
  devDependencies?: Record<string, string>;
  exports?: Record<string, unknown>;
  files?: string[];
} {
  const manifestText = readKloFile('packages/context/package.json');
  return JSON.parse(manifestText);
}
// Guards the boundary between the PGlite prototypes and the shipped package:
// the packages stay dev-only and nothing on the public or production path refers to them.
describe('PGlite hybrid search runtime boundary', () => {
  it('keeps PGlite packages as dev-only prototype dependencies', () => {
    const manifest = readContextPackageJson();
    const pglitePackages = ['@electric-sql/pglite', '@electric-sql/pglite-socket'];

    // Runtime dependencies are checked first, then dev dependencies, in package order.
    for (const packageName of pglitePackages) {
      expect(manifest.dependencies?.[packageName]).toBeUndefined();
    }
    for (const packageName of pglitePackages) {
      expect(manifest.devDependencies?.[packageName]).toBeDefined();
    }
    expect(manifest.files).toEqual(['dist', 'prompts', 'skills']);
  });

  it('keeps PGlite prototypes out of public exports and production routing', () => {
    const manifest = readContextPackageJson();

    // No export key of the package may mention pglite, in any letter case.
    const pgliteExportKeys = Object.keys(manifest.exports ?? {}).filter((exportKey) =>
      exportKey.toLowerCase().includes('pglite'),
    );
    expect(pgliteExportKeys).toEqual([]);

    // Public barrel files must not mention pglite at all.
    const publicExportFiles = [
      'packages/context/src/index.ts',
      'packages/context/src/search/index.ts',
      'packages/context/src/sl/index.ts',
    ];
    publicExportFiles.forEach((relativePath) => {
      expect(readKloFile(relativePath), relativePath).not.toMatch(/pglite/i);
    });

    // Production routing files must not reference the prototype backend, module, or package.
    const productionRoutingFiles = [
      'packages/cli/src/agent.ts',
      'packages/context/src/mcp/local-project-ports.ts',
      'packages/context/src/wiki/local-knowledge.ts',
      'packages/context/src/ingest/context-evidence/sqlite-context-evidence-store.ts',
    ];
    productionRoutingFiles.forEach((relativePath) => {
      expect(readKloFile(relativePath), relativePath).not.toMatch(
        /pglite-owner-prototype|pglite-sl-search-prototype|@electric-sql\/pglite/i,
      );
    });

    // local-sl.ts is expected to contain the backend check, its error message, and a dynamic import of the prototype.
    const localSlSource = readKloFile('packages/context/src/sl/local-sl.ts');
    const requiredSnippets = [
      "input.backend === 'pglite-owner-prototype'",
      'PGlite semantic-layer search prototype requires pglite owner-process options.',
      "await import('./pglite-sl-search-prototype.js')",
    ];
    for (const snippet of requiredSnippets) {
      expect(localSlSource).toContain(snippet);
    }
  });
});

View file

@ -0,0 +1,302 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { PGlite, type PGliteInterface } from '@electric-sql/pglite';
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
import { vector } from '@electric-sql/pglite/vector';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
assertSearchBackendCapabilities,
assertSearchBackendConformanceCase,
type SearchBackendCapabilities,
} from './index.js';
/** Local alias for the PGlite database handle used by the helpers in this file. */
type PGliteDb = PGliteInterface;
/**
 * Search capabilities declared for the in-process PGlite spike.
 * Every capability flag is enabled except `arraySearch`.
 */
const PGLITE_SPIKE_CAPABILITIES = {
fts: true,
vector: true,
fuzzy: true,
jsonSearch: true,
arraySearch: false,
} satisfies SearchBackendCapabilities;
/**
 * Opens a PGlite database at `dataDir` with the `vector` and `pg_trgm`
 * extensions loaded and created.
 *
 * @param dataDir Data directory handed to `PGlite.create`.
 * @returns The open database; the caller is responsible for closing it.
 * @throws Rethrows a failure from extension setup after a best-effort close,
 *   so a half-initialised database handle is not left open.
 */
async function createSpikeDb(dataDir: string): Promise<PGliteDb> {
  const db = await PGlite.create({
    dataDir,
    extensions: {
      vector,
      pg_trgm,
    },
  });
  try {
    await db.exec(`
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
`);
  } catch (error) {
    // The caller never receives the handle on this path, so it must be closed here.
    // Close errors are swallowed to keep the original failure visible.
    await db.close().catch(() => undefined);
    throw error;
  }
  return db;
}
/**
 * Creates the spike schema in one multi-statement `exec` call:
 * - `spike_documents` with text, JSONB metadata, and a 3-dimensional embedding,
 *   plus a GIN index over the English tsvector of `search_text` and an
 *   ivfflat cosine index (`lists = 1`) over `embedding`;
 * - `spike_dictionary_values` keyed by (connection, source, column, value),
 *   plus a GIN trigram index over `value`.
 *
 * Every statement uses `IF NOT EXISTS`, so running this against a database
 * that already has these objects does not fail on them.
 *
 * @param db Open PGlite database; the `vector` and `pg_trgm` extensions must already exist.
 */
async function createSchema(db: PGliteDb): Promise<void> {
await db.exec(`
CREATE TABLE IF NOT EXISTS spike_documents (
id TEXT PRIMARY KEY,
search_text TEXT NOT NULL,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
embedding vector(3) NOT NULL
);
CREATE INDEX IF NOT EXISTS spike_documents_fts_idx
ON spike_documents
USING GIN (to_tsvector('english', search_text));
CREATE INDEX IF NOT EXISTS spike_documents_vector_idx
ON spike_documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 1);
CREATE TABLE IF NOT EXISTS spike_dictionary_values (
connection_id TEXT NOT NULL,
source_name TEXT NOT NULL,
column_name TEXT NOT NULL,
value TEXT NOT NULL,
PRIMARY KEY (connection_id, source_name, column_name, value)
);
CREATE INDEX IF NOT EXISTS spike_dictionary_values_trgm_idx
ON spike_dictionary_values
USING GIN (value gin_trgm_ops);
`);
}
/**
 * Seeds the spike tables with a fixed fixture: three documents and three dictionary values.
 *
 * Documents are upserted by `id`, and dictionary rows use `ON CONFLICT DO NOTHING`,
 * so seeding the same database twice does not raise conflicts.
 *
 * @param db Open PGlite database on which `createSchema` has already run.
 */
async function seedSearchFixture(db: PGliteDb): Promise<void> {
// Each document contributes four positional parameters: id, search text, metadata, embedding.
// Metadata and embeddings are sent as JSON strings and cast to jsonb / vector in SQL.
await db.query(
`
INSERT INTO spike_documents (id, search_text, metadata, embedding)
VALUES
($1, $2, $3::jsonb, $4::vector),
($5, $6, $7::jsonb, $8::vector),
($9, $10, $11::jsonb, $12::vector)
ON CONFLICT (id) DO UPDATE
SET search_text = EXCLUDED.search_text,
metadata = EXCLUDED.metadata,
embedding = EXCLUDED.embedding
`,
[
'warehouse/orders',
'orders paid revenue refund status customer',
JSON.stringify({ connectionId: 'warehouse', sourceName: 'orders' }),
JSON.stringify([1, 0, 0]),
'finance/orders',
'orders finance bookings gross margin',
JSON.stringify({ connectionId: 'finance', sourceName: 'orders' }),
JSON.stringify([0.72, 0.28, 0]),
'warehouse/customers',
'customers accounts lifecycle region',
JSON.stringify({ connectionId: 'warehouse', sourceName: 'customers' }),
JSON.stringify([0, 1, 0]),
],
);
// Dictionary values are literals in the SQL text; no parameters are bound.
await db.query(
`
INSERT INTO spike_dictionary_values (connection_id, source_name, column_name, value)
VALUES
('warehouse', 'orders', 'status', 'refunded'),
('warehouse', 'orders', 'status', 'paid'),
('warehouse', 'customers', 'region', 'emea')
ON CONFLICT DO NOTHING
`,
);
}
/**
 * Closes a PGlite database handle.
 *
 * @param db Database to close.
 */
async function closeDb(db: PGliteDb): Promise<void> {
  return db.close();
}
// Exercises PGlite directly (in-process, no socket server): search probes, persistence across reopen,
// and observed behavior under concurrent use.
describe('PGlite hybrid search spike', () => {
let tempDir: string;
let dataDir: string;
// Every test gets a fresh temporary directory; the database lives in its `pgdata` subdirectory.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-search-spike-'));
dataDir = join(tempDir, 'pgdata');
});
// `force: true` keeps cleanup from failing when the directory is already gone.
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('documents PGlite search backend capabilities', () => {
assertSearchBackendCapabilities({
backendName: 'pglite-spike',
capabilities: PGLITE_SPIKE_CAPABILITIES,
expected: {
fts: true,
vector: true,
fuzzy: true,
jsonSearch: true,
arraySearch: false,
},
});
});
it('supports FTS, pgvector ordering, and pg_trgm dictionary lookup', async () => {
const db = await createSpikeDb(dataDir);
try {
await createSchema(db);
await seedSearchFixture(db);
// Lexical probe: full-text match on search_text, ranked by ts_rank_cd with id as the tie-breaker.
const lexical = await db.query<{ id: string; score: number }>(
`
SELECT
id,
ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) AS score
FROM spike_documents
WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
ORDER BY score DESC, id ASC
LIMIT 2
`,
['paid orders'],
);
assertSearchBackendConformanceCase({
backendName: 'pglite-spike',
surface: 'semantic-layer',
caseName: 'postgres fts lexical ranking',
results: lexical.rows.map((row) => ({
id: row.id,
score: row.score,
matchReasons: ['lexical'],
})),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['lexical'],
},
});
// Semantic probe: order by pgvector `<=>` distance to [1,0,0], which is exactly the seeded
// embedding of 'warehouse/orders'; the reported score is 1 - distance.
const semantic = await db.query<{ id: string; similarity: number }>(
`
SELECT
id,
1 - (embedding <=> $1::vector) AS similarity
FROM spike_documents
ORDER BY embedding <=> $1::vector, id ASC
LIMIT 2
`,
[JSON.stringify([1, 0, 0])],
);
assertSearchBackendConformanceCase({
backendName: 'pglite-spike',
surface: 'semantic-layer',
caseName: 'pgvector cosine ranking',
results: semantic.rows.map((row) => ({
id: row.id,
score: row.similarity,
matchReasons: ['semantic'],
})),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['semantic'],
},
});
// Dictionary probe: pg_trgm similarity() between stored values and 'refund', keeping non-zero matches only.
const dictionary = await db.query<{ id: string; value: string; score: number }>(
`
SELECT
connection_id || '/' || source_name AS id,
value,
similarity(value, $1) AS score
FROM spike_dictionary_values
WHERE similarity(value, $1) > 0
ORDER BY score DESC, id ASC, value ASC
LIMIT 2
`,
['refund'],
);
assertSearchBackendConformanceCase({
backendName: 'pglite-spike',
surface: 'semantic-layer',
caseName: 'pg_trgm dictionary ranking',
results: dictionary.rows.map((row) => ({
id: row.id,
score: row.score,
matchReasons: ['dictionary'],
// The column name is hard-coded to 'status' for every row; the query does not select column_name.
dictionaryMatches: [{ column: 'status', values: [row.value] }],
})),
expectedTopIds: ['warehouse/orders'],
expectedReasonsById: {
'warehouse/orders': ['dictionary'],
},
expectedDictionaryMatchesById: {
'warehouse/orders': [{ column: 'status', values: ['refunded'] }],
},
});
} finally {
await closeDb(db);
}
});
it('persists indexed rows after reopening the filesystem database', async () => {
// First handle: create and seed, then close before the directory is reopened.
const first = await createSpikeDb(dataDir);
try {
await createSchema(first);
await seedSearchFixture(first);
} finally {
await closeDb(first);
}
// Second handle on the same directory: the schema is not recreated, so the rows must come from disk.
const second = await createSpikeDb(dataDir);
try {
// Two of the three seeded documents carry connectionId = 'warehouse' in their metadata.
const persisted = await second.query<{ count: number }>(
"SELECT COUNT(*)::int AS count FROM spike_documents WHERE metadata->>'connectionId' = $1",
['warehouse'],
);
expect(persisted.rows[0]).toEqual({ count: 2 });
} finally {
await closeDb(second);
}
});
it('records direct concurrency behavior without assuming Postgres server parity', async () => {
const db = await createSpikeDb(dataDir);
try {
await createSchema(db);
await seedSearchFixture(db);
// Four parallel reads through the same handle must all see the three seeded documents.
const reads = await Promise.all(
Array.from({ length: 4 }, () =>
db.query<{ count: number }>('SELECT COUNT(*)::int AS count FROM spike_documents'),
),
);
expect(reads.map((result) => result.rows[0]?.count)).toEqual([3, 3, 3, 3]);
// Try to open the same data directory a second time while the first handle is still open.
// Either outcome is accepted; the status only records which one happened.
let secondOpenStatus: 'opened' | 'blocked' = 'opened';
let second: PGliteDb | undefined;
try {
second = await createSpikeDb(dataDir);
await second.query('SELECT 1');
} catch {
secondOpenStatus = 'blocked';
} finally {
if (second) {
await closeDb(second);
}
}
// This assertion holds for both possible values by construction; it documents the outcome rather than constraining it.
expect(['opened', 'blocked']).toContain(secondOpenStatus);
} finally {
await closeDb(db);
}
});
});

View file

@ -0,0 +1,26 @@
import { describe, expect, it } from 'vitest';
import { defaultLaneCandidatePoolLimit, normalizeSearchQuery } from './query.js';
// Unit tests for the query normalizer and the per-lane candidate pool sizing helper.
describe('search query helpers', () => {
  it('normalizes punctuation and duplicate terms into stable lowercase tokens', () => {
    const rawQuery = ' Gross-Revenue, gross_revenue! Paid orders ';

    const normalizedQuery = normalizeSearchQuery(rawQuery);

    // Underscores survive inside a term while hyphens and other punctuation split terms.
    expect(normalizedQuery).toEqual({
      raw: rawQuery,
      normalized: 'gross revenue gross_revenue paid orders',
      terms: ['gross', 'revenue', 'gross_revenue', 'paid', 'orders'],
    });
  });

  it('returns an empty normalized query for punctuation-only input', () => {
    const rawQuery = '--- ///';

    const normalizedQuery = normalizeSearchQuery(rawQuery);

    expect(normalizedQuery).toEqual({
      raw: rawQuery,
      normalized: '',
      terms: [],
    });
  });

  it('sizes per-lane candidate pools before final limiting', () => {
    // Pairs of [final limit, expected pool size], checked in order.
    const expectedPoolByFinalLimit: ReadonlyArray<readonly [number, number]> = [
      [1, 25],
      [8, 25],
      [10, 30],
    ];
    for (const [finalLimit, expectedPool] of expectedPoolByFinalLimit) {
      expect(defaultLaneCandidatePoolLimit(finalLimit)).toBe(expectedPool);
    }
  });
});

View file

@ -0,0 +1,19 @@
import type { NormalizedSearchQuery } from './types.js';
/**
 * Lowercases a query and extracts its terms.
 *
 * A term is a maximal run of ASCII lowercase letters, digits, or underscores
 * in the lowercased text; every other character acts as a separator. Terms
 * keep their original order and are not de-duplicated.
 *
 * @param queryText Raw query text as entered.
 * @returns The raw text, the terms joined by single spaces, and the term list.
 */
export function normalizeSearchQuery(queryText: string): NormalizedSearchQuery {
  const lowered = queryText.toLowerCase();
  // Collecting token runs directly yields the same list as splitting on separators and dropping empties.
  const terms: string[] = lowered.match(/[a-z0-9_]+/gu) ?? [];
  const normalized = terms.join(' ');
  return { raw: queryText, normalized, terms };
}
/**
 * Computes how many candidates each lane should produce before fusion.
 *
 * The final limit is raised to at least 1, tripled, and the result is raised
 * to at least 25.
 *
 * @param finalLimit Number of results requested after fusion.
 * @returns The per-lane candidate pool size.
 */
export function defaultLaneCandidatePoolLimit(finalLimit: number): number {
  const effectiveLimit = Math.max(1, finalLimit);
  const tripled = effectiveLimit * 3;
  return Math.max(25, tripled);
}

View file

@ -0,0 +1,52 @@
import { describe, expect, it } from 'vitest';
import { compareFusedSearchCandidates, DEFAULT_SEARCH_LANE_WEIGHTS, rrfContribution } from './rrf.js';
import type { FusedSearchCandidate } from './types.js';
// Unit tests for the reciprocal-rank-fusion helpers: default weights, per-lane contribution, and result ordering.
describe('RRF scoring', () => {
  it('uses the shared lane weights from the hybrid search spec', () => {
    const expectedWeights = {
      semantic: 2,
      dictionary: 2,
      lexical: 1.5,
      token: 0.75,
    };
    expect(DEFAULT_SEARCH_LANE_WEIGHTS).toEqual(expectedWeights);
  });

  it('calculates a weighted RRF contribution with k=60 by default', () => {
    // With the default k of 60, rank 1 divides by 61 and rank 2 by 62.
    const weightTwoAtRankOne = rrfContribution(2, 1);
    expect(weightTwoAtRankOne).toBeCloseTo(2 / 61, 12);

    const weightOnePointFiveAtRankTwo = rrfContribution(1.5, 2);
    expect(weightOnePointFiveAtRankTwo).toBeCloseTo(1.5 / 62, 12);
  });

  it('sorts fused candidates by score, lane count, and stable id', () => {
    // Builds a candidate with empty raw-score and evidence maps, which the comparator does not read.
    const fused = (
      id: string,
      score: number,
      matchReasons: FusedSearchCandidate['matchReasons'],
      ranksByLane: FusedSearchCandidate['ranksByLane'],
    ): FusedSearchCandidate => ({
      id,
      score,
      matchReasons,
      ranksByLane,
      rawScoresByLane: {},
      evidenceByLane: {},
    });

    const candidates = [
      fused('orders', 0.05, ['lexical'], { lexical: 1 }),
      // Same score as 'orders' but matched by two lanes, so it should sort first.
      fused('customers', 0.05, ['lexical', 'semantic'], { lexical: 2, semantic: 1 }),
      fused('accounts', 0.04, ['semantic'], { semantic: 1 }),
    ];

    const sortedIds = candidates.sort(compareFusedSearchCandidates).map((candidate) => candidate.id);

    expect(sortedIds).toEqual(['customers', 'orders', 'accounts']);
  });
});

View file

@ -0,0 +1,18 @@
import type { FusedSearchCandidate, SearchLaneName } from './types.js';
/** Default value of the `rrfK` term that `rrfContribution` adds to the rank in its denominator. */
export const DEFAULT_RRF_K = 60;
/**
 * Default weight per search lane.
 * Semantic and dictionary lanes weigh the most (2), followed by lexical (1.5) and token (0.75).
 */
export const DEFAULT_SEARCH_LANE_WEIGHTS: Record<SearchLaneName, number> = {
semantic: 2,
dictionary: 2,
lexical: 1.5,
token: 0.75,
};
/**
 * Computes one lane's weighted reciprocal-rank contribution: `weight / (rrfK + rank)`.
 *
 * @param weight Weight of the lane.
 * @param rank Rank of the candidate within that lane.
 * @param rrfK Constant added to the rank; defaults to `DEFAULT_RRF_K`.
 * @returns The contribution to add to the candidate's fused score.
 */
export function rrfContribution(weight: number, rank: number, rrfK = DEFAULT_RRF_K): number {
  const denominator = rrfK + rank;
  return weight / denominator;
}
/**
 * Sort comparator for fused candidates.
 *
 * Orders by score (higher first), then by number of match reasons (more
 * first), then by `id` using `localeCompare` (ascending).
 *
 * @param left First candidate.
 * @param right Second candidate.
 * @returns A negative number when `left` sorts first, positive when `right`
 *   sorts first, and 0 when all three keys tie.
 */
export function compareFusedSearchCandidates(left: FusedSearchCandidate, right: FusedSearchCandidate): number {
  const scoreDelta = right.score - left.score;
  if (scoreDelta) {
    return scoreDelta;
  }
  const reasonCountDelta = right.matchReasons.length - left.matchReasons.length;
  if (reasonCountDelta) {
    return reasonCountDelta;
  }
  return left.id.localeCompare(right.id);
}

View file

@ -0,0 +1,85 @@
/**
 * Name of a search lane.
 * The literal members list the lanes that have default weights; because of the trailing `| string`,
 * the type as a whole accepts any string.
 */
export type SearchLaneName = 'lexical' | 'semantic' | 'dictionary' | 'token' | string;
/** Outcome reported for one lane of a search. */
export type SearchLaneStatus = 'available' | 'skipped' | 'failed';
/** A query after normalization. */
export interface NormalizedSearchQuery {
/** Query text exactly as supplied. */
raw: string;
/** The terms joined into one string. */
normalized: string;
/** Individual terms extracted from the query. */
terms: string[];
}
/** One candidate produced by a single lane. */
export interface SearchCandidate {
/** Identifier of the matched item. */
id: string;
/** Position of the candidate within its lane. NOTE(review): presumably 1-based with 1 as the best rank; confirm against the fusion code. */
rank: number;
/** Lane-specific score, when the lane provides one. */
rawScore?: number;
/** Optional reason label for the match. */
matchReason?: string;
/** Optional lane-specific payload; its shape is not constrained here. */
evidence?: unknown;
}
/** Arguments passed to a candidate generator. */
export interface SearchCandidateGeneratorArgs {
/** Query text as supplied. */
queryText: string;
/** Normalized form of the query. */
normalizedQuery: NormalizedSearchQuery;
/** Number of results requested after fusion. */
finalLimit: number;
/** Number of candidates requested from the lane. */
laneCandidatePoolLimit: number;
}
/** What a lane returns from one `generate` call. */
export interface SearchLaneResult {
/** Lane outcome; optional. NOTE(review): the default applied when omitted is decided by the consumer, not visible here. */
status?: SearchLaneStatus;
/** Candidates produced by the lane. */
candidates: SearchCandidate[];
/** Pool limit the lane actually applied, if it differs from or confirms the requested one. */
effectiveCandidatePoolLimit?: number;
/** Optional explanation, e.g. for a skipped or failed lane. */
reason?: string;
}
/** A lane implementation that can produce candidates for a query. */
export interface SearchCandidateGenerator {
/** Name of the lane this generator feeds. */
lane: SearchLaneName;
/** Optional weight for this lane. */
weight?: number;
/** Produces the lane's candidates for the given query arguments. */
generate(args: SearchCandidateGeneratorArgs): Promise<SearchLaneResult>;
}
/** Input for a hybrid search run. */
export interface HybridSearchOptions {
/** Query text as supplied. */
queryText: string;
/** Number of results requested. */
limit: number;
/** Optional per-lane candidate pool limit. */
candidatePoolLimit?: number;
/** Optional RRF `k` constant. */
rrfK?: number;
/** Optional per-lane weight overrides; lanes may be omitted. */
laneWeights?: Partial<Record<SearchLaneName, number>>;
/** Generators to run, one per lane. */
generators: SearchCandidateGenerator[];
}
/** Per-lane diagnostics included in a hybrid search result. */
export interface SearchLaneBreakdown {
/** Name of the lane. */
lane: SearchLaneName;
/** Outcome of the lane. */
status: SearchLaneStatus;
/** Pool limit that was requested from the lane. */
requestedCandidatePoolLimit: number;
/** Pool limit that was actually in effect for the lane. */
effectiveCandidatePoolLimit: number;
/** Number of candidates the lane returned. */
returnedCandidateCount: number;
/** Weight applied to the lane. */
weight: number;
/** Optional explanation accompanying the status. */
reason?: string;
}
/** A candidate after results from all lanes have been fused. */
export interface FusedSearchCandidate {
/** Identifier of the matched item. */
id: string;
/** Fused score; higher sorts first in `compareFusedSearchCandidates`. */
score: number;
/** Lanes that matched this candidate; more lanes sort first on equal score. */
matchReasons: SearchLaneName[];
/** Rank of the candidate in each lane, keyed by lane name. */
ranksByLane: Record<SearchLaneName, number>;
/** Raw lane score, keyed by lane name. */
rawScoresByLane: Record<SearchLaneName, number>;
/** Evidence payloads, keyed by lane name. */
evidenceByLane: Record<SearchLaneName, unknown[]>;
}
/** Converts fused candidates into caller-facing results of type `TResult`. */
export interface SearchResultHydrator<TResult> {
/** Produces results for the given fused candidates. */
hydrate(candidates: FusedSearchCandidate[]): Promise<TResult[]>;
}
/** Output of a hybrid search run. */
export interface HybridSearchResult {
/** Normalized form of the query that was searched. */
query: NormalizedSearchQuery;
/** Result limit that was requested. */
requestedLimit: number;
/** Per-lane candidate pool limit that was requested. */
requestedCandidatePoolLimit: number;
/** Fused candidates. */
results: FusedSearchCandidate[];
/** Diagnostics for each lane. */
lanes: SearchLaneBreakdown[];
}
/** Feature flags describing what a search backend supports. */
export interface SearchBackendCapabilities {
/** Full-text search. */
fts: boolean;
/** Vector search. */
vector: boolean;
/** Fuzzy matching. */
fuzzy: boolean;
/** Search over JSON values. */
jsonSearch: boolean;
/** Search over array values. */
arraySearch: boolean;
}