mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-13 08:15:14 +02:00
Initial open-source release
This commit is contained in:
commit
1a42152e6f
1199 changed files with 257054 additions and 0 deletions
354
packages/context/scripts/pglite-hybrid-search-spike.mjs
Normal file
354
packages/context/scripts/pglite-hybrid-search-spike.mjs
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
import { readdir, readFile, realpath, rm, stat, writeFile, mkdtemp } from 'node:fs/promises';
|
||||
import { createRequire } from 'node:module';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { dirname, join, relative, resolve } from 'node:path';
|
||||
import { performance } from 'node:perf_hooks';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const require = createRequire(import.meta.url);
|
||||
const scriptDir = dirname(fileURLToPath(import.meta.url));
|
||||
const contextDir = resolve(scriptDir, '..');
|
||||
const kloRoot = resolve(contextDir, '../..');
|
||||
const docsDir = join(kloRoot, 'docs');
|
||||
const reportPath = join(docsDir, 'hybrid-search-pglite-spike.md');
|
||||
|
||||
/**
 * Runs `fn`, awaits its result, and reports how long that took.
 *
 * @param {string} label - Probe name, echoed back unchanged in the result.
 * @param {() => unknown} fn - Work to measure; may return a promise.
 * @returns {Promise<{ label: string, durationMs: number, value: unknown }>}
 *   Elapsed milliseconds rounded to two decimals, plus the resolved value of `fn`.
 */
async function timed(label, fn) {
  const startedAt = performance.now();
  const value = await fn();
  const elapsed = performance.now() - startedAt;

  return { label, durationMs: Number(elapsed.toFixed(2)), value };
}
|
||||
|
||||
/**
 * Recursively sums the sizes of the regular files found at or under `path`.
 *
 * `stat` is used, so symbolic links are followed. An entry that is neither a
 * regular file nor a directory contributes 0 bytes.
 *
 * @param {string} path - File or directory to measure.
 * @returns {Promise<number>} Total size in bytes.
 */
async function directoryBytes(path) {
  const info = await stat(path);

  if (info.isFile()) {
    return info.size;
  }

  if (info.isDirectory()) {
    const names = await readdir(path);
    // All children are measured concurrently, then added up.
    const sizes = await Promise.all(names.map((name) => directoryBytes(join(path, name))));

    let total = 0;
    for (const size of sizes) {
      total += size;
    }
    return total;
  }

  return 0;
}
|
||||
|
||||
/**
 * Finds the `package.json` that declares `packageName`.
 *
 * The search starts in the directory of the package's resolved entry point and
 * climbs one directory at a time, stopping before the filesystem root. A
 * directory without a `package.json`, or with one carrying a different `name`,
 * is skipped.
 *
 * @param {string} packageName - Package specifier passed to `require.resolve`.
 * @returns {Promise<{ packageJsonPath: string, packageJson: object }>}
 *   The manifest path and its parsed contents.
 * @throws {Error} When no matching manifest is found, or when reading/parsing a
 *   candidate fails for any reason other than the file being absent (`ENOENT`).
 */
async function resolvePackageJson(packageName) {
  const entryDir = dirname(require.resolve(packageName));

  for (let dir = entryDir; dir !== dirname(dir); dir = dirname(dir)) {
    const packageJsonPath = join(dir, 'package.json');

    let packageJson;
    try {
      packageJson = JSON.parse(await readFile(packageJsonPath, 'utf8'));
    } catch (error) {
      if (error?.code === 'ENOENT') {
        // No manifest at this level; keep climbing.
        continue;
      }
      throw error;
    }

    if (packageJson.name === packageName) {
      return { packageJsonPath, packageJson };
    }
  }

  throw new Error(`Could not resolve package.json for ${packageName}`);
}
|
||||
|
||||
/**
 * Collects the report row for one installed package.
 *
 * @param {string} packageName - Package to describe.
 * @returns {Promise<{ name: string, version: string, path: string, bytes: number }>}
 *   The manifest version, the package directory (symlinks resolved) relative to
 *   `kloRoot`, and the total size of that directory in bytes.
 */
async function packageInfo(packageName) {
  const manifest = await resolvePackageJson(packageName);
  // realpath so the reported path and measured bytes refer to the physical directory.
  const packageDir = await realpath(dirname(manifest.packageJsonPath));
  const bytes = await directoryBytes(packageDir);

  return {
    name: packageName,
    version: manifest.packageJson.version,
    path: relative(kloRoot, packageDir),
    bytes,
  };
}
|
||||
|
||||
/**
 * Opens a PGlite database at `dataDir` with the vector and pg_trgm extensions and
 * makes sure the spike schema exists (all DDL is `IF NOT EXISTS`, so reopening an
 * existing data directory is fine).
 *
 * If schema setup fails, the freshly opened handle is closed before the error is
 * rethrown: callers only receive the handle on success, so they could never
 * close it themselves.
 *
 * @param {{ create: Function }} PGlite - The PGlite class (injected by the caller).
 * @param {unknown} vector - pgvector extension bundle for PGlite.
 * @param {unknown} pg_trgm - pg_trgm extension bundle for PGlite.
 * @param {string} dataDir - Filesystem directory backing the database.
 * @returns {Promise<object>} The open database handle.
 * @throws Whatever `PGlite.create` or the schema `exec` throws.
 */
async function createDb(PGlite, vector, pg_trgm, dataDir) {
  const db = await PGlite.create({
    dataDir,
    extensions: {
      vector,
      pg_trgm,
    },
  });

  try {
    await db.exec(`
      CREATE EXTENSION IF NOT EXISTS vector;
      CREATE EXTENSION IF NOT EXISTS pg_trgm;
      CREATE TABLE IF NOT EXISTS spike_documents (
        id TEXT PRIMARY KEY,
        search_text TEXT NOT NULL,
        metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
        embedding vector(3) NOT NULL
      );
      CREATE INDEX IF NOT EXISTS spike_documents_fts_idx
        ON spike_documents
        USING GIN (to_tsvector('english', search_text));
      CREATE INDEX IF NOT EXISTS spike_documents_vector_idx
        ON spike_documents
        USING ivfflat (embedding vector_cosine_ops)
        WITH (lists = 1);
      CREATE TABLE IF NOT EXISTS spike_dictionary_values (
        connection_id TEXT NOT NULL,
        source_name TEXT NOT NULL,
        column_name TEXT NOT NULL,
        value TEXT NOT NULL,
        PRIMARY KEY (connection_id, source_name, column_name, value)
      );
      CREATE INDEX IF NOT EXISTS spike_dictionary_values_trgm_idx
        ON spike_dictionary_values
        USING GIN (value gin_trgm_ops);
    `);
  } catch (error) {
    if (typeof db.close === 'function') {
      try {
        await db.close();
      } catch {
        // Best-effort cleanup: the schema error below is the one worth reporting.
      }
    }
    throw error;
  }

  return db;
}
|
||||
|
||||
/**
 * Loads the fixture rows used by the search probes.
 *
 * Three documents are upserted into `spike_documents` (an existing id is
 * overwritten) and three dictionary values are inserted into
 * `spike_dictionary_values` (existing rows are left alone).
 *
 * @param {{ query: Function }} db - Open database handle.
 * @returns {Promise<void>}
 */
async function seed(db) {
  const documents = [
    {
      id: 'warehouse/orders',
      searchText: 'orders paid revenue refund status customer',
      metadata: { connectionId: 'warehouse', sourceName: 'orders' },
      embedding: [1, 0, 0],
    },
    {
      id: 'finance/orders',
      searchText: 'orders finance bookings gross margin',
      metadata: { connectionId: 'finance', sourceName: 'orders' },
      embedding: [0.72, 0.28, 0],
    },
    {
      id: 'warehouse/customers',
      searchText: 'customers accounts lifecycle region',
      metadata: { connectionId: 'warehouse', sourceName: 'customers' },
      embedding: [0, 1, 0],
    },
  ];

  // Four positional parameters per document, in the order the VALUES tuples expect.
  const documentParams = documents.flatMap((doc) => [
    doc.id,
    doc.searchText,
    JSON.stringify(doc.metadata),
    JSON.stringify(doc.embedding),
  ]);

  await db.query(
    `
      INSERT INTO spike_documents (id, search_text, metadata, embedding)
      VALUES
        ($1, $2, $3::jsonb, $4::vector),
        ($5, $6, $7::jsonb, $8::vector),
        ($9, $10, $11::jsonb, $12::vector)
      ON CONFLICT (id) DO UPDATE
      SET search_text = EXCLUDED.search_text,
          metadata = EXCLUDED.metadata,
          embedding = EXCLUDED.embedding
    `,
    documentParams,
  );

  await db.query(`
    INSERT INTO spike_dictionary_values (connection_id, source_name, column_name, value)
    VALUES
      ('warehouse', 'orders', 'status', 'refunded'),
      ('warehouse', 'orders', 'status', 'paid'),
      ('warehouse', 'customers', 'region', 'emea')
    ON CONFLICT DO NOTHING
  `);
}
|
||||
|
||||
/**
 * Closes a database handle when it exposes a `close` method; otherwise does nothing.
 *
 * @param {{ close?: Function }} db - Handle to close.
 * @returns {Promise<void>}
 */
async function closeDb(db) {
  if (typeof db.close !== 'function') {
    return;
  }

  await db.close();
}
|
||||
|
||||
/**
 * Runs the PGlite hybrid-search spike and writes its Markdown report.
 *
 * Steps, each individually timed where a timer is shown:
 *  1. dynamically import PGlite plus the vector and pg_trgm extensions;
 *  2. create a persistent database in a fresh temp directory and seed fixtures;
 *  3. run one FTS, one pgvector cosine and one pg_trgm query;
 *  4. issue four parallel reads on the same instance;
 *  5. try to open the same data directory a second time and record whether that
 *     worked or was blocked (the failure message is captured, not rethrown);
 *  6. close, reopen, and count persisted rows;
 *  7. measure the installed package footprint and write the report to `reportPath`,
 *     echoing the raw result as JSON on stdout.
 *
 * Cleanup is best-effort: every open handle gets a close attempt and the temp
 * directory is always removed, so a failing close can neither leak the other
 * handle nor leave the directory behind nor hide the error that ended the run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const importTimer = await timed('dynamic import @electric-sql/pglite', async () => {
    const [{ PGlite }, { vector }, { pg_trgm }] = await Promise.all([
      import('@electric-sql/pglite'),
      import('@electric-sql/pglite/vector'),
      import('@electric-sql/pglite/contrib/pg_trgm'),
    ]);
    return { PGlite, vector, pg_trgm };
  });

  const { PGlite, vector, pg_trgm } = importTimer.value;
  const tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-report-'));
  const dataDir = join(tempDir, 'pgdata');

  // Declared outside the try so the finally block can close whichever is still open.
  let db;
  let reopened;

  try {
    const createTimer = await timed('create persistent PGlite database and load extensions', async () => {
      db = await createDb(PGlite, vector, pg_trgm, dataDir);
      return true;
    });

    const seedTimer = await timed('seed hybrid search fixture', async () => seed(db));

    const ftsTimer = await timed('Postgres FTS query', () =>
      db.query(
        `
          SELECT id
          FROM spike_documents
          WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
          ORDER BY ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) DESC, id ASC
          LIMIT 1
        `,
        ['paid orders'],
      ),
    );

    const vectorTimer = await timed('pgvector cosine query', () =>
      db.query(
        `
          SELECT id, 1 - (embedding <=> $1::vector) AS similarity
          FROM spike_documents
          ORDER BY embedding <=> $1::vector, id ASC
          LIMIT 1
        `,
        [JSON.stringify([1, 0, 0])],
      ),
    );

    const trigramTimer = await timed('pg_trgm dictionary query', () =>
      db.query(
        `
          SELECT connection_id || '/' || source_name AS id, value, similarity(value, $1) AS score
          FROM spike_dictionary_values
          WHERE similarity(value, $1) > 0
          ORDER BY score DESC, id ASC, value ASC
          LIMIT 1
        `,
        ['refund'],
      ),
    );

    const sameInstanceTimer = await timed('same instance parallel reads', () =>
      Promise.all(Array.from({ length: 4 }, () => db.query('SELECT COUNT(*)::int AS count FROM spike_documents'))),
    );

    // Probe: can a second, independent handle open the data directory while the
    // first one is still open? Either outcome is a result, so errors are recorded.
    let secondOpenStatus = 'opened';
    let secondOpenMessage = 'Second direct opener executed SELECT 1.';
    let second;
    try {
      second = await createDb(PGlite, vector, pg_trgm, dataDir);
      await second.query('SELECT 1');
    } catch (error) {
      secondOpenStatus = 'blocked';
      secondOpenMessage = error instanceof Error ? error.message : String(error);
    } finally {
      if (second) {
        await closeDb(second);
      }
    }

    await closeDb(db);
    // Cleared so the outer finally does not close the same handle twice.
    db = undefined;

    const reopenTimer = await timed('reopen persistent PGlite database', async () => {
      reopened = await createDb(PGlite, vector, pg_trgm, dataDir);
      return reopened.query('SELECT COUNT(*)::int AS count FROM spike_documents');
    });

    const packages = await Promise.all([
      packageInfo('@electric-sql/pglite'),
      packageInfo('@electric-sql/pglite-socket'),
    ]);

    const result = {
      generatedAt: new Date().toISOString(),
      node: process.version,
      packages,
      timingsMs: {
        import: importTimer.durationMs,
        createAndExtensions: createTimer.durationMs,
        seed: seedTimer.durationMs,
        ftsQuery: ftsTimer.durationMs,
        vectorQuery: vectorTimer.durationMs,
        trigramQuery: trigramTimer.durationMs,
        sameInstanceParallelReads: sameInstanceTimer.durationMs,
        reopen: reopenTimer.durationMs,
      },
      topResults: {
        fts: ftsTimer.value.rows[0]?.id ?? null,
        vector: vectorTimer.value.rows[0]?.id ?? null,
        trigram: trigramTimer.value.rows[0]?.id ?? null,
        persistedRowCount: reopenTimer.value.rows[0]?.count ?? null,
      },
      concurrency: {
        sameInstanceReadCounts: sameInstanceTimer.value.map((queryResult) => queryResult.rows[0]?.count ?? null),
        secondDirectOpenStatus: secondOpenStatus,
        secondDirectOpenMessage: secondOpenMessage,
      },
    };

    const totalPackageBytes = packages.reduce((sum, pkg) => sum + pkg.bytes, 0);
    const recommendation =
      secondOpenStatus === 'opened'
        ? 'Prototype a PGlite backend behind an explicit owner process or socket before exposing CLI plus MCP concurrent access.'
        : 'Use a socket or owner-process architecture for any PGlite backend prototype because direct second opener access was blocked.';

    const markdown = `# Hybrid Search PGlite Spike

Generated: ${result.generatedAt}

## Summary

PGlite loaded in Node ${result.node}, enabled vector and pg_trgm extensions, executed Postgres FTS, pgvector cosine ranking, pg_trgm dictionary ranking, and reopened a persistent filesystem database.

Recommendation: ${recommendation}

## Package Footprint

| Package | Version | Approx bytes | Resolved path |
| --- | --- | ---: | --- |
${packages.map((pkg) => `| \`${pkg.name}\` | \`${pkg.version}\` | ${pkg.bytes} | \`${pkg.path}\` |`).join('\n')}

Total measured package bytes: ${totalPackageBytes}

## Timings

| Probe | Duration ms |
| --- | ---: |
${Object.entries(result.timingsMs)
  .map(([name, ms]) => `| ${name} | ${ms} |`)
  .join('\n')}

## Search Feature Results

| Probe | Top result |
| --- | --- |
| Postgres FTS | \`${result.topResults.fts}\` |
| pgvector cosine | \`${result.topResults.vector}\` |
| pg_trgm dictionary | \`${result.topResults.trigram}\` |
| Reopened persisted row count | \`${result.topResults.persistedRowCount}\` |

## Concurrency Observation

Same-instance parallel read counts: \`${result.concurrency.sameInstanceReadCounts.join(', ')}\`

Second direct opener status: \`${result.concurrency.secondDirectOpenStatus}\`

Second direct opener message:

\`\`\`text
${result.concurrency.secondDirectOpenMessage}
\`\`\`

## Decision

The SQLite backend remains the production default. The next PGlite step, if approved, is an owner-process or socket-backed prototype that reuses the existing \`SearchBackendCapabilities\` and backend conformance helpers without changing the public CLI surface.
`;

    await writeFile(reportPath, markdown);
    process.stdout.write(`Wrote ${relative(process.cwd(), reportPath)}\n`);
    process.stdout.write(JSON.stringify(result, null, 2));
    process.stdout.write('\n');
  } finally {
    // allSettled: a close that fails must not skip the other handle or the
    // temp-directory removal, and must not replace the error that got us here.
    const stillOpen = [db, reopened].filter(Boolean);
    await Promise.allSettled(stillOpen.map((handle) => closeDb(handle)));
    await rm(tempDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
// Script entry point. Any failure from main() is printed, and the process is
// marked as failed through exitCode (process.exit is not called here).
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
317
packages/context/scripts/pglite-owner-process-prototype.mjs
Normal file
317
packages/context/scripts/pglite-owner-process-prototype.mjs
Normal file
|
|
@ -0,0 +1,317 @@
|
|||
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { createServer } from 'node:net';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { dirname, join, resolve } from 'node:path';
|
||||
import { performance } from 'node:perf_hooks';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { PGlite } from '@electric-sql/pglite';
|
||||
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
|
||||
import { vector } from '@electric-sql/pglite/vector';
|
||||
import { PGLiteSocketServer } from '@electric-sql/pglite-socket';
|
||||
import { Client } from 'pg';
|
||||
|
||||
const scriptDir = dirname(fileURLToPath(import.meta.url));
|
||||
const contextDir = resolve(scriptDir, '..');
|
||||
const kloRoot = resolve(contextDir, '../..');
|
||||
const reportPath = join(kloRoot, 'docs', 'hybrid-search-pglite-owner-process.md');
|
||||
|
||||
/**
 * Awaits `fn` and measures how long it took.
 *
 * @param {string} label - Probe name, returned unchanged.
 * @param {() => unknown} fn - Work to measure; may return a promise.
 * @returns {Promise<{ label: string, durationMs: number, value: unknown }>}
 *   Duration in milliseconds rounded to two decimals, plus the resolved value.
 */
async function timed(label, fn) {
  const t0 = performance.now();
  const value = await fn();
  const durationMs = Number((performance.now() - t0).toFixed(2));

  return { label, durationMs, value };
}
|
||||
|
||||
/**
 * Asks the OS for a free TCP port on 127.0.0.1 by binding port 0, reading the
 * assigned port, and closing the probe server again.
 *
 * The port is released before this function returns, so nothing reserves it:
 * another process may claim it before the caller binds.
 *
 * @returns {Promise<number>} The port number the OS assigned.
 * @throws {Error} When binding fails, when the server reports no TCP address,
 *   or when closing the probe server fails.
 */
async function allocatePort() {
  const server = createServer();

  await new Promise((resolve, reject) => {
    // Without an 'error' listener a failed bind would surface as an unhandled
    // 'error' event and this promise would never settle.
    server.once('error', reject);
    server.listen(0, '127.0.0.1', () => {
      server.off('error', reject);
      resolve();
    });
  });

  const address = server.address();
  if (typeof address !== 'object' || address === null) {
    // Do not leave the probe server listening when bailing out.
    server.close();
    throw new Error('Expected TCP server address while allocating a PGlite owner-process port.');
  }

  await new Promise((resolve, reject) => {
    server.close((error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve();
    });
  });

  return address.port;
}
|
||||
|
||||
/**
 * Starts the "owner": one PGlite database at `dataDir` (vector + pg_trgm
 * extensions, prototype schema ensured with `IF NOT EXISTS` DDL) exposed to
 * PostgreSQL clients through a `PGLiteSocketServer` on 127.0.0.1:`port`.
 *
 * If the schema setup or the socket server start fails, the database is closed
 * before the error is rethrown; the caller never sees the handle in that case
 * and so could not release it.
 *
 * @param {string} dataDir - Filesystem directory backing the database.
 * @param {number} port - Loopback TCP port for the socket server.
 * @returns {Promise<{ db: object, server: object, connectionConfig: object }>}
 *   The database, the running socket server, and a connection config for clients.
 * @throws Whatever `PGlite.create`, the schema `exec`, or `server.start()` throws.
 */
async function createOwner(dataDir, port) {
  const db = await PGlite.create({
    dataDir,
    extensions: {
      vector,
      pg_trgm,
    },
  });

  try {
    await db.exec(`
      CREATE EXTENSION IF NOT EXISTS vector;
      CREATE EXTENSION IF NOT EXISTS pg_trgm;
      CREATE TABLE IF NOT EXISTS prototype_documents (
        id TEXT PRIMARY KEY,
        search_text TEXT NOT NULL,
        metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
        embedding vector(3) NOT NULL
      );
      CREATE INDEX IF NOT EXISTS prototype_documents_fts_idx
        ON prototype_documents
        USING GIN (to_tsvector('english', search_text));
      CREATE INDEX IF NOT EXISTS prototype_documents_vector_idx
        ON prototype_documents
        USING ivfflat (embedding vector_cosine_ops)
        WITH (lists = 1);
      CREATE TABLE IF NOT EXISTS prototype_dictionary_values (
        connection_id TEXT NOT NULL,
        source_name TEXT NOT NULL,
        column_name TEXT NOT NULL,
        value TEXT NOT NULL,
        PRIMARY KEY (connection_id, source_name, column_name, value)
      );
      CREATE INDEX IF NOT EXISTS prototype_dictionary_values_trgm_idx
        ON prototype_dictionary_values
        USING GIN (value gin_trgm_ops);
    `);

    const server = new PGLiteSocketServer({
      db,
      host: '127.0.0.1',
      port,
      maxConnections: 100,
    });

    await server.start();

    return {
      db,
      server,
      connectionConfig: {
        host: '127.0.0.1',
        port,
        user: 'postgres',
        database: 'postgres',
        application_name: 'klo-pglite-owner-report',
        connectionTimeoutMillis: 5_000,
      },
    };
  } catch (error) {
    try {
      await db.close();
    } catch {
      // Best-effort cleanup: the startup error below is the one worth reporting.
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Connects a new `pg` client, hands it to `fn`, and always ends the client
 * afterwards, whether `fn` resolved or threw.
 *
 * @param {object} connectionConfig - Options passed to the `Client` constructor.
 * @param {(client: object) => unknown} fn - Work to run with the connected client.
 * @returns {Promise<unknown>} Whatever `fn` resolves to.
 */
async function withClient(connectionConfig, fn) {
  const pgClient = new Client(connectionConfig);
  await pgClient.connect();

  let outcome;
  try {
    outcome = await fn(pgClient);
  } finally {
    await pgClient.end();
  }
  return outcome;
}
|
||||
|
||||
/**
 * Loads the prototype fixtures through a socket client.
 *
 * Three documents are upserted into `prototype_documents` (an existing id is
 * overwritten) and three dictionary values are inserted into
 * `prototype_dictionary_values` (existing rows are left alone). Both statements
 * run on the same client connection.
 *
 * @param {object} connectionConfig - Connection options handed to `withClient`.
 * @returns {Promise<void>}
 */
async function seed(connectionConfig) {
  const documents = [
    {
      id: 'warehouse/orders',
      searchText: 'orders paid revenue refund status customer',
      metadata: { connectionId: 'warehouse', sourceName: 'orders' },
      embedding: [1, 0, 0],
    },
    {
      id: 'finance/orders',
      searchText: 'orders finance bookings gross margin',
      metadata: { connectionId: 'finance', sourceName: 'orders' },
      embedding: [0.72, 0.28, 0],
    },
    {
      id: 'warehouse/customers',
      searchText: 'customers accounts lifecycle region',
      metadata: { connectionId: 'warehouse', sourceName: 'customers' },
      embedding: [0, 1, 0],
    },
  ];

  // Four positional parameters per document, in the order the VALUES tuples expect.
  const documentParams = documents.flatMap((doc) => [
    doc.id,
    doc.searchText,
    JSON.stringify(doc.metadata),
    JSON.stringify(doc.embedding),
  ]);

  await withClient(connectionConfig, async (client) => {
    await client.query(
      `
        INSERT INTO prototype_documents (id, search_text, metadata, embedding)
        VALUES
          ($1, $2, $3::jsonb, $4::vector),
          ($5, $6, $7::jsonb, $8::vector),
          ($9, $10, $11::jsonb, $12::vector)
        ON CONFLICT (id) DO UPDATE
        SET search_text = EXCLUDED.search_text,
            metadata = EXCLUDED.metadata,
            embedding = EXCLUDED.embedding
      `,
      documentParams,
    );

    await client.query(`
      INSERT INTO prototype_dictionary_values (connection_id, source_name, column_name, value)
      VALUES
        ('warehouse', 'orders', 'status', 'refunded'),
        ('warehouse', 'orders', 'status', 'paid'),
        ('warehouse', 'customers', 'region', 'emea')
      ON CONFLICT DO NOTHING
    `);
  });
}
|
||||
|
||||
/**
 * Runs the three search probes over one socket client, one after another, and
 * returns the id of each probe's top row.
 *
 * - lexical: full-text match for "paid orders", ranked with `ts_rank_cd`;
 * - semantic: nearest `embedding` to `[1, 0, 0]` using the `<=>` operator;
 * - dictionary: highest trigram `similarity` to "refund".
 *
 * @param {object} connectionConfig - Connection options handed to `withClient`.
 * @returns {Promise<{ lexical: string, semantic: string, dictionary: string }>}
 *   Top ids; a probe that returns no row yields the literal `'<missing>'`.
 */
async function queryTopResults(connectionConfig) {
  const lexicalSql = `
    SELECT id
    FROM prototype_documents
    WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
    ORDER BY ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) DESC, id ASC
    LIMIT 1
  `;
  const semanticSql = `
    SELECT id
    FROM prototype_documents
    ORDER BY embedding <=> $1::vector, id ASC
    LIMIT 1
  `;
  const dictionarySql = `
    SELECT connection_id || '/' || source_name AS id
    FROM prototype_dictionary_values
    WHERE similarity(value, $1) > 0
    ORDER BY similarity(value, $1) DESC, id ASC, value ASC
    LIMIT 1
  `;

  // Id of the first row, or the placeholder when the probe matched nothing.
  const topId = (result) => result.rows[0]?.id ?? '<missing>';

  return await withClient(connectionConfig, async (client) => {
    const lexical = await client.query(lexicalSql, ['paid orders']);
    const semantic = await client.query(semanticSql, [JSON.stringify([1, 0, 0])]);
    const dictionary = await client.query(dictionarySql, ['refund']);

    return {
      lexical: topId(lexical),
      semantic: topId(semantic),
      dictionary: topId(dictionary),
    };
  });
}
|
||||
|
||||
/**
 * Opens four socket clients at once, runs the same row-count query on all of
 * them in parallel, and returns the four counts.
 *
 * Every client that managed to connect is ended afterwards, including when
 * another client failed to connect or a query failed. Errors from `end()` are
 * ignored so they cannot hide the failure that matters.
 *
 * @param {object} connectionConfig - Options passed to each `Client` constructor.
 * @returns {Promise<Array<number | null>>} One count per client (`null` when a
 *   query returned no row).
 * @throws The first connection error, or the first query error.
 */
async function concurrentReads(connectionConfig) {
  const clients = Array.from({ length: 4 }, () => new Client(connectionConfig));

  // allSettled instead of all: wait for every connect attempt so that the
  // clients which did connect are known and can be ended below.
  const attempts = await Promise.allSettled(
    clients.map(async (client) => {
      await client.connect();
    }),
  );
  const connected = clients.filter((_, index) => attempts[index].status === 'fulfilled');

  try {
    const failedAttempt = attempts.find((attempt) => attempt.status === 'rejected');
    if (failedAttempt) {
      throw failedAttempt.reason;
    }

    const results = await Promise.all(
      connected.map((client) => client.query('SELECT COUNT(*)::int AS count FROM prototype_documents')),
    );
    return results.map((result) => result.rows[0]?.count ?? null);
  } finally {
    await Promise.all(connected.map((client) => client.end().catch(() => undefined)));
  }
}
|
||||
|
||||
/**
 * Shuts an owner down: stops the socket server first, then closes the database.
 *
 * The database is closed even when stopping the server throws, so a failed
 * server shutdown cannot leave the database open.
 *
 * @param {{ server: { stop: Function }, db: { close: Function } }} owner - Value
 *   returned by `createOwner`.
 * @returns {Promise<void>}
 */
async function stopOwner(owner) {
  try {
    await owner.server.stop();
  } finally {
    await owner.db.close();
  }
}
|
||||
|
||||
/**
 * Runs the owner-process prototype and writes its Markdown report.
 *
 * Starts an owner (PGlite + socket server) on a temp data directory, seeds it,
 * runs the search probes and the concurrent-read probe through socket clients,
 * stops the owner, starts a new one on the same data directory and port, and
 * counts the rows that are still there. The report goes to `reportPath`; the
 * same figures are logged as JSON.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-owner-report-'));
  const dataDir = join(tempDir, 'pgdata');
  const port = await allocatePort();

  // Declared outside the try so the finally block can stop whichever owner is live.
  let owner;

  try {
    const startTimer = await timed('startOwner', async () => await createOwner(dataDir, port));
    owner = startTimer.value;

    const seedTimer = await timed('seed', async () => await seed(owner.connectionConfig));
    const queryTimer = await timed('searchQueries', async () => await queryTopResults(owner.connectionConfig));
    const concurrentTimer = await timed('concurrentReads', async () => await concurrentReads(owner.connectionConfig));

    await stopOwner(owner);
    // Cleared so the finally block does not stop the already-stopped owner again.
    owner = undefined;

    // Second owner on the same data directory and port: the persistence check.
    const restartTimer = await timed('restartOwner', async () => await createOwner(dataDir, port));
    owner = restartTimer.value;

    const persisted = await withClient(owner.connectionConfig, async (client) => {
      const result = await client.query('SELECT COUNT(*)::int AS count FROM prototype_documents');
      return result.rows[0]?.count ?? null;
    });

    const markdown = `# Hybrid Search PGlite Owner Process Prototype

Generated: ${new Date().toISOString()}

## Summary

PGlite started behind one explicit owner process, enabled vector and pg_trgm extensions, served PostgreSQL clients through \`@electric-sql/pglite-socket\`, answered lexical, semantic, and dictionary probes, and preserved rows across owner restart.

Recommendation: Keep SQLite as the production default. The next PGlite implementation step should be a private adapter prototype behind an explicit configuration flag, still guarded by backend conformance tests, before any CLI or MCP default changes.

## Timings

| Probe | Duration ms |
| --- | ---: |
| startOwner | ${startTimer.durationMs} |
| seed | ${seedTimer.durationMs} |
| searchQueries | ${queryTimer.durationMs} |
| concurrentReads | ${concurrentTimer.durationMs} |
| restartOwner | ${restartTimer.durationMs} |

## Search Feature Results

| Probe | Top result |
| --- | --- |
| Postgres FTS through socket | \`${queryTimer.value.lexical}\` |
| pgvector cosine through socket | \`${queryTimer.value.semantic}\` |
| pg_trgm dictionary through socket | \`${queryTimer.value.dictionary}\` |
| Reopened persisted row count | \`${persisted}\` |

## Concurrency Observation

Concurrent socket read counts: \`${concurrentTimer.value.join(', ')}\`

## Decision

The owner-process shape is viable for a prototype because it gives CLI and MCP callers a PostgreSQL protocol boundary without opening the same PGlite data directory from independent runtimes. This report is not a production adapter acceptance record.
`;

    await writeFile(reportPath, markdown);
    console.log(`Wrote ${reportPath}`);
    console.log(
      JSON.stringify(
        {
          port,
          timings: {
            startOwner: startTimer.durationMs,
            seed: seedTimer.durationMs,
            searchQueries: queryTimer.durationMs,
            concurrentReads: concurrentTimer.durationMs,
            restartOwner: restartTimer.durationMs,
          },
          topResults: queryTimer.value,
          concurrentReads: concurrentTimer.value,
          persisted,
        },
        null,
        2,
      ),
    );
  } finally {
    if (owner) {
      // Shutdown errors are deliberately ignored here so the temp directory is
      // still removed and the error from the try block (if any) is the one seen.
      await stopOwner(owner).catch(() => undefined);
    }
    await rm(tempDir, { recursive: true, force: true });
  }
}

// Script entry point (top-level await); a rejection from main() is not caught here.
await main();
|
||||
263
packages/context/scripts/pglite-sl-search-prototype.mjs
Normal file
263
packages/context/scripts/pglite-sl-search-prototype.mjs
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { createServer } from 'node:net';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { dirname, join, resolve } from 'node:path';
|
||||
import { performance } from 'node:perf_hooks';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { PGlite } from '@electric-sql/pglite';
|
||||
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
|
||||
import { vector } from '@electric-sql/pglite/vector';
|
||||
import { PGLiteSocketServer } from '@electric-sql/pglite-socket';
|
||||
import { Client } from 'pg';
|
||||
|
||||
const scriptDir = dirname(fileURLToPath(import.meta.url));
|
||||
const contextDir = resolve(scriptDir, '..');
|
||||
const kloRoot = resolve(contextDir, '../..');
|
||||
const reportPath = join(kloRoot, 'docs', 'hybrid-search-pglite-sl-adapter-prototype.md');
|
||||
|
||||
/**
 * Awaits `fn` and measures how long it took.
 *
 * @param {string} label - Probe name, returned unchanged.
 * @param {() => unknown} fn - Work to measure; may return a promise.
 * @returns {Promise<{ label: string, durationMs: number, value: unknown }>}
 *   Duration in milliseconds rounded to two decimals, plus the resolved value.
 */
async function timed(label, fn) {
  const began = performance.now();
  const value = await fn();
  const finished = performance.now();
  const durationMs = Number((finished - began).toFixed(2));

  return { label, durationMs, value };
}
|
||||
|
||||
/**
 * Asks the OS for a free TCP port on 127.0.0.1 by binding port 0, reading the
 * assigned port, and closing the probe server again.
 *
 * The port is released before this function returns, so nothing reserves it:
 * another process may claim it before the caller binds.
 *
 * @returns {Promise<number>} The port number the OS assigned.
 * @throws {Error} When binding fails, when the server reports no TCP address,
 *   or when closing the probe server fails.
 */
async function allocatePort() {
  const server = createServer();

  await new Promise((resolve, reject) => {
    // Without an 'error' listener a failed bind would surface as an unhandled
    // 'error' event and this promise would never settle.
    server.once('error', reject);
    server.listen(0, '127.0.0.1', () => {
      server.off('error', reject);
      resolve();
    });
  });

  const address = server.address();
  if (typeof address !== 'object' || address === null) {
    // Do not leave the probe server listening when bailing out.
    server.close();
    throw new Error('Expected TCP server address while allocating a PGlite SL prototype port.');
  }

  await new Promise((resolve, reject) => {
    server.close((error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve();
    });
  });

  return address.port;
}
|
||||
|
||||
/**
 * Starts the "owner": one PGlite database at `dataDir` (vector + pg_trgm
 * extensions, semantic-layer prototype schema) exposed to PostgreSQL clients
 * through a `PGLiteSocketServer` on 127.0.0.1:`port`.
 *
 * The tables and indexes are created without `IF NOT EXISTS`, so this expects a
 * data directory that does not already contain them.
 *
 * If the schema setup or the socket server start fails, the database is closed
 * before the error is rethrown; the caller never sees the handle in that case
 * and so could not release it.
 *
 * @param {string} dataDir - Filesystem directory backing the database.
 * @param {number} port - Loopback TCP port for the socket server.
 * @returns {Promise<{ db: object, server: object, connectionConfig: object }>}
 *   The database, the running socket server, and a connection config for clients.
 * @throws Whatever `PGlite.create`, the schema `exec`, or `server.start()` throws.
 */
async function createOwner(dataDir, port) {
  const db = await PGlite.create({
    dataDir,
    extensions: { vector, pg_trgm },
  });

  try {
    await db.exec(`
      CREATE EXTENSION IF NOT EXISTS vector;
      CREATE EXTENSION IF NOT EXISTS pg_trgm;
      CREATE TABLE prototype_sl_sources (
        connection_id TEXT NOT NULL,
        source_name TEXT NOT NULL,
        search_text TEXT NOT NULL,
        embedding vector(3),
        PRIMARY KEY (connection_id, source_name)
      );
      CREATE INDEX prototype_sl_sources_fts_idx
        ON prototype_sl_sources
        USING GIN (to_tsvector('english', search_text));
      CREATE INDEX prototype_sl_sources_vector_idx
        ON prototype_sl_sources
        USING ivfflat (embedding vector_cosine_ops)
        WITH (lists = 1);
      CREATE TABLE prototype_sl_dictionary_values (
        connection_id TEXT NOT NULL,
        source_name TEXT NOT NULL,
        column_name TEXT NOT NULL,
        value TEXT NOT NULL,
        value_lower TEXT NOT NULL,
        PRIMARY KEY (connection_id, source_name, column_name, value)
      );
      CREATE INDEX prototype_sl_dictionary_values_trgm_idx
        ON prototype_sl_dictionary_values
        USING GIN (value gin_trgm_ops);
    `);

    const server = new PGLiteSocketServer({ db, host: '127.0.0.1', port, maxConnections: 100 });
    await server.start();

    return {
      db,
      server,
      connectionConfig: {
        host: '127.0.0.1',
        port,
        user: 'postgres',
        database: 'postgres',
        application_name: 'klo-pglite-sl-prototype-report',
        connectionTimeoutMillis: 5_000,
      },
    };
  } catch (error) {
    try {
      await db.close();
    } catch {
      // Best-effort cleanup: the startup error below is the one worth reporting.
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Connects a new `pg` client, hands it to `fn`, and always ends the client
 * afterwards, whether `fn` resolved or threw.
 *
 * @param {object} connectionConfig - Options passed to the `Client` constructor.
 * @param {(client: object) => unknown} fn - Work to run with the connected client.
 * @returns {Promise<unknown>} Whatever `fn` resolves to.
 */
async function withClient(connectionConfig, fn) {
  const session = new Client(connectionConfig);
  await session.connect();

  let result;
  try {
    result = await fn(session);
  } finally {
    await session.end();
  }
  return result;
}
|
||||
|
||||
/**
 * Loads the semantic-layer prototype fixtures through a socket client.
 *
 * Three rows are inserted into `prototype_sl_sources` and three into
 * `prototype_sl_dictionary_values`. Both are plain INSERTs with no conflict
 * clause, and both run on the same client connection.
 *
 * @param {object} connectionConfig - Connection options handed to `withClient`.
 * @returns {Promise<void>}
 */
async function seed(connectionConfig) {
  const sources = [
    {
      connectionId: 'warehouse',
      sourceName: 'orders',
      searchText: 'orders paid revenue refund status customer',
      embedding: [1, 0, 0],
    },
    {
      connectionId: 'finance',
      sourceName: 'orders',
      searchText: 'orders finance bookings gross margin',
      embedding: [0.72, 0.28, 0],
    },
    {
      connectionId: 'warehouse',
      sourceName: 'customers',
      searchText: 'customers accounts lifecycle region',
      embedding: [0, 1, 0],
    },
  ];

  // Four positional parameters per source, in the order the VALUES tuples expect.
  const sourceParams = sources.flatMap((source) => [
    source.connectionId,
    source.sourceName,
    source.searchText,
    JSON.stringify(source.embedding),
  ]);

  await withClient(connectionConfig, async (client) => {
    await client.query(
      `
        INSERT INTO prototype_sl_sources (connection_id, source_name, search_text, embedding)
        VALUES
          ($1, $2, $3, $4::vector),
          ($5, $6, $7, $8::vector),
          ($9, $10, $11, $12::vector)
      `,
      sourceParams,
    );

    await client.query(`
      INSERT INTO prototype_sl_dictionary_values (connection_id, source_name, column_name, value, value_lower)
      VALUES
        ('warehouse', 'orders', 'status', 'refunded', 'refunded'),
        ('warehouse', 'orders', 'status', 'paid', 'paid'),
        ('warehouse', 'customers', 'region', 'emea', 'emea')
    `);
  });
}
|
||||
|
||||
/**
 * Runs the three search probes over one socket client, one after another, and
 * returns the `connection_id/source_name` id of each probe's top row.
 *
 * - lexical: full-text match for "paid revenue", ranked with `ts_rank_cd`;
 * - semantic: nearest `embedding` to `[1, 0, 0]` using the `<=>` operator;
 * - dictionary: rows whose `value` has trigram similarity to "refund" or whose
 *   `value_lower` contains it; a substring hit is scored as at least 0.75.
 *
 * @param {object} connectionConfig - Connection options handed to `withClient`.
 * @returns {Promise<{ lexical: string, semantic: string, dictionary: string }>}
 *   Top ids; a probe that returns no row yields the literal `'<missing>'`.
 */
async function queryTopResults(connectionConfig) {
  const lexicalSql = `
    SELECT connection_id || '/' || source_name AS id
    FROM prototype_sl_sources
    WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
    ORDER BY ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) DESC, id ASC
    LIMIT 1
  `;
  const semanticSql = `
    SELECT connection_id || '/' || source_name AS id
    FROM prototype_sl_sources
    ORDER BY embedding <=> $1::vector, id ASC
    LIMIT 1
  `;
  const dictionarySql = `
    SELECT connection_id || '/' || source_name AS id
    FROM prototype_sl_dictionary_values
    WHERE similarity(value, $1) > 0 OR value_lower LIKE '%' || lower($1) || '%'
    ORDER BY GREATEST(similarity(value, $1), CASE WHEN value_lower LIKE '%' || lower($1) || '%' THEN 0.75 ELSE 0 END) DESC,
      id ASC,
      value ASC
    LIMIT 1
  `;

  // Id of the first row, or the placeholder when the probe matched nothing.
  const topId = (result) => result.rows[0]?.id ?? '<missing>';

  return withClient(connectionConfig, async (client) => {
    const lexical = await client.query(lexicalSql, ['paid revenue']);
    const semantic = await client.query(semanticSql, [JSON.stringify([1, 0, 0])]);
    const dictionary = await client.query(dictionarySql, ['refund']);

    return {
      lexical: topId(lexical),
      semantic: topId(semantic),
      dictionary: topId(dictionary),
    };
  });
}
|
||||
|
||||
/**
 * Shuts an owner down: stops the socket server first, then closes the database.
 *
 * The database is closed even when stopping the server throws, so a failed
 * server shutdown cannot leave the database open.
 *
 * @param {{ server: { stop: Function }, db: { close: Function } }} owner - Value
 *   returned by `createOwner`.
 * @returns {Promise<void>}
 */
async function stopOwner(owner) {
  try {
    await owner.server.stop();
  } finally {
    await owner.db.close();
  }
}
|
||||
|
||||
/**
 * Runs the semantic-layer search prototype and writes its Markdown report.
 *
 * Starts an owner (PGlite + socket server) on a temp data directory, seeds the
 * semantic-layer fixtures, and runs the three search probes through a socket
 * client. The report goes to `reportPath`; the same figures are logged as JSON.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-sl-prototype-report-'));
  const dataDir = join(tempDir, 'pgdata');
  const port = await allocatePort();
  // Declared outside the try so the finally block can stop the owner if it started.
  let owner;

  try {
    const startTimer = await timed('startOwner', async () => createOwner(dataDir, port));
    owner = startTimer.value;
    const seedTimer = await timed('seedSemanticLayerIndex', async () => seed(owner.connectionConfig));
    const searchTimer = await timed('searchQueries', async () => queryTopResults(owner.connectionConfig));

    const markdown = `# Hybrid Search PGlite Semantic-Layer Adapter Prototype

Generated: ${new Date().toISOString()}

## Summary

PGlite served a semantic-layer-style search index through one owner process and PostgreSQL clients. The probe returned lexical, semantic, and dictionary top results through Postgres FTS, pgvector ordering, and pg_trgm matching.

Recommendation: Keep SQLite as the production default. The PGlite semantic-layer adapter remains private and explicitly opt-in until a separate plan decides runtime dependencies, long-lived owner lifecycle, and CLI/MCP routing.

## Timings

| Probe | Duration ms |
| --- | ---: |
| startOwner | ${startTimer.durationMs} |
| seedSemanticLayerIndex | ${seedTimer.durationMs} |
| searchQueries | ${searchTimer.durationMs} |

## Search Feature Results

| Probe | Top result |
| --- | --- |
| Postgres FTS through socket | \`${searchTimer.value.lexical}\` |
| pgvector cosine through socket | \`${searchTimer.value.semantic}\` |
| pg_trgm dictionary through socket | \`${searchTimer.value.dictionary}\` |

## Decision

The private adapter shape is viable for semantic-layer search prototypes. It is not a production backend acceptance record and does not change the default SQLite search path.
`;

    await writeFile(reportPath, markdown);
    console.log(`Wrote ${reportPath}`);
    console.log(
      JSON.stringify(
        {
          port,
          timings: {
            startOwner: startTimer.durationMs,
            // Note: the JSON key is "seed" while the report row is "seedSemanticLayerIndex".
            seed: seedTimer.durationMs,
            searchQueries: searchTimer.durationMs,
          },
          topResults: searchTimer.value,
        },
        null,
        2,
      ),
    );
  } finally {
    if (owner) {
      // Shutdown errors are deliberately ignored here so the temp directory is
      // still removed and the error from the try block (if any) is the one seen.
      await stopOwner(owner).catch(() => undefined);
    }
    await rm(tempDir, { recursive: true, force: true });
  }
}

// Script entry point (top-level await); a rejection from main() is not caught here.
await main();
|
||||
52
packages/context/scripts/relationship-benchmark-report.mjs
Normal file
52
packages/context/scripts/relationship-benchmark-report.mjs
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import { dirname, join, resolve } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import {
|
||||
KLO_RELATIONSHIP_BENCHMARK_MODES,
|
||||
buildKloRelationshipBenchmarkReport,
|
||||
currentKloRelationshipBenchmarkDetector,
|
||||
formatKloRelationshipBenchmarkReportMarkdown,
|
||||
kloRelationshipBenchmarkDetectorWithLlm,
|
||||
loadKloRelationshipBenchmarkFixtures,
|
||||
runKloRelationshipBenchmarkSuite,
|
||||
} from '../dist/scan/index.js';
|
||||
|
||||
const scriptDir = dirname(fileURLToPath(import.meta.url));
|
||||
const packageRoot = resolve(scriptDir, '..');
|
||||
const fixtureRoot = join(packageRoot, 'test/fixtures/relationship-benchmarks');
|
||||
|
||||
/**
 * Chooses the relationship detector for the benchmark run from environment
 * variables.
 *
 * - `KLO_BENCHMARK_LLM_BACKEND` unset, empty, or `none`: the current detector.
 * - `KLO_BENCHMARK_LLM_BACKEND=vertex`: an LLM-backed detector built from
 *   `KLO_BENCHMARK_VERTEX_PROJECT`, `KLO_BENCHMARK_VERTEX_LOCATION` and
 *   `KLO_BENCHMARK_LLM_MODEL` (default `claude-sonnet-4-6`). `@klo/llm` is only
 *   imported on this path.
 *
 * @returns {Promise<unknown>} The detector to pass to the benchmark suite.
 * @throws {Error} For any other backend value, or when the vertex project or
 *   location is missing.
 */
async function buildDetector() {
  const env = process.env;
  const backend = env.KLO_BENCHMARK_LLM_BACKEND;

  const llmDisabled = !backend || backend === 'none';
  if (llmDisabled) {
    return currentKloRelationshipBenchmarkDetector();
  }

  if (backend !== 'vertex') {
    throw new Error(`Unsupported KLO_BENCHMARK_LLM_BACKEND: ${backend}`);
  }

  const {
    KLO_BENCHMARK_VERTEX_PROJECT: project,
    KLO_BENCHMARK_VERTEX_LOCATION: location,
  } = env;
  const model = env.KLO_BENCHMARK_LLM_MODEL ?? 'claude-sonnet-4-6';

  if (!(project && location)) {
    throw new Error('KLO_BENCHMARK_VERTEX_PROJECT and KLO_BENCHMARK_VERTEX_LOCATION are required for vertex backend');
  }

  const { createKloLlmProvider } = await import('@klo/llm');
  const provider = createKloLlmProvider({
    backend: 'vertex',
    vertex: { project, location },
    modelSlots: { default: model },
  });

  return kloRelationshipBenchmarkDetectorWithLlm(provider);
}
|
||||
|
||||
// Script body (top-level await): load the benchmark fixtures from fixtureRoot,
// pick the detector from the environment, run the suite, build the report for
// every mode in KLO_RELATIONSHIP_BENCHMARK_MODES, and print it as Markdown on
// stdout. Nothing is written to disk here.
const fixtures = await loadKloRelationshipBenchmarkFixtures(fixtureRoot);
const detector = await buildDetector();
const suite = await runKloRelationshipBenchmarkSuite({
  fixtures,
  detector,
});
const report = buildKloRelationshipBenchmarkReport({
  fixtures,
  suite,
  modes: KLO_RELATIONSHIP_BENCHMARK_MODES,
});

process.stdout.write(formatKloRelationshipBenchmarkReportMarkdown(report));
|
||||
Loading…
Add table
Add a link
Reference in a new issue