ktx/packages/context/src/wiki/sqlite-knowledge-index.ts
2026-05-10 23:12:26 +02:00

276 lines
7.8 KiB
TypeScript

import { mkdirSync } from 'node:fs';
import { dirname } from 'node:path';
import Database from 'better-sqlite3';
import { buildKnowledgeSearchText } from './knowledge-search-text.js';
import type { LocalKnowledgeScope } from './local-knowledge.js';
/** Construction options for {@link SqliteKnowledgeIndex}. */
export interface SqliteKnowledgeIndexOptions {
/** Path of the SQLite database file; its parent directory is created when missing. */
dbPath: string;
}
/** A knowledge page as handed to the index for storage. */
export interface SqliteKnowledgeIndexPage {
/** Identifier of the page; used as the primary key of its index row. */
path: string;
/** Page key; stored and also indexed in the full-text table. */
key: string;
/** Scope the page belongs to; stored alongside the page. */
scope: LocalKnowledgeScope;
/** Page summary; stored and also indexed in the full-text table. */
summary: string;
/** Page body; passed to the search-text builder together with key, summary and tags. */
content: string;
/** Page tags; joined with single spaces when written to the index. */
tags: string[];
/** Optional embedding vector; serialized as JSON when present, stored as NULL otherwise. */
embedding?: number[] | null;
}
/** One hit returned by {@link SqliteKnowledgeIndex.search}. */
export interface SqliteKnowledgeIndexSearchResult {
/** Path of the matching page. */
path: string;
/** Value derived from the raw full-text rank as `1 / (1 + |rank|)`, rounded to six decimals. */
score: number;
}
/** A ranked candidate produced by one retrieval lane (lexical or semantic). */
export interface WikiSqliteLaneCandidate {
/** Candidate identifier; the lanes in this file set it to the page path. */
id: string;
/** Path of the candidate page. */
path: string;
/** 1-based position of the candidate within its lane's result list. */
rank: number;
/** Lane-specific score: the bm25 value for lexical hits, cosine similarity for semantic hits. */
rawScore: number;
}
/** Snapshot of what the index currently stores for a page. */
export interface ExistingKnowledgeIndexPage {
/** Stored search text of the page. */
searchText: string;
/** Stored embedding, or null when none is stored or the stored JSON is not a numeric array. */
embedding: number[] | null;
}
/** Row shape returned by the full-text search query. */
interface SearchRow {
/** Path of the matching page. */
path: string;
/** Value of `bm25(knowledge_pages_fts)` for the match. */
rank: number;
}
/** Row shape read from `knowledge_pages` when scanning embeddings. */
type IndexedPageRow = {
/** Path of the stored page. */
path: string;
/** JSON-encoded embedding, or null when the page has none. */
embedding_json: string | null;
};
/**
 * Computes the cosine similarity of two vectors.
 *
 * @param left - First vector.
 * @param right - Second vector.
 * @returns The cosine of the angle between the vectors, or 0 when the vectors
 *   are empty, differ in length, or either one has zero magnitude.
 */
function cosineSimilarity(left: number[], right: number[]): number {
  const size = left.length;
  if (size === 0 || size !== right.length) {
    return 0;
  }

  let dotProduct = 0;
  let leftSquares = 0;
  let rightSquares = 0;
  let index = 0;
  while (index < size) {
    // Missing entries (e.g. holes in a sparse array) count as 0.
    const a = left[index] ?? 0;
    const b = right[index] ?? 0;
    dotProduct += a * b;
    leftSquares += a * a;
    rightSquares += b * b;
    index += 1;
  }

  // A zero-magnitude vector has no direction; avoid dividing by zero.
  const degenerate = leftSquares === 0 || rightSquares === 0;
  return degenerate ? 0 : dotProduct / (Math.sqrt(leftSquares) * Math.sqrt(rightSquares));
}
/**
 * Maps a raw rank value to a score of the form `1 / (1 + |rank|)`.
 *
 * @param rank - Raw rank; only its magnitude is used.
 * @returns The score rounded to six decimal places.
 */
function scoreFromRank(rank: number): number {
  const magnitude = Math.abs(rank);
  const reciprocal = 1 / (1 + magnitude);
  // Round via toFixed so the result carries at most six decimals.
  const rounded = reciprocal.toFixed(6);
  return Number(rounded);
}
/**
 * Decodes a JSON-encoded embedding.
 *
 * @param raw - JSON text, or null when nothing is stored.
 * @returns The decoded array when `raw` is a JSON array containing only
 *   numbers; null when `raw` is null or empty, is not valid JSON, or decodes
 *   to anything else.
 */
function parseEmbedding(raw: string | null): number[] | null {
  if (raw === null || raw === '') {
    return null;
  }

  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    // Malformed JSON is treated the same as a missing embedding.
    return null;
  }

  if (!Array.isArray(decoded)) {
    return null;
  }
  const allNumbers = decoded.every((entry) => typeof entry === 'number');
  return allNumbers ? (decoded as number[]) : null;
}
/**
 * Turns free-form user text into an FTS5 MATCH expression.
 *
 * The text is lower-cased and split into terms made of letters, combining
 * marks, digits and underscores; everything else acts as a separator.
 * Duplicate terms are dropped (first occurrence wins), each term is wrapped in
 * double quotes so it is read as a string rather than as query syntax, and the
 * terms are OR-ed together.
 *
 * Terms are recognized with Unicode character classes rather than ASCII only,
 * so non-English queries ("café", "日本語") survive intact instead of being
 * truncated or discarded.
 *
 * @param query - Raw query text.
 * @returns The MATCH expression, or an empty string when `query` contains no
 *   usable term.
 */
function normalizeFtsQuery(query: string): string {
  const terms = query
    .toLowerCase()
    .split(/[^\p{L}\p{M}\p{N}_]+/u)
    .filter((term) => term.length > 0);

  return [...new Set(terms)]
    // Terms cannot contain a quote after the split; doubling is kept as a
    // safeguard so the output stays a valid FTS5 string if the split changes.
    .map((term) => `"${term.replace(/"/g, '""')}"`)
    .join(' OR ');
}
/**
 * SQLite-backed search index over knowledge pages.
 *
 * Page rows live in `knowledge_pages`; a parallel FTS5 table
 * (`knowledge_pages_fts`) answers lexical queries. Embeddings are stored as
 * JSON text and compared in process for semantic queries.
 */
export class SqliteKnowledgeIndex {
  private readonly db: Database.Database;

  /**
   * Opens the database at `options.dbPath` (creating its parent directory when
   * missing) and makes sure the tables and columns used by this class exist.
   *
   * @param options - Location of the SQLite database file.
   */
  constructor(options: SqliteKnowledgeIndexOptions) {
    mkdirSync(dirname(options.dbPath), { recursive: true });
    this.db = new Database(options.dbPath);
    this.db.pragma('journal_mode = WAL');
    this.db.pragma('foreign_keys = ON');
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS knowledge_pages (
        path TEXT PRIMARY KEY,
        key TEXT NOT NULL,
        scope TEXT NOT NULL,
        summary TEXT NOT NULL,
        content TEXT NOT NULL,
        tags TEXT NOT NULL,
        search_text TEXT NOT NULL,
        embedding_json TEXT
      );
      CREATE VIRTUAL TABLE IF NOT EXISTS knowledge_pages_fts USING fts5(
        path UNINDEXED,
        key,
        summary,
        content,
        tags
      );
    `);

    // CREATE TABLE IF NOT EXISTS leaves an already-existing table untouched,
    // so add any column such a table is still missing.
    const columns = this.db.prepare('PRAGMA table_info(knowledge_pages)').all() as Array<{ name: string }>;
    const columnNames = new Set(columns.map((column) => column.name));
    if (!columnNames.has('search_text')) {
      this.db.exec("ALTER TABLE knowledge_pages ADD COLUMN search_text TEXT NOT NULL DEFAULT ''");
    }
    if (!columnNames.has('embedding_json')) {
      this.db.exec('ALTER TABLE knowledge_pages ADD COLUMN embedding_json TEXT');
    }
  }

  /**
   * Makes the index mirror `pages`.
   *
   * Rows whose path is not present in `pages` are removed from both tables,
   * every given page is inserted or updated, and its full-text row is
   * replaced. All changes happen inside a single transaction.
   *
   * @param pages - The complete set of pages the index should contain; an
   *   empty array clears the index.
   */
  sync(pages: SqliteKnowledgeIndexPage[]): void {
    // The paths to keep travel as ONE JSON parameter expanded by json_each,
    // not as one bound parameter per path. SQLite caps the number of bound
    // parameters per statement, so a placeholder list fails for large page
    // sets. An empty array yields an empty set, and `NOT IN ()` then matches
    // (and deletes) every row.
    const clearPages = this.db.prepare(
      'DELETE FROM knowledge_pages WHERE path NOT IN (SELECT value FROM json_each(?))',
    );
    const clearFts = this.db.prepare(
      'DELETE FROM knowledge_pages_fts WHERE path NOT IN (SELECT value FROM json_each(?))',
    );
    const upsertPage = this.db.prepare(`
      INSERT INTO knowledge_pages (path, key, scope, summary, content, tags, search_text, embedding_json)
      VALUES (@path, @key, @scope, @summary, @content, @tags, @searchText, @embeddingJson)
      ON CONFLICT(path) DO UPDATE SET
        key = excluded.key,
        scope = excluded.scope,
        summary = excluded.summary,
        content = excluded.content,
        tags = excluded.tags,
        search_text = excluded.search_text,
        embedding_json = excluded.embedding_json
    `);
    const deleteFts = this.db.prepare('DELETE FROM knowledge_pages_fts WHERE path = @path');
    const insertFts = this.db.prepare(`
      INSERT INTO knowledge_pages_fts (path, key, summary, content, tags)
      VALUES (@path, @key, @summary, @content, @tags)
    `);

    const transaction = this.db.transaction((items: SqliteKnowledgeIndexPage[]) => {
      const keepPathsJson = JSON.stringify(items.map((page) => page.path));
      clearPages.run(keepPathsJson);
      clearFts.run(keepPathsJson);

      for (const page of items) {
        const searchText = buildKnowledgeSearchText(page.key, page.summary, page.content, page.tags);
        const row = {
          path: page.path,
          key: page.key,
          scope: page.scope,
          summary: page.summary,
          // NOTE(review): the `content` columns receive the built search text,
          // not `page.content` — presumably so the full-text index covers the
          // expanded text; confirm this is intended for `knowledge_pages` too.
          content: searchText,
          tags: page.tags.join(' '),
          searchText,
          embeddingJson: page.embedding ? JSON.stringify(page.embedding) : null,
        };
        upsertPage.run(row);
        // The FTS table has no uniqueness constraint on path, so replace the
        // row by deleting and re-inserting it.
        deleteFts.run(row);
        insertFts.run(row);
      }
    });
    transaction(pages);
  }

  /**
   * Alias of {@link SqliteKnowledgeIndex.sync}.
   *
   * @param pages - The complete set of pages the index should contain.
   */
  rebuild(pages: SqliteKnowledgeIndexPage[]): void {
    this.sync(pages);
  }

  /**
   * Reads the stored search text and embedding of every indexed page.
   *
   * @returns A map from page path to its stored data, populated in ascending
   *   path order.
   */
  getExistingPages(): Map<string, ExistingKnowledgeIndexPage> {
    const rows = this.db
      .prepare(
        `
          SELECT path, search_text, embedding_json
          FROM knowledge_pages
          ORDER BY path ASC
        `,
      )
      .all() as Array<{ path: string; search_text: string; embedding_json: string | null }>;

    const existing = new Map<string, ExistingKnowledgeIndexPage>();
    for (const row of rows) {
      existing.set(row.path, {
        searchText: row.search_text,
        embedding: parseEmbedding(row.embedding_json),
      });
    }
    return existing;
  }

  /**
   * Runs a full-text query and returns the best matches.
   *
   * @param input.queryText - Free-form query; normalized into an OR of quoted terms.
   * @param input.limit - Maximum number of candidates; values below 1 are raised to 1.
   * @returns Candidates ordered by ascending bm25 value (ties broken by path),
   *   with a 1-based `rank` and the bm25 value as `rawScore`. Empty when the
   *   query contains no usable term.
   */
  searchLexicalCandidates(input: { queryText: string; limit: number }): WikiSqliteLaneCandidate[] {
    const ftsQuery = normalizeFtsQuery(input.queryText);
    if (!ftsQuery) {
      return [];
    }
    const rows = this.db
      .prepare(
        `
          SELECT path, bm25(knowledge_pages_fts) AS rank
          FROM knowledge_pages_fts
          WHERE knowledge_pages_fts MATCH ?
          ORDER BY rank ASC, path ASC
          LIMIT ?
        `,
      )
      .all(ftsQuery, Math.max(1, input.limit)) as SearchRow[];

    return rows.map((row, index) => ({
      id: row.path,
      path: row.path,
      rank: index + 1,
      rawScore: Number(row.rank),
    }));
  }

  /**
   * Ranks every page that has a stored embedding by cosine similarity to the
   * query embedding.
   *
   * All stored embeddings are loaded and scored in process.
   *
   * @param input.queryEmbedding - Embedding of the query.
   * @param input.limit - Maximum number of candidates; values below 1 are raised to 1.
   * @returns Candidates ordered by descending similarity (ties broken by
   *   path), with a 1-based `rank` and the similarity as `rawScore`. Pages
   *   without a usable embedding are skipped.
   */
  searchSemanticCandidates(input: { queryEmbedding: number[]; limit: number }): WikiSqliteLaneCandidate[] {
    const rows = this.db
      .prepare(
        `
          SELECT path, embedding_json
          FROM knowledge_pages
          ORDER BY path ASC
        `,
      )
      .all() as IndexedPageRow[];

    const scored: WikiSqliteLaneCandidate[] = [];
    for (const row of rows) {
      // parseEmbedding returns null for NULL, empty, or malformed values.
      const embedding = parseEmbedding(row.embedding_json);
      if (!embedding) {
        continue;
      }
      scored.push({
        id: row.path,
        path: row.path,
        rank: 0, // placeholder; the real rank is assigned after sorting
        rawScore: cosineSimilarity(input.queryEmbedding, embedding),
      });
    }

    return scored
      .sort((left, right) => right.rawScore - left.rawScore || left.path.localeCompare(right.path))
      .slice(0, Math.max(1, input.limit))
      .map((candidate, index) => ({ ...candidate, rank: index + 1 }));
  }

  /**
   * Full-text search returning paths with a derived score.
   *
   * NOTE(review): the score is `1 / (1 + |bm25|)`. FTS5 documents bm25 values
   * as numerically lower for better matches, so a stronger match appears to
   * get a smaller score here — confirm callers rely on result order rather
   * than on score magnitude.
   *
   * @param query - Free-form query text.
   * @param limit - Maximum number of results; values below 1 are raised to 1.
   * @returns Matches in the order produced by
   *   {@link SqliteKnowledgeIndex.searchLexicalCandidates}.
   */
  search(query: string, limit: number): SqliteKnowledgeIndexSearchResult[] {
    return this.searchLexicalCandidates({ queryText: query, limit }).map((row) => ({
      path: row.path,
      score: scoreFromRank(row.rawScore),
    }));
  }
}