From 74be832aea5469333f147750931f302859ebc0f8 Mon Sep 17 00:00:00 2001 From: Andrey Avtomonov Date: Sun, 17 May 2026 02:32:41 +0200 Subject: [PATCH] feat(cli): improve search ranking output (#123) --- .../content/docs/cli-reference/ktx-sl.mdx | 3 + .../content/docs/cli-reference/ktx-wiki.mdx | 54 +++++++++++++-- .../cli/src/commands/knowledge-commands.ts | 7 ++ packages/cli/src/index.test.ts | 16 +++++ packages/cli/src/io/print-list.test.ts | 24 +++---- packages/cli/src/io/print-list.ts | 12 +++- packages/cli/src/knowledge.test.ts | 44 +++++++++++++ packages/cli/src/knowledge.ts | 65 ++++++++++++++----- packages/cli/src/sl.test.ts | 18 +++++ packages/cli/src/sl.ts | 4 +- .../context/src/wiki/local-knowledge.test.ts | 50 ++++++++++++++ packages/context/src/wiki/local-knowledge.ts | 5 +- 12 files changed, 267 insertions(+), 35 deletions(-) diff --git a/docs-site/content/docs/cli-reference/ktx-sl.mdx b/docs-site/content/docs/cli-reference/ktx-sl.mdx index 3d7fd8d9..b0282a38 100644 --- a/docs-site/content/docs/cli-reference/ktx-sl.mdx +++ b/docs-site/content/docs/cli-reference/ktx-sl.mdx @@ -141,6 +141,9 @@ Semantic-layer list and search commands return human-readable output by default. Use `--json` on `list` or `search` when an agent needs structured output. Use `--format sql` on `query` to inspect generated SQL before execution, or leave `--format json` for the compiled query and optional rows. +Pretty `sl search` output shows `#1`, `#2`, and later rank badges for the +displayed results. Plain and JSON output keep the raw `score` value, which is a +ranking score rather than a percentage. ```json { diff --git a/docs-site/content/docs/cli-reference/ktx-wiki.mdx b/docs-site/content/docs/cli-reference/ktx-wiki.mdx index ad0e53bf..e1caabb5 100644 --- a/docs-site/content/docs/cli-reference/ktx-wiki.mdx +++ b/docs-site/content/docs/cli-reference/ktx-wiki.mdx @@ -43,6 +43,12 @@ need to add or update wiki knowledge. 
| `--output ` | Output mode: `pretty` (default in TTY), `plain` (TSV), or `json` | `pretty` | | `--json` | Shortcut for `--output=json` (overrides `--output`) | `false` | +`wiki search` uses hybrid search when `storage.search` is `sqlite-fts5`. KTX +combines lexical SQLite FTS5 matches, token matches, and semantic matches from +wiki page embeddings stored in `.ktx/db.sqlite`. If embeddings are not +configured or the embedding backend is unavailable, KTX skips the semantic lane +and keeps lexical and token results. + ## Examples ```bash @@ -60,14 +66,21 @@ ktx wiki search "monthly recurring revenue" --json --limit 10 # Print search results as TSV ktx wiki search "monthly recurring revenue" --output plain + +# Inspect which search lanes were used +ktx --debug wiki search "monthly recurring revenue" --json ``` ## Output Wiki commands print clack-style pretty output in a TTY and TSV-style plain output when requested. JSON output wraps the items with a command metadata -envelope. Open the matching Markdown files directly when you need the full page -contents. +envelope. Search results include `matchReasons` and `lanes` metadata so you can +see whether lexical, token, or semantic search contributed to the ranking. Open +the matching Markdown files directly when you need the full page contents. +Pretty search output shows `#1`, `#2`, and later rank badges for the displayed +results. Plain and JSON output keep the raw `score` value, which is a ranking +score rather than a percentage. ```json { @@ -77,16 +90,49 @@ contents. 
{ "key": "revenue-definitions", "summary": "Canonical revenue metric definitions", - "score": 0.92 + "score": 0.92, + "matchReasons": ["lexical", "semantic"], + "lanes": [ + { + "lane": "lexical", + "status": "available", + "requestedCandidatePoolLimit": 25, + "effectiveCandidatePoolLimit": 25, + "returnedCandidateCount": 3, + "weight": 1.5 + }, + { + "lane": "semantic", + "status": "available", + "requestedCandidatePoolLimit": 25, + "effectiveCandidatePoolLimit": 25, + "returnedCandidateCount": 8, + "weight": 3 + } + ] } ] + }, + "meta": { + "command": "wiki search" } } ``` +When you pass the global `--debug` flag, KTX writes search diagnostics to +stderr and leaves stdout unchanged. This is useful with `--json` because stdout +stays machine-readable: + +```text +[debug] wiki search mode=sqlite-fts5 embedding=configured results=2 +[debug] wiki search lane=lexical status=available returned=1 weight=1.5 +[debug] wiki search lane=token status=available returned=1 weight=0.75 +[debug] wiki search lane=semantic status=available returned=2 weight=3 +``` + ## Common errors | Error | Cause | Recovery | |-------|-------|----------| -| Search returns no results | The query terms do not match summaries, tags, or content | Retry with business synonyms, then create a page if the knowledge is missing | +| Search returns no results | The query terms do not match summaries, tags, or content, and the semantic lane is unavailable or has no positive matches | Run with `--debug` and check the `embedding=` value on the summary line (per-lane status lines are printed only when at least one result is returned), retry with business synonyms, then create a page if the knowledge is missing | | A page is missing | No Markdown file exists for that business context | Add a file under `wiki/` or run `ktx ingest ` | diff --git a/packages/cli/src/commands/knowledge-commands.ts b/packages/cli/src/commands/knowledge-commands.ts index d2a93228..1c61a836 100644 --- a/packages/cli/src/commands/knowledge-commands.ts +++ b/packages/cli/src/commands/knowledge-commands.ts @@ -1,5 +1,6 @@ import { type 
Command, Option } from '@commander-js/extra-typings'; import { + type CommandWithGlobalOptions, type KtxCliCommandContext, parsePositiveIntegerOption, resolveCommandProjectDir, @@ -14,6 +15,11 @@ async function runKnowledgeArgs(context: KtxCliCommandContext, args: KtxKnowledg context.setExitCode(await runner(args, context.io)); } +function isDebugEnabled(command: CommandWithGlobalOptions): boolean { + const options = (command.optsWithGlobals ? command.optsWithGlobals() : command.opts()) as { debug?: unknown }; + return options.debug === true; +} + export function registerWikiCommands(program: Command, context: KtxCliCommandContext): void { const wiki = program .command('wiki') @@ -83,6 +89,7 @@ export function registerWikiCommands(program: Command, context: KtxCliCommandCon userId: options.userId, output: options.output, json: options.json, + ...(isDebugEnabled(command) ? { debug: true } : {}), ...(options.limit !== undefined ? { limit: options.limit } : {}), }); }, diff --git a/packages/cli/src/index.test.ts b/packages/cli/src/index.test.ts index 1d317b03..0c433fb3 100644 --- a/packages/cli/src/index.test.ts +++ b/packages/cli/src/index.test.ts @@ -171,6 +171,22 @@ describe('runKtxCli', () => { }, searchIo.io, ); + + const debugSearchIo = makeIo(); + await expect( + runKtxCli(['--project-dir', tempDir, '--debug', 'wiki', 'search', 'revenue'], debugSearchIo.io, { knowledge }), + ).resolves.toBe(0); + expect(knowledge).toHaveBeenLastCalledWith( + { + command: 'search', + projectDir: tempDir, + query: 'revenue', + userId: 'local', + json: false, + debug: true, + }, + debugSearchIo.io, + ); }); it('rejects removed public wiki read and write commands', async () => { diff --git a/packages/cli/src/io/print-list.test.ts b/packages/cli/src/io/print-list.test.ts index cb6e7947..0cb5a537 100644 --- a/packages/cli/src/io/print-list.test.ts +++ b/packages/cli/src/io/print-list.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; import type { KtxCliIo } from 
'../cli-runtime.js'; -import { printList, type PrintListColumn } from './print-list.js'; +import { createRankBadgeFormatter, printList, type PrintListColumn } from './print-list.js'; import { SYMBOLS } from './symbols.js'; function recorder(): { io: KtxCliIo; out: () => string; err: () => string } { @@ -239,26 +239,26 @@ describe('printList — pretty mode', () => { expect(out).toContain('2 pages'); }); - it('renders a leading badge column with prettyFormat in pretty mode', () => { + it('renders a leading rank badge column in pretty mode', () => { const r = recorder(); interface SearchRow { score: number; scope: string; key: string; summary: string } + const rows: SearchRow[] = [ + { score: 0.87, scope: 'GLOBAL', key: 'alpha', summary: 'first' }, + { score: 0.04, scope: 'GLOBAL', key: 'beta', summary: 'second' }, + ]; const SEARCH_COLUMNS: ReadonlyArray> = [ { key: 'score', label: 'SCORE', plain: 'score=', role: 'badge', - prettyFormat: (v) => `${Math.round(Number(v) * 100)}%`, + prettyFormat: createRankBadgeFormatter(rows), dim: true, }, { key: 'scope', label: 'SCOPE', plain: '' }, { key: 'key', label: 'KEY', plain: '' }, { key: 'summary', label: 'SUMMARY', plain: '', optional: true, dim: true }, ]; - const rows: SearchRow[] = [ - { score: 0.87, scope: 'GLOBAL', key: 'alpha', summary: 'first' }, - { score: 0.04, scope: 'GLOBAL', key: 'beta', summary: 'second' }, - ]; printList({ rows, columns: SEARCH_COLUMNS, @@ -270,20 +270,22 @@ describe('printList — pretty mode', () => { io: r.io, }); const out = stripAnsi(r.out()); - expect(out).toMatch(/87%\s+alpha\s+/); - expect(out).toMatch(/4%\s+beta\s+/); + expect(out).toMatch(/#1\s+alpha\s+/); + expect(out).toMatch(/#2\s+beta\s+/); + expect(out).not.toContain('%'); }); it('emits the badge column in plain mode using its plain prefix', () => { const r = recorder(); interface SearchRow { score: number; scope: string; key: string; summary: string } + const rows: SearchRow[] = [{ score: 0.87, scope: 'GLOBAL', key: 'alpha', 
summary: 'first' }]; const SEARCH_COLUMNS: ReadonlyArray> = [ { key: 'score', label: 'SCORE', plain: 'score=', role: 'badge', - prettyFormat: (v) => `${Math.round(Number(v) * 100)}%`, + prettyFormat: createRankBadgeFormatter(rows), dim: true, }, { key: 'scope', label: 'SCOPE', plain: '' }, @@ -291,7 +293,7 @@ describe('printList — pretty mode', () => { { key: 'summary', label: 'SUMMARY', plain: '', optional: true, dim: true }, ]; printList({ - rows: [{ score: 0.87, scope: 'GLOBAL', key: 'alpha', summary: 'first' }], + rows, columns: SEARCH_COLUMNS, groupBy: 'scope', mode: 'plain', diff --git a/packages/cli/src/io/print-list.ts b/packages/cli/src/io/print-list.ts index 3d8d1fba..47703288 100644 --- a/packages/cli/src/io/print-list.ts +++ b/packages/cli/src/io/print-list.ts @@ -24,7 +24,7 @@ export interface PrintListColumn { * - `'suffix'` — trailing em-dash optional value. Default: any column with `optional: true`. */ role?: 'name' | 'metric' | 'badge' | 'suffix'; - /** Custom pretty-mode value formatter (e.g. score → "87%"). Plain/JSON unaffected. */ + /** Custom pretty-mode value formatter (for example, score -> "#1"). Plain/JSON unaffected. */ prettyFormat?: (value: Row[keyof Row & string], row: Row) => string; } @@ -67,6 +67,16 @@ export function printList(args: PrintListArgs): void { } } +export function createRankBadgeFormatter( + rows: ReadonlyArray, +): (_value: Row[keyof Row & string], row: Row) => string { + const ranks = new WeakMap(); + rows.forEach((row, index) => { + ranks.set(row, index + 1); + }); + return (_value, row) => `#${ranks.get(row) ?? 
rows.indexOf(row) + 1}`; +} + function isEmpty(value: unknown): boolean { return value === undefined || value === null || value === ''; } diff --git a/packages/cli/src/knowledge.test.ts b/packages/cli/src/knowledge.test.ts index 523d8a1b..63b952d0 100644 --- a/packages/cli/src/knowledge.test.ts +++ b/packages/cli/src/knowledge.test.ts @@ -1,6 +1,7 @@ import { mkdtemp, rm } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; +import { stripVTControlCharacters } from 'node:util'; import { initKtxProject, loadKtxProject } from '@ktx/context/project'; import type { KtxEmbeddingPort } from '@ktx/context'; import { writeLocalKnowledgePage } from '@ktx/context/wiki'; @@ -90,6 +91,24 @@ describe('runKtxKnowledge', () => { expect(searchIo.stdout()).toContain('metrics-revenue'); }); + it('prints wiki search rank badges in pretty output', async () => { + const projectDir = join(tempDir, 'rank-project'); + await initKtxProject({ projectDir }); + await seedWikiPage(projectDir); + + const searchIo = makeIo(); + await expect( + runKtxKnowledge( + { command: 'search', projectDir, query: 'paid order', userId: 'local', output: 'pretty' }, + searchIo.io, + ), + ).resolves.toBe(0); + + const stdout = stripVTControlCharacters(searchIo.stdout()); + expect(stdout).toMatch(/#1\s+metrics-revenue/); + expect(stdout).not.toContain('%'); + }); + it('prints wiki list and search as public JSON envelopes', async () => { const projectDir = join(tempDir, 'project'); await initKtxProject({ projectDir }); @@ -156,4 +175,29 @@ describe('runKtxKnowledge', () => { expect(searchIo.stdout()).toContain('active-contract-arr-open-tickets'); expect(searchIo.stderr()).toBe(''); }); + + it('writes wiki search lane diagnostics to stderr when debug is enabled', async () => { + const projectDir = join(tempDir, 'debug-project'); + await initKtxProject({ projectDir }); + await seedWikiPage(projectDir); + + const searchIo = makeIo(); + await expect( + runKtxKnowledge( + { 
command: 'search', projectDir, query: 'paid order', userId: 'local', json: true, debug: true }, + searchIo.io, + { embeddingService: new FakeEmbeddingPort() }, + ), + ).resolves.toBe(0); + + expect(JSON.parse(searchIo.stdout())).toMatchObject({ + kind: 'list', + data: { items: [expect.objectContaining({ key: 'metrics-revenue' })] }, + meta: { command: 'wiki search' }, + }); + expect(searchIo.stderr()).toContain('[debug] wiki search mode=sqlite-fts5'); + expect(searchIo.stderr()).toContain('embedding=configured'); + expect(searchIo.stderr()).toContain('lane=lexical status=available'); + expect(searchIo.stderr()).toContain('lane=semantic status=available'); + }); }); diff --git a/packages/cli/src/knowledge.ts b/packages/cli/src/knowledge.ts index d98df9e8..8213d05d 100644 --- a/packages/cli/src/knowledge.ts +++ b/packages/cli/src/knowledge.ts @@ -11,7 +11,7 @@ import { searchLocalKnowledgePages, } from '@ktx/context/wiki'; import { resolveOutputMode } from './io/mode.js'; -import { printList, type PrintListColumn } from './io/print-list.js'; +import { createRankBadgeFormatter, printList, type PrintListColumn } from './io/print-list.js'; export type KtxKnowledgeArgs = | { command: 'list'; projectDir: string; userId: string; output?: string; json?: boolean } @@ -23,6 +23,7 @@ export type KtxKnowledgeArgs = output?: string; json?: boolean; limit?: number; + debug?: boolean; }; type KtxKnowledgeIo = import('./cli-runtime.js').KtxCliIo; @@ -33,19 +34,23 @@ const WIKI_LIST_COLUMNS: ReadonlyArray> = { key: 'summary', label: 'SUMMARY', plain: '', optional: true, dim: true }, ]; -const WIKI_SEARCH_COLUMNS: ReadonlyArray> = [ - { - key: 'score', - label: 'SCORE', - plain: 'score=', - role: 'badge', - prettyFormat: (value) => `${Math.round(Number(value) * 100)}%`, - dim: true, - }, - { key: 'scope', label: 'SCOPE', plain: '' }, - { key: 'key', label: 'KEY', plain: '' }, - { key: 'summary', label: 'SUMMARY', plain: '', optional: true, dim: true }, -]; +function 
wikiSearchColumns( + rows: ReadonlyArray, +): ReadonlyArray> { + return [ + { + key: 'score', + label: 'SCORE', + plain: 'score=', + role: 'badge', + prettyFormat: createRankBadgeFormatter(rows), + dim: true, + }, + { key: 'scope', label: 'SCOPE', plain: '' }, + { key: 'key', label: 'KEY', plain: '' }, + { key: 'summary', label: 'SUMMARY', plain: '', optional: true, dim: true }, + ]; +} interface KtxKnowledgeDeps { embeddingService?: KtxEmbeddingPort | null; @@ -65,6 +70,26 @@ function wikiSearchEmbeddingService( return provider ? new KtxIngestEmbeddingPortAdapter(provider) : null; } +function writeWikiSearchDebug( + io: KtxKnowledgeIo, + input: { + mode: string; + embeddingConfigured: boolean; + results: LocalKnowledgeSearchResult[]; + }, +): void { + io.stderr.write( + `[debug] wiki search mode=${input.mode} embedding=${input.embeddingConfigured ? 'configured' : 'unconfigured'} results=${input.results.length}\n`, + ); + const lanes = input.results[0]?.lanes ?? []; + for (const lane of lanes) { + const reason = lane.reason ? 
` reason=${lane.reason}` : ''; + io.stderr.write( + `[debug] wiki search lane=${lane.lane} status=${lane.status} returned=${lane.returnedCandidateCount} weight=${lane.weight}${reason}\n`, + ); + } +} + export async function runKtxKnowledge( args: KtxKnowledgeArgs, io: KtxKnowledgeIo = process, @@ -89,12 +114,20 @@ export async function runKtxKnowledge( return 0; } if (args.command === 'search') { + const embeddingService = wikiSearchEmbeddingService(project, deps); const results = await searchLocalKnowledgePages(project, { query: args.query, userId: args.userId, - embeddingService: wikiSearchEmbeddingService(project, deps), + embeddingService, limit: args.limit, }); + if (args.debug) { + writeWikiSearchDebug(io, { + mode: project.config.storage.search, + embeddingConfigured: embeddingService !== null, + results, + }); + } const mode = resolveOutputMode({ explicit: args.output, json: args.json, io }); let emptyMessage = `No local wiki pages matched "${args.query}"`; let emptyHint = 'Run `ktx wiki list` to inspect available pages.'; @@ -107,7 +140,7 @@ export async function runKtxKnowledge( } printList({ rows: results, - columns: WIKI_SEARCH_COLUMNS, + columns: wikiSearchColumns(results), groupBy: 'scope', emptyMessage, emptyHint, diff --git a/packages/cli/src/sl.test.ts b/packages/cli/src/sl.test.ts index 16041a31..1bfef382 100644 --- a/packages/cli/src/sl.test.ts +++ b/packages/cli/src/sl.test.ts @@ -1,6 +1,7 @@ import { mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; +import { stripVTControlCharacters } from 'node:util'; import Database from 'better-sqlite3'; import { initKtxProject } from '@ktx/context/project'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; @@ -98,6 +99,23 @@ describe('runKtxSl', () => { }); }); + it('prints semantic-layer search rank badges in pretty output', async () => { + const projectDir = join(tempDir, 'rank-project'); + await seedSlSource({ 
projectDir }); + + const searchIo = makeIo(); + await expect( + runKtxSl( + { command: 'search', projectDir, connectionId: 'warehouse', query: 'order', output: 'pretty' }, + searchIo.io, + ), + ).resolves.toBe(0); + + const stdout = stripVTControlCharacters(searchIo.stdout()); + expect(stdout).toMatch(/#1\s+orders/); + expect(stdout).not.toContain('%'); + }); + it('prints semantic-layer list and search as public JSON envelopes', async () => { const projectDir = join(tempDir, 'project'); await seedSlSource({ diff --git a/packages/cli/src/sl.ts b/packages/cli/src/sl.ts index e7594feb..d77fa4f9 100644 --- a/packages/cli/src/sl.ts +++ b/packages/cli/src/sl.ts @@ -109,7 +109,7 @@ async function printSlSources(input: { emptyHint?: string; }): Promise { const { resolveOutputMode } = await import('./io/mode.js'); - const { printList } = await import('./io/print-list.js'); + const { createRankBadgeFormatter, printList } = await import('./io/print-list.js'); const mode = resolveOutputMode({ explicit: input.output, json: input.json, io: input.io }); if (input.command === 'sl search') { @@ -119,7 +119,7 @@ async function printSlSources(input: { label: 'SCORE', plain: 'score=', role: 'badge', - prettyFormat: (value) => `${Math.round(Number(value) * 100)}%`, + prettyFormat: createRankBadgeFormatter(input.rows as ReadonlyArray), dim: true, }, { key: 'connectionId', label: 'CONNECTION', plain: '' }, diff --git a/packages/context/src/wiki/local-knowledge.test.ts b/packages/context/src/wiki/local-knowledge.test.ts index 122e56b5..2a166fc5 100644 --- a/packages/context/src/wiki/local-knowledge.test.ts +++ b/packages/context/src/wiki/local-knowledge.test.ts @@ -22,6 +22,25 @@ class FakeEmbeddingPort { } } +class ArrSynonymEmbeddingPort { + readonly maxBatchSize = 16; + + async computeEmbedding(text: string): Promise { + const lower = text.toLowerCase(); + if (lower.trim() === 'annual recurring revenue' || lower.includes('arr') || lower.includes('contract-first')) { + return [1, 0]; + 
} + if (lower.includes('net revenue') || lower.includes('gross') || lower.includes('refund')) { + return [0, 1]; + } + return [0.5, 0.5]; + } + + async computeEmbeddingsBulk(texts: string[]): Promise { + return Promise.all(texts.map((text) => this.computeEmbedding(text))); + } +} + describe('local knowledge helpers', () => { let tempDir: string; let project: KtxLocalProject; @@ -131,6 +150,37 @@ describe('local knowledge helpers', () => { }); }); + it('ranks ARR synonym queries by semantic page embeddings over stronger lexical revenue matches', async () => { + await writeLocalKnowledgePage(project, { + key: 'arr-definition', + scope: 'GLOBAL', + summary: 'ARR is calculated contract-first for active customer contracts.', + content: 'Contract-first active contract value takes precedence over subscription values.', + tags: ['arr', 'contracts', 'finance'], + }); + await writeLocalKnowledgePage(project, { + key: 'net-revenue-definition', + scope: 'GLOBAL', + summary: 'Net revenue definition', + content: 'Annual revenue is gross invoice revenue minus credits and refunds.', + tags: ['revenue', 'finance'], + }); + + const search = await searchLocalKnowledgePages(project, { + query: 'annual recurring revenue', + userId: 'local', + limit: 2, + embeddingService: new ArrSynonymEmbeddingPort(), + }); + + expect(search.map((result) => result.key)).toEqual(['arr-definition', 'net-revenue-definition']); + expect(search[0]).toMatchObject({ + key: 'arr-definition', + matchReasons: expect.arrayContaining(['semantic']), + lanes: expect.arrayContaining([expect.objectContaining({ lane: 'semantic', status: 'available' })]), + }); + }); + it('reports semantic lane as skipped when wiki embeddings are not configured', async () => { await writeLocalKnowledgePage(project, { key: 'metrics-revenue', diff --git a/packages/context/src/wiki/local-knowledge.ts b/packages/context/src/wiki/local-knowledge.ts index f9b25fb1..b228cfd4 100644 --- a/packages/context/src/wiki/local-knowledge.ts +++ 
b/packages/context/src/wiki/local-knowledge.ts @@ -309,6 +309,7 @@ async function searchLocalKnowledgePagesWithSqlite( }, { lane: 'semantic', + weight: 3, async generate(args) { if (!embeddingService) { return { status: 'skipped', candidates: [], reason: 'embedding_unconfigured' }; @@ -320,7 +321,9 @@ async function searchLocalKnowledgePagesWithSqlite( limit: args.laneCandidatePoolLimit, }); return { - candidates: rows.map((row) => ({ id: row.id, rank: row.rank, rawScore: row.rawScore })), + candidates: rows + .filter((row) => row.rawScore > 0) + .map((row, index) => ({ id: row.id, rank: index + 1, rawScore: row.rawScore })), }; } catch (error) { return {