feat(context): add warehouse verification tools (#46)

* feat(context): add warehouse dialect dispatch

* feat(context): read warehouse scan catalog

* feat(context): add entity details verification tool

* feat(context): add ingest SQL verification tool

* feat(context): add raw warehouse discovery tool

* feat(context): expose warehouse verification tools to ingest

* docs(context): add ingest identifier verification protocol

* test(context): guard ingest identifier verification prompts

* chore(context): verify warehouse verification tools

* docs: add warehouse verification tools plan and spec

* fix(context): expose target warehouses to Notion ingest

* fix(context): update ingest prompts for warehouse verification tools

* fix(context): scope raw schema discovery to allowed connections

* fix(context): verify warehouse column display targets

* docs: add notion warehouse verification gap closure plan

* fix(context): include raw discovery connection names

* fix(context): expose warehouse targets for LookML and MetricFlow

* fix(context): pass connection config to ingest query executors

* fix(cli): enable read-only SQL probes for local ingest

* docs: add warehouse verification final v1 closure plan

* fix(context): align warehouse sql probe prompt shape

* docs: add warehouse verification prompt shape closure plan

* test(context): catch connectionless sql execution prompt examples

* fix(context): include connection name in sl capture sql example

* docs: add warehouse verification sql example closure plan

* fix(context): report structured entity detail misses

* docs: add warehouse verification structured target miss closure plan

* fix: report untracked squash merge conflicts

* feat: require ingest verification ledger

* fix: stabilize ingest wiki references
This commit is contained in:
Andrey Avtomonov 2026-05-13 13:43:23 +02:00 committed by GitHub
parent bcb0d2f8f7
commit c22248dabf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
89 changed files with 7818 additions and 191 deletions

View file

@ -0,0 +1,30 @@
import { describe, expect, it } from 'vitest';
import { getDialectForDriver } from './dialects.js';
describe('getDialectForDriver', () => {
  // Drivers exercised with a three-part (catalog.schema.table) name.
  const driversWithCatalog: readonly string[] = ['snowflake', 'bigquery', 'sqlserver'];

  it.each([
    ['postgres', '"public"."orders"'],
    ['postgresql', '"public"."orders"'],
    ['mysql', '`public`.`orders`'],
    ['clickhouse', '`public`.`orders`'],
    ['sqlite', '"orders"'],
    ['snowflake', '"analytics"."public"."orders"'],
    ['bigquery', '`analytics`.`public`.`orders`'],
    ['sqlserver', '[analytics].[public].[orders]'],
  ] as const)('formats table names for %s', (driver, expected) => {
    // Build the table reference first so the assertion reads as a single call.
    const table = {
      catalog: driversWithCatalog.includes(driver) ? 'analytics' : null,
      db: driver === 'sqlite' ? null : 'public',
      name: 'orders',
    };

    const formatted = getDialectForDriver(driver).formatTableName(table);

    expect(formatted).toBe(expected);
  });

  it('throws with a supported-driver list for unknown drivers', () => {
    const lookupOracle = (): unknown => getDialectForDriver('oracle');

    expect(lookupOracle).toThrow(
      'Unsupported warehouse driver "oracle". Supported drivers: bigquery, clickhouse, mysql, postgres, postgresql, sqlite, sqlite3, snowflake, sqlserver',
    );
  });
});

View file

@ -0,0 +1,102 @@
import type { KtxSchemaDimensionType, KtxTableRef } from '../scan/types.js';
/**
 * Driver names for which a dialect is registered in this module.
 * `getDialectForDriver` accepts exactly these names (after lower-casing and trimming).
 */
export type SupportedDriver =
| 'postgres'
| 'postgresql'
| 'mysql'
| 'sqlserver'
| 'snowflake'
| 'bigquery'
| 'clickhouse'
| 'sqlite'
| 'sqlite3';
/** Per-driver SQL formatting helpers returned by `getDialectForDriver`. */
export interface KtxDialect {
/** Driver name this dialect was registered under. */
readonly type: SupportedDriver;
/** Quotes a single identifier using the driver's quoting characters. */
quoteIdentifier(identifier: string): string;
/** Renders a table reference as a dot-separated list of quoted identifiers. */
formatTableName(table: KtxTableRef): string;
/** Maps a native column type name to a schema dimension type. */
mapToDimensionType(nativeType: string): KtxSchemaDimensionType;
}
// Driver names printed, in this order, by the "Unsupported warehouse driver" error
// thrown from getDialectForDriver. Keep the order stable: the message is asserted in tests.
const supportedDrivers: SupportedDriver[] = [
'bigquery',
'clickhouse',
'mysql',
'postgres',
'postgresql',
'sqlite',
'sqlite3',
'snowflake',
'sqlserver',
];
/** Wraps an identifier in double quotes, doubling every embedded double quote. */
function doubleQuoted(identifier: string): string {
  const escaped = identifier.split('"').join('""');
  return '"' + escaped + '"';
}
/** Wraps an identifier in backticks, doubling every embedded backtick. */
function backtickQuoted(identifier: string): string {
  const escaped = identifier.split('`').join('``');
  return '`' + escaped + '`';
}
/**
 * Wraps an identifier in backticks using backslash escapes, as BigQuery quoted
 * identifiers do.
 *
 * Both the backtick and the backslash are escaped. Escaping only the backtick
 * would let an identifier that ends in `\` escape the closing backtick and
 * break out of the quoted name.
 */
function bigQueryQuoted(identifier: string): string {
  // `$&` re-inserts the matched character, so each `\` or backtick gains a leading backslash.
  const escaped = identifier.replace(/[\\`]/g, '\\$&');
  return `\`${escaped}\``;
}
/** Wraps an identifier in square brackets, doubling every embedded closing bracket. */
function bracketQuoted(identifier: string): string {
  const escaped = identifier.split(']').join(']]');
  return '[' + escaped + ']';
}
/**
 * Maps a native warehouse column type name to a schema dimension type.
 *
 * Matching is case-insensitive and substring-based, checked in this order:
 * time-like, known non-numeric look-alikes, numeric, boolean, then `string`
 * as the fallback.
 *
 * @param nativeType Native type name as reported by the warehouse (e.g. `BIGINT`, `Nullable(Int32)`).
 * @returns `'time'`, `'number'`, `'boolean'` or `'string'`.
 */
function inferDimensionType(nativeType: string): KtxSchemaDimensionType {
  const normalized = nativeType.toLowerCase().trim();

  if (normalized.includes('date') || normalized.includes('time')) {
    return 'time';
  }

  // These type names contain a numeric marker by accident ("enum" has "num",
  // "interval" and "point" have "int") but do not hold plain numbers, so they
  // must be ruled out before the substring checks below.
  if (/\b(?:enum|interval|(?:multi)?point)/.test(normalized)) {
    return 'string';
  }

  const numericMarkers = ['int', 'num', 'dec', 'float', 'double', 'real'];
  if (numericMarkers.some((marker) => normalized.includes(marker))) {
    return 'number';
  }

  // `bit` must match exactly so that longer names are not swept up by a substring check.
  if (normalized.includes('bool') || normalized === 'bit') {
    return 'boolean';
  }

  return 'string';
}
/**
 * Joins the quoted segments of a table reference with dots.
 *
 * When `sqlite` is true only the table name is used; otherwise catalog, schema
 * and name are used in that order, dropping any segment that is empty or missing.
 */
function formatWithParts(table: KtxTableRef, quote: (identifier: string) => string, sqlite = false): string {
  if (sqlite) {
    return [table.name].map(quote).join('.');
  }

  const segments: string[] = [];
  for (const candidate of [table.catalog, table.db, table.name]) {
    if (candidate) {
      segments.push(candidate);
    }
  }
  return segments.map(quote).join('.');
}
/**
 * Builds a dialect for `type` around a single identifier-quoting function.
 *
 * `sqlite` is forwarded to `formatWithParts`, which then formats table names
 * from the table name alone.
 */
function createDialect(type: SupportedDriver, quote: (identifier: string) => string, sqlite = false): KtxDialect {
  const formatTableName = (table: KtxTableRef): string => formatWithParts(table, quote, sqlite);

  const dialect: KtxDialect = {
    type,
    quoteIdentifier: quote,
    formatTableName,
    mapToDimensionType: inferDimensionType,
  };
  return dialect;
}
// Registry of one dialect per supported driver name, consulted by getDialectForDriver.
// The third argument (true) marks the SQLite variants, whose table names are
// formatted from the table name only.
const dialects: Record<SupportedDriver, KtxDialect> = {
postgres: createDialect('postgres', doubleQuoted),
postgresql: createDialect('postgresql', doubleQuoted),
mysql: createDialect('mysql', backtickQuoted),
clickhouse: createDialect('clickhouse', backtickQuoted),
sqlite: createDialect('sqlite', doubleQuoted, true),
sqlite3: createDialect('sqlite3', doubleQuoted, true),
snowflake: createDialect('snowflake', doubleQuoted),
bigquery: createDialect('bigquery', bigQueryQuoted),
sqlserver: createDialect('sqlserver', bracketQuoted),
};
/**
 * Looks up the SQL dialect registered for a warehouse driver name.
 *
 * The name is lower-cased and trimmed before lookup.
 *
 * @param driver Driver name, e.g. `postgres` or `BigQuery`.
 * @returns The dialect registered for that driver.
 * @throws Error listing the supported drivers when the name is not registered.
 */
export function getDialectForDriver(driver: string): KtxDialect {
  const normalized = driver.toLowerCase().trim();

  // Own-property check on purpose: the `in` operator also matches keys inherited
  // from Object.prototype ("constructor", "tostring" does not, but "valueOf"-style
  // names can), which would return a non-dialect value instead of throwing.
  if (Object.prototype.hasOwnProperty.call(dialects, normalized)) {
    return dialects[normalized as SupportedDriver];
  }

  throw new Error(`Unsupported warehouse driver "${driver}". Supported drivers: ${supportedDrivers.join(', ')}`);
}

View file

@ -3,7 +3,9 @@ export type {
KtxSqlQueryExecutionResult,
KtxSqlQueryExecutorPort,
} from './query-executor.js';
export type { KtxDialect, SupportedDriver } from './dialects.js';
export { createDefaultLocalQueryExecutor, type DefaultLocalQueryExecutorOptions } from './local-query-executor.js';
export { getDialectForDriver } from './dialects.js';
export { normalizeQueryRows } from './query-executor.js';
export { createPostgresQueryExecutor } from './postgres-query-executor.js';
export { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';

View file

@ -379,5 +379,37 @@ describe('GitService', () => {
await service.removeWorktree(wtDir).catch(() => undefined);
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
// Scenario: the session branch adds knowledge.md while an untracked knowledge.md
// already exists under tempDir. The squash merge is expected to fail and report
// that path as a conflict, leaving the untracked file in place.
it('reports untracked files that would be overwritten by the squash merge', async () => {
const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
// The worktree directory is created as a sibling of tempDir.
const parent = await realpath(join(tempDir, '..'));
const wtDir = join(parent, `wt-${Date.now()}-untracked`);
await service.addWorktree(wtDir, 'session/untracked', baseSha);
const scoped = service.forWorktree(wtDir);
// Commit knowledge.md on the session branch...
await writeFile(join(wtDir, 'knowledge.md'), 'session version\n', 'utf-8');
await scoped.commitFile('knowledge.md', 'session write', 'System User', 'system@example.com');
// ...and create a file with the same name, uncommitted, under tempDir.
await writeFile(join(tempDir, 'knowledge.md'), 'untracked local version\n', 'utf-8');
const result = await service.squashMergeIntoMain(
'session/untracked',
'System User',
'system@example.com',
'Memory capture: 1 file [chat=untracked]',
);
expect(result.ok).toBe(false);
// Narrows the result union to the failure variant for the assertions below.
if (result.ok) {
throw new Error('unreachable');
}
expect(result.conflict).toBe(true);
expect(result.conflictPaths).toEqual(['knowledge.md']);
// Reaches into the service's `git` member to confirm the file is still reported as untracked.
const status = await (service as unknown as { git: import('simple-git').SimpleGit }).git.status();
expect(status.not_added).toContain('knowledge.md');
// Best-effort cleanup; failures are ignored.
// NOTE(review): this cleanup is skipped if an assertion above throws.
await service.removeWorktree(wtDir).catch(() => undefined);
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
});
});

View file

@ -31,6 +31,40 @@ export type SquashMergeResult =
| { ok: true; squashSha: string; touchedPaths: string[] }
| { ok: false; conflict: true; conflictPaths: string[] };
/** Returns `error.message` for `Error` instances and `String(error)` for anything else. */
function mergeErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}
/**
 * Extracts the file paths listed in a "untracked working tree files would be
 * overwritten by merge" error message.
 *
 * The paths are the non-empty lines between the opening marker and the
 * "Please move or remove them" line (or the end of the message when that line
 * is absent). Lines are trimmed, a bare `Aborting` line is ignored, and one pair
 * of surrounding double quotes is stripped.
 *
 * @returns The listed paths in message order, or an empty array when the marker is not present.
 */
function extractUntrackedOverwritePaths(message: string): string[] {
  const startMarker = 'The following untracked working tree files would be overwritten by merge:';
  const endMarker = 'Please move or remove them before you merge.';

  const start = message.indexOf(startMarker);
  if (start < 0) {
    return [];
  }

  let listing = message.slice(start + startMarker.length);
  const end = listing.indexOf(endMarker);
  if (end >= 0) {
    listing = listing.slice(0, end);
  }

  const paths: string[] = [];
  for (const rawLine of listing.split('\n')) {
    const line = rawLine.trim();
    if (line.length === 0 || line === 'Aborting') {
      continue;
    }
    paths.push(line.replace(/^"(.+)"$/, '$1'));
  }
  return paths;
}
/**
 * Combines unmerged paths with any untracked-overwrite paths parsed from the
 * merge error, without duplicates.
 *
 * Unmerged paths come first; paths taken from the error message follow in the
 * order they appear. A `null` merge error contributes nothing.
 */
function mergeConflictPaths(unmergedPaths: string[], mergeError: unknown): string[] {
  if (mergeError === null) {
    return Array.from(new Set(unmergedPaths));
  }

  const overwritten = extractUntrackedOverwritePaths(mergeErrorMessage(mergeError));
  return Array.from(new Set([...unmergedPaths, ...overwritten]));
}
export class GitService {
private static readonly mutationQueues = new Map<string, Promise<void>>();
@ -639,10 +673,11 @@ export class GitService {
}
const unmergedOut = await this.git.raw(['diff', '--name-only', '--diff-filter=U']).catch(() => '');
const conflictPaths = unmergedOut
const unmergedPaths = unmergedOut
.split('\n')
.map((l) => l.trim())
.filter(Boolean);
const conflictPaths = mergeConflictPaths(unmergedPaths, mergeError);
if (conflictPaths.length > 0 || mergeError !== null) {
// `merge --abort` only works for an in-progress merge; squash sets MERGE_MSG but not
@ -651,7 +686,7 @@ export class GitService {
await this.git.raw(['reset', '--hard', 'HEAD']).catch(() => undefined);
this.logger.warn(
`squashMergeIntoMain: conflict merging ${branch} — aborted. conflictPaths=${conflictPaths.join(',')}` +
(mergeError ? ` error=${mergeError instanceof Error ? mergeError.message : String(mergeError)}` : ''),
(mergeError ? ` error=${mergeErrorMessage(mergeError)}` : ''),
);
return { ok: false, conflict: true, conflictPaths };
}

View file

@ -277,7 +277,7 @@ describe('historic-SQL local ingest retrieval acceptance', () => {
await expect(readFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves
.toContain('Analysts repeatedly inspect paid order lifecycle by customer segment.');
await expect(readFile(join(project.projectDir, 'knowledge/global/historic-sql/paid-order-lifecycle.md'), 'utf-8'))
await expect(readFile(join(project.projectDir, 'knowledge/global/historic-sql-paid-order-lifecycle.md'), 'utf-8'))
.resolves.toContain('Paid Order Lifecycle');
const reloaded = await loadKtxProject({ projectDir: project.projectDir });
@ -295,7 +295,7 @@ describe('historic-SQL local ingest retrieval acceptance', () => {
searchLocalKnowledgePages(reloaded, { query: 'paid order lifecycle', userId: 'local', limit: 5 }),
).resolves.toEqual([
expect.objectContaining({
key: 'historic-sql/paid-order-lifecycle',
key: 'historic-sql-paid-order-lifecycle',
summary: 'Paid Order Lifecycle',
matchReasons: expect.arrayContaining(['lexical']),
}),

View file

@ -10,7 +10,7 @@ async function commitProjectionChanges(workdir: string): Promise<void> {
const status = await git.status();
const paths = status.files
.map((file) => file.path)
.filter((path) => path.startsWith('semantic-layer/') || path.startsWith('knowledge/global/historic-sql/'));
.filter((path) => path.startsWith('semantic-layer/') || path.startsWith('knowledge/global/historic-sql'));
if (paths.length === 0) {
return;
}

View file

@ -106,7 +106,7 @@ describe('projectHistoricSqlEvidence', () => {
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
await writeText(
workdir,
'knowledge/global/historic-sql/old-order-lifecycle.md',
'knowledge/global/historic-sql-old-order-lifecycle.md',
[
'---',
YAML.stringify({
@ -127,7 +127,7 @@ describe('projectHistoricSqlEvidence', () => {
);
await writeText(
workdir,
'knowledge/global/historic-sql/retired-pattern.md',
'knowledge/global/historic-sql-retired-pattern.md',
[
'---',
YAML.stringify({
@ -164,15 +164,15 @@ describe('projectHistoricSqlEvidence', () => {
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/old-order-lifecycle.md'), 'utf-8')).resolves.toContain(
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-old-order-lifecycle.md'), 'utf-8')).resolves.toContain(
'Order Lifecycle Analysis',
);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/retired-pattern.md'), 'utf-8')).resolves.toContain(
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-retired-pattern.md'), 'utf-8')).resolves.toContain(
'stale_since: "2026-05-11T00:00:00.000Z"',
);
});
it('writes a reappearing pattern to the active slug instead of reusing an archived page key', async () => {
it('rewrites a reappearing archived pattern at the flat slug', async () => {
const workdir = await tempWorkdir();
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
@ -192,7 +192,7 @@ describe('projectHistoricSqlEvidence', () => {
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
await writeText(
workdir,
'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md',
'knowledge/global/historic-sql-order-lifecycle-analysis.md',
[
'---',
YAML.stringify({
@ -230,15 +230,10 @@ describe('projectHistoricSqlEvidence', () => {
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
'Order Lifecycle Analysis',
);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
'Archived body',
);
await expect(
readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/order-lifecycle-analysis.md'), 'utf-8'),
).rejects.toMatchObject({ code: 'ENOENT' });
const page = await readFile(join(workdir, 'knowledge/global/historic-sql-order-lifecycle-analysis.md'), 'utf-8');
expect(page).toContain('Analysts compare order status with customer segment again.');
expect(page).not.toContain('Archived body');
expect(page).not.toContain('archived');
});
it('leaves already archived pattern pages stable when they are still absent', async () => {
@ -259,7 +254,7 @@ describe('projectHistoricSqlEvidence', () => {
});
await writeText(
workdir,
'knowledge/global/historic-sql/_archived/retired-pattern.md',
'knowledge/global/historic-sql-retired-pattern.md',
[
'---',
YAML.stringify({
@ -284,12 +279,9 @@ describe('projectHistoricSqlEvidence', () => {
expect(result.archivedPatternPages).toBe(0);
expect(result.stalePatternPagesMarked).toBe(0);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/retired-pattern.md'), 'utf-8')).resolves.toContain(
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-retired-pattern.md'), 'utf-8')).resolves.toContain(
'Archived retired body',
);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/retired-pattern.md'), 'utf-8')).rejects.toMatchObject({
code: 'ENOENT',
});
});
it('marks missing table usage stale and deletes legacy historic SQL query pages', async () => {
@ -330,7 +322,7 @@ describe('projectHistoricSqlEvidence', () => {
});
await writeText(
workdir,
'knowledge/global/historic-sql/legacy-template.md',
'knowledge/global/historic-sql-legacy-template.md',
[
'---',
YAML.stringify({
@ -365,7 +357,7 @@ describe('projectHistoricSqlEvidence', () => {
commonJoins: [],
staleSince: '2026-05-11T00:00:00.000Z',
});
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/legacy-template.md'), 'utf-8')).rejects.toMatchObject({
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-legacy-template.md'), 'utf-8')).rejects.toMatchObject({
code: 'ENOENT',
});
});

View file

@ -37,7 +37,7 @@ interface HistoricSqlPatternPage {
}
/**
 * Normalizes a value into a flat knowledge-page slug.
 *
 * The value is lower-cased, every run of characters other than `a-z`, `0-9`,
 * `_` and `-` becomes a single `-`, and leading/trailing dashes are removed.
 * `/` is deliberately not allowed through, so the result never contains a path
 * separator.
 */
function safeKnowledgeSlug(value: string): string {
  // Only one return remains: the earlier slash-permitting variant ran first and
  // left this flat-slug normalization unreachable.
  return value.toLowerCase().replace(/[^a-z0-9_-]+/g, '-').replace(/^-+|-+$/g, '');
}
async function pathExists(path: string): Promise<boolean> {
@ -159,7 +159,7 @@ function isLegacyQueryPage(page: HistoricSqlPatternPage): boolean {
/**
 * Reports whether a pattern page is archived, i.e. its frontmatter `tags`
 * array contains `'archived'`. A missing or non-array `tags` value counts as
 * no tags.
 */
function isArchivedPatternPage(page: HistoricSqlPatternPage): boolean {
  const tags = Array.isArray(page.frontmatter.tags) ? page.frontmatter.tags : [];
  // Only one return remains: the earlier variant that also matched an
  // `_archived/` key prefix ran first and left this tag-only check unreachable.
  return tags.includes('archived');
}
function stringArray(value: unknown): string[] {
@ -191,6 +191,9 @@ async function loadPatternPages(root: string): Promise<HistoricSqlPatternPage[]>
const files = await walkFiles(root);
const pages: HistoricSqlPatternPage[] = [];
for (const file of files.filter((candidate) => candidate.endsWith('.md'))) {
if (file.includes('/')) {
continue;
}
const key = file.replace(/\.md$/, '');
const path = join(root, file);
const page = parseMarkdownPage(key, path, await readFile(path, 'utf-8'));
@ -201,6 +204,10 @@ async function loadPatternPages(root: string): Promise<HistoricSqlPatternPage[]>
return pages;
}
/** Builds the page key for a pattern: `historic-sql-` followed by the normalized slug. */
function historicSqlFlatKey(slug: string): string {
  const normalizedSlug = safeKnowledgeSlug(slug);
  return 'historic-sql-' + normalizedSlug;
}
async function currentStagedTables(rawDir: string): Promise<Set<string>> {
const tablesRoot = join(rawDir, 'tables');
const files = await walkFiles(tablesRoot);
@ -276,7 +283,7 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp
}
}
const wikiRoot = join(input.workdir, 'knowledge/global/historic-sql');
const wikiRoot = join(input.workdir, 'knowledge/global');
await mkdir(wikiRoot, { recursive: true });
const allPages = await loadPatternPages(wikiRoot);
const activePages = allPages.filter((page) => !isArchivedPatternPage(page));
@ -286,7 +293,7 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp
for (const pattern of patternEvidence) {
const incomingSignals = [...pattern.pattern.tablesInvolved, ...pattern.pattern.constituentTemplateIds];
const reusable = patternPages.find((page) => overlapRatio(incomingSignals, existingPageSignals(page)) >= 0.6);
const key = reusable?.key ?? safeKnowledgeSlug(pattern.pattern.slug);
const key = reusable?.key ?? historicSqlFlatKey(pattern.pattern.slug);
const pagePath = join(wikiRoot, `${key}.md`);
const frontmatter = {
summary: pattern.pattern.title,
@ -308,11 +315,12 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp
for (const page of patternPages) {
if (writtenKeys.has(page.key)) continue;
if (shouldArchive(page.frontmatter.stale_since, manifest.fetchedAt, manifest.staleArchiveAfterDays)) {
const archivePath = join(wikiRoot, '_archived', `${page.key}.md`);
const tags = [...new Set([...stringArray(page.frontmatter.tags), 'archived'])];
await mkdir(dirname(archivePath), { recursive: true });
await writeFile(archivePath, renderMarkdownPage({ ...page.frontmatter, tags }, page.content), 'utf-8');
await rm(page.path, { force: true });
await writeFile(
page.path,
renderMarkdownPage({ ...page.frontmatter, tags, archived_since: manifest.fetchedAt }, page.content),
'utf-8',
);
result.archivedPatternPages += 1;
continue;
}

View file

@ -15,6 +15,18 @@ describe('LookmlSourceAdapter validation sidecars', () => {
afterEach(async () => rm(tmpRoot, { recursive: true, force: true }));
it('returns configured target warehouse connection ids', async () => {
  // Input is duplicated and unsorted so the expectation covers both dedupe and ordering.
  const configuredIds = ['warehouse', 'analytics', 'warehouse'];
  const adapter = new LookmlSourceAdapter({
    homeDir: join(tmpRoot, 'home'),
    targetConnectionIds: configuredIds,
  });

  const listed = adapter.listTargetConnectionIds?.(join(tmpRoot, 'staged'));

  await expect(listed).resolves.toEqual(['analytics', 'warehouse']);
});
it('writes a partial fetch report and marks mismatched chunks as SL-disallowed', async () => {
const originRoot = join(tmpRoot, 'origin-src');
await mkdir(join(originRoot, 'views'), { recursive: true });

View file

@ -14,6 +14,11 @@ import { parseLookmlPullConfig } from './pull-config.js';
/** Construction-time dependencies of `LookmlSourceAdapter`. */
export interface LookmlSourceAdapterDeps {
homeDir: string;
/** Connection ids that `listTargetConnectionIds` returns, deduplicated and sorted. Optional; treated as empty when omitted. */
targetConnectionIds?: string[];
}
/** Returns the distinct values sorted with `localeCompare`; `undefined` yields an empty array. */
function uniqueSorted(values: readonly string[] | undefined): string[] {
  const distinct = Array.from(new Set(values ?? []));
  distinct.sort((left, right) => left.localeCompare(right));
  return distinct;
}
export class LookmlSourceAdapter implements SourceAdapter {
@ -43,6 +48,10 @@ export class LookmlSourceAdapter implements SourceAdapter {
return readLookmlFetchReport(stagedDir);
}
/** Returns the configured target connection ids, deduplicated and sorted. The staged directory is not consulted. */
async listTargetConnectionIds(_stagedDir: string): Promise<string[]> {
  const configured = this.deps.targetConnectionIds;
  return uniqueSorted(configured);
}
async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
const project = await parseLookmlStagedDir(stagedDir);
const mismatchedModelNames = await readLookmlMismatchedModelNames(stagedDir);

View file

@ -42,6 +42,15 @@ describe('MetricflowSourceAdapter', () => {
expect(adapter.skillNames).toEqual(['metricflow_ingest']);
});
it('returns configured target warehouse connection ids', async () => {
  // Input is duplicated and unsorted so the expectation covers both dedupe and ordering.
  const configuredIds = ['warehouse', 'analytics', 'warehouse'];
  const metricflow = new MetricflowSourceAdapter({
    homeDir: join(tmpRoot, 'cache-home'),
    targetConnectionIds: configuredIds,
  });

  const listed = metricflow.listTargetConnectionIds?.(stagedDir);

  await expect(listed).resolves.toEqual(['analytics', 'warehouse']);
});
it('detects a staged dir with a semantic_models YAML', async () => {
await mkdir(join(stagedDir, 'models'), { recursive: true });
await writeFile(

View file

@ -9,6 +9,11 @@ import { parseMetricflowPullConfig } from './pull-config.js';
/** Construction-time dependencies of `MetricflowSourceAdapter`. */
export interface MetricflowSourceAdapterDeps {
homeDir: string;
/** Connection ids that `listTargetConnectionIds` returns, deduplicated and sorted. Optional; treated as empty when omitted. */
targetConnectionIds?: string[];
}
/** Deduplicates the given values and sorts them with `localeCompare`; a missing list gives `[]`. */
function uniqueSorted(values: readonly string[] | undefined): string[] {
  const seen = new Set<string>(values ?? []);
  const ordered = Array.from(seen);
  ordered.sort((left, right) => left.localeCompare(right));
  return ordered;
}
export class MetricflowSourceAdapter implements SourceAdapter {
@ -30,6 +35,10 @@ export class MetricflowSourceAdapter implements SourceAdapter {
});
}
/** Returns the configured target connection ids, deduplicated and sorted. The staged directory is not consulted. */
async listTargetConnectionIds(_stagedDir: string): Promise<string[]> {
  const { targetConnectionIds } = this.deps;
  return uniqueSorted(targetConnectionIds);
}
async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
const project = await parseMetricFlowStagedDir(stagedDir);
const chunk = await chunkMetricFlowProject(project, { diffSet });

View file

@ -8,7 +8,7 @@ const MAX_NOTION_WORK_UNIT_CHARS = 40_000;
export const NOTION_ORG_KNOWLEDGE_WARNING =
'Anything accessible to this Notion integration can become organization knowledge.';
const NOTION_SL_WRITE_GUIDANCE =
'Write wiki entries with wiki_write. Wiki keys must be flat slugs like orbit-company-overview, not orbit/company-overview. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
'Write wiki entries with wiki_write. Wiki keys must be flat slugs like orbit-company-overview, not orbit/company-overview. Search existing wiki pages, SL sources, and raw warehouse schema for the same tables or sl_refs with discover_data before creating a new page. Only write or edit SL sources after discover_data plus sl_discover/sl_read_source or entity_details confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
async function walk(root: string): Promise<string[]> {
const entries = await readdir(root, { withFileTypes: true, recursive: true });
@ -117,7 +117,7 @@ export async function chunkNotionStagedDir(stagedDir: string, diffSet?: DiffSet)
reconcileNotes: [
`Notion maxKnowledgeCreatesPerRun=${manifest.maxKnowledgeCreatesPerRun}`,
`Notion maxKnowledgeUpdatesPerRun=${manifest.maxKnowledgeUpdatesPerRun}`,
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
'Notion dataSourceCount is Notion-only; use discover_data/entity_details for warehouse/dbt mapping decisions.',
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
],
contextReport: {

View file

@ -52,6 +52,14 @@ describe('NotionSourceAdapter', () => {
expect(adapter.triageSupported).toBe(true);
});
it('returns configured target warehouse connection ids', async () => {
  // Input is duplicated and unsorted so the expectation covers both dedupe and ordering.
  const configuredIds = ['warehouse', 'warehouse', 'analytics'];
  const adapter = new NotionSourceAdapter({ targetConnectionIds: configuredIds });

  const listed = adapter.listTargetConnectionIds?.(stagedDir);

  await expect(listed).resolves.toEqual(['analytics', 'warehouse']);
});
it('returns structural triage signals for a staged Notion page', async () => {
await mkdir(join(stagedDir, 'pages', 'page-1'), { recursive: true });
await writeFile(
@ -242,6 +250,8 @@ describe('NotionSourceAdapter', () => {
});
expect(result.workUnits[0].notes).toContain('Synthesize durable wiki and SL knowledge');
expect(result.workUnits[0].notes).toContain('emit_unmapped_fallback');
expect(result.workUnits[0].notes).toContain('discover_data');
expect(result.workUnits[0].notes).toContain('entity_details');
expect(result.workUnits[0].notes).toContain('use reason no_physical_table rather than no_connection_mapping');
expect(result.workUnits[0].notes).toContain('Do not create SL sources under the Notion connection');
expect(result.workUnits[0].notes).toContain(
@ -250,7 +260,7 @@ describe('NotionSourceAdapter', () => {
expect(result.reconcileNotes).toEqual([
'Notion maxKnowledgeCreatesPerRun=25',
'Notion maxKnowledgeUpdatesPerRun=20',
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
'Notion dataSourceCount is Notion-only; use discover_data/entity_details for warehouse/dbt mapping decisions.',
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
]);
expect(result.contextReport).toEqual({ capped: false, warnings: [NOTION_ORG_KNOWLEDGE_WARNING] });

View file

@ -32,6 +32,11 @@ interface NotionPullSucceededContext {
/** Optional construction-time dependencies of `NotionSourceAdapter`. */
export interface NotionSourceAdapterDeps {
/** Async callback receiving a `NotionPullSucceededContext`. */
onPullSucceeded?: (ctx: NotionPullSucceededContext) => Promise<void>;
logger?: NotionFetchLogger;
/** Connection ids that `listTargetConnectionIds` returns, deduplicated and sorted. Treated as empty when omitted. */
targetConnectionIds?: string[];
}
/** Produces a new array of the distinct input values in `localeCompare` order; `undefined` gives `[]`. */
function uniqueSorted(values: readonly string[] | undefined): string[] {
  const source = values ?? [];
  const result = [...new Set(source)];
  result.sort((left, right) => left.localeCompare(right));
  return result;
}
export class NotionSourceAdapter implements SourceAdapter {
@ -73,6 +78,10 @@ export class NotionSourceAdapter implements SourceAdapter {
return describeNotionScope(stagedDir);
}
/** Returns the configured target connection ids, deduplicated and sorted. The staged directory is not consulted. */
async listTargetConnectionIds(_stagedDir: string): Promise<string[]> {
  const ids = uniqueSorted(this.deps.targetConnectionIds);
  return ids;
}
async getTriageSignals(stagedDir: string, externalId: string): Promise<TriageSignals> {
const metadata = await this.findMetadataByExternalId(stagedDir, externalId);
if (!metadata) {

View file

@ -409,6 +409,38 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
);
});
// Verifies that connection ids reported by the adapter's listTargetConnectionIds end up,
// together with the run's own connectionId, in the tool session's allowedConnectionNames.
it('threads target warehouse connection names into WorkUnit and reconcile tool sessions', async () => {
const deps = makeDeps();
// Collects every tool session handed to the toolset factory.
const sessions: any[] = [];
deps.adapter.listTargetConnectionIds = vi.fn().mockResolvedValue(['warehouse']);
deps.toolsetFactory.createIngestWuToolset.mockImplementation((toolSession: any) => {
sessions.push(toolSession);
return {
toAiSdkTools: vi.fn().mockReturnValue({}),
getAllTools: vi.fn().mockReturnValue([]),
getToolNames: vi.fn().mockReturnValue([]),
};
});
deps.agentRunner.runLoop.mockResolvedValue({ stopReason: 'natural' });
const runner = buildRunner(deps);
// Replace raw-file staging and staged-dir resolution with canned results.
(runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({
currentHashes: new Map([['a.yml', 'h1']]),
rawDirInWorktree: 'raw-sources/notion/fake/s',
});
(runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x');
await runner.run({
jobId: 'j1',
connectionId: 'notion',
sourceKey: 'fake',
trigger: 'upload',
bundleRef: { kind: 'upload', uploadId: 'upload-x' },
});
// NOTE(review): only the first captured session is asserted, although the test title
// also mentions reconcile sessions — confirm whether later sessions should be checked too.
expect([...sessions[0].allowedConnectionNames].sort()).toEqual(['notion', 'warehouse']);
});
it('reuses document evidence indexing and page triage for document WorkUnits', async () => {
const deps = makeDeps();
deps.adapter.source = 'notion';
@ -643,6 +675,14 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
});
}
if (params.telemetryTags.operationName === 'ingest-bundle-reconcile') {
await params.toolSet.record_verification_ledger.execute(
{
summary: 'Reconciliation emits no warehouse identifiers before fallback recording.',
verifiedIdentifiers: [],
unverifiedIdentifiers: [],
},
{ toolCallId: 'ledger-1', messages: [] },
);
await params.toolSet.emit_conflict_resolution.execute(
{
kind: 'near_duplicate',
@ -811,6 +851,14 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
{ path: 'a.yml', startLine: 1, endLine: 2 },
{ toolCallId: 'read-1', messages: [] },
);
await params.toolSet.record_verification_ledger.execute(
{
summary: 'Wiki write contains no warehouse identifiers.',
verifiedIdentifiers: [],
unverifiedIdentifiers: [],
},
{ toolCallId: 'ledger-1', messages: [] },
);
await params.toolSet.wiki_write.execute(
{ key: 'knowledge/a.md', content: 'safe summary' },
{ toolCallId: 'wiki-1', messages: [] },
@ -850,9 +898,9 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
{
unitKey: 'u1',
path: '/tmp/ktx-test/run/wu-transcripts/j1/u1.jsonl',
toolCallCount: 2,
toolCallCount: 3,
errorCount: 0,
toolNames: ['read_raw_span', 'wiki_write'],
toolNames: ['read_raw_span', 'record_verification_ledger', 'wiki_write'],
},
],
}),
@ -864,6 +912,14 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
const deps = makeDeps();
deps.agentRunner.runLoop.mockImplementation(async (params: any) => {
if (params.telemetryTags.operationName === 'ingest-bundle-wu') {
await params.toolSet.record_verification_ledger.execute(
{
summary: 'Unmapped fallback records an unsupported conversion metric without verified warehouse identifiers.',
verifiedIdentifiers: [],
unverifiedIdentifiers: [],
},
{ toolCallId: 'ledger-1', messages: [] },
);
await params.toolSet.emit_unmapped_fallback.execute(
{
rawPath: 'a.yml',
@ -920,6 +976,14 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
});
deps.agentRunner.runLoop.mockImplementation(async (params: any) => {
if (params.telemetryTags.operationName === 'ingest-bundle-reconcile') {
await params.toolSet.record_verification_ledger.execute(
{
summary: 'Reconciliation records conflict, eviction, and fallback decisions without warehouse identifiers.',
verifiedIdentifiers: [],
unverifiedIdentifiers: [],
},
{ toolCallId: 'ledger-1', messages: [] },
);
await params.toolSet.emit_conflict_resolution.execute(
{
kind: 'near_duplicate',

View file

@ -53,6 +53,7 @@ import type {
UnresolvedCardInfo,
WorkUnit,
} from './types.js';
import { repairWikiSlRefs, type WikiSlRefRepairResult } from './wiki-sl-ref-repair.js';
function workUnitToMemoryFlowPlannedWorkUnit(workUnit: WorkUnit): MemoryFlowPlannedWorkUnit {
return {
@ -528,6 +529,7 @@ export class IngestBundleRunner {
let sourceContextReport: { capped?: boolean; warnings?: string[] } | undefined;
let parseArtifacts: unknown;
let postProcessorOutcome: IngestReportPostProcessorOutcome | undefined;
let wikiSlRefRepairResult: WikiSlRefRepairResult | null = null;
let reconcileNotes: string[] = [];
let triageResult: PageTriageRunResult | null = null;
if (overrideReport) {
@ -662,6 +664,7 @@ export class IngestBundleRunner {
touchedSlSources: session.touchedSlSources,
actions: sessionActions,
allowedRawPaths: new Set(wu.rawFiles),
allowedConnectionNames: new Set(slConnectionIds),
semanticLayerService: scopedSemanticLayerService,
wikiService: scopedWikiService,
configService: sessionWorktree.config,
@ -898,6 +901,7 @@ export class IngestBundleRunner {
touchedSlSources: reconcileSession.touchedSlSources,
actions: reconcileActions,
allowedRawPaths: reconciliationAllowedRawPaths,
allowedConnectionNames: new Set(slConnectionIds),
semanticLayerService: rcScopedSl,
wikiService: rcScopedWiki,
configService: sessionWorktree.config,
@ -1138,6 +1142,19 @@ export class IngestBundleRunner {
}
}
const repairConnectionIds = [
...new Set([
...slConnectionIds,
...(postProcessorOutcome?.touchedSources ?? []).map((source) => source.connectionId),
]),
].sort();
wikiSlRefRepairResult = await repairWikiSlRefs({
wikiService: this.deps.wikiService.forWorktree(sessionWorktree.workdir),
semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir),
configService: sessionWorktree.config,
connectionIds: repairConnectionIds,
});
// Stage 6 — squash commit
const stage6 = ctx?.startPhase(0.04);
await stage6?.updateProgress(0.0, 'Saving changes');
@ -1354,6 +1371,8 @@ export class IngestBundleRunner {
provenanceRows: reportProvenanceRows,
toolTranscripts: reportToolTranscripts,
postProcessor: postProcessorOutcome,
wikiSlRefRepairs: wikiSlRefRepairResult.repairs,
wikiSlRefRepairWarnings: wikiSlRefRepairResult.warnings,
...(reportMemoryFlow ? { memoryFlow: reportMemoryFlow } : {}),
context: contextReport
? {

View file

@ -29,6 +29,18 @@ describe('ingest prompt assets', () => {
expect(prompt).not.toMatch(forbiddenProductPattern());
});
// Guards the WorkUnit prompt asset: it must reference the discover_data and entity_details tools
// and must not reference wiki_sl_search or sl_describe_table.
it('uses shipped warehouse verification tools in the WorkUnit prompt', async () => {
// The prompt file is resolved relative to this test module.
const prompt = await readFile(
new URL('../../prompts/memory_agent_bundle_ingest_work_unit.md', import.meta.url),
'utf-8',
);
expect(prompt).toContain('discover_data');
expect(prompt).toContain('entity_details');
// Tool names the prompt is required not to mention.
expect(prompt).not.toContain('wiki_sl_search');
expect(prompt).not.toContain('sl_describe_table');
});
it('does not route historic-SQL through page-triage prompt examples', async () => {
const prompt = await readFile(new URL('../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8');

View file

@ -91,4 +91,14 @@ describe('ingest runtime assets', () => {
expect(body).toContain('cross-table');
expect(body).not.toMatch(forbiddenProductPattern());
});
// Checks that the shared identifier-verification skill asset exists under skillsDir/_shared and
// contains the protocol heading, the three verification tool names, and both sql_execution
// example shapes that pass connectionName alongside the SQL.
it('packages identifier verification prompt assets', async () => {
const shared = await readFile(join(skillsDir, '_shared', 'identifier-verification.md'), 'utf-8');
expect(shared).toContain('## Identifier Verification Protocol');
expect(shared).toContain('discover_data');
expect(shared).toContain('entity_details');
expect(shared).toContain('sql_execution');
// The examples must include connectionName in the call object.
expect(shared).toContain('sql_execution({connectionName, sql: "SELECT DISTINCT');
expect(shared).toContain('sql_execution({connectionName, sql: "SELECT 1 FROM');
});
});

View file

@ -498,6 +498,60 @@ describe('local ingest adapters', () => {
await expect(adapter?.listTargetConnectionIds?.('/tmp/staged-dbt')).resolves.toEqual(['warehouse']);
});
// Builds a project with three connections (notion, a postgres "warehouse", and a dbt "docs")
// and expects the Notion adapter to report only 'warehouse' as its target connection id.
it('passes primary warehouse connection ids to the local Notion adapter', async () => {
const adapters = createDefaultLocalIngestAdapters(
projectWithConnections({
notion: {
driver: 'notion',
auth_token: 'secret',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
},
warehouse: {
driver: 'postgres',
url: 'postgresql://readonly@db.example.test/analytics',
},
docs: {
driver: 'dbt',
source_dir: './dbt',
},
} as never),
);
// Adapters are looked up by their `source` discriminator.
const notion = adapters.find((adapter) => adapter.source === 'notion');
await expect(notion?.listTargetConnectionIds?.('/tmp/staged-notion')).resolves.toEqual(['warehouse']);
});
// Builds a project with a postgres "warehouse" plus LookML and MetricFlow repo connections and
// expects both the LookML and MetricFlow adapters to report only 'warehouse' as their target.
it('passes primary warehouse connection ids to local LookML and MetricFlow adapters', async () => {
const adapters = createDefaultLocalIngestAdapters(
projectWithConnections({
warehouse: {
driver: 'postgres',
url: 'postgresql://readonly@db.example.test/analytics',
},
lookml_docs: {
driver: 'lookml',
lookml: {
repoUrl: 'https://github.com/acme/lookml.git',
},
},
metrics_repo: {
driver: 'metricflow',
metricflow: {
repoUrl: 'https://github.com/acme/metrics.git',
},
},
} as never),
);
// Adapters are looked up by their `source` discriminator.
const lookml = adapters.find((adapter) => adapter.source === 'lookml');
const metricflow = adapters.find((adapter) => adapter.source === 'metricflow');
await expect(lookml?.listTargetConnectionIds?.('/tmp/staged-lookml')).resolves.toEqual(['warehouse']);
await expect(metricflow?.listTargetConnectionIds?.('/tmp/staged-metricflow')).resolves.toEqual(['warehouse']);
});
it('resolves MetricFlow auth_token_ref without writing literal tokens to config', async () => {
const project = projectWithConnections({
metricflow_main: {

View file

@ -88,7 +88,10 @@ export function createDefaultLocalIngestAdapters(
...(options.databaseIntrospectionUrl ? { baseUrl: options.databaseIntrospectionUrl } : {}),
}),
}),
new LookmlSourceAdapter({ homeDir: join(project.projectDir, '.ktx/cache') }),
new LookmlSourceAdapter({
homeDir: join(project.projectDir, '.ktx/cache'),
targetConnectionIds: primaryWarehouseConnectionIds(project),
}),
new DbtSourceAdapter({
homeDir: join(project.projectDir, '.ktx/cache'),
targetConnectionIds: primaryWarehouseConnectionIds(project),
@ -106,8 +109,12 @@ export function createDefaultLocalIngestAdapters(
},
},
}),
new MetricflowSourceAdapter({ homeDir: join(project.projectDir, '.ktx/cache') }),
new MetricflowSourceAdapter({
homeDir: join(project.projectDir, '.ktx/cache'),
targetConnectionIds: primaryWarehouseConnectionIds(project),
}),
new NotionSourceAdapter({
targetConnectionIds: primaryWarehouseConnectionIds(project),
...(options.logger ? { logger: options.logger } : {}),
}),
];

View file

@ -27,6 +27,18 @@ class LookerSlWritingAgentRunner extends AgentRunnerService {
params.telemetryTags?.operationName === 'ingest-bundle-wu' &&
params.telemetryTags?.unitKey === 'looker-explore-ecommerce-orders'
) {
const ledger = params.toolSet.record_verification_ledger;
if (!ledger?.execute) {
throw new Error('record_verification_ledger tool was not available to the Looker WorkUnit');
}
await ledger.execute(
{
summary: 'Test fixture verified Looker explore target identifiers before writing SL.',
verifiedIdentifiers: ['prod-warehouse', 'public.orders'],
unverifiedIdentifiers: [],
},
{ toolCallId: 'looker-verification-ledger', messages: [] },
);
const slWrite = params.toolSet.sl_write_source;
if (!slWrite?.execute) {
throw new Error('sl_write_source tool was not available to the Looker WorkUnit');
@ -63,6 +75,18 @@ class LookerSlWritingAgentRunner extends AgentRunnerService {
class WikiWritingAgentRunner extends AgentRunnerService {
override runLoop = vi.fn(async (params: any) => {
if (params.telemetryTags?.operationName === 'ingest-bundle-wu') {
const ledger = params.toolSet.record_verification_ledger;
if (!ledger?.execute) {
throw new Error('record_verification_ledger tool was not available to the WorkUnit');
}
await ledger.execute(
{
summary: 'Test fixture writes wiki-only context with no warehouse identifiers.',
verifiedIdentifiers: [],
unverifiedIdentifiers: [],
},
{ toolCallId: 'wiki-verification-ledger', messages: [] },
);
const wikiWrite = params.toolSet.wiki_write;
if (!wikiWrite?.execute) {
throw new Error('wiki_write tool was not available to the WorkUnit');
@ -91,6 +115,18 @@ class WikiWritingAgentRunner extends AgentRunnerService {
class WikiWritingWithRawPathAgentRunner extends AgentRunnerService {
override runLoop = vi.fn(async (params: any) => {
if (params.telemetryTags?.operationName === 'ingest-bundle-wu') {
const ledger = params.toolSet.record_verification_ledger;
if (!ledger?.execute) {
throw new Error('record_verification_ledger tool was not available to the WorkUnit');
}
await ledger.execute(
{
summary: 'Test fixture writes wiki-only context with explicit raw provenance and no warehouse identifiers.',
verifiedIdentifiers: [],
unverifiedIdentifiers: [],
},
{ toolCallId: 'wiki-raw-path-verification-ledger', messages: [] },
);
const wikiWrite = params.toolSet.wiki_write;
if (!wikiWrite?.execute) {
throw new Error('wiki_write tool was not available to the WorkUnit');

View file

@ -3,7 +3,7 @@ import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { AgentRunnerService } from '../agent/index.js';
import { initKtxProject, type KtxLocalProject, loadKtxProject } from '../project/index.js';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js';
import { createLocalBundleIngestRuntime } from './local-bundle-runtime.js';
@ -12,6 +12,7 @@ type RuntimeWithConnectionDeps = {
connections: {
listEnabledConnections(ids: string[]): Promise<Array<{ id: string; name: string; connectionType: string }>>;
getConnectionById(connectionId: string): Promise<{ id: string; name: string; connectionType: string } | null>;
executeQuery(connectionId: string, sql: string): Promise<unknown>;
};
};
};
@ -113,6 +114,37 @@ describe('createLocalBundleIngestRuntime', () => {
]);
});
// Verifies that the runtime's connection catalog forwards executeQuery calls to the injected
// query executor together with the project directory and the connection's config entry.
it('passes project connection config to local ingest query executors', async () => {
const agentRunner = new AgentRunnerService({ llmProvider: { getModel: () => ({}) as never } as any });
// Stub executor returning a fixed single-row result.
const queryExecutor = {
execute: vi.fn(async () => ({
headers: ['answer'],
rows: [[1]],
totalRows: 1,
command: 'SELECT',
rowCount: 1,
})),
};
const runtime = createLocalBundleIngestRuntime({
project,
adapters: [new FakeSourceAdapter()],
agentRunner,
queryExecutor,
});
// Reaches into the runner's deps through a test-only structural type to get the catalog.
const connections = (runtime.runner as unknown as RuntimeWithConnectionDeps).deps.connections;
await expect(connections.executeQuery('warehouse', 'select 1')).resolves.toMatchObject({
headers: ['answer'],
});
// The executor must receive the connection config and project directory, not just id + sql.
expect(queryExecutor.execute).toHaveBeenCalledWith({
connectionId: 'warehouse',
projectDir: project.projectDir,
connection: project.config.connections.warehouse,
sql: 'select 1',
});
});
it('accepts a debug LLM request file when constructing the default agent runner', async () => {
await writeFile(
join(project.projectDir, 'ktx.yaml'),

View file

@ -6,7 +6,7 @@ import type { Tool } from 'ai';
import YAML from 'yaml';
import type { AgentRunnerService } from '../agent/index.js';
import { AgentRunnerService as DefaultAgentRunnerService } from '../agent/index.js';
import { localConnectionInfoFromConfig } from '../connections/index.js';
import { localConnectionInfoFromConfig, type KtxSqlQueryExecutorPort } from '../connections/index.js';
import type { KtxEmbeddingPort, KtxLogger } from '../core/index.js';
import { noopLogger, SessionWorktreeService } from '../core/index.js';
import type { KtxSemanticLayerComputePort } from '../daemon/index.js';
@ -56,6 +56,7 @@ import {
buildKnowledgeSearchText,
type KnowledgeEventPort,
type KnowledgeIndexPort,
type KnowledgeIndexPageListing,
KnowledgeWikiService,
searchLocalKnowledgePages,
SqliteKnowledgeIndex,
@ -77,6 +78,7 @@ import { ContextEvidenceIndexService, SqliteContextEvidenceStore } from './conte
import { DiffSetService } from './diff-set.service.js';
import { IngestBundleRunner } from './ingest-bundle.runner.js';
import { PageTriageService } from './page-triage/index.js';
import { createWarehouseVerificationTools } from './tools/warehouse-verification/index.js';
import type {
IngestBundleRunnerDeps,
IngestCommitMessagePort,
@ -103,7 +105,7 @@ export interface CreateLocalBundleIngestRuntimeOptions {
llmDebugRequestFile?: string;
memoryModel?: string;
semanticLayerCompute?: KtxSemanticLayerComputePort;
queryExecutor?: { execute(input: { connectionId: string; sql: string; maxRows?: number }): Promise<KtxQueryResult> };
queryExecutor?: KtxSqlQueryExecutorPort;
jobIdFactory?: () => string;
logger?: KtxLogger;
}
@ -169,9 +171,7 @@ class LocalAuthorResolver implements GitAuthorResolverPort {
class LocalConnectionCatalog implements SlConnectionCatalogPort {
constructor(
private readonly project: KtxLocalProject,
private readonly queryExecutor?: {
execute(input: { connectionId: string; sql: string; maxRows?: number }): Promise<KtxQueryResult>;
},
private readonly queryExecutor?: KtxSqlQueryExecutorPort,
) {}
async listEnabledConnections(ids: string[]): Promise<KtxConnectionInfo[]> {
@ -192,7 +192,12 @@ class LocalConnectionCatalog implements SlConnectionCatalogPort {
if (!this.queryExecutor) {
throw new Error('Local ingest has no query executor configured');
}
return this.queryExecutor.execute({ connectionId, sql });
return this.queryExecutor.execute({
connectionId,
projectDir: this.project.projectDir,
connection: this.project.config.connections[connectionId],
sql,
});
}
}
@ -347,15 +352,19 @@ class LocalKnowledgeIndex implements KnowledgeIndexPort {
async listPagesForUser(
userId: string,
): Promise<Array<{ page_key: string; summary: string; scope: string; scope_id: string | null }>> {
const pages: Array<{ page_key: string; summary: string; scope: string; scope_id: string | null }> = [];
): Promise<KnowledgeIndexPageListing[]> {
const pages: KnowledgeIndexPageListing[] = [];
for (const scope of [
{ scope: 'GLOBAL', scopeId: null, dir: 'knowledge/global' },
{ scope: 'USER', scopeId: userId, dir: `knowledge/user/${userId}` },
]) {
const listed = await this.project.fileStore.listFiles(scope.dir, true);
for (const file of listed.files.filter((entry) => entry.endsWith('.md'))) {
const pageKey = file.replace(/\.md$/, '');
const parsedPath = parseKnowledgeIndexPath(file.startsWith('global/') || file.startsWith('user/') ? file : `${scope.dir.replace('knowledge/', '')}/${file}`);
if (!parsedPath || parsedPath.scope !== scope.scope) {
continue;
}
const pageKey = parsedPath.pageKey;
const raw = await this.project.fileStore.readFile(`${scope.dir}/${file}`);
const parsed = parseWiki(raw.content);
pages.push({
@ -363,6 +372,7 @@ class LocalKnowledgeIndex implements KnowledgeIndexPort {
summary: parsed.summary,
scope: scope.scope,
scope_id: scope.scopeId,
tags: parseWikiTags(raw.content),
});
}
}
@ -432,13 +442,6 @@ function parseKnowledgeIndexPath(file: string): { scope: 'GLOBAL' | 'USER'; page
const pageKey = segments[1].replace(/\.md$/, '');
return /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(pageKey) ? { scope: 'GLOBAL', pageKey } : null;
}
if (segments.length >= 3 && segments[0] === 'global' && segments[1] === 'historic-sql') {
const historicPath = segments.slice(2).join('/').replace(/\.md$/, '');
if (historicPath.split('/').every((segment) => /^[a-zA-Z0-9_][a-zA-Z0-9_-]*$/.test(segment))) {
return { scope: 'GLOBAL', pageKey: `historic-sql/${historicPath}` };
}
return null;
}
if (segments.length === 3 && segments[0] === 'user') {
const pageKey = segments[2].replace(/\.md$/, '');
return /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(pageKey) ? { scope: 'USER', pageKey } : null;
@ -486,38 +489,47 @@ class LocalIngestToolsetFactory implements IngestToolsetFactoryPort {
slSearchService: deps.slSearchService,
authorResolver: deps.authorResolver,
};
const wikiSearchTool = new WikiSearchTool({
search: async (input) => {
const results = await searchLocalKnowledgePages(deps.project, {
userId: input.userId,
query: input.query,
limit: input.limit,
embeddingService: deps.embedding,
});
return {
results: results.slice(0, input.limit).map((result) => ({
key: result.key,
path: result.path,
summary: result.summary,
score: result.score,
matchReasons: result.matchReasons,
lanes: result.lanes,
})),
totalFound: results.length,
};
},
});
const slDiscoverTool = new SlDiscoverTool(slDeps, { maxSources: 25, minRrfScore: 0, maxDetailedSources: 5 });
const warehouseVerificationTools = createWarehouseVerificationTools({
connections: deps.connections,
fallbackFileStore: deps.project.fileStore,
wikiSearchTool,
slDiscoverTool,
});
this.baseTools = [
new WikiReadTool(deps.wikiService, deps.knowledgeIndex),
new WikiSearchTool({
search: async (input) => {
const results = await searchLocalKnowledgePages(deps.project, {
userId: input.userId,
query: input.query,
limit: input.limit,
embeddingService: deps.embedding,
});
return {
results: results.slice(0, input.limit).map((result) => ({
key: result.key,
path: result.path,
summary: result.summary,
score: result.score,
matchReasons: result.matchReasons,
lanes: result.lanes,
})),
totalFound: results.length,
};
},
}),
new WikiListTagsTool(deps.wikiService, deps.knowledgeIndex),
wikiSearchTool,
new WikiListTagsTool(deps.knowledgeIndex),
new WikiWriteTool(deps.wikiService, deps.knowledgeIndex, deps.knowledgeEvents),
new WikiRemoveTool(deps.wikiService, deps.knowledgeIndex, deps.knowledgeEvents),
new SlDiscoverTool(slDeps, { maxSources: 25, minRrfScore: 0, maxDetailedSources: 5 }),
slDiscoverTool,
new SlEditSourceTool(slDeps),
new SlReadSourceTool(slDeps),
new SlWriteSourceTool(slDeps),
new SlValidateTool(slDeps),
new SlRollbackTool(deps.slSourcesRepository, deps.connections, 0),
...warehouseVerificationTools,
];
this.contextTools = [
new ContextEvidenceSearchTool(deps.contextStore, deps.embedding),

View file

@ -3,11 +3,11 @@ import { cp, mkdir, rm } from 'node:fs/promises';
import { isAbsolute, resolve } from 'node:path';
import type { KtxLlmProvider } from '@ktx/llm';
import type { AgentRunnerService } from '../agent/index.js';
import type { KtxSqlQueryExecutorPort } from '../connections/index.js';
import type { KtxLogger } from '../core/index.js';
import type { KtxSemanticLayerComputePort } from '../daemon/index.js';
import type { KtxLocalProject } from '../project/index.js';
import { ktxLocalStateDbPath } from '../project/index.js';
import type { KtxQueryResult } from '../sl/index.js';
import { planMetabaseFanoutChildren } from './adapters/metabase/fanout-planner.js';
import { LocalMetabaseSourceStateReader } from './adapters/metabase/local-source-state-store.js';
import { localPullConfigForAdapter, type DefaultLocalIngestAdaptersOptions } from './local-adapters.js';
@ -34,7 +34,7 @@ export interface RunLocalIngestOptions {
llmDebugRequestFile?: string;
memoryModel?: string;
semanticLayerCompute?: KtxSemanticLayerComputePort;
queryExecutor?: { execute(input: { connectionId: string; sql: string; maxRows?: number }): Promise<KtxQueryResult> };
queryExecutor?: KtxSqlQueryExecutorPort;
logger?: KtxLogger;
}
@ -172,7 +172,7 @@ async function runScheduledPullJob(options: {
llmProvider?: KtxLlmProvider;
memoryModel?: string;
semanticLayerCompute?: KtxSemanticLayerComputePort;
queryExecutor?: { execute(input: { connectionId: string; sql: string; maxRows?: number }): Promise<KtxQueryResult> };
queryExecutor?: KtxSqlQueryExecutorPort;
logger?: KtxLogger;
}): Promise<LocalIngestResult> {
const runtime = createLocalBundleIngestRuntime(options);

View file

@ -9,6 +9,7 @@ import type {
StageIndex,
UnmappedFallbackRecord,
} from './stages/stage-index.types.js';
import type { WikiSlRefRepair } from './wiki-sl-ref-repair.js';
import type { IngestDiffSummary, SourceFetchReport, UnresolvedCardInfo } from './types.js';
export interface IngestReportWorkUnit {
@ -70,6 +71,8 @@ export interface IngestReportBody {
provenanceRows: IngestReportProvenanceDetail[];
toolTranscripts: IngestReportToolTranscriptSummary[];
postProcessor?: IngestReportPostProcessorOutcome;
wikiSlRefRepairs?: WikiSlRefRepair[];
wikiSlRefRepairWarnings?: string[];
memoryFlow?: MemoryFlowReplayInput;
}

View file

@ -107,6 +107,7 @@ describe('buildReconcileToolSet', () => {
'eviction_list',
'load_skill',
'read_raw_span',
'record_verification_ledger',
'sl_write_source',
'stage_diff',
'stage_list',
@ -114,4 +115,54 @@ describe('buildReconcileToolSet', () => {
].sort(),
);
});
// Exercises the ledger gate on the reconcile tool set: sl_write_source is refused with
// reason 'verification_ledger_required' until record_verification_ledger has been called,
// after which the same call reaches the underlying tool.
it('requires the verification ledger before reconciliation write tools run', async () => {
const slWrite = vi.fn().mockResolvedValue({ markdown: 'written', structured: { success: true } });
// Every input tool is a minimal stub; only sl_write_source's execute is observed.
const toolSet = buildReconcileToolSet({
loadSkillTool: { load_skill: { description: 'load', inputSchema: {} as any, execute: vi.fn() } } as any,
stageListTool: { stage_list: { description: 'stage list', inputSchema: {} as any, execute: vi.fn() } } as any,
stageDiffTool: { stage_diff: { description: 'stage diff', inputSchema: {} as any, execute: vi.fn() } } as any,
evictionListTool: {
eviction_list: { description: 'eviction list', inputSchema: {} as any, execute: vi.fn() },
} as any,
emitConflictResolutionTool: {
emit_conflict_resolution: { description: 'conflict', inputSchema: {} as any, execute: vi.fn() },
} as any,
emitEvictionDecisionTool: {
emit_eviction_decision: { description: 'eviction', inputSchema: {} as any, execute: vi.fn() },
} as any,
emitArtifactResolutionTool: {
emit_artifact_resolution: { description: 'resolution', inputSchema: {} as any, execute: vi.fn() },
} as any,
emitUnmappedFallbackTool: {
emit_unmapped_fallback: { description: 'fallback', inputSchema: {} as any, execute: vi.fn() },
} as any,
readRawSpanTool: { read_raw_span: { description: 'raw span', inputSchema: {} as any, execute: vi.fn() } } as any,
toolsetTools: { sl_write_source: { description: 'sl write', inputSchema: {} as any, execute: slWrite } as any },
});
// First attempt: no ledger entry yet, so the wrapped tool must not be invoked.
const correction = await toolSet.sl_write_source.execute?.(
{ connectionId: 'warehouse', sourceName: 'accounts' },
{ toolCallId: 't1' } as any,
);
expect(slWrite).not.toHaveBeenCalled();
expect(correction).toMatchObject({ structured: { success: false, reason: 'verification_ledger_required' } });
// Record a ledger entry, which opens the gate for this tool set.
await toolSet.record_verification_ledger.execute?.(
{
summary: 'Verified warehouse.accounts with entity_details.',
verifiedIdentifiers: ['warehouse.accounts'],
unverifiedIdentifiers: [],
},
{ toolCallId: 't2' } as any,
);
// Retry: the call now passes through to the underlying tool exactly once.
const written = await toolSet.sl_write_source.execute?.(
{ connectionId: 'warehouse', sourceName: 'accounts' },
{ toolCallId: 't3' } as any,
);
expect(slWrite).toHaveBeenCalledTimes(1);
expect(written).toMatchObject({ structured: { success: true } });
});
});

View file

@ -1,5 +1,10 @@
import type { Tool, ToolSet } from 'ai';
import { buildCanonicalPinsPromptBlock, type CanonicalPin } from '../canonical-pins.js';
import {
createVerificationLedgerState,
VERIFICATION_LEDGER_PROMPT,
withVerificationLedger,
} from '../tools/verification-ledger.tool.js';
import type { EvictionUnit } from '../types.js';
import type { StageIndex } from './stage-index.types.js';
@ -12,6 +17,7 @@ export function buildReconcileSystemPrompt(params: {
}): string {
return [
params.baseFraming.trimEnd(),
VERIFICATION_LEDGER_PROMPT,
params.skillsPrompt.trimEnd(),
buildCanonicalPinsPromptBlock(params.canonicalPins),
`\n<context>\nsyncId: ${params.syncId}\nsource: ${params.sourceKey}\n</context>`,
@ -188,16 +194,20 @@ export interface ReconcileToolSetInput {
}
/**
 * Assemble the tool set handed to the reconciliation agent loop.
 *
 * The per-purpose tool maps from `input` are merged by object spread, so on a name clash the
 * later spread wins. The merged set is then passed through `withVerificationLedger`, which
 * adds `record_verification_ledger` and gates the write-capable tools on it.
 *
 * A fresh ledger state is created on every call, so each tool set built here must record its
 * own ledger entry before its write tools run.
 *
 * @param input - The individual tool maps to merge.
 * @returns The merged, ledger-guarded tool set.
 */
export function buildReconcileToolSet(input: ReconcileToolSetInput): ToolSet {
  // One ledger state per tool set: verification recorded elsewhere does not open this gate.
  const state = createVerificationLedgerState();
  return withVerificationLedger(
    {
      ...input.toolsetTools,
      ...input.loadSkillTool,
      ...input.stageListTool,
      ...input.stageDiffTool,
      ...input.evictionListTool,
      ...input.emitConflictResolutionTool,
      ...input.emitEvictionDecisionTool,
      ...input.emitArtifactResolutionTool,
      ...input.emitUnmappedFallbackTool,
      ...input.readRawSpanTool,
    },
    state,
  );
}

View file

@ -68,12 +68,45 @@ describe('buildWuToolSet', () => {
'load_skill',
'read_raw_file',
'read_raw_span',
'record_verification_ledger',
'sl_write_source',
'wiki_search',
].sort(),
);
});
// Exercises the ledger gate on the WorkUnit tool set: wiki_write is refused with reason
// 'verification_ledger_required' (and a message naming record_verification_ledger) until a
// ledger entry is recorded, after which the same call reaches the underlying tool.
it('requires the verification ledger before write-capable tools run', async () => {
const wikiWrite = vi.fn().mockResolvedValue({ markdown: 'written', structured: { success: true } });
const toolSet = buildWuToolSet({
stagedDir: '/tmp/staged',
wu: { unitKey: 'u1', rawFiles: ['a.yml'], peerFileIndex: [], dependencyPaths: [] },
loadSkillTool: { load_skill: { description: 'load', inputSchema: {} as any, execute: vi.fn() } } as any,
emitUnmappedFallbackTool: {
emit_unmapped_fallback: { description: 'fallback', inputSchema: {} as any, execute: vi.fn() },
} as any,
toolsetTools: { wiki_write: { description: 'write', inputSchema: {} as any, execute: wikiWrite } as any },
});
// First attempt: no ledger entry yet, so the wrapped tool must not be invoked.
const correction = await toolSet.wiki_write.execute?.({ key: 'customer-rules' }, { toolCallId: 't1' } as any);
expect(wikiWrite).not.toHaveBeenCalled();
expect(correction).toMatchObject({ structured: { success: false, reason: 'verification_ledger_required' } });
// The corrective message must tell the model which tool to call.
expect(String((correction as any).markdown)).toContain('record_verification_ledger');
// An entry with empty identifier lists is sufficient to open the gate.
await toolSet.record_verification_ledger.execute?.(
{
summary: 'No warehouse identifiers will be emitted in this wiki write.',
verifiedIdentifiers: [],
unverifiedIdentifiers: [],
},
{ toolCallId: 't2' } as any,
);
// Retry: the call now passes through to the underlying tool exactly once.
const written = await toolSet.wiki_write.execute?.({ key: 'customer-rules' }, { toolCallId: 't3' } as any);
expect(wikiWrite).toHaveBeenCalledTimes(1);
expect(written).toMatchObject({ structured: { success: true } });
});
it('includes looker_query_to_sl only for Looker WorkUnits', () => {
const toolSet = buildWuToolSet({
sourceKey: 'looker',
@ -93,6 +126,7 @@ describe('buildWuToolSet', () => {
'looker_query_to_sl',
'read_raw_file',
'read_raw_span',
'record_verification_ledger',
'sl_write_source',
'wiki_search',
].sort(),

View file

@ -4,6 +4,11 @@ import { createLookerQueryToSlTool } from '../adapters/looker/tools/looker-query
import type { IngestProvenanceRow } from '../ports.js';
import { createReadRawFileTool } from '../tools/read-raw-file.tool.js';
import { createReadRawSpanTool } from '../tools/read-raw-span.tool.js';
import {
createVerificationLedgerState,
VERIFICATION_LEDGER_PROMPT,
withVerificationLedger,
} from '../tools/verification-ledger.tool.js';
import type { WorkUnit } from '../types.js';
const PEER_FILE_INDEX_PROMPT_LIMIT = 100;
@ -24,6 +29,7 @@ export function buildWuSystemPrompt(params: {
}): string {
const parts = [
params.baseFraming.trimEnd(),
VERIFICATION_LEDGER_PROMPT,
params.skillsPrompt.trimEnd(),
buildCanonicalPinsPromptBlock(params.canonicalPins ?? []),
`\n<context>\nsyncId: ${params.syncId}\nsource: ${params.sourceKey}\n</context>`,
@ -100,15 +106,19 @@ function withoutWriteSlTools(toolset: ToolSet, wu: WorkUnit): ToolSet {
/**
 * Assemble the tool set handed to a single WorkUnit agent loop.
 *
 * The merged set contains the shared toolset tools, `looker_query_to_sl` (only when
 * `input.sourceKey` is `'looker'`), the skill-loading and unmapped-fallback tools, and raw-file
 * readers restricted to the WorkUnit's own raw files and dependency paths. Later spreads win
 * on a name clash.
 *
 * The set is filtered by `withoutWriteSlTools` (defined elsewhere in this file) and then passed
 * through `withVerificationLedger`, which adds `record_verification_ledger` and gates the
 * write-capable tools on it.
 *
 * @param input - WorkUnit, staged directory, and the tool maps to merge.
 * @returns The filtered, ledger-guarded tool set.
 */
export function buildWuToolSet(input: BuildWuToolSetInput): ToolSet {
  // Raw readers may only touch files belonging to this WorkUnit or its declared dependencies.
  const allowedPaths = new Set<string>([...input.wu.rawFiles, ...input.wu.dependencyPaths]);
  const lookerTools: ToolSet = input.sourceKey === 'looker' ? { looker_query_to_sl: createLookerQueryToSlTool() } : {};
  // One ledger state per WorkUnit tool set: each unit must record its own verification.
  const state = createVerificationLedgerState();
  return withVerificationLedger(
    withoutWriteSlTools(
      {
        ...input.toolsetTools,
        ...lookerTools,
        ...input.loadSkillTool,
        ...input.emitUnmappedFallbackTool,
        read_raw_file: createReadRawFileTool({ stagedDir: input.stagedDir, allowedPaths }),
        read_raw_span: createReadRawSpanTool({ stagedDir: input.stagedDir, allowedPaths }),
      },
      input.wu,
    ),
    state,
  );
}

View file

@ -64,7 +64,7 @@ export function createEmitUnmappedFallbackTool(deps: EmitUnmappedFallbackDeps) {
tableRef: z
.string()
.optional()
.describe('The fully-qualified table or source reference that triggered the fallback (e.g. "orbit_analytics.customer"). Used to generate canonical detail text.'),
.describe('The fully-qualified table or source reference that triggered the fallback (e.g. "<schema>.<table>"). Used to generate canonical detail text.'),
clarification: z
.string()
.optional()

View file

@ -36,6 +36,28 @@ describe('tool transcript summaries', () => {
expect(summary.fatalErrorCount).toBe(0);
});
// Records a failed call with the nested key 'historic-sql/top-accounts-by-contract-arr' followed
// by a successful call with the flat key 'historic-sql-top-accounts-by-contract-arr', and expects
// the failure to be counted as an error but not as a fatal one.
it('treats a suggested flat wiki key retry as recovery for an invalid nested key', () => {
const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl');
// NOTE(review): `entry` is a helper defined outside this hunk; presumably it builds a wiki tool-call log entry — confirm.
recordToolTranscriptEntry(
summary,
entry({
input: { key: 'historic-sql/top-accounts-by-contract-arr' },
output: { structured: { success: false, key: 'historic-sql/top-accounts-by-contract-arr' } },
}),
);
recordToolTranscriptEntry(
summary,
entry({
input: { key: 'historic-sql-top-accounts-by-contract-arr' },
output: { structured: { success: true, key: 'historic-sql-top-accounts-by-contract-arr' } },
}),
);
// The failed attempt still counts once, but the later success means it is not fatal.
expect(summary.errorCount).toBe(1);
expect(summary.fatalErrorCount).toBe(0);
});
it('counts unrecovered wiki_remove structured failures as fatal transcript errors', () => {
const summary = createMutableToolTranscriptSummary('reconcile', '/tmp/reconcile.jsonl');

View file

@ -1,4 +1,5 @@
import type { ToolCallLogEntry } from './tool-call-logger.js';
import { isFlatWikiKey, suggestFlatWikiKey } from '../../wiki/keys.js';
export interface MutableToolTranscriptSummary {
unitKey: string;
@ -112,7 +113,10 @@ function structuredSuccess(output: unknown): boolean | null {
/**
 * Derive the target key used to correlate wiki tool calls in a transcript.
 *
 * The key is read from the call's structured output first and falls back to the call input.
 * Non-flat keys are mapped to their suggested flat form, so a failed call with a nested key
 * and a retry using the suggested flat key resolve to the same target.
 *
 * @param entry - The logged tool call.
 * @returns `wiki:<flat key>`, or `null` when neither output nor input carries a key.
 */
function wikiTargetKey(entry: ToolCallLogEntry): string | null {
  const key = stringField(recordField(entry.output, 'structured'), 'key') ?? stringField(entry.input, 'key');
  if (!key) {
    return null;
  }
  // Normalize before prefixing so nested and flat spellings of the same page share one target.
  return `wiki:${isFlatWikiKey(key) ? key : suggestFlatWikiKey(key)}`;
}
function slTargetKey(entry: ToolCallLogEntry): string | null {

View file

@ -0,0 +1,97 @@
import { tool, type ToolExecuteFunction, type ToolExecutionOptions, type ToolSet } from 'ai';
import { z } from 'zod';
// Input schema for the record_verification_ledger tool:
// - summary: required, 1 to 2000 characters.
// - verifiedIdentifiers / unverifiedIdentifiers: at most 100 non-empty strings each, defaulting to [].
// - notes: optional, at most 2000 characters.
const verificationLedgerInputSchema = z.object({
summary: z.string().min(1).max(2000),
verifiedIdentifiers: z.array(z.string().min(1)).max(100).default([]),
unverifiedIdentifiers: z.array(z.string().min(1)).max(100).default([]),
notes: z.string().max(2000).optional(),
});
/**
 * One recorded ledger checkpoint, as stored in `VerificationLedgerState.entries`.
 * Mirrors the fields of the record_verification_ledger input schema.
 */
export interface VerificationLedgerEntry {
summary: string;
verifiedIdentifiers: string[];
unverifiedIdentifiers: string[];
notes?: string;
}
/**
 * Mutable ledger shared between the recording tool and the guarded write tools of one tool set.
 * Write tools are allowed to run once `entries` is non-empty.
 */
export interface VerificationLedgerState {
entries: VerificationLedgerEntry[];
}
// Names of the tools that withVerificationLedger gates behind a recorded ledger entry.
// Tools with any other name are passed through unchanged.
const WRITE_TOOL_NAMES = new Set([
'wiki_write',
'wiki_remove',
'sl_write_source',
'sl_edit_source',
'emit_unmapped_fallback',
]);
/**
 * Prompt block instructing the model to call record_verification_ledger before any gated write
 * tool and to retry a write that returned `verification_ledger_required`.
 * The tool names listed in the text correspond to the entries of WRITE_TOOL_NAMES.
 */
export const VERIFICATION_LEDGER_PROMPT = `<pre_write_verification>
Before any write-capable tool call (wiki_write, wiki_remove, sl_write_source, sl_edit_source, emit_unmapped_fallback), call record_verification_ledger.
The ledger is a model-authored checkpoint, not a deterministic parser gate. Summarize the verification protocol from the loaded skill, list identifiers verified with discover_data/entity_details/sql_execution, and list anything intentionally left unverified. If the write contains no warehouse identifiers, say that explicitly.
If a write tool returns verification_ledger_required, complete the ledger and retry the write.
</pre_write_verification>`;
/**
 * Create a new, empty verification ledger.
 *
 * Every call returns an independent object with its own `entries` array, so separate tool
 * sets never share recorded entries.
 *
 * @returns A ledger state with no entries.
 */
export function createVerificationLedgerState(): VerificationLedgerState {
  const entries: VerificationLedgerState['entries'] = [];
  return { entries };
}
/**
 * Return a copy of `tools` in which write-capable tools are gated on the verification ledger.
 *
 * A tool is gated when its name is in `WRITE_TOOL_NAMES` and it has an `execute` function.
 * While `state.entries` is empty, a gated tool returns the `verification_ledger_required`
 * payload instead of running. Once at least one entry exists, calls are forwarded unchanged.
 * All other tools are copied over by reference.
 *
 * A `record_verification_ledger` tool bound to `state` is always installed, replacing any
 * tool of that name in the input.
 *
 * @param tools - The tool set to guard; it is not mutated.
 * @param state - Ledger shared by the recording tool and the guards.
 * @returns A new tool set containing the guarded tools plus `record_verification_ledger`.
 */
export function withVerificationLedger(tools: ToolSet, state: VerificationLedgerState): ToolSet {
  const guardedTools: ToolSet = {};
  for (const [toolName, candidate] of Object.entries(tools)) {
    if (WRITE_TOOL_NAMES.has(toolName) && typeof candidate.execute === 'function') {
      const passThrough = candidate.execute as ToolExecuteFunction<unknown, unknown>;
      // The ledger is consulted at call time, so an entry recorded later opens the gate.
      const gated: ToolExecuteFunction<unknown, unknown> = async (
        input: unknown,
        opts: ToolExecutionOptions,
      ) => (state.entries.length > 0 ? passThrough(input, opts) : verificationRequiredOutput(toolName));
      guardedTools[toolName] = { ...candidate, execute: gated };
    } else {
      guardedTools[toolName] = candidate;
    }
  }
  guardedTools.record_verification_ledger = createRecordVerificationLedgerTool(state);
  return guardedTools;
}
/**
 * Builds the `record_verification_ledger` tool. Each call validates the input against
 * the ledger schema, appends it to `state.entries`, and echoes what was recorded.
 *
 * @param state - Ledger state the tool appends to.
 */
function createRecordVerificationLedgerTool(state: VerificationLedgerState) {
  // Renders an identifier list, or a placeholder when it is empty.
  const listOrNone = (identifiers: string[]): string =>
    identifiers.length ? identifiers.join(', ') : '(none)';

  return tool({
    description:
      'Record the pre-write verification ledger required by loaded ingest skills. Call this before wiki/SL/fallback writes to state what was verified, which tool calls support it, and what remains intentionally unverified.',
    inputSchema: verificationLedgerInputSchema,
    execute: async (input) => {
      const entry = verificationLedgerInputSchema.parse(input);
      state.entries.push(entry);

      const lines = [
        `Verification ledger recorded. Summary: ${entry.summary}`,
        `Verified identifiers: ${listOrNone(entry.verifiedIdentifiers)}`,
        `Unverified identifiers: ${listOrNone(entry.unverifiedIdentifiers)}`,
      ];
      return {
        markdown: lines.join('\n'),
        structured: { success: true, entry },
      };
    },
  });
}
/**
 * Builds the refusal payload returned by a guarded write tool when no ledger entry
 * has been recorded yet.
 *
 * @param toolName - Name of the write tool that was blocked.
 * @returns Model-facing instructions plus a structured `verification_ledger_required` marker.
 */
function verificationRequiredOutput(toolName: string) {
  const sentences = [
    `Pre-write verification required before calling ${toolName}. `,
    'Call record_verification_ledger first. In the ledger, summarize the loaded skill protocol you followed, ',
    'list identifiers verified via discover_data/entity_details/sql_execution, and list any identifiers intentionally left unverified. ',
    'If the write contains no warehouse identifiers, say that explicitly in the ledger summary.',
  ];
  const structured = {
    success: false,
    reason: 'verification_ledger_required',
    toolName,
  };
  return { markdown: sentences.join(''), structured };
}

View file

@ -0,0 +1,119 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import type { BaseTool, ToolContext } from '../../../tools/index.js';
import { DiscoverDataTool } from './discover-data.tool.js';
import type { WarehouseCatalogService } from './warehouse-catalog.service.js';
// Unit tests for DiscoverDataTool. The wiki search tool, the SL discover tool, and the
// warehouse catalog are all replaced by vi.fn() mocks, so only the tool's own routing,
// scoping, and markdown assembly are exercised.
describe('DiscoverDataTool', () => {
const wikiSearchTool = { call: vi.fn() } as unknown as BaseTool & { call: ReturnType<typeof vi.fn> };
const slDiscoverTool = { call: vi.fn() } as unknown as BaseTool & { call: ReturnType<typeof vi.fn> };
const catalog = { searchByName: vi.fn() } as unknown as WarehouseCatalogService & {
searchByName: ReturnType<typeof vi.fn>;
};
// Default context: only the "warehouse" connection is in scope.
const context: ToolContext = {
sourceId: 'ingest',
messageId: 'm1',
userId: 'system',
session: { allowedConnectionNames: new Set(['warehouse']) } as any,
};
const tool = new DiscoverDataTool({
wikiSearchTool,
slDiscoverTool,
catalogFactory: () => catalog,
});
// Reset the mocks and make every backend return exactly one "orders" hit by default.
beforeEach(() => {
wikiSearchTool.call.mockReset();
slDiscoverTool.call.mockReset();
catalog.searchByName.mockReset();
wikiSearchTool.call.mockResolvedValue({
markdown: '- orders wiki',
structured: { totalFound: 1, results: [{ key: 'orders' }] },
});
slDiscoverTool.call.mockResolvedValue({
markdown: '- orders source',
structured: { totalSources: 1, sources: [{ sourceName: 'orders' }] },
});
catalog.searchByName.mockResolvedValue([
{
kind: 'table',
connectionName: 'warehouse',
ref: { catalog: null, db: 'public', name: 'orders' },
display: 'public.orders',
matchedOn: 'name',
},
]);
});
it('groups wiki, semantic layer, and raw schema hits with routing hints', async () => {
const result = await tool.call({ query: 'orders', connectionName: 'warehouse', limit: 5 }, context);
expect(result.markdown).toContain('## Wiki Pages');
expect(result.markdown).toContain('use `wiki_read(blockKey)` for full content');
expect(result.markdown).toContain('## Semantic Layer Sources');
expect(result.markdown).toContain('use `sl_read_source(sourceName)` for the YAML');
expect(result.markdown).toContain('## Raw Warehouse Schema');
expect(result.markdown).toContain('use `entity_details({connectionName, targets: [{display}]})`');
expect(result.structured.raw?.hits).toHaveLength(1);
});
it('includes connectionName on raw schema hits so entity_details can follow up', async () => {
// No explicit connectionName: the tool is expected to search every allowed connection, in sorted order.
const multiConnectionContext: ToolContext = {
...context,
session: { allowedConnectionNames: new Set(['warehouse', 'analytics']) } as any,
};
catalog.searchByName.mockImplementation(async (connectionName: string, query: string) => [
{
kind: 'table',
connectionName,
ref: { catalog: null, db: 'public', name: `${connectionName}_${query}` },
display: `public.${connectionName}_${query}`,
matchedOn: 'name',
},
]);
const result = await tool.call({ query: 'orders', limit: 10 }, multiConnectionContext);
expect(catalog.searchByName).toHaveBeenCalledWith('analytics', 'orders', 10);
expect(catalog.searchByName).toHaveBeenCalledWith('warehouse', 'orders', 10);
expect(result.markdown).toContain('connectionName=analytics');
expect(result.markdown).toContain('connectionName=warehouse');
expect(result.markdown).toContain(
'entity_details({connectionName: "analytics", targets: [{display: "public.analytics_orders"}]})',
);
expect(result.structured.raw?.hits.map((hit) => hit.connectionName)).toEqual(['analytics', 'warehouse']);
});
it('refuses explicit out-of-scope connection names', async () => {
// The refusal must happen before any backend is consulted.
const result = await tool.call({ query: 'orders', connectionName: 'billing' }, context);
expect(result.markdown).toContain('Connection "billing" is not available to this ingest stage.');
expect(result.structured).toEqual({ wiki: null, sl: null, raw: null });
expect(wikiSearchTool.call).not.toHaveBeenCalled();
expect(slDiscoverTool.call).not.toHaveBeenCalled();
expect(catalog.searchByName).not.toHaveBeenCalled();
});
it('delegates sourceName inspect mode to sl_discover only', async () => {
slDiscoverTool.call.mockResolvedValueOnce({
markdown: 'source detail',
structured: { sourceName: 'orders' },
});
const result = await tool.call({ sourceName: 'orders', connectionName: 'warehouse' }, context);
// connectionName is forwarded to the SL tool under the key connectionId.
expect(slDiscoverTool.call).toHaveBeenCalledWith({ sourceName: 'orders', connectionId: 'warehouse' }, context);
expect(wikiSearchTool.call).not.toHaveBeenCalled();
expect(catalog.searchByName).not.toHaveBeenCalled();
expect(result.markdown).toContain('source detail');
});
it('returns the empty-state message when all sections are empty', async () => {
wikiSearchTool.call.mockResolvedValueOnce({ markdown: '', structured: { totalFound: 0, results: [] } });
slDiscoverTool.call.mockResolvedValueOnce({ markdown: '', structured: { totalSources: 0, sources: [] } });
catalog.searchByName.mockResolvedValueOnce([]);
const result = await tool.call({ query: 'customer source', connectionName: 'warehouse' }, context);
expect(result.markdown).toContain('No matches for "customer source" across wiki, semantic layer, or raw warehouse schema.');
});
});

View file

@ -0,0 +1,142 @@
import { z } from 'zod';
import { BaseTool, type ToolContext, type ToolOutput } from '../../../tools/index.js';
import { WarehouseCatalogService, type RawSchemaHit } from './warehouse-catalog.service.js';
// Input schema for discover_data. Every field is optional:
// - connectionName must start with an alphanumeric and then contain only alphanumerics, "_" or "-";
// - limit is a positive integer of at most 50, defaulting to 10;
// - sourceName, when set, switches the tool to its single-source inspect path.
const discoverDataInputSchema = z.object({
query: z.string().optional(),
connectionName: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/).optional(),
limit: z.number().int().positive().max(50).optional().default(10),
sourceName: z.string().optional(),
});
// z.input (not z.infer): this is the pre-parse shape, so `limit` may still be undefined.
type DiscoverDataInput = z.input<typeof discoverDataInputSchema>;
/**
 * Structured payload of discover_data. Each section is null when it produced no hits
 * (or was not queried); `wiki` and `sl` carry the delegated tools' structured output as-is.
 */
export interface DiscoverDataStructured {
wiki: unknown | null;
sl: unknown | null;
raw: { hits: RawSchemaHit[] } | null;
}
/** Collaborators injected into DiscoverDataTool; the catalog is built per call from the tool context. */
interface DiscoverDataDeps {
wikiSearchTool: BaseTool;
slDiscoverTool: BaseTool;
catalogFactory: (context: ToolContext) => WarehouseCatalogService;
}
/**
 * Reads a numeric `totalFound` property from an untyped structured payload.
 *
 * @param structured - Any value; only non-null objects are inspected.
 * @returns The `totalFound` value when it is a number, otherwise 0.
 */
function totalFound(structured: unknown): number {
  if (typeof structured !== 'object' || structured === null) {
    return 0;
  }
  if (!('totalFound' in structured)) {
    return 0;
  }
  const count = structured.totalFound;
  return typeof count === 'number' ? count : 0;
}
/**
 * Reads a numeric `totalSources` property from an untyped structured payload.
 *
 * @param structured - Any value; only non-null objects are inspected.
 * @returns The `totalSources` value when it is a number, otherwise 0.
 */
function totalSources(structured: unknown): number {
  if (typeof structured !== 'object' || structured === null) {
    return 0;
  }
  if (!('totalSources' in structured)) {
    return 0;
  }
  const count = structured.totalSources;
  return typeof count === 'number' ? count : 0;
}
/**
 * Returns the session's allowed connection names, or null when the context has no
 * session or the session does not define the set.
 */
function allowedConnectionNames(context: ToolContext): ReadonlySet<string> | null {
  const names = context.session?.allowedConnectionNames;
  return names ?? null;
}
/**
 * Tool that fans a search out to the wiki search tool, the semantic-layer discover tool,
 * and the raw warehouse catalog, then merges the non-empty sections into one markdown
 * report with follow-up hints.
 */
export class DiscoverDataTool extends BaseTool<typeof discoverDataInputSchema> {
  readonly name = 'discover_data';

  constructor(private readonly deps: DiscoverDataDeps) {
    super();
  }

  get description(): string {
    return 'Discover existing wiki pages, semantic layer sources, and raw warehouse schema hits before writing ingest output.';
  }

  get inputSchema() {
    return discoverDataInputSchema;
  }

  /**
   * Runs discovery.
   *
   * - An explicit `connectionName` outside the session's allowed set is refused before
   *   any backend is called.
   * - With `sourceName`, only the SL discover tool is consulted and its markdown is returned as-is.
   * - Otherwise wiki (only when the trimmed query is non-empty), SL, and raw schema are
   *   searched; raw schema is searched on the explicit connection or, failing that, on
   *   every allowed connection in sorted order.
   *
   * @param input - Unparsed tool input; `limit` falls back to 10 when absent.
   * @param context - Tool context carrying the session scope.
   * @returns Markdown for the model plus the per-section structured payloads.
   */
  async call(input: DiscoverDataInput, context: ToolContext): Promise<ToolOutput<DiscoverDataStructured>> {
    const requestedConnection = input.connectionName;
    const permitted = allowedConnectionNames(context);

    if (requestedConnection && permitted && !permitted.has(requestedConnection)) {
      return {
        markdown: `Connection "${requestedConnection}" is not available to this ingest stage.`,
        structured: { wiki: null, sl: null, raw: null },
      };
    }

    if (input.sourceName) {
      const inspected = await this.deps.slDiscoverTool.call(
        { sourceName: input.sourceName, connectionId: requestedConnection },
        context,
      );
      return { markdown: inspected.markdown, structured: { wiki: null, sl: inspected.structured, raw: null } };
    }

    const searchText = input.query?.trim() || '';
    const maxHits = input.limit ?? 10;
    const sections: string[] = [];
    const structured: DiscoverDataStructured = { wiki: null, sl: null, raw: null };

    // Wiki search is skipped entirely for an empty query.
    if (searchText) {
      const wikiOutcome = await this.deps.wikiSearchTool.call({ query: searchText, limit: maxHits }, context);
      if (totalFound(wikiOutcome.structured) > 0) {
        sections.push('## Wiki Pages', '> use `wiki_read(blockKey)` for full content', wikiOutcome.markdown, '');
        structured.wiki = wikiOutcome.structured;
      }
    }

    const slOutcome = await this.deps.slDiscoverTool.call(
      { query: searchText || undefined, connectionId: requestedConnection },
      context,
    );
    if (totalSources(slOutcome.structured) > 0) {
      sections.push(
        '## Semantic Layer Sources',
        '> use `sl_read_source(sourceName)` for the YAML, or `entity_details` for warehouse-shape details',
        slOutcome.markdown,
        '',
      );
      structured.sl = slOutcome.structured;
    }

    const catalog = this.deps.catalogFactory(context);
    const searchTargets = requestedConnection ? [requestedConnection] : [...(permitted ?? [])].sort();
    const collected: RawSchemaHit[] = [];
    for (const connectionName of searchTargets) {
      const hits = await catalog.searchByName(connectionName, searchText, maxHits);
      collected.push(...hits);
    }

    if (collected.length > 0) {
      // The limit applies to the combined hit list, not per connection.
      const shown = collected.slice(0, maxHits);
      const bullets = shown.map(
        (hit) =>
          `- ${hit.kind}: ${hit.display} [connectionName=${hit.connectionName}] (matched on ${hit.matchedOn}) - ` +
          `follow up with \`entity_details({connectionName: "${hit.connectionName}", targets: [{display: "${hit.display}"}]})\``,
      );
      sections.push(
        '## Raw Warehouse Schema',
        '> use `entity_details({connectionName, targets: [{display}]})` for full DDL + sample values',
        bullets.join('\n'),
      );
      structured.raw = { hits: shown };
    }

    if (sections.length === 0) {
      return {
        markdown: `No matches for "${searchText}" across wiki, semantic layer, or raw warehouse schema. Try broader terms; this concept may not exist yet.`,
        structured,
      };
    }
    return { markdown: sections.join('\n'), structured };
  }
}

View file

@ -0,0 +1,192 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../../../project/index.js';
import type { ToolContext } from '../../../tools/index.js';
import { EntityDetailsTool } from './entity-details.tool.js';
import { WarehouseCatalogService } from './warehouse-catalog.service.js';
// Tests for EntityDetailsTool against a real WarehouseCatalogService backed by a temporary
// project file store. Each test starts from a freshly seeded live-database scan for the
// "warehouse" connection containing a single public.orders table.
describe('EntityDetailsTool', () => {
let tempDir: string;
let project: KtxLocalProject;
let tool: EntityDetailsTool;
let context: ToolContext;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-entity-details-'));
project = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
await seedLiveDatabaseScan();
tool = new EntityDetailsTool(() => new WarehouseCatalogService({ fileStore: project.fileStore }));
context = {
sourceId: 'ingest',
messageId: 'm1',
userId: 'system',
session: {
allowedConnectionNames: new Set(['warehouse']),
} as any,
};
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
// Writes three scan artifacts under raw-sources/<connection>/live-database/<syncId>:
// connection.json, tables/orders.json (columns id and status), and
// enrichment/relationship-profile.json (sample values for orders.status only).
async function seedLiveDatabaseScan(connectionName = 'warehouse', syncId = 'sync-1') {
const root = `raw-sources/${connectionName}/live-database/${syncId}`;
await project.fileStore.writeFile(
`${root}/connection.json`,
JSON.stringify({ connectionId: connectionName, driver: 'postgres', extractedAt: '2026-05-12T00:00:00.000Z' }, null, 2),
'ktx',
'ktx@example.com',
'seed connection',
);
await project.fileStore.writeFile(
`${root}/tables/orders.json`,
JSON.stringify(
{
catalog: null,
db: 'public',
name: 'orders',
kind: 'table',
comment: 'Customer orders',
estimatedRows: 12,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Order id',
},
{
name: 'status',
nativeType: 'text',
normalizedType: 'text',
dimensionType: 'string',
nullable: false,
primaryKey: false,
comment: 'Order status',
},
],
foreignKeys: [],
},
null,
2,
),
'ktx',
'ktx@example.com',
'seed orders',
);
await project.fileStore.writeFile(
`${root}/enrichment/relationship-profile.json`,
JSON.stringify(
{
connectionId: connectionName,
driver: 'postgres',
tables: [{ table: { catalog: null, db: 'public', name: 'orders' }, rowCount: 12 }],
columns: {
'orders.status': {
table: { catalog: null, db: 'public', name: 'orders' },
column: 'status',
rowCount: 12,
nullCount: 0,
distinctCount: 2,
nullRate: 0,
sampleValues: ['paid', 'refunded'],
},
},
},
null,
2,
),
'ktx',
'ktx@example.com',
'seed profile',
);
}
it('returns scoped table detail for a display target', async () => {
const result = await tool.call({ connectionName: 'warehouse', targets: [{ display: 'public.orders' }] }, context);
expect(result.markdown).toContain('### public.orders');
expect(result.markdown).toContain('- status (text, nullable=false)');
expect(result.markdown).toContain('sample: ["paid","refunded"]');
expect(result.structured.scanAvailable).toBe(true);
expect(result.structured.resolved).toHaveLength(1);
});
it('resolves display targets that include a column name', async () => {
const result = await tool.call(
{ connectionName: 'warehouse', targets: [{ display: 'public.orders.status' }] },
context,
);
// A column-scoped target must list only the requested column.
expect(result.markdown).toContain('### public.orders');
expect(result.markdown).toContain('- status (text, nullable=false)');
expect(result.markdown).not.toContain('- id (integer');
expect(result.structured.resolved).toHaveLength(1);
expect(result.structured.resolved[0]?.columns.map((column) => column.name)).toEqual(['status']);
});
it('reports missing explicit columns instead of returning an empty column list', async () => {
const result = await tool.call(
{ connectionName: 'warehouse', targets: [{ display: 'public.orders.plan_tier' }] },
context,
);
expect(result.markdown).toContain('Column not found in scan: public.orders.plan_tier');
expect(result.markdown).toContain('Available columns: id, status');
expect(result.structured.resolved).toHaveLength(0);
expect(result.structured.missing).toHaveLength(1);
});
it('reports missing structured table targets in model-visible markdown', async () => {
const result = await tool.call(
{
connectionName: 'warehouse',
targets: [{ catalog: null, db: 'public', name: 'orderz' }],
},
context,
);
expect(result.markdown).toContain('Not found in scan: public.orderz');
expect(result.markdown).toContain('Closest matches: orders');
expect(result.structured.resolved).toHaveLength(0);
expect(result.structured.missing).toHaveLength(1);
});
it('reports missing structured column targets in model-visible markdown', async () => {
const result = await tool.call(
{
connectionName: 'warehouse',
targets: [{ catalog: null, db: 'public', name: 'orders', column: 'plan_tier' }],
},
context,
);
expect(result.markdown).toContain('Column not found in scan: public.orders.plan_tier');
expect(result.markdown).toContain('Available columns: id, status');
expect(result.structured.resolved).toHaveLength(0);
expect(result.structured.missing).toHaveLength(1);
});
it('returns a no-scan state distinct from not found', async () => {
// "empty" is allowed by the session but has no seeded scan.
const result = await tool.call(
{ connectionName: 'empty', targets: [{ display: 'public.orders' }] },
{ ...context, session: { ...context.session!, allowedConnectionNames: new Set(['empty']) } },
);
expect(result.markdown).toContain('No live-database scan available for connection "empty"; run `ktx scan` first.');
expect(result.structured.scanAvailable).toBe(false);
});
it('refuses out-of-scope connections', async () => {
const result = await tool.call({ connectionName: 'billing', targets: [{ display: 'public.orders' }] }, context);
expect(result.markdown).toContain('Connection "billing" is not available to this ingest stage.');
expect(result.structured.scanAvailable).toBe(false);
});
});

View file

@ -0,0 +1,170 @@
import { z } from 'zod';
import type { KtxTableRef } from '../../../scan/types.js';
import { BaseTool, type ToolContext, type ToolOutput } from '../../../tools/index.js';
import { WarehouseCatalogService, type TableDetail } from './warehouse-catalog.service.js';
// A lookup target is either a free-form display string (e.g. "public.orders" or
// "public.orders.status") or a structured reference with an optional column.
// In the structured form, catalog and db must be present but may be null.
const targetSchema = z.union([
z.object({ display: z.string().min(1) }),
z.object({
catalog: z.string().nullable(),
db: z.string().nullable(),
name: z.string().min(1),
column: z.string().optional(),
}),
]);
// Input schema for entity_details: a required connection name (alphanumeric first
// character, then alphanumerics, "_" or "-") and between 1 and 50 targets.
const entityDetailsInputSchema = z.object({
connectionName: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/),
targets: z.array(targetSchema).min(1).max(50),
});
// Parsed input shape, and the union type of a single target.
type EntityDetailsInput = z.infer<typeof entityDetailsInputSchema>;
type EntityDetailsTarget = EntityDetailsInput['targets'][number];
/**
 * Structured payload of entity_details.
 * - resolved: table details found in the scan (narrowed to one column for column targets).
 * - missing: targets that could not be resolved, each with candidate table refs.
 * - scanAvailable: false when the connection is out of scope or has no live-database scan.
 */
export interface EntityDetailsStructured {
resolved: TableDetail[];
missing: Array<{ target: unknown; candidates: KtxTableRef[] }>;
scanAvailable: boolean;
}
/**
 * Returns the session's allowed connection names, or null when the context has no
 * session or the session does not define the set.
 */
function allowedConnectionNames(context: ToolContext): ReadonlySet<string> | null {
  const names = context.session?.allowedConnectionNames;
  return names ?? null;
}
/**
 * Produces the dotted label for a target: the display string as given, or the
 * non-empty parts of catalog, db, name, and column joined with ".".
 */
function targetLabel(target: EntityDetailsTarget): string {
  if (!('display' in target)) {
    const segments: string[] = [];
    for (const part of [target.catalog, target.db, target.name, target.column]) {
      // Null, undefined, and empty-string parts are left out of the label.
      if (part) {
        segments.push(part);
      }
    }
    return segments.join('.');
  }
  return target.display;
}
/**
 * Appends the model-visible "not found" lines for a target: the miss itself and,
 * when any candidates exist, the names of the closest matching tables.
 *
 * @param parts - Markdown line buffer; mutated in place.
 */
function appendMissingTargetMarkdown(parts: string[], target: EntityDetailsTarget, candidates: KtxTableRef[]): void {
  parts.push(`Not found in scan: ${targetLabel(target)}`);
  if (candidates.length === 0) {
    return;
  }
  const names = candidates.map((candidate) => candidate.name);
  parts.push(`Closest matches: ${names.join(', ')}`);
}
/**
 * Turns a target into a table reference plus fallback candidates.
 *
 * Display targets are resolved by the catalog. Structured targets are passed through
 * as given; the catalog is consulted with the target's dotted label only to obtain
 * candidates in case the table turns out to be missing.
 */
async function resolveTarget(
  catalog: WarehouseCatalogService,
  connectionName: string,
  target: EntityDetailsTarget,
): Promise<{ resolved: (KtxTableRef & { column?: string }) | null; candidates: KtxTableRef[] }> {
  if (!('display' in target)) {
    const { candidates } = await catalog.resolveDisplayTarget(connectionName, targetLabel(target));
    const resolved = {
      catalog: target.catalog,
      db: target.db,
      name: target.name,
      column: target.column,
    };
    return { resolved, candidates };
  }
  return catalog.resolveDisplayTarget(connectionName, target.display);
}
/**
 * Formats up to ten sample values as a ` - sample: [...]` suffix for a column line.
 *
 * @returns The suffix, or an empty string when there are no sample values.
 */
function sampleText(values: string[]): string {
  if (values.length === 0) {
    return '';
  }
  const preview = JSON.stringify(values.slice(0, 10));
  return ` - sample: ${preview}`;
}
/**
 * Appends the markdown section for one table: heading, kind and column count,
 * optional description, and one bullet per listed column.
 *
 * @param parts - Markdown line buffer; mutated in place.
 * @param detail - Table detail to render.
 * @param columnName - When given, only the column with exactly this name is listed.
 */
function appendTableMarkdown(parts: string[], detail: TableDetail, columnName?: string): void {
  const columns = columnName ? detail.columns.filter((column) => column.name === columnName) : detail.columns;
  parts.push(`### ${detail.display}`);
  parts.push(`Type: ${detail.kind} | Native columns: ${detail.columns.length}`);
  // Use `||` for the fallback so an empty-string description falls through to the
  // comment. With `??` the guard passed on the comment but an empty description was
  // still rendered, producing a blank "Description: " line.
  const description = detail.description || detail.comment;
  if (description) {
    parts.push(`Description: ${description}`);
  }
  parts.push('', 'Columns:');
  for (const column of columns) {
    const pk = column.primaryKey ? ', PK' : '';
    parts.push(`- ${column.name} (${column.nativeType}, nullable=${column.nullable}${pk})${sampleText(column.sampleValues)}`);
  }
  parts.push('');
}
/**
 * Finds a column by name, ignoring case.
 *
 * @returns The first matching column, or null when the table has no such column.
 */
function findColumn(detail: TableDetail, columnName: string): TableDetail['columns'][number] | null {
  const wanted = columnName.toLowerCase();
  for (const column of detail.columns) {
    if (column.name.toLowerCase() === wanted) {
      return column;
    }
  }
  return null;
}
/**
 * Tool that looks up tables and columns in the latest live-database scan of a
 * connection and reports, per target, either the table detail or a miss with suggestions.
 */
export class EntityDetailsTool extends BaseTool<typeof entityDetailsInputSchema> {
  readonly name = 'entity_details';

  constructor(private readonly catalogFactory: (context: ToolContext) => WarehouseCatalogService) {
    super();
  }

  get description(): string {
    return 'Verify warehouse tables and columns from the latest live-database scan before writing them into wiki or semantic-layer output.';
  }

  get inputSchema() {
    return entityDetailsInputSchema;
  }

  /**
   * Resolves every target in order.
   *
   * Returns early, with `scanAvailable: false`, when the connection is outside the
   * session's allowed set or has no scan. Otherwise each target ends up either in
   * `resolved` (narrowed to the single requested column for column targets) or in
   * `missing`, and matching markdown is appended for the model.
   */
  async call(input: EntityDetailsInput, context: ToolContext): Promise<ToolOutput<EntityDetailsStructured>> {
    const { connectionName } = input;

    const permitted = allowedConnectionNames(context);
    if (permitted && !permitted.has(connectionName)) {
      return {
        markdown: `Connection "${connectionName}" is not available to this ingest stage.`,
        structured: { resolved: [], missing: [], scanAvailable: false },
      };
    }

    const catalog = this.catalogFactory(context);
    if (!(await catalog.hasScan(connectionName))) {
      return {
        markdown: `No live-database scan available for connection "${connectionName}"; run \`ktx scan\` first.`,
        structured: { resolved: [], missing: [], scanAvailable: false },
      };
    }

    const lines: string[] = [];
    const found: TableDetail[] = [];
    const notFound: EntityDetailsStructured['missing'] = [];

    for (const target of input.targets) {
      const { resolved: ref, candidates } = await resolveTarget(catalog, connectionName, target);
      // The table is fetched only when a reference was resolved.
      const detail = ref ? await catalog.getTable({ connectionName, ...ref }) : null;
      if (!ref || !detail) {
        notFound.push({ target, candidates });
        appendMissingTargetMarkdown(lines, target, candidates);
        continue;
      }

      const wantedColumn = ref.column;
      if (!wantedColumn) {
        found.push(detail);
        appendTableMarkdown(lines, detail);
        continue;
      }

      const match = findColumn(detail, wantedColumn);
      if (match) {
        // Narrow the detail to the one requested column.
        const scoped = { ...detail, columns: [match] };
        found.push(scoped);
        appendTableMarkdown(lines, scoped, match.name);
        continue;
      }

      // Table exists but the column does not: report the owning table as the candidate.
      notFound.push({
        target,
        candidates: [{ catalog: detail.catalog, db: detail.db, name: detail.name }],
      });
      lines.push(`Column not found in scan: ${detail.display}.${wantedColumn}`);
      lines.push(`Available columns: ${detail.columns.map((candidate) => candidate.name).join(', ')}`);
    }

    return {
      markdown: lines.join('\n').trim(),
      structured: { resolved: found, missing: notFound, scanAvailable: true },
    };
  }
}

View file

@ -0,0 +1,34 @@
import type { KtxFileStorePort } from '../../../core/index.js';
import type { SlConnectionCatalogPort } from '../../../sl/index.js';
import type { BaseTool, ToolContext } from '../../../tools/index.js';
import { DiscoverDataTool } from './discover-data.tool.js';
import { EntityDetailsTool } from './entity-details.tool.js';
import { SqlExecutionTool } from './sql-execution.tool.js';
import { WarehouseCatalogService } from './warehouse-catalog.service.js';
export { DiscoverDataTool } from './discover-data.tool.js';
export { EntityDetailsTool } from './entity-details.tool.js';
export { SqlExecutionTool } from './sql-execution.tool.js';
export { WarehouseCatalogService } from './warehouse-catalog.service.js';
export type { RawSchemaHit, TableDetail, WarehouseColumnDetail } from './warehouse-catalog.service.js';
/**
 * Assembles the three warehouse verification tools, in the order
 * entity_details, sql_execution, discover_data.
 *
 * The catalog-backed tools share one factory that builds a WarehouseCatalogService per
 * call, reading from the session's `configService` when present and from
 * `deps.fallbackFileStore` otherwise.
 */
export function createWarehouseVerificationTools(deps: {
  connections: SlConnectionCatalogPort;
  fallbackFileStore: KtxFileStorePort;
  wikiSearchTool: BaseTool;
  slDiscoverTool: BaseTool;
}): BaseTool[] {
  function catalogFactory(context: ToolContext): WarehouseCatalogService {
    const fileStore = context.session?.configService ?? deps.fallbackFileStore;
    return new WarehouseCatalogService({ fileStore });
  }

  const entityDetails = new EntityDetailsTool(catalogFactory);
  const sqlExecution = new SqlExecutionTool(deps.connections);
  const discoverData = new DiscoverDataTool({
    wikiSearchTool: deps.wikiSearchTool,
    slDiscoverTool: deps.slDiscoverTool,
    catalogFactory,
  });
  return [entityDetails, sqlExecution, discoverData];
}

View file

@ -0,0 +1,54 @@
import { describe, expect, it, vi } from 'vitest';
import type { SlConnectionCatalogPort } from '../../../sl/index.js';
import type { ToolContext } from '../../../tools/index.js';
import { SqlExecutionTool } from './sql-execution.tool.js';
// Unit tests for SqlExecutionTool. The connection catalog is a mock exposing only
// executeQuery, so these tests cover SQL gating, limit wrapping, and error reporting.
describe('SqlExecutionTool', () => {
const connections = {
executeQuery: vi.fn(),
} as unknown as SlConnectionCatalogPort & { executeQuery: ReturnType<typeof vi.fn> };
const tool = new SqlExecutionTool(connections);
// Only the "warehouse" connection is in scope.
const context: ToolContext = {
sourceId: 'ingest',
messageId: 'm1',
userId: 'system',
session: { allowedConnectionNames: new Set(['warehouse']) } as any,
};
it('wraps read-only SQL with a capped row limit', async () => {
connections.executeQuery.mockResolvedValue({ headers: ['status'], rows: [['paid']], totalRows: 1 });
const result = await tool.call(
{ connectionName: 'warehouse', sql: 'select status from public.orders', rowLimit: 5 },
context,
);
// The user SQL is expected to be wrapped in an outer select carrying the limit.
expect(connections.executeQuery).toHaveBeenCalledWith(
'warehouse',
'select * from (select status from public.orders) as ktx_query_result limit 5',
);
expect(result.markdown).toContain('| status |');
expect(result.structured.wrappedSql).toContain('limit 5');
});
it.each(['insert into x values (1)', 'drop table x', 'vacuum'])('rejects mutating SQL: %s', async (sql) => {
connections.executeQuery.mockClear();
const result = await tool.call({ connectionName: 'warehouse', sql }, context);
// Rejection must happen before the connection is touched.
expect(result.markdown).toContain('Only read-only SELECT/WITH queries can be executed locally.');
expect(connections.executeQuery).not.toHaveBeenCalled();
});
it('surfaces connector errors verbatim', async () => {
connections.executeQuery.mockRejectedValue(new Error('relation "orbit_analytics.customer" does not exist'));
const result = await tool.call(
{ connectionName: 'warehouse', sql: 'select 1 from orbit_analytics.customer', rowLimit: 1 },
context,
);
expect(result.markdown).toContain('relation "orbit_analytics.customer" does not exist');
expect(result.structured.error).toContain('relation "orbit_analytics.customer" does not exist');
});
});

View file

@ -0,0 +1,102 @@
import { z } from 'zod';
import { assertReadOnlySql, limitSqlForExecution } from '../../../connections/index.js';
import type { SlConnectionCatalogPort } from '../../../sl/index.js';
import { BaseTool, type ToolContext, type ToolOutput } from '../../../tools/index.js';
// Input schema for sql_execution: a connection name (alphanumeric first character, then
// alphanumerics, "_" or "-"), a non-empty SQL string, and an optional positive integer
// rowLimit of at most 1000 that defaults to 100.
const sqlExecutionInputSchema = z.object({
connectionName: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/),
sql: z.string().min(1),
rowLimit: z.number().int().positive().max(1000).optional().default(100),
});
// z.input (not z.infer): this is the pre-parse shape, so `rowLimit` may still be undefined.
type SqlExecutionInput = z.input<typeof sqlExecutionInputSchema>;
/**
 * Structured payload of sql_execution.
 * - headers / rows: the result set; both empty on any failure.
 * - rowCount: the connector's reported total, or rows.length when none is reported.
 * - truncated: true when rowCount exceeds the number of rows returned.
 * - sql: the validated statement, or the raw input when validation did not succeed.
 * - wrappedSql: the statement actually sent, with the row limit applied; '' if none was built.
 * - error: present only on failure.
 */
export interface SqlExecutionStructured {
headers: string[];
rows: unknown[][];
rowCount: number;
truncated: boolean;
sql: string;
wrappedSql: string;
error?: string;
}
/**
 * Renders a query result as a markdown table showing at most the first 20 rows.
 *
 * Cells are made table-safe: `|` is escaped as `\|` and line breaks become spaces, so a
 * value can never split a row or add a column. Arrays and objects whose string form
 * would be the uninformative `[object Object]` are rendered as JSON instead.
 *
 * @param headers - Column names. When empty, no table is drawn: the result is either a
 *   fixed "no rows" message or the first 20 rows as JSON.
 * @param rows - Result rows, one array of cell values per row.
 * @param totalRows - Total row count, used for the "... +N more rows" footer.
 * @returns The markdown text.
 */
function markdownTable(headers: string[], rows: unknown[][], totalRows: number): string {
  if (headers.length === 0) {
    return rows.length === 0 ? 'Query returned no rows.' : JSON.stringify(rows.slice(0, 20));
  }

  const renderCell = (value: unknown): string => {
    let text = String(value ?? '');
    const needsJson =
      typeof value === 'object' && value !== null && (Array.isArray(value) || text === '[object Object]');
    if (needsJson) {
      try {
        text = JSON.stringify(value);
      } catch {
        // Not serializable (e.g. contains a bigint or a cycle): keep the String() form.
      }
    }
    return text.replace(/\|/g, '\\|').replace(/\r\n|\r|\n/g, ' ');
  };

  const visible = rows.slice(0, 20);
  const lines = [
    `| ${headers.map(renderCell).join(' | ')} |`,
    `| ${headers.map(() => '---').join(' | ')} |`,
    ...visible.map((row) => `| ${row.map(renderCell).join(' | ')} |`),
  ];
  if (totalRows > visible.length) {
    lines.push(`... +${totalRows - visible.length} more rows`);
  }
  return lines.join('\n');
}
/**
 * Tool that runs a single read-only SQL probe against an allowed warehouse connection
 * and returns the result as a markdown table, or the failure reason as text.
 */
export class SqlExecutionTool extends BaseTool<typeof sqlExecutionInputSchema> {
  readonly name = 'sql_execution';

  constructor(private readonly connections: SlConnectionCatalogPort) {
    super();
  }

  get description(): string {
    return 'Run a single read-only SELECT or WITH probe against an allowed warehouse connection and return a capped markdown table or the warehouse error.';
  }

  get inputSchema() {
    return sqlExecutionInputSchema;
  }

  /**
   * Validates, wraps, and executes the probe. Failures are returned, never thrown:
   * an out-of-scope connection, SQL rejected by the read-only check or limit wrapper,
   * and connector errors each yield an empty result with `structured.error` set.
   *
   * @param input - Unparsed tool input; `rowLimit` falls back to the schema default of 100.
   * @param context - Tool context carrying the session's allowed connection names.
   */
  async call(input: SqlExecutionInput, context: ToolContext): Promise<ToolOutput<SqlExecutionStructured>> {
    // Shared shape for every failure path.
    const failure = (
      markdown: string,
      error: string,
      sql: string,
      wrappedSql: string,
    ): ToolOutput<SqlExecutionStructured> => ({
      markdown,
      structured: { headers: [], rows: [], rowCount: 0, truncated: false, sql, wrappedSql, error },
    });

    const allowed = context.session?.allowedConnectionNames;
    if (allowed && !allowed.has(input.connectionName)) {
      return failure(
        `Connection "${input.connectionName}" is not available to this ingest stage.`,
        'connection_not_allowed',
        input.sql,
        '',
      );
    }

    // The input type is the pre-parse shape, so rowLimit can be undefined here.
    // Apply the schema's documented default explicitly rather than relying on the helper's.
    const rowLimit = input.rowLimit ?? 100;

    let sql: string;
    let wrappedSql: string;
    try {
      sql = assertReadOnlySql(input.sql);
      wrappedSql = limitSqlForExecution(sql, rowLimit);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return failure(message, message, input.sql, '');
    }

    try {
      const result = await this.connections.executeQuery(input.connectionName, wrappedSql);
      const headers = result.headers ?? [];
      const rows = result.rows ?? [];
      const rowCount = result.totalRows ?? rows.length;
      return {
        markdown: markdownTable(headers, rows, rowCount),
        structured: { headers, rows, rowCount, truncated: rowCount > rows.length, sql, wrappedSql },
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return failure(`SQL execution failed: ${message}`, message, sql, wrappedSql);
    }
  }
}

View file

@ -0,0 +1,196 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKtxProject, type KtxLocalProject } from '../../../project/index.js';
import { WarehouseCatalogService } from './warehouse-catalog.service.js';
describe('WarehouseCatalogService', () => {
let tempDir: string;
let project: KtxLocalProject;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-warehouse-catalog-'));
project = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
async function seedLiveDatabaseScan(connectionName = 'warehouse', syncId = 'sync-2', driver = 'postgres') {
const root = `raw-sources/${connectionName}/live-database/${syncId}`;
const tableRef = {
catalog: driver === 'bigquery' ? 'analytics' : null,
db: driver === 'sqlite' ? null : 'public',
name: 'orders',
};
await project.fileStore.writeFile(
`${root}/connection.json`,
JSON.stringify({ connectionId: connectionName, driver, extractedAt: '2026-05-12T00:00:00.000Z' }, null, 2),
'ktx',
'ktx@example.com',
'seed connection',
);
await project.fileStore.writeFile(
`${root}/tables/orders.json`,
JSON.stringify(
{
catalog: tableRef.catalog,
db: tableRef.db,
name: tableRef.name,
kind: 'table',
comment: 'Customer orders',
estimatedRows: 12,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Order id',
},
{
name: 'status',
nativeType: 'text',
normalizedType: 'text',
dimensionType: 'string',
nullable: false,
primaryKey: false,
comment: 'Order status',
},
],
foreignKeys: [],
},
null,
2,
),
'ktx',
'ktx@example.com',
'seed orders',
);
await project.fileStore.writeFile(
`${root}/enrichment/relationship-profile.json`,
JSON.stringify(
{
connectionId: connectionName,
driver,
sqlAvailable: true,
queryCount: 3,
tables: [{ table: { catalog: tableRef.catalog, db: tableRef.db, name: tableRef.name }, rowCount: 12 }],
columns: {
'orders.status': {
table: { catalog: tableRef.catalog, db: tableRef.db, name: tableRef.name },
column: 'status',
nativeType: 'text',
normalizedType: 'text',
rowCount: 12,
nullCount: 0,
distinctCount: 2,
uniquenessRatio: 0.1667,
nullRate: 0,
sampleValues: ['paid', 'refunded'],
minTextLength: 4,
maxTextLength: 8,
},
},
warnings: [],
},
null,
2,
),
'ktx',
'ktx@example.com',
'seed profile',
);
}
// Seeds two syncs for the same connection and checks that the catalog reports `sync-2`
// and that column details combine schema fields with profile statistics.
it('finds the latest sync and merges table schema with relationship profile values', async () => {
await seedLiveDatabaseScan('warehouse', 'sync-1');
await seedLiveDatabaseScan('warehouse', 'sync-2');
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.getLatestSyncId('warehouse')).resolves.toBe('sync-2');
const detail = await catalog.getTable({ connectionName: 'warehouse', catalog: null, db: 'public', name: 'orders' });
expect(detail).toMatchObject({
connectionName: 'warehouse',
display: 'public.orders',
rowCount: 12,
columns: [
{ name: 'id', nativeType: 'integer', primaryKey: true },
{ name: 'status', nativeType: 'text', sampleValues: ['paid', 'refunded'], distinctCount: 2 },
],
});
});
// Nothing is seeded for `missing`: table lookups resolve to null and `hasScan` reports false.
it('returns scanAvailable=false when no live-database scan exists', async () => {
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.getTable({ connectionName: 'missing', catalog: null, db: 'public', name: 'orders' })).resolves.toBeNull();
await expect(catalog.hasScan('missing')).resolves.toBe(false);
});
// A known `schema.table` display resolves with no candidates; a misspelled table name
// resolves to null and offers the real table as a candidate.
it('resolves postgres display strings and returns closest candidates for missing tables', async () => {
await seedLiveDatabaseScan();
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplay('warehouse', 'public.orders')).resolves.toMatchObject({
resolved: { catalog: null, db: 'public', name: 'orders' },
candidates: [],
dialect: 'postgres',
});
await expect(catalog.resolveDisplay('warehouse', 'public.orderz')).resolves.toMatchObject({
resolved: null,
candidates: [{ name: 'orders' }],
});
});
// For a bigquery scan, a two-part display is not resolved to a table.
it('treats two-part BigQuery identifiers as ambiguous instead of guessing', async () => {
await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplay('warehouse', 'public.orders')).resolves.toMatchObject({
resolved: null,
dialect: 'bigquery',
});
});
// `schema.table.column` resolves to the table ref plus a `column` field.
it('resolves postgres column display strings without treating the column as a table', async () => {
await seedLiveDatabaseScan();
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplayTarget('warehouse', 'public.orders.status')).resolves.toMatchObject({
resolved: { catalog: null, db: 'public', name: 'orders', column: 'status' },
candidates: [],
dialect: 'postgres',
});
});
// For a bigquery scan the column display carries four parts: catalog, schema, table, column.
it('resolves BigQuery column display strings with four parts', async () => {
await seedLiveDatabaseScan('warehouse', 'sync-bigquery', 'bigquery');
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.resolveDisplayTarget('warehouse', 'analytics.public.orders.status')).resolves.toMatchObject({
resolved: { catalog: 'analytics', db: 'public', name: 'orders', column: 'status' },
candidates: [],
dialect: 'bigquery',
});
});
// Searching for `status` must yield a column hit on `public.orders.status` matched by name;
// other hits may be present as well (`arrayContaining`).
it('searches table names, column names, comments, and descriptions', async () => {
await seedLiveDatabaseScan();
const catalog = new WarehouseCatalogService({ fileStore: project.fileStore });
await expect(catalog.searchByName('warehouse', 'status', 10)).resolves.toEqual(
expect.arrayContaining([
expect.objectContaining({
kind: 'column',
ref: expect.objectContaining({ db: 'public', name: 'orders', column: 'status' }),
matchedOn: 'name',
}),
]),
);
});
});

View file

@ -0,0 +1,452 @@
import { getDialectForDriver } from '../../../connections/index.js';
import type { KtxFileStorePort } from '../../../core/index.js';
import type {
KtxConnectionDriver,
KtxSchemaColumn,
KtxSchemaForeignKey,
KtxSchemaTable,
KtxTableRef,
} from '../../../scan/types.js';
/** Driver names accepted in scan artifacts; `'sqlite3'` is handled the same way as `'sqlite'` in this file. */
type CatalogDriver = KtxConnectionDriver | 'sqlite3';
/** Dependencies of {@link WarehouseCatalogService}. */
export interface WarehouseCatalogServiceDeps {
// File store the scan artifacts are listed and read from.
fileStore: KtxFileStorePort;
}
/**
 * A scanned column enriched with relationship-profile statistics.
 * The statistic fields are `null` (and `sampleValues` is empty) when no profile entry was found for the column.
 */
export interface WarehouseColumnDetail extends KtxSchemaColumn {
// Defaults to an empty object when the table artifact carries no `descriptions` for the column.
descriptions: Record<string, string>;
rowCount: number | null;
nullCount: number | null;
distinctCount: number | null;
nullRate: number | null;
// Profile sample values converted with `String(...)`.
sampleValues: string[];
}
/** Table description returned by `WarehouseCatalogService.getTable`. */
export interface TableDetail {
connectionName: string;
catalog: string | null;
db: string | null;
name: string;
// Driver-specific dotted name; for sqlite drivers just the table name.
display: string;
kind: string;
comment: string | null;
// The table's `description`, otherwise the first non-blank entry of its `descriptions`.
description: string | null;
// Profile row count when available, otherwise the scan's `estimatedRows`, otherwise null.
rowCount: number | null;
columns: WarehouseColumnDetail[];
foreignKeys: KtxSchemaForeignKey[];
}
/**
 * One result of `WarehouseCatalogService.searchByName`, discriminated by `kind`.
 * `matchedOn` names the first field in which the query was found.
 */
export type RawSchemaHit =
| {
kind: 'table';
connectionName: string;
ref: KtxTableRef;
display: string;
matchedOn: 'name' | 'db' | 'comment' | 'description';
}
| {
kind: 'column';
connectionName: string;
ref: KtxTableRef & { column: string };
// Table display followed by `.` and the column name.
display: string;
matchedOn: 'name' | 'comment' | 'description';
};
/** Result of resolving a dotted display string to a table or a table column. */
export interface DisplayTargetResolution {
// Matched table ref (with `column` when the display named a column), or null when nothing matched.
resolved: (KtxTableRef & { column?: string }) | null;
// Closest table suggestions; empty when the display resolved or no scan exists.
candidates: KtxTableRef[];
// Dialect type of the scan's driver, or `'unknown'` when no scan exists.
dialect: string;
}
/** Fields read from a scan's `connection.json`; only `driver` is used here. */
interface ConnectionArtifact {
driver?: CatalogDriver;
}
/** Per-column entry of the relationship profile; every field is optional because the artifact is not validated. */
interface RelationshipProfileColumn {
table?: KtxTableRef;
column?: string;
rowCount?: number;
nullCount?: number;
distinctCount?: number;
nullRate?: number;
sampleValues?: unknown[];
}
/** Fields read from `enrichment/relationship-profile.json`. */
interface RelationshipProfileArtifact {
// Used as the driver fallback when `connection.json` has none.
driver?: CatalogDriver;
tables?: Array<{ table?: KtxTableRef; rowCount?: number }>;
// Keyed by a column identifier string such as `orders.status`.
columns?: Record<string, RelationshipProfileColumn>;
}
/** In-memory form of one connection's selected scan, as assembled by `readCatalog`. */
interface ConnectionCatalog {
connectionName: string;
// Last path segment of the scan folder.
syncId: string;
driver: CatalogDriver;
tables: KtxSchemaTable[];
// Null when the relationship profile could not be read or parsed.
profile: RelationshipProfileArtifact | null;
}
/** A scanned table viewed with the optional description fields that search and `getTable` read. */
type TableWithDescriptions = KtxSchemaTable & {
description?: string | null;
descriptions?: Record<string, string>;
columns: Array<KtxSchemaColumn & { description?: string | null; descriptions?: Record<string, string> }>;
};
/** Lower-cases an identifier part; `null` and `undefined` become the empty string. */
function normalize(value: string | null | undefined): string {
  if (value === null || value === undefined) {
    return '';
  }
  return value.toLowerCase();
}
/** Case-insensitive comparison of two table refs; a missing catalog or schema equals an empty one. */
function refsEqual(left: KtxTableRef, right: KtxTableRef): boolean {
  if (normalize(left.catalog) !== normalize(right.catalog)) {
    return false;
  }
  if (normalize(left.db) !== normalize(right.db)) {
    return false;
  }
  return normalize(left.name) === normalize(right.name);
}
/** Builds the normalized `catalog.db.name` key of a table ref (missing parts become empty segments). */
function refKey(ref: KtxTableRef): string {
  const catalogPart = normalize(ref.catalog);
  const schemaPart = normalize(ref.db);
  const namePart = normalize(ref.name);
  return `${catalogPart}.${schemaPart}.${namePart}`;
}
/** Builds the normalized key of a column: the table's `refKey` followed by the lower-cased column name. */
function columnKey(ref: KtxTableRef, column: string): string {
  return [refKey(ref), normalize(column)].join('.');
}
/**
 * Parses JSON text and returns it typed as `T`.
 * The shape is not validated; a syntax error propagates from `JSON.parse`.
 */
function readJson<T>(content: string): T {
  const parsed: unknown = JSON.parse(content);
  return parsed as T;
}
/**
 * Trims an identifier part and strips one leading quote character (`"`, `'`, backtick, `[`)
 * and one trailing quote character (`"`, `'`, backtick, `]`).
 */
function cleanIdentifierPart(part: string): string {
  const trimmed = part.trim();
  const withoutOpeningQuote = trimmed.replace(/^["'`\[]/, '');
  return withoutOpeningQuote.replace(/["'`\]]$/, '');
}
/** Splits a dotted display string into cleaned identifier parts, dropping parts that end up empty. */
function splitDisplay(display: string): string[] {
  const parts: string[] = [];
  for (const rawPart of display.trim().split('.')) {
    const cleaned = cleanIdentifierPart(rawPart);
    if (cleaned) {
      parts.push(cleaned);
    }
  }
  return parts;
}
/**
 * Renders a table ref as a display string: the bare table name for sqlite drivers,
 * otherwise the non-empty parts of catalog, schema and name joined with dots.
 */
function formatDisplay(driver: CatalogDriver, table: KtxTableRef): string {
  const isSqlite = driver === 'sqlite' || driver === 'sqlite3';
  if (isSqlite) {
    return table.name;
  }
  const segments: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    if (part) {
      segments.push(part);
    }
  }
  return segments.join('.');
}
/**
 * Parses a dotted table display string into a table ref, by driver:
 * - sqlite drivers accept exactly one part (the table name);
 * - bigquery, snowflake and sqlserver accept exactly three parts (catalog, schema, table);
 * - every other driver accepts one, two (schema, table) or three parts.
 * Returns null for any other part count.
 */
function parseDisplay(driver: CatalogDriver, display: string): KtxTableRef | null {
  const parts = splitDisplay(display);
  const nameOnly = (): KtxTableRef => ({ catalog: null, db: null, name: parts[0]! });
  const fullyQualified = (): KtxTableRef => ({ catalog: parts[0]!, db: parts[1]!, name: parts[2]! });

  switch (driver) {
    case 'sqlite':
    case 'sqlite3':
      return parts.length === 1 ? nameOnly() : null;
    case 'bigquery':
    case 'snowflake':
    case 'sqlserver':
      return parts.length === 3 ? fullyQualified() : null;
    default:
      break;
  }

  switch (parts.length) {
    case 1:
      return nameOnly();
    case 2:
      return { catalog: null, db: parts[0]!, name: parts[1]! };
    case 3:
      return fullyQualified();
    default:
      return null;
  }
}
/** Number of dotted parts a table display is expected to have for the driver (1, 3 or 2). */
function expectedDisplayPartCount(driver: CatalogDriver): number {
  switch (driver) {
    case 'sqlite':
    case 'sqlite3':
      return 1;
    case 'bigquery':
    case 'snowflake':
    case 'sqlserver':
      return 3;
    default:
      return 2;
  }
}
/**
 * Parses a column display string: the driver's expected table parts followed by one column part.
 * Returns null when the part count differs or the table portion does not parse.
 */
function parseColumnDisplay(driver: CatalogDriver, display: string): (KtxTableRef & { column: string }) | null {
  const parts = splitDisplay(display);
  if (parts.length !== expectedDisplayPartCount(driver) + 1) {
    return null;
  }
  const column = parts.at(-1);
  if (!column) {
    return null;
  }
  // The table portion is re-joined and parsed again so it follows the same per-driver rules.
  const tableDisplay = parts.slice(0, -1).join('.');
  const table = parseDisplay(driver, tableDisplay);
  if (!table) {
    return null;
  }
  return { ...table, column };
}
/**
 * Suggests up to `limit` tables whose names resemble the last part of `display`.
 *
 * Scoring: 100 for an equal name, 80 when either name contains the other, otherwise the
 * fraction of positions holding the same character. Tables scoring 0 are dropped; ties are
 * ordered by table name.
 */
function bestCandidates(tables: KtxSchemaTable[], display: string, limit = 5): KtxTableRef[] {
  const needle = normalize(splitDisplay(display).at(-1) ?? display);

  const scoreOf = (tableName: string): number => {
    const name = normalize(tableName);
    if (name === needle) {
      return 100;
    }
    if (name.includes(needle) || needle.includes(name)) {
      return 80;
    }
    let matchingPositions = 0;
    let index = 0;
    for (const char of name) {
      if (needle[index] === char) {
        matchingPositions += 1;
      }
      index += 1;
    }
    return matchingPositions / Math.max(name.length, needle.length, 1);
  };

  const ranked: Array<{ table: KtxSchemaTable; score: number }> = [];
  for (const table of tables) {
    const score = scoreOf(table.name);
    if (score > 0) {
      ranked.push({ table, score });
    }
  }
  ranked.sort((left, right) => right.score - left.score || left.table.name.localeCompare(right.table.name));

  return ranked.slice(0, limit).map(({ table }) => ({ catalog: table.catalog, db: table.db, name: table.name }));
}
/** Returns the first description whose trimmed text is non-empty, or null when there is none. */
function firstDescription(descriptions: Record<string, string> | undefined): string | null {
  if (!descriptions) {
    return null;
  }
  for (const text of Object.values(descriptions)) {
    if (text.trim().length > 0) {
      return text;
    }
  }
  return null;
}
/**
 * Reports which table field contains the query (case-insensitive substring match),
 * checking name, schema, comment and description in that order. Returns null for an empty
 * query or when nothing matches.
 */
function matchedOnTable(table: TableWithDescriptions, query: string): RawSchemaHit['matchedOn'] | null {
  const q = normalize(query);
  if (!q) {
    return null;
  }
  // Each text is produced lazily so later fields are only read when earlier ones miss.
  const probes: Array<[RawSchemaHit['matchedOn'], () => string | null | undefined]> = [
    ['name', () => table.name],
    ['db', () => table.db],
    ['comment', () => table.comment],
    ['description', () => firstDescription(table.descriptions) ?? table.description],
  ];
  for (const [field, readText] of probes) {
    if (normalize(readText()).includes(q)) {
      return field;
    }
  }
  return null;
}
/**
 * Reports which column field contains the query (case-insensitive substring match),
 * checking name, comment and description in that order. Returns null for an empty query
 * or when nothing matches.
 */
function matchedOnColumn(
  column: KtxSchemaColumn & { description?: string | null; descriptions?: Record<string, string> },
  query: string,
): 'name' | 'comment' | 'description' | null {
  const q = normalize(query);
  if (!q) {
    return null;
  }
  // Each text is produced lazily so later fields are only read when earlier ones miss.
  const probes: Array<['name' | 'comment' | 'description', () => string | null | undefined]> = [
    ['name', () => column.name],
    ['comment', () => column.comment],
    ['description', () => firstDescription(column.descriptions) ?? column.description],
  ];
  for (const [field, readText] of probes) {
    if (normalize(readText()).includes(q)) {
      return field;
    }
  }
  return null;
}
/**
 * Read-only catalog over the live-database scan artifacts stored under
 * `raw-sources/<connection>/live-database/<syncId>/`.
 *
 * For each connection, the scan whose `connection.json` path sorts last is loaded on first
 * use and the load is cached, so repeated lookups do not re-read the file store. A failed
 * load is not cached: the next call reads the file store again.
 */
export class WarehouseCatalogService {
  /** Catalog loads keyed by connection name; a resolved `null` means "no scan found". */
  private readonly catalogs = new Map<string, Promise<ConnectionCatalog | null>>();

  constructor(private readonly deps: WarehouseCatalogServiceDeps) {}

  /** Resolves to true when a scan exists for the connection. */
  async hasScan(connectionName: string): Promise<boolean> {
    return (await this.loadCatalog(connectionName)) !== null;
  }

  /** Resolves to the sync id of the loaded scan, or null when no scan exists. */
  async getLatestSyncId(connectionName: string): Promise<string | null> {
    return (await this.loadCatalog(connectionName))?.syncId ?? null;
  }

  /** Lists the refs of every scanned table; empty when no scan exists. */
  async listTables(connectionName: string): Promise<KtxTableRef[]> {
    const catalog = await this.loadCatalog(connectionName);
    return catalog?.tables.map((table) => ({ catalog: table.catalog, db: table.db, name: table.name })) ?? [];
  }

  /**
   * Returns the scanned table matching `ref` (case-insensitive), merged with relationship-profile
   * statistics, or null when the connection has no scan or the table is not in it.
   */
  async getTable(ref: { connectionName: string } & KtxTableRef): Promise<TableDetail | null> {
    const catalog = await this.loadCatalog(ref.connectionName);
    if (!catalog) {
      return null;
    }
    const table = catalog.tables.find((candidate) => refsEqual(candidate, ref)) as TableWithDescriptions | undefined;
    if (!table) {
      return null;
    }
    const profileTables = catalog.profile?.tables ?? [];
    const profileTable = profileTables.find((candidate) => candidate.table && refsEqual(candidate.table, table));
    const profileColumns = catalog.profile?.columns ?? {};
    return {
      connectionName: ref.connectionName,
      catalog: table.catalog,
      db: table.db,
      name: table.name,
      display: formatDisplay(catalog.driver, table),
      kind: table.kind,
      comment: table.comment,
      description: table.description ?? firstDescription(table.descriptions),
      // Prefer the profiled row count over the scan's estimate.
      rowCount: profileTable?.rowCount ?? table.estimatedRows ?? null,
      columns: table.columns.map((column) => {
        const profileColumn = this.findProfileColumn(profileColumns, table, column.name);
        return {
          ...column,
          descriptions: column.descriptions ?? {},
          rowCount: profileColumn?.rowCount ?? null,
          nullCount: profileColumn?.nullCount ?? null,
          distinctCount: profileColumn?.distinctCount ?? null,
          nullRate: profileColumn?.nullRate ?? null,
          sampleValues: (profileColumn?.sampleValues ?? []).map((value) => String(value)),
        };
      }),
      foreignKeys: table.foreignKeys,
    };
  }

  /**
   * Resolves a table display string against the scan.
   *
   * @returns `resolved` with the scanned table's ref on a match; otherwise `resolved: null` with
   * name-based `candidates`. When no scan exists, `dialect` is `'unknown'` and `candidates` is empty.
   */
  async resolveDisplay(
    connectionName: string,
    display: string,
  ): Promise<{
    resolved: KtxTableRef | null;
    candidates: KtxTableRef[];
    dialect: string;
  }> {
    const catalog = await this.loadCatalog(connectionName);
    if (!catalog) {
      return { resolved: null, candidates: [], dialect: 'unknown' };
    }
    const dialect = getDialectForDriver(catalog.driver).type;
    const parsed = parseDisplay(catalog.driver, display);
    const table = parsed ? catalog.tables.find((candidate) => refsEqual(candidate, parsed)) : undefined;
    if (!table) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }
    return { resolved: { catalog: table.catalog, db: table.db, name: table.name }, candidates: [], dialect };
  }

  /**
   * Resolves a display string that may name a table or a table column.
   *
   * The string is tried as a table first; only when that fails is it parsed as
   * `<table parts>.<column>`. The returned `column` is the text taken from `display`;
   * this method does not check that the table has such a column.
   */
  async resolveDisplayTarget(connectionName: string, display: string): Promise<DisplayTargetResolution> {
    const catalog = await this.loadCatalog(connectionName);
    if (!catalog) {
      return { resolved: null, candidates: [], dialect: 'unknown' };
    }
    const dialect = getDialectForDriver(catalog.driver).type;
    const tableResolution = await this.resolveDisplay(connectionName, display);
    if (tableResolution.resolved) {
      return tableResolution;
    }
    const parsedColumn = parseColumnDisplay(catalog.driver, display);
    const table = parsedColumn ? catalog.tables.find((candidate) => refsEqual(candidate, parsedColumn)) : undefined;
    if (!parsedColumn || !table) {
      return { resolved: null, candidates: bestCandidates(catalog.tables, display), dialect };
    }
    return {
      resolved: {
        catalog: table.catalog,
        db: table.db,
        name: table.name,
        column: parsedColumn.column,
      },
      candidates: [],
      dialect,
    };
  }

  /**
   * Searches scanned tables and columns for `query` (case-insensitive substring).
   * Hits are produced table by table (a table hit first, then its column hits) and
   * truncated to `limit`; a negative limit yields no hits.
   */
  async searchByName(connectionName: string, query: string, limit: number): Promise<RawSchemaHit[]> {
    const catalog = await this.loadCatalog(connectionName);
    if (!catalog) {
      return [];
    }
    const hits: RawSchemaHit[] = [];
    for (const table of catalog.tables as TableWithDescriptions[]) {
      const tableDisplay = formatDisplay(catalog.driver, table);
      const tableMatch = matchedOnTable(table, query);
      if (tableMatch) {
        hits.push({
          kind: 'table',
          connectionName,
          ref: { catalog: table.catalog, db: table.db, name: table.name },
          display: tableDisplay,
          matchedOn: tableMatch,
        });
      }
      for (const column of table.columns) {
        const columnMatch = matchedOnColumn(column, query);
        if (!columnMatch) {
          continue;
        }
        hits.push({
          kind: 'column',
          connectionName,
          ref: { catalog: table.catalog, db: table.db, name: table.name, column: column.name },
          display: `${tableDisplay}.${column.name}`,
          matchedOn: columnMatch,
        });
      }
    }
    return hits.slice(0, Math.max(0, limit));
  }

  /**
   * Finds the profile entry for one column of `table`.
   *
   * Lookup order: the fully qualified normalized key, then the first entry that either is keyed
   * `<table>.<column>` or carries a matching `table`/`column` pair. An entry whose own `table`
   * ref names a different table is never used, so same-named tables in different schemas or
   * catalogs do not receive each other's statistics.
   */
  private findProfileColumn(
    profileColumns: Record<string, RelationshipProfileColumn>,
    table: KtxTableRef,
    columnName: string,
  ): RelationshipProfileColumn | undefined {
    const qualified = profileColumns[columnKey(table, columnName)];
    if (qualified) {
      return qualified;
    }
    const wantedColumn = normalize(columnName);
    const shortKey = `${normalize(table.name)}.${wantedColumn}`;
    for (const [key, value] of Object.entries(profileColumns)) {
      const ownsTable = value.table ? refsEqual(value.table, table) : undefined;
      if (ownsTable === false) {
        // The entry explicitly belongs to another table, whatever its key looks like.
        continue;
      }
      if (normalize(key) === shortKey) {
        return value;
      }
      if (ownsTable === true && normalize(value.column) === wantedColumn) {
        return value;
      }
    }
    return undefined;
  }

  /** Returns the cached catalog load for the connection, starting one when none is cached. */
  private loadCatalog(connectionName: string): Promise<ConnectionCatalog | null> {
    const existing = this.catalogs.get(connectionName);
    if (existing) {
      return existing;
    }
    const pending = this.readCatalog(connectionName);
    this.catalogs.set(connectionName, pending);
    // Evict a failed load so one transient file-store error does not poison every later lookup.
    // The caller still receives the original rejection through `pending`.
    void pending.catch(() => {
      if (this.catalogs.get(connectionName) === pending) {
        this.catalogs.delete(connectionName);
      }
    });
    return pending;
  }

  /**
   * Reads the scan whose `connection.json` path sorts last (plain string order) for the connection.
   *
   * Table artifacts are read in sorted path order. The relationship profile is optional: any
   * failure to read or parse it leaves `profile` null. The driver comes from `connection.json`,
   * then from the profile, then defaults to `'postgres'`.
   */
  private async readCatalog(connectionName: string): Promise<ConnectionCatalog | null> {
    const root = `raw-sources/${connectionName}/live-database`;
    const listed = await this.deps.fileStore.listFiles(root);
    const connectionFiles = listed.files.filter((file) => file.endsWith('/connection.json')).sort();
    const latestConnectionPath = connectionFiles.at(-1);
    if (!latestConnectionPath) {
      return null;
    }
    const latestRoot = latestConnectionPath.slice(0, -'/connection.json'.length);
    const syncId = latestRoot.split('/').at(-1) ?? '';
    const connection = readJson<ConnectionArtifact>((await this.deps.fileStore.readFile(latestConnectionPath)).content);
    const tablesListing = await this.deps.fileStore.listFiles(`${latestRoot}/tables`);
    const tables: KtxSchemaTable[] = [];
    for (const tablePath of tablesListing.files.filter((file) => file.endsWith('.json')).sort()) {
      tables.push(readJson<KtxSchemaTable>((await this.deps.fileStore.readFile(tablePath)).content));
    }
    let profile: RelationshipProfileArtifact | null = null;
    try {
      profile = readJson<RelationshipProfileArtifact>(
        (await this.deps.fileStore.readFile(`${latestRoot}/enrichment/relationship-profile.json`)).content,
      );
    } catch {
      // Best effort: a scan without a readable profile is still usable for schema lookups.
      profile = null;
    }
    return {
      connectionName,
      syncId,
      driver: connection.driver ?? profile?.driver ?? 'postgres',
      tables,
      profile,
    };
  }
}

View file

@ -0,0 +1,99 @@
import { describe, expect, it, vi } from 'vitest';
import { repairWikiSlRefs } from './wiki-sl-ref-repair.js';
// Exercises repairWikiSlRefs against in-memory fakes of the wiki, file-listing and
// semantic-layer services.
describe('repairWikiSlRefs', () => {
it('removes missing measure refs while keeping source, measure, segment, and manifest-backed refs', async () => {
type TestPage = { pageKey: string; frontmatter: Record<string, unknown>; content: string };
// One global page whose sl_refs include a measure (`medium_risk_account_count`) that the
// fake semantic layer below does not define.
const pages = new Map<string, TestPage>([
[
'GLOBAL:accounts-at-risk',
{
pageKey: 'accounts-at-risk',
frontmatter: {
summary: 'Accounts at risk',
usage_mode: 'auto',
sl_refs: [
'mart_customer_health',
'mart_customer_health.high_risk_account_count',
'mart_customer_health.medium_risk_account_count',
'mart_customer_health.high_risk',
'int_procurement_qualifying_actions',
],
},
content: 'Risk context.',
},
],
]);
// Wiki fake backed by the `pages` map, keyed `<scope>:<key>`.
const wikiService = {
readPage: vi.fn(async (scope: string, _scopeId: string | null, key: string) => pages.get(`${scope}:${key}`)),
writePage: vi.fn(
async (
scope: string,
_scopeId: string | null,
key: string,
frontmatter: Record<string, unknown>,
content: string,
) => {
pages.set(`${scope}:${key}`, { pageKey: key, frontmatter, content });
},
),
};
// The listing also contains a nested path, which must not produce a repair.
const configService = {
listFiles: vi.fn(async () => ({
files: ['global/accounts-at-risk.md', 'global/historic-sql/nested-legacy.md'],
})),
};
const semanticLayerService = {
loadAllSources: vi.fn(async () => [
{
name: 'mart_customer_health',
grain: [],
columns: [],
joins: [],
measures: [{ name: 'high_risk_account_count', expr: 'count(*)' }],
segments: [{ name: 'high_risk', expr: "risk_level = 'high'" }],
},
{
name: 'int_procurement_qualifying_actions',
grain: [],
columns: [],
joins: [],
measures: [],
},
]),
};
const result = await repairWikiSlRefs({
wikiService: wikiService as never,
semanticLayerService: semanticLayerService as never,
configService: configService as never,
connectionIds: ['warehouse'],
});
// Only the undefined measure ref is reported as removed.
expect(result.repairs).toEqual([
{
pageKey: 'accounts-at-risk',
scope: 'GLOBAL',
scopeId: null,
removedRefs: ['mart_customer_health.medium_risk_account_count'],
},
]);
// The page is rewritten with the surviving refs in their original order, under the system author.
expect(wikiService.writePage).toHaveBeenCalledWith(
'GLOBAL',
null,
'accounts-at-risk',
expect.objectContaining({
sl_refs: [
'mart_customer_health',
'mart_customer_health.high_risk_account_count',
'mart_customer_health.high_risk',
'int_procurement_qualifying_actions',
],
}),
'Risk context.',
'System User',
'system@example.com',
'Repair semantic-layer refs: accounts-at-risk',
);
});
});

View file

@ -0,0 +1,140 @@
import type { KtxFileStorePort } from '../core/index.js';
import type { SemanticLayerService, SemanticLayerSource } from '../sl/index.js';
import { isFlatWikiKey } from '../wiki/keys.js';
import type { KnowledgeWikiService, WikiFrontmatter } from '../wiki/index.js';
const SYSTEM_AUTHOR = 'System User';
const SYSTEM_EMAIL = 'system@example.com';
/** One wiki page that was rewritten, with the `sl_refs` entries removed from it. */
export interface WikiSlRefRepair {
pageKey: string;
scope: 'GLOBAL' | 'USER';
// User id for USER-scoped pages; null for GLOBAL pages.
scopeId: string | null;
removedRefs: string[];
}
/** Outcome of `repairWikiSlRefs`: the pages rewritten plus human-readable warnings. */
export interface WikiSlRefRepairResult {
repairs: WikiSlRefRepair[];
warnings: string[];
}
/** Scope and page key parsed from a listed wiki file path. */
interface WikiPath {
scope: 'GLOBAL' | 'USER';
scopeId: string | null;
pageKey: string;
}
/**
 * Parses a listed wiki file path into scope, scope id and page key.
 *
 * Accepted shapes are `global/<key>.md` and `user/<id>/<key>.md`, where `<key>` passes
 * `isFlatWikiKey`. Anything else (other extensions, deeper nesting, other roots) yields null.
 */
function parseKnowledgeFilePath(path: string): WikiPath | null {
  if (!path.endsWith('.md')) {
    return null;
  }
  const segments = path.split('/');
  const root = segments[0];
  const stripExtension = (fileName: string): string => fileName.replace(/\.md$/, '');

  let parsed: WikiPath | null = null;
  if (root === 'global' && segments.length === 2) {
    parsed = { scope: 'GLOBAL', scopeId: null, pageKey: stripExtension(segments[1]) };
  } else if (root === 'user' && segments.length === 3) {
    parsed = { scope: 'USER', scopeId: segments[1], pageKey: stripExtension(segments[2]) };
  }
  if (parsed === null) {
    return null;
  }
  return isFlatWikiKey(parsed.pageKey) ? parsed : null;
}
/**
 * Lists every ref a source makes valid: the source name itself, then `<source>.<measure>`
 * for each measure, then `<source>.<segment>` for each segment.
 */
function entityRefsForSource(source: SemanticLayerSource): string[] {
  const refs: string[] = [source.name];
  for (const measure of source.measures ?? []) {
    refs.push(`${source.name}.${measure.name}`);
  }
  for (const segment of source.segments ?? []) {
    refs.push(`${source.name}.${segment.name}`);
  }
  return refs;
}
/**
 * Collects the refs defined by the semantic-layer sources of every given connection.
 *
 * Connections are loaded one after another. A connection whose load fails contributes a
 * warning instead of aborting, so the returned set may be incomplete when `warnings` is non-empty.
 */
async function loadVisibleSlRefs(
  semanticLayerService: SemanticLayerService,
  connectionIds: string[],
): Promise<{ refs: Set<string>; warnings: string[] }> {
  const collected = new Set<string>();
  const problems: string[] = [];
  for (const connectionId of connectionIds) {
    try {
      const sources = await semanticLayerService.loadAllSources(connectionId);
      for (const source of sources) {
        entityRefsForSource(source).forEach((ref) => collected.add(ref));
      }
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      problems.push(`Skipped wiki sl_refs repair for connection ${connectionId}: ${reason}`);
    }
  }
  return { refs: collected, warnings: problems };
}
/**
 * Returns the distinct, non-blank string entries of `value` in first-seen order.
 *
 * The parameter type is only a compile-time promise, so the value is checked at runtime:
 * anything that is not an array (undefined, null, a scalar) is treated as "no refs" instead
 * of throwing, and non-string entries are dropped. Entries are kept untrimmed.
 */
function uniqueStringArray(value: string[] | undefined): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  return [...new Set(value.filter((entry) => typeof entry === 'string' && entry.trim().length > 0))];
}
/**
 * Removes `sl_refs` entries that no longer resolve to a semantic-layer source, measure or
 * segment from every flat wiki page, rewriting each affected page under the system author.
 *
 * Nothing is rewritten when:
 * - the sources of any connection failed to load (the set of valid refs would be incomplete,
 *   and refs are not tagged with their connection, so valid refs could be deleted), or
 * - the config service cannot list files.
 *
 * @returns the pages that were rewritten and a list of warnings (load failures, skip
 * reasons, and one line per repaired page).
 */
export async function repairWikiSlRefs(input: {
  wikiService: KnowledgeWikiService;
  semanticLayerService: SemanticLayerService;
  configService: KtxFileStorePort;
  connectionIds: string[];
}): Promise<WikiSlRefRepairResult> {
  const { refs: validRefs, warnings } = await loadVisibleSlRefs(input.semanticLayerService, input.connectionIds);
  if (warnings.length > 0) {
    // loadVisibleSlRefs only warns when a connection could not be loaded. Removing refs
    // against an incomplete set would strip valid refs from pages, so leave pages untouched.
    return {
      repairs: [],
      warnings: [
        ...warnings,
        'Skipped wiki sl_refs repair: semantic-layer sources could not be loaded for every connection.',
      ],
    };
  }
  const listFiles =
    typeof input.configService.listFiles === 'function'
      ? input.configService.listFiles.bind(input.configService)
      : null;
  if (!listFiles) {
    return {
      repairs: [],
      warnings: [...warnings, 'Skipped wiki sl_refs repair: config service cannot list wiki files.'],
    };
  }
  const listed = await listFiles('knowledge', true);
  const repairs: WikiSlRefRepair[] = [];
  // Sort a copy: the listing belongs to the file store and must not be reordered in place.
  for (const file of [...listed.files].sort()) {
    const parsedPath = parseKnowledgeFilePath(file);
    if (!parsedPath) {
      continue;
    }
    const page = await input.wikiService.readPage(parsedPath.scope, parsedPath.scopeId, parsedPath.pageKey);
    const refs = uniqueStringArray(page?.frontmatter.sl_refs);
    if (!page || refs.length === 0) {
      continue;
    }
    const keptRefs = refs.filter((ref) => validRefs.has(ref));
    const removedRefs = refs.filter((ref) => !validRefs.has(ref));
    if (removedRefs.length === 0) {
      continue;
    }
    const frontmatter: WikiFrontmatter = {
      ...page.frontmatter,
      sl_refs: keptRefs,
    };
    await input.wikiService.writePage(
      parsedPath.scope,
      parsedPath.scopeId,
      parsedPath.pageKey,
      frontmatter,
      page.content,
      SYSTEM_AUTHOR,
      SYSTEM_EMAIL,
      `Repair semantic-layer refs: ${parsedPath.pageKey}`,
    );
    repairs.push({ ...parsedPath, removedRefs });
  }
  return {
    repairs,
    warnings: [
      ...warnings,
      ...repairs.map(
        (repair) =>
          `Removed invalid sl_refs from ${repair.pageKey}: ${repair.removedRefs.join(', ')}`,
      ),
    ],
  };
}

View file

@ -36,6 +36,7 @@ import { BaseTool, type GitAuthorResolverPort, type ToolContext } from '../tools
import {
type KnowledgeEventPort,
type KnowledgeIndexPort,
type KnowledgeIndexPageListing,
KnowledgeWikiService,
searchLocalKnowledgePages,
WikiListTagsTool,
@ -219,7 +220,7 @@ class LocalKnowledgeIndex implements KnowledgeIndexPort {
}
async listPagesForUser(userId: string) {
const pages: Array<{ id?: string; page_key: string; summary: string; scope: string; scope_id: string | null }> = [];
const pages: KnowledgeIndexPageListing[] = [];
for (const scope of [
{ scope: 'GLOBAL', scopeId: null, dir: 'knowledge/global' },
{ scope: 'USER', scopeId: userId, dir: `knowledge/user/${userId}` },
@ -234,6 +235,7 @@ class LocalKnowledgeIndex implements KnowledgeIndexPort {
summary: parsed.summary,
scope: scope.scope,
scope_id: scope.scopeId,
tags: parseWikiTags(raw.content),
});
}
}
@ -433,7 +435,7 @@ class LocalMemoryToolsetFactory implements MemoryToolsetFactoryPort {
};
},
}),
new WikiListTagsTool(deps.wikiService, deps.knowledgeIndex),
new WikiListTagsTool(deps.knowledgeIndex),
new WikiWriteTool(deps.wikiService, deps.knowledgeIndex, deps.knowledgeEvents),
new WikiRemoveTool(deps.wikiService, deps.knowledgeIndex, deps.knowledgeEvents),
];
@ -468,6 +470,17 @@ function parseWiki(raw: string): { summary: string; content: string } {
};
}
/**
 * Extracts the `tags` list from a page's leading `---` frontmatter block.
 * Returns an empty list when there is no frontmatter or `tags` is not an array;
 * non-string entries are dropped.
 */
function parseWikiTags(raw: string): string[] {
  const frontmatterBlock = /^---\n([\s\S]*?)\n---\n?/.exec(raw);
  if (frontmatterBlock === null) {
    return [];
  }
  const parsed = (YAML.parse(frontmatterBlock[1]) ?? {}) as Record<string, unknown>;
  const { tags } = parsed;
  if (!Array.isArray(tags)) {
    return [];
  }
  return tags.filter((tag): tag is string => typeof tag === 'string');
}
function scoreText(text: string, query: string): number {
const normalized = query.toLowerCase().trim();
if (!normalized) {

View file

@ -23,11 +23,42 @@ const expectedAdapterSkillHeadings: Record<string, string> = {
metabase_ingest: '# Metabase to KTX Semantic Layer',
metricflow_ingest: '# MetricFlow to KTX Semantic Layer',
};
const verificationWriterSkills = [
'notion_synthesize',
'dbt_ingest',
'lookml_ingest',
'looker_ingest',
'metabase_ingest',
'metricflow_ingest',
'live_database_ingest',
'historic_sql_table_digest',
'historic_sql_patterns',
'knowledge_capture',
'sl_capture',
] as const;
/**
 * Builds the regular expression matching the forbidden product name in its three casings.
 * The name is assembled from fragments so that it never appears literally in this file.
 */
function forbiddenProductPattern() {
  const alternatives = [
    ['Kae', 'lio'],
    ['kae', 'lio'],
    ['KAE', 'LIO_'],
  ].map((fragments) => fragments.join(''));
  return new RegExp(alternatives.join('|'));
}
/**
 * Extracts every `sql_execution({ ... })` call snippet from a document.
 * Each snippet runs from the marker through the next `})`; when no `})` follows, the
 * snippet is just the marker itself.
 */
function sqlExecutionCallBlocks(body: string): string[] {
  const marker = 'sql_execution({';
  const closer = '})';
  const snippets: string[] = [];
  let start = body.indexOf(marker);
  while (start !== -1) {
    const afterMarker = start + marker.length;
    const closeAt = body.indexOf(closer, afterMarker);
    const stop = closeAt === -1 ? afterMarker : closeAt + closer.length;
    snippets.push(body.slice(start, stop));
    // Resume right after this marker so calls nested inside a snippet are found too.
    start = body.indexOf(marker, afterMarker);
  }
  return snippets;
}
describe('memory runtime assets', () => {
it('packages every memory-agent base prompt referenced by promptNameFor()', async () => {
const prompts = new PromptService({ promptsDir, partials: [] });
@ -117,4 +148,50 @@ describe('memory runtime assets', () => {
expect(body).toContain('Do not call `sl_write_source` or `sl_edit_source`');
expect(body).toContain('LookML writes target the run connection directly');
});
// Every writer skill's SKILL.md must contain the protocol heading and mention at least one
// of the `discover_data` / `entity_details` tools.
it('ships identifier verification protocol in every synthesis writer skill', async () => {
for (const skillName of verificationWriterSkills) {
const body = await readFile(join(skillsDir, skillName, 'SKILL.md'), 'utf-8');
expect(body).toContain('## Identifier Verification Protocol');
expect(body).toMatch(/discover_data|entity_details/);
}
});
// Guards against specific strings reappearing in any writer skill's SKILL.md.
it('does not ship stale warehouse verification tool names or fictional identifiers', async () => {
for (const skillName of verificationWriterSkills) {
const body = await readFile(join(skillsDir, skillName, 'SKILL.md'), 'utf-8');
expect(body).not.toContain('orbit_analytics.customer');
expect(body).not.toContain('wiki_sl_search');
expect(body).not.toContain('sl_describe_table');
}
});
// Checks the shared protocol file and every writer skill: each must show `sql_execution`
// calls, and every call snippet must include `connectionName`.
it('ships only the KTX connectionName sql_execution call shape in writer guidance', async () => {
const shared = await readFile(join(skillsDir, '_shared', 'identifier-verification.md'), 'utf-8');
const bodies = [{ name: '_shared/identifier-verification.md', body: shared }];
expect(shared).toContain('sql_execution({connectionName, sql: "SELECT DISTINCT');
expect(shared).toContain('sql_execution({connectionName, sql: "SELECT 1 FROM');
for (const skillName of verificationWriterSkills) {
const body = await readFile(join(skillsDir, skillName, 'SKILL.md'), 'utf-8');
bodies.push({ name: `${skillName}/SKILL.md`, body });
expect(body).toContain('sql_execution({connectionName');
expect(body).not.toContain('sql_execution({ sql');
expect(body).not.toContain('session shape');
expect(body).not.toContain('connection is already pinned by the ingest session');
}
// Second pass over all collected documents: inspect each extracted call snippet, and also
// reject a call whose first key is `sql` even when whitespace or newlines follow the brace.
for (const { name, body } of bodies) {
const calls = sqlExecutionCallBlocks(body);
expect(calls.length, `${name} should contain sql_execution guidance`).toBeGreaterThan(0);
expect(
calls.filter((call) => !call.includes('connectionName')),
`${name} has sql_execution calls without connectionName`,
).toEqual([]);
expect(body, `${name} has a connectionless multiline sql_execution call`).not.toMatch(
/sql_execution\(\{\s*sql\s*:/,
);
}
});
});

View file

@ -90,7 +90,7 @@ export async function validateSingleSource(
`writing it as-is drops the manifest's columns and joins. ` +
`Remove "sql:", "table:", "grain:", "columns:", and "joins:" and keep only ` +
`"name:" plus "measures:"/"segments:"/"description:" to write an overlay ` +
`that inherits the manifest schema. Call sl_describe_table to see it first.`,
`that inherits the manifest schema. Call sl_read_source to inspect the existing source first.`,
);
return { errors, warnings };
}

View file

@ -47,6 +47,7 @@ export interface ToolSession {
touchedSlSources: TouchedSlSourceSet;
actions: MemoryAction[];
allowedRawPaths?: ReadonlySet<string>;
allowedConnectionNames?: ReadonlySet<string>;
semanticLayerService: SemanticLayerService;
wikiService: KnowledgeWikiService;
configService: KtxFileStorePort;

View file

@ -12,6 +12,7 @@ export type {
KnowledgeEventPort,
KnowledgeGitDiffPort,
KnowledgeIndexPort,
KnowledgeIndexPageListing,
UpsertPageParams,
WikiFileStorePort,
} from './ports.js';

View file

@ -113,13 +113,13 @@ describe('KnowledgeWikiService.syncFromCommit', () => {
expect(call.deletes).toEqual([{ scope: 'GLOBAL', scopeId: null, pageKey: 'gone-page' }]);
});
it('indexes historic-SQL nested pages but skips other nested wiki paths from commit sync', async () => {
it('indexes only flat wiki pages and skips nested paths from commit sync', async () => {
const { service, pagesRepository, gitService, logger } = makeService();
gitService.diffNameStatus.mockResolvedValue([
{ status: 'A', path: 'knowledge/global/revenue-policy.md' },
{ status: 'A', path: 'knowledge/global/historic-sql-order-lifecycle.md' },
{ status: 'A', path: 'knowledge/global/historic-sql/order-lifecycle.md' },
{ status: 'A', path: 'knowledge/global/historic-sql/_archived/retired-pattern.md' },
{ status: 'A', path: 'knowledge/global/orbit/company-overview.md' },
]);
gitService.getFileAtCommit.mockImplementation((path: string) => {
@ -138,26 +138,25 @@ describe('KnowledgeWikiService.syncFromCommit', () => {
await service.syncFromCommit('sha-before', 'sha-after', 'run-uuid');
expect(gitService.getFileAtCommit).not.toHaveBeenCalledWith('knowledge/global/orbit/company-overview.md', 'sha-after');
expect(gitService.getFileAtCommit).not.toHaveBeenCalledWith('knowledge/global/historic-sql/order-lifecycle.md', 'sha-after');
expect(logger.warn).toHaveBeenCalledWith(
'[knowledge.sync] skipping unparseable path: knowledge/global/orbit/company-overview.md',
);
expect(logger.warn).toHaveBeenCalledWith(
'[knowledge.sync] skipping unparseable path: knowledge/global/historic-sql/order-lifecycle.md',
);
const call = pagesRepository.applyDiffTransactional.mock.calls[0][0];
expect(call.upserts).toEqual(
expect.arrayContaining([
expect.objectContaining({ scope: 'GLOBAL', pageKey: 'revenue-policy', summary: 'revenue' }),
expect.objectContaining({
scope: 'GLOBAL',
pageKey: 'historic-sql/order-lifecycle',
pageKey: 'historic-sql-order-lifecycle',
summary: 'order lifecycle',
}),
expect.objectContaining({
scope: 'GLOBAL',
pageKey: 'historic-sql/_archived/retired-pattern',
summary: 'retired',
}),
]),
);
expect(call.upserts).toHaveLength(3);
expect(call.upserts).toHaveLength(2);
});
it('is a no-op when the diff between shas has no knowledge changes', async () => {

View file

@ -11,10 +11,6 @@ const WIKI_PREFIX = 'knowledge';
export type { WikiFrontmatter };
/**
 * Tells whether a path segment is made only of ASCII letters, digits, `_` and `-`,
 * is non-empty, and does not start with `-`.
 */
function isHistoricSqlPathSegment(segment: string): boolean {
  const allowedSegment = /^[a-zA-Z0-9_][a-zA-Z0-9_-]*$/;
  return allowedSegment.test(segment);
}
export class KnowledgeWikiService {
private isWorktreeScoped = false;
@ -422,7 +418,6 @@ export class KnowledgeWikiService {
* Parse a `knowledge/<scope>/...` file path into its scope and page key.
* `knowledge/global/foo.md` { scope: 'GLOBAL', scopeId: null, pageKey: 'foo' }
* `knowledge/user/<id>/bar.md` { scope: 'USER', scopeId: '<id>', pageKey: 'bar' }
* `knowledge/global/historic-sql/foo.md` { scope: 'GLOBAL', scopeId: null, pageKey: 'historic-sql/foo' }
*/
function parseKnowledgePath(path: string): { scope: string; scopeId: string | null; pageKey: string } | null {
if (!path.endsWith('.md')) {
@ -437,13 +432,6 @@ function parseKnowledgePath(path: string): { scope: string; scopeId: string | nu
const pageKey = rest[1].replace(/\.md$/, '');
return isFlatWikiKey(pageKey) ? { scope: 'GLOBAL', scopeId: null, pageKey } : null;
}
if (rest.length >= 3 && rest[0] === 'global' && rest[1] === 'historic-sql') {
const historicPath = rest.slice(2).join('/').replace(/\.md$/, '');
if (historicPath.split('/').every(isHistoricSqlPathSegment)) {
return { scope: 'GLOBAL', scopeId: null, pageKey: `historic-sql/${historicPath}` };
}
return null;
}
if (rest.length === 3 && rest[0] === 'user') {
const pageKey = rest[2].replace(/\.md$/, '');
return isFlatWikiKey(pageKey) ? { scope: 'USER', scopeId: rest[1], pageKey } : null;

View file

@ -244,4 +244,30 @@ describe('local knowledge helpers', () => {
}),
).rejects.toThrow('Invalid wiki key "orbit/company-overview". Wiki keys must be flat; use "orbit-company-overview".');
});
// Regression case: a page stored under the nested `historic-sql/` directory
// must not show up in the listing; only the flat-keyed page is returned.
it('ignores nested historic-SQL legacy paths when listing local knowledge pages', async () => {
  // Flat page, written through the regular helper.
  const flatPage = {
    key: 'historic-sql-paid-orders',
    scope: 'GLOBAL',
    summary: 'Flat historic SQL page',
    content: 'Flat page body.',
    tags: ['historic-sql'],
  };
  await writeLocalKnowledgePage(project, flatPage);

  // Nested legacy page, written straight to the file store so that no key
  // validation in the helper can get in the way.
  const nestedPath = 'knowledge/global/historic-sql/paid-orders.md';
  const nestedBody = '---\nsummary: Nested historic SQL page\nusage_mode: auto\n---\n\nNested body\n';
  // The trailing three arguments look like author name, author email and
  // commit message — confirm against the file store's writeFile signature.
  await project.fileStore.writeFile(nestedPath, nestedBody, 'Test', 'test@example.com', 'Write nested legacy page');

  const expectedListing = [
    {
      key: 'historic-sql-paid-orders',
      path: 'knowledge/global/historic-sql-paid-orders.md',
      scope: 'GLOBAL',
      summary: 'Flat historic SQL page',
    },
  ];
  const listing = listLocalKnowledgePages(project, { userId: 'local' });
  await expect(listing).resolves.toEqual(expectedListing);
});
});

View file

@ -80,26 +80,12 @@ function knowledgePath(scope: LocalKnowledgeScope, userId: string | undefined, k
return `knowledge/user/${assertSafePathToken('user id', userId ?? 'local')}/${safeKey}.md`;
}
/**
 * Tells whether `segment` may be used as one component of a nested
 * historic-SQL key.
 *
 * The segment has to be non-empty, start with an ASCII letter, digit, or
 * underscore, and continue with ASCII letters, digits, underscores, or
 * hyphens only. Anything containing `/` or `.` therefore fails the check.
 *
 * @param segment - A single path component (no separators).
 * @returns `true` if the entire segment consists of allowed characters.
 */
function isHistoricSqlPathSegment(segment: string): boolean {
  const segmentShape = /^[a-zA-Z0-9_][a-zA-Z0-9_-]*$/;
  return segmentShape.test(segment) ? true : false;
}
/**
 * Derives the wiki key encoded in a knowledge file path.
 *
 * The scope's directory prefix (`knowledge/global/` or
 * `knowledge/user/<userId>/`) is removed by length and a trailing `.md` is
 * stripped. `path` is not checked to actually start with that prefix; the
 * leading characters are simply dropped.
 *
 * @param path - Repository-relative path of the knowledge file.
 * @param scope - Scope whose directory prefix should be stripped.
 * @param userId - Owner id; validated through `assertSafePathToken` only when
 *   `scope` is not `'GLOBAL'`.
 * @returns The key when it is a flat wiki key, or when it is a GLOBAL key
 *   under `historic-sql/` whose remaining segments all pass
 *   `isHistoricSqlPathSegment`; otherwise `null`.
 */
function keyFromKnowledgePath(path: string, scope: LocalKnowledgeScope, userId: string): string | null {
  const historicSqlDir = 'historic-sql/';

  let scopePrefix: string;
  if (scope === 'GLOBAL') {
    scopePrefix = 'knowledge/global/';
  } else {
    scopePrefix = `knowledge/user/${assertSafePathToken('user id', userId)}/`;
  }

  const candidate = path.slice(scopePrefix.length).replace(/\.md$/, '');
  if (isFlatWikiKey(candidate)) {
    return candidate;
  }

  // Nested keys are tolerated only for global pages below historic-sql/.
  if (scope !== 'GLOBAL' || !candidate.startsWith(historicSqlDir)) {
    return null;
  }
  const nestedSegments = candidate.slice(historicSqlDir.length).split('/');
  return nestedSegments.every(isHistoricSqlPathSegment) ? candidate : null;
}

View file

@ -13,6 +13,15 @@ export interface UpsertPageParams {
sourceRunId?: string | null;
}
/**
 * Shape of one entry in a knowledge-index page listing.
 *
 * Field names are snake_case; they presumably mirror the index's row
 * columns — confirm against the repository implementation.
 */
export interface KnowledgeIndexPageListing {
  /** Key of the page. */
  page_key: string;
  /** Scope label of the page, e.g. `'GLOBAL'` or `'USER'`. */
  scope: string;
  /** Identifier qualifying `scope`; `null` when the scope has no owner id. */
  scope_id: string | null;
  /** Summary text of the page. */
  summary: string;
  /** Tags attached to the page; always an array (possibly empty). */
  tags: string[];
  /** Index identifier of the page, when the index provides one. */
  id?: string;
}
export interface KnowledgeIndexPort {
upsertPage(params: UpsertPageParams): Promise<void>;
applyDiffTransactional(params: {
@ -32,9 +41,7 @@ export interface KnowledgeIndexPort {
scopeId: string | null,
pageKey: string,
): Promise<{ id?: string; page_key: string } | null | undefined>;
listPagesForUser(
userId: string,
): Promise<Array<{ id?: string; page_key: string; summary: string; scope: string; scope_id: string | null }>>;
listPagesForUser(userId: string): Promise<KnowledgeIndexPageListing[]>;
getUserPageCount(userId: string): Promise<number>;
incrementUsageCount(pageIds: string[]): Promise<void>;
searchRRF(

View file

@ -8,22 +8,11 @@ describe('WikiListTagsTool', () => {
it("returns distinct sorted tags across the user's visible pages", async () => {
const pagesRepository = {
listPagesForUser: vi.fn().mockResolvedValue([
{ scope: 'GLOBAL', scope_id: null, page_key: 'k1' },
{ scope: 'USER', scope_id: 'u', page_key: 'k2' },
{ scope: 'GLOBAL', scope_id: null, page_key: 'k1', tags: ['metrics', 'finance'] },
{ scope: 'USER', scope_id: 'u', page_key: 'k2', tags: ['metrics'] },
]),
};
const wikiService = {
readPage: vi.fn().mockImplementation((_scope, _scopeId, key) => {
if (key === 'k1') {
return Promise.resolve({ frontmatter: { tags: ['metrics', 'finance'] }, content: '' });
}
if (key === 'k2') {
return Promise.resolve({ frontmatter: { tags: ['metrics'] }, content: '' });
}
return Promise.resolve(null);
}),
};
const tool = new WikiListTagsTool(wikiService as any, pagesRepository as any);
const tool = new WikiListTagsTool(pagesRepository as any);
const result = await tool.call({}, baseContext);
expect(result.markdown).toContain('finance');
@ -31,10 +20,23 @@ describe('WikiListTagsTool', () => {
expect(result.structured.tags).toEqual(['finance', 'metrics']);
});
// Tags come straight from the index rows; a flat `historic-sql-*` key is
// treated like any other page and the result is the sorted, de-duplicated union.
it('lists tags from historic-SQL indexed pages with flat wiki keys', async () => {
  const indexedPages = [
    { scope: 'GLOBAL', scope_id: null, page_key: 'company-overview', tags: ['notion'] },
    { scope: 'GLOBAL', scope_id: null, page_key: 'historic-sql-revenue-pattern', tags: ['historic-sql', 'pattern'] },
  ];
  const listPagesForUser = vi.fn().mockResolvedValue(indexedPages);
  const pagesRepository = { listPagesForUser };

  const tool = new WikiListTagsTool(pagesRepository as any);
  const { structured } = await tool.call({}, baseContext);

  expect(structured.tags).toEqual(['historic-sql', 'notion', 'pattern']);
});
it('returns a friendly message when no pages have tags', async () => {
const pagesRepository = { listPagesForUser: vi.fn().mockResolvedValue([]) };
const wikiService = { readPage: vi.fn() };
const tool = new WikiListTagsTool(wikiService as any, pagesRepository as any);
const tool = new WikiListTagsTool(pagesRepository as any);
const result = await tool.call({}, baseContext);
expect(result.markdown).toMatch(/no tags/i);

View file

@ -1,7 +1,5 @@
import { z } from 'zod';
import type { KnowledgeIndexPort } from '../ports.js';
type BlockScope = 'GLOBAL' | 'USER';
import { KnowledgeWikiService } from '../index.js';
import { BaseTool, type ToolContext, type ToolOutput } from '../../tools/index.js';
const wikiListTagsInputSchema = z.object({});
@ -11,10 +9,7 @@ type WikiListTagsInput = z.infer<typeof wikiListTagsInputSchema>;
export class WikiListTagsTool extends BaseTool<typeof wikiListTagsInputSchema> {
readonly name = 'wiki_list_tags';
constructor(
private readonly wikiService: KnowledgeWikiService,
private readonly pagesRepository: KnowledgeIndexPort,
) {
constructor(private readonly pagesRepository: KnowledgeIndexPort) {
super();
}
@ -33,10 +28,7 @@ Call before writing a new page so you can reuse existing tags consistently inste
const pages = await this.pagesRepository.listPagesForUser(context.userId);
const set = new Set<string>();
for (const p of pages) {
const scope = p.scope as BlockScope;
const scopeId = scope === 'USER' ? p.scope_id : null;
const page = await this.wikiService.readPage(scope, scopeId, p.page_key);
for (const t of page?.frontmatter.tags ?? []) {
for (const t of p.tags) {
set.add(t);
}
}

View file

@ -150,6 +150,7 @@ export class WikiWriteTool extends BaseTool<typeof wikiWriteInputSchema> {
Create or update a knowledge page. Provide content for create/rewrite, or replacements for targeted edits.
For existing pages, you may provide only frontmatter fields such as summary, tags, refs, or sl_refs to update metadata while preserving content.
tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to clear, [values] to set.
Keys must be flat file names, not directory paths. Use tags/source frontmatter for grouping.
</purpose>`;
}

View file

@ -25,6 +25,7 @@ export interface WikiFrontmatter {
usage?: HistoricSqlWikiUsageFrontmatter;
fingerprints?: string[];
stale_since?: string;
archived_since?: string;
}
export interface WikiPage {