feat(context): add warehouse verification tools (#46)

* feat(context): add warehouse dialect dispatch

* feat(context): read warehouse scan catalog

* feat(context): add entity details verification tool

* feat(context): add ingest SQL verification tool

* feat(context): add raw warehouse discovery tool

* feat(context): expose warehouse verification tools to ingest

* docs(context): add ingest identifier verification protocol

* test(context): guard ingest identifier verification prompts

* chore(context): verify warehouse verification tools

* docs: add warehouse verification tools plan and spec

* fix(context): expose target warehouses to Notion ingest

* fix(context): update ingest prompts for warehouse verification tools

* fix(context): scope raw schema discovery to allowed connections

* fix(context): verify warehouse column display targets

* docs: add notion warehouse verification gap closure plan

* fix(context): include raw discovery connection names

* fix(context): expose warehouse targets for LookML and MetricFlow

* fix(context): pass connection config to ingest query executors

* fix(cli): enable read-only SQL probes for local ingest

* docs: add warehouse verification final v1 closure plan

* fix(context): align warehouse sql probe prompt shape

* docs: add warehouse verification prompt shape closure plan

* test(context): catch connectionless sql execution prompt examples

* fix(context): include connection name in sl capture sql example

* docs: add warehouse verification sql example closure plan

* fix(context): report structured entity detail misses

* docs: add warehouse verification structured target miss closure plan

* fix: report untracked squash merge conflicts

* feat: require ingest verification ledger

* fix: stabilize ingest wiki references
This commit is contained in:
Andrey Avtomonov 2026-05-13 13:43:23 +02:00 committed by GitHub
parent bcb0d2f8f7
commit c22248dabf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
89 changed files with 7818 additions and 191 deletions

View file

@ -277,7 +277,7 @@ describe('historic-SQL local ingest retrieval acceptance', () => {
await expect(readFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves
.toContain('Analysts repeatedly inspect paid order lifecycle by customer segment.');
await expect(readFile(join(project.projectDir, 'knowledge/global/historic-sql/paid-order-lifecycle.md'), 'utf-8'))
await expect(readFile(join(project.projectDir, 'knowledge/global/historic-sql-paid-order-lifecycle.md'), 'utf-8'))
.resolves.toContain('Paid Order Lifecycle');
const reloaded = await loadKtxProject({ projectDir: project.projectDir });
@ -295,7 +295,7 @@ describe('historic-SQL local ingest retrieval acceptance', () => {
searchLocalKnowledgePages(reloaded, { query: 'paid order lifecycle', userId: 'local', limit: 5 }),
).resolves.toEqual([
expect.objectContaining({
key: 'historic-sql/paid-order-lifecycle',
key: 'historic-sql-paid-order-lifecycle',
summary: 'Paid Order Lifecycle',
matchReasons: expect.arrayContaining(['lexical']),
}),

View file

@ -10,7 +10,7 @@ async function commitProjectionChanges(workdir: string): Promise<void> {
const status = await git.status();
const paths = status.files
.map((file) => file.path)
.filter((path) => path.startsWith('semantic-layer/') || path.startsWith('knowledge/global/historic-sql/'));
.filter((path) => path.startsWith('semantic-layer/') || path.startsWith('knowledge/global/historic-sql'));
if (paths.length === 0) {
return;
}

View file

@ -106,7 +106,7 @@ describe('projectHistoricSqlEvidence', () => {
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
await writeText(
workdir,
'knowledge/global/historic-sql/old-order-lifecycle.md',
'knowledge/global/historic-sql-old-order-lifecycle.md',
[
'---',
YAML.stringify({
@ -127,7 +127,7 @@ describe('projectHistoricSqlEvidence', () => {
);
await writeText(
workdir,
'knowledge/global/historic-sql/retired-pattern.md',
'knowledge/global/historic-sql-retired-pattern.md',
[
'---',
YAML.stringify({
@ -164,15 +164,15 @@ describe('projectHistoricSqlEvidence', () => {
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/old-order-lifecycle.md'), 'utf-8')).resolves.toContain(
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-old-order-lifecycle.md'), 'utf-8')).resolves.toContain(
'Order Lifecycle Analysis',
);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/retired-pattern.md'), 'utf-8')).resolves.toContain(
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-retired-pattern.md'), 'utf-8')).resolves.toContain(
'stale_since: "2026-05-11T00:00:00.000Z"',
);
});
it('writes a reappearing pattern to the active slug instead of reusing an archived page key', async () => {
it('rewrites a reappearing archived pattern at the flat slug', async () => {
const workdir = await tempWorkdir();
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
@ -192,7 +192,7 @@ describe('projectHistoricSqlEvidence', () => {
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
await writeText(
workdir,
'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md',
'knowledge/global/historic-sql-order-lifecycle-analysis.md',
[
'---',
YAML.stringify({
@ -230,15 +230,10 @@ describe('projectHistoricSqlEvidence', () => {
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
'Order Lifecycle Analysis',
);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
'Archived body',
);
await expect(
readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/order-lifecycle-analysis.md'), 'utf-8'),
).rejects.toMatchObject({ code: 'ENOENT' });
const page = await readFile(join(workdir, 'knowledge/global/historic-sql-order-lifecycle-analysis.md'), 'utf-8');
expect(page).toContain('Analysts compare order status with customer segment again.');
expect(page).not.toContain('Archived body');
expect(page).not.toContain('archived');
});
it('leaves already archived pattern pages stable when they are still absent', async () => {
@ -259,7 +254,7 @@ describe('projectHistoricSqlEvidence', () => {
});
await writeText(
workdir,
'knowledge/global/historic-sql/_archived/retired-pattern.md',
'knowledge/global/historic-sql-retired-pattern.md',
[
'---',
YAML.stringify({
@ -284,12 +279,9 @@ describe('projectHistoricSqlEvidence', () => {
expect(result.archivedPatternPages).toBe(0);
expect(result.stalePatternPagesMarked).toBe(0);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/retired-pattern.md'), 'utf-8')).resolves.toContain(
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-retired-pattern.md'), 'utf-8')).resolves.toContain(
'Archived retired body',
);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/retired-pattern.md'), 'utf-8')).rejects.toMatchObject({
code: 'ENOENT',
});
});
it('marks missing table usage stale and deletes legacy historic SQL query pages', async () => {
@ -330,7 +322,7 @@ describe('projectHistoricSqlEvidence', () => {
});
await writeText(
workdir,
'knowledge/global/historic-sql/legacy-template.md',
'knowledge/global/historic-sql-legacy-template.md',
[
'---',
YAML.stringify({
@ -365,7 +357,7 @@ describe('projectHistoricSqlEvidence', () => {
commonJoins: [],
staleSince: '2026-05-11T00:00:00.000Z',
});
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/legacy-template.md'), 'utf-8')).rejects.toMatchObject({
await expect(readFile(join(workdir, 'knowledge/global/historic-sql-legacy-template.md'), 'utf-8')).rejects.toMatchObject({
code: 'ENOENT',
});
});

View file

@ -37,7 +37,7 @@ interface HistoricSqlPatternPage {
}
function safeKnowledgeSlug(value: string): string {
return value.toLowerCase().replace(/[^a-z0-9/-]+/g, '-').replace(/^-+|-+$/g, '');
return value.toLowerCase().replace(/[^a-z0-9_-]+/g, '-').replace(/^-+|-+$/g, '');
}
async function pathExists(path: string): Promise<boolean> {
@ -159,7 +159,7 @@ function isLegacyQueryPage(page: HistoricSqlPatternPage): boolean {
function isArchivedPatternPage(page: HistoricSqlPatternPage): boolean {
const tags = Array.isArray(page.frontmatter.tags) ? page.frontmatter.tags : [];
return page.key.startsWith('_archived/') || tags.includes('archived');
return tags.includes('archived');
}
function stringArray(value: unknown): string[] {
@ -191,6 +191,9 @@ async function loadPatternPages(root: string): Promise<HistoricSqlPatternPage[]>
const files = await walkFiles(root);
const pages: HistoricSqlPatternPage[] = [];
for (const file of files.filter((candidate) => candidate.endsWith('.md'))) {
if (file.includes('/')) {
continue;
}
const key = file.replace(/\.md$/, '');
const path = join(root, file);
const page = parseMarkdownPage(key, path, await readFile(path, 'utf-8'));
@ -201,6 +204,10 @@ async function loadPatternPages(root: string): Promise<HistoricSqlPatternPage[]>
return pages;
}
function historicSqlFlatKey(slug: string): string {
return `historic-sql-${safeKnowledgeSlug(slug)}`;
}
async function currentStagedTables(rawDir: string): Promise<Set<string>> {
const tablesRoot = join(rawDir, 'tables');
const files = await walkFiles(tablesRoot);
@ -276,7 +283,7 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp
}
}
const wikiRoot = join(input.workdir, 'knowledge/global/historic-sql');
const wikiRoot = join(input.workdir, 'knowledge/global');
await mkdir(wikiRoot, { recursive: true });
const allPages = await loadPatternPages(wikiRoot);
const activePages = allPages.filter((page) => !isArchivedPatternPage(page));
@ -286,7 +293,7 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp
for (const pattern of patternEvidence) {
const incomingSignals = [...pattern.pattern.tablesInvolved, ...pattern.pattern.constituentTemplateIds];
const reusable = patternPages.find((page) => overlapRatio(incomingSignals, existingPageSignals(page)) >= 0.6);
const key = reusable?.key ?? safeKnowledgeSlug(pattern.pattern.slug);
const key = reusable?.key ?? historicSqlFlatKey(pattern.pattern.slug);
const pagePath = join(wikiRoot, `${key}.md`);
const frontmatter = {
summary: pattern.pattern.title,
@ -308,11 +315,12 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp
for (const page of patternPages) {
if (writtenKeys.has(page.key)) continue;
if (shouldArchive(page.frontmatter.stale_since, manifest.fetchedAt, manifest.staleArchiveAfterDays)) {
const archivePath = join(wikiRoot, '_archived', `${page.key}.md`);
const tags = [...new Set([...stringArray(page.frontmatter.tags), 'archived'])];
await mkdir(dirname(archivePath), { recursive: true });
await writeFile(archivePath, renderMarkdownPage({ ...page.frontmatter, tags }, page.content), 'utf-8');
await rm(page.path, { force: true });
await writeFile(
page.path,
renderMarkdownPage({ ...page.frontmatter, tags, archived_since: manifest.fetchedAt }, page.content),
'utf-8',
);
result.archivedPatternPages += 1;
continue;
}

View file

@ -15,6 +15,18 @@ describe('LookmlSourceAdapter validation sidecars', () => {
afterEach(async () => rm(tmpRoot, { recursive: true, force: true }));
it('returns configured target warehouse connection ids', async () => {
const adapter = new LookmlSourceAdapter({
homeDir: join(tmpRoot, 'home'),
targetConnectionIds: ['warehouse', 'analytics', 'warehouse'],
});
await expect(adapter.listTargetConnectionIds?.(join(tmpRoot, 'staged'))).resolves.toEqual([
'analytics',
'warehouse',
]);
});
it('writes a partial fetch report and marks mismatched chunks as SL-disallowed', async () => {
const originRoot = join(tmpRoot, 'origin-src');
await mkdir(join(originRoot, 'views'), { recursive: true });

View file

@ -14,6 +14,11 @@ import { parseLookmlPullConfig } from './pull-config.js';
export interface LookmlSourceAdapterDeps {
homeDir: string;
targetConnectionIds?: string[];
}
function uniqueSorted(values: readonly string[] | undefined): string[] {
return [...new Set(values ?? [])].sort((left, right) => left.localeCompare(right));
}
export class LookmlSourceAdapter implements SourceAdapter {
@ -43,6 +48,10 @@ export class LookmlSourceAdapter implements SourceAdapter {
return readLookmlFetchReport(stagedDir);
}
async listTargetConnectionIds(_stagedDir: string): Promise<string[]> {
return uniqueSorted(this.deps.targetConnectionIds);
}
async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
const project = await parseLookmlStagedDir(stagedDir);
const mismatchedModelNames = await readLookmlMismatchedModelNames(stagedDir);

View file

@ -42,6 +42,15 @@ describe('MetricflowSourceAdapter', () => {
expect(adapter.skillNames).toEqual(['metricflow_ingest']);
});
it('returns configured target warehouse connection ids', async () => {
const metricflow = new MetricflowSourceAdapter({
homeDir: join(tmpRoot, 'cache-home'),
targetConnectionIds: ['warehouse', 'analytics', 'warehouse'],
});
await expect(metricflow.listTargetConnectionIds?.(stagedDir)).resolves.toEqual(['analytics', 'warehouse']);
});
it('detects a staged dir with a semantic_models YAML', async () => {
await mkdir(join(stagedDir, 'models'), { recursive: true });
await writeFile(

View file

@ -9,6 +9,11 @@ import { parseMetricflowPullConfig } from './pull-config.js';
export interface MetricflowSourceAdapterDeps {
homeDir: string;
targetConnectionIds?: string[];
}
function uniqueSorted(values: readonly string[] | undefined): string[] {
return [...new Set(values ?? [])].sort((left, right) => left.localeCompare(right));
}
export class MetricflowSourceAdapter implements SourceAdapter {
@ -30,6 +35,10 @@ export class MetricflowSourceAdapter implements SourceAdapter {
});
}
async listTargetConnectionIds(_stagedDir: string): Promise<string[]> {
return uniqueSorted(this.deps.targetConnectionIds);
}
async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
const project = await parseMetricFlowStagedDir(stagedDir);
const chunk = await chunkMetricFlowProject(project, { diffSet });

View file

@ -8,7 +8,7 @@ const MAX_NOTION_WORK_UNIT_CHARS = 40_000;
export const NOTION_ORG_KNOWLEDGE_WARNING =
'Anything accessible to this Notion integration can become organization knowledge.';
const NOTION_SL_WRITE_GUIDANCE =
'Write wiki entries with wiki_write. Wiki keys must be flat slugs like orbit-company-overview, not orbit/company-overview. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
'Write wiki entries with wiki_write. Wiki keys must be flat slugs like orbit-company-overview, not orbit/company-overview. Search existing wiki pages, SL sources, and raw warehouse schema for the same tables or sl_refs with discover_data before creating a new page. Only write or edit SL sources after discover_data plus sl_discover/sl_read_source or entity_details confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
async function walk(root: string): Promise<string[]> {
const entries = await readdir(root, { withFileTypes: true, recursive: true });
@ -117,7 +117,7 @@ export async function chunkNotionStagedDir(stagedDir: string, diffSet?: DiffSet)
reconcileNotes: [
`Notion maxKnowledgeCreatesPerRun=${manifest.maxKnowledgeCreatesPerRun}`,
`Notion maxKnowledgeUpdatesPerRun=${manifest.maxKnowledgeUpdatesPerRun}`,
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
'Notion dataSourceCount is Notion-only; use discover_data/entity_details for warehouse/dbt mapping decisions.',
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
],
contextReport: {

View file

@ -52,6 +52,14 @@ describe('NotionSourceAdapter', () => {
expect(adapter.triageSupported).toBe(true);
});
it('returns configured target warehouse connection ids', async () => {
const adapter = new NotionSourceAdapter({
targetConnectionIds: ['warehouse', 'warehouse', 'analytics'],
});
await expect(adapter.listTargetConnectionIds?.(stagedDir)).resolves.toEqual(['analytics', 'warehouse']);
});
it('returns structural triage signals for a staged Notion page', async () => {
await mkdir(join(stagedDir, 'pages', 'page-1'), { recursive: true });
await writeFile(
@ -242,6 +250,8 @@ describe('NotionSourceAdapter', () => {
});
expect(result.workUnits[0].notes).toContain('Synthesize durable wiki and SL knowledge');
expect(result.workUnits[0].notes).toContain('emit_unmapped_fallback');
expect(result.workUnits[0].notes).toContain('discover_data');
expect(result.workUnits[0].notes).toContain('entity_details');
expect(result.workUnits[0].notes).toContain('use reason no_physical_table rather than no_connection_mapping');
expect(result.workUnits[0].notes).toContain('Do not create SL sources under the Notion connection');
expect(result.workUnits[0].notes).toContain(
@ -250,7 +260,7 @@ describe('NotionSourceAdapter', () => {
expect(result.reconcileNotes).toEqual([
'Notion maxKnowledgeCreatesPerRun=25',
'Notion maxKnowledgeUpdatesPerRun=20',
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
'Notion dataSourceCount is Notion-only; use discover_data/entity_details for warehouse/dbt mapping decisions.',
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
]);
expect(result.contextReport).toEqual({ capped: false, warnings: [NOTION_ORG_KNOWLEDGE_WARNING] });

View file

@ -32,6 +32,11 @@ interface NotionPullSucceededContext {
export interface NotionSourceAdapterDeps {
onPullSucceeded?: (ctx: NotionPullSucceededContext) => Promise<void>;
logger?: NotionFetchLogger;
targetConnectionIds?: string[];
}
function uniqueSorted(values: readonly string[] | undefined): string[] {
return [...new Set(values ?? [])].sort((left, right) => left.localeCompare(right));
}
export class NotionSourceAdapter implements SourceAdapter {
@ -73,6 +78,10 @@ export class NotionSourceAdapter implements SourceAdapter {
return describeNotionScope(stagedDir);
}
async listTargetConnectionIds(_stagedDir: string): Promise<string[]> {
return uniqueSorted(this.deps.targetConnectionIds);
}
async getTriageSignals(stagedDir: string, externalId: string): Promise<TriageSignals> {
const metadata = await this.findMetadataByExternalId(stagedDir, externalId);
if (!metadata) {