Tighten ingest reconciliation guidance

This commit is contained in:
Luca Martial 2026-05-11 17:20:02 -07:00
parent 9f91c26752
commit 6d00cbbc2e
16 changed files with 382 additions and 20 deletions

View file

@ -8,7 +8,7 @@ const MAX_NOTION_WORK_UNIT_CHARS = 40_000;
/**
 * Warning text about the reach of the Notion integration: anything the
 * integration can access may end up as organization knowledge.
 *
 * NOTE(review): presumably surfaced through the adapter's context report
 * warnings — confirm against the callers of this constant.
 */
export const NOTION_ORG_KNOWLEDGE_WARNING =
'Anything accessible to this Notion integration can become organization knowledge.';
const NOTION_SL_WRITE_GUIDANCE =
'Write wiki entries with wiki_write. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
'Write wiki entries with wiki_write. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
async function walk(root: string): Promise<string[]> {
const entries = await readdir(root, { withFileTypes: true, recursive: true });
@ -117,6 +117,8 @@ export async function chunkNotionStagedDir(stagedDir: string, diffSet?: DiffSet)
reconcileNotes: [
`Notion maxKnowledgeCreatesPerRun=${manifest.maxKnowledgeCreatesPerRun}`,
`Notion maxKnowledgeUpdatesPerRun=${manifest.maxKnowledgeUpdatesPerRun}`,
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
],
contextReport: {
capped: manifest.capped,

View file

@ -84,6 +84,22 @@ describe('clusterNotionWorkUnits', () => {
}
});
// With exactly MIN_PAGES_TO_CLUSTER pages, the clusterer is expected to hand back
// one merged work unit instead of the individual inputs.
test('merges pages into one synthesis unit at the clustering threshold', async () => {
// Build exactly the threshold number of pages, all referencing the same table.
const pages = Array.from({ length: MIN_PAGES_TO_CLUSTER }, (_, i) => ({
id: `p${i}`,
title: `Customer source reference ${i}`,
body: `Customer source reference maps to orbit_analytics.customer ${i}`.repeat(10),
}));
const stagedDir = await makeStaged(pages);
const wus = makeWorkUnits(pages);
const out = await clusterNotionWorkUnits({ workUnits: wus, stagedDir, embedding: mockEmbed });
// A single merged unit, keyed as the first cluster.
expect(out).toHaveLength(1);
expect(out[0].unitKey).toBe('notion-cluster-1');
// Coverage: the merged unit carries every input rawFile (compared as sets, so order is ignored).
expect(new Set(out[0].rawFiles)).toEqual(new Set(wus.flatMap((wu) => wu.rawFiles)));
// The merged unit's notes must still include the SL-write guardrail wording.
expect(out[0].notes).toContain('emit_unmapped_fallback');
expect(out[0].notes).toContain('Do not create SL sources under the Notion connection');
});
test('preserves coverage: every input rawFile appears in some cluster', async () => {
const pages = Array.from({ length: 12 }, (_, i) => ({
id: `p${i}`,

View file

@ -9,7 +9,7 @@ export const MIN_PAGES_TO_CLUSTER = 5;
const CLUSTER_TEXT_BODY_CHARS = 1024;
const CLUSTER_SEED = 42;
const NOTION_CLUSTER_SL_WRITE_GUIDANCE =
'Write wiki entries directly with wiki_write. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
'Write wiki entries directly with wiki_write. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
interface ClusterNotionWorkUnitsArgs {
workUnits: WorkUnit[];
@ -74,7 +74,7 @@ export async function clusterNotionWorkUnits(args: ClusterNotionWorkUnitsArgs):
const { workUnits, stagedDir, embedding } = args;
if (workUnits.length < MIN_PAGES_TO_CLUSTER) return workUnits;
const k = pickK(workUnits.length);
if (k <= 1) return workUnits;
if (k <= 1) return [mergeWorkUnits(workUnits, 0)];
const texts = await Promise.all(workUnits.map((wu) => buildClusterText(wu, stagedDir)));
let vectors: number[][];
try {

View file

@ -247,6 +247,8 @@ describe('NotionSourceAdapter', () => {
expect(result.reconcileNotes).toEqual([
'Notion maxKnowledgeCreatesPerRun=25',
'Notion maxKnowledgeUpdatesPerRun=20',
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
]);
expect(result.contextReport).toEqual({ capped: false, warnings: [NOTION_ORG_KNOWLEDGE_WARNING] });
});

View file

@ -1,5 +1,5 @@
import { describe, expect, it, vi } from 'vitest';
import { buildReconcileSystemPrompt, buildReconcileToolSet } from './build-reconcile-context.js';
import { buildReconcileSystemPrompt, buildReconcileToolSet, buildReconcileUserPrompt } from './build-reconcile-context.js';
describe('buildReconcileSystemPrompt', () => {
it('appends canonical pins when relevant pins are supplied', () => {
@ -39,6 +39,40 @@ describe('buildReconcileSystemPrompt', () => {
});
});
// Verifies that the reconcile user prompt surfaces each action's `detail` text
// alongside its key, not just the key.
describe('buildReconcileUserPrompt', () => {
it('includes action details so reconciliation can compare different keys for the same table', () => {
const prompt = buildReconcileUserPrompt(
// Minimal stage index: one successful work unit with a single wiki "created" action.
{
jobId: 'j1',
connectionId: 'notion',
workUnits: [
{
unitKey: 'notion-a',
rawFiles: ['pages/a/page.md'],
status: 'success',
actions: [
{
target: 'wiki',
type: 'created',
key: 'orbit-customer-source-reference',
// The detail names the table; the prompt is expected to echo it.
detail: 'tables: orbit_analytics.customer',
},
],
touchedSlSources: [],
},
],
conflictsResolved: [],
evictionsApplied: [],
unmappedFallbacks: [],
},
// No eviction unit is supplied for this case.
undefined,
);
// Both the action key and its detail must appear in the rendered prompt.
expect(prompt).toContain('orbit-customer-source-reference');
expect(prompt).toContain('tables: orbit_analytics.customer');
});
});
describe('buildReconcileToolSet', () => {
it('includes emit_unmapped_fallback with the reconciliation tools', () => {
const toolSet = buildReconcileToolSet({

View file

@ -104,6 +104,10 @@ function curatorPassStateSummary(runState?: ReconcilePromptRunState): string {
].join('\n');
}
/**
 * Normalizes an action's free-form detail text for single-line rendering.
 *
 * Leading and trailing whitespace is dropped and every internal run of
 * whitespace (spaces, tabs, newlines) becomes a single space.
 *
 * @param detail - Raw detail text attached to a stage action.
 * @returns The whitespace-normalized text; an empty string when `detail`
 *   is empty or contains only whitespace.
 */
function formatStageActionDetail(detail: string): string {
  // Splitting on whitespace runs yields empty edge segments for padded input;
  // discarding them is what removes the leading/trailing whitespace.
  const words = detail.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
export function buildReconcileUserPrompt(
stageIndex: StageIndex,
ev: EvictionUnit | undefined,
@ -119,7 +123,14 @@ export function buildReconcileUserPrompt(
const actions =
wu.actions.length === 0
? ' actions: (none)'
: wu.actions.map((a) => ` - ${a.target}:${a.type} ${a.key}`).join('\n');
: wu.actions
.map((a) => {
const detail = formatStageActionDetail(a.detail);
return detail.length > 0
? ` - ${a.target}:${a.type} ${a.key}; detail: ${detail}`
: ` - ${a.target}:${a.type} ${a.key}`;
})
.join('\n');
return `- unitKey: ${wu.unitKey} (status=${wu.status})\n${actions}`;
})
.join('\n');

View file

@ -19,7 +19,14 @@ describe('stage_list tool', () => {
unitKey: 'u2',
rawFiles: ['b.yml'],
status: 'success',
actions: [{ target: 'wiki', type: 'created', key: 'page_b', detail: '' }],
actions: [
{
target: 'wiki',
type: 'created',
key: 'page_b',
detail: 'tables: orbit_analytics.customer',
},
],
touchedSlSources: [],
},
],
@ -36,6 +43,7 @@ describe('stage_list tool', () => {
expect(out).toContain('src_a');
expect(out).toContain('u2');
expect(out).toContain('page_b');
expect(out).toContain('tables: orbit_analytics.customer');
});
it('says empty when no writes', async () => {

View file

@ -6,6 +6,10 @@ export interface StageListDeps {
stageIndex: StageIndex;
}
/**
 * Flattens an action's detail text onto one line for the stage listing.
 *
 * Each run of whitespace is collapsed to a single space and the result is
 * trimmed at both ends.
 *
 * @param detail - Raw detail text attached to a stage action.
 * @returns The single-line text; an empty string when `detail` is empty or
 *   whitespace-only.
 */
function formatActionDetail(detail: string): string {
  // Collapse first: any leading/trailing run becomes one space, which the
  // trim below then removes.
  const collapsed = detail.replace(/\s+/g, ' ');
  const singleLine = collapsed.trim();
  return singleLine;
}
export function createStageListTool(deps: StageListDeps) {
return tool({
description:
@ -20,7 +24,14 @@ export function createStageListTool(deps: StageListDeps) {
const actions =
wu.actions.length === 0
? ' (no actions)'
: wu.actions.map((a) => ` - ${a.target}:${a.type} ${a.key}`).join('\n');
: wu.actions
.map((a) => {
const detail = formatActionDetail(a.detail);
return detail.length > 0
? ` - ${a.target}:${a.type} ${a.key}; detail: ${detail}`
: ` - ${a.target}:${a.type} ${a.key}`;
})
.join('\n');
return `- unitKey: ${wu.unitKey} (status=${wu.status})\n rawFiles: ${wu.rawFiles.join(', ') || '(none)'}\n actions:\n${actions}`;
})
.join('\n');

View file

@ -90,6 +90,25 @@ describe('memory runtime assets', () => {
expect(body).not.toContain('a standalone SL source only when raw evidence contains enough table or SQL structure');
});
// Pins key sentences of the packaged metabase_ingest SKILL.md so accidental
// edits to the shipped guidance are caught.
it('ships Metabase guidance that avoids invalid joins for SQL-only card outputs', async () => {
const body = await readFile(join(skillsDir, 'metabase_ingest', 'SKILL.md'), 'utf-8');
// Wording that must be present in the skill file.
expect(body).toContain('Do not declare a KTX join just because the card SQL joins that table internally');
expect(body).toContain('only when the card output exposes a local key that matches the target source grain');
expect(body).toContain('If `sl_discover` resolves the table, it is not outside the manifest');
expect(body).toContain('reason: "parse_error"');
// Wording that must be absent — presumably superseded phrasing; confirm against the skill's history.
expect(body).not.toContain('Tables outside the manifest');
expect(body).not.toContain('reason: "metabase_sql_untranslated"');
});
// Pins key sentences of the packaged notion_synthesize SKILL.md: the
// dataSourceCount clarification, the search-before-create rule for wiki pages,
// and the no_physical_table fallback reason.
it('ships Notion guidance for physical-table fallbacks and duplicate wiki reconciliation', async () => {
const body = await readFile(join(skillsDir, 'notion_synthesize', 'SKILL.md'), 'utf-8');
expect(body).toContain('Notion `dataSourceCount` counts Notion databases/data sources only');
expect(body).toContain('Search existing wiki pages for the same `tables:` or `sl_refs:` frontmatter');
expect(body).toContain('no_physical_table');
});
it('packages LookML connection-mismatch SL gate guidance', async () => {
const body = await readFile(join(skillsDir, 'lookml_ingest', 'SKILL.md'), 'utf-8');