mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-25 08:48:08 +02:00
Tighten ingest reconciliation guidance
This commit is contained in:
parent
9f91c26752
commit
6d00cbbc2e
16 changed files with 382 additions and 20 deletions
|
|
@ -8,7 +8,7 @@ const MAX_NOTION_WORK_UNIT_CHARS = 40_000;
|
|||
export const NOTION_ORG_KNOWLEDGE_WARNING =
|
||||
'Anything accessible to this Notion integration can become organization knowledge.';
|
||||
const NOTION_SL_WRITE_GUIDANCE =
|
||||
'Write wiki entries with wiki_write. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
|
||||
'Write wiki entries with wiki_write. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
|
||||
|
||||
async function walk(root: string): Promise<string[]> {
|
||||
const entries = await readdir(root, { withFileTypes: true, recursive: true });
|
||||
|
|
@ -117,6 +117,8 @@ export async function chunkNotionStagedDir(stagedDir: string, diffSet?: DiffSet)
|
|||
reconcileNotes: [
|
||||
`Notion maxKnowledgeCreatesPerRun=${manifest.maxKnowledgeCreatesPerRun}`,
|
||||
`Notion maxKnowledgeUpdatesPerRun=${manifest.maxKnowledgeUpdatesPerRun}`,
|
||||
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
|
||||
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
|
||||
],
|
||||
contextReport: {
|
||||
capped: manifest.capped,
|
||||
|
|
|
|||
|
|
@ -84,6 +84,22 @@ describe('clusterNotionWorkUnits', () => {
|
|||
}
|
||||
});
|
||||
|
||||
test('merges pages into one synthesis unit at the clustering threshold', async () => {
|
||||
const pages = Array.from({ length: MIN_PAGES_TO_CLUSTER }, (_, i) => ({
|
||||
id: `p${i}`,
|
||||
title: `Customer source reference ${i}`,
|
||||
body: `Customer source reference maps to orbit_analytics.customer ${i}`.repeat(10),
|
||||
}));
|
||||
const stagedDir = await makeStaged(pages);
|
||||
const wus = makeWorkUnits(pages);
|
||||
const out = await clusterNotionWorkUnits({ workUnits: wus, stagedDir, embedding: mockEmbed });
|
||||
expect(out).toHaveLength(1);
|
||||
expect(out[0].unitKey).toBe('notion-cluster-1');
|
||||
expect(new Set(out[0].rawFiles)).toEqual(new Set(wus.flatMap((wu) => wu.rawFiles)));
|
||||
expect(out[0].notes).toContain('emit_unmapped_fallback');
|
||||
expect(out[0].notes).toContain('Do not create SL sources under the Notion connection');
|
||||
});
|
||||
|
||||
test('preserves coverage: every input rawFile appears in some cluster', async () => {
|
||||
const pages = Array.from({ length: 12 }, (_, i) => ({
|
||||
id: `p${i}`,
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ export const MIN_PAGES_TO_CLUSTER = 5;
|
|||
const CLUSTER_TEXT_BODY_CHARS = 1024;
|
||||
const CLUSTER_SEED = 42;
|
||||
const NOTION_CLUSTER_SL_WRITE_GUIDANCE =
|
||||
'Write wiki entries directly with wiki_write. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
|
||||
'Write wiki entries directly with wiki_write. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.';
|
||||
|
||||
interface ClusterNotionWorkUnitsArgs {
|
||||
workUnits: WorkUnit[];
|
||||
|
|
@ -74,7 +74,7 @@ export async function clusterNotionWorkUnits(args: ClusterNotionWorkUnitsArgs):
|
|||
const { workUnits, stagedDir, embedding } = args;
|
||||
if (workUnits.length < MIN_PAGES_TO_CLUSTER) return workUnits;
|
||||
const k = pickK(workUnits.length);
|
||||
if (k <= 1) return workUnits;
|
||||
if (k <= 1) return [mergeWorkUnits(workUnits, 0)];
|
||||
const texts = await Promise.all(workUnits.map((wu) => buildClusterText(wu, stagedDir)));
|
||||
let vectors: number[][];
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -247,6 +247,8 @@ describe('NotionSourceAdapter', () => {
|
|||
expect(result.reconcileNotes).toEqual([
|
||||
'Notion maxKnowledgeCreatesPerRun=25',
|
||||
'Notion maxKnowledgeUpdatesPerRun=20',
|
||||
'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.',
|
||||
'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.',
|
||||
]);
|
||||
expect(result.contextReport).toEqual({ capped: false, warnings: [NOTION_ORG_KNOWLEDGE_WARNING] });
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { buildReconcileSystemPrompt, buildReconcileToolSet } from './build-reconcile-context.js';
|
||||
import { buildReconcileSystemPrompt, buildReconcileToolSet, buildReconcileUserPrompt } from './build-reconcile-context.js';
|
||||
|
||||
describe('buildReconcileSystemPrompt', () => {
|
||||
it('appends canonical pins when relevant pins are supplied', () => {
|
||||
|
|
@ -39,6 +39,40 @@ describe('buildReconcileSystemPrompt', () => {
|
|||
});
|
||||
});
|
||||
|
||||
describe('buildReconcileUserPrompt', () => {
|
||||
it('includes action details so reconciliation can compare different keys for the same table', () => {
|
||||
const prompt = buildReconcileUserPrompt(
|
||||
{
|
||||
jobId: 'j1',
|
||||
connectionId: 'notion',
|
||||
workUnits: [
|
||||
{
|
||||
unitKey: 'notion-a',
|
||||
rawFiles: ['pages/a/page.md'],
|
||||
status: 'success',
|
||||
actions: [
|
||||
{
|
||||
target: 'wiki',
|
||||
type: 'created',
|
||||
key: 'orbit-customer-source-reference',
|
||||
detail: 'tables: orbit_analytics.customer',
|
||||
},
|
||||
],
|
||||
touchedSlSources: [],
|
||||
},
|
||||
],
|
||||
conflictsResolved: [],
|
||||
evictionsApplied: [],
|
||||
unmappedFallbacks: [],
|
||||
},
|
||||
undefined,
|
||||
);
|
||||
|
||||
expect(prompt).toContain('orbit-customer-source-reference');
|
||||
expect(prompt).toContain('tables: orbit_analytics.customer');
|
||||
});
|
||||
});
|
||||
|
||||
describe('buildReconcileToolSet', () => {
|
||||
it('includes emit_unmapped_fallback with the reconciliation tools', () => {
|
||||
const toolSet = buildReconcileToolSet({
|
||||
|
|
|
|||
|
|
@ -104,6 +104,10 @@ function curatorPassStateSummary(runState?: ReconcilePromptRunState): string {
|
|||
].join('\n');
|
||||
}
|
||||
|
||||
/**
 * Normalizes a staged action's free-form detail text to a single line.
 *
 * Leading and trailing whitespace is removed, and every internal run of
 * whitespace (spaces, tabs, line breaks) is collapsed to one space, so the
 * detail can be appended to a one-line `- target:type key; detail: …` entry.
 *
 * @param detail - Raw detail text recorded for a staged action.
 * @returns The single-line detail; an empty string when `detail` is empty or
 *   contains only whitespace.
 */
function formatStageActionDetail(detail: string): string {
  return detail.trim().replace(/\s+/g, ' ');
}
|
||||
|
||||
export function buildReconcileUserPrompt(
|
||||
stageIndex: StageIndex,
|
||||
ev: EvictionUnit | undefined,
|
||||
|
|
@ -119,7 +123,14 @@ export function buildReconcileUserPrompt(
|
|||
const actions =
|
||||
wu.actions.length === 0
|
||||
? ' actions: (none)'
|
||||
: wu.actions.map((a) => ` - ${a.target}:${a.type} ${a.key}`).join('\n');
|
||||
: wu.actions
|
||||
.map((a) => {
|
||||
const detail = formatStageActionDetail(a.detail);
|
||||
return detail.length > 0
|
||||
? ` - ${a.target}:${a.type} ${a.key}; detail: ${detail}`
|
||||
: ` - ${a.target}:${a.type} ${a.key}`;
|
||||
})
|
||||
.join('\n');
|
||||
return `- unitKey: ${wu.unitKey} (status=${wu.status})\n${actions}`;
|
||||
})
|
||||
.join('\n');
|
||||
|
|
|
|||
|
|
@ -19,7 +19,14 @@ describe('stage_list tool', () => {
|
|||
unitKey: 'u2',
|
||||
rawFiles: ['b.yml'],
|
||||
status: 'success',
|
||||
actions: [{ target: 'wiki', type: 'created', key: 'page_b', detail: '' }],
|
||||
actions: [
|
||||
{
|
||||
target: 'wiki',
|
||||
type: 'created',
|
||||
key: 'page_b',
|
||||
detail: 'tables: orbit_analytics.customer',
|
||||
},
|
||||
],
|
||||
touchedSlSources: [],
|
||||
},
|
||||
],
|
||||
|
|
@ -36,6 +43,7 @@ describe('stage_list tool', () => {
|
|||
expect(out).toContain('src_a');
|
||||
expect(out).toContain('u2');
|
||||
expect(out).toContain('page_b');
|
||||
expect(out).toContain('tables: orbit_analytics.customer');
|
||||
});
|
||||
|
||||
it('says empty when no writes', async () => {
|
||||
|
|
|
|||
|
|
@ -6,6 +6,10 @@ export interface StageListDeps {
|
|||
stageIndex: StageIndex;
|
||||
}
|
||||
|
||||
/**
 * Flattens an action's detail text so it fits on one line of the stage listing.
 *
 * Surrounding whitespace is trimmed and each internal whitespace run
 * (including newlines and tabs) becomes a single space.
 *
 * @param detail - Raw detail text attached to a staged action.
 * @returns The flattened detail, or an empty string when there is nothing but
 *   whitespace.
 */
function formatActionDetail(detail: string): string {
  return detail.trim().replace(/\s+/g, ' ');
}
|
||||
|
||||
export function createStageListTool(deps: StageListDeps) {
|
||||
return tool({
|
||||
description:
|
||||
|
|
@ -20,7 +24,14 @@ export function createStageListTool(deps: StageListDeps) {
|
|||
const actions =
|
||||
wu.actions.length === 0
|
||||
? ' (no actions)'
|
||||
: wu.actions.map((a) => ` - ${a.target}:${a.type} ${a.key}`).join('\n');
|
||||
: wu.actions
|
||||
.map((a) => {
|
||||
const detail = formatActionDetail(a.detail);
|
||||
return detail.length > 0
|
||||
? ` - ${a.target}:${a.type} ${a.key}; detail: ${detail}`
|
||||
: ` - ${a.target}:${a.type} ${a.key}`;
|
||||
})
|
||||
.join('\n');
|
||||
return `- unitKey: ${wu.unitKey} (status=${wu.status})\n rawFiles: ${wu.rawFiles.join(', ') || '(none)'}\n actions:\n${actions}`;
|
||||
})
|
||||
.join('\n');
|
||||
|
|
|
|||
|
|
@ -90,6 +90,25 @@ describe('memory runtime assets', () => {
|
|||
expect(body).not.toContain('a standalone SL source only when raw evidence contains enough table or SQL structure');
|
||||
});
|
||||
|
||||
it('ships Metabase guidance that avoids invalid joins for SQL-only card outputs', async () => {
|
||||
const body = await readFile(join(skillsDir, 'metabase_ingest', 'SKILL.md'), 'utf-8');
|
||||
|
||||
expect(body).toContain('Do not declare a KTX join just because the card SQL joins that table internally');
|
||||
expect(body).toContain('only when the card output exposes a local key that matches the target source grain');
|
||||
expect(body).toContain('If `sl_discover` resolves the table, it is not outside the manifest');
|
||||
expect(body).toContain('reason: "parse_error"');
|
||||
expect(body).not.toContain('Tables outside the manifest');
|
||||
expect(body).not.toContain('reason: "metabase_sql_untranslated"');
|
||||
});
|
||||
|
||||
it('ships Notion guidance for physical-table fallbacks and duplicate wiki reconciliation', async () => {
|
||||
const body = await readFile(join(skillsDir, 'notion_synthesize', 'SKILL.md'), 'utf-8');
|
||||
|
||||
expect(body).toContain('Notion `dataSourceCount` counts Notion databases/data sources only');
|
||||
expect(body).toContain('Search existing wiki pages for the same `tables:` or `sl_refs:` frontmatter');
|
||||
expect(body).toContain('no_physical_table');
|
||||
});
|
||||
|
||||
it('packages LookML connection-mismatch SL gate guidance', async () => {
|
||||
const body = await readFile(join(skillsDir, 'lookml_ingest', 'SKILL.md'), 'utf-8');
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue