diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 555d7fc0..8908b532 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,14 +33,6 @@ repos: name: ruff format (python) files: ^python/ - - repo: local - hooks: - - id: ktx-package-checks - name: ktx package checks - entry: node scripts/precommit-check.mjs - language: system - files: ^(packages/|scripts/|python/|package\.json$|pnpm-lock\.yaml$|pnpm-workspace\.yaml$|release-policy\.json$|tsconfig\.base\.json$|pyproject\.toml$|uv\.lock$|uv\.toml$) - - repo: https://github.com/Yelp/detect-secrets rev: v1.5.0 hooks: diff --git a/packages/cli/src/agent.test.ts b/packages/cli/src/agent.test.ts index 043bdddb..d72814bf 100644 --- a/packages/cli/src/agent.test.ts +++ b/packages/cli/src/agent.test.ts @@ -185,8 +185,8 @@ describe('runKtxAgent', () => { search: vi.fn(async () => ({ results: [ { - key: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + key: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', scope: 'GLOBAL' as const, summary: 'Revenue metric definition', score: 0.02459016393442623, @@ -207,8 +207,8 @@ describe('runKtxAgent', () => { expect(JSON.parse(io.stdout())).toEqual({ results: [ expect.objectContaining({ - key: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + key: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', matchReasons: ['lexical', 'token'], }), ], diff --git a/packages/cli/src/commands/connection-notion.test.ts b/packages/cli/src/commands/connection-notion.test.ts index b03484d4..3315e1cc 100644 --- a/packages/cli/src/commands/connection-notion.test.ts +++ b/packages/cli/src/commands/connection-notion.test.ts @@ -355,6 +355,53 @@ describe('runKtxConnectionNotion', () => { expect(io.stdout()).toContain('rootPageIds: 1'); }); + it('uses inline Notion auth_token for interactive discovery', async () => { + const projectDir = join(tempDir, 'project'); + const initialized = await initKtxProject({ projectDir, projectName: 'warehouse' }); + await writeProjectConfig(projectDir, { + ...initialized.config, + connections: { + 'notion-main': { + driver: 'notion', + auth_token: 'ntn_inline_token', + crawl_mode: 'selected_roots', + root_page_ids: [PAGE_IDS.engineering], + root_database_ids: [], + root_data_source_ids: [], + max_pages_per_run: 12, + max_knowledge_creates_per_run: 2, + max_knowledge_updates_per_run: 7, + last_successful_cursor: null, + }, + }, + }); + const api = fakeNotionApi([notionPage(PAGE_IDS.engineering, 'Engineering')]); + const createNotionApi = vi.fn((authToken: string) => { + expect(authToken).toBe('ntn_inline_token'); + return api; + }); + const io = makeIo(); + + await expect( + runKtxConnectionNotion( + { + command: 'pick', + projectDir, + connectionId: 'notion-main', + mode: 'interactive', + }, + io.io, + { + createNotionApi, + renderPicker: vi.fn(async (): Promise => ({ kind: 'quit' })), + }, + ), + ).resolves.toBe(0); + + expect(createNotionApi).toHaveBeenCalledOnce(); + expect(io.stdout()).toContain('No changes saved.'); + }); + it('passes partial-discovery warnings into the TUI banner state', async () => { const projectDir = join(tempDir, 'project'); const initialized = await initKtxProject({ projectDir, projectName: 'warehouse' }); diff --git a/packages/cli/src/commands/connection-notion.ts b/packages/cli/src/commands/connection-notion.ts index ecdf0f36..e0f68c0b 100644 --- a/packages/cli/src/commands/connection-notion.ts +++ b/packages/cli/src/commands/connection-notion.ts @@ -1,4 +1,4 @@ -import { parseNotionConnectionConfig, resolveNotionAuthToken } from '@ktx/context/connections'; +import { parseNotionConnectionConfig, resolveNotionConnectionAuthToken } from '@ktx/context/connections'; import { type NotionApi, type NotionBotInfo, NotionClient } from '@ktx/context/ingest'; import { type KtxLocalProject, @@ -223,7 +223,7 @@ export async function runKtxConnectionNotion( const project = await loadProject({ projectDir: args.projectDir }); const rawConnection = notionConnection(project, args.connectionId); const notion = parseNotionConnectionConfig(rawConnection); - const authToken = await resolveNotionAuthToken(notion.auth_token_ref, { env: deps.env }); + const authToken = await resolveNotionConnectionAuthToken(notion, { env: deps.env }); const api = deps.createNotionApi ? deps.createNotionApi(authToken) : new NotionClient(authToken); const discovery = await discoverNotionPickerPages(api); const tree = buildPickerTree(discovery.pages); diff --git a/packages/cli/src/context-build-view.test.ts b/packages/cli/src/context-build-view.test.ts index d7069578..e0132d33 100644 --- a/packages/cli/src/context-build-view.test.ts +++ b/packages/cli/src/context-build-view.test.ts @@ -267,10 +267,11 @@ describe('renderContextBuildView', () => { { connectionId: 'warehouse', driver: 'postgres', operation: 'scan', debugCommand: '', steps: ['scan'] }, ]); state.primarySources[0].status = 'failed'; + state.primarySources[0].failureText = 'KTX lost its connection to PostgreSQL while scanning warehouse.'; const output = renderContextBuildView(state, { styled: false }); expect(output).toContain('✗'); - expect(output).toContain('failed'); + expect(output).toContain('KTX lost its connection to PostgreSQL while scanning warehouse.'); }); it('omits empty groups', () => { @@ -327,8 +328,19 @@ describe('createRepainter', () => { repainter.paint('hello'); repainter.paint('bye'); - expect(io.stdout()).toContain('\rbye'); - expect(io.stdout()).not.toContain('\u001b[1A\rbye'); + expect(io.stdout()).toContain('bye'); + expect(io.stdout()).not.toMatch(/\[\d+A/); + }); + + it('does not undershoot cursor-up when a line is exactly the terminal width', () => { + const io = makeIo({ isTTY: true, columns: 10 }); + const repainter = createRepainter(io.io); + + repainter.paint('0123456789\nsecond\n'); + repainter.paint('0123456789\nsecond\n'); + + const cursorMoves = [...io.stdout().matchAll(/\[(\d+)A/g)].map((m) => Number(m[1])); + expect(cursorMoves).toEqual([2]); }); }); @@ -373,6 +385,52 @@ describe('runContextBuild', () => { expect(result).toEqual({ exitCode: 1, detached: false }); }); + it('renders a friendly network failure when target output contains a network error code', async () => { + const io = makeIo(); + const project = projectWithConnections({ + warehouse: { driver: 'postgres' }, + }); + const executeTarget = vi.fn(async (target, _args, targetIo) => { + targetIo.stderr.write('Error: read EADDRNOTAVAIL\n'); + return failedResult(target.connectionId, target.driver, target.operation); + }); + + const result = await runContextBuild( + project, + { projectDir: '/tmp/project', inputMode: 'disabled' }, + io.io, + { executeTarget, now: () => 1000 }, + ); + + expect(result).toEqual({ exitCode: 1, detached: false }); + expect(io.stdout()).toContain('KTX lost its connection to PostgreSQL while scanning warehouse.'); + expect(io.stdout()).toContain('network address unavailable (EADDRNOTAVAIL)'); + expect(io.stdout()).toContain('Retry: ktx setup --project-dir /tmp/project'); + expect(io.stdout()).not.toContain('BoundPool'); + }); + + it('renders a friendly network failure when target execution throws', async () => { + const io = makeIo(); + const project = projectWithConnections({ + warehouse: { driver: 'postgres' }, + }); + const error = Object.assign(new Error('read ECONNRESET'), { code: 'ECONNRESET' }); + const executeTarget = vi.fn(async () => { + throw error; + }); + + const result = await runContextBuild( + project, + { projectDir: '/tmp/project', inputMode: 'disabled' }, + io.io, + { executeTarget, now: () => 1000 }, + ); + + expect(result).toEqual({ exitCode: 1, detached: false }); + expect(io.stdout()).toContain('KTX lost its connection to PostgreSQL while scanning warehouse.'); + expect(io.stdout()).toContain('connection reset (ECONNRESET)'); + }); + it('renders final view for non-TTY output', async () => { const io = makeIo(); const project = projectWithConnections({ @@ -529,6 +587,36 @@ describe('runContextBuild', () => { ], }); }); + + it('returns report IDs parsed from failed source-ingest target output', async () => { + const io = makeIo(); + const project = projectWithConnections({ + warehouse: { driver: 'postgres' }, + dbt_main: { driver: 'dbt' }, + }); + const executeTarget = vi.fn(async (target, _args, targetIo) => { + if (target.operation === 'scan') { + return successResult(target.connectionId, target.driver, target.operation); + } + + targetIo.stdout.write('Report: report-dbt-failed\n'); + targetIo.stdout.write('Work units: 3\n'); + return failedResult(target.connectionId, target.driver, target.operation); + }); + + const result = await runContextBuild( + project, + { projectDir: '/tmp/project', inputMode: 'disabled' }, + io.io, + { executeTarget, now: () => 1000 }, + ); + + expect(result).toMatchObject({ + exitCode: 1, + detached: false, + reportIds: ['report-dbt-failed'], + }); + }); }); describe('viewStateFromSourceProgress', () => { diff --git a/packages/cli/src/context-build-view.ts b/packages/cli/src/context-build-view.ts index 571c71dd..6b706ae7 100644 --- a/packages/cli/src/context-build-view.ts +++ b/packages/cli/src/context-build-view.ts @@ -22,6 +22,7 @@ export interface ContextBuildTargetState { status: 'queued' | 'running' | 'done' | 'failed'; detailLine: string | null; summaryText: string | null; + failureText: string | null; startedAt: number | null; elapsedMs: number; } @@ -133,7 +134,8 @@ function targetDetail(target: ContextBuildTargetState, styled: boolean): string return parts.join(' · '); } if (target.status === 'failed') { - return styled ? red('failed') : 'failed'; + const failureText = target.failureText ?? 'failed'; + return styled ? red(failureText) : failureText; } if (target.status === 'running') { const percent = extractPercent(target.detailLine); @@ -327,6 +329,7 @@ export function viewStateFromSourceProgress( status: s.status, detailLine: null, summaryText: s.summaryText ?? null, + failureText: null, startedAt: s.startedAtMs ?? null, elapsedMs: s.status === 'running' && s.startedAtMs ? now - s.startedAtMs : (s.elapsedMs ?? 0), }); @@ -378,7 +381,8 @@ export function createRepainter(io: KtxCliIo) { } io.stdout.write('\r'); } - io.stdout.write(content.replaceAll('\n', `${ESC}[K\n`)); + io.stdout.write(`${ESC}[2K`); + io.stdout.write(content.replaceAll('\n', `\n${ESC}[2K`)); io.stdout.write(`${ESC}[J`); hasPainted = true; lastCursorUpRows = cursorUpRowsAfterWrite(content); @@ -440,7 +444,83 @@ export function defaultSetupKeystroke(onDetach: () => void, onCtrlC: () => void) // --- Orchestration --- function makeTargetState(target: KtxPublicIngestPlanTarget): ContextBuildTargetState { - return { target, status: 'queued', detailLine: null, summaryText: null, startedAt: null, elapsedMs: 0 }; + return { + target, + status: 'queued', + detailLine: null, + summaryText: null, + failureText: null, + startedAt: null, + elapsedMs: 0, + }; +} + +const NETWORK_ERROR_REASONS: Record = { + EADDRNOTAVAIL: 'network address unavailable', + ECONNRESET: 'connection reset', + ECONNREFUSED: 'connection refused', + ENETUNREACH: 'network unreachable', + ENOTFOUND: 'host not found', + ETIMEDOUT: 'connection timed out', + EHOSTUNREACH: 'host unreachable', +}; + +function unknownErrorMessage(error: unknown): string { + return error instanceof Error ? error.message : String(error); +} + +function networkErrorCodeFromText(text: string): string | null { + for (const code of Object.keys(NETWORK_ERROR_REASONS)) { + if (new RegExp(`\\b${code}\\b`).test(text)) { + return code; + } + } + return null; +} + +function networkErrorCode(error: unknown, capturedOutput = ''): string | null { + const directCode = typeof (error as { code?: unknown })?.code === 'string' + ? (error as { code: string }).code + : null; + if (directCode && NETWORK_ERROR_REASONS[directCode]) { + return directCode; + } + return networkErrorCodeFromText(`${unknownErrorMessage(error)}\n${capturedOutput}`); +} + +function friendlyDriverName(driver: string): string { + const normalized = driver.toLowerCase(); + if (normalized === 'postgres' || normalized === 'postgresql') return 'PostgreSQL'; + if (normalized === 'mysql') return 'MySQL'; + if (normalized === 'sqlserver') return 'SQL Server'; + if (normalized === 'bigquery') return 'BigQuery'; + if (normalized === 'snowflake') return 'Snowflake'; + if (normalized === 'clickhouse') return 'ClickHouse'; + if (normalized === 'sqlite') return 'SQLite'; + return driver || 'the source'; +} + +function failedStepDetail(result: KtxPublicIngestTargetResult): string | null { + return result.steps.find((step) => step.status === 'failed')?.detail ?? null; +} + +function failureTextForTarget(input: { + target: KtxPublicIngestPlanTarget; + projectDir: string; + capturedOutput?: string; + error?: unknown; + fallback?: string | null; +}): string { + const code = networkErrorCode(input.error, input.capturedOutput); + if (code) { + const operation = input.target.operation === 'scan' ? 'scanning' : 'ingesting'; + return [ + `KTX lost its connection to ${friendlyDriverName(input.target.driver)} while ${operation} ${input.target.connectionId}.`, + `Reason: ${NETWORK_ERROR_REASONS[code]} (${code}).`, + `Retry: ${resumeCommand(input.projectDir)}`, + ].join(' '); + } + return input.fallback ?? `${input.target.connectionId} failed.`; } export function initViewState(targets: KtxPublicIngestPlanTarget[]): ContextBuildViewState { @@ -493,6 +573,7 @@ export async function runContextBuild( const artifactPaths = new Set(); let detached = false; + let exiting = false; let cleanupKeystroke: (() => void) | null = null; if (isTTY || deps.setupKeystroke) { @@ -502,6 +583,7 @@ export async function runContextBuild( }; cleanupKeystroke = (deps.setupKeystroke ?? defaultSetupKeystroke)( () => { + detached = true; cleanup(); deps.onDetach?.(); const bg = spawnBackgroundBuild(args.projectDir); @@ -509,12 +591,14 @@ export async function runContextBuild( if (bg) io.stdout.write(`Log: ${bg.logPath}\n`); io.stdout.write(`Resume: ${resumeCommand(args.projectDir)}\n`); io.stdout.write(`Status: ktx setup context status --project-dir ${resolve(args.projectDir)}\n`); + exiting = true; process.exit(0); }, () => { cleanup(); io.stdout.write('\n\nContext build stopped. Nothing is running in the background.\n'); io.stdout.write(`Resume: ${resumeCommand(args.projectDir)}\n`); + exiting = true; process.exit(130); }, ); @@ -548,21 +632,38 @@ export async function runContextBuild( false, ); - const result = await execTarget(targetState.target, runArgs, capture.io, {}); + let result: KtxPublicIngestTargetResult | null = null; + let thrownError: unknown = null; + try { + result = await execTarget(targetState.target, runArgs, capture.io, {}); + } catch (error) { + if (exiting) { + throw error; + } + thrownError = error; + } targetState.elapsedMs = nowFn() - (targetState.startedAt ?? nowFn()); - const failed = result.steps.some((s) => s.status === 'failed'); + const failed = thrownError !== null || result?.steps.some((s) => s.status === 'failed') === true; targetState.status = failed ? 'failed' : 'done'; targetState.detailLine = null; + const capturedOutput = capture.captured(); + const metadata = collectOutputMetadata(capturedOutput, targetState.target.operation); + for (const reportId of metadata.reportIds) reportIds.add(reportId); + for (const artifactPath of metadata.artifactPaths) artifactPaths.add(artifactPath); if (!failed) { - const capturedOutput = capture.captured(); - const metadata = collectOutputMetadata(capturedOutput, targetState.target.operation); - for (const reportId of metadata.reportIds) reportIds.add(reportId); - for (const artifactPath of metadata.artifactPaths) artifactPaths.add(artifactPath); targetState.summaryText = targetState.target.operation === 'scan' ? parseScanSummary(capturedOutput) : parseIngestSummary(capturedOutput); + } else { + targetState.failureText = failureTextForTarget({ + target: targetState.target, + projectDir: args.projectDir, + capturedOutput, + error: thrownError, + fallback: result ? failedStepDetail(result) : null, + }); } if (failed) hasFailure = true; diff --git a/packages/cli/src/demo-seeded-inspect.test.ts b/packages/cli/src/demo-seeded-inspect.test.ts index a45415bd..a2aa3c16 100644 --- a/packages/cli/src/demo-seeded-inspect.test.ts +++ b/packages/cli/src/demo-seeded-inspect.test.ts @@ -4,7 +4,7 @@ import { join } from 'node:path'; import { afterEach, describe, expect, it } from 'vitest'; import { runDemoSeeded } from './demo-seeded.js'; import { formatSeededInspect, inspectSeededProject } from './demo-seeded-inspect.js'; -import { KTX_NEXT_STEP_COMMANDS } from './next-steps.js'; +import { KTX_NEXT_STEP_DIRECT_COMMANDS } from './next-steps.js'; describe('seeded demo inspect contract', () => { const projectDir = join(tmpdir(), `ktx-demo-seeded-inspect-${process.pid}`); @@ -59,7 +59,7 @@ describe('seeded demo inspect contract', () => { reports: { primaryPath: 'reports/seeded-demo-report.json', fileCount: 1 }, replays: { primaryPath: 'replays/replay.memory-flow.v1.json', latestPath: 'replays/latest.memory-flow.v1.json' }, }, - nextCommands: KTX_NEXT_STEP_COMMANDS, + nextCommands: KTX_NEXT_STEP_DIRECT_COMMANDS, }); expect(inspect.generatedOutputs.replays.fileCount).toBeGreaterThanOrEqual(3); @@ -91,10 +91,7 @@ describe('seeded demo inspect contract', () => { expect(output).toContain('Latest replay: seeded (packaged, prebuilt)'); expect(output).toContain(' $ ktx agent tools --json'); expect(output).toContain(' $ ktx agent context --json'); - expect(output).toContain(' $ ktx serve --mcp stdio --user-id local'); - expect(output.indexOf('ktx agent tools --json')).toBeLessThan( - output.indexOf('ktx serve --mcp stdio --user-id local'), - ); + expect(output).not.toContain('ktx serve --mcp stdio --user-id local'); expect(output).not.toContain('ktx ask'); expect(output).not.toContain('deterministic mode'); }); diff --git a/packages/cli/src/demo-seeded-inspect.ts b/packages/cli/src/demo-seeded-inspect.ts index 13320890..929d8702 100644 --- a/packages/cli/src/demo-seeded-inspect.ts +++ b/packages/cli/src/demo-seeded-inspect.ts @@ -4,7 +4,7 @@ import { join, resolve } from 'node:path'; import type { MemoryFlowReplayInput } from '@ktx/context/ingest/memory-flow'; import { loadPackagedDemoReplay } from './demo-assets.js'; import { DEMO_LATEST_REPLAY_FILE, loadLatestDemoReplay } from './demo-replay-store.js'; -import { KTX_NEXT_STEP_COMMANDS, KTX_NEXT_STEP_COMMAND_WIDTH } from './next-steps.js'; +import { KTX_NEXT_STEP_COMMAND_WIDTH, KTX_NEXT_STEP_DIRECT_COMMANDS } from './next-steps.js'; type SeededInspectReadiness = 'missing' | 'ready' | 'corrupt'; @@ -178,7 +178,7 @@ function sourceBundleFromManifest(manifest: DemoSeededManifest): SeededInspectSu } function nextCommands(): SeededInspectSummary['nextCommands'] { - return [...KTX_NEXT_STEP_COMMANDS]; + return [...KTX_NEXT_STEP_DIRECT_COMMANDS]; } function modeMetadataFromReplay(replay: MemoryFlowReplayInput | null): SeededInspectSummary['modeMetadata'] { diff --git a/packages/cli/src/demo.test.ts b/packages/cli/src/demo.test.ts index 0b053ee6..97d37f4e 100644 --- a/packages/cli/src/demo.test.ts +++ b/packages/cli/src/demo.test.ts @@ -8,7 +8,7 @@ import { DEMO_FULL_JOB_ID, defaultDemoProjectDir, ensureDemoProject } from './de import type { DemoFullResult } from './demo-full.js'; import { createTestDemoPromptAdapter } from './demo-interaction.js'; import type { renderMemoryFlowTui } from './memory-flow-tui.js'; -import { KTX_NEXT_STEP_COMMANDS } from './next-steps.js'; +import { KTX_NEXT_STEP_DIRECT_COMMANDS } from './next-steps.js'; import { resetVizFallbackWarningsForTest } from './viz-fallback.js'; const SEEDED_DEMO_SEMANTIC_SOURCE_COUNT = 46; @@ -208,7 +208,7 @@ describe('runKtxDemo', () => { expect(io.stdout()).toContain('Connection: orbit_demo'); expect(io.stdout()).toContain('ktx sl list'); expect(io.stdout()).toContain('ktx wiki list'); - expect(io.stdout()).toContain('ktx serve --mcp stdio --user-id local'); + expect(io.stdout()).not.toContain('ktx serve --mcp stdio --user-id local'); expect(io.stdout()).not.toContain('KTX memory flow'); expect(io.stderr()).toContain( 'Visualization requested but stdin raw mode is unavailable; printing plain output.', @@ -226,7 +226,7 @@ describe('runKtxDemo', () => { expect(testIo.stdout()).toContain('Connection: orbit_demo'); expect(testIo.stdout()).toContain('ktx sl list'); expect(testIo.stdout()).toContain('ktx wiki list'); - expect(testIo.stdout()).toContain('ktx serve --mcp stdio --user-id local'); + expect(testIo.stdout()).not.toContain('ktx serve --mcp stdio --user-id local'); expect(testIo.stdout()).not.toContain('KTX memory flow'); expect(testIo.stderr()).toContain( 'Visualization requested but stdout is not an interactive terminal; printing plain output.', @@ -287,7 +287,7 @@ describe('runKtxDemo', () => { expect(io.stdout()).toContain('LLM calls: none'); expect(io.stdout()).toContain('Your KTX project files are at:'); expect(io.stdout()).toContain(join(tmpdir(), 'ktx-demo-')); - expect(io.stdout()).toContain('ktx serve --mcp stdio'); + expect(io.stdout()).not.toContain('ktx serve --mcp stdio'); expect(io.stdout()).not.toContain(['ktx', 'mcp'].join(' ')); expect(io.stdout()).not.toContain('deterministic'); }); @@ -356,7 +356,7 @@ describe('runKtxDemo', () => { generatedContext: 'prebuilt from bundled assets', llmCalls: 'none', }, - nextCommands: KTX_NEXT_STEP_COMMANDS, + nextCommands: KTX_NEXT_STEP_DIRECT_COMMANDS, }); expect(parsed.generatedOutputs.replays.fileCount).toBeGreaterThanOrEqual(3); expect(jsonIo.stderr()).toBe(''); @@ -423,7 +423,7 @@ describe('runKtxDemo', () => { expect(testIo.stdout()).toContain('KTX finished ingesting your data'); expect(testIo.stdout()).toContain('ktx sl list'); expect(testIo.stdout()).toContain('ktx wiki list'); - expect(testIo.stdout()).toContain('ktx serve --mcp stdio --user-id local'); + expect(testIo.stdout()).not.toContain('ktx serve --mcp stdio --user-id local'); expect(testIo.stdout()).not.toContain(['ktx', 'ask'].join(' ')); expect(testIo.stdout()).not.toContain(['ktx', 'mcp'].join(' ')); }); diff --git a/packages/cli/src/index.test.ts b/packages/cli/src/index.test.ts index 855ff5fd..a84949b3 100644 --- a/packages/cli/src/index.test.ts +++ b/packages/cli/src/index.test.ts @@ -1171,7 +1171,7 @@ describe('runKtxCli', () => { projectDir: tempDir, inputMode: 'disabled', cliVersion: '0.0.0-private', - anthropicApiKeyEnv: 'ANTHROPIC_API_KEY', + anthropicApiKeyEnv: 'ANTHROPIC_API_KEY', // pragma: allowlist secret anthropicModel: 'claude-sonnet-4-6', skipLlm: false, }), @@ -1232,7 +1232,7 @@ describe('runKtxCli', () => { inputMode: 'disabled', skipLlm: true, embeddingBackend: 'openai', - embeddingApiKeyEnv: 'OPENAI_API_KEY', + embeddingApiKeyEnv: 'OPENAI_API_KEY', // pragma: allowlist secret skipEmbeddings: false, }), setupIo.io, @@ -1333,7 +1333,7 @@ describe('runKtxCli', () => { source: 'metabase', sourceConnectionId: 'prod_metabase', sourceUrl: 'https://metabase.example.com', - sourceApiKeyRef: 'env:METABASE_API_KEY', + sourceApiKeyRef: 'env:METABASE_API_KEY', // pragma: allowlist secret sourceWarehouseConnectionId: 'warehouse', metabaseDatabaseId: 1, }), @@ -1759,8 +1759,8 @@ describe('runKtxCli', () => { { results: [ { - key: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + key: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', scope: 'GLOBAL', summary: 'Revenue metric definition', score: 0.02459016393442623, @@ -1786,8 +1786,8 @@ describe('runKtxCli', () => { expect(JSON.parse(io.stdout())).toEqual({ results: [ expect.objectContaining({ - key: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + key: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', matchReasons: ['lexical', 'token'], }), ], diff --git a/packages/cli/src/ingest-viz.test.ts b/packages/cli/src/ingest-viz.test.ts index 6963d277..a37c1ed8 100644 --- a/packages/cli/src/ingest-viz.test.ts +++ b/packages/cli/src/ingest-viz.test.ts @@ -186,6 +186,91 @@ describe('runKtxIngest viz and replay', () => { expect(io.stdout()).toContain('Connection: warehouse'); }); + it('prints live viz final summaries as errors when the report has failed work units', async () => { + const projectDir = join(tempDir, 'project'); + await writeWarehouseConfig(projectDir); + const io = makeIo({ isTTY: true, stdinIsTTY: true, columns: 120 }); + const liveSession = { + update: vi.fn(), + close: vi.fn(), + isClosed: vi.fn(() => false), + }; + const startLiveMemoryFlow = vi.fn(async (_input: MemoryFlowReplayInput, _io: unknown) => liveSession); + const runLocal = vi.fn(async (input: RunLocalIngestOptions): Promise => { + input.memoryFlow?.emit({ type: 'source_acquired', adapter: 'notion', trigger: 'manual_resync', fileCount: 37 }); + input.memoryFlow?.update({ + syncId: 'sync-notion', + plannedWorkUnits: [ + { + unitKey: 'notion-cluster-1', + rawFiles: ['pages/a.md'], + peerFileCount: 0, + dependencyCount: 0, + }, + ], + }); + input.memoryFlow?.emit({ type: 'chunks_planned', chunkCount: 1, workUnitCount: 1, evictionCount: 0 }); + input.memoryFlow?.emit({ + type: 'work_unit_finished', + unitKey: 'notion-cluster-1', + status: 'failed', + reason: 'notion-cluster-1 failed: {"error":"invalid_grant","error_description":"reauth related error (invalid_rapt)"}', + }); + input.memoryFlow?.emit({ type: 'report_created', runId: 'live-failed' }); + input.memoryFlow?.finish('done'); + + const failedWorkUnit = { + ...localFakeBundleReport('live-failed').body.workUnits[0], + unitKey: 'notion-cluster-1', + rawFiles: ['pages/a.md'], + status: 'failed' as const, + reason: 'notion-cluster-1 failed: {"error":"invalid_grant","error_description":"reauth related error (invalid_rapt)"}', + actions: [], + touchedSlSources: [], + }; + const report = localFakeBundleReport('live-failed', { + id: 'report-live-failed', + runId: 'run-live-failed', + connectionId: input.connectionId, + sourceKey: input.adapter, + body: { + workUnits: [failedWorkUnit], + failedWorkUnits: [failedWorkUnit.unitKey], + }, + }); + return { + result: { + jobId: 'live-failed', + runId: report.runId, + syncId: report.body.syncId, + diffSummary: report.body.diffSummary, + workUnitCount: report.body.workUnits.length, + failedWorkUnits: report.body.failedWorkUnits, + artifactsWritten: report.body.provenanceRows.length, + commitSha: report.body.commitSha, + }, + report, + }; + }); + + await expect( + runKtxIngest( + { + command: 'run', + projectDir, + connectionId: 'notion-main', + adapter: 'notion', + outputMode: 'viz', + }, + io.io, + { runLocalIngest: runLocal, startLiveMemoryFlow }, + ), + ).resolves.toBe(1); + + expect(io.stdout()).toContain('Memory-flow summary: error'); + expect(io.stdout()).toContain('Notion authorization expired'); + }); + it('falls back to text live rendering when the TUI live session is unavailable', async () => { const projectDir = join(tempDir, 'project'); await writeWarehouseConfig(projectDir); diff --git a/packages/cli/src/ingest.ts b/packages/cli/src/ingest.ts index 3b5e2aa7..cf7a7aff 100644 --- a/packages/cli/src/ingest.ts +++ b/packages/cli/src/ingest.ts @@ -437,6 +437,21 @@ function initialRunMemoryFlowInput( }; } +function finalRunMemoryFlowInput(snapshot: MemoryFlowReplayInput, report: IngestReportSnapshot): MemoryFlowReplayInput { + const status = reportStatus(report); + return { + ...snapshot, + runId: report.runId, + connectionId: report.connectionId, + adapter: report.sourceKey, + status, + syncId: report.body.syncId, + reportId: report.id, + reportPath: report.id, + errors: status === 'error' ? report.body.failedWorkUnits : snapshot.errors, + }; +} + function managedDaemonOptionsForIngestRun( args: Extract, io: KtxIngestIo, @@ -592,7 +607,7 @@ export async function runKtxIngest( ...(memoryFlow ? { memoryFlow } : {}), }); if (shouldUseLiveViz && memoryFlow) { - latestMemoryFlowSnapshot = memoryFlow.snapshot(); + latestMemoryFlowSnapshot = finalRunMemoryFlowInput(memoryFlow.snapshot(), result.report); liveTui?.close(); liveTui = null; io.stdout.write(formatMemoryFlowFinalSummary(latestMemoryFlowSnapshot)); diff --git a/packages/cli/src/knowledge.test.ts b/packages/cli/src/knowledge.test.ts index 0e9ed1d5..d7e17605 100644 --- a/packages/cli/src/knowledge.test.ts +++ b/packages/cli/src/knowledge.test.ts @@ -61,7 +61,7 @@ describe('runKtxKnowledge', () => { { command: 'write', projectDir, - key: 'metrics/revenue', + key: 'metrics-revenue', scope: 'GLOBAL', userId: 'local', summary: 'Revenue', @@ -73,24 +73,53 @@ describe('runKtxKnowledge', () => { writeIo.io, ), ).resolves.toBe(0); - expect(writeIo.stdout()).toContain('Wrote knowledge/global/metrics/revenue.md'); + expect(writeIo.stdout()).toContain('Wrote knowledge/global/metrics-revenue.md'); const readIo = makeIo(); await expect( - runKtxKnowledge({ command: 'read', projectDir, key: 'metrics/revenue', userId: 'local' }, readIo.io), + runKtxKnowledge({ command: 'read', projectDir, key: 'metrics-revenue', userId: 'local' }, readIo.io), ).resolves.toBe(0); - expect(readIo.stdout()).toContain('# metrics/revenue'); + expect(readIo.stdout()).toContain('# metrics-revenue'); expect(readIo.stdout()).toContain('Revenue is paid order value.'); const listIo = makeIo(); await expect(runKtxKnowledge({ command: 'list', projectDir, userId: 'local' }, listIo.io)).resolves.toBe(0); - expect(listIo.stdout()).toContain('GLOBAL\tmetrics/revenue\tRevenue'); + expect(listIo.stdout()).toContain('GLOBAL\tmetrics-revenue\tRevenue'); const searchIo = makeIo(); await expect( runKtxKnowledge({ command: 'search', projectDir, query: 'paid order', userId: 'local' }, searchIo.io), ).resolves.toBe(0); - expect(searchIo.stdout()).toContain('metrics/revenue'); + expect(searchIo.stdout()).toContain('metrics-revenue'); + }); + + it('rejects slash-delimited write keys with a flat-key suggestion', async () => { + const projectDir = join(tempDir, 'project'); + await initKtxProject({ projectDir, projectName: 'warehouse' }); + + const writeIo = makeIo(); + await expect( + runKtxKnowledge( + { + command: 'write', + projectDir, + key: 'orbit/company-overview', + scope: 'GLOBAL', + userId: 'local', + summary: 'Orbit', + content: 'Orbit overview.', + tags: [], + refs: [], + slRefs: [], + }, + writeIo.io, + ), + ).resolves.toBe(1); + + expect(writeIo.stderr()).toContain( + 'Invalid wiki key "orbit/company-overview". Wiki keys must be flat; use "orbit-company-overview".', + ); + expect(writeIo.stdout()).toBe(''); }); it('explains empty search results for a project without wiki pages', async () => { @@ -116,7 +145,7 @@ describe('runKtxKnowledge', () => { { command: 'write', projectDir, - key: 'historic-sql/active-contract-arr-open-tickets', + key: 'active-contract-arr-open-tickets', scope: 'GLOBAL', userId: 'local', summary: 'Active Contract ARR Ranked by Open Support Ticket Count', @@ -138,7 +167,7 @@ describe('runKtxKnowledge', () => { ), ).resolves.toBe(0); - expect(searchIo.stdout()).toContain('historic-sql/active-contract-arr-open-tickets'); + expect(searchIo.stdout()).toContain('active-contract-arr-open-tickets'); expect(searchIo.stderr()).toBe(''); }); }); diff --git a/packages/cli/src/memory-flow-tui.test.tsx b/packages/cli/src/memory-flow-tui.test.tsx index 4326132a..83a46795 100644 --- a/packages/cli/src/memory-flow-tui.test.tsx +++ b/packages/cli/src/memory-flow-tui.test.tsx @@ -201,7 +201,7 @@ describe('MemoryFlowTuiApp', () => { expect(frame).toContain('KTX finished ingesting your data'); expect(frame).toContain('ktx sl list'); expect(frame).toContain('ktx wiki list'); - expect(frame).toContain('ktx serve --mcp stdio --user-id local'); + expect(frame).not.toContain('ktx serve --mcp stdio --user-id local'); expect(frame).not.toContain(['ktx', 'ask'].join(' ')); expect(frame).not.toContain(['ktx', 'mcp'].join(' ')); }); diff --git a/packages/cli/src/next-steps.test.ts b/packages/cli/src/next-steps.test.ts index 39b0e22d..39202c6d 100644 --- a/packages/cli/src/next-steps.test.ts +++ b/packages/cli/src/next-steps.test.ts @@ -66,11 +66,10 @@ describe('KTX demo next steps', () => { const rendered = formatNextStepLines().join('\n'); expect(rendered).toContain('KTX context is ready for agents.'); - expect(rendered).toContain('Preferred route: CLI + Skills'); - expect(rendered).toContain('no MCP server is required'); - expect(rendered).toContain('Direct CLI checks:'); - expect(rendered).toContain('Optional MCP:'); - expect(rendered).not.toContain('Ask your agent to use KTX'); + expect(rendered).toContain('ask a data question'); + expect(rendered).toContain('Verify with:'); + expect(rendered).not.toContain('Preferred route'); + expect(rendered).not.toContain('Optional MCP:'); }); it('does not advertise removed Commander migration commands', () => { @@ -80,7 +79,6 @@ describe('KTX demo next steps', () => { expect(rendered).toContain('ktx agent context --json'); expect(rendered).toContain('ktx sl list'); expect(rendered).toContain('ktx wiki list'); - expect(rendered).toContain('ktx serve --mcp stdio --user-id local'); for (const removed of [ command('ktx', 'ask'), @@ -91,6 +89,7 @@ describe('KTX demo next steps', () => { command('dev', 'knowledge'), command('ktx', 'ingest', 'run'), command('ktx', 'ingest', 'replay'), + command('ktx', 'serve', '--mcp', 'stdio', '--user-id', 'local'), ]) { expect(rendered).not.toContain(removed); } @@ -123,7 +122,7 @@ describe('KTX demo next steps', () => { expect(rendered).toContain('KTX context is ready for agents.'); expect(rendered).toContain('ktx agent context --json'); - expect(rendered).toContain('ktx serve --mcp stdio --user-id local'); + expect(rendered).not.toContain('ktx serve --mcp stdio --user-id local'); expect(rendered).not.toContain('Build KTX context next.'); }); }); diff --git a/packages/cli/src/next-steps.ts b/packages/cli/src/next-steps.ts index a07f5cea..18870ab8 100644 --- a/packages/cli/src/next-steps.ts +++ b/packages/cli/src/next-steps.ts @@ -58,12 +58,9 @@ function commandLines(commands: ReadonlyArray<{ command: string; description: st export function formatNextStepLines(indent = ' '): string[] { return [ - `${indent}KTX context is ready for agents.`, - `${indent}Preferred route: CLI + Skills; installed rules call the pinned local CLI directly, so no MCP server is required.`, - `${indent}Direct CLI checks:`, + `${indent}KTX context is ready for agents. Open your coding agent in this directory and ask a data question.`, + `${indent}Verify with:`, ...commandLines(KTX_NEXT_STEP_DIRECT_COMMANDS, indent), - `${indent}Optional MCP:`, - ...commandLines(KTX_NEXT_STEP_MCP_COMMANDS, indent), ]; } diff --git a/packages/cli/src/setup-agents.test.ts b/packages/cli/src/setup-agents.test.ts index d5ced403..a0971eef 100644 --- a/packages/cli/src/setup-agents.test.ts +++ b/packages/cli/src/setup-agents.test.ts @@ -239,11 +239,11 @@ describe('setup agents', () => { const output = io.stdout(); expect(output).toContain('Agent integration complete'); expect(output).toContain('Claude Code'); - expect(output).toContain('+ Skill installed'); + expect(output).toContain('+ Skill installed — teaches your agent which KTX commands to run'); expect(output).toContain('.claude/skills/ktx/SKILL.md'); - expect(output).toContain('+ Rule installed'); + expect(output).toContain('+ Rule installed — tells your agent when to use KTX'); expect(output).toContain('.claude/rules/ktx.md'); - expect(output).toContain('+ MCP config added'); + expect(output).toContain('+ MCP config added — lets your agent talk to KTX over MCP'); expect(output).toContain('.mcp.json'); }); @@ -258,9 +258,9 @@ describe('setup agents', () => { ); expect(summary).toContain('Cursor'); - expect(summary).toContain('+ Rule installed'); + expect(summary).toContain('+ Rule installed — tells your agent when to use KTX'); expect(summary).toContain('.cursor/rules/ktx.mdc'); - expect(summary).toContain('+ MCP config added'); + expect(summary).toContain('+ MCP config added — lets your agent talk to KTX over MCP'); expect(summary).toContain('.cursor/mcp.json'); expect(summary).not.toContain(tempDir); }); @@ -280,9 +280,9 @@ describe('setup agents', () => { ); expect(summary).toContain('Claude Code'); - expect(summary).toContain('+ Skill installed'); - expect(summary).toContain('+ Rule installed'); + expect(summary).toContain('+ Skill installed — teaches your agent which KTX commands to run'); + expect(summary).toContain('+ Rule installed — tells your agent when to use KTX'); expect(summary).toContain('Codex'); - expect(summary).toContain('+ MCP config added'); + expect(summary).toContain('+ MCP config added — lets your agent talk to KTX over MCP'); }); }); diff --git a/packages/cli/src/setup-agents.ts b/packages/cli/src/setup-agents.ts index 67394861..dad15e7e 100644 --- a/packages/cli/src/setup-agents.ts +++ b/packages/cli/src/setup-agents.ts @@ -348,6 +348,12 @@ export function formatInstallSummary( idx += planned.length; } + const fileHints: Record = { + skill: 'teaches your agent which KTX commands to run', + rule: 'tells your agent when to use KTX', + mcp: 'lets your agent talk to KTX over MCP', + }; + const lines: string[] = []; for (const install of installs) { const targetEntries = entriesByTarget.get(install.target) ?? []; @@ -356,11 +362,13 @@ export function formatInstallSummary( const displayPath = install.scope === 'global' ? entry.path : relative(projectDir, entry.path); if (entry.kind === 'file') { - const label = entry.role === 'rule' ? 'Rule installed' : fileEntryLabels[install.target]; - lines.push(` + ${label}`); + const isRule = entry.role === 'rule' || fileEntryLabels[install.target] === 'Rule installed'; + const label = isRule ? 'Rule installed' : fileEntryLabels[install.target]; + const hint = fileHints[isRule ? 'rule' : (entry.role ?? 'skill')] ?? ''; + lines.push(` + ${label} — ${hint}`); lines.push(` ${displayPath}`); } else { - lines.push(` + MCP config added`); + lines.push(` + MCP config added — ${fileHints.mcp}`); lines.push(` ${displayPath}`); } } diff --git a/packages/cli/src/setup-context.test.ts b/packages/cli/src/setup-context.test.ts index 0d803b7b..89de6a91 100644 --- a/packages/cli/src/setup-context.test.ts +++ b/packages/cli/src/setup-context.test.ts @@ -215,6 +215,47 @@ describe('setup context build state', () => { expect(io.stdout()).toContain('KTX context is ready for agents.'); }); + it('records only failed sources as retryable when the context build fails', async () => { + await writeReadyProject(tempDir); + const io = makeIo(); + const runContextBuildMock = vi.fn(async (_project, _args, _io, hooks) => { + hooks.onSourceProgress?.([ + { connectionId: 'warehouse', operation: 'scan', status: 'done', elapsedMs: 1000 }, + { connectionId: 'docs', operation: 'source-ingest', status: 'failed', elapsedMs: 2000 }, + ]); + return { + exitCode: 1, + detached: false, + reportIds: ['report-docs-failed'], + artifactPaths: ['raw-sources/docs/notion/sync-1/ingest-report.json'], + }; + }); + + await expect( + runKtxSetupContextStep( + { projectDir: tempDir, inputMode: 'disabled' }, + io.io, + { + runIdFactory: () => 'setup-context-local-failed', + now: () => new Date('2026-05-09T10:00:00.000Z'), + runContextBuild: runContextBuildMock, + }, + ), + ).resolves.toEqual({ status: 'failed', projectDir: tempDir }); + + await expect(readKtxSetupContextState(tempDir)).resolves.toMatchObject({ + runId: 'setup-context-local-failed', + status: 'failed', + reportIds: ['report-docs-failed'], + artifactPaths: ['raw-sources/docs/notion/sync-1/ingest-report.json'], + retryableFailedTargets: ['docs'], + sourceProgress: [ + { connectionId: 'warehouse', operation: 'scan', status: 'done', elapsedMs: 1000 }, + { connectionId: 'docs', operation: 'source-ingest', status: 'failed', elapsedMs: 2000 }, + ], + }); + }); + it('marks context complete without prompting when initial source ingest already made agent context', async () => { await writeReadyProject(tempDir); await mkdir(join(tempDir, 'semantic-layer', 'dbt-main'), { recursive: true }); diff --git a/packages/cli/src/setup-context.ts b/packages/cli/src/setup-context.ts index fc7a1aef..00928706 100644 --- a/packages/cli/src/setup-context.ts +++ b/packages/cli/src/setup-context.ts @@ -234,6 +234,24 @@ function normalizeSourceProgress(value: unknown): ContextBuildSourceProgressUpda return entries.length > 0 ? entries : undefined; } +function setupContextTargetIds(targets: KtxSetupContextTargets): string[] { + return [...new Set([...targets.primarySourceConnectionIds, ...targets.contextSourceConnectionIds])]; +} + +function retryableFailedTargetsFromProgress( + targets: KtxSetupContextTargets, + progress: ContextBuildSourceProgressUpdate[] | undefined, +): string[] { + const targetIds = setupContextTargetIds(targets); + if (!progress || progress.length === 0) { + return targetIds; + } + + const failedIds = new Set(progress.filter((source) => source.status === 'failed').map((source) => source.connectionId)); + const failedTargets = targetIds.filter((connectionId) => failedIds.has(connectionId)); + return failedTargets.length > 0 ? failedTargets : targetIds; +} + export async function readKtxSetupContextState(projectDir: string): Promise { const filePath = statePath(projectDir); if (!(await pathExists(filePath))) { @@ -614,7 +632,7 @@ async function runBuild( updatedAt, reportIds: completedReportIds, artifactPaths: completedArtifactPaths, - retryableFailedTargets: [...targets.primarySourceConnectionIds, ...targets.contextSourceConnectionIds], + retryableFailedTargets: retryableFailedTargetsFromProgress(targets, lastSourceProgress), failureReason: 'Context build failed.', ...(lastSourceProgress ? { sourceProgress: lastSourceProgress } : {}), }); diff --git a/packages/cli/src/setup-databases.test.ts b/packages/cli/src/setup-databases.test.ts index a20df910..2f5d93c6 100644 --- a/packages/cli/src/setup-databases.test.ts +++ b/packages/cli/src/setup-databases.test.ts @@ -532,8 +532,8 @@ describe('setup databases step', () => { expect(prompts.select).toHaveBeenCalledWith({ message: 'Primary sources already configured: warehouse\nWhat would you like to do?', options: [ + { value: 'continue', label: 'Continue to knowledge sources' }, { value: 'add', label: 'Add another primary source' }, - { value: 'continue', label: 'Continue setup' }, { value: 'back', label: 'Back' }, ], }); @@ -583,8 +583,8 @@ describe('setup databases step', () => { expect(prompts.select).toHaveBeenCalledWith({ message: 'Primary sources already configured: warehouse\nWhat would you like to do?', options: [ + { value: 'continue', label: 'Continue to knowledge sources' }, { value: 'add', label: 'Add another primary source' }, - { value: 'continue', label: 'Continue setup' }, { value: 'back', label: 'Back' }, ], }); @@ -618,8 +618,8 @@ describe('setup databases step', () => { expect(prompts.select).toHaveBeenCalledWith({ message: 'Primary sources already configured: postgres-warehouse\nWhat would you like to do?', options: [ + { value: 'continue', label: 'Continue to knowledge sources' }, { value: 'add', label: 'Add another primary source' }, - { value: 'continue', label: 'Continue setup' }, { value: 'back', label: 'Back' }, ], }); @@ -653,8 +653,8 @@ describe('setup databases step', () => { expect(prompts.select).toHaveBeenNthCalledWith(2, { message: 'Primary sources already configured: postgres-warehouse\nWhat would you like to do?', options: [ + { value: 'continue', label: 'Continue to knowledge sources' }, { value: 'add', label: 'Add another primary source' }, - { value: 'continue', label: 'Continue setup' }, { value: 'back', label: 'Back' }, ], }); @@ -696,8 +696,8 @@ describe('setup databases step', () => { expect(prompts.select).toHaveBeenNthCalledWith(2, { message: 'Primary sources already configured: warehouse\nWhat would you like to do?', options: [ + { value: 'continue', label: 'Continue to knowledge sources' }, { value: 'add', label: 'Add another primary source' }, - { value: 'continue', label: 'Continue setup' }, { value: 'back', label: 'Back' }, ], }); @@ -920,8 +920,18 @@ describe('setup databases step', () => { '│ ✓ Connection test passed', '│ Driver: PostgreSQL · Tables: 2', '│', + ].join('\n'), + ); + expect(io.stdout()).toContain( + [ '◇ Scanning postgres-warehouse', - '│ ✓ Structural scan completed', + '│ Running structural scan…', + '│', + ].join('\n'), + ); + expect(io.stdout()).toContain( + [ + '◇ Scan complete for postgres-warehouse', '│ Changes: 2 new tables', '│ Report: raw-sources/postgres-warehouse/live-database/.../scan-report.json', '│', @@ -1009,7 +1019,7 @@ describe('setup databases step', () => { expect(config.connections['postgres-warehouse']).toMatchObject({ schemas: ['orbit_analytics', 'orbit_raw'], }); - expect(io.stdout()).toContain('Schemas: orbit_analytics, orbit_raw'); + expect(io.stdout()).toContain('✓ orbit_analytics, orbit_raw'); }); it('auto-selects all discovered Postgres schemas in non-interactive setup', async () => { @@ -1045,7 +1055,7 @@ describe('setup databases step', () => { expect(config.connections.warehouse).toMatchObject({ schemas: ['orbit_analytics', 'orbit_raw', 'public'], }); - expect(io.stdout()).toContain('Schemas: orbit_analytics, orbit_raw, public'); + expect(io.stdout()).toContain('✓ orbit_analytics, orbit_raw, public'); }); it('adds one non-interactive Postgres URL connection, tests it, scans it, and marks databases complete', async () => { diff --git a/packages/cli/src/setup-databases.ts b/packages/cli/src/setup-databases.ts index 3d49f75b..113fd048 100644 --- a/packages/cli/src/setup-databases.ts +++ b/packages/cli/src/setup-databases.ts @@ -115,6 +115,56 @@ const DEFAULT_CONNECTION_IDS: Record = { snowflake: 'snowflake-warehouse', }; +interface ScopeDiscoverySpec { + noun: string; + nounPlural: string; + promptLabel: string; + configArrayField: string; + configSingleField: string; + defaultSelection: (values: string[]) => string[]; +} + +const SCOPE_DISCOVERY_SPECS: Partial> = { + postgres: { + noun: 'schema', + nounPlural: 'schemas', + promptLabel: 'PostgreSQL schemas', + configArrayField: 'schemas', + configSingleField: 'schema', + defaultSelection(schemas) { + const nonPublic = schemas.filter((s) => s !== 'public'); + return nonPublic.length > 0 ? nonPublic : schemas; + }, + }, + sqlserver: { + noun: 'schema', + nounPlural: 'schemas', + promptLabel: 'SQL Server schemas', + configArrayField: 'schemas', + configSingleField: 'schema', + defaultSelection: (schemas) => schemas, + }, + bigquery: { + noun: 'dataset', + nounPlural: 'datasets', + promptLabel: 'BigQuery datasets', + configArrayField: 'dataset_ids', + configSingleField: 'dataset_id', + defaultSelection: (datasets) => datasets, + }, + snowflake: { + noun: 'schema', + nounPlural: 'schemas', + promptLabel: 'Snowflake schemas', + configArrayField: 'schema_names', + configSingleField: 'schema_name', + defaultSelection(schemas) { + const nonPublic = schemas.filter((s) => s !== 'PUBLIC'); + return nonPublic.length > 0 ? nonPublic : schemas; + }, + }, +}; + type UrlDriverType = Extract; const DRIVER_CONNECTION_DEFAULTS: Record = { @@ -263,16 +313,53 @@ async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Pr async function defaultListSchemas(projectDir: string, connectionId: string): Promise { const project = await loadKtxProject({ projectDir }); const connection = project.config.connections[connectionId]; - const { KtxPostgresScanConnector, isKtxPostgresConnectionConfig } = await import('@ktx/connector-postgres'); - if (!isKtxPostgresConnectionConfig(connection)) { - return []; + const driver = normalizeDriver(connection?.driver); + + if (driver === 'postgres') { + const { KtxPostgresScanConnector, isKtxPostgresConnectionConfig } = await import('@ktx/connector-postgres'); + if (!isKtxPostgresConnectionConfig(connection)) return []; + const connector = new KtxPostgresScanConnector({ connectionId, connection }); + try { + return await connector.listSchemas(); + } finally { + await connector.cleanup(); + } } - const connector = new KtxPostgresScanConnector({ connectionId, connection }); - try { - return await connector.listSchemas(); - } finally { - await connector.cleanup(); + + if (driver === 'sqlserver') { + const { KtxSqlServerScanConnector, isKtxSqlServerConnectionConfig } = await import('@ktx/connector-sqlserver'); + if (!isKtxSqlServerConnectionConfig(connection)) return []; + const connector = new KtxSqlServerScanConnector({ connectionId, connection }); + try { + return await connector.listSchemas(); + } finally { + await connector.cleanup(); + } } + + if (driver === 'bigquery') { + const { KtxBigQueryScanConnector, isKtxBigQueryConnectionConfig } = await import('@ktx/connector-bigquery'); + if (!isKtxBigQueryConnectionConfig(connection)) return []; + const connector = new KtxBigQueryScanConnector({ connectionId, connection }); + try { + return await connector.listDatasets(); + } finally { + await connector.cleanup(); + } + } + + if (driver === 'snowflake') { + const { KtxSnowflakeScanConnector, isKtxSnowflakeConnectionConfig } = await import('@ktx/connector-snowflake'); + if (!isKtxSnowflakeConnectionConfig(connection)) return []; + const connector = new KtxSnowflakeScanConnector({ connectionId, connection }); + try { + return await connector.listSchemas(); + } finally { + await connector.cleanup(); + } + } + + return []; } function existingConnectionIdsByDriver( @@ -309,8 +396,8 @@ function configuredPrimarySourcesPrompt(connectionIds: string[]): { return { message: `Primary sources already configured: ${connectionIds.join(', ')}\nWhat would you like to do?`, options: [ + { value: 'continue', label: 'Continue to knowledge sources' }, { value: 'add', label: 'Add another primary source' }, - { value: 'continue', label: 'Continue setup' }, { value: 'back', label: 'Back' }, ], }; @@ -849,41 +936,44 @@ async function writeConnectionConfig(input: { } } -function configuredSchemas(connection: KtxProjectConnectionConfig | undefined): string[] { +function configuredScopeValues( + connection: KtxProjectConnectionConfig | undefined, + spec: ScopeDiscoverySpec, +): string[] { if (!connection) return []; - if (Array.isArray(connection.schemas)) { - return connection.schemas - .filter((schema): schema is string => typeof schema === 'string' && schema.trim().length > 0) - .map((schema) => schema.trim()); + const arrayVal = connection[spec.configArrayField]; + if (Array.isArray(arrayVal)) { + return arrayVal + .filter((v): v is string => typeof v === 'string' && v.trim().length > 0) + .map((v) => v.trim()); } - return typeof connection.schema === 'string' && connection.schema.trim().length > 0 ? [connection.schema.trim()] : []; + const singleVal = connection[spec.configSingleField]; + return typeof singleVal === 'string' && singleVal.trim().length > 0 ? [singleVal.trim()] : []; } -function defaultSchemaSelection(schemas: string[]): string[] { - const nonPublic = schemas.filter((schema) => schema !== 'public'); - return nonPublic.length > 0 ? nonPublic : schemas; -} - -async function writeConnectionSchemas(input: { +async function writeScopeConfig(input: { projectDir: string; connectionId: string; - schemas: string[]; + values: string[]; + spec: ScopeDiscoverySpec; }): Promise { const project = await loadKtxProject({ projectDir: input.projectDir }); const connection = project.config.connections[input.connectionId]; if (!connection) return; - const { schema: _schema, ...connectionWithoutLegacySchema } = connection; + const cleaned = Object.fromEntries( + Object.entries(connection).filter(([key]) => key !== input.spec.configSingleField), + ) as KtxProjectConnectionConfig; await writeConnectionConfig({ projectDir: input.projectDir, connectionId: input.connectionId, connection: { - ...connectionWithoutLegacySchema, - schemas: unique(input.schemas), + ...cleaned, + [input.spec.configArrayField]: unique(input.values), }, }); } -async function maybeConfigurePostgresSchemas(input: { +async function maybeConfigureSchemaScope(input: { projectDir: string; connectionId: string; args: KtxSetupDatabasesArgs; @@ -893,65 +983,78 @@ async function maybeConfigurePostgresSchemas(input: { }): Promise { const project = await loadKtxProject({ projectDir: input.projectDir }); const connection = project.config.connections[input.connectionId]; - if (normalizeDriver(connection?.driver) !== 'postgres') { - return true; - } + const driver = normalizeDriver(connection?.driver); + if (!driver) return true; - if (configuredSchemas(connection).length > 0) { + const spec = SCOPE_DISCOVERY_SPECS[driver]; + if (!spec) return true; + + const arrayVal = connection?.[spec.configArrayField]; + if (Array.isArray(arrayVal) && arrayVal.length > 0) { return true; } if (input.args.databaseSchemas.length > 0) { - await writeConnectionSchemas({ + await writeScopeConfig({ projectDir: input.projectDir, connectionId: input.connectionId, - schemas: input.args.databaseSchemas, + values: input.args.databaseSchemas, + spec, }); return true; } - let discoveredSchemas: string[]; + writeSetupSection(input.io, `Discovering ${spec.promptLabel.toLowerCase()}`, [ + `Connecting to ${input.connectionId}…`, + ]); + + let discovered: string[]; try { - discoveredSchemas = unique( + discovered = unique( await (input.deps.listSchemas ?? defaultListSchemas)(input.projectDir, input.connectionId), ); } catch (error) { input.io.stderr.write( - `Could not discover PostgreSQL schemas for ${input.connectionId}; continuing with existing schema scope. ` + + `Could not discover ${spec.promptLabel.toLowerCase()} for ${input.connectionId}; continuing with existing ${spec.noun} scope. ` + `Pass --database-schema to set it explicitly. ${error instanceof Error ? error.message : String(error)}\n`, ); return true; } - if (discoveredSchemas.length === 0) { + if (discovered.length === 0) { return true; } - let selectedSchemas: string[]; - if (input.args.inputMode === 'disabled' || discoveredSchemas.length === 1) { - selectedSchemas = discoveredSchemas; + let selected: string[]; + if (input.args.inputMode === 'disabled' || discovered.length === 1) { + const preconfigured = configuredScopeValues(connection, spec).filter((v) => discovered.includes(v)); + selected = preconfigured.length > 0 ? preconfigured : discovered; } else { - const initialValues = defaultSchemaSelection(discoveredSchemas); + const preconfigured = configuredScopeValues(connection, spec).filter((v) => discovered.includes(v)); + const initialValues = preconfigured.length > 0 ? preconfigured : spec.defaultSelection(discovered); const choices = await input.prompts.multiselect({ message: withMultiselectNavigation( - 'PostgreSQL schemas to scan\nKTX found multiple non-system schemas. Select every schema agents should use.', + `${spec.promptLabel} to scan\n` + + `KTX found multiple ${spec.nounPlural}. Select every ${spec.noun} agents should use.`, ), - options: discoveredSchemas.map((schema) => ({ value: schema, label: schema })), + options: discovered.map((v) => ({ value: v, label: v })), initialValues, required: true, }); if (choices.includes('back')) { return false; } - selectedSchemas = choices.length > 0 ? choices : initialValues; + selected = choices.length > 0 ? choices : initialValues; } - await writeConnectionSchemas({ + await writeScopeConfig({ projectDir: input.projectDir, connectionId: input.connectionId, - schemas: selectedSchemas, + values: selected, + spec, }); - writeSetupSection(input.io, `Selecting schemas for ${input.connectionId}`, [ - `Schemas: ${selectedSchemas.join(', ')}`, + const capitalNounPlural = spec.nounPlural[0]!.toUpperCase() + spec.nounPlural.slice(1); + writeSetupSection(input.io, `${capitalNounPlural} saved for ${input.connectionId}`, [ + `✓ ${selected.join(', ')}`, ]); return true; } @@ -1081,7 +1184,7 @@ async function validateAndScanConnection(input: { testLines.push(`Driver: ${driverDisplay}${Number.isFinite(tableCount) ? ` · Tables: ${tableCount}` : ''}`); writeSetupSection(input.io, `Testing ${input.connectionId}`, testLines); - if (!(await maybeConfigurePostgresSchemas(input))) { + if (!(await maybeConfigureSchemaScope(input))) { return false; } @@ -1091,6 +1194,9 @@ async function validateAndScanConnection(input: { io: input.io, deps: input.deps, }); + writeSetupSection(input.io, `Scanning ${input.connectionId}`, [ + 'Running structural scan…', + ]); const scanIo = createBufferedCommandIo(); const scanCode = await scanConnection(input.projectDir, input.connectionId, scanIo); if (scanCode !== 0) { @@ -1103,9 +1209,8 @@ async function validateAndScanConnection(input: { const reportPath = readOutputValue(scanOutput, 'Report'); writeSetupSection( input.io, - `Scanning ${input.connectionId}`, + `Scan complete for ${input.connectionId}`, [ - '✓ Structural scan completed', `Changes: ${summarizeScanChanges(scanOutput)}`, ...(reportPath ? [`Report: ${shortenScanReportPath(reportPath)}`] : []), ], diff --git a/packages/cli/src/setup-demo-tour.ts b/packages/cli/src/setup-demo-tour.ts index 557a52cb..ea7a5892 100644 --- a/packages/cli/src/setup-demo-tour.ts +++ b/packages/cli/src/setup-demo-tour.ts @@ -52,6 +52,7 @@ function createTargetState(target: KtxPublicIngestPlanTarget): ContextBuildTarge status: 'queued', detailLine: null, summaryText: null, + failureText: null, startedAt: null, elapsedMs: 0, }; diff --git a/packages/cli/src/setup-sources.test.ts b/packages/cli/src/setup-sources.test.ts index 1a281261..c74dd642 100644 --- a/packages/cli/src/setup-sources.test.ts +++ b/packages/cli/src/setup-sources.test.ts @@ -211,6 +211,66 @@ describe('setup sources step', () => { expect(runMapping).toHaveBeenCalledWith(projectDir, 'prod_metabase', io.io); }); + it('writes Notion config with the full default knowledge create budget', async () => { + await addPrimarySource(); + const validateNotion = vi.fn(async () => ({ ok: true as const, detail: 'roots=1' })); + + await expect( + runKtxSetupSourcesStep( + { + projectDir, + inputMode: 'disabled', + source: 'notion', + sourceConnectionId: 'notion-main', + sourceApiKeyRef: 'env:NOTION_TOKEN', + notionCrawlMode: 'selected_roots', + notionRootPageIds: ['page-1'], + runInitialSourceIngest: false, + skipSources: false, + }, + makeIo().io, + { validateNotion }, + ), + ).resolves.toEqual({ status: 'ready', projectDir, connectionIds: ['notion-main'] }); + + expect((await readConfig()).connections['notion-main']).toMatchObject({ + driver: 'notion', + auth_token_ref: 'env:NOTION_TOKEN', + root_page_ids: ['page-1'], + max_knowledge_creates_per_run: 25, + max_knowledge_updates_per_run: 20, + }); + }); + + it('uses selected Notion roots when root page ids are provided even if crawl mode says all accessible', async () => { + await addPrimarySource(); + const validateNotion = vi.fn(async () => ({ ok: true as const, detail: 'roots=1' })); + + await expect( + runKtxSetupSourcesStep( + { + projectDir, + inputMode: 'disabled', + source: 'notion', + sourceConnectionId: 'notion-main', + sourceApiKeyRef: 'env:NOTION_TOKEN', + notionCrawlMode: 'all_accessible', + notionRootPageIds: ['page-1'], + runInitialSourceIngest: false, + skipSources: false, + }, + makeIo().io, + { validateNotion }, + ), + ).resolves.toEqual({ status: 'ready', projectDir, connectionIds: ['notion-main'] }); + + expect((await readConfig()).connections['notion-main']).toMatchObject({ + driver: 'notion', + root_page_ids: ['page-1'], + crawl_mode: 'selected_roots', + }); + }); + it('defaults interactive Metabase and Looker source setup to the only warehouse connection', async () => { await addPrimarySource(); const cases: Array<{ diff --git a/packages/cli/src/setup-sources.ts b/packages/cli/src/setup-sources.ts index e6e7f41b..3393b838 100644 --- a/packages/cli/src/setup-sources.ts +++ b/packages/cli/src/setup-sources.ts @@ -36,6 +36,8 @@ import { writeProjectLocalSecretReference } from './setup-secrets.js'; export type KtxSetupSourceType = 'dbt' | 'metricflow' | 'metabase' | 'looker' | 'lookml' | 'notion'; +const DEFAULT_NOTION_MAX_KNOWLEDGE_CREATES_PER_RUN = 25; + export interface KtxSetupSourcesArgs { projectDir: string; inputMode: 'auto' | 'disabled'; @@ -508,8 +510,8 @@ function buildLookmlConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionC } function buildNotionConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionConfig { - const crawlMode = args.notionCrawlMode ?? 'selected_roots'; const rootPageIds = args.notionRootPageIds ?? []; + const crawlMode = rootPageIds.length > 0 ? 'selected_roots' : (args.notionCrawlMode ?? 'selected_roots'); if (crawlMode === 'selected_roots' && rootPageIds.length === 0) { throw new Error('Notion selected_roots requires --notion-root-page-id.'); } @@ -521,7 +523,7 @@ function buildNotionConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionC root_database_ids: [], root_data_source_ids: [], max_pages_per_run: 1000, - max_knowledge_creates_per_run: 5, + max_knowledge_creates_per_run: DEFAULT_NOTION_MAX_KNOWLEDGE_CREATES_PER_RUN, max_knowledge_updates_per_run: 20, last_successful_cursor: null, }; @@ -1185,10 +1187,10 @@ async function promptForInteractiveSource( }, async (currentState) => { const crawlMode = await prompts.select({ - message: 'Notion crawl mode', + message: 'Which Notion pages should KTX ingest?', options: [ - { value: 'selected_roots', label: 'Selected roots' }, - { value: 'all_accessible', label: 'All accessible pages' }, + { value: 'selected_roots', label: 'Specific pages and their subpages (you\'ll paste page IDs)' }, + { value: 'all_accessible', label: 'All pages the integration can access' }, { value: 'back', label: 'Back' }, ], }); @@ -1203,8 +1205,8 @@ async function promptForInteractiveSource( ? [ async (currentState: SourcePromptState) => { const roots = await promptText(prompts, { - message: 'Notion root page ids', - placeholder: 'comma-separated ids', + message: 'Notion page IDs to ingest (each page includes all its subpages)', + placeholder: 'page-id-1, page-id-2', }); if (roots === undefined) return 'back'; currentState.notionRootPageIds = roots diff --git a/packages/cli/src/setup.test.ts b/packages/cli/src/setup.test.ts index 58efc506..fa9a1197 100644 --- a/packages/cli/src/setup.test.ts +++ b/packages/cli/src/setup.test.ts @@ -1,6 +1,8 @@ +import { execFile } from 'node:child_process'; import { mkdir, mkdtemp, readFile, rm, stat, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; +import { promisify } from 'node:util'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { localFakeBundleReport, persistLocalBundleReport } from './ingest.test-utils.js'; @@ -12,6 +14,8 @@ vi.mock('./setup-demo-tour.js', () => ({ runDemoTour: vi.fn(async () => 0), })); +const execFileAsync = promisify(execFile); + function makeIo() { let stdout = ''; let stderr = ''; @@ -1453,6 +1457,60 @@ describe('setup status', () => { expect(calls).toEqual(['model', 'embeddings', 'databases', 'sources', 'context', 'agents']); }); + it('commits setup config changes written by later setup steps', async () => { + const io = makeIo(); + + await expect( + runKtxSetup( + { + command: 'run', + projectDir: tempDir, + mode: 'new', + agents: false, + inputMode: 'disabled', + yes: true, + cliVersion: '0.2.0', + skipLlm: true, + skipEmbeddings: true, + skipDatabases: true, + skipSources: true, + skipAgents: false, + databaseSchemas: [], + }, + io.io, + { + model: async () => ({ status: 'skipped', projectDir: tempDir }), + embeddings: async () => ({ status: 'skipped', projectDir: tempDir }), + databases: async () => { + const configPath = join(tempDir, 'ktx.yaml'); + const current = await readFile(configPath, 'utf-8'); + await writeFile( + configPath, + current.replace( + 'connections: {}', + ['connections:', ' warehouse:', ' driver: postgres', ' url: env:DATABASE_URL'].join('\n'), + ), + 'utf-8', + ); + return { status: 'skipped', projectDir: tempDir }; + }, + sources: async () => ({ status: 'skipped', projectDir: tempDir }), + context: async () => ({ status: 'ready', projectDir: tempDir, runId: 'setup-context-local-test' }), + agents: async () => ({ + status: 'ready', + projectDir: tempDir, + installs: [{ target: 'codex', scope: 'project', mode: 'cli' }], + }), + }, + ), + ).resolves.toBe(0); + + const { stdout } = await execFileAsync('git', ['-C', tempDir, 'status', '--short', '--', 'ktx.yaml']); + expect(stdout).toBe(''); + const committedConfig = await execFileAsync('git', ['-C', tempDir, 'show', 'HEAD:ktx.yaml']); + expect(committedConfig.stdout).toContain('warehouse:'); + }); + it('runs agent setup after context succeeds in --agents mode', async () => { const calls: string[] = []; const io = makeIo(); diff --git a/packages/cli/src/setup.ts b/packages/cli/src/setup.ts index a4f081d0..c2ab113e 100644 --- a/packages/cli/src/setup.ts +++ b/packages/cli/src/setup.ts @@ -433,6 +433,11 @@ function setupRuntimeInstallPolicy(args: Extract { + const project = await loadKtxProject({ projectDir }); + await project.git.commitFile('ktx.yaml', 'setup: update KTX project config', 'ktx setup', 'setup@ktx.local'); +} + export async function runKtxSetup(args: KtxSetupArgs, io: KtxCliIo, deps: KtxSetupDeps = {}): Promise { try { return await runKtxSetupInner(args, io, deps); @@ -773,6 +778,8 @@ async function runKtxSetupInner(args: KtxSetupArgs, io: KtxCliIo, deps: KtxSetup break; } + await commitSetupConfigChanges(projectResult.projectDir); + const status = await readKtxSetupStatus(projectResult.projectDir); io.stdout.write(formatKtxSetupStatus(status)); io.stdout.write('\nWhat you can do next:\n'); diff --git a/packages/cli/src/sl.test.ts b/packages/cli/src/sl.test.ts index 8752d0ec..c04ec15d 100644 --- a/packages/cli/src/sl.test.ts +++ b/packages/cli/src/sl.test.ts @@ -84,6 +84,56 @@ describe('runKtxSl', () => { expect(listIo.stdout()).toContain('warehouse\torders\tcolumns=1\tmeasures=0\tjoins=0'); }); + it('fails validation when a table-backed source declares columns absent from a matching warehouse manifest', async () => { + const projectDir = join(tempDir, 'project'); + const project = await initKtxProject({ projectDir, projectName: 'warehouse' }); + await project.fileStore.writeFile( + 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml', + `tables: + int_active_contract_arr: + table: orbit_analytics.int_active_contract_arr + columns: + - { name: contract_id, type: string } + - { name: contract_arr_cents, type: number } +`, + 'ktx', + 'ktx@example.com', + 'Add warehouse manifest', + ); + await project.fileStore.writeFile( + 'semantic-layer/dbt-main/int_active_contract_arr.yaml', + `name: int_active_contract_arr +table: orbit_analytics.int_active_contract_arr +grain: [contract_id] +columns: + - { name: contract_id, type: string } + - { name: arr_cents, type: number } +measures: + - { name: arr, expr: sum(arr_cents) } +joins: [] +`, + 'ktx', + 'ktx@example.com', + 'Add invalid dbt source', + ); + + const validateIo = makeIo(); + await expect( + runKtxSl( + { + command: 'validate', + projectDir, + connectionId: 'dbt-main', + sourceName: 'int_active_contract_arr', + }, + validateIo.io, + ), + ).resolves.toBe(1); + + expect(validateIo.stderr()).toContain('arr_cents'); + expect(validateIo.stderr()).toContain('absent from physical table'); + }); + it('runs sl query and prints SQL output', async () => { const projectDir = join(tempDir, 'project'); const project = await initKtxProject({ projectDir, projectName: 'warehouse' }); diff --git a/packages/cli/src/sl.ts b/packages/cli/src/sl.ts index a82f84d8..fb0f129e 100644 --- a/packages/cli/src/sl.ts +++ b/packages/cli/src/sl.ts @@ -97,7 +97,7 @@ export async function runKtxSl(args: KtxSlArgs, io: KtxSlIo = process, deps: Ktx if (!source) { throw new Error(`Semantic-layer source "${args.connectionId}/${args.sourceName}" was not found`); } - const result = await validateLocalSlSource(source.yaml); + const result = await validateLocalSlSource(source.yaml, { project, connectionId: args.connectionId }); if (!result.valid) { for (const error of result.errors) { io.stderr.write(`${error}\n`); diff --git a/packages/cli/src/standalone-smoke.test.ts b/packages/cli/src/standalone-smoke.test.ts index ac9d3e47..68c3f18f 100644 --- a/packages/cli/src/standalone-smoke.test.ts +++ b/packages/cli/src/standalone-smoke.test.ts @@ -226,7 +226,7 @@ describe('standalone built ktx CLI smoke', () => { expect(result.stdout).toContain('Notion:'); expect(result.stdout).toContain('Semantic-layer sources:'); expect(result.stdout).toContain('Knowledge pages:'); - expect(result.stdout).toContain('ktx serve --mcp stdio'); + expect(result.stdout).not.toContain('ktx serve --mcp stdio'); expect(result.stdout).not.toContain(['--mode', 'deterministic'].join(' ')); }); @@ -341,7 +341,7 @@ describe('standalone built ktx CLI smoke', () => { expect(inspect.stdout).toContain('ktx agent tools --json'); expect(inspect.stdout).toContain('ktx agent context --json'); expect(inspect.stdout).not.toContain('ktx ask "your question here"'); - expect(inspect.stdout).toContain('ktx serve --mcp stdio'); + expect(inspect.stdout).not.toContain('ktx serve --mcp stdio'); }); it('serves seeded demo wiki and semantic-layer context over stdio MCP', async () => { diff --git a/packages/connector-postgres/src/connector.test.ts b/packages/connector-postgres/src/connector.test.ts index 9e6e1db8..96443c90 100644 --- a/packages/connector-postgres/src/connector.test.ts +++ b/packages/connector-postgres/src/connector.test.ts @@ -358,4 +358,38 @@ describe('KtxPostgresScanConnector', () => { expect(snapshot.tables.length).toBeGreaterThan(0); expect(endCalled).toBe(true); }); + + it('attaches an error listener to the pg pool', async () => { + const on = vi.fn(); + const poolFactory: KtxPostgresPoolFactory = { + createPool() { + return { + on, + async connect() { + return { + query: vi.fn(async () => ({ rows: [{ '?column?': 1 }], fields: [{ name: '?column?', dataTypeID: 23 }] })), + release: vi.fn(), + }; + }, + end: vi.fn(async () => undefined), + }; + }, + }; + const connector = new KtxPostgresScanConnector({ + connectionId: 'warehouse', + connection: { + driver: 'postgres', + host: 'db.example.test', + database: 'analytics', + username: 'reader', + password: 'test-password', // pragma: allowlist secret + readonly: true, + }, + poolFactory, + }); + + await expect(connector.testConnection()).resolves.toEqual({ success: true }); + + expect(on).toHaveBeenCalledWith('error', expect.any(Function)); + }); }); diff --git a/packages/connector-postgres/src/connector.ts b/packages/connector-postgres/src/connector.ts index 288ef25c..f4aa2f86 100644 --- a/packages/connector-postgres/src/connector.ts +++ b/packages/connector-postgres/src/connector.ts @@ -91,6 +91,7 @@ interface KtxPostgresClient { interface KtxPostgresPool { connect(): Promise; end(): Promise; + on?(event: 'error', listener: (error: Error) => void): void; } export interface KtxPostgresPoolFactory { @@ -359,6 +360,7 @@ export class KtxPostgresScanConnector implements KtxScanConnector { private readonly now: () => Date; private readonly dialect = new KtxPostgresDialect(); private pool: KtxPostgresPool | null = null; + private lastIdlePoolError: Error | null = null; private resolvedEndpoint: KtxPostgresResolvedEndpoint | null = null; constructor(options: KtxPostgresScanConnectorOptions) { @@ -677,11 +679,15 @@ export class KtxPostgresScanConnector implements KtxScanConnector { config = { ...config, host: endpoint.host, port: endpoint.port }; } this.pool = this.poolFactory.createPool(config); + this.pool.on?.('error', (error) => { + this.lastIdlePoolError = error; + }); } return this.pool; } private async queryRaw(sql: string, params?: unknown[]): Promise { + this.throwIdlePoolErrorIfPresent(); const pool = await this.getPool(); const client = await pool.connect(); try { @@ -693,6 +699,7 @@ export class KtxPostgresScanConnector implements KtxScanConnector { } private async query(sql: string, params?: Record | unknown[]): Promise { + this.throwIdlePoolErrorIfPresent(); const pool = await this.getPool(); const client = await pool.connect(); try { @@ -714,4 +721,13 @@ export class KtxPostgresScanConnector implements KtxScanConnector { throw new Error(`PostgreSQL connector ${this.connectionId} cannot run scan for ${connectionId}`); } } + + private throwIdlePoolErrorIfPresent(): void { + if (!this.lastIdlePoolError) { + return; + } + const error = this.lastIdlePoolError; + this.lastIdlePoolError = null; + throw error; + } } diff --git a/packages/context/prompts/memory_agent_bundle_ingest_reconcile.md b/packages/context/prompts/memory_agent_bundle_ingest_reconcile.md index 33c9709d..d076d4e5 100644 --- a/packages/context/prompts/memory_agent_bundle_ingest_reconcile.md +++ b/packages/context/prompts/memory_agent_bundle_ingest_reconcile.md @@ -10,11 +10,12 @@ Parsimonious. Stage 3 WUs already loaded `ingest_triage` and handled conflicts t 1. Load `ingest_triage`, then `sl_capture` + `knowledge_capture`. 2. Call `stage_list()` for the full index of this job's writes. If it is empty AND you have no evictions, exit — the runner short-circuits this case but the skill still teaches you to bail fast. 3. If the system prompt includes ``, apply those pins before flagging a same-name or near-duplicate conflict. A pinned `canonicalArtifactKey` keeps the contested name when it is present in the Stage Index; competing variants keep or receive disambiguated names. -4. For each pair of WUs that wrote overlapping SL source names or wiki keys, call `stage_diff` to see the actual difference. If they're the same content, leave it. If they differ per `ingest_triage` rules, apply the correct resolution (rename + capture; election of canonical; silent replace for expression-only re-ingest change; or pinned canonical), then call `emit_conflict_resolution` with the artifact key and decision. -5. Call `eviction_list()` for deleted raw paths. For each eviction: if inbound refs are empty, remove the artifact (`sl_delete`, `wiki_remove`); if inbound refs exist, retain with a deprecation marker. Then call `emit_eviction_decision` for every removed or retained artifact. -6. If the Stage 4 sweep discovers a raw file whose only honest outcome is standalone SQL, wiki-only capture, or a human flag, call `emit_unmapped_fallback` with the raw path, reason, and fallback kind. -7. Use `read_raw_span` to zoom into specific raw files when you need to resolve what two contested measures actually compute. -8. Exit when you've processed every item. +4. Sweep both exact-key conflicts and near-duplicate writes. Compare WUs that wrote overlapping SL source names, overlapping wiki keys, the same `tables:` or `sl_refs:` action details, or obviously equivalent topic titles under different wiki keys. Call `stage_diff` to see the actual difference, and use `wiki_read`/`sl_read_source` when two different keys appear to describe the same table, metric, or source-of-truth mapping. If they're the same content, leave one canonical artifact and record the duplicate as subsumed. If they differ per `ingest_triage` rules, apply the correct resolution (rename + capture; election of canonical; silent replace for expression-only re-ingest change; or pinned canonical), then call `emit_conflict_resolution` with the artifact key and decision. +5. For any `wiki_write`, `wiki_remove`, `sl_write_source`, or `sl_edit_source` call you make during reconciliation, include `rawPaths` with only the raw paths that directly caused that reconciliation action. +6. Call `eviction_list()` for deleted raw paths. For each eviction: if inbound refs are empty, remove the artifact (`sl_delete`, `wiki_remove`) and include that evicted raw path in `rawPaths`; if inbound refs exist, retain with a deprecation marker and include that evicted raw path in `rawPaths`. Then call `emit_eviction_decision` for every removed or retained artifact. +7. If the Stage 4 sweep discovers a raw file whose only honest outcome is standalone SQL, wiki-only capture, or a human flag, call `emit_unmapped_fallback` with the raw path, reason, and fallback kind. +8. Use `read_raw_span` to zoom into specific raw files when you need to resolve what two contested measures or wiki pages actually describe. +9. Exit when you've processed every item. diff --git a/packages/context/prompts/memory_agent_bundle_ingest_work_unit.md b/packages/context/prompts/memory_agent_bundle_ingest_work_unit.md index b0fa8997..9645ccc4 100644 --- a/packages/context/prompts/memory_agent_bundle_ingest_work_unit.md +++ b/packages/context/prompts/memory_agent_bundle_ingest_work_unit.md @@ -11,9 +11,10 @@ Assertive. The bundle was explicitly submitted for ingest. Default to capturing 2. Load the per-source review skill first (e.g. `lookml_ingest`, `metricflow_ingest`, `dbt_ingest`), then `sl_capture` and `knowledge_capture`, and `ingest_triage` last. The triage skill tells you how to react when `wiki_sl_search` reveals that a prior WU already wrote something overlapping. 3. If the system prompt includes ``, read those pins before choosing artifact keys. A pin's `canonicalArtifactKey` is the preferred artifact for its `contestedKey`: prefer editing the pinned canonical artifact when it already exists or when this raw file clearly updates it. Do not create a duplicate contested artifact when a pin says another artifact is canonical; use a specific disambiguated key only when the raw file describes a genuinely different domain. 4. For each raw file: call `read_raw_file` (or `read_raw_span` for slicing large files) to load content. Before writing a new SL source or wiki page, call `wiki_sl_search` for each candidate name to find prior-WU writes; apply `ingest_triage` when you hit one, and apply any matching canonical pin before deciding whether to edit, rename, or skip. -5. When `priorProvenance` names an existing artifact for one of your raw files, prefer `sl_edit` over `sl_write` for that artifact: the re-ingest change rule says expression-only changes replace silently, grain/column/filter changes replace and flag. -6. When a raw file cannot map to normal SL and you use a fallback path, call `emit_unmapped_fallback` exactly once for that raw file and reason. Use `fallback: "sql_standalone"` for a standalone SQL source, `fallback: "wiki_only"` for documentation-only capture, and `fallback: "flagged"` when no reliable artifact can be written. -7. When you're done, exit the loop without further tool calls. +5. For every `wiki_write`, `wiki_remove`, `sl_write_source`, or `sl_edit_source` call, include `rawPaths` with only the raw file paths that directly support that action. If one artifact synthesizes several files, list each contributing raw file. Do not include unrelated files from the same WorkUnit. +6. When `priorProvenance` names an existing artifact for one of your raw files, prefer `sl_edit` over `sl_write` for that artifact: the re-ingest change rule says expression-only changes replace silently, grain/column/filter changes replace and flag. +7. When a raw file cannot map to normal SL and you use a fallback path, call `emit_unmapped_fallback` exactly once for that raw file and reason. Use `fallback: "sql_standalone"` for a standalone SQL source, `fallback: "wiki_only"` for documentation-only capture, and `fallback: "flagged"` when no reliable artifact can be written. +8. When you're done, exit the loop without further tool calls. @@ -23,6 +24,8 @@ All wiki writes go to the GLOBAL scope. Bundle ingests are not personal. The `wi - Do not read peer files; only files listed in `rawFiles` or `dependencyPaths` are accessible. `read_raw_file` will reject everything else. - Do not invent measures/joins/rules not declared in the raw files. +- Do not invent physical column names or grain keys. For table-backed SL sources, every `columns:`, `grain:`, `joins:`, `segments:`, and `measures[].expr` column must come from raw-file column declarations or warehouse-backed discovery (`wiki_sl_search`, `sl_discover`, `sl_describe_table`). If column names are not confirmed, capture the business context in wiki instead of writing a full SL source. +- Do not write context-source overlays into the context source connection just because that is the current WorkUnit connection. Use `sl_discover` across data sources and write the SL artifact to the warehouse/data-source connection that owns the matching manifest. If there is no confirmed target connection, use `emit_unmapped_fallback` and wiki capture. - Do not duplicate an artifact that prior provenance says you already produced; update it. - Do not silently accept a name collision with a prior WU's write when the formula differs. Trigger `ingest_triage`. diff --git a/packages/context/skills/dbt_ingest/SKILL.md b/packages/context/skills/dbt_ingest/SKILL.md index 0f7f7904..135dd2e5 100644 --- a/packages/context/skills/dbt_ingest/SKILL.md +++ b/packages/context/skills/dbt_ingest/SKILL.md @@ -19,6 +19,18 @@ Use this skill for **uploaded** dbt projects (`dbt_project.yml` at stage root, ` | `accepted_values` | Add a **brief** line in the column description: allowed values (truncate long lists) | Also mention enum-like use in `wiki_sl_search` / filters. | | `relationships` | Add or confirm `joins:` on the overlay **only** when `to` resolves to a real table via `read_raw_file` + `wiki_sl_search` / `sl_describe_table` | If the ref cannot be resolved, capture the intent in a wiki page instead. | +## Physical schema grounding + +dbt YAML is documentation and test metadata; it is not permission to invent physical columns. Before writing any table-backed SL source, confirm the real warehouse shape with `wiki_sl_search`, `sl_discover`, or `sl_describe_table` and use only confirmed column names in `columns:`, `grain:`, `joins:`, `segments:`, and `measures[].expr`. + +For dbt context-source ingest, the dbt connection is usually not the warehouse connection. Call `sl_discover` without `connectionId` first, then write overlays to the connection that owns the matching manifest-backed source (for example `postgres-warehouse`), not to the dbt connection (for example `dbt-main`). If no matching manifest-backed source is visible on any warehouse connection, do not call `sl_write_source`; record `emit_unmapped_fallback` and keep the fact wiki-only. + +If a `models:` entry has no `columns:` block, or the available raw files do not confirm the physical column names, do **not** synthesize a full standalone source. Write a wiki note or a description-only overlay for the resolved manifest table instead. If a business metric is described but its referenced column is not confirmed in the warehouse schema, omit the measure and capture the unresolved intent in the wiki. + +Include `rawPaths` on every `wiki_write`, `sl_write_source`, and `sl_edit_source` call with only the dbt YAML files that directly support the action. + +After every `sl_write_source`, call `sl_validate`. A validation error saying a declared column or measure reference is absent from the physical table is a hard stop: re-read the warehouse-backed source and rewrite with confirmed names, or remove the invalid SL fields. + ## 1.1 test hints (descriptions / meta) When YAML shows `accepted_values` or `not_null`, add **short** hints into `columns[].descriptions` (e.g. under `user`) or freeform column notes so chat and validation see intent before the next git sync refreshes `constraints` / `enum_values` in `_schema`. Keep hints under a few words when possible. @@ -30,5 +42,7 @@ If the same bundle also has MetricFlow `semantic_models:` / `metrics:`, the **`m ## Do not - Do not run `dbt` CLI or assume `target/` / `manifest.json` exists in the upload. +- Do not invent column names, grain keys, or measure expressions from dbt model names, descriptions, tests, or common naming patterns. +- Do not write `columns:`, `grain:`, or `measures:` for a dbt model unless those exact column names are confirmed by dbt YAML columns or warehouse schema discovery. - Do not invent joins from `relationships` tests if the target model/table is not found in SL or the warehouse. - Do not read `peerFileIndex` paths — use `read_raw_file` only on `rawFiles` and `dependencyPaths` from the WorkUnit. diff --git a/packages/context/skills/knowledge_capture/SKILL.md b/packages/context/skills/knowledge_capture/SKILL.md index ba6fc125..1e6a8f6c 100644 --- a/packages/context/skills/knowledge_capture/SKILL.md +++ b/packages/context/skills/knowledge_capture/SKILL.md @@ -42,10 +42,12 @@ If nothing is worth capturing, respond without calling any tool. 2. **Before writing**, search for related content so cross-references are accurate: - `wiki_search` with the topic — find related wiki pages to populate `refs`. - `sl_discover` with the concept — if the page defines a metric (revenue, churn, retention, LTV, ARR, MRR, CAC, attribution, etc.), find matching SL sources or measures to populate `sl_refs`. If no matches, pass `sl_refs: []` so future readers know you checked. -3. If updating an existing page, `wiki_read` it first. The read result begins with `[scope: ... | tags: ... | refs: ... | sl_refs: ...]` showing current frontmatter. +3. If updating an existing page, `wiki_read` it first. Use the returned `structured.content` or markdown body as the exact stored text for targeted replacements; current tags, refs, and sl_refs are returned in structured metadata. 4. `wiki_write` to create or update. Prefer merging into an existing page over creating a new one. 5. `wiki_remove` only when a page is truly obsolete — not to replace stale content (update it instead). +For bundle/external ingest, include `rawPaths` on every `wiki_write`/`wiki_remove` call with only the raw files that directly support that wiki action. This keeps ingest provenance tied to the actual source file, not every file in the WorkUnit. + ## Keys, summaries, and content - **Keys** are short kebab-case topic identifiers: `leads-source-filter`, `revenue-definition`, `churn-calculation`. No namespacing, no prefixes. diff --git a/packages/context/skills/looker_ingest/SKILL.md b/packages/context/skills/looker_ingest/SKILL.md index fec63d96..462a5910 100644 --- a/packages/context/skills/looker_ingest/SKILL.md +++ b/packages/context/skills/looker_ingest/SKILL.md @@ -76,7 +76,7 @@ When `targetTable.ok === true`, the explore has a complete KTX backing target. B 1. Use `targetTable.catalog`, `targetTable.schema`, and `targetTable.name` for `source_tables` preflight matching through `sl_discover` or `sl_read_source`. 2. Use Looker field `sql`, labels, descriptions, and type metadata to derive source columns, measures, segments, joins, and grain. -3. Call `sl_write_source` or `sl_edit_source` with `connectionId: targetWarehouseConnectionId`. +3. Call `sl_write_source` or `sl_edit_source` with `connectionId: targetWarehouseConnectionId` and `rawPaths` set to the staged explore path. 4. Set `source.name` to the deterministic API-derived source key, for example `looker__b2b__sales_pipeline`. 5. Set `source.table` to `targetTable.canonicalTable`. 6. Run `sl_validate` after every SL write. @@ -85,13 +85,13 @@ The `table` field is `targetTable.canonicalTable`, not `rawSqlTableName`. Raw Lo Use `targetTable.{catalog,schema,name}` only for source_tables preflight. Do not put those tuple fields separately into the SL source unless the SL schema already asks for them. -When `targetTable.ok === false`, keep the WU wiki-only for SL purposes. Capture durable domain semantics with `context_candidate_write`, then emit a fallback with the EXACT structured `reason` code from `targetTable.reason`. Put any human-readable context in `detail`, NOT in `reason`: +When `targetTable.ok === false`, keep the WU wiki-only for SL purposes. Capture durable domain semantics with `context_candidate_write`, then emit a fallback with the EXACT structured `reason` code from `targetTable.reason`. Put any human-readable context in `clarification`, NOT in `reason`: ```json { "rawPath": "explores/b2b/sales_pipeline.json", "reason": "no_connection_mapping", - "detail": "Looker connection b2b_sandbox_bq is not mapped to a warehouse connection", + "clarification": "Looker connection b2b_sandbox_bq is not mapped to a warehouse connection", "fallback": "wiki_only" } ``` diff --git a/packages/context/skills/lookml_ingest/SKILL.md b/packages/context/skills/lookml_ingest/SKILL.md index 9e5a8da9..18b43f3e 100644 --- a/packages/context/skills/lookml_ingest/SKILL.md +++ b/packages/context/skills/lookml_ingest/SKILL.md @@ -43,6 +43,7 @@ When SL is allowed: - **Overlay** when the view is a thin wrapper over a manifest table (`sql_table_name:` matches a manifest entry). Do not repeat base columns or grain. - **Standalone** when the view uses `derived_table:` or `sql_always_where:`. `sl_write_source` rejects overlays whose name has no manifest entry; that error points here. - **Skip** a view with only `view:`, `sql_table_name:`, and bare `dimension:` entries (no `measure:`, `description:`, `derived_table:`, `sql_always_where:`, `join:`). The pre-filter already short-circuits those. +- Include `rawPaths` on every `sl_write_source`/`sl_edit_source` call with the exact LookML raw file(s) that support the action. ## Preflight: never guess column names diff --git a/packages/context/skills/metabase_ingest/SKILL.md b/packages/context/skills/metabase_ingest/SKILL.md index 3223818a..061760bf 100644 --- a/packages/context/skills/metabase_ingest/SKILL.md +++ b/packages/context/skills/metabase_ingest/SKILL.md @@ -48,8 +48,9 @@ Use `resultMetadata` to: For each card: 1. Analyze `resolvedSql` + `resultMetadata`: identify base tables, aggregations, joins, filters, column types. -2. Check `sl_discover` and `sl_read_source` for existing sources that overlap. -3. Decide: +2. **REQUIRED before any write**: call `sl_discover` for every candidate target source name. The response tells you whether the name is manifest-backed (`Type: table` or `Type: sql`). For manifest-backed names you MUST use the overlay shape (`name:` + `measures:`/`segments:`/`description:` only — no `sql:`, `table:`, `grain:`, or `columns:`); the tool will reject a standalone write and you'll have wasted the call. If `sl_discover` returns nothing for the name, you can write a standalone source. Also call `sl_read_source` on existing sources you intend to extend so you don't duplicate measures. +3. Include `rawPaths: ["cards/.json"]` on every `sl_write_source`, `sl_edit_source`, and `wiki_write` call. If one artifact generalizes multiple near-duplicate cards, include each contributing card path and no unrelated cards. +4. Decide: - Simple aggregation on a table that already has a source → `sl_edit_source` to add a measure. - Join between tables that should be linked in the SL graph → `sl_edit_source` to add a join. - Complex derived SQL (CTEs, multi-layer aggregation, scoring models) → `sl_write_source` with `source_type: sql`. When the SQL projects/filters from a single manifest-backed base table, set `inherits_columns_from: ` so columns inherit type and description from the manifest — see `sl_capture` skill for the slim form. Use `sl_discover` to discover the manifest key from the table reference in the SQL (it accepts `MARTS.CONSIGNMENTS`, `ANALYTICS.MARTS.CONSIGNMENTS`, or `CONSIGNMENTS`). @@ -57,7 +58,7 @@ For each card: - Trivial query (`SELECT *`, simple `COUNT(*)` with no business logic) → do nothing; the runner will record this card as `action_type='skipped'`. - Duplicate of an existing measure → same as trivial; do nothing for this card. -**Manifest-only names need an overlay first.** If `sl_discover` shows a source name with `Type: table` but `sl_read_source` returns "Source not found", the source lives only in the schema manifest (no standalone overlay yet). `sl_edit_source` cannot edit manifest-only names — you must bootstrap an overlay with `sl_write_source` using the overlay shape: +**Manifest-only names need an overlay first.** If `sl_discover` shows a source name with `Type: table` but `sl_read_source` returns "Source not found", the source lives only in the schema manifest (no standalone overlay yet). `sl_edit_source` cannot edit manifest-only names, and a full standalone `sl_write_source` for that name would shadow manifest columns and joins. Bootstrap an overlay with `sl_write_source` using the overlay shape: ```yaml name: @@ -68,7 +69,9 @@ measures: Overlay shape: `name:` plus any of `measures:`, `segments:`, `description:`, `joins:`, `disable_joins:`. Never include `sql:`, `table:`, `grain:`, or `columns:` on a manifest-backed name — those would shadow the manifest's schema and drop its joins. Overlay `joins:` are merged additively with the manifest's joins (deduped by `to` + `on`); use `disable_joins: [""]` to suppress a specific manifest join. After the overlay exists, use `sl_edit_source` for further tweaks. See `sl_capture` skill for the canonical overlay rule. -**Join discovery:** When your card's SQL references warehouse tables (e.g. in `FROM` or `JOIN` clauses), call `sl_discover({ query: '' })` before writing. The matching manifest entry's `name` is the value you put in `joins: [- to: ]`. Use `many_to_one` for FK-to-dimension joins, `one_to_many` for the reverse. +**Join discovery:** When your card's SQL references warehouse tables (e.g. in `FROM` or `JOIN` clauses), call `sl_discover({ query: '
' })` before writing. The matching manifest entry's `name` is the value you use in `joins: [- to: ]` only when the card output exposes a local key that matches the target source grain (for example `account_id = mart_account_segments.account_id`). Do not declare a KTX join just because the card SQL joins that table internally. If the output only exposes display fields such as `account_name`, keep the SQL source self-contained or project the key before adding the join. Use `many_to_one` for FK-to-dimension joins, `one_to_many` for the reverse. + +**Hard rule on join columns (prevents broken joins):** For every join you declare, the local column on the left of `on:` MUST be (a) present in your source's projected output and (b) a key/ID column, never a display value. If the natural FK isn't in your SELECT, add it to SELECT before declaring the join. Joining `account_name = mart_account_segments.account_id` is always wrong — names are not identifiers and the equality produces zero matches. The validator rejects this with a "display value to identifier" error; the tool will refuse to save it. Add `account_id` to your SELECT and join on `account_id = mart_account_segments.account_id`, or omit the join entirely. ## priorProvenance @@ -162,7 +165,7 @@ After Steps A and B, your SQL must: - Reference no aliases that aren't defined inside the SQL itself. - Be valid as a standalone subquery (the validator runs `SELECT * FROM (your_sql) LIMIT 1`). -If `resolutionStatus: "fallback"` and the SQL is still complex enough that you can't confidently translate it, **skip the card** rather than writing broken SQL. Call `emit_unmapped_fallback` with the staged card path as `rawPath`, `reason: "metabase_sql_untranslated"`, and `fallback: "flagged"`. +If `resolutionStatus: "fallback"` and the SQL is still complex enough that you can't confidently translate it, **skip the card** rather than writing broken SQL. Call `emit_unmapped_fallback` with the staged card path as `rawPath`, `reason: "parse_error"`, `clarification: "metabase_sql_untranslated"`, and `fallback: "flagged"`. ## Join-graph connectivity @@ -171,8 +174,9 @@ For `source_type: table`: - Match column names ending in `_id` against existing sources' grain columns. For `source_type: sql`: -- The validator parses your SQL and **rejects the write** if any FROM/JOIN table has a manifest entry that you did not declare in `joins:`. The error names every missing join target — declare a `many_to_one` join for each and reissue. -- Tables outside the manifest (schemas not covered by this connection — e.g. `staging.*` referenced from a MARTS source) are not flagged. For those, write a single-line `wiki_write` with key `unmapped-table-` so the gap is documented, then call `emit_unmapped_fallback` with the staged card path as `rawPath`, `reason: "table_outside_manifest"`, and `fallback: "wiki_only"`. +- The validator parses your SQL and rejects the write when a referenced manifest table has a viable projected local key but no declared `joins:` entry. Add the join only after confirming the output key and target grain match. +- If `sl_discover` resolves the table, it is not outside the manifest. Do not write an `unmapped-table-*` fallback for resolved `orbit_raw`, `mart`, or other manifest-backed sources just because they appear inside card SQL. +- If `sl_discover` cannot resolve a referenced table at all, write a single-line `wiki_write` with key `unmapped-table-` and `rawPaths: ["cards/.json"]` so the gap is documented, then call `emit_unmapped_fallback` with the staged card path as `rawPath`, `reason: "missing_target_table"`, `tableRef: ""`, and `fallback: "wiki_only"`. Do not use this fallback if `sl_discover` resolved the table/source. Joins on manifest-backed names compose: the manifest's joins are inherited automatically, and any overlay `joins:` are merged on top (deduped by `to` + `on`). Use `disable_joins: [""]` in the overlay to suppress a specific manifest join. If `sl_discover` shows a manifest-backed source with `Joins: 0` and the warehouse FK metadata is genuinely absent, declaring application-level joins via the overlay is fair game — bootstrap with `sl_write_source` (overlay shape above), then refine via `sl_edit_source`. diff --git a/packages/context/skills/notion_synthesize/SKILL.md b/packages/context/skills/notion_synthesize/SKILL.md index c3fdf3cd..933acc55 100644 --- a/packages/context/skills/notion_synthesize/SKILL.md +++ b/packages/context/skills/notion_synthesize/SKILL.md @@ -18,8 +18,8 @@ Each WorkUnit is either a single Notion page/span or a topical cluster of relate 2. For each assigned page, call `read_raw_file`, or `read_raw_span` for oversized pages when the notes specify a span. 3. Search `wiki_search` for existing pages that overlap the WorkUnit topics. Prefer updating an existing page over creating a duplicate. 4. Use `context_evidence_search`, `context_evidence_read`, and `context_evidence_neighbors` to pull supporting chunks when indexed evidence is relevant. Pass `chunkId` and `documentId` values verbatim as returned by the evidence tools. -5. Write durable business knowledge with `wiki_write`. Aim for a small number of high-quality pages per WorkUnit or cluster. -6. When the Notion content defines a reusable dataset, metric, segment, join rule, source-of-truth mapping, or table with explicit columns, load `sl_capture`, discover existing sources first with `sl_discover` or `sl_read_source`, then use `sl_write_source` or `sl_edit_source`. +5. Write durable business knowledge with `wiki_write`. Aim for a small number of high-quality pages per WorkUnit or cluster. Include `rawPaths` with the exact Notion raw files that support each page. +6. When the Notion content defines a reusable dataset, metric, segment, join rule, source-of-truth mapping, or table with explicit columns, load `sl_capture`, discover existing sources first with `sl_discover` or `sl_read_source`, then use `sl_write_source` or `sl_edit_source` only for a confirmed mapped non-Notion target source. Include `rawPaths` with the exact Notion raw files that support the SL action. If no mapped target exists, call `emit_unmapped_fallback` and keep the content wiki-only. 7. For every deleted raw path in the Eviction Set, call `eviction_list`, decide retention, then `context_eviction_decision_write`. Do this even when no wiki write is needed. ## What To Capture @@ -46,6 +46,8 @@ Prefer fewer, stronger entries. Every wiki entry must cite at least one Notion p If a clustered WorkUnit includes several related pages, synthesize the shared rule or concept instead of writing one thin page per source. For oversized page spans, read only the assigned span unless the WorkUnit explicitly asks for neighboring context. +Search existing wiki pages for the same `tables:` or `sl_refs:` frontmatter and for source-of-truth aliases before creating a new page. If an existing page already documents the same warehouse object or business concept, update it instead of creating a differently named duplicate. + ## Citation Style ```md @@ -61,9 +63,14 @@ If a clustered WorkUnit includes several related pages, synthesize the shared ru - Discover existing sources first with `sl_discover`; read existing source YAML before editing. - Prefer overlays on manifest-backed sources over standalone SQL. - If Notion describes a dashboard or metric but does not define executable logic, write a wiki page and attach `sl_refs` only after confirming the referenced source exists. +- Notion `dataSourceCount` counts Notion databases/data sources only. It does not prove that a warehouse/dbt table has or lacks a mapped semantic-layer source. +- Do not create SL sources under the Notion connection just because a page mentions a warehouse, dbt, Looker, or Metabase object. Use the mapped warehouse/source connection after discovery, or emit an unmapped fallback and write wiki-only. +- Distinguish fallback reasons precisely: if a non-Notion warehouse/dbt connection exists but `sl_discover` cannot find the named table/source, use `no_physical_table`; reserve `no_connection_mapping` for cases where there is no plausible non-Notion target connection at all. +- If `sl_discover` resolves the table/source, do not call `emit_unmapped_fallback` for that table. Use the resolved source for `sl_refs`, overlay edits, or wiki-only documentation. +- When calling `emit_unmapped_fallback`, pass the table or source identifier as `tableRef` (e.g. `tableRef: "orbit_analytics.customer"`) — the tool generates the canonical detail string from the reason code and `tableRef`. Use the optional `clarification` field only to add context that does not contradict the reason. Do not restate the reason in `clarification`. ## Tools -Allowed: `read_raw_file`, `read_raw_span`, `wiki_search`, `wiki_read`, `wiki_write`, `sl_discover`, `sl_read_source`, `sl_write_source`, `sl_edit_source`, `sl_validate`, `context_evidence_search`, `context_evidence_read`, `context_evidence_neighbors`, `eviction_list`, `context_eviction_decision_write`. +Allowed: `read_raw_file`, `read_raw_span`, `wiki_search`, `wiki_read`, `wiki_write`, `sl_discover`, `sl_read_source`, `sl_write_source`, `sl_edit_source`, `sl_validate`, `context_evidence_search`, `context_evidence_read`, `context_evidence_neighbors`, `emit_unmapped_fallback`, `eviction_list`, `context_eviction_decision_write`. Not allowed: `context_candidate_write`, `context_candidate_mark`. diff --git a/packages/context/skills/sl_capture/SKILL.md b/packages/context/skills/sl_capture/SKILL.md index 1ad92174..ffb1780d 100644 --- a/packages/context/skills/sl_capture/SKILL.md +++ b/packages/context/skills/sl_capture/SKILL.md @@ -190,6 +190,7 @@ use it) without changing its SQL expression or filters. - **`sl_edit_source`** is the workhorse for additive changes: add a measure, add a join, tweak a description, replace a filter. Cheap, targeted, preserves the rest of the file. - **`sl_write_source`** is for brand-new sources or when the entire file needs restructuring. It overwrites the file completely. - Do NOT modify existing measures or their descriptions unless the current turn explicitly corrects them. +- During bundle/external ingest, include `rawPaths` on every `sl_write_source`/`sl_edit_source` call with only the raw files that directly support the SL action. ## Worked example — additive overlay diff --git a/packages/context/src/connections/index.ts b/packages/context/src/connections/index.ts index ce24a2c7..513818fa 100644 --- a/packages/context/src/connections/index.ts +++ b/packages/context/src/connections/index.ts @@ -21,6 +21,7 @@ export { notionConnectionToPullConfig, parseNotionConnectionConfig, redactNotionConnectionConfig, + resolveNotionConnectionAuthToken, resolveNotionAuthToken, type KtxNotionConnectionConfig, type RedactedKtxNotionConnectionConfig, diff --git a/packages/context/src/connections/notion-config.test.ts b/packages/context/src/connections/notion-config.test.ts index 8ad88c86..0167f186 100644 --- a/packages/context/src/connections/notion-config.test.ts +++ b/packages/context/src/connections/notion-config.test.ts @@ -30,18 +30,36 @@ describe('standalone Notion connection config', () => { expect(parsed).toEqual({ driver: 'notion', + auth_token: null, auth_token_ref: 'env:NOTION_TOKEN', crawl_mode: 'selected_roots', root_page_ids: ['page-1'], root_database_ids: [], root_data_source_ids: [], max_pages_per_run: 1000, - max_knowledge_creates_per_run: 5, + max_knowledge_creates_per_run: 25, max_knowledge_updates_per_run: 20, last_successful_cursor: null, }); }); + it('parses inline Notion auth tokens without requiring auth_token_ref', () => { + const parsed = parseNotionConnectionConfig({ + driver: 'notion', + auth_token: ' ntn_inline_token ', + crawl_mode: 'selected_roots', + root_page_ids: ['page-1'], + }); + + expect(parsed).toMatchObject({ + driver: 'notion', + auth_token: 'ntn_inline_token', + auth_token_ref: null, + crawl_mode: 'selected_roots', + root_page_ids: ['page-1'], + }); + }); + it('redacts token references from display output', () => { expect( redactNotionConnectionConfig( @@ -60,7 +78,7 @@ describe('standalone Notion connection config', () => { rootDatabaseIds: [], rootDataSourceIds: [], maxPagesPerRun: 80, - maxKnowledgeCreatesPerRun: 5, + maxKnowledgeCreatesPerRun: 25, maxKnowledgeUpdatesPerRun: 20, warning: 'Anything accessible to this Notion integration can become organization knowledge.', }); @@ -117,4 +135,23 @@ describe('standalone Notion connection config', () => { lastSuccessfulCursor: '{"phase":"all_accessible_pages","cursor":"cursor-1"}', }); }); + + it('uses inline Notion auth_token when building adapter pull config', async () => { + const pullConfig = await notionConnectionToPullConfig( + parseNotionConnectionConfig({ + driver: 'notion', + auth_token: 'ntn_inline_token', + auth_token_ref: 'env:STALE_NOTION_TOKEN', + crawl_mode: 'all_accessible', + }), + { + env: {}, + readTextFile: async () => { + throw new Error('readTextFile should not be called for inline auth_token'); + }, + }, + ); + + expect(pullConfig.authToken).toBe('ntn_inline_token'); + }); }); diff --git a/packages/context/src/connections/notion-config.ts b/packages/context/src/connections/notion-config.ts index b09cf968..d678ba2f 100644 --- a/packages/context/src/connections/notion-config.ts +++ b/packages/context/src/connections/notion-config.ts @@ -1,7 +1,11 @@ import { readFile } from 'node:fs/promises'; import { homedir } from 'node:os'; import { resolve } from 'node:path'; -import { type NotionPullConfig, notionPullConfigSchema } from '../ingest/adapters/notion/types.js'; +import { + NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN, + type NotionPullConfig, + notionPullConfigSchema, +} from '../ingest/adapters/notion/types.js'; import type { KtxProjectConnectionConfig } from '../project/config.js'; export const KTX_NOTION_ORG_KNOWLEDGE_WARNING = @@ -11,7 +15,8 @@ type KtxNotionCrawlMode = 'all_accessible' | 'selected_roots'; export interface KtxNotionConnectionConfig extends KtxProjectConnectionConfig { driver: 'notion'; - auth_token_ref: string; + auth_token: string | null; + auth_token_ref: string | null; crawl_mode: KtxNotionCrawlMode; root_page_ids: string[]; root_database_ids: string[]; @@ -89,11 +94,12 @@ export function parseNotionConnectionConfig(raw: unknown): KtxNotionConnectionCo if (input.driver !== 'notion') { throw new Error('Notion connection config requires driver: notion'); } - const authTokenRef = stringValue(input.auth_token_ref, ''); - if (!authTokenRef) { - throw new Error('Notion connection config requires auth_token_ref'); + const authToken = optionalString(input.auth_token); + const authTokenRef = optionalString(input.auth_token_ref); + if (!authToken && !authTokenRef) { + throw new Error('Notion connection config requires auth_token or auth_token_ref'); } - if (!authTokenRef.startsWith('env:') && !authTokenRef.startsWith('file:')) { + if (authTokenRef && !authTokenRef.startsWith('env:') && !authTokenRef.startsWith('file:')) { throw new Error('Notion auth_token_ref must use env:NAME or file:/path'); } @@ -111,6 +117,7 @@ export function parseNotionConnectionConfig(raw: unknown): KtxNotionConnectionCo return { ...input, driver: 'notion', + auth_token: authToken, auth_token_ref: authTokenRef, crawl_mode: crawlMode, root_page_ids: rootPageIds, @@ -119,7 +126,7 @@ export function parseNotionConnectionConfig(raw: unknown): KtxNotionConnectionCo max_pages_per_run: boundedInteger(input.max_pages_per_run, 1000, 'max_pages_per_run', 1, 10_000), max_knowledge_creates_per_run: boundedInteger( input.max_knowledge_creates_per_run, - 5, + NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN, 'max_knowledge_creates_per_run', 0, 25, @@ -138,7 +145,7 @@ export function parseNotionConnectionConfig(raw: unknown): KtxNotionConnectionCo export function redactNotionConnectionConfig(config: KtxNotionConnectionConfig): RedactedKtxNotionConnectionConfig { return { driver: 'notion', - hasAuthToken: Boolean(config.auth_token_ref), + hasAuthToken: Boolean(config.auth_token ?? config.auth_token_ref), crawlMode: config.crawl_mode, rootPageIds: config.root_page_ids, rootDatabaseIds: config.root_database_ids, @@ -178,12 +185,20 @@ export async function resolveNotionAuthToken( throw new Error('Notion auth_token_ref must use env:NAME or file:/path'); } +export async function resolveNotionConnectionAuthToken( + config: Pick, + options: ResolveNotionTokenOptions = {}, +): Promise { + return config.auth_token ?? (await resolveNotionAuthToken(config.auth_token_ref ?? '', options)); +} + export async function notionConnectionToPullConfig( config: KtxNotionConnectionConfig, options: ResolveNotionTokenOptions = {}, ): Promise { + const authToken = await resolveNotionConnectionAuthToken(config, options); return notionPullConfigSchema.parse({ - authToken: await resolveNotionAuthToken(config.auth_token_ref, options), + authToken, crawlMode: config.crawl_mode, rootPageIds: config.root_page_ids, rootDatabaseIds: config.root_database_ids, diff --git a/packages/context/src/ingest/adapters/dbt/dbt.adapter.test.ts b/packages/context/src/ingest/adapters/dbt/dbt.adapter.test.ts index dad64232..2851318e 100644 --- a/packages/context/src/ingest/adapters/dbt/dbt.adapter.test.ts +++ b/packages/context/src/ingest/adapters/dbt/dbt.adapter.test.ts @@ -48,4 +48,10 @@ describe('DbtSourceAdapter', () => { it('implements fetch() for git-backed dbt source setup', () => { expect(adapter.fetch).toBeTypeOf('function'); }); + + it('reports mapped warehouse targets for bundle SL discovery', async () => { + adapter = new DbtSourceAdapter({ targetConnectionIds: ['postgres-warehouse', 'postgres-warehouse'] }); + + await expect(adapter.listTargetConnectionIds?.(stagedDir)).resolves.toEqual(['postgres-warehouse']); + }); }); diff --git a/packages/context/src/ingest/adapters/dbt/dbt.adapter.ts b/packages/context/src/ingest/adapters/dbt/dbt.adapter.ts index 8ad71ac4..ef1c798c 100644 --- a/packages/context/src/ingest/adapters/dbt/dbt.adapter.ts +++ b/packages/context/src/ingest/adapters/dbt/dbt.adapter.ts @@ -11,6 +11,7 @@ import { parseDbtStagedDir } from './parse.js'; interface DbtSourceAdapterOptions { homeDir?: string; + targetConnectionIds?: string[]; } export class DbtSourceAdapter implements SourceAdapter { @@ -24,6 +25,10 @@ export class DbtSourceAdapter implements SourceAdapter { return detectDbtStagedDir(stagedDir); } + async listTargetConnectionIds(_stagedDir: string): Promise { + return [...new Set(this.options.targetConnectionIds ?? [])].sort((left, right) => left.localeCompare(right)); + } + async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise { const config = pullConfig as DbtPullConfig | undefined; if (!config?.repoUrl) { diff --git a/packages/context/src/ingest/adapters/historic-sql/projection.ts b/packages/context/src/ingest/adapters/historic-sql/projection.ts index ca24a67f..366b98f3 100644 --- a/packages/context/src/ingest/adapters/historic-sql/projection.ts +++ b/packages/context/src/ingest/adapters/historic-sql/projection.ts @@ -74,7 +74,7 @@ async function readJson(path: string): Promise { async function writeYamlAtomic(path: string, value: unknown): Promise { await mkdir(dirname(path), { recursive: true }); const tmp = `${path}.tmp`; - await writeFile(tmp, YAML.stringify(value, { indent: 2, lineWidth: 0 }), 'utf-8'); + await writeFile(tmp, YAML.stringify(value, { indent: 2, lineWidth: 0, version: '1.1' }), 'utf-8'); await rename(tmp, path); } @@ -270,7 +270,7 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp } } } - const after = YAML.stringify(shard, { indent: 2, lineWidth: 0 }); + const after = YAML.stringify(shard, { indent: 2, lineWidth: 0, version: '1.1' }); if (after !== before) { await writeYamlAtomic(path, shard); } diff --git a/packages/context/src/ingest/adapters/notion/chunk.ts b/packages/context/src/ingest/adapters/notion/chunk.ts index 260a28fd..7d85fb76 100644 --- a/packages/context/src/ingest/adapters/notion/chunk.ts +++ b/packages/context/src/ingest/adapters/notion/chunk.ts @@ -7,6 +7,8 @@ import { notionManifestSchema, notionMetadataSchema } from './types.js'; const MAX_NOTION_WORK_UNIT_CHARS = 40_000; export const NOTION_ORG_KNOWLEDGE_WARNING = 'Anything accessible to this Notion integration can become organization knowledge.'; +const NOTION_SL_WRITE_GUIDANCE = + 'Write wiki entries with wiki_write. Wiki keys must be flat slugs like orbit-company-overview, not orbit/company-overview. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.'; async function walk(root: string): Promise { const entries = await readdir(root, { withFileTypes: true, recursive: true }); @@ -92,7 +94,7 @@ export async function chunkNotionStagedDir(stagedDir: string, diffSet?: DiffSet) rawFiles, dependencyPaths, peerFileIndex, - notes: `Synthesize durable wiki and SL knowledge from this Notion page span only. Use read_raw_span on ${pagePath} for lines ${range.startLine}-${range.endLine}; do not call read_raw_file for oversized pages. Cite evidence chunk/page IDs.`, + notes: `Synthesize durable wiki and SL knowledge from this Notion page span only. Use read_raw_span on ${pagePath} for lines ${range.startLine}-${range.endLine}; do not call read_raw_file for oversized pages. ${NOTION_SL_WRITE_GUIDANCE} Cite evidence chunk/page IDs.`, }); } continue; @@ -105,7 +107,7 @@ export async function chunkNotionStagedDir(stagedDir: string, diffSet?: DiffSet) dependencyPaths, peerFileIndex, notes: - 'Synthesize durable wiki and SL knowledge from this Notion page. Write wiki entries with wiki_write and SL sources with sl_write_source; cite evidence chunk/page IDs.', + `Synthesize durable wiki and SL knowledge from this Notion page. ${NOTION_SL_WRITE_GUIDANCE} Cite evidence chunk/page IDs.`, }); } @@ -115,6 +117,8 @@ export async function chunkNotionStagedDir(stagedDir: string, diffSet?: DiffSet) reconcileNotes: [ `Notion maxKnowledgeCreatesPerRun=${manifest.maxKnowledgeCreatesPerRun}`, `Notion maxKnowledgeUpdatesPerRun=${manifest.maxKnowledgeUpdatesPerRun}`, + 'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.', + 'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.', ], contextReport: { capped: manifest.capped, diff --git a/packages/context/src/ingest/adapters/notion/cluster.test.ts b/packages/context/src/ingest/adapters/notion/cluster.test.ts index 886d4973..ad41b571 100644 --- a/packages/context/src/ingest/adapters/notion/cluster.test.ts +++ b/packages/context/src/ingest/adapters/notion/cluster.test.ts @@ -79,9 +79,27 @@ describe('clusterNotionWorkUnits', () => { expect(wu.unitKey).toMatch(/^notion-cluster-\d+$/); expect(wu.rawFiles.length).toBeGreaterThan(0); expect(wu.notes).toMatch(/Synthesize/); + expect(wu.notes).toContain('emit_unmapped_fallback'); + expect(wu.notes).toContain('Do not create SL sources under the Notion connection'); } }); + test('merges pages into one synthesis unit at the clustering threshold', async () => { + const pages = Array.from({ length: MIN_PAGES_TO_CLUSTER }, (_, i) => ({ + id: `p${i}`, + title: `Customer source reference ${i}`, + body: `Customer source reference maps to orbit_analytics.customer ${i}`.repeat(10), + })); + const stagedDir = await makeStaged(pages); + const wus = makeWorkUnits(pages); + const out = await clusterNotionWorkUnits({ workUnits: wus, stagedDir, embedding: mockEmbed }); + expect(out).toHaveLength(1); + expect(out[0].unitKey).toBe('notion-cluster-1'); + expect(new Set(out[0].rawFiles)).toEqual(new Set(wus.flatMap((wu) => wu.rawFiles))); + expect(out[0].notes).toContain('emit_unmapped_fallback'); + expect(out[0].notes).toContain('Do not create SL sources under the Notion connection'); + }); + test('preserves coverage: every input rawFile appears in some cluster', async () => { const pages = Array.from({ length: 12 }, (_, i) => ({ id: `p${i}`, diff --git a/packages/context/src/ingest/adapters/notion/cluster.ts b/packages/context/src/ingest/adapters/notion/cluster.ts index 0d3640dd..090ff4f7 100644 --- a/packages/context/src/ingest/adapters/notion/cluster.ts +++ b/packages/context/src/ingest/adapters/notion/cluster.ts @@ -8,6 +8,8 @@ import { notionMetadataSchema } from './types.js'; export const MIN_PAGES_TO_CLUSTER = 5; const CLUSTER_TEXT_BODY_CHARS = 1024; const CLUSTER_SEED = 42; +const NOTION_CLUSTER_SL_WRITE_GUIDANCE = + 'Write wiki entries directly with wiki_write. Wiki keys must be flat slugs like orbit-company-overview, not orbit/company-overview. Search existing wiki pages for the same tables or sl_refs before creating a new page. Only write or edit SL sources after sl_discover/sl_read_source confirms a mapped non-Notion target source; if no mapped target exists, emit_unmapped_fallback and keep the fact wiki-only. Notion dataSourceCount counts Notion databases/data sources only, not warehouse/dbt mappings. If a warehouse/dbt connection exists but the named table or source is absent, use reason no_physical_table rather than no_connection_mapping. Do not create SL sources under the Notion connection just because a page mentions a warehouse table.'; interface ClusterNotionWorkUnitsArgs { workUnits: WorkUnit[]; @@ -63,7 +65,7 @@ function mergeWorkUnits(bucket: WorkUnit[], clusterIndex: number): WorkUnit { `Synthesize durable wiki and SL knowledge from these ${bucket.length} related Notion pages. ` + 'Read each page with read_raw_file (or read_raw_span for oversized pages). ' + 'Search nearby evidence with context_evidence_search/_read/_neighbors when needed. ' + - 'Write wiki entries directly with wiki_write and SL sources directly with sl_write_source. ' + + `${NOTION_CLUSTER_SL_WRITE_GUIDANCE} ` + 'Do not call context_candidate_write.', }; } @@ -72,7 +74,7 @@ export async function clusterNotionWorkUnits(args: ClusterNotionWorkUnitsArgs): const { workUnits, stagedDir, embedding } = args; if (workUnits.length < MIN_PAGES_TO_CLUSTER) return workUnits; const k = pickK(workUnits.length); - if (k <= 1) return workUnits; + if (k <= 1) return [mergeWorkUnits(workUnits, 0)]; const texts = await Promise.all(workUnits.map((wu) => buildClusterText(wu, stagedDir))); let vectors: number[][]; try { diff --git a/packages/context/src/ingest/adapters/notion/notion.adapter.test.ts b/packages/context/src/ingest/adapters/notion/notion.adapter.test.ts index 49658526..de989d02 100644 --- a/packages/context/src/ingest/adapters/notion/notion.adapter.test.ts +++ b/packages/context/src/ingest/adapters/notion/notion.adapter.test.ts @@ -117,7 +117,7 @@ describe('NotionSourceAdapter', () => { continuedFromCursor: false, partialSnapshot: true, maxPagesPerRun: 1, - maxKnowledgeCreatesPerRun: 5, + maxKnowledgeCreatesPerRun: 25, maxKnowledgeUpdatesPerRun: 20, skipped: [], warnings: ['maxPagesPerRun reached at 1'], @@ -167,7 +167,7 @@ describe('NotionSourceAdapter', () => { continuedFromCursor: true, partialSnapshot: true, maxPagesPerRun: 100, - maxKnowledgeCreatesPerRun: 5, + maxKnowledgeCreatesPerRun: 25, maxKnowledgeUpdatesPerRun: 20, nextSuccessfulCursor: null, skipped: [], @@ -218,7 +218,7 @@ describe('NotionSourceAdapter', () => { continuedFromCursor: false, partialSnapshot: false, maxPagesPerRun: 100, - maxKnowledgeCreatesPerRun: 5, + maxKnowledgeCreatesPerRun: 25, maxKnowledgeUpdatesPerRun: 20, skipped: [], warnings: [], @@ -241,13 +241,58 @@ describe('NotionSourceAdapter', () => { dependencyPaths: ['manifest.json', 'pages/page-1/blocks.json'], }); expect(result.workUnits[0].notes).toContain('Synthesize durable wiki and SL knowledge'); + expect(result.workUnits[0].notes).toContain('emit_unmapped_fallback'); + expect(result.workUnits[0].notes).toContain('use reason no_physical_table rather than no_connection_mapping'); + expect(result.workUnits[0].notes).toContain('Do not create SL sources under the Notion connection'); + expect(result.workUnits[0].notes).toContain( + 'Wiki keys must be flat slugs like orbit-company-overview, not orbit/company-overview', + ); expect(result.reconcileNotes).toEqual([ - 'Notion maxKnowledgeCreatesPerRun=5', + 'Notion maxKnowledgeCreatesPerRun=25', 'Notion maxKnowledgeUpdatesPerRun=20', + 'Notion dataSourceCount is Notion-only; use sl_discover for warehouse/dbt mapping decisions.', + 'Reconcile Notion wiki pages sharing tables/sl_refs before creating distinct artifacts.', ]); expect(result.contextReport).toEqual({ capped: false, warnings: [NOTION_ORG_KNOWLEDGE_WARNING] }); }); + it('chunks retried pages when failed provenance makes unchanged raw files look added again', async () => { + await writeFile( + join(stagedDir, 'manifest.json'), + JSON.stringify({ + source: 'notion', + apiVersion: '2026-03-11', + crawlMode: 'selected_roots', + rootPageIds: ['page-1'], + rootDatabaseIds: [], + rootDataSourceIds: [], + fetchedAt: '2026-04-28T00:00:00.000Z', + pageCount: 1, + databaseCount: 0, + dataSourceCount: 0, + capped: false, + continuedFromCursor: false, + partialSnapshot: false, + maxPagesPerRun: 100, + maxKnowledgeCreatesPerRun: 25, + maxKnowledgeUpdatesPerRun: 20, + skipped: [], + warnings: [], + }), + 'utf-8', + ); + await writePage('page-1', 'Retry Me'); + + const result = await adapter.chunk(stagedDir, { + added: ['pages/page-1/metadata.json', 'pages/page-1/page.md'], + modified: [], + deleted: [], + unchanged: ['manifest.json', 'pages/page-1/blocks.json'], + }); + + expect(result.workUnits.map((workUnit) => workUnit.unitKey)).toEqual(['notion-page-page-1']); + }); + it('reports malformed manifests with a Notion-specific error', async () => { await writeFile(join(stagedDir, 'manifest.json'), '{bad json', 'utf-8'); diff --git a/packages/context/src/ingest/adapters/notion/types.ts b/packages/context/src/ingest/adapters/notion/types.ts index 1ac272ae..0ddf20ca 100644 --- a/packages/context/src/ingest/adapters/notion/types.ts +++ b/packages/context/src/ingest/adapters/notion/types.ts @@ -2,6 +2,7 @@ import { z } from 'zod'; export const NOTION_API_VERSION = '2026-03-11'; export const NOTION_SOURCE_KEY = 'notion'; +export const NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN = 25; export const notionPullConfigSchema = z.object({ authToken: z.string().min(1), @@ -10,7 +11,7 @@ export const notionPullConfigSchema = z.object({ rootDatabaseIds: z.array(z.string().min(1)).default([]), rootDataSourceIds: z.array(z.string().min(1)).default([]), maxPagesPerRun: z.number().int().min(1).max(10_000).default(1000), - maxKnowledgeCreatesPerRun: z.number().int().min(0).max(25).default(5), + maxKnowledgeCreatesPerRun: z.number().int().min(0).max(25).default(NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN), maxKnowledgeUpdatesPerRun: z.number().int().min(0).max(100).default(20), lastSuccessfulCursor: z.string().nullable().default(null), }); diff --git a/packages/context/src/ingest/index.ts b/packages/context/src/ingest/index.ts index 3bcd5770..d2336ae9 100644 --- a/packages/context/src/ingest/index.ts +++ b/packages/context/src/ingest/index.ts @@ -315,6 +315,7 @@ export type { MetricflowPullConfig, } from './adapters/metricflow/pull-config.js'; export { NOTION_ORG_KNOWLEDGE_WARNING } from './adapters/notion/chunk.js'; +export { NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN } from './adapters/notion/types.js'; export { NotionSourceAdapter, type NotionSourceAdapterDeps } from './adapters/notion/notion.adapter.js'; export { NotionClient, type NotionApi, type NotionBotInfo } from './adapters/notion/notion-client.js'; export { bucketDistinctUsers, bucketErrorRate, bucketExecutions, bucketP95Runtime, bucketRecency } from './adapters/historic-sql/buckets.js'; diff --git a/packages/context/src/ingest/ingest-bundle.runner.test.ts b/packages/context/src/ingest/ingest-bundle.runner.test.ts index ead6704d..a4513f63 100644 --- a/packages/context/src/ingest/ingest-bundle.runner.test.ts +++ b/packages/context/src/ingest/ingest-bundle.runner.test.ts @@ -184,7 +184,11 @@ const makeDeps = () => { .mockImplementation((connectionId: string) => Promise.resolve(connectionId === 'warehouse-2' ? ['looker__orders.yaml'] : []), ), - loadAllSources: vi.fn().mockResolvedValue([]), + loadAllSources: vi + .fn() + .mockImplementation((connectionId: string) => + Promise.resolve(connectionId === 'warehouse-2' ? [{ name: 'looker__orders' }] : []), + ), }; const slSearchService = { indexSources: vi.fn().mockResolvedValue(undefined), @@ -1261,8 +1265,8 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { ([params]: any[]) => params.telemetryTags.operationName === 'ingest-bundle-wu', ); expect(deps.adapter.listTargetConnectionIds).toHaveBeenCalledWith('/tmp/stage/upload-x'); - expect(deps.semanticLayerService.listFilesForConnection).toHaveBeenCalledWith('looker-run'); - expect(deps.semanticLayerService.listFilesForConnection).toHaveBeenCalledWith('warehouse-2'); + expect(deps.semanticLayerService.loadAllSources).toHaveBeenCalledWith('looker-run'); + expect(deps.semanticLayerService.loadAllSources).toHaveBeenCalledWith('warehouse-2'); expect(workUnitCall?.[0].userPrompt).toContain('looker__orders'); expect(deps.canonicalPins.listPins).toHaveBeenCalledWith(['looker-run', 'warehouse-2']); }); @@ -1556,6 +1560,49 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { expect(deps.knowledgeIndex.listPagesForUser).toHaveBeenCalledWith('system'); }); + it('includes manifest-backed target sources in WorkUnit prompts', async () => { + const deps = makeDeps(); + deps.adapter.listTargetConnectionIds = vi.fn().mockResolvedValue(['postgres-warehouse']); + deps.semanticLayerService.loadAllSources.mockImplementation((connectionId: string) => + Promise.resolve(connectionId === 'postgres-warehouse' ? [{ name: 'stg_accounts' }] : []), + ); + + const runner = buildRunner(deps); + (runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({ + currentHashes: new Map([['models/schema.yml', 'h1']]), + rawDirInWorktree: 'raw-sources/dbt-main/dbt/s', + }); + (runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x'); + + await runner.run({ + jobId: 'j1', + connectionId: 'dbt-main', + sourceKey: 'fake', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload-x' }, + }); + + const workUnitCall = deps.agentRunner.runLoop.mock.calls.find( + ([params]: any[]) => params.telemetryTags.operationName === 'ingest-bundle-wu', + ); + expect(workUnitCall?.[0].userPrompt).toContain('## postgres-warehouse'); + expect(workUnitCall?.[0].userPrompt).toContain('stg_accounts'); + expect(deps.canonicalPins.listPins).toHaveBeenCalledWith(['dbt-main', 'postgres-warehouse']); + }); + + it('does not resolve qualified fallback table refs by source name alone', async () => { + const deps = makeDeps(); + deps.semanticLayerService.loadAllSources.mockResolvedValue([{ name: 'orders', table: 'sales.orders' }]); + const runner = buildRunner(deps); + + await expect( + (runner as any).tableRefExistsInSemanticLayer(deps.semanticLayerService, ['warehouse'], 'finance.orders'), + ).resolves.toBe(false); + await expect( + (runner as any).tableRefExistsInSemanticLayer(deps.semanticLayerService, ['warehouse'], 'sales.orders'), + ).resolves.toBe(true); + }); + it('passes relevant canonical pins into the reconciliation system prompt', async () => { const deps = makeDeps(); deps.diffSetService.compute.mockResolvedValue({ diff --git a/packages/context/src/ingest/ingest-bundle.runner.ts b/packages/context/src/ingest/ingest-bundle.runner.ts index a226bdd0..31b444bf 100644 --- a/packages/context/src/ingest/ingest-bundle.runner.ts +++ b/packages/context/src/ingest/ingest-bundle.runner.ts @@ -5,9 +5,10 @@ import pLimit from 'p-limit'; import { z } from 'zod'; import { type KtxLogger, noopLogger } from '../core/index.js'; import type { CaptureSession, MemoryAction } from '../memory/index.js'; -import type { SlValidationDeps } from '../sl/index.js'; +import type { SemanticLayerService, SemanticLayerSource, SlValidationDeps } from '../sl/index.js'; import { createTouchedSlSources, type ToolContext, type ToolSession } from '../tools/index.js'; import { actionTargetConnectionId } from './action-identity.js'; +import { NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN } from './adapters/notion/types.js'; import { selectRelevantCanonicalPins } from './canonical-pins.js'; import { sanitizeMemoryFlowError } from './memory-flow/live-buffer.js'; import type { MemoryFlowPlannedWorkUnit } from './memory-flow/types.js'; @@ -39,6 +40,11 @@ import { createReadRawSpanTool } from './tools/read-raw-span.tool.js'; import { createStageDiffTool } from './tools/stage-diff.tool.js'; import { createStageListTool } from './tools/stage-list.tool.js'; import { type ToolCallLogEntry, wrapToolsWithLogger } from './tools/tool-call-logger.js'; +import { + createMutableToolTranscriptSummary, + recordToolTranscriptEntry, + type MutableToolTranscriptSummary, +} from './tools/tool-transcript-summary.js'; import type { EvictionUnit, IngestBundleJob, @@ -48,14 +54,6 @@ import type { WorkUnit, } from './types.js'; -interface MutableToolTranscriptSummary { - unitKey: string; - path: string; - toolCallCount: number; - errorCount: number; - toolNames: Set; -} - function workUnitToMemoryFlowPlannedWorkUnit(workUnit: WorkUnit): MemoryFlowPlannedWorkUnit { return { unitKey: workUnit.unitKey, @@ -80,21 +78,6 @@ function countMemoryFlowActions(actions: MemoryAction[], target: MemoryAction['t return actions.filter((action) => action.target === target).length; } -function isStructuredToolFailure(output: unknown): boolean { - if (!output || typeof output !== 'object') { - return false; - } - const structured = (output as { structured?: unknown }).structured; - return !!structured && typeof structured === 'object' && (structured as { success?: unknown }).success === false; -} - -function isFailedToolCall(entry: ToolCallLogEntry): boolean { - if (entry.error) { - return true; - } - return (entry.toolName === 'sl_write_source' || entry.toolName === 'wiki_write') && isStructuredToolFailure(entry.output); -} - function reportIdFromCreateResult(result: unknown): string | undefined { if (!result || typeof result !== 'object' || !('id' in result)) { return undefined; @@ -103,6 +86,46 @@ function reportIdFromCreateResult(result: unknown): string | undefined { return typeof id === 'string' && id.length > 0 ? id : undefined; } +function normalizeTableReference(value: string): string { + return value + .trim() + .replace(/["`]/g, '') + .replace(/[\[\]]/g, '') + .toLowerCase(); +} + +function finalReferenceSegment(value: string): string { + const parts = value.split('.').filter((part) => part.length > 0); + return parts.at(-1) ?? value; +} + +function semanticSourceMatchesTableRef(source: SemanticLayerSource, tableRef: string): boolean { + const normalizedRef = normalizeTableReference(tableRef); + if (!normalizedRef) { + return false; + } + + const refIsQualified = normalizedRef.includes('.'); + const normalizedSourceName = normalizeTableReference(source.name); + if (normalizedSourceName === normalizedRef) { + return true; + } + + const table = typeof source.table === 'string' ? normalizeTableReference(source.table) : ''; + if (table && (table === normalizedRef || table.endsWith(`.${normalizedRef}`))) { + return true; + } + if (!refIsQualified && table && finalReferenceSegment(table) === normalizedRef) { + return true; + } + + return false; +} + +function rawPathsForAction(action: MemoryAction, fallbackRawPaths: string[]): string[] { + return action.rawPaths && action.rawPaths.length > 0 ? [...new Set(action.rawPaths)] : fallbackRawPaths; +} + export class IngestBundleRunner { private readonly logger: KtxLogger; private readonly chainByConnection = new Map>(); @@ -276,18 +299,46 @@ export class IngestBundleRunner { const blocks = await Promise.all( connectionIds.map(async (connectionId) => { try { - const files = await this.deps.semanticLayerService.listFilesForConnection(connectionId); - const names = files.filter((f) => !f.startsWith('_schema/')).map((f) => f.replace(/\.yaml$/, '')); + const sources = await this.deps.semanticLayerService.loadAllSources(connectionId); + const names = sources.map((source) => source.name).sort((left, right) => left.localeCompare(right)); const body = names.length > 0 ? names.join('\n') : '(no sources yet)'; return `## ${connectionId}\n${body}`; } catch { - return `## ${connectionId}\n(empty)`; + try { + const files = await this.deps.semanticLayerService.listFilesForConnection(connectionId); + const names = files + .filter((f) => !f.startsWith('_schema/')) + .map((f) => f.replace(/\.yaml$/, '')) + .sort((left, right) => left.localeCompare(right)); + const body = names.length > 0 ? names.join('\n') : '(no sources yet)'; + return `## ${connectionId}\n${body}`; + } catch { + return `## ${connectionId}\n(empty)`; + } } }), ); return blocks.join('\n\n'); } + private async tableRefExistsInSemanticLayer( + semanticLayerService: SemanticLayerService, + connectionIds: string[], + tableRef: string, + ): Promise { + for (const connectionId of connectionIds) { + try { + const sources = await semanticLayerService.loadAllSources(connectionId); + if (sources.some((source) => semanticSourceMatchesTableRef(source, tableRef))) { + return true; + } + } catch { + // Fallback diagnostics should not fail an ingest stage if an index lookup is temporarily unavailable. + } + } + return false; + } + private resolveContextCuratorBudget( bundleRef: IngestBundleJob['bundleRef'], stageIndex: StageIndex, @@ -297,7 +348,9 @@ export class IngestBundleRunner { ? (bundleRef.config as Record) : {}; const configuredCreates = - typeof rawConfig.maxKnowledgeCreatesPerRun === 'number' ? rawConfig.maxKnowledgeCreatesPerRun : 5; + typeof rawConfig.maxKnowledgeCreatesPerRun === 'number' + ? rawConfig.maxKnowledgeCreatesPerRun + : NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN; const configuredUpdates = typeof rawConfig.maxKnowledgeUpdatesPerRun === 'number' ? rawConfig.maxKnowledgeUpdatesPerRun : 20; const wikiActions = stageIndex.workUnits.flatMap((wu) => wu.actions).filter((action) => action.target === 'wiki'); @@ -351,17 +404,8 @@ export class IngestBundleRunner { (path: string) => (entry: ToolCallLogEntry): void => { const current = - transcriptSummaries.get(entry.wuKey) ?? - ({ - unitKey: entry.wuKey, - path, - toolCallCount: 0, - errorCount: 0, - toolNames: new Set(), - } satisfies MutableToolTranscriptSummary); - current.toolCallCount += 1; - current.errorCount += isFailedToolCall(entry) ? 1 : 0; - current.toolNames.add(entry.toolName); + transcriptSummaries.get(entry.wuKey) ?? createMutableToolTranscriptSummary(entry.wuKey, path); + recordToolTranscriptEntry(current, entry); transcriptSummaries.set(entry.wuKey, current); }; const overrideReport = await this.loadOverrideReport(job); @@ -617,6 +661,7 @@ export class IngestBundleRunner { preHead: sessionWorktree.baseSha, touchedSlSources: session.touchedSlSources, actions: sessionActions, + allowedRawPaths: new Set(wu.rawFiles), semanticLayerService: scopedSemanticLayerService, wikiService: scopedWikiService, configService: sessionWorktree.config, @@ -681,6 +726,8 @@ export class IngestBundleRunner { emit_unmapped_fallback: createEmitUnmappedFallbackTool({ stageIndex, allowedPaths: new Set(wu.rawFiles), + tableRefExists: (tableRef) => + this.tableRefExistsInSemanticLayer(scopedSemanticLayerService, slConnectionIds, tableRef), }), }; @@ -728,7 +775,7 @@ export class IngestBundleRunner { sourceKey: job.sourceKey, connectionId: job.connectionId, jobId: job.jobId, - toolFailureCount: (unitKey) => transcriptSummaries.get(unitKey)?.errorCount ?? 0, + toolFailureCount: (unitKey) => transcriptSummaries.get(unitKey)?.fatalErrorCount ?? 0, onStepFinish: ({ stepIndex, stepBudget }) => { memoryFlow?.emit({ type: 'work_unit_step', unitKey: wu.unitKey, stepIndex, stepBudget }); }, @@ -839,6 +886,10 @@ export class IngestBundleRunner { const reconcileActions: MemoryAction[] = []; const rcScopedWiki = this.deps.wikiService.forWorktree(sessionWorktree.workdir); const rcScopedSl = this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir); + const reconciliationAllowedRawPaths = new Set([ + ...currentHashes.keys(), + ...(eviction?.deletedRawPaths ?? []), + ]); const rcToolSession: ToolSession = { connectionId: job.connectionId, @@ -846,6 +897,7 @@ export class IngestBundleRunner { preHead: reconcileSession.preHead, touchedSlSources: reconcileSession.touchedSlSources, actions: reconcileActions, + allowedRawPaths: reconciliationAllowedRawPaths, semanticLayerService: rcScopedSl, wikiService: rcScopedWiki, configService: sessionWorktree.config, @@ -910,6 +962,7 @@ export class IngestBundleRunner { emit_unmapped_fallback: createEmitUnmappedFallbackTool({ stageIndex, allowedPaths: allStagedPaths, + tableRefExists: (tableRef) => this.tableRefExistsInSemanticLayer(rcScopedSl, slConnectionIds, tableRef), }), }; @@ -1167,26 +1220,34 @@ export class IngestBundleRunner { return a.type === 'created' ? 'source_created' : 'measure_added'; }; const producedPaths = new Set(); + const pushActionProvenance = (rawPath: string, action: MemoryAction): void => { + const hash = currentHashes.get(rawPath) ?? 'unknown'; + provenanceRows.push({ + connectionId: job.connectionId, + sourceKey: job.sourceKey, + syncId, + rawPath, + rawContentHash: hash, + artifactKind: action.target, + artifactKey: action.key, + targetConnectionId: action.target === 'sl' ? actionTargetConnectionId(action, job.connectionId) : null, + artifactContentHash: null, + actionType: actionToType(action), + }); + producedPaths.add(rawPath); + }; for (const wu of stageIndex.workUnits) { - for (const rawPath of wu.rawFiles) { - const hash = currentHashes.get(rawPath) ?? 'unknown'; - for (const action of wu.actions) { - provenanceRows.push({ - connectionId: job.connectionId, - sourceKey: job.sourceKey, - syncId, - rawPath, - rawContentHash: hash, - artifactKind: action.target, - artifactKey: action.key, - targetConnectionId: action.target === 'sl' ? (action.targetConnectionId ?? null) : null, - artifactContentHash: null, - actionType: actionToType(action), - }); - producedPaths.add(rawPath); + for (const action of wu.actions) { + for (const rawPath of rawPathsForAction(action, wu.rawFiles)) { + pushActionProvenance(rawPath, action); } } } + for (const action of reconcileActions) { + for (const rawPath of action.rawPaths ?? []) { + pushActionProvenance(rawPath, action); + } + } for (const resolution of stageIndex.artifactResolutions ?? []) { const hash = currentHashes.get(resolution.rawPath) ?? 'unknown'; provenanceRows.push({ diff --git a/packages/context/src/ingest/local-adapters.test.ts b/packages/context/src/ingest/local-adapters.test.ts index 48bb2a80..f8dd7da7 100644 --- a/packages/context/src/ingest/local-adapters.test.ts +++ b/packages/context/src/ingest/local-adapters.test.ts @@ -466,6 +466,38 @@ describe('local ingest adapters', () => { }); }); + it('exposes configured primary warehouses as dbt target connections', async () => { + const dbtProject: KtxLocalProject = { + ...projectWithConnections({ + warehouse: { + driver: 'postgres', + url: 'postgresql://example/db', + }, + analytics_dbt: { + driver: 'dbt', + source_dir: '/repo/dbt', + }, + }), + config: { + ...project.config, + setup: { database_connection_ids: ['warehouse'], completed_steps: [] }, + connections: { + warehouse: { + driver: 'postgres', + url: 'postgresql://example/db', + }, + analytics_dbt: { + driver: 'dbt', + source_dir: '/repo/dbt', + }, + }, + }, + }; + const adapter = createDefaultLocalIngestAdapters(dbtProject).find((candidate) => candidate.source === 'dbt'); + + await expect(adapter?.listTargetConnectionIds?.('/tmp/staged-dbt')).resolves.toEqual(['warehouse']); + }); + it('resolves MetricFlow auth_token_ref without writing literal tokens to config', async () => { const project = projectWithConnections({ metricflow_main: { diff --git a/packages/context/src/ingest/local-adapters.ts b/packages/context/src/ingest/local-adapters.ts index dc4f50a4..59daf6d1 100644 --- a/packages/context/src/ingest/local-adapters.ts +++ b/packages/context/src/ingest/local-adapters.ts @@ -89,7 +89,10 @@ export function createDefaultLocalIngestAdapters( }), }), new LookmlSourceAdapter({ homeDir: join(project.projectDir, '.ktx/cache') }), - new DbtSourceAdapter({ homeDir: join(project.projectDir, '.ktx/cache') }), + new DbtSourceAdapter({ + homeDir: join(project.projectDir, '.ktx/cache'), + targetConnectionIds: primaryWarehouseConnectionIds(project), + }), createLocalMetabaseSourceAdapter(project, { ...(options.logger ? { logger: options.logger } : {}), }), @@ -128,6 +131,21 @@ export function createDefaultLocalIngestAdapters( return adapters; } +function primaryWarehouseConnectionIds(project: KtxLocalProject): string[] { + const configuredPrimaryIds = project.config.setup?.database_connection_ids ?? []; + const configured = configuredPrimaryIds.filter((connectionId) => + Boolean(localConnectionToWarehouseDescriptor(connectionId, project.config.connections[connectionId])), + ); + if (configured.length > 0) { + return [...new Set(configured)]; + } + + return Object.entries(project.config.connections) + .filter(([connectionId, connection]) => Boolean(localConnectionToWarehouseDescriptor(connectionId, connection))) + .map(([connectionId]) => connectionId) + .sort((left, right) => left.localeCompare(right)); +} + function isRecord(value: unknown): value is Record { return typeof value === 'object' && value !== null && !Array.isArray(value); } diff --git a/packages/context/src/ingest/local-bundle-ingest.test.ts b/packages/context/src/ingest/local-bundle-ingest.test.ts index 2fa014d0..f631e6ed 100644 --- a/packages/context/src/ingest/local-bundle-ingest.test.ts +++ b/packages/context/src/ingest/local-bundle-ingest.test.ts @@ -88,6 +88,35 @@ class WikiWritingAgentRunner extends AgentRunnerService { } } +class WikiWritingWithRawPathAgentRunner extends AgentRunnerService { + override runLoop = vi.fn(async (params: any) => { + if (params.telemetryTags?.operationName === 'ingest-bundle-wu') { + const wikiWrite = params.toolSet.wiki_write; + if (!wikiWrite?.execute) { + throw new Error('wiki_write tool was not available to the WorkUnit'); + } + const result = await wikiWrite.execute( + { + key: 'orders_context', + summary: 'Orders source context', + content: 'Orders are purchase records used for revenue analysis.', + tags: ['orders'], + rawPaths: ['orders/orders.json'], + }, + { toolCallId: 'wiki-write' }, + ); + if (!result.structured.success) { + throw new Error(result.markdown); + } + } + return { stopReason: 'natural' as const }; + }); + + constructor() { + super({ llmProvider: { getModel: () => ({}) as never } as never }); + } +} + class HistoricSqlEvidenceAgentRunner extends AgentRunnerService { override runLoop = vi.fn(async (params: any) => { if ( @@ -366,14 +395,94 @@ describe('canonical local ingest', () => { expect(result.result.failedWorkUnits).toEqual([]); const db = new Database(join(project.projectDir, '.ktx', 'db.sqlite'), { readonly: true }); try { - expect(db.prepare('SELECT key, summary FROM knowledge_pages ORDER BY key').all()).toEqual([ - { key: 'orders_context', summary: 'Orders source context' }, + expect(db.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key').all()).toEqual([ + { key: 'orders_context', summary: 'Orders source context', has_embedding: 1 }, ]); } finally { db.close(); } }); + it('does not persist noop embedding vectors when local embeddings are disabled', async () => { + await writeFile( + join(project.projectDir, 'ktx.yaml'), + [ + 'project: warehouse', + 'connections:', + ' warehouse:', + ' driver: postgres', + 'ingest:', + ' adapters:', + ' - fake', + ' embeddings:', + ' backend: none', + '', + ].join('\n'), + 'utf-8', + ); + project = await loadKtxProject({ projectDir: project.projectDir }); + const sourceDir = join(tempDir, 'source'); + await mkdir(join(sourceDir, 'orders'), { recursive: true }); + await writeFile(join(sourceDir, 'orders', 'orders.json'), '{"name":"orders"}\n', 'utf-8'); + const agentRunner = new WikiWritingAgentRunner(); + + const result = await runLocalIngest({ + project, + adapters: [new FakeSourceAdapter()], + adapter: 'fake', + connectionId: 'warehouse', + sourceDir, + jobId: 'wiki-local-no-embeddings-1', + agentRunner, + }); + + expect(result.result.failedWorkUnits).toEqual([]); + const db = new Database(join(project.projectDir, '.ktx', 'db.sqlite'), { readonly: true }); + try { + expect(db.prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key').all()).toEqual([ + { key: 'orders_context', summary: 'Orders source context', has_embedding: 0 }, + ]); + } finally { + db.close(); + } + }); + + it('uses explicit action raw paths to avoid over-attributing work-unit provenance', async () => { + const sourceDir = join(tempDir, 'source'); + await mkdir(join(sourceDir, 'orders'), { recursive: true }); + await writeFile(join(sourceDir, 'orders', 'orders.json'), '{"name":"orders"}\n', 'utf-8'); + await writeFile(join(sourceDir, 'orders', 'unrelated.json'), '{"name":"unrelated"}\n', 'utf-8'); + const agentRunner = new WikiWritingWithRawPathAgentRunner(); + + const result = await runLocalIngest({ + project, + adapters: [new FakeSourceAdapter()], + adapter: 'fake', + connectionId: 'warehouse', + sourceDir, + jobId: 'wiki-raw-path-local-1', + agentRunner, + }); + + expect(result.result.failedWorkUnits).toEqual([]); + expect(result.report.body.provenanceRows).toEqual([ + { + rawPath: 'orders/orders.json', + artifactKind: 'wiki', + artifactKey: 'orders_context', + targetConnectionId: null, + actionType: 'wiki_written', + }, + { + rawPath: 'orders/unrelated.json', + artifactKind: null, + artifactKey: null, + targetConnectionId: null, + actionType: 'skipped', + }, + ]); + }); + it('runs historic-SQL evidence projection through the local bundle post-processor', async () => { const projectDir = join(tempDir, 'historic-sql-project'); await initKtxProject({ projectDir, projectName: 'warehouse' }); diff --git a/packages/context/src/ingest/local-bundle-runtime.ts b/packages/context/src/ingest/local-bundle-runtime.ts index afcb2525..38e37b1d 100644 --- a/packages/context/src/ingest/local-bundle-runtime.ts +++ b/packages/context/src/ingest/local-bundle-runtime.ts @@ -53,6 +53,7 @@ import { type ToolSession, } from '../tools/index.js'; import { + buildKnowledgeSearchText, type KnowledgeEventPort, type KnowledgeIndexPort, KnowledgeWikiService, @@ -289,7 +290,10 @@ function scoreText(text: string, query: string): number { class LocalKnowledgeIndex implements KnowledgeIndexPort { private readonly sqlite: SqliteKnowledgeIndex; - constructor(private readonly project: KtxLocalProject) { + constructor( + private readonly project: KtxLocalProject, + private readonly embedding: KtxEmbeddingPort, + ) { this.sqlite = new SqliteKnowledgeIndex({ dbPath: ktxLocalStateDbPath(project) }); } @@ -391,6 +395,7 @@ class LocalKnowledgeIndex implements KnowledgeIndexPort { private async syncAllPagesFromDisk(): Promise { const listed = await this.project.fileStore.listFiles('knowledge', true); + const existingPages = this.sqlite.getExistingPages(); const pages: SqliteKnowledgeIndexPage[] = []; for (const file of listed.files.filter((entry) => entry.endsWith('.md'))) { const parsedPath = parseKnowledgeIndexPath(file); @@ -400,14 +405,21 @@ class LocalKnowledgeIndex implements KnowledgeIndexPort { const path = `knowledge/${file}`; const raw = await this.project.fileStore.readFile(path); const parsed = parseWiki(raw.content); + const tags = parseWikiTags(raw.content); + const searchText = buildKnowledgeSearchText(parsedPath.pageKey, parsed.summary, parsed.content, tags); + const existing = existingPages.get(path); + const embedding = + existing?.searchText === searchText && existing.embedding + ? existing.embedding + : await this.embedding.computeEmbedding(searchText).catch(() => null); pages.push({ path, key: parsedPath.pageKey, scope: parsedPath.scope, summary: parsed.summary, content: parsed.content, - tags: parseWikiTags(raw.content), - embedding: null, + tags, + embedding, }); } this.sqlite.sync(pages); @@ -417,10 +429,19 @@ class LocalKnowledgeIndex implements KnowledgeIndexPort { function parseKnowledgeIndexPath(file: string): { scope: 'GLOBAL' | 'USER'; pageKey: string } | null { const segments = file.split('/'); if (segments.length === 2 && segments[0] === 'global') { - return { scope: 'GLOBAL', pageKey: segments[1].replace(/\.md$/, '') }; + const pageKey = segments[1].replace(/\.md$/, ''); + return /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(pageKey) ? { scope: 'GLOBAL', pageKey } : null; + } + if (segments.length >= 3 && segments[0] === 'global' && segments[1] === 'historic-sql') { + const historicPath = segments.slice(2).join('/').replace(/\.md$/, ''); + if (historicPath.split('/').every((segment) => /^[a-zA-Z0-9_][a-zA-Z0-9_-]*$/.test(segment))) { + return { scope: 'GLOBAL', pageKey: `historic-sql/${historicPath}` }; + } + return null; } if (segments.length === 3 && segments[0] === 'user') { - return { scope: 'USER', pageKey: segments[2].replace(/\.md$/, '') }; + const pageKey = segments[2].replace(/\.md$/, ''); + return /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(pageKey) ? { scope: 'USER', pageKey } : null; } return null; } @@ -591,7 +612,7 @@ export function createLocalBundleIngestRuntime( ); const slSourcesRepository = new SqliteSlSourcesIndex({ dbPath }); const slSearchService = new SlSearchService(embedding, slSourcesRepository, logger); - const knowledgeIndex = new LocalKnowledgeIndex(options.project); + const knowledgeIndex = new LocalKnowledgeIndex(options.project, embedding); const knowledgeEvents = new NoopKnowledgeEventPort(); const wikiService = new KnowledgeWikiService(rootFileStore, embedding, knowledgeIndex, options.project.git, logger); const { agentRunner, llmProvider } = resolveAgentRunner(options); diff --git a/packages/context/src/ingest/local-stage-ingest.test.ts b/packages/context/src/ingest/local-stage-ingest.test.ts index 157bd96b..5f0ee501 100644 --- a/packages/context/src/ingest/local-stage-ingest.test.ts +++ b/packages/context/src/ingest/local-stage-ingest.test.ts @@ -611,7 +611,7 @@ describe('local ingest', () => { continuedFromCursor: false, partialSnapshot: false, maxPagesPerRun: 1000, - maxKnowledgeCreatesPerRun: 5, + maxKnowledgeCreatesPerRun: 25, maxKnowledgeUpdatesPerRun: 20, nextSuccessfulCursor: null, skipped: [], @@ -654,6 +654,7 @@ describe('local ingest', () => { crawlMode: 'selected_roots', rootPageIds: ['page-1'], maxPagesPerRun: 1000, + maxKnowledgeCreatesPerRun: 25, }), expect.any(String), { connectionId: 'notion-main', sourceKey: 'notion' }, diff --git a/packages/context/src/ingest/memory-flow/known-errors.ts b/packages/context/src/ingest/memory-flow/known-errors.ts new file mode 100644 index 00000000..8273ed86 --- /dev/null +++ b/packages/context/src/ingest/memory-flow/known-errors.ts @@ -0,0 +1,28 @@ +interface MemoryFlowErrorContext { + adapter: string; +} + +export function isNotionAuthorizationExpired( + context: MemoryFlowErrorContext, + reason: string | undefined, +): boolean { + if (context.adapter !== 'notion') { + return false; + } + const normalized = (reason ?? '').toLowerCase(); + return ( + normalized.includes('invalid_grant') && + (normalized.includes('invalid_rapt') || normalized.includes('reauth')) + ); +} + +export function formatNotionAuthorizationExpiredDetail(unitKey: string): string { + return `${unitKey} could not read Notion because the saved OAuth grant expired or requires reauthentication (invalid_grant / invalid_rapt).`; +} + +export function notionAuthorizationFixSuggestions(connectionId: string): string[] { + return [ + `Refresh the Notion token referenced by auth_token_ref for ${connectionId}. If it uses env:NAME, export a fresh token in that variable; if it uses file:/path, replace that file.`, + `Run ktx connection notion pick ${connectionId} to confirm Notion access, then rerun ktx ingest ${connectionId}.`, + ]; +} diff --git a/packages/context/src/ingest/memory-flow/summary.test.ts b/packages/context/src/ingest/memory-flow/summary.test.ts index 61c25aa5..0720dbae 100644 --- a/packages/context/src/ingest/memory-flow/summary.test.ts +++ b/packages/context/src/ingest/memory-flow/summary.test.ts @@ -60,6 +60,36 @@ describe('formatMemoryFlowFinalSummary', () => { ).toContain('Trust issues: 3'); }); + it('explains expired Notion authorization with fix suggestions', () => { + const rawReason = + 'notion-cluster-1 failed: {"error":"invalid_grant","error_description":"reauth related error (invalid_rapt)","error_uri":"https://accounts.example/reauth"}'; + const summary = formatMemoryFlowFinalSummary( + input({ + connectionId: 'notion-main', + adapter: 'notion', + status: 'error', + events: [ + { type: 'source_acquired', adapter: 'notion', trigger: 'manual_resync', fileCount: 37 }, + { type: 'chunks_planned', chunkCount: 2, workUnitCount: 2, evictionCount: 0 }, + { type: 'work_unit_finished', unitKey: 'notion-cluster-1', status: 'failed', reason: rawReason }, + ], + }), + ); + + expect(summary).toContain('Memory-flow summary: error'); + expect(summary).toContain( + 'Notion authorization expired: notion-cluster-1 could not read Notion because the saved OAuth grant expired or requires reauthentication (invalid_grant / invalid_rapt).', + ); + expect(summary).toContain('Fix suggestions:'); + expect(summary).toContain( + '- Refresh the Notion token referenced by auth_token_ref for notion-main. If it uses env:NAME, export a fresh token in that variable; if it uses file:/path, replace that file.', + ); + expect(summary).toContain( + '- Run ktx connection notion pick notion-main to confirm Notion access, then rerun ktx ingest notion-main.', + ); + expect(summary).not.toContain('error_uri'); + }); + it('labels replay source metadata in final summaries', () => { const summary = formatMemoryFlowFinalSummary({ metadata: { diff --git a/packages/context/src/ingest/memory-flow/summary.ts b/packages/context/src/ingest/memory-flow/summary.ts index b71526a1..a23d7146 100644 --- a/packages/context/src/ingest/memory-flow/summary.ts +++ b/packages/context/src/ingest/memory-flow/summary.ts @@ -1,6 +1,7 @@ import { sanitizeMemoryFlowError } from './live-buffer.js'; import type { MemoryFlowEvent, MemoryFlowReplayInput } from './types.js'; import { buildMemoryFlowViewModel } from './view-model.js'; +import { isNotionAuthorizationExpired, notionAuthorizationFixSuggestions } from './known-errors.js'; function latest( events: MemoryFlowEvent[], @@ -42,6 +43,14 @@ function humanizeSummaryText(value: string): string { .replace(/\bSL\b/g, 'semantic layer'); } +function fixSuggestions(input: MemoryFlowReplayInput): string[] { + const workUnitReasons = eventsOf(input.events, 'work_unit_finished').map((event) => event.reason); + const hasNotionAuthFailure = [...workUnitReasons, ...input.errors].some((reason) => + isNotionAuthorizationExpired(input, reason), + ); + return hasNotionAuthFailure ? notionAuthorizationFixSuggestions(input.connectionId) : []; +} + export function formatMemoryFlowFinalSummary(input: MemoryFlowReplayInput): string { const sources = eventsOf(input.events, 'source_acquired'); const source = sources.at(-1); @@ -84,6 +93,14 @@ export function formatMemoryFlowFinalSummary(input: MemoryFlowReplayInput): stri } } + const suggestions = fixSuggestions(input); + if (suggestions.length > 0) { + lines.push('Fix suggestions:'); + for (const suggestion of suggestions) { + lines.push(`- ${suggestion}`); + } + } + for (const error of input.errors.slice(0, 3)) { lines.push(`Error: ${sanitizeMemoryFlowError(error)}`); } diff --git a/packages/context/src/ingest/memory-flow/view-model.ts b/packages/context/src/ingest/memory-flow/view-model.ts index 7c908fbc..c8f784b8 100644 --- a/packages/context/src/ingest/memory-flow/view-model.ts +++ b/packages/context/src/ingest/memory-flow/view-model.ts @@ -9,6 +9,7 @@ import type { MemoryFlowViewModel, } from './types.js'; import { sanitizeMemoryFlowError } from './live-buffer.js'; +import { formatNotionAuthorizationExpiredDetail, isNotionAuthorizationExpired } from './known-errors.js'; function latest( events: MemoryFlowEvent[], @@ -109,7 +110,7 @@ function errorDetails(input: MemoryFlowReplayInput): string[] { } function isValidationFailure(reason: string | undefined): boolean { - return /semantic-layer|validation|invalid/i.test(reason ?? ''); + return /semantic-layer|validation/i.test(reason ?? ''); } function failedWorkUnitDetails(failed: Array>): string[] { @@ -180,11 +181,14 @@ function buildMemoryFlowTrustIssues(input: MemoryFlowReplayInput): MemoryFlowTru for (const event of failed) { const reason = sanitizeMemoryFlowError(event.reason ?? 'failed'); + const knownNotionAuthFailure = isNotionAuthorizationExpired(input, event.reason); issues.push({ id: `work-unit-failed:${event.unitKey}`, severity: 'failed', - title: 'WorkUnit failed', - detail: `${event.unitKey} failed: ${reason}`, + title: knownNotionAuthFailure ? 'Notion authorization expired' : 'WorkUnit failed', + detail: knownNotionAuthFailure + ? formatNotionAuthorizationExpiredDetail(event.unitKey) + : `${event.unitKey} failed: ${reason}`, columnId: 'workUnits', targetLabel: event.unitKey, }); diff --git a/packages/context/src/ingest/report-snapshot.ts b/packages/context/src/ingest/report-snapshot.ts index 891c6fa7..76565ad9 100644 --- a/packages/context/src/ingest/report-snapshot.ts +++ b/packages/context/src/ingest/report-snapshot.ts @@ -16,6 +16,7 @@ const ingestActionSchema = z.object({ key: z.string(), detail: z.string(), targetConnectionId: z.string().nullable().default(null), + rawPaths: z.array(z.string()).optional(), }); const touchedSlSourceSchema = z.object({ @@ -153,6 +154,7 @@ export const ingestReportSnapshotSchema = z ), failedWorkUnits: z.array(z.string()), reconciliationSkipped: z.boolean(), + reconciliationActions: z.array(ingestActionSchema).default([]), conflictsResolved: z.array(conflictResolvedSchema).default([]), evictionsApplied: z.array(evictionAppliedSchema).default([]), unmappedFallbacks: z.array(unmappedFallbackSchema).default([]), diff --git a/packages/context/src/ingest/reports.ts b/packages/context/src/ingest/reports.ts index 6f60f149..2c3020b4 100644 --- a/packages/context/src/ingest/reports.ts +++ b/packages/context/src/ingest/reports.ts @@ -55,6 +55,10 @@ export interface IngestReportBody { workUnits: IngestReportWorkUnit[]; failedWorkUnits: string[]; reconciliationSkipped: boolean; + // Actions emitted by the reconciliation stage (wiki/sl writes from + // cross-WU reconciliation). Counted alongside workUnit.actions in + // savedMemoryCountsForReport so progress reports reflect all writes. + reconciliationActions?: MemoryAction[]; conflictsResolved: ConflictResolvedRecord[]; evictionsApplied: EvictionAppliedRecord[]; unmappedFallbacks: UnmappedFallbackRecord[]; @@ -111,7 +115,9 @@ export function postProcessorSavedMemoryCounts( } export function savedMemoryCountsForReport(report: IngestReportSnapshot): IngestSavedMemoryCounts { - const actions = report.body.workUnits.flatMap((workUnit) => workUnit.actions); + const workUnitActions = report.body.workUnits.flatMap((workUnit) => workUnit.actions); + const reconciliationActions = report.body.reconciliationActions ?? []; + const actions = [...workUnitActions, ...reconciliationActions]; const directCounts = { wikiCount: actions.filter((action) => action.target === 'wiki').length, slCount: actions.filter((action) => action.target === 'sl').length, diff --git a/packages/context/src/ingest/sqlite-bundle-ingest-store.test.ts b/packages/context/src/ingest/sqlite-bundle-ingest-store.test.ts index 3c27ab2d..2798c64a 100644 --- a/packages/context/src/ingest/sqlite-bundle-ingest-store.test.ts +++ b/packages/context/src/ingest/sqlite-bundle-ingest-store.test.ts @@ -66,6 +66,27 @@ function reportBody(syncId: string, supersededBy: string | null = null): IngestR }; } +function emptyReportBody(syncId: string, overrides: Partial = {}): IngestReportBody { + return { + syncId, + diffSummary: diffSummary({ added: 0, modified: 0, deleted: 0, unchanged: 1 }), + commitSha: null, + workUnits: [], + failedWorkUnits: [], + reconciliationSkipped: true, + conflictsResolved: [], + evictionsApplied: [], + unmappedFallbacks: [], + evictionInputs: [], + unresolvedCards: [], + supersededBy: null, + overrideOf: null, + provenanceRows: [], + toolTranscripts: [], + ...overrides, + }; +} + describe('SqliteBundleIngestStore', () => { let tempDir: string; let dbPath: string; @@ -226,6 +247,204 @@ describe('SqliteBundleIngestStore', () => { ); }); + it('does not baseline skipped provenance from failed work units or zero-work retry runs', async () => { + const store = new SqliteBundleIngestStore({ dbPath }); + const rawHashes = new Map([ + ['pages/page-1/metadata.json', 'hash-metadata'], + ['pages/page-1/page.md', 'hash-page'], + ]); + + const failedRun = await store.create(runArgs({ jobId: 'job-failed-review', syncId: 'sync-failed-review' })); + await store.insertMany( + [...rawHashes].map(([rawPath, rawContentHash]) => ({ + connectionId: 'docs', + sourceKey: 'notion', + syncId: 'sync-failed-review', + rawPath, + rawContentHash, + artifactKind: null, + artifactKey: null, + artifactContentHash: null, + actionType: 'skipped' as const, + })), + ); + await store.markCompleted(failedRun.id, diffSummary({ added: 2 })); + await store.create({ + runId: failedRun.id, + jobId: 'job-failed-review', + connectionId: 'docs', + sourceKey: 'notion', + body: emptyReportBody('sync-failed-review', { + workUnits: [ + { + unitKey: 'notion-page-page-1', + rawFiles: [...rawHashes.keys()], + status: 'failed', + reason: 'invalid_grant', + actions: [], + touchedSlSources: [], + }, + ], + failedWorkUnits: ['notion-page-page-1'], + }), + }); + + const noWorkRun = await store.create(runArgs({ jobId: 'job-no-work', syncId: 'sync-no-work' })); + await store.insertMany( + [...rawHashes].map(([rawPath, rawContentHash]) => ({ + connectionId: 'docs', + sourceKey: 'notion', + syncId: 'sync-no-work', + rawPath, + rawContentHash, + artifactKind: null, + artifactKey: null, + artifactContentHash: null, + actionType: 'skipped' as const, + })), + ); + await store.markCompleted(noWorkRun.id, diffSummary({ unchanged: 2 })); + await store.create({ + runId: noWorkRun.id, + jobId: 'job-no-work', + connectionId: 'docs', + sourceKey: 'notion', + body: emptyReportBody('sync-no-work', { workUnits: [], failedWorkUnits: [] }), + }); + + await expect(store.findLatestHashesForCompletedSyncs('docs', 'notion')).resolves.toEqual(new Map()); + await expect(new DiffSetService(store).compute('docs', 'notion', rawHashes)).resolves.toEqual({ + added: ['pages/page-1/metadata.json', 'pages/page-1/page.md'], + modified: [], + deleted: [], + unchanged: [], + }); + }); + + it('baselines skipped provenance from successful no-output work unit runs', async () => { + const store = new SqliteBundleIngestStore({ dbPath }); + const run = await store.create(runArgs({ jobId: 'job-reviewed-no-output', syncId: 'sync-reviewed-no-output' })); + + await store.insertMany([ + { + connectionId: 'docs', + sourceKey: 'notion', + syncId: 'sync-reviewed-no-output', + rawPath: 'pages/page-1/page.md', + rawContentHash: 'hash-reviewed', + artifactKind: null, + artifactKey: null, + artifactContentHash: null, + actionType: 'skipped', + }, + ]); + await store.markCompleted(run.id, diffSummary({ added: 1 })); + await store.create({ + runId: run.id, + jobId: 'job-reviewed-no-output', + connectionId: 'docs', + sourceKey: 'notion', + body: emptyReportBody('sync-reviewed-no-output', { + workUnits: [ + { + unitKey: 'notion-page-page-1', + rawFiles: ['pages/page-1/page.md'], + status: 'success', + actions: [], + touchedSlSources: [], + }, + ], + failedWorkUnits: [], + }), + }); + + await expect(store.findLatestHashesForCompletedSyncs('docs', 'notion')).resolves.toEqual( + new Map([['pages/page-1/page.md', 'hash-reviewed']]), + ); + await expect( + new DiffSetService(store).compute('docs', 'notion', new Map([['pages/page-1/page.md', 'hash-reviewed']])), + ).resolves.toMatchObject({ + added: [], + unchanged: ['pages/page-1/page.md'], + }); + }); + + it('baselines artifact provenance in partial failures but not skipped-only failed paths', async () => { + const store = new SqliteBundleIngestStore({ dbPath }); + const run = await store.create(runArgs({ jobId: 'job-partial', syncId: 'sync-partial' })); + + await store.insertMany([ + { + connectionId: 'docs', + sourceKey: 'notion', + syncId: 'sync-partial', + rawPath: 'pages/success/page.md', + rawContentHash: 'hash-success', + artifactKind: 'wiki', + artifactKey: 'knowledge/notion/success.md', + artifactContentHash: 'artifact-success', + actionType: 'wiki_written', + }, + { + connectionId: 'docs', + sourceKey: 'notion', + syncId: 'sync-partial', + rawPath: 'pages/failed/page.md', + rawContentHash: 'hash-failed', + artifactKind: null, + artifactKey: null, + artifactContentHash: null, + actionType: 'skipped', + }, + ]); + await store.markCompleted(run.id, diffSummary({ added: 2 })); + await store.create({ + runId: run.id, + jobId: 'job-partial', + connectionId: 'docs', + sourceKey: 'notion', + body: emptyReportBody('sync-partial', { + workUnits: [ + { + unitKey: 'notion-page-success', + rawFiles: ['pages/success/page.md'], + status: 'success', + actions: [], + touchedSlSources: [], + }, + { + unitKey: 'notion-page-failed', + rawFiles: ['pages/failed/page.md'], + status: 'failed', + reason: 'invalid_grant', + actions: [], + touchedSlSources: [], + }, + ], + failedWorkUnits: ['notion-page-failed'], + }), + }); + + await expect(store.findLatestHashesForCompletedSyncs('docs', 'notion')).resolves.toEqual( + new Map([['pages/success/page.md', 'hash-success']]), + ); + await expect( + new DiffSetService(store).compute( + 'docs', + 'notion', + new Map([ + ['pages/success/page.md', 'hash-success'], + ['pages/failed/page.md', 'hash-failed'], + ]), + ), + ).resolves.toEqual({ + added: ['pages/failed/page.md'], + modified: [], + deleted: [], + unchanged: ['pages/success/page.md'], + }); + }); + it('returns the latest stored report across bundle ingest runs', async () => { const store = new SqliteBundleIngestStore({ dbPath, diff --git a/packages/context/src/ingest/sqlite-bundle-ingest-store.ts b/packages/context/src/ingest/sqlite-bundle-ingest-store.ts index 50c2d75d..bd9cf5c7 100644 --- a/packages/context/src/ingest/sqlite-bundle-ingest-store.ts +++ b/packages/context/src/ingest/sqlite-bundle-ingest-store.ts @@ -46,6 +46,13 @@ interface ProvenanceRow { action_type: string; } +interface ProvenanceHashCandidateRow { + raw_path: string; + raw_content_hash: string; + action_type: string; + report_body_json: string | null; +} + function parseArtifactKind(kind: string | null): IngestProvenanceRow['artifact_kind'] { if (kind === null || kind === 'sl' || kind === 'wiki') { return kind; @@ -93,6 +100,31 @@ function toPortProvenanceRow(row: ProvenanceRow): IngestProvenanceRow { }; } +function recordValue(value: unknown, key: string): unknown { + return typeof value === 'object' && value !== null && !Array.isArray(value) + ? (value as Record)[key] + : undefined; +} + +function isSuccessfulNoOutputSkippedBaseline(reportBodyJson: string | null): boolean { + if (reportBodyJson === null) { + return true; + } + const body = JSON.parse(reportBodyJson) as unknown; + const workUnits = recordValue(body, 'workUnits'); + const failedWorkUnits = recordValue(body, 'failedWorkUnits'); + return ( + Array.isArray(workUnits) && + workUnits.length > 0 && + Array.isArray(failedWorkUnits) && + failedWorkUnits.length === 0 + ); +} + +function isProcessedHashBaseline(row: ProvenanceHashCandidateRow): boolean { + return row.action_type !== 'skipped' || isSuccessfulNoOutputSkippedBaseline(row.report_body_json); +} + function placeholders(values: readonly unknown[]): string { return values.map(() => '?').join(', '); } @@ -275,23 +307,34 @@ export class SqliteBundleIngestStore const rows = this.db .prepare( ` - SELECT p.raw_path, p.raw_content_hash + SELECT + p.raw_path, + p.raw_content_hash, + p.action_type, + br.body_json AS report_body_json FROM bundle_ingest_provenance p INNER JOIN bundle_ingest_runs r ON r.connection_id = p.connection_id AND r.source_key = p.source_key AND r.sync_id = p.sync_id + LEFT JOIN bundle_ingest_reports br + ON br.run_id = r.id WHERE p.connection_id = ? AND p.source_key = ? AND r.status = 'completed' ORDER BY r.completed_at DESC, r.rowid DESC, p.created_at DESC, p.rowid DESC `, ) - .all(connectionId, sourceKey) as Array<{ raw_path: string; raw_content_hash: string }>; + .all(connectionId, sourceKey) as ProvenanceHashCandidateRow[]; const latest = new Map(); + const seen = new Set(); for (const row of rows) { - if (!latest.has(row.raw_path)) { + if (seen.has(row.raw_path)) { + continue; + } + seen.add(row.raw_path); + if (isProcessedHashBaseline(row)) { latest.set(row.raw_path, row.raw_content_hash); } } diff --git a/packages/context/src/ingest/stages/build-reconcile-context.test.ts b/packages/context/src/ingest/stages/build-reconcile-context.test.ts index fb919109..7db0bb23 100644 --- a/packages/context/src/ingest/stages/build-reconcile-context.test.ts +++ b/packages/context/src/ingest/stages/build-reconcile-context.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it, vi } from 'vitest'; -import { buildReconcileSystemPrompt, buildReconcileToolSet } from './build-reconcile-context.js'; +import { buildReconcileSystemPrompt, buildReconcileToolSet, buildReconcileUserPrompt } from './build-reconcile-context.js'; describe('buildReconcileSystemPrompt', () => { it('appends canonical pins when relevant pins are supplied', () => { @@ -39,6 +39,40 @@ describe('buildReconcileSystemPrompt', () => { }); }); +describe('buildReconcileUserPrompt', () => { + it('includes action details so reconciliation can compare different keys for the same table', () => { + const prompt = buildReconcileUserPrompt( + { + jobId: 'j1', + connectionId: 'notion', + workUnits: [ + { + unitKey: 'notion-a', + rawFiles: ['pages/a/page.md'], + status: 'success', + actions: [ + { + target: 'wiki', + type: 'created', + key: 'orbit-customer-source-reference', + detail: 'tables: orbit_analytics.customer', + }, + ], + touchedSlSources: [], + }, + ], + conflictsResolved: [], + evictionsApplied: [], + unmappedFallbacks: [], + }, + undefined, + ); + + expect(prompt).toContain('orbit-customer-source-reference'); + expect(prompt).toContain('tables: orbit_analytics.customer'); + }); +}); + describe('buildReconcileToolSet', () => { it('includes emit_unmapped_fallback with the reconciliation tools', () => { const toolSet = buildReconcileToolSet({ diff --git a/packages/context/src/ingest/stages/build-reconcile-context.ts b/packages/context/src/ingest/stages/build-reconcile-context.ts index 17867a55..30ff6341 100644 --- a/packages/context/src/ingest/stages/build-reconcile-context.ts +++ b/packages/context/src/ingest/stages/build-reconcile-context.ts @@ -104,6 +104,10 @@ function curatorPassStateSummary(runState?: ReconcilePromptRunState): string { ].join('\n'); } +function formatStageActionDetail(detail: string): string { + return detail.trim().replace(/\s+/g, ' '); +} + export function buildReconcileUserPrompt( stageIndex: StageIndex, ev: EvictionUnit | undefined, @@ -119,7 +123,14 @@ export function buildReconcileUserPrompt( const actions = wu.actions.length === 0 ? ' actions: (none)' - : wu.actions.map((a) => ` - ${a.target}:${a.type} ${a.key}`).join('\n'); + : wu.actions + .map((a) => { + const detail = formatStageActionDetail(a.detail); + return detail.length > 0 + ? ` - ${a.target}:${a.type} ${a.key}; detail: ${detail}` + : ` - ${a.target}:${a.type} ${a.key}`; + }) + .join('\n'); return `- unitKey: ${wu.unitKey} (status=${wu.status})\n${actions}`; }) .join('\n'); diff --git a/packages/context/src/ingest/tools/emit-reconciliation-records.tool.test.ts b/packages/context/src/ingest/tools/emit-reconciliation-records.tool.test.ts index 2d82fed7..a3e7b34f 100644 --- a/packages/context/src/ingest/tools/emit-reconciliation-records.tool.test.ts +++ b/packages/context/src/ingest/tools/emit-reconciliation-records.tool.test.ts @@ -148,6 +148,7 @@ describe('reconciliation emit tools', () => { { rawPath: 'metrics/conversion.yml', reason: 'no_physical_table', + detail: expect.stringContaining('not present as a source'), fallback: 'flagged', }, ]); @@ -175,6 +176,7 @@ describe('reconciliation emit tools', () => { { rawPath: 'metrics/conversion.yml', reason: 'no_physical_table', + detail: expect.stringContaining('not present as a source'), fallback: 'flagged', }, ]); @@ -199,6 +201,27 @@ describe('reconciliation emit tools', () => { expect(stageIndex.unmappedFallbacks).toEqual([]); }); + it('rejects missing-table fallback decisions when the table resolves to an existing semantic source', async () => { + const stageIndex = makeStageIndex(); + const tool = createEmitUnmappedFallbackTool({ + stageIndex, + allowedPaths: new Set(['cards/revenue.json']), + tableRefExists: async (tableRef) => tableRef === 'orbit_analytics.mart_revenue_daily', + }); + + const output = await executeTool(tool, { + rawPath: 'cards/revenue.json', + reason: 'no_physical_table', + tableRef: 'orbit_analytics.mart_revenue_daily', + fallback: 'wiki_only', + }); + + expect(output).toContain( + 'Error: tableRef "orbit_analytics.mart_revenue_daily" already resolves to a semantic source', + ); + expect(stageIndex.unmappedFallbacks).toEqual([]); + }); + it('records explicit artifact resolutions for provenance rows', async () => { const stageIndex = makeStageIndex(); const tool = createEmitArtifactResolutionTool({ diff --git a/packages/context/src/ingest/tools/emit-unmapped-fallback.tool.ts b/packages/context/src/ingest/tools/emit-unmapped-fallback.tool.ts index 78f2e6ea..aaba3509 100644 --- a/packages/context/src/ingest/tools/emit-unmapped-fallback.tool.ts +++ b/packages/context/src/ingest/tools/emit-unmapped-fallback.tool.ts @@ -1,10 +1,11 @@ import { tool } from 'ai'; import { z } from 'zod'; -import type { StageIndex, UnmappedFallbackRecord } from '../stages/stage-index.types.js'; +import type { StageIndex, UnmappedFallbackRecord, UnmappedFallbackReason } from '../stages/stage-index.types.js'; interface EmitUnmappedFallbackDeps { stageIndex: StageIndex; allowedPaths: ReadonlySet; + tableRefExists?: (tableRef: string) => Promise; } const unmappedFallbackReasonSchema = z.enum([ @@ -22,31 +23,78 @@ function sameUnmappedFallback(left: UnmappedFallbackRecord, right: UnmappedFallb return left.rawPath === right.rawPath && left.reason === right.reason && left.fallback === right.fallback; } +// Generates a canonical description for each reason so the recorded `detail` +// is always consistent with the reason code. Free-form text from the LLM +// previously caused contradictions like "no_physical_table" being explained +// as "no mapped connection exists" — the tool now owns the core sentence and +// the LLM may add optional clarification context. +function canonicalDetail(reason: UnmappedFallbackReason, tableRef: string | undefined): string { + const tableClause = tableRef ? `'${tableRef}'` : 'the referenced object'; + switch (reason) { + case 'no_physical_table': + return `${tableClause} is described but is not present as a source in any mapped warehouse/dbt connection.`; + case 'no_connection_mapping': + return `${tableClause} has no non-Notion warehouse/dbt connection to map against.`; + case 'missing_target_table': + return `${tableClause} is referenced but the target table could not be located.`; + case 'looker_template_unresolved': + return `${tableClause} uses LookML templating that could not be resolved.`; + case 'derived_table_not_supported': + return `${tableClause} is a derived/inline definition that is not yet supported as a semantic-layer source.`; + case 'multiple_table_references': + return `${tableClause} references multiple tables; cannot map to a single source.`; + case 'unsupported_dialect': + return `${tableClause} uses a SQL dialect that is not yet supported.`; + case 'parse_error': + return `${tableClause} could not be parsed.`; + } +} + +function requiresMissingTableValidation(reason: UnmappedFallbackReason): boolean { + return reason === 'no_physical_table' || reason === 'missing_target_table'; +} + export function createEmitUnmappedFallbackTool(deps: EmitUnmappedFallbackDeps) { return tool({ description: - 'Record one unmapped fallback decision for the final IngestReport. The rawPath must be available to the current ingest stage. The reason MUST be one of the structured codes; put any human-readable context in detail.', + 'Record one unmapped fallback decision for the final IngestReport. The rawPath must be available to the current ingest stage. The tool generates the canonical detail from the structured reason and optional tableRef; use clarification only to add context that does not contradict the reason code.', inputSchema: z.object({ rawPath: z.string().min(1), reason: unmappedFallbackReasonSchema, - detail: z.string().optional(), + tableRef: z + .string() + .optional() + .describe('The fully-qualified table or source reference that triggered the fallback (e.g. "orbit_analytics.customer"). Used to generate canonical detail text.'), + clarification: z + .string() + .optional() + .describe('Optional extra context appended to the canonical detail. Must not contradict the reason code.'), fallback: z.enum(['sql_standalone', 'wiki_only', 'flagged']), }), execute: async (input): Promise => { if (!deps.allowedPaths.has(input.rawPath)) { return `Error: rawPath "${input.rawPath}" is not available to this ingest stage`; } + if (input.tableRef && requiresMissingTableValidation(input.reason) && deps.tableRefExists) { + const exists = await deps.tableRefExists(input.tableRef); + if (exists) { + return `Error: tableRef "${input.tableRef}" already resolves to a semantic source; do not record ${input.reason} for an existing table.`; + } + } + + const base = canonicalDetail(input.reason, input.tableRef); + const detail = input.clarification ? `${base} ${input.clarification.trim()}`.trim() : base; const record: UnmappedFallbackRecord = { rawPath: input.rawPath, reason: input.reason, - ...(input.detail !== undefined ? { detail: input.detail } : {}), + detail, fallback: input.fallback, }; if (!deps.stageIndex.unmappedFallbacks.some((candidate) => sameUnmappedFallback(candidate, record))) { deps.stageIndex.unmappedFallbacks.push(record); } - return `recorded unmapped fallback for ${record.rawPath} (${record.fallback})`; + return `recorded unmapped fallback for ${record.rawPath} (${record.fallback}): ${detail}`; }, }); } diff --git a/packages/context/src/ingest/tools/stage-list.tool.test.ts b/packages/context/src/ingest/tools/stage-list.tool.test.ts index 05c9cac3..d14acb2a 100644 --- a/packages/context/src/ingest/tools/stage-list.tool.test.ts +++ b/packages/context/src/ingest/tools/stage-list.tool.test.ts @@ -19,7 +19,14 @@ describe('stage_list tool', () => { unitKey: 'u2', rawFiles: ['b.yml'], status: 'success', - actions: [{ target: 'wiki', type: 'created', key: 'page_b', detail: '' }], + actions: [ + { + target: 'wiki', + type: 'created', + key: 'page_b', + detail: 'tables: orbit_analytics.customer', + }, + ], touchedSlSources: [], }, ], @@ -36,6 +43,7 @@ describe('stage_list tool', () => { expect(out).toContain('src_a'); expect(out).toContain('u2'); expect(out).toContain('page_b'); + expect(out).toContain('tables: orbit_analytics.customer'); }); it('says empty when no writes', async () => { diff --git a/packages/context/src/ingest/tools/stage-list.tool.ts b/packages/context/src/ingest/tools/stage-list.tool.ts index b06df0a0..bbdffc41 100644 --- a/packages/context/src/ingest/tools/stage-list.tool.ts +++ b/packages/context/src/ingest/tools/stage-list.tool.ts @@ -6,6 +6,10 @@ export interface StageListDeps { stageIndex: StageIndex; } +function formatActionDetail(detail: string): string { + return detail.trim().replace(/\s+/g, ' '); +} + export function createStageListTool(deps: StageListDeps) { return tool({ description: @@ -20,7 +24,14 @@ export function createStageListTool(deps: StageListDeps) { const actions = wu.actions.length === 0 ? ' (no actions)' - : wu.actions.map((a) => ` - ${a.target}:${a.type} ${a.key}`).join('\n'); + : wu.actions + .map((a) => { + const detail = formatActionDetail(a.detail); + return detail.length > 0 + ? ` - ${a.target}:${a.type} ${a.key}; detail: ${detail}` + : ` - ${a.target}:${a.type} ${a.key}`; + }) + .join('\n'); return `- unitKey: ${wu.unitKey} (status=${wu.status})\n rawFiles: ${wu.rawFiles.join(', ') || '(none)'}\n actions:\n${actions}`; }) .join('\n'); diff --git a/packages/context/src/ingest/tools/tool-transcript-summary.test.ts b/packages/context/src/ingest/tools/tool-transcript-summary.test.ts new file mode 100644 index 00000000..bc836e97 --- /dev/null +++ b/packages/context/src/ingest/tools/tool-transcript-summary.test.ts @@ -0,0 +1,196 @@ +import { describe, expect, it } from 'vitest'; +import type { ToolCallLogEntry } from './tool-call-logger.js'; +import { createMutableToolTranscriptSummary, recordToolTranscriptEntry } from './tool-transcript-summary.js'; + +function entry(overrides: Partial): ToolCallLogEntry { + return { + ts: '2026-05-11T00:00:00.000Z', + wuKey: 'wu-1', + toolName: 'wiki_write', + durationMs: 1, + input: {}, + ...overrides, + }; +} + +describe('tool transcript summaries', () => { + it('keeps recovered wiki_write structured failures out of fatal failures', () => { + const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl'); + + recordToolTranscriptEntry( + summary, + entry({ + input: { key: 'orbit-customers' }, + output: { structured: { success: false, key: 'orbit-customers' } }, + }), + ); + recordToolTranscriptEntry( + summary, + entry({ + input: { key: 'orbit-customers' }, + output: { structured: { success: true, key: 'orbit-customers' } }, + }), + ); + + expect(summary.errorCount).toBe(1); + expect(summary.fatalErrorCount).toBe(0); + }); + + it('counts unrecovered wiki_remove structured failures as fatal transcript errors', () => { + const summary = createMutableToolTranscriptSummary('reconcile', '/tmp/reconcile.jsonl'); + + recordToolTranscriptEntry(summary, { + ts: '2026-05-11T00:00:00.000Z', + wuKey: 'reconcile', + toolCallId: 'remove-1', + toolName: 'wiki_remove', + durationMs: 1, + input: { key: 'duplicate-page' }, + output: { structured: { success: false, key: 'duplicate-page' } }, + }); + + expect(summary.errorCount).toBe(1); + expect(summary.fatalErrorCount).toBe(1); + }); + + it('keeps unrecovered structured write failures fatal', () => { + const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl'); + + recordToolTranscriptEntry( + summary, + entry({ + input: { key: 'orbit-customers' }, + output: { structured: { success: false, key: 'orbit-customers' } }, + }), + ); + + expect(summary.errorCount).toBe(1); + expect(summary.fatalErrorCount).toBe(1); + }); + + it('treats a later sl_edit_source success as recovery for the same SL source', () => { + const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl'); + + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'sl_write_source', + input: { connectionId: 'warehouse', sourceName: 'orbit_customers' }, + output: { structured: { success: false, sourceName: 'orbit_customers' } }, + }), + ); + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'sl_edit_source', + input: { connectionId: 'warehouse', sourceName: 'orbit_customers' }, + output: { structured: { success: true, sourceName: 'orbit_customers' } }, + }), + ); + + expect(summary.errorCount).toBe(1); + expect(summary.fatalErrorCount).toBe(0); + }); + + it('treats explicit unmapped fallback as recovery for guarded SL write failures', () => { + const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl'); + + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'sl_write_source', + input: { connectionId: 'dbt-main', sourceName: 'stg_accounts' }, + output: { structured: { success: false, sourceName: 'stg_accounts' } }, + }), + ); + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'emit_unmapped_fallback', + input: { rawPath: 'models/schema.yml', reason: 'no_physical_table', tableRef: 'stg_accounts', fallback: 'wiki_only' }, + output: 'recorded unmapped fallback for models/schema.yml (wiki_only)', + }), + ); + + expect(summary.errorCount).toBe(1); + expect(summary.fatalErrorCount).toBe(0); + }); + + it('treats an untargeted unmapped fallback as recovery when there is only one pending SL failure', () => { + const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl'); + + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'sl_write_source', + input: { connectionId: 'dbt-main', sourceName: 'stg_accounts' }, + output: { structured: { success: false, sourceName: 'stg_accounts' } }, + }), + ); + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'emit_unmapped_fallback', + input: { rawPath: 'models/schema.yml', reason: 'no_physical_table', fallback: 'wiki_only' }, + output: 'recorded unmapped fallback for models/schema.yml (wiki_only)', + }), + ); + + expect(summary.errorCount).toBe(1); + expect(summary.fatalErrorCount).toBe(0); + }); + + it('keeps unrelated SL write failures fatal when one source gets an unmapped fallback', () => { + const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl'); + + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'sl_write_source', + input: { connectionId: 'dbt-main', sourceName: 'stg_accounts' }, + output: { structured: { success: false, sourceName: 'stg_accounts' } }, + }), + ); + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'sl_write_source', + input: { connectionId: 'dbt-main', sourceName: 'stg_orders' }, + output: { structured: { success: false, sourceName: 'stg_orders' } }, + }), + ); + recordToolTranscriptEntry( + summary, + entry({ + toolName: 'emit_unmapped_fallback', + input: { rawPath: 'models/schema.yml', reason: 'no_physical_table', tableRef: 'stg_accounts', fallback: 'wiki_only' }, + output: 'recorded unmapped fallback for models/schema.yml (wiki_only)', + }), + ); + + expect(summary.errorCount).toBe(2); + expect(summary.fatalErrorCount).toBe(1); + }); + + it('keeps thrown tool errors fatal even after a successful write', () => { + const summary = createMutableToolTranscriptSummary('wu-1', '/tmp/wu-1.jsonl'); + + recordToolTranscriptEntry( + summary, + entry({ + input: { key: 'orbit-customers' }, + error: { message: 'tool crashed' }, + }), + ); + recordToolTranscriptEntry( + summary, + entry({ + input: { key: 'orbit-customers' }, + output: { structured: { success: true, key: 'orbit-customers' } }, + }), + ); + + expect(summary.errorCount).toBe(1); + expect(summary.fatalErrorCount).toBe(1); + }); +}); diff --git a/packages/context/src/ingest/tools/tool-transcript-summary.ts b/packages/context/src/ingest/tools/tool-transcript-summary.ts new file mode 100644 index 00000000..de7ee668 --- /dev/null +++ b/packages/context/src/ingest/tools/tool-transcript-summary.ts @@ -0,0 +1,185 @@ +import type { ToolCallLogEntry } from './tool-call-logger.js'; + +export interface MutableToolTranscriptSummary { + unitKey: string; + path: string; + toolCallCount: number; + errorCount: number; + fatalErrorCount: number; + toolNames: Set; + hardErrorCount: number; + recoverableFailureCounts: Map; +} + +export function createMutableToolTranscriptSummary(unitKey: string, path: string): MutableToolTranscriptSummary { + return { + unitKey, + path, + toolCallCount: 0, + errorCount: 0, + fatalErrorCount: 0, + toolNames: new Set(), + hardErrorCount: 0, + recoverableFailureCounts: new Map(), + }; +} + +export function recordToolTranscriptEntry(summary: MutableToolTranscriptSummary, entry: ToolCallLogEntry): void { + summary.toolCallCount += 1; + summary.toolNames.add(entry.toolName); + + if (entry.error) { + summary.errorCount += 1; + summary.hardErrorCount += 1; + refreshFatalErrorCount(summary); + return; + } + + const recoverableFailureKey = recoverableStructuredFailureKey(entry); + if (recoverableFailureKey) { + summary.errorCount += 1; + summary.recoverableFailureCounts.set( + recoverableFailureKey, + (summary.recoverableFailureCounts.get(recoverableFailureKey) ?? 0) + 1, + ); + refreshFatalErrorCount(summary); + return; + } + + const recoveryKey = recoverableStructuredSuccessKey(entry); + if (recoveryKey) { + summary.recoverableFailureCounts.delete(recoveryKey); + } + if (entry.toolName === 'emit_unmapped_fallback') { + const fallbackTarget = fallbackSlTargetKey(entry); + const pendingSlKeys = [...summary.recoverableFailureCounts.keys()].filter((key) => key.startsWith('sl:')); + for (const key of pendingSlKeys) { + if ( + (fallbackTarget && slFailureKeyMatchesFallback(key, fallbackTarget)) || + (!fallbackTarget && pendingSlKeys.length === 1) + ) { + summary.recoverableFailureCounts.delete(key); + } + } + } + refreshFatalErrorCount(summary); +} + +function refreshFatalErrorCount(summary: MutableToolTranscriptSummary): void { + summary.fatalErrorCount = + summary.hardErrorCount + [...summary.recoverableFailureCounts.values()].reduce((sum, count) => sum + count, 0); +} + +function recoverableStructuredFailureKey(entry: ToolCallLogEntry): string | null { + if (!isStructuredToolFailure(entry.output)) { + return null; + } + if (entry.toolName === 'wiki_write' || entry.toolName === 'wiki_remove') { + return wikiTargetKey(entry); + } + if (entry.toolName === 'sl_write_source') { + return slTargetKey(entry); + } + return null; +} + +function recoverableStructuredSuccessKey(entry: ToolCallLogEntry): string | null { + if (!isStructuredToolSuccess(entry.output)) { + return null; + } + if (entry.toolName === 'wiki_write' || entry.toolName === 'wiki_remove') { + return wikiTargetKey(entry); + } + if (entry.toolName === 'sl_write_source' || entry.toolName === 'sl_edit_source') { + return slTargetKey(entry); + } + return null; +} + +function isStructuredToolFailure(output: unknown): boolean { + return structuredSuccess(output) === false; +} + +function isStructuredToolSuccess(output: unknown): boolean { + return structuredSuccess(output) === true; +} + +function structuredSuccess(output: unknown): boolean | null { + const structured = recordField(output, 'structured'); + const success = structured?.success; + return typeof success === 'boolean' ? success : null; +} + +function wikiTargetKey(entry: ToolCallLogEntry): string | null { + const key = stringField(recordField(entry.output, 'structured'), 'key') ?? stringField(entry.input, 'key'); + return key ? `wiki:${key}` : null; +} + +function slTargetKey(entry: ToolCallLogEntry): string | null { + const structured = recordField(entry.output, 'structured'); + const sourceName = stringField(structured, 'sourceName') ?? stringField(entry.input, 'sourceName'); + if (!sourceName) { + return null; + } + const connectionId = stringField(entry.input, 'connectionId') ?? ''; + return `sl:${connectionId}:${sourceName}`; +} + +function fallbackSlTargetKey(entry: ToolCallLogEntry): { connectionId?: string; sourceName: string } | null { + const tableRef = stringField(entry.input, 'tableRef'); + if (!tableRef) { + return null; + } + const sourceName = finalReferenceSegment(tableRef); + if (!sourceName) { + return null; + } + const connectionId = stringField(entry.input, 'connectionId'); + return { + sourceName, + ...(connectionId ? { connectionId } : {}), + }; +} + +function slFailureKeyMatchesFallback( + failureKey: string, + fallback: { connectionId?: string; sourceName: string }, +): boolean { + const match = /^sl:([^:]*):(.*)$/.exec(failureKey); + if (!match) { + return false; + } + const [, connectionId, sourceName] = match; + if (fallback.connectionId && connectionId !== fallback.connectionId) { + return false; + } + return normalizeReferenceSegment(sourceName ?? '') === normalizeReferenceSegment(fallback.sourceName); +} + +function finalReferenceSegment(value: string): string { + const normalized = value + .trim() + .replace(/["`]/g, '') + .replace(/[\[\]]/g, ''); + return normalized.split('.').filter(Boolean).at(-1) ?? ''; +} + +function normalizeReferenceSegment(value: string): string { + return finalReferenceSegment(value).toLowerCase(); +} + +function recordField(value: unknown, field: string): Record | null { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return null; + } + const nested = (value as Record)[field]; + return nested && typeof nested === 'object' && !Array.isArray(nested) ? (nested as Record) : null; +} + +function stringField(value: unknown, field: string): string | null { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return null; + } + const raw = (value as Record)[field]; + return typeof raw === 'string' && raw.length > 0 ? raw : null; +} diff --git a/packages/context/src/mcp/local-project-ports.test.ts b/packages/context/src/mcp/local-project-ports.test.ts index 8692ab7d..85a3c2c7 100644 --- a/packages/context/src/mcp/local-project-ports.test.ts +++ b/packages/context/src/mcp/local-project-ports.test.ts @@ -617,7 +617,7 @@ describe('createLocalProjectMcpContextPorts', () => { userId: 'local-user', key: '../outside', }), - ).rejects.toThrow('Unsafe knowledge key'); + ).rejects.toThrow('Invalid wiki key "../outside". Wiki keys must be flat; use "outside".'); await expect( ports.semanticLayer?.readSource({ diff --git a/packages/context/src/mcp/local-project-ports.ts b/packages/context/src/mcp/local-project-ports.ts index 41d1f916..0c325453 100644 --- a/packages/context/src/mcp/local-project-ports.ts +++ b/packages/context/src/mcp/local-project-ports.ts @@ -512,7 +512,7 @@ export function createLocalProjectMcpContextPorts( } const yaml = - input.yaml ?? YAML.stringify({ ...input.source, name: input.sourceName }, { indent: 2, lineWidth: 0 }); + input.yaml ?? YAML.stringify({ ...input.source, name: input.sourceName }, { indent: 2, lineWidth: 0, version: '1.1' }); parseYamlRecord(yaml); await project.fileStore.writeFile( path, diff --git a/packages/context/src/memory/memory-runtime-assets.test.ts b/packages/context/src/memory/memory-runtime-assets.test.ts index 36d4dc7c..ef056d18 100644 --- a/packages/context/src/memory/memory-runtime-assets.test.ts +++ b/packages/context/src/memory/memory-runtime-assets.test.ts @@ -90,6 +90,25 @@ describe('memory runtime assets', () => { expect(body).not.toContain('a standalone SL source only when raw evidence contains enough table or SQL structure'); }); + it('ships Metabase guidance that avoids invalid joins for SQL-only card outputs', async () => { + const body = await readFile(join(skillsDir, 'metabase_ingest', 'SKILL.md'), 'utf-8'); + + expect(body).toContain('Do not declare a KTX join just because the card SQL joins that table internally'); + expect(body).toContain('only when the card output exposes a local key that matches the target source grain'); + expect(body).toContain('If `sl_discover` resolves the table, it is not outside the manifest'); + expect(body).toContain('reason: "parse_error"'); + expect(body).not.toContain('Tables outside the manifest'); + expect(body).not.toContain('reason: "metabase_sql_untranslated"'); + }); + + it('ships Notion guidance for physical-table fallbacks and duplicate wiki reconciliation', async () => { + const body = await readFile(join(skillsDir, 'notion_synthesize', 'SKILL.md'), 'utf-8'); + + expect(body).toContain('Notion `dataSourceCount` counts Notion databases/data sources only'); + expect(body).toContain('Search existing wiki pages for the same `tables:` or `sl_refs:` frontmatter'); + expect(body).toContain('no_physical_table'); + }); + it('packages LookML connection-mismatch SL gate guidance', async () => { const body = await readFile(join(skillsDir, 'lookml_ingest', 'SKILL.md'), 'utf-8'); diff --git a/packages/context/src/memory/types.ts b/packages/context/src/memory/types.ts index e1e03b41..aa50cd8c 100644 --- a/packages/context/src/memory/types.ts +++ b/packages/context/src/memory/types.ts @@ -34,6 +34,7 @@ export interface MemoryAction { key: string; detail: string; targetConnectionId?: string | null; + rawPaths?: string[]; } export interface MemoryAgentResult { diff --git a/packages/context/src/scan/local-enrichment-artifacts.ts b/packages/context/src/scan/local-enrichment-artifacts.ts index 78f5e36d..ed698806 100644 --- a/packages/context/src/scan/local-enrichment-artifacts.ts +++ b/packages/context/src/scan/local-enrichment-artifacts.ts @@ -299,7 +299,7 @@ export async function writeLocalScanManifestShards( const path = `${schemaDir(input.connectionId)}/${shardKey}.yaml`; await input.project.fileStore.writeFile( path, - YAML.stringify(shard, { indent: 2, lineWidth: 0 }), + YAML.stringify(shard, { indent: 2, lineWidth: 0, version: '1.1' }), LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, `scan(${LIVE_DATABASE_ADAPTER}): write manifest shard ${shardKey} syncId=${input.syncId}`, diff --git a/packages/context/src/search/backend-conformance.test.ts b/packages/context/src/search/backend-conformance.test.ts index e033082d..d2d8e3bf 100644 --- a/packages/context/src/search/backend-conformance.test.ts +++ b/packages/context/src/search/backend-conformance.test.ts @@ -146,7 +146,7 @@ async function seedSemanticLayerProject(project: KtxLocalProject): Promise async function seedWikiProject(project: KtxLocalProject): Promise { await writeLocalKnowledgePage(project, { - key: 'metrics/revenue', + key: 'metrics-revenue', scope: 'GLOBAL', summary: 'Semantic revenue definition', content: 'Revenue is recognized when an order is paid.', @@ -155,7 +155,7 @@ async function seedWikiProject(project: KtxLocalProject): Promise { slRefs: ['orders'], }); await writeLocalKnowledgePage(project, { - key: 'support/escalations', + key: 'support-escalations', scope: 'GLOBAL', summary: 'Support escalation process', content: 'Escalations move urgent support tickets to the operations queue.', @@ -338,9 +338,9 @@ describe('SQLite hybrid search backend conformance', () => { surface: 'wiki', caseName: 'lexical page ranking', results: lexical.map(toWikiConformanceResult), - expectedTopIds: ['metrics/revenue'], + expectedTopIds: ['metrics-revenue'], expectedReasonsById: { - 'metrics/revenue': ['lexical'], + 'metrics-revenue': ['lexical'], }, expectedLanes: { lexical: { status: 'available' }, @@ -359,9 +359,9 @@ describe('SQLite hybrid search backend conformance', () => { surface: 'wiki', caseName: 'semantic page ranking', results: semantic.map(toWikiConformanceResult), - expectedTopIds: ['metrics/revenue'], + expectedTopIds: ['metrics-revenue'], expectedReasonsById: { - 'metrics/revenue': ['semantic'], + 'metrics-revenue': ['semantic'], }, expectedLanes: { semantic: { status: 'available' }, @@ -378,9 +378,9 @@ describe('SQLite hybrid search backend conformance', () => { surface: 'wiki', caseName: 'token page fallback', results: token.map(toWikiConformanceResult), - expectedTopIds: ['metrics/revenue'], + expectedTopIds: ['metrics-revenue'], expectedReasonsById: { - 'metrics/revenue': ['token'], + 'metrics-revenue': ['token'], }, expectedLanes: { token: { status: 'available' }, diff --git a/packages/context/src/sl/local-sl.test.ts b/packages/context/src/sl/local-sl.test.ts index 3cdfaefe..aa48546b 100644 --- a/packages/context/src/sl/local-sl.test.ts +++ b/packages/context/src/sl/local-sl.test.ts @@ -89,6 +89,39 @@ describe('local semantic-layer helpers', () => { await expect(validateLocalSlSource(ORDERS_YAML)).resolves.toEqual({ valid: true, errors: [] }); }); + it('validates table-backed sources against matching physical manifests when project context is provided', async () => { + await project.fileStore.writeFile( + 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml', + `tables: + int_active_contract_arr: + table: orbit_analytics.int_active_contract_arr + columns: + - { name: contract_id, type: string } + - { name: contract_arr_cents, type: number } +`, + 'ktx', + 'ktx@example.com', + 'Add warehouse manifest', + ); + + const invalidDbtSource = [ + 'name: int_active_contract_arr', + 'table: orbit_analytics.int_active_contract_arr', + 'grain: [contract_id]', + 'columns:', + ' - { name: contract_id, type: string }', + ' - { name: arr_cents, type: number }', + 'measures:', + ' - { name: arr, expr: sum(arr_cents) }', + '', + ].join('\n'); + + const result = await validateLocalSlSource(invalidDbtSource, { project, connectionId: 'dbt-main' }); + expect(result.valid).toBe(false); + expect(result.errors.join('\n')).toContain('arr_cents'); + expect(result.errors.join('\n')).toContain('absent from physical table'); + }); + it('lists and reads manifest-backed scan sources as queryable sources', async () => { await project.fileStore.writeFile( 'semantic-layer/warehouse/_schema/public.yaml', @@ -345,7 +378,7 @@ describe('local semantic-layer helpers', () => { await expect(validateLocalSlSource(invalidYaml)).resolves.toMatchObject({ valid: false, - errors: [expect.stringContaining('grain')], + errors: expect.arrayContaining([expect.stringContaining('grain')]), }); await expect( diff --git a/packages/context/src/sl/local-sl.ts b/packages/context/src/sl/local-sl.ts index 14559ffe..bf170d0a 100644 --- a/packages/context/src/sl/local-sl.ts +++ b/packages/context/src/sl/local-sl.ts @@ -7,7 +7,12 @@ import { HybridSearchCore, type SearchCandidateGenerator } from '../search/index import { DEFAULT_PRIORITY, resolveDescription } from './descriptions.js'; import { normalizeSemanticLayerDescriptions } from './description-normalization.js'; import { sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js'; -import { composeOverlay, type ManifestTableEntry, projectManifestEntry } from './semantic-layer.service.js'; +import { + composeOverlay, + type ManifestTableEntry, + projectManifestEntry, + SemanticLayerService, +} from './semantic-layer.service.js'; import type { PgliteSlSearchPrototypeOwnerOptions } from './pglite-sl-search-prototype.js'; import { loadLatestSlDictionaryEntries } from './sl-dictionary-profile.js'; import { buildSemanticLayerSourceSearchText, SlSearchService } from './sl-search.service.js'; @@ -157,7 +162,7 @@ function summarizeSource(args: { connectionId: string; path: string; raw: string } function sourceToYaml(source: SemanticLayerSource): string { - return YAML.stringify(source, { indent: 2, lineWidth: 0 }); + return YAML.stringify(source, { indent: 2, lineWidth: 0, version: '1.1' }); } function summarizeSemanticSource(args: { @@ -246,12 +251,24 @@ export async function loadLocalSlSourceRecords( return [...sources.values()].sort((left, right) => left.name.localeCompare(right.name)); } -export async function validateLocalSlSource(rawYaml: string): Promise { +export async function validateLocalSlSource( + rawYaml: string, + options?: { project?: KtxLocalProject; connectionId?: string }, +): Promise { try { const parsed = parseYamlRecord(rawYaml); const schema = parsed.table || parsed.sql ? sourceDefinitionSchema : sourceOverlaySchema; - schema.parse(parsed); - return { valid: true, errors: [] }; + const result = schema.parse(parsed); + const errors: string[] = []; + + if (options?.project && options.connectionId && 'table' in result && result.table) { + const service = new SemanticLayerService(options.project.fileStore, {} as never, {} as never); + errors.push( + ...(await service.validatePhysicalTableReferences(options.connectionId, [result as SemanticLayerSource])), + ); + } + + return { valid: errors.length === 0, errors }; } catch (error) { return { valid: false, errors: validationErrors(error) }; } @@ -261,7 +278,7 @@ export async function writeLocalSlSource( project: KtxLocalProject, input: { connectionId: string; sourceName: string; yaml: string }, ): Promise { - const validation = await validateLocalSlSource(input.yaml); + const validation = await validateLocalSlSource(input.yaml, { project, connectionId: input.connectionId }); if (!validation.valid) { throw new Error(`Invalid semantic-layer source: ${validation.errors.join('; ')}`); } diff --git a/packages/context/src/sl/schemas.ts b/packages/context/src/sl/schemas.ts index 706a4add..a42ecc87 100644 --- a/packages/context/src/sl/schemas.ts +++ b/packages/context/src/sl/schemas.ts @@ -63,6 +63,14 @@ const sourceFreshnessSchema = z.object({ dbt: freshnessDbtSchema.optional(), }); +// Identifiers (grain entries, column names) must be unqualified output-column +// names. A dot would mean the agent emitted a table-qualified reference like +// `activity.account_id` — those break SQL generation and grain semantics. +const unqualifiedNameSchema = z + .string() + .min(1) + .regex(/^[^.]+$/, "must be unqualified (no '.') — use the output column name"); + const joinDeclarationSchema = z.object({ to: z.string().min(1), on: z.string().min(1), @@ -71,7 +79,7 @@ const joinDeclarationSchema = z.object({ }); const sourceColumnSchema = z.object({ - name: z.string().min(1), + name: unqualifiedNameSchema, // type/description optional on standalone sources: compose-time enrichment fills them // from the manifest entry named in `inherits_columns_from`. If the agent does not set // `inherits_columns_from`, or the column is not in the manifest, type must be present @@ -90,7 +98,7 @@ const sourceColumnSchema = z.object({ /** Overlay column: type requires expr (structural types are inherited from manifest). */ const overlayColumnSchema = z .object({ - name: z.string().min(1), + name: unqualifiedNameSchema, type: z.enum(columnTypeValues).optional(), role: z.enum(columnRoleValues).optional(), visibility: z.enum(columnVisibilityValues).optional(), @@ -118,8 +126,13 @@ export const sourceDefinitionSchema = z // agent write `columns: [{name: FOO}]` instead of redeclaring known fields. // Lookup is fuzzy: bare key, fully-qualified table path, or any suffix all match. inherits_columns_from: z.string().optional(), - grain: z.array(z.string()).min(1), - columns: z.array(sourceColumnSchema).default([]), + grain: z.array(unqualifiedNameSchema).min(1), + // Standalone sources MUST declare columns. An empty columns array means + // there's nothing to query or join against and breaks grain validation + // (the grain must reference declared columns). Inheritance from a manifest + // via `inherits_columns_from` only fills in type/description on declared + // columns — the column names themselves must be listed here. + columns: z.array(sourceColumnSchema).min(1), joins: z.array(joinDeclarationSchema).default([]), measures: z.array(slMeasureDefinitionSchema).default([]), segments: z.array(segmentDefinitionSchema).optional(), @@ -139,7 +152,7 @@ export const sourceOverlaySchema = z name: z.string().min(1), description: z.string().optional(), descriptions: z.record(z.string(), z.string()).optional(), - grain: z.array(z.string()).optional(), + grain: z.array(unqualifiedNameSchema).optional(), columns: z.array(overlayColumnSchema).optional(), joins: z.array(joinDeclarationSchema).optional(), measures: z.array(slMeasureDefinitionSchema).optional(), diff --git a/packages/context/src/sl/semantic-layer.service.test.ts b/packages/context/src/sl/semantic-layer.service.test.ts index c89cbc28..308cc5aa 100644 --- a/packages/context/src/sl/semantic-layer.service.test.ts +++ b/packages/context/src/sl/semantic-layer.service.test.ts @@ -388,6 +388,34 @@ describe('sourceDefinitionSchema', () => { externalOwner: 'analytics', }); }); + + it("rejects qualified grain names (e.g. 'activity.account_id')", () => { + const result = sourceDefinitionSchema.safeParse({ + name: 'activity', + table: 'public.activity', + grain: ['activity.account_id'], + columns: [{ name: 'account_id', type: 'number' }], + joins: [], + measures: [], + }); + expect(result.success).toBe(false); + if (result.success) return; + expect(result.error.issues.some((i) => i.path.join('.').startsWith('grain'))).toBe(true); + }); + + it('rejects qualified column names', () => { + const result = sourceDefinitionSchema.safeParse({ + name: 'activity', + table: 'public.activity', + grain: ['account_id'], + columns: [{ name: 'activity.account_id', type: 'number' }], + joins: [], + measures: [], + }); + expect(result.success).toBe(false); + if (result.success) return; + expect(result.error.issues.some((i) => i.path.join('.').startsWith('columns'))).toBe(true); + }); }); describe('projectManifestEntry', () => { @@ -781,6 +809,210 @@ describe('validateWithProposedSource', () => { expect(result.errors[0]).toMatch(/Overlay 'orphan' has no matching manifest entry/); expect(pythonPort.validateSources).not.toHaveBeenCalled(); }); + + it('rejects table-backed sources whose declared columns are absent from a matching physical manifest', async () => { + const schemaPath = 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml'; + configService.listFiles.mockImplementation((dir: string) => { + if (dir === 'semantic-layer/dbt-main') { + return Promise.resolve({ files: [] }); + } + if (dir === 'semantic-layer') { + return Promise.resolve({ files: [schemaPath] }); + } + if (dir === 'semantic-layer/dbt-main/_schema' || dir === 'semantic-layer/postgres-warehouse/_schema') { + return Promise.resolve({ files: dir.endsWith('postgres-warehouse/_schema') ? [schemaPath] : [] }); + } + return Promise.resolve({ files: [] }); + }); + configService.readFile.mockImplementation((path: string) => { + if (path === schemaPath) { + return Promise.resolve({ + content: [ + 'tables:', + ' int_procurement_qualifying_actions:', + ' table: orbit_analytics.int_procurement_qualifying_actions', + ' columns:', + ' - { name: action_id, type: string }', + ' - { name: account_id, type: string }', + ' - { name: user_id, type: string }', + ' - { name: action_date, type: time }', + ' - { name: action_type, type: string }', + ].join('\n'), + }); + } + return Promise.reject(new Error(`Unexpected readFile: ${path}`)); + }); + pythonPort.validateSources.mockResolvedValue({ + data: { errors: [], warnings: [] }, + }); + + const result = await service.validateWithProposedSource('dbt-main', { + name: 'int_procurement_qualifying_actions', + table: 'orbit_analytics.int_procurement_qualifying_actions', + grain: ['purchase_request_id'], + columns: [ + { name: 'purchase_request_id', type: 'string' }, + { name: 'account_id', type: 'string' }, + { name: 'requester_user_id', type: 'string' }, + { name: 'action_week', type: 'time' }, + ], + joins: [], + measures: [{ name: 'qualifying_action_count', expr: 'count(purchase_request_id)' }], + }); + + expect(result.errors.join('\n')).toMatch(/declared column\(s\) absent from physical table/); + expect(result.errors.join('\n')).toMatch(/purchase_request_id/); + expect(result.errors.join('\n')).toMatch(/requester_user_id/); + expect(result.errors.join('\n')).toMatch(/action_week/); + expect(result.errors.join('\n')).toMatch(/measure "qualifying_action_count" references unknown column\(s\)/); + }); + + it('keeps valid table-backed sources clean when a physical manifest matches', async () => { + const schemaPath = 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml'; + configService.listFiles.mockImplementation((dir: string) => { + if (dir === 'semantic-layer/dbt-main') { + return Promise.resolve({ files: [] }); + } + if (dir === 'semantic-layer') { + return Promise.resolve({ files: [schemaPath] }); + } + if (dir === 'semantic-layer/dbt-main/_schema' || dir === 'semantic-layer/postgres-warehouse/_schema') { + return Promise.resolve({ files: dir.endsWith('postgres-warehouse/_schema') ? [schemaPath] : [] }); + } + return Promise.resolve({ files: [] }); + }); + configService.readFile.mockResolvedValue({ + content: [ + 'tables:', + ' mart_revenue_daily:', + ' table: orbit_analytics.mart_revenue_daily', + ' columns:', + ' - { name: revenue_date, type: time }', + ' - { name: gross_revenue_cents, type: number }', + ' - { name: credits_cents, type: number }', + ' - { name: refunds_cents, type: number }', + ' - { name: net_revenue_cents, type: number }', + ].join('\n'), + }); + pythonPort.validateSources.mockResolvedValue({ + data: { errors: [], warnings: [] }, + }); + + const result = await service.validateWithProposedSource('dbt-main', { + name: 'mart_revenue_daily', + table: 'orbit_analytics.mart_revenue_daily', + grain: ['revenue_date'], + columns: [ + { name: 'revenue_date', type: 'time' }, + { name: 'gross_revenue_cents', type: 'number' }, + { name: 'credits_cents', type: 'number' }, + { name: 'refunds_cents', type: 'number' }, + { name: 'net_revenue_cents', type: 'number' }, + ], + joins: [], + measures: [{ name: 'net_revenue', expr: 'sum(net_revenue_cents)' }], + }); + + expect(result.errors).toEqual([]); + }); + + it('allows SQL syntax tokens and cast types in physical expression validation', async () => { + const schemaPath = 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml'; + configService.listFiles.mockImplementation((dir: string) => { + if (dir === 'semantic-layer/dbt-main') { + return Promise.resolve({ files: [] }); + } + if (dir === 'semantic-layer') { + return Promise.resolve({ files: [schemaPath] }); + } + if (dir === 'semantic-layer/dbt-main/_schema' || dir === 'semantic-layer/postgres-warehouse/_schema') { + return Promise.resolve({ files: dir.endsWith('postgres-warehouse/_schema') ? [schemaPath] : [] }); + } + return Promise.resolve({ files: [] }); + }); + configService.readFile.mockResolvedValue({ + content: [ + 'tables:', + ' mart_revenue_daily:', + ' table: orbit_analytics.mart_revenue_daily', + ' columns:', + ' - { name: order_id, type: string }', + ' - { name: revenue_date, type: time }', + ' - { name: amount, type: number }', + ' - { name: status, type: string }', + ' - { name: created_at, type: time }', + ].join('\n'), + }); + pythonPort.validateSources.mockResolvedValue({ + data: { errors: [], warnings: [] }, + }); + + const result = await service.validateWithProposedSource('dbt-main', { + name: 'mart_revenue_daily', + table: 'orbit_analytics.mart_revenue_daily', + grain: ['order_id'], + columns: [ + { name: 'order_id', type: 'string' }, + { name: 'revenue_date', type: 'time' }, + { name: 'amount', type: 'number' }, + { name: 'status', type: 'string' }, + { name: 'created_at', type: 'time' }, + { name: 'status_text', type: 'string', expr: 'status::text' }, + ], + segments: [{ name: 'current_or_paid', expr: "created_at <= current_date OR status = 'paid'" }], + joins: [], + measures: [ + { name: 'paid_amount', expr: "sum(amount) FILTER (WHERE status = 'paid')" }, + { name: 'cast_amount_count', expr: 'count(cast(amount as text))' }, + ], + }); + + expect(result.errors).toEqual([]); + }); + + it('rejects join keys that are absent from matched physical sources', async () => { + const schemaPath = 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml'; + configService.listFiles.mockImplementation((dir: string) => { + if (dir === 'semantic-layer/dbt-main') { + return Promise.resolve({ files: [] }); + } + if (dir === 'semantic-layer') { + return Promise.resolve({ files: [schemaPath] }); + } + if (dir === 'semantic-layer/dbt-main/_schema' || dir === 'semantic-layer/postgres-warehouse/_schema') { + return Promise.resolve({ files: dir.endsWith('postgres-warehouse/_schema') ? [schemaPath] : [] }); + } + return Promise.resolve({ files: [] }); + }); + configService.readFile.mockResolvedValue({ + content: [ + 'tables:', + ' activity:', + ' table: orbit_analytics.activity', + ' columns:', + ' - { name: account_id, type: string }', + ' accounts:', + ' table: orbit_analytics.accounts', + ' columns:', + ' - { name: account_id, type: string }', + ].join('\n'), + }); + pythonPort.validateSources.mockResolvedValue({ + data: { errors: [], warnings: [] }, + }); + + const result = await service.validateWithProposedSource('dbt-main', { + name: 'activity', + table: 'orbit_analytics.activity', + grain: ['account_id'], + columns: [{ name: 'account_id', type: 'string' }], + joins: [{ to: 'accounts', on: 'activity.account_name = accounts.account_uuid', relationship: 'many_to_one' }], + measures: [], + }); + + expect(result.errors.join('\n')).toMatch(/local column "account_name"/); + expect(result.errors.join('\n')).toMatch(/target column "account_uuid"/); + }); }); describe('findDanglingSegmentRefs', () => { diff --git a/packages/context/src/sl/semantic-layer.service.ts b/packages/context/src/sl/semantic-layer.service.ts index 938763fe..0616851d 100644 --- a/packages/context/src/sl/semantic-layer.service.ts +++ b/packages/context/src/sl/semantic-layer.service.ts @@ -135,7 +135,7 @@ export class SemanticLayerService { const path = this.sourcePath(connectionId, source.name); const normalizedSource = normalizeSemanticLayerDescriptions(source); - const content = YAML.stringify(normalizedSource, { indent: 2, lineWidth: 0 }); + const content = YAML.stringify(normalizedSource, { indent: 2, lineWidth: 0, version: '1.1' }); const message = commitMessage ?? `Update semantic layer source: ${source.name}`; const result = await this.configService.writeFile(path, content, author, authorEmail, message, { skipLock: options?.skipLock, @@ -398,6 +398,174 @@ export class SemanticLayerService { return null; } + async findManifestEntryByTableRefAcrossConnections( + preferredConnectionId: string, + ref: string, + ): Promise<{ connectionId: string; source: SemanticLayerSource } | null> { + const preferred = await this.findManifestEntryByTableRef(preferredConnectionId, ref); + if (preferred) { + return { connectionId: preferredConnectionId, source: preferred }; + } + + for (const entry of await this.listAllManifestEntries()) { + if (entry.connectionId === preferredConnectionId) { + continue; + } + if (manifestEntryMatchesRef(entry.source, ref)) { + return entry; + } + } + + return null; + } + + async validatePhysicalTableReferences( + connectionId: string, + sources: SemanticLayerSource[], + ): Promise { + const errors: string[] = []; + const sourceNames = new Set(sources.map((s) => s.name.toLowerCase())); + const sourcesByName = new Map(sources.map((s) => [s.name.toLowerCase(), s])); + + for (const source of sources) { + if (!source.table) { + continue; + } + + const manifestMatch = await this.findManifestEntryByTableRefAcrossConnections(connectionId, source.table); + if (!manifestMatch) { + continue; + } + + const manifestSource = manifestMatch.source; + const manifestColumns = new Map(manifestSource.columns.map((c) => [c.name.toLowerCase(), c.name])); + const declaredColumns = source.columns ?? []; + const declaredByLower = new Map(declaredColumns.map((c) => [c.name.toLowerCase(), c])); + const validOutputColumns = new Set( + declaredColumns + .filter((c) => c.expr || manifestColumns.has(c.name.toLowerCase())) + .map((c) => c.name.toLowerCase()), + ); + const measureNames = new Set((source.measures ?? []).map((m) => m.name.toLowerCase())); + const manifestLabel = + manifestMatch.connectionId === connectionId + ? manifestSource.name + : `${manifestMatch.connectionId}/${manifestSource.name}`; + + const absentDeclaredColumns = declaredColumns + .filter((c) => !c.expr && !manifestColumns.has(c.name.toLowerCase())) + .map((c) => c.name); + if (absentDeclaredColumns.length > 0) { + errors.push( + `${source.name}.yaml: table "${source.table}" matched manifest ${manifestLabel}, ` + + `but declared column(s) absent from physical table: ${absentDeclaredColumns.join(', ')}. ` + + `Available columns: ${[...manifestColumns.values()].join(', ')}`, + ); + } + + const missingGrainColumns = (source.grain ?? []).filter((grain) => { + const declared = declaredByLower.get(grain.toLowerCase()); + return !declared || (!declared.expr && !manifestColumns.has(grain.toLowerCase())); + }); + if (missingGrainColumns.length > 0) { + errors.push( + `${source.name}.yaml: grain column(s) absent from physical table "${source.table}": ${missingGrainColumns.join(', ')}`, + ); + } + + for (const column of declaredColumns) { + if (!column.expr) { + continue; + } + const missing = missingLocalExpressionRefs({ + expr: column.expr, + sourceName: source.name, + sourceNames, + validColumns: new Set([...manifestColumns.keys(), ...validOutputColumns]), + validMeasures: new Set(), + }); + if (missing.length > 0) { + errors.push( + `${source.name}.yaml: computed column "${column.name}" references unknown column(s): ${missing.join(', ')}`, + ); + } + } + + for (const segment of source.segments ?? []) { + const missing = missingLocalExpressionRefs({ + expr: segment.expr, + sourceName: source.name, + sourceNames, + validColumns: validOutputColumns, + validMeasures: new Set(), + }); + if (missing.length > 0) { + errors.push( + `${source.name}.yaml: segment "${segment.name}" references unknown column(s): ${missing.join(', ')}`, + ); + } + } + + for (const measure of source.measures ?? []) { + const exprMissing = missingLocalExpressionRefs({ + expr: measure.expr, + sourceName: source.name, + sourceNames, + validColumns: validOutputColumns, + validMeasures: measureNames, + }); + if (exprMissing.length > 0) { + errors.push( + `${source.name}.yaml: measure "${measure.name}" references unknown column(s): ${exprMissing.join(', ')}`, + ); + } + + if (measure.filter) { + const filterMissing = missingLocalExpressionRefs({ + expr: measure.filter, + sourceName: source.name, + sourceNames, + validColumns: validOutputColumns, + validMeasures: new Set(), + }); + if (filterMissing.length > 0) { + errors.push( + `${source.name}.yaml: measure "${measure.name}" filter references unknown column(s): ${filterMissing.join(', ')}`, + ); + } + } + } + + for (const join of source.joins ?? []) { + const parsed = parseJoinColumns(join.on, source.name, join.to); + if (!parsed) { + continue; + } + if (!validOutputColumns.has(parsed.localColumn.toLowerCase())) { + errors.push( + `${source.name}.yaml: join to "${join.to}" references local column ` + + `"${parsed.localColumn}" that is not a valid output column`, + ); + } + + const targetSource = + sourcesByName.get(join.to.toLowerCase()) ?? + (await this.findManifestEntryByTableRefAcrossConnections(connectionId, join.to))?.source; + if (targetSource) { + const targetColumns = new Set(targetSource.columns.map((c) => c.name.toLowerCase())); + if (!targetColumns.has(parsed.targetColumn.toLowerCase())) { + errors.push( + `${source.name}.yaml: join to "${join.to}" references target column ` + + `"${parsed.targetColumn}" that does not exist on the target source`, + ); + } + } + } + } + + return errors; + } + async getDialectForConnection(connectionId: string): Promise { const connection = await this.connections.getConnectionById(connectionId); if (!connection) { @@ -502,10 +670,15 @@ export class SemanticLayerService { return { errors: [errorMsg], warnings: [], perSourceWarnings: {} }; } if (!data) { - return { errors: [], warnings: [], perSourceWarnings: {} }; + return { + errors: await this.validatePhysicalTableReferences(connectionId, validatable), + warnings: [], + perSourceWarnings: {}, + }; } + const physicalErrors = await this.validatePhysicalTableReferences(connectionId, validatable); return { - errors: data.errors ?? [], + errors: [...(data.errors ?? []), ...physicalErrors], warnings: data.warnings ?? [], perSourceWarnings: data.per_source_warnings ?? {}, }; @@ -531,14 +704,40 @@ export class SemanticLayerService { return { errors: [formatPortError(error, 'Unknown validation error')], warnings: [] }; } if (!data) { - return { errors: [], warnings: [] }; + return { errors: await this.validatePhysicalTableReferences(connectionId, sources), warnings: [] }; } + const physicalErrors = await this.validatePhysicalTableReferences(connectionId, sources); return { - errors: data.errors ?? [], + errors: [...(data.errors ?? []), ...physicalErrors], warnings: data.warnings ?? [], }; } + private async listAllManifestEntries(): Promise> { + let files: string[]; + try { + files = (await this.configService.listFiles(SL_DIR_PREFIX)).files; + } catch { + return []; + } + + const schemaFiles = files.filter((file) => /^semantic-layer\/[^/]+\/_schema\/.+\.ya?ml$/.test(file)); + const entries: Array<{ connectionId: string; source: SemanticLayerSource }> = []; + for (const filePath of schemaFiles) { + const connectionId = filePath.split('/')[1]; + try { + const { content } = await this.configService.readFile(filePath); + const shard = YAML.parse(content) as { tables?: Record }; + for (const [name, entry] of Object.entries(shard?.tables ?? {})) { + entries.push({ connectionId, source: projectManifestEntry(name, entry) }); + } + } catch { + // skip unparseable shards + } + } + return entries; + } + /** * Validate overlays and standalone sources against the current manifest. * Returns warnings for stale references (non-blocking). @@ -963,6 +1162,8 @@ const SQL_KEYWORDS = new Set([ 'in', 'between', 'like', + 'where', + 'filter', 'cast', 'coalesce', 'nullif', @@ -971,6 +1172,48 @@ const SQL_KEYWORDS = new Set([ 'false', 'asc', 'desc', + 'date', + 'day', + 'month', + 'quarter', + 'week', + 'year', + 'interval', + 'extract', + 'from', + 'over', + 'partition', + 'by', + 'rows', + 'range', + 'current', + 'current_date', + 'current_time', + 'current_timestamp', + 'localtime', + 'localtimestamp', + 'row', + 'numeric', + 'decimal', + 'int', + 'integer', + 'bigint', + 'smallint', + 'float', + 'double', + 'real', + 'string', + 'text', + 'char', + 'character', + 'varchar', + 'timestamp', + 'time', + 'uuid', + 'json', + 'jsonb', + 'bool', + 'boolean', ]); function extractColumnReferences(expr: string): string[] { @@ -979,6 +1222,122 @@ function extractColumnReferences(expr: string): string[] { return [...new Set(tokens.filter((t) => !SQL_KEYWORDS.has(t.toLowerCase())))]; } +function manifestEntryMatchesRef(source: SemanticLayerSource, ref: string): boolean { + if (source.name.toLowerCase() === ref.toLowerCase()) { + return true; + } + const table = source.table?.toLowerCase(); + const lowered = ref.toLowerCase(); + return !!table && (table === lowered || table.endsWith(`.${lowered}`)); +} + +function normalizeSqlExpressionForIdentifierScan(expr: string): string { + return expr + .replace(/--.*$/gm, ' ') + .replace(/\/\*[\s\S]*?\*\//g, ' ') + .replace(/'([^']|'')*'/g, ' ') + .replace(/"([^"]+)"/g, '$1') + .replace(/`([^`]+)`/g, '$1') + .replace(/\[([^\]]+)\]/g, '$1') + .replace(/::\s*[A-Za-z_][\w$]*(?:\s*\([^)]*\))?/g, ' '); +} + +function extractSqlIdentifierRefs(expr: string): Array<{ qualifier?: string; name: string }> { + const normalized = normalizeSqlExpressionForIdentifierScan(expr); + const refs = new Map(); + const re = /(?:\b([A-Za-z_][\w$]*)\s*\.\s*)?(\b[A-Za-z_][\w$]*)\b/g; + for (const match of normalized.matchAll(re)) { + const qualifier = match[1]; + const name = match[2]; + if (!name) { + continue; + } + const nameLower = name.toLowerCase(); + const qualifierLower = qualifier?.toLowerCase(); + const after = normalized.slice((match.index ?? 0) + match[0].length).trimStart(); + if (!qualifier && after.startsWith('(')) { + continue; + } + if (SQL_KEYWORDS.has(nameLower) || (qualifierLower && SQL_KEYWORDS.has(qualifierLower))) { + continue; + } + refs.set(`${qualifierLower ?? ''}.${nameLower}`, qualifier ? { qualifier, name } : { name }); + } + return [...refs.values()]; +} + +function refBelongsToSource( + ref: { qualifier?: string; name: string }, + sourceName: string, + sourceNames: Set, +): boolean { + if (!ref.qualifier) { + return true; + } + const qualifier = ref.qualifier.toLowerCase(); + if (qualifier === sourceName.toLowerCase()) { + return true; + } + return !sourceNames.has(qualifier); +} + +function missingLocalExpressionRefs(input: { + expr: string; + sourceName: string; + sourceNames: Set; + validColumns: Set; + validMeasures: Set; +}): string[] { + const missing = new Set(); + for (const ref of extractSqlIdentifierRefs(input.expr)) { + if (!refBelongsToSource(ref, input.sourceName, input.sourceNames)) { + continue; + } + const name = ref.name.toLowerCase(); + if (!input.validColumns.has(name) && !input.validMeasures.has(name)) { + missing.add(ref.name); + } + } + return [...missing].sort(); +} + +function parseJoinSide(side: string): { qualifier?: string; column: string } | null { + const match = side.trim().match(/^(?:(\w+)\.)?(\w+)$/); + if (!match) { + return null; + } + return match[1] ? { qualifier: match[1], column: match[2] } : { column: match[2] }; +} + +function parseJoinColumns( + on: string, + sourceName: string, + targetName: string, +): { localColumn: string; targetColumn: string } | null { + const sides = on.split('='); + if (sides.length !== 2) { + return null; + } + const left = parseJoinSide(sides[0]); + const right = parseJoinSide(sides[1]); + if (!left || !right) { + return null; + } + + const sourceLower = sourceName.toLowerCase(); + const targetLower = targetName.toLowerCase(); + const leftQualifier = left.qualifier?.toLowerCase(); + const rightQualifier = right.qualifier?.toLowerCase(); + + if (leftQualifier === targetLower || rightQualifier === sourceLower) { + return { localColumn: right.column, targetColumn: left.column }; + } + if (rightQualifier === targetLower || leftQualifier === sourceLower || !leftQualifier) { + return { localColumn: left.column, targetColumn: right.column }; + } + return { localColumn: left.column, targetColumn: right.column }; +} + /** * Returns one message per measure-level segment reference that doesn't resolve to * a segment defined on the source. Array is empty when every reference checks out. diff --git a/packages/context/src/sl/tools/sl-edit-source.tool.ts b/packages/context/src/sl/tools/sl-edit-source.tool.ts index 17a85990..27b582d5 100644 --- a/packages/context/src/sl/tools/sl-edit-source.tool.ts +++ b/packages/context/src/sl/tools/sl-edit-source.tool.ts @@ -1,6 +1,6 @@ import YAML from 'yaml'; import { z } from 'zod'; -import { addTouchedSlSource, type ToolContext, type ToolOutput } from '../../tools/index.js'; +import { addTouchedSlSource, type ToolContext, type ToolOutput, validateActionRawPaths } from '../../tools/index.js'; import { applySqlEdits } from '../../tools/sql-edit-replacer.js'; import { normalizeSemanticLayerDescriptions } from '../description-normalization.js'; import type { SemanticLayerSource } from '../types.js'; @@ -25,6 +25,10 @@ const slEditSourceInputSchema = z.object({ .optional() .describe('Targeted exact-match search/replace edits on the raw YAML content.'), delete: z.boolean().optional().describe('Set to true to delete this source entirely'), + rawPaths: z + .array(z.string().min(1)) + .optional() + .describe('In ingest sessions, raw source file paths that directly support this SL action.'), }); type SlEditSourceInput = z.infer; @@ -75,6 +79,10 @@ If no source exists yet, use sl_write_source instead — this tool will reject t const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService; const skipIndex = context.session?.isWorktreeScoped === true; + const rawPathValidation = validateActionRawPaths(context.session, input.rawPaths); + if (!rawPathValidation.ok) { + return this.buildOutput(false, [rawPathValidation.error], sourceName); + } // Handle delete if (input.delete) { @@ -88,6 +96,7 @@ If no source exists yet, use sl_write_source instead — this tool will reject t key: sourceName, detail: 'Deleted source', targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId), + ...(rawPathValidation.rawPaths ? { rawPaths: rawPathValidation.rawPaths } : {}), }); } return this.buildOutput(true, [], sourceName, { yaml: undefined, commitHash: undefined }); @@ -151,7 +160,7 @@ If no source exists yet, use sl_write_source instead — this tool will reject t source = normalizeSemanticLayerDescriptions(source, { fillMissing: !!context.session?.ingest }); // Re-serialize and write - const updatedYaml = YAML.stringify(source, { indent: 2, lineWidth: 0 }); + const updatedYaml = YAML.stringify(source, { indent: 2, lineWidth: 0, version: '1.1' }); const { errors: validationErrors, warnings: validationWarnings } = await semanticLayerService.validateWithProposedSource(connectionId, source); @@ -184,6 +193,7 @@ If no source exists yet, use sl_write_source instead — this tool will reject t key: sourceName, detail: `Applied ${editCount} edit(s)`, targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId), + ...(rawPathValidation.rawPaths ? { rawPaths: rawPathValidation.rawPaths } : {}), }); } diff --git a/packages/context/src/sl/tools/sl-warehouse-validation.test.ts b/packages/context/src/sl/tools/sl-warehouse-validation.test.ts index d0f7f04a..16d4ec7f 100644 --- a/packages/context/src/sl/tools/sl-warehouse-validation.test.ts +++ b/packages/context/src/sl/tools/sl-warehouse-validation.test.ts @@ -9,6 +9,7 @@ function makeDeps(opts: { sourceYaml: string; executeQuery: ReturnType { + const yaml = `name: int_active_contract_arr +table: orbit_analytics.int_active_contract_arr +grain: [contract_id] +columns: + - {name: contract_id, type: string} + - {name: arr_cents, type: number} +measures: + - {name: arr, expr: sum(arr_cents)} +joins: [] +`; + const executeQuery = vi.fn(); + const deps = makeDeps({ sourceYaml: yaml, executeQuery }) as any; + deps.semanticLayerService.validatePhysicalTableReferences.mockResolvedValue([ + 'int_active_contract_arr.yaml: declared column(s) absent from physical table: arr_cents', + ]); + + const result = await validateSingleSource(deps, 'conn-1', 'int_active_contract_arr'); + + expect(result.errors).toContain( + 'int_active_contract_arr.yaml: declared column(s) absent from physical table: arr_cents', + ); + expect(executeQuery).not.toHaveBeenCalled(); + }); }); diff --git a/packages/context/src/sl/tools/sl-warehouse-validation.ts b/packages/context/src/sl/tools/sl-warehouse-validation.ts index b2799974..f9c5e4fd 100644 --- a/packages/context/src/sl/tools/sl-warehouse-validation.ts +++ b/packages/context/src/sl/tools/sl-warehouse-validation.ts @@ -4,6 +4,7 @@ import { SYSTEM_GIT_AUTHOR } from '../../tools/index.js'; import type { SlConnectionCatalogPort, SlSourcesIndexPort } from '../ports.js'; import { sourceOverlaySchema } from '../schemas.js'; import { SemanticLayerService } from '../semantic-layer.service.js'; +import type { SemanticLayerSource } from '../types.js'; import { sourceDefinitionSchema } from './base-semantic-layer.tool.js'; export interface SlValidationDeps { @@ -118,6 +119,14 @@ export async function validateSingleSource( return { errors, warnings }; } + if (!isOverlay && 'table' in result.data && result.data.table) { + errors.push( + ...(await deps.semanticLayerService.validatePhysicalTableReferences(connectionId, [ + result.data as SemanticLayerSource, + ])), + ); + } + const measures = (parsed.measures as Array<{ name: string }> | undefined) ?? []; const seenMeasures = new Set(); for (const m of measures) { diff --git a/packages/context/src/sl/tools/sl-write-source.tool.ts b/packages/context/src/sl/tools/sl-write-source.tool.ts index 638b130e..34b6f8c4 100644 --- a/packages/context/src/sl/tools/sl-write-source.tool.ts +++ b/packages/context/src/sl/tools/sl-write-source.tool.ts @@ -1,6 +1,6 @@ import YAML from 'yaml'; import { z } from 'zod'; -import { addTouchedSlSource, type ToolContext, type ToolOutput } from '../../tools/index.js'; +import { addTouchedSlSource, type ToolContext, type ToolOutput, validateActionRawPaths } from '../../tools/index.js'; import { sourceOverlaySchema } from '../schemas.js'; import type { SemanticLayerService } from '../semantic-layer.service.js'; import type { SemanticLayerSource } from '../types.js'; @@ -25,6 +25,10 @@ const slWriteSourceInputSchema = z.object({ .optional() .describe('Source definition (standalone with table/sql) or overlay (measures, computed columns, etc.)'), delete: z.boolean().optional().describe('Set to true to delete this source entirely'), + rawPaths: z + .array(z.string().min(1)) + .optional() + .describe('In ingest sessions, raw source file paths that directly support this SL action.'), }); type SlWriteSourceInput = z.infer; @@ -99,6 +103,10 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService; const skipIndex = context.session?.isWorktreeScoped === true; + const rawPathValidation = validateActionRawPaths(context.session, input.rawPaths); + if (!rawPathValidation.ok) { + return this.buildOutput(false, [rawPathValidation.error], sourceName); + } // Handle delete if (input.delete) { @@ -116,6 +124,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co key: sourceName, detail: 'Deleted source', targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId), + ...(rawPathValidation.rawPaths ? { rawPaths: rawPathValidation.rawPaths } : {}), }); } return this.buildOutput(true, [], sourceName, { yaml: undefined, commitHash: undefined }); @@ -142,6 +151,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co context, semanticLayerService, skipIndex, + rawPathValidation.rawPaths, ); } @@ -154,6 +164,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co context: ToolContext, semanticLayerService: SemanticLayerService, skipIndex: boolean, + rawPaths: string[] | undefined, ): Promise> { const normalizedSource = normalizeSemanticLayerDescriptions(source, { fillMissing: !!context.session?.ingest }); const isOverlay = @@ -164,7 +175,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co ? `${isOverlay ? 'Update overlay' : 'Rewrite source'}: ${sourceName}` : `${isOverlay ? 'Create overlay' : 'Create source'}: ${sourceName}`; - const yamlContent = YAML.stringify(normalizedSource); + const yamlContent = YAML.stringify(normalizedSource, { indent: 2, lineWidth: 0, version: '1.1' }); const orphanError = await this.rejectOrphanOverlay(semanticLayerService, connectionId, sourceName, yamlContent); if (orphanError) { @@ -211,6 +222,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co key: sourceName, detail: existing ? `Rewrote source` : `Created source`, targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId), + ...(rawPaths ? { rawPaths } : {}), }); } diff --git a/packages/context/src/tools/action-raw-paths.ts b/packages/context/src/tools/action-raw-paths.ts new file mode 100644 index 00000000..b0c47e1a --- /dev/null +++ b/packages/context/src/tools/action-raw-paths.ts @@ -0,0 +1,30 @@ +import type { ToolSession } from './tool-session.js'; + +type ActionRawPathValidation = + | { ok: true; rawPaths?: string[] } + | { ok: false; error: string }; + +export function validateActionRawPaths( + session: ToolSession | undefined, + rawPaths: readonly string[] | undefined, +): ActionRawPathValidation { + if (!rawPaths || rawPaths.length === 0) { + return { ok: true }; + } + + const uniqueRawPaths = [...new Set(rawPaths)]; + const allowedRawPaths = session?.allowedRawPaths; + if (!allowedRawPaths) { + return { ok: true, rawPaths: uniqueRawPaths }; + } + + const unavailable = uniqueRawPaths.filter((rawPath) => !allowedRawPaths.has(rawPath)); + if (unavailable.length > 0) { + return { + ok: false, + error: `rawPaths include unavailable ingest file(s): ${unavailable.join(', ')}`, + }; + } + + return { ok: true, rawPaths: uniqueRawPaths }; +} diff --git a/packages/context/src/tools/index.ts b/packages/context/src/tools/index.ts index 7116b54c..a3fc5e7b 100644 --- a/packages/context/src/tools/index.ts +++ b/packages/context/src/tools/index.ts @@ -31,6 +31,7 @@ export { ingestMetadataRequired, resolveIngestMetadata } from './context-ingest- export type { SqlEdit } from './sql-edit-replacer.js'; export { applySqlEdits } from './sql-edit-replacer.js'; export type { IngestToolMetadata, MemoryAction, ToolSession } from './tool-session.js'; +export { validateActionRawPaths } from './action-raw-paths.js'; export type { TouchedSlSource, TouchedSlSourceSet } from './touched-sl-sources.js'; export { addTouchedSlSource, diff --git a/packages/context/src/tools/tool-session.ts b/packages/context/src/tools/tool-session.ts index 8849b673..d9156258 100644 --- a/packages/context/src/tools/tool-session.ts +++ b/packages/context/src/tools/tool-session.ts @@ -16,6 +16,7 @@ export interface MemoryAction { key: string; detail: string; targetConnectionId?: string | null; + rawPaths?: string[]; } interface EvictionDecisionRecord { @@ -45,6 +46,7 @@ export interface ToolSession { preHead: string | null; touchedSlSources: TouchedSlSourceSet; actions: MemoryAction[]; + allowedRawPaths?: ReadonlySet; semanticLayerService: SemanticLayerService; wikiService: KnowledgeWikiService; configService: KtxFileStorePort; diff --git a/packages/context/src/wiki/index.ts b/packages/context/src/wiki/index.ts index 892eff34..6eae10f0 100644 --- a/packages/context/src/wiki/index.ts +++ b/packages/context/src/wiki/index.ts @@ -1,4 +1,11 @@ export { buildKnowledgeSearchText } from './knowledge-search-text.js'; +export { + assertFlatWikiKey, + invalidFlatWikiKeyMessage, + isFlatWikiKey, + suggestFlatWikiKey, + validateFlatWikiKey, +} from './keys.js'; export { KnowledgeWikiService } from './knowledge-wiki.service.js'; export * from './local-knowledge.js'; export type { diff --git a/packages/context/src/wiki/keys.ts b/packages/context/src/wiki/keys.ts new file mode 100644 index 00000000..4bffae2e --- /dev/null +++ b/packages/context/src/wiki/keys.ts @@ -0,0 +1,31 @@ +const FLAT_WIKI_KEY_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/; + +export function suggestFlatWikiKey(key: string): string { + const suggested = key + .trim() + .replace(/[\\/]+/g, '-') + .replace(/[^a-zA-Z0-9_-]+/g, '-') + .replace(/-+/g, '-') + .replace(/^[-_]+|[-_]+$/g, ''); + return suggested.length > 0 ? suggested : 'page-key'; +} + +export function invalidFlatWikiKeyMessage(key: string): string { + return `Invalid wiki key "${key}". Wiki keys must be flat; use "${suggestFlatWikiKey(key)}".`; +} + +export function isFlatWikiKey(key: string): boolean { + return FLAT_WIKI_KEY_PATTERN.test(key); +} + +export function validateFlatWikiKey(key: string): { ok: true; key: string } | { ok: false; error: string } { + return isFlatWikiKey(key) ? { ok: true, key } : { ok: false, error: invalidFlatWikiKeyMessage(key) }; +} + +export function assertFlatWikiKey(key: string): string { + const result = validateFlatWikiKey(key); + if (!result.ok) { + throw new Error(result.error); + } + return result.key; +} diff --git a/packages/context/src/wiki/knowledge-wiki.service.test.ts b/packages/context/src/wiki/knowledge-wiki.service.test.ts index 8fadee11..ecbf954a 100644 --- a/packages/context/src/wiki/knowledge-wiki.service.test.ts +++ b/packages/context/src/wiki/knowledge-wiki.service.test.ts @@ -33,13 +33,19 @@ function makeService() { diffNameStatus: vi.fn().mockResolvedValue([]), getFileAtCommit: vi.fn().mockResolvedValue(''), }; + const logger = { + log: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + }; const service = new KnowledgeWikiService( configService as any, embeddingService as any, pagesRepository as any, gitService as any, + logger as any, ); - return { service, pagesRepository, embeddingService, configService, gitService }; + return { service, pagesRepository, embeddingService, configService, gitService, logger }; } const fm: WikiFrontmatter = { summary: 'sum', usage_mode: 'auto' }; @@ -107,6 +113,53 @@ describe('KnowledgeWikiService.syncFromCommit', () => { expect(call.deletes).toEqual([{ scope: 'GLOBAL', scopeId: null, pageKey: 'gone-page' }]); }); + it('indexes historic-SQL nested pages but skips other nested wiki paths from commit sync', async () => { + const { service, pagesRepository, gitService, logger } = makeService(); + + gitService.diffNameStatus.mockResolvedValue([ + { status: 'A', path: 'knowledge/global/revenue-policy.md' }, + { status: 'A', path: 'knowledge/global/historic-sql/order-lifecycle.md' }, + { status: 'A', path: 'knowledge/global/historic-sql/_archived/retired-pattern.md' }, + { status: 'A', path: 'knowledge/global/orbit/company-overview.md' }, + ]); + gitService.getFileAtCommit.mockImplementation((path: string) => { + if (path.endsWith('revenue-policy.md')) { + return Promise.resolve('---\nsummary: revenue\nusage_mode: auto\n---\n\nbody-revenue\n'); + } + if (path.endsWith('order-lifecycle.md')) { + return Promise.resolve('---\nsummary: order lifecycle\nusage_mode: auto\n---\n\nbody-orders\n'); + } + if (path.endsWith('retired-pattern.md')) { + return Promise.resolve('---\nsummary: retired\nusage_mode: never\n---\n\nbody-retired\n'); + } + return Promise.reject(new Error(`unexpected getFileAtCommit path: ${path}`)); + }); + + await service.syncFromCommit('sha-before', 'sha-after', 'run-uuid'); + + expect(gitService.getFileAtCommit).not.toHaveBeenCalledWith('knowledge/global/orbit/company-overview.md', 'sha-after'); + expect(logger.warn).toHaveBeenCalledWith( + '[knowledge.sync] skipping unparseable path: knowledge/global/orbit/company-overview.md', + ); + const call = pagesRepository.applyDiffTransactional.mock.calls[0][0]; + expect(call.upserts).toEqual( + expect.arrayContaining([ + expect.objectContaining({ scope: 'GLOBAL', pageKey: 'revenue-policy', summary: 'revenue' }), + expect.objectContaining({ + scope: 'GLOBAL', + pageKey: 'historic-sql/order-lifecycle', + summary: 'order lifecycle', + }), + expect.objectContaining({ + scope: 'GLOBAL', + pageKey: 'historic-sql/_archived/retired-pattern', + summary: 'retired', + }), + ]), + ); + expect(call.upserts).toHaveLength(3); + }); + it('is a no-op when the diff between shas has no knowledge changes', async () => { const { service, pagesRepository, gitService } = makeService(); gitService.diffNameStatus.mockResolvedValue([]); diff --git a/packages/context/src/wiki/knowledge-wiki.service.ts b/packages/context/src/wiki/knowledge-wiki.service.ts index 6234c729..2ca32f79 100644 --- a/packages/context/src/wiki/knowledge-wiki.service.ts +++ b/packages/context/src/wiki/knowledge-wiki.service.ts @@ -2,6 +2,7 @@ import { createHash } from 'node:crypto'; import YAML from 'yaml'; import type { KtxEmbeddingPort, KtxFileStorePort, KtxLogger } from '../core/index.js'; import { noopLogger } from '../core/index.js'; +import { assertFlatWikiKey, isFlatWikiKey } from './keys.js'; import { buildKnowledgeSearchText } from './knowledge-search-text.js'; import type { KnowledgeGitDiffPort, KnowledgeIndexPort, UpsertPageParams } from './ports.js'; import type { WikiFrontmatter, WikiPage, WikiPageWithScope } from './types.js'; @@ -10,6 +11,10 @@ const WIKI_PREFIX = 'knowledge'; export type { WikiFrontmatter }; +function isHistoricSqlPathSegment(segment: string): boolean { + return /^[a-zA-Z0-9_][a-zA-Z0-9_-]*$/.test(segment); +} + export class KnowledgeWikiService { private isWorktreeScoped = false; @@ -53,7 +58,7 @@ export class KnowledgeWikiService { } pagePath(scope: string, scopeId: string | null | undefined, pageKey: string): string { - return `${this.scopeDir(scope, scopeId)}/${pageKey}.md`; + return `${this.scopeDir(scope, scopeId)}/${assertFlatWikiKey(pageKey)}.md`; } // ── Parsing / serialization ─────────────────────────────────── @@ -140,7 +145,7 @@ export class KnowledgeWikiService { const name = f.replace(`${dir}/`, '').replace(/\.md$/, ''); return name; }) - .filter((name) => !name.includes('/')); + .filter(isFlatWikiKey); } catch { return []; } @@ -417,6 +422,7 @@ export class KnowledgeWikiService { * Parse a `knowledge//...` file path into its scope and page key. * `knowledge/global/foo.md` → { scope: 'GLOBAL', scopeId: null, pageKey: 'foo' } * `knowledge/user//bar.md` → { scope: 'USER', scopeId: '', pageKey: 'bar' } + * `knowledge/global/historic-sql/foo.md` → { scope: 'GLOBAL', scopeId: null, pageKey: 'historic-sql/foo' } */ function parseKnowledgePath(path: string): { scope: string; scopeId: string | null; pageKey: string } | null { if (!path.endsWith('.md')) { @@ -428,10 +434,19 @@ function parseKnowledgePath(path: string): { scope: string; scopeId: string | nu } const rest = segments.slice(1); if (rest.length === 2 && rest[0] === 'global') { - return { scope: 'GLOBAL', scopeId: null, pageKey: rest[1].replace(/\.md$/, '') }; + const pageKey = rest[1].replace(/\.md$/, ''); + return isFlatWikiKey(pageKey) ? { scope: 'GLOBAL', scopeId: null, pageKey } : null; + } + if (rest.length >= 3 && rest[0] === 'global' && rest[1] === 'historic-sql') { + const historicPath = rest.slice(2).join('/').replace(/\.md$/, ''); + if (historicPath.split('/').every(isHistoricSqlPathSegment)) { + return { scope: 'GLOBAL', scopeId: null, pageKey: `historic-sql/${historicPath}` }; + } + return null; } if (rest.length === 3 && rest[0] === 'user') { - return { scope: 'USER', scopeId: rest[1], pageKey: rest[2].replace(/\.md$/, '') }; + const pageKey = rest[2].replace(/\.md$/, ''); + return isFlatWikiKey(pageKey) ? { scope: 'USER', scopeId: rest[1], pageKey } : null; } return null; } diff --git a/packages/context/src/wiki/local-knowledge.test.ts b/packages/context/src/wiki/local-knowledge.test.ts index 1b979cef..78da841f 100644 --- a/packages/context/src/wiki/local-knowledge.test.ts +++ b/packages/context/src/wiki/local-knowledge.test.ts @@ -37,7 +37,7 @@ describe('local knowledge helpers', () => { it('writes, reads, lists, and searches global knowledge pages', async () => { const write = await writeLocalKnowledgePage(project, { - key: 'metrics/revenue', + key: 'metrics-revenue', scope: 'GLOBAL', summary: 'Revenue metric definition', content: 'Revenue is recognized when an order is paid.', @@ -46,11 +46,11 @@ describe('local knowledge helpers', () => { slRefs: ['orders'], }); - expect(write.path).toBe('knowledge/global/metrics/revenue.md'); + expect(write.path).toBe('knowledge/global/metrics-revenue.md'); expect(write.operation).toBe('write'); - await expect(readLocalKnowledgePage(project, { key: 'metrics/revenue', userId: 'local' })).resolves.toMatchObject({ - key: 'metrics/revenue', + await expect(readLocalKnowledgePage(project, { key: 'metrics-revenue', userId: 'local' })).resolves.toMatchObject({ + key: 'metrics-revenue', scope: 'GLOBAL', summary: 'Revenue metric definition', content: 'Revenue is recognized when an order is paid.', @@ -61,8 +61,8 @@ describe('local knowledge helpers', () => { await expect(listLocalKnowledgePages(project, { userId: 'local' })).resolves.toEqual([ { - key: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + key: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', scope: 'GLOBAL', summary: 'Revenue metric definition', }, @@ -71,8 +71,8 @@ describe('local knowledge helpers', () => { const search = await searchLocalKnowledgePages(project, { query: 'paid order', userId: 'local' }); expect(search).toEqual([ expect.objectContaining({ - key: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + key: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', scope: 'GLOBAL', score: expect.any(Number), matchReasons: expect.arrayContaining(['lexical']), @@ -85,7 +85,7 @@ describe('local knowledge helpers', () => { it('adds the token lane alongside lexical wiki matches', async () => { await writeLocalKnowledgePage(project, { - key: 'metrics/revenue', + key: 'metrics-revenue', scope: 'GLOBAL', summary: 'Revenue metric definition', content: 'Revenue is recognized when an order is paid.', @@ -95,7 +95,7 @@ describe('local knowledge helpers', () => { const search = await searchLocalKnowledgePages(project, { query: 'paid---', userId: 'local', limit: 5 }); expect(search[0]).toMatchObject({ - key: 'metrics/revenue', + key: 'metrics-revenue', matchReasons: expect.arrayContaining(['token']), lanes: expect.arrayContaining([expect.objectContaining({ lane: 'token', status: 'available' })]), }); @@ -103,14 +103,14 @@ describe('local knowledge helpers', () => { it('uses stored page embeddings when a wiki embedding backend is configured', async () => { await writeLocalKnowledgePage(project, { - key: 'metrics/revenue', + key: 'metrics-revenue', scope: 'GLOBAL', summary: 'Semantic revenue definition', content: 'Revenue search text.', tags: ['finance'], }); await writeLocalKnowledgePage(project, { - key: 'support/escalations', + key: 'support-escalations', scope: 'GLOBAL', summary: 'Support escalation process', content: 'Support search text.', @@ -125,7 +125,7 @@ describe('local knowledge helpers', () => { }); expect(search[0]).toMatchObject({ - key: 'metrics/revenue', + key: 'metrics-revenue', matchReasons: expect.arrayContaining(['semantic']), lanes: expect.arrayContaining([expect.objectContaining({ lane: 'semantic', status: 'available' })]), }); @@ -133,7 +133,7 @@ describe('local knowledge helpers', () => { it('reports semantic lane as skipped when wiki embeddings are not configured', async () => { await writeLocalKnowledgePage(project, { - key: 'metrics/revenue', + key: 'metrics-revenue', scope: 'GLOBAL', summary: 'Revenue metric definition', content: 'Revenue is recognized when an order is paid.', @@ -172,7 +172,7 @@ describe('local knowledge helpers', () => { it('serializes historic-SQL frontmatter fields for global pages', async () => { await writeLocalKnowledgePage(project, { - key: 'queries/monthly-paid-orders', + key: 'monthly-paid-orders', scope: 'GLOBAL', summary: 'Monthly paid orders', content: '## Monthly paid order count', @@ -195,7 +195,7 @@ describe('local knowledge helpers', () => { fingerprints: ['fp_paid_orders'], }); - const raw = await project.fileStore.readFile('knowledge/global/queries/monthly-paid-orders.md'); + const raw = await project.fileStore.readFile('knowledge/global/monthly-paid-orders.md'); expect(raw.content).toContain('source: historic-sql'); expect(raw.content).toContain('intent: Monthly paid order count'); expect(raw.content).toContain(['tables:', ' - analytics.orders'].join('\n')); @@ -207,7 +207,7 @@ describe('local knowledge helpers', () => { it('falls back to Markdown scanning when the config does not select sqlite-fts5', async () => { project.config.storage.search = 'postgres-hybrid'; await writeLocalKnowledgePage(project, { - key: 'metrics/revenue', + key: 'metrics-revenue', scope: 'GLOBAL', summary: 'Revenue metric definition', content: 'Revenue is recognized when an order is paid.', @@ -216,7 +216,7 @@ describe('local knowledge helpers', () => { await expect(searchLocalKnowledgePages(project, { query: 'paid order', userId: 'local' })).resolves.toEqual([ expect.objectContaining({ - key: 'metrics/revenue', + key: 'metrics-revenue', score: 3, matchReasons: ['token'], }), @@ -231,6 +231,17 @@ describe('local knowledge helpers', () => { summary: 'bad', content: 'bad', }), - ).rejects.toThrow('Unsafe knowledge key'); + ).rejects.toThrow('Invalid wiki key "../secret". Wiki keys must be flat; use "secret".'); + }); + + it('rejects slash-delimited knowledge keys with a flat-key suggestion', async () => { + await expect( + writeLocalKnowledgePage(project, { + key: 'orbit/company-overview', + scope: 'GLOBAL', + summary: 'bad', + content: 'bad', + }), + ).rejects.toThrow('Invalid wiki key "orbit/company-overview". Wiki keys must be flat; use "orbit-company-overview".'); }); }); diff --git a/packages/context/src/wiki/local-knowledge.ts b/packages/context/src/wiki/local-knowledge.ts index 11cc1cc1..007b006e 100644 --- a/packages/context/src/wiki/local-knowledge.ts +++ b/packages/context/src/wiki/local-knowledge.ts @@ -4,6 +4,7 @@ import type { KtxEmbeddingPort, KtxFileWriteResult } from '../core/index.js'; import type { KtxLocalProject } from '../project/index.js'; import { HybridSearchCore, type SearchCandidateGenerator } from '../search/index.js'; import { buildKnowledgeSearchText } from './knowledge-search-text.js'; +import { assertFlatWikiKey, isFlatWikiKey } from './keys.js'; import { SqliteKnowledgeIndex, type SqliteKnowledgeIndexPage } from './sqlite-knowledge-index.js'; import type { HistoricSqlWikiUsageFrontmatter, WikiSearchLaneSummary, WikiSearchMatchReason } from './types.js'; @@ -67,28 +68,39 @@ function assertSafePathToken(kind: string, value: string): string { return value; } -function assertSafeKnowledgeKey(key: string): string { - if (!/^[a-zA-Z0-9][a-zA-Z0-9_/-]*$/.test(key)) { - throw new Error(`Unsafe knowledge key: ${key}`); - } - return assertSafePathToken('knowledge key', key); -} - function stringArray(value: unknown): string[] { return Array.isArray(value) ? value.filter((item): item is string => typeof item === 'string') : []; } function knowledgePath(scope: LocalKnowledgeScope, userId: string | undefined, key: string): string { - const safeKey = assertSafeKnowledgeKey(key); + const safeKey = assertFlatWikiKey(key); if (scope === 'GLOBAL') { return `knowledge/global/${safeKey}.md`; } return `knowledge/user/${assertSafePathToken('user id', userId ?? 'local')}/${safeKey}.md`; } -function keyFromKnowledgePath(path: string, scope: LocalKnowledgeScope, userId: string): string { +function isHistoricSqlPathSegment(segment: string): boolean { + return /^[a-zA-Z0-9_][a-zA-Z0-9_-]*$/.test(segment); +} + +function keyFromKnowledgePath(path: string, scope: LocalKnowledgeScope, userId: string): string | null { const prefix = scope === 'GLOBAL' ? 'knowledge/global/' : `knowledge/user/${assertSafePathToken('user id', userId)}/`; - return path.slice(prefix.length).replace(/\.md$/, ''); + const key = path.slice(prefix.length).replace(/\.md$/, ''); + if (isFlatWikiKey(key)) { + return key; + } + if ( + scope === 'GLOBAL' && + key.startsWith('historic-sql/') && + key + .slice('historic-sql/'.length) + .split('/') + .every(isHistoricSqlPathSegment) + ) { + return key; + } + return null; } function parseKnowledgePage(key: string, path: string, scope: LocalKnowledgeScope, raw: string): LocalKnowledgePage { @@ -187,6 +199,9 @@ export async function listLocalKnowledgePages( const listed = await project.fileStore.listFiles(root); for (const path of listed.files.filter((file) => file.endsWith('.md')).sort()) { const key = keyFromKnowledgePath(path, scope, userId); + if (!key) { + continue; + } const page = await readPageAtPath(project, key, path, scope); if (page) { pages.push({ key, path, scope, summary: page.summary }); diff --git a/packages/context/src/wiki/sqlite-knowledge-index.test.ts b/packages/context/src/wiki/sqlite-knowledge-index.test.ts index 9bb0e6f0..620702a1 100644 --- a/packages/context/src/wiki/sqlite-knowledge-index.test.ts +++ b/packages/context/src/wiki/sqlite-knowledge-index.test.ts @@ -82,6 +82,14 @@ describe('SqliteKnowledgeIndex', () => { ); }); + it('does not treat empty embeddings as indexed semantic vectors', () => { + const index = new SqliteKnowledgeIndex({ dbPath }); + index.sync([page({ path: 'knowledge/global/revenue.md', key: 'revenue', embedding: [] })]); + + expect(index.getExistingPages().get('knowledge/global/revenue.md')?.embedding).toBeNull(); + expect(index.searchSemanticCandidates({ queryEmbedding: [1, 0], limit: 10 })).toEqual([]); + }); + it('returns semantic lane candidates from stored page embeddings', () => { const index = new SqliteKnowledgeIndex({ dbPath }); index.sync([ diff --git a/packages/context/src/wiki/sqlite-knowledge-index.ts b/packages/context/src/wiki/sqlite-knowledge-index.ts index acadc02e..7a5ae8fc 100644 --- a/packages/context/src/wiki/sqlite-knowledge-index.ts +++ b/packages/context/src/wiki/sqlite-knowledge-index.ts @@ -75,7 +75,9 @@ function parseEmbedding(raw: string | null): number[] | null { } try { const embedding = JSON.parse(raw) as unknown; - return Array.isArray(embedding) && embedding.every((value) => typeof value === 'number') ? embedding : null; + return Array.isArray(embedding) && embedding.length > 0 && embedding.every((value) => typeof value === 'number') + ? embedding + : null; } catch { return null; } @@ -170,7 +172,7 @@ export class SqliteKnowledgeIndex { content: searchText, tags: page.tags.join(' '), searchText, - embeddingJson: page.embedding ? JSON.stringify(page.embedding) : null, + embeddingJson: page.embedding && page.embedding.length > 0 ? JSON.stringify(page.embedding) : null, }; upsertPage.run(row); deleteFts.run(row); diff --git a/packages/context/src/wiki/tools/wiki-read.tool.test.ts b/packages/context/src/wiki/tools/wiki-read.tool.test.ts new file mode 100644 index 00000000..1457702c --- /dev/null +++ b/packages/context/src/wiki/tools/wiki-read.tool.test.ts @@ -0,0 +1,79 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { ToolSession } from '../../tools/index.js'; +import { createTouchedSlSources, type ToolContext } from '../../tools/index.js'; +import { WikiReadTool } from './wiki-read.tool.js'; + +describe('WikiReadTool', () => { + const baseContext: ToolContext = { sourceId: 's', messageId: 'm', userId: 'u' }; + + it('reads from the session wiki service when a worktree-scoped ingest session is present', async () => { + const rootWikiService = { readPageForUser: vi.fn().mockResolvedValue(null) }; + const sessionWikiService = { + readPageForUser: vi.fn().mockResolvedValue({ + pageKey: 'staged-page', + scope: 'GLOBAL', + frontmatter: { summary: 'Staged', tags: ['notion'], refs: ['related'] }, + content: 'A page written earlier in the same ingest worktree.', + }), + }; + const pagesRepository = { findPageByKey: vi.fn().mockResolvedValue({ id: 'page-1' }), incrementUsageCount: vi.fn() }; + const tool = new WikiReadTool(rootWikiService as any, pagesRepository as any); + const session: ToolSession = { + connectionId: 'c', + isWorktreeScoped: true, + preHead: null, + touchedSlSources: createTouchedSlSources(), + actions: [], + semanticLayerService: {} as any, + wikiService: sessionWikiService as any, + configService: {} as any, + gitService: {} as any, + }; + + const result = await tool.call({ key: 'staged-page' }, { ...baseContext, session }); + + expect(rootWikiService.readPageForUser).not.toHaveBeenCalled(); + expect(sessionWikiService.readPageForUser).toHaveBeenCalledWith('u', 'staged-page'); + expect(result.structured).toMatchObject({ found: true, blockKey: 'staged-page', scope: 'GLOBAL' }); + expect(result.markdown).toContain('A page written earlier in the same ingest worktree.'); + }); + + it('rejects slash-delimited page keys with a flat-key suggestion', async () => { + const rootWikiService = { readPageForUser: vi.fn().mockResolvedValue(null) }; + const pagesRepository = { findPageByKey: vi.fn(), incrementUsageCount: vi.fn() }; + const tool = new WikiReadTool(rootWikiService as any, pagesRepository as any); + + const result = await tool.call({ key: 'orbit/company-overview' }, baseContext); + + expect(result.structured).toEqual({ + blockKey: 'orbit/company-overview', + content: '', + scope: '', + found: false, + }); + expect(result.markdown).toContain( + 'Invalid wiki key "orbit/company-overview". Wiki keys must be flat; use "orbit-company-overview".', + ); + expect(rootWikiService.readPageForUser).not.toHaveBeenCalled(); + }); + + it('does not append derived refs to the editable markdown body', async () => { + const rootWikiService = { + readPageForUser: vi.fn().mockResolvedValue({ + pageKey: 'orbit-how-we-work', + scope: 'GLOBAL', + frontmatter: { summary: 'How we work', tags: ['policy'], refs: ['orbit-company-overview'] }, + content: '## How We Work\n\nUse written-first operating norms.', + }), + }; + const pagesRepository = { findPageByKey: vi.fn().mockResolvedValue(null), incrementUsageCount: vi.fn() }; + const tool = new WikiReadTool(rootWikiService as any, pagesRepository as any); + + const result = await tool.call({ key: 'orbit-how-we-work' }, baseContext); + + expect(result.markdown).toBe('## How We Work\n\nUse written-first operating norms.'); + expect(result.markdown).not.toContain('See also'); + expect(result.markdown).not.toContain('[[orbit-company-overview]]'); + expect(result.structured.refs).toEqual(['orbit-company-overview']); + }); +}); diff --git a/packages/context/src/wiki/tools/wiki-read.tool.ts b/packages/context/src/wiki/tools/wiki-read.tool.ts index 73eb7e40..858e717e 100644 --- a/packages/context/src/wiki/tools/wiki-read.tool.ts +++ b/packages/context/src/wiki/tools/wiki-read.tool.ts @@ -1,6 +1,7 @@ import { z } from 'zod'; import type { KnowledgeIndexPort } from '../ports.js'; import { KnowledgeWikiService } from '../index.js'; +import { validateFlatWikiKey } from '../keys.js'; import { BaseTool, type ToolContext, type ToolOutput } from '../../tools/index.js'; const WikiReadInputSchema = z.object({ @@ -34,6 +35,7 @@ export class WikiReadTool extends BaseTool { return ( 'Load the full content of a knowledge block by its key. ' + 'Use this to retrieve detailed rules, preferences, or definitions listed in the . ' + + 'The markdown output is the exact stored page body; use it verbatim for wiki_write replacements. ' + 'Call this when the user query relates to a topic covered by an available knowledge block.' ); } @@ -43,7 +45,15 @@ export class WikiReadTool extends BaseTool { } async call(input: WikiReadInput, context: ToolContext): Promise> { - const page = await this.wikiService.readPageForUser(context.userId, input.key); + const keyValidation = validateFlatWikiKey(input.key); + if (!keyValidation.ok) { + return { + markdown: keyValidation.error, + structured: { blockKey: input.key, content: '', scope: '', found: false }, + }; + } + const wikiService = context.session?.wikiService ?? this.wikiService; + const page = await wikiService.readPageForUser(context.userId, input.key); if (!page) { return { @@ -61,14 +71,8 @@ export class WikiReadTool extends BaseTool { void this.pagesRepository.incrementUsageCount([indexEntry.id]); } - let md = `## ${page.pageKey}\n\n${page.content}`; - const refs = page.frontmatter.refs; - if (refs && refs.length > 0) { - md += `\n\nSee also: ${refs.map((r) => `[[${r}]]`).join(', ')}`; - } - return { - markdown: md, + markdown: page.content, structured: { blockKey: page.pageKey, content: page.content, diff --git a/packages/context/src/wiki/tools/wiki-remove.tool.test.ts b/packages/context/src/wiki/tools/wiki-remove.tool.test.ts index 7999dd26..2cdb87e0 100644 --- a/packages/context/src/wiki/tools/wiki-remove.tool.test.ts +++ b/packages/context/src/wiki/tools/wiki-remove.tool.test.ts @@ -22,8 +22,28 @@ describe('WikiRemoveTool', () => { expect(result.markdown).toMatch(/removed/i); }); + it('rejects slash-delimited page keys with a flat-key suggestion', async () => { + const wikiService = { + deletePage: vi.fn().mockResolvedValue(undefined), + deleteFromIndex: vi.fn().mockResolvedValue(undefined), + }; + const pagesRepository = { findPageByKey: vi.fn().mockResolvedValue({ page_key: 'old' }) }; + const knowledgeRepository = { createEvent: vi.fn().mockResolvedValue(undefined) }; + const tool = new WikiRemoveTool(wikiService as any, pagesRepository as any, knowledgeRepository as any); + + const result = await tool.call({ key: 'orbit/company-overview' } as any, baseContext); + + expect(result.structured).toEqual({ success: false, key: 'orbit/company-overview' }); + expect(result.markdown).toContain( + 'Invalid wiki key "orbit/company-overview". Wiki keys must be flat; use "orbit-company-overview".', + ); + expect(pagesRepository.findPageByKey).not.toHaveBeenCalled(); + expect(wikiService.deletePage).not.toHaveBeenCalled(); + }); + it('skips deleteFromIndex when session is worktree-scoped', async () => { const wikiService = { + readPage: vi.fn().mockResolvedValue({ pageKey: 'old', frontmatter: { summary: 'Old' }, content: 'body' }), deletePage: vi.fn().mockResolvedValue(undefined), deleteFromIndex: vi.fn().mockResolvedValue(undefined), }; @@ -47,6 +67,35 @@ describe('WikiRemoveTool', () => { expect(session.actions).toContainEqual(expect.objectContaining({ target: 'wiki', type: 'removed', key: 'old' })); }); + it('finds pages through the session wiki service even when the shared index has not seen the worktree write', async () => { + const wikiService = { + readPage: vi.fn().mockResolvedValue({ pageKey: 'staged', frontmatter: { summary: 'Staged' }, content: 'body' }), + deletePage: vi.fn().mockResolvedValue(undefined), + deleteFromIndex: vi.fn().mockResolvedValue(undefined), + }; + const pagesRepository = { findPageByKey: vi.fn().mockResolvedValue(null) }; + const knowledgeRepository = { createEvent: vi.fn().mockResolvedValue(undefined) }; + const tool = new WikiRemoveTool(wikiService as any, pagesRepository as any, knowledgeRepository as any); + const session: ToolSession = { + connectionId: 'c', + isWorktreeScoped: true, + preHead: null, + touchedSlSources: createTouchedSlSources(), + actions: [], + semanticLayerService: {} as any, + wikiService: wikiService as any, + configService: {} as any, + gitService: {} as any, + }; + + const result = await tool.call({ key: 'staged' } as any, { ...baseContext, session }); + + expect(pagesRepository.findPageByKey).not.toHaveBeenCalled(); + expect(wikiService.readPage).toHaveBeenCalledWith('GLOBAL', null, 'staged'); + expect(wikiService.deletePage).toHaveBeenCalledTimes(1); + expect(result.structured).toEqual({ success: true, key: 'staged' }); + }); + it('returns a friendly message when the page does not exist', async () => { const wikiService = { deletePage: vi.fn(), deleteFromIndex: vi.fn() }; const pagesRepository = { findPageByKey: vi.fn().mockResolvedValue(null) }; diff --git a/packages/context/src/wiki/tools/wiki-remove.tool.ts b/packages/context/src/wiki/tools/wiki-remove.tool.ts index 0e5905ec..7cb56e7d 100644 --- a/packages/context/src/wiki/tools/wiki-remove.tool.ts +++ b/packages/context/src/wiki/tools/wiki-remove.tool.ts @@ -3,13 +3,18 @@ import type { KnowledgeIndexPort } from '../ports.js'; import type { KnowledgeEventPort } from '../ports.js'; type BlockScope = 'GLOBAL' | 'USER'; import { KnowledgeWikiService } from '../index.js'; -import { BaseTool, type ToolContext, type ToolOutput } from '../../tools/index.js'; +import { validateFlatWikiKey } from '../keys.js'; +import { BaseTool, type ToolContext, type ToolOutput, validateActionRawPaths } from '../../tools/index.js'; const SYSTEM_AUTHOR = 'System User'; const SYSTEM_EMAIL = 'system@example.com'; const wikiRemoveInputSchema = z.object({ key: z.string().describe('The page key to remove'), + rawPaths: z + .array(z.string().min(1)) + .optional() + .describe('In ingest sessions, raw source file paths that directly support this removal.'), }); type WikiRemoveInput = z.infer; @@ -42,11 +47,27 @@ export class WikiRemoveTool extends BaseTool { const wikiService = context.session?.wikiService ?? this.wikiService; const writesGlobal = !!context.session; const skipIndex = context.session?.isWorktreeScoped === true; + const keyValidation = validateFlatWikiKey(input.key); + if (!keyValidation.ok) { + return { + markdown: keyValidation.error, + structured: { success: false, key: input.key }, + }; + } + const rawPathValidation = validateActionRawPaths(context.session, input.rawPaths); + if (!rawPathValidation.ok) { + return { + markdown: `Error: ${rawPathValidation.error}`, + structured: { success: false, key: input.key }, + }; + } const scope: BlockScope = writesGlobal ? 'GLOBAL' : 'USER'; const scopeId = scope === 'USER' ? context.userId : null; - const existing = await this.pagesRepository.findPageByKey(scope, scopeId, input.key); + const existing = context.session + ? await wikiService.readPage(scope, scopeId, input.key) + : await this.pagesRepository.findPageByKey(scope, scopeId, input.key); if (!existing) { return { markdown: `Page "${input.key}" not found.`, @@ -74,6 +95,7 @@ export class WikiRemoveTool extends BaseTool { type: 'removed', key: input.key, detail: `Removed page "${input.key}"`, + ...(rawPathValidation.rawPaths ? { rawPaths: rawPathValidation.rawPaths } : {}), }); } diff --git a/packages/context/src/wiki/tools/wiki-search.tool.test.ts b/packages/context/src/wiki/tools/wiki-search.tool.test.ts index a15a78df..33bd752b 100644 --- a/packages/context/src/wiki/tools/wiki-search.tool.test.ts +++ b/packages/context/src/wiki/tools/wiki-search.tool.test.ts @@ -6,8 +6,8 @@ describe('WikiSearchTool', () => { const search = vi.fn(async () => ({ results: [ { - key: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + key: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', scope: 'GLOBAL' as const, summary: 'Revenue metric definition', score: 0.02459016393442623, @@ -27,8 +27,8 @@ describe('WikiSearchTool', () => { expect(result.structured).toEqual({ results: [ { - blockKey: 'metrics/revenue', - path: 'knowledge/global/metrics/revenue.md', + blockKey: 'metrics-revenue', + path: 'knowledge/global/metrics-revenue.md', summary: 'Revenue metric definition', score: 0.02459016393442623, matchReasons: ['lexical', 'token'], @@ -36,6 +36,6 @@ describe('WikiSearchTool', () => { ], totalFound: 1, }); - expect(result.markdown).toContain('**metrics/revenue**'); + expect(result.markdown).toContain('**metrics-revenue**'); }); }); diff --git a/packages/context/src/wiki/tools/wiki-write.tool.test.ts b/packages/context/src/wiki/tools/wiki-write.tool.test.ts index 9e947d84..16d2abad 100644 --- a/packages/context/src/wiki/tools/wiki-write.tool.test.ts +++ b/packages/context/src/wiki/tools/wiki-write.tool.test.ts @@ -6,6 +6,7 @@ import { WikiWriteTool } from './wiki-write.tool.js'; function makeTool(overrides: any = {}) { const wikiService = { readPage: vi.fn().mockResolvedValue(null), + listPageKeys: vi.fn().mockResolvedValue([]), writePage: vi.fn().mockResolvedValue(undefined), syncSinglePage: vi.fn().mockResolvedValue(undefined), ...overrides.wikiService, @@ -37,6 +38,21 @@ describe('WikiWriteTool', () => { expect(result.markdown).toMatch(/created/i); }); + it('rejects slash-delimited page keys with a flat-key suggestion', async () => { + const { tool, wikiService } = makeTool(); + const result = await tool.call( + { key: 'orbit/company-overview', summary: 'Company overview', content: '# Orbit' } as any, + baseContext, + ); + + expect(result.structured).toEqual({ success: false, key: 'orbit/company-overview' }); + expect(result.markdown).toContain( + 'Invalid wiki key "orbit/company-overview". Wiki keys must be flat; use "orbit-company-overview".', + ); + expect(wikiService.readPage).not.toHaveBeenCalled(); + expect(wikiService.writePage).not.toHaveBeenCalled(); + }); + it('normalizes accidentally escaped markdown newlines before writing', async () => { const { tool, wikiService } = makeTool(); @@ -100,12 +116,56 @@ describe('WikiWriteTool', () => { expect(result.markdown).toMatch(/content.*or.*replacements/i); }); + it('updates frontmatter only on an existing page while preserving content', async () => { + const { tool, wikiService } = makeTool({ + wikiService: { + readPage: vi.fn().mockResolvedValue({ + pageKey: 'orbit-customers', + frontmatter: { + summary: 'Customer source details', + usage_mode: 'auto', + sort_order: 0, + tags: ['notion'], + refs: ['notion:old'], + sl_refs: ['postgres-warehouse/orbit_analytics.customer'], + }, + content: '# Orbit Customers\n\nSource: Notion - Orbit Customers Source.', + }), + }, + }); + + const result = await tool.call( + { + key: 'orbit-customers', + summary: 'Customer source details mapped to the warehouse customer view', + sl_refs: ['postgres-warehouse/orbit_analytics.customer', 'dbt-main/customer'], + } as any, + baseContext, + ); + + expect(result.structured).toMatchObject({ success: true, key: 'orbit-customers', action: 'updated' }); + expect(wikiService.writePage).toHaveBeenCalledWith( + 'USER', + 'u', + 'orbit-customers', + expect.objectContaining({ + summary: 'Customer source details mapped to the warehouse customer view', + tags: ['notion'], + refs: ['notion:old'], + sl_refs: ['postgres-warehouse/orbit_analytics.customer', 'dbt-main/customer'], + }), + '# Orbit Customers\n\nSource: Notion - Orbit Customers Source.', + expect.any(String), + expect.any(String), + ); + }); + it('writes historic-SQL frontmatter fields', async () => { const { tool, wikiService } = makeTool(); await tool.call( { - key: 'queries/monthly-paid-orders', + key: 'monthly-paid-orders', summary: 'Monthly paid orders', tags: ['historic-sql', 'query-pattern'], sl_refs: ['analytics.orders'], @@ -180,7 +240,7 @@ describe('WikiWriteTool', () => { const { tool, wikiService } = makeTool({ wikiService: { readPage: vi.fn().mockResolvedValue({ - pageKey: 'queries/monthly-paid-orders', + pageKey: 'monthly-paid-orders', frontmatter: existingFrontmatter, content: 'old body', }), @@ -189,7 +249,7 @@ describe('WikiWriteTool', () => { await tool.call( { - key: 'queries/monthly-paid-orders', + key: 'monthly-paid-orders', summary: 'Monthly paid orders updated', content: '## Monthly paid order count updated', } as any, @@ -201,4 +261,47 @@ describe('WikiWriteTool', () => { summary: 'Monthly paid orders updated', }); }); + + it('rejects frontmatter refs that target missing wiki pages', async () => { + const { tool, wikiService } = makeTool({ + wikiService: { + listPageKeys: vi.fn().mockResolvedValue(['orbit-company-overview']), + }, + }); + + const result = await tool.call( + { + key: 'orbit-how-we-work', + summary: 'Operating norms', + content: '## How We Work', + refs: ['orbit-company-overview', 'orbit-team-lanes-detail'], + } as any, + baseContext, + ); + + expect(result.structured.success).toBe(false); + expect(result.markdown).toMatch(/orbit-team-lanes-detail/); + expect(wikiService.writePage).not.toHaveBeenCalled(); + }); + + it('rejects inline wiki links that target missing wiki pages', async () => { + const { tool, wikiService } = makeTool({ + wikiService: { + listPageKeys: vi.fn().mockResolvedValue(['orbit-company-overview']), + }, + }); + + const result = await tool.call( + { + key: 'orbit-how-we-work', + summary: 'Operating norms', + content: 'See [[orbit-company-overview]] and [[orbit-team-lanes-detail]].', + } as any, + baseContext, + ); + + expect(result.structured.success).toBe(false); + expect(result.markdown).toMatch(/orbit-team-lanes-detail/); + expect(wikiService.writePage).not.toHaveBeenCalled(); + }); }); diff --git a/packages/context/src/wiki/tools/wiki-write.tool.ts b/packages/context/src/wiki/tools/wiki-write.tool.ts index a2930fd8..edd34f8f 100644 --- a/packages/context/src/wiki/tools/wiki-write.tool.ts +++ b/packages/context/src/wiki/tools/wiki-write.tool.ts @@ -3,8 +3,9 @@ import type { KnowledgeIndexPort } from '../ports.js'; import type { KnowledgeEventPort } from '../ports.js'; type BlockScope = 'GLOBAL' | 'USER'; import { KnowledgeWikiService, type WikiFrontmatter } from '../index.js'; +import { validateFlatWikiKey } from '../keys.js'; import { applySqlEdits } from '../../tools/sql-edit-replacer.js'; -import { BaseTool, type ToolContext, type ToolOutput } from '../../tools/index.js'; +import { BaseTool, type ToolContext, type ToolOutput, validateActionRawPaths } from '../../tools/index.js'; const MAX_USER_BLOCKS = 100; const SYSTEM_AUTHOR = 'System User'; @@ -37,6 +38,10 @@ const wikiWriteInputSchema = z.object({ representative_sql: z.string().optional(), usage: historicSqlUsageFrontmatterSchema.optional(), fingerprints: z.array(z.string()).optional(), + rawPaths: z + .array(z.string().min(1)) + .optional() + .describe('In ingest sessions, raw source file paths that directly support this wiki action.'), }); type WikiWriteInput = z.infer; @@ -45,6 +50,7 @@ interface WikiWriteStructured { success: boolean; key: string; action?: 'created' | 'updated'; + content?: string; } function looksLikeEscapedMarkdown(content: string): boolean { @@ -63,6 +69,71 @@ function normalizeAccidentalEscapedMarkdownNewlines(content: string): string { return content.replace(/\\r\\n/g, '\n').replace(/\\n/g, '\n').replace(/\\r/g, '\n'); } +function isWikiPageKeyRef(ref: string): boolean { + return /^[a-z0-9][a-z0-9_-]*(?:-[a-z0-9_]+)*$/.test(ref); +} + +function extractInlineWikiRefs(content: string): string[] { + const refs = new Set(); + const re = /\[\[([^\]\n]+)\]\]/g; + for (const match of content.matchAll(re)) { + const target = match[1]?.split('|', 1)[0]?.trim(); + if (target && isWikiPageKeyRef(target)) { + refs.add(target); + } + } + return [...refs].sort(); +} + +async function visibleWikiPageKeys( + wikiService: KnowledgeWikiService, + scope: BlockScope, + scopeId: string | null, +): Promise> { + const keys = new Set(); + if (scope === 'USER') { + for (const key of await wikiService.listPageKeys('GLOBAL', null)) { + keys.add(key); + } + for (const key of await wikiService.listPageKeys('USER', scopeId)) { + keys.add(key); + } + return keys; + } + + for (const key of await wikiService.listPageKeys('GLOBAL', null)) { + keys.add(key); + } + return keys; +} + +async function findMissingWikiRefs(input: { + wikiService: KnowledgeWikiService; + scope: BlockScope; + scopeId: string | null; + pageKey: string; + refs?: string[]; + content: string; +}): Promise { + const candidates = new Set(); + for (const ref of input.refs ?? []) { + if (isWikiPageKeyRef(ref)) { + candidates.add(ref); + } + } + for (const ref of extractInlineWikiRefs(input.content)) { + candidates.add(ref); + } + + if (candidates.size === 0) { + return []; + } + + const available = await visibleWikiPageKeys(input.wikiService, input.scope, input.scopeId); + available.add(input.pageKey); + return [...candidates].filter((ref) => !available.has(ref)).sort(); +} + export class WikiWriteTool extends BaseTool { readonly name = 'wiki_write'; @@ -77,6 +148,7 @@ export class WikiWriteTool extends BaseTool { get description(): string { return ` Create or update a knowledge page. Provide content for create/rewrite, or replacements for targeted edits. +For existing pages, you may provide only frontmatter fields such as summary, tags, refs, or sl_refs to update metadata while preserving content. tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to clear, [values] to set. `; } @@ -89,10 +161,17 @@ tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to const wikiService = context.session?.wikiService ?? this.wikiService; const writesGlobal = !!context.session; const skipIndex = context.session?.isWorktreeScoped === true; - - if (!input.content && (!input.replacements || input.replacements.length === 0)) { + const keyValidation = validateFlatWikiKey(input.key); + if (!keyValidation.ok) { return { - markdown: 'Error: provide either content (for create/rewrite) or replacements (for edits).', + markdown: keyValidation.error, + structured: { success: false, key: input.key }, + }; + } + const rawPathValidation = validateActionRawPaths(context.session, input.rawPaths); + if (!rawPathValidation.ok) { + return { + markdown: `Error: ${rawPathValidation.error}`, structured: { success: false, key: input.key }, }; } @@ -101,6 +180,16 @@ tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to const scopeId = scope === 'USER' ? context.userId : null; const existing = await wikiService.readPage(scope, scopeId, input.key); + const content = input.content; + const hasContent = typeof content === 'string' && content.length > 0; + const hasReplacements = !!input.replacements && input.replacements.length > 0; + if (!existing && !hasContent && !hasReplacements) { + return { + markdown: 'Error: provide either content (for create/rewrite) or replacements (for edits).', + structured: { success: false, key: input.key }, + }; + } + if (!existing && !input.content) { return { markdown: `Page "${input.key}" does not exist. Provide content to create it.`, @@ -140,9 +229,9 @@ tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to fingerprints: input.fingerprints === undefined ? existingFm?.fingerprints : input.fingerprints, }; - if (input.content) { - finalContent = normalizeAccidentalEscapedMarkdownNewlines(input.content); - } else { + if (hasContent) { + finalContent = normalizeAccidentalEscapedMarkdownNewlines(content); + } else if (hasReplacements) { const editResult = applySqlEdits(existing?.content ?? '', input.replacements ?? []); if (!editResult.success) { return { @@ -151,6 +240,25 @@ tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to }; } finalContent = editResult.sql; + } else { + finalContent = existing?.content ?? ''; + } + + const missingRefs = await findMissingWikiRefs({ + wikiService, + scope, + scopeId, + pageKey: input.key, + refs: finalFm.refs, + content: finalContent, + }); + if (missingRefs.length > 0) { + return { + markdown: + `Error: wiki references target missing page(s): ${missingRefs.join(', ')}. ` + + 'Create those pages first, retarget the links, or remove the refs.', + structured: { success: false, key: input.key }, + }; } await wikiService.writePage(scope, scopeId, input.key, finalFm, finalContent, SYSTEM_AUTHOR, SYSTEM_EMAIL); @@ -172,12 +280,26 @@ tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to const action = existing ? 'updated' : 'created'; if (context.session) { - context.session.actions.push({ target: 'wiki', type: action, key: input.key, detail: input.summary }); + context.session.actions.push({ + target: 'wiki', + type: action, + key: input.key, + detail: input.summary, + ...(rawPathValidation.rawPaths ? { rawPaths: rawPathValidation.rawPaths } : {}), + }); } + // When the LLM used `replacements` (edit mode), it doesn't have the + // post-edit content cached. Returning the result here prevents the + // common bug where a follow-up edit uses an oldText string that no + // longer matches because a prior edit already changed the page. + const markdown = hasReplacements + ? `Page "${input.key}" ${action}.\n\nCurrent content (use for subsequent edits):\n\n${finalContent}` + : `Page "${input.key}" ${action}.`; + return { - markdown: `Page "${input.key}" ${action}.`, - structured: { success: true, key: input.key, action }, + markdown, + structured: { success: true, key: input.key, action, content: finalContent }, }; } } diff --git a/python/ktx-sl/semantic_layer/engine.py b/python/ktx-sl/semantic_layer/engine.py index 79547c20..01e03805 100644 --- a/python/ktx-sl/semantic_layer/engine.py +++ b/python/ktx-sl/semantic_layer/engine.py @@ -12,6 +12,7 @@ from semantic_layer.models import ( ) from semantic_layer.planner import QueryPlanner from semantic_layer.sql_table_extractor import ( + extract_projected_columns, extract_table_refs, ref_matches_source_table, ) @@ -62,6 +63,7 @@ class SemanticEngine: report = ValidationReport() self._check_orphan_join_targets(report) self._check_invalid_grain(report) + self._check_join_columns(report) self._check_sql_join_coverage(report, recently_touched=recently_touched) self._check_disconnected_components(report, recently_touched=recently_touched) return report @@ -82,15 +84,144 @@ class SemanticEngine: report.errors.extend(self._collect_orphan_join_target_errors()) def _check_invalid_grain(self, report: ValidationReport) -> None: + dialect = getattr(self.generator, "dialect", "postgres") for source in self.sources.values(): + qualified_grain: set[str] = set() + for grain_col in source.grain: + if "." in grain_col: + qualified_grain.add(grain_col) + report.errors.append( + f"Source '{source.name}' grain entry '{grain_col}' is a " + f"qualified name. Grain must use unqualified output column " + f"names (e.g. 'account_id', not 'activity.account_id')." + ) + + for col in source.columns: + if "." in col.name: + report.errors.append( + f"Source '{source.name}' column name '{col.name}' contains " + f"'.'. Column names must be unqualified." + ) + column_names = {c.name for c in source.columns} for grain_col in source.grain: + if grain_col in qualified_grain: + continue if grain_col not in column_names: report.errors.append( f"Source '{source.name}' has grain column '{grain_col}' " f"that is not in its columns list" ) + if source.is_sql_source and source.sql: + projected = extract_projected_columns(source.sql, dialect=dialect) + if projected is not None: + for grain_col in source.grain: + if grain_col in qualified_grain: + continue + if grain_col not in projected: + report.errors.append( + f"Source '{source.name}' grain column '{grain_col}' " + f"is not in the SQL SELECT projection. Add it to the " + f"SELECT list (or remove it from grain)." + ) + + def _check_join_columns(self, report: ValidationReport) -> None: + for source in self.sources.values(): + source_columns = {c.name for c in source.columns} + for join in source.joins: + target = self.sources.get(join.to) + if target is None: + continue + target_columns = {c.name for c in target.columns} + try: + local_raw, target_raw = self.graph._parse_on(join.on, join.to) + except ValueError as exc: + report.errors.append( + f"Source '{source.name}' has invalid join to '{join.to}': {exc}" + ) + continue + + local_cols = [ + col.strip() for col in local_raw.split(",") if col.strip() + ] + target_cols = [ + col.strip() for col in target_raw.split(",") if col.strip() + ] + for local_col in local_cols: + if local_col not in source_columns: + report.errors.append( + f"Source '{source.name}' joins to '{join.to}' on " + f"local column '{local_col}', but '{local_col}' is not " + f"in '{source.name}' columns list" + ) + for target_col in target_cols: + if target_col not in target_columns: + report.errors.append( + f"Source '{source.name}' joins to '{join.to}' on " + f"target column '{target_col}', but '{target_col}' is not " + f"in '{join.to}' columns list" + ) + + if join.relationship not in {"many_to_one", "one_to_one"}: + continue + for local_col, target_col in zip(local_cols, target_cols, strict=False): + if ( + local_col in source_columns + and target_col in target_columns + and target_col in target.grain + and self._looks_like_display_value_to_identifier( + local_col, target_col + ) + ): + report.errors.append( + f"Source '{source.name}' joins '{local_col}' to " + f"'{join.to}.{target_col}', but '{local_col}' looks like " + "a display value and the target column is an identifier " + "grain. Project the matching key column or omit this join." + ) + + @staticmethod + def _looks_like_display_value_to_identifier( + local_col: str, target_col: str + ) -> bool: + if target_col != "id" and not target_col.endswith("_id"): + return False + display_names = {"name", "email", "label", "title", "description"} + display_suffixes = ( + "_name", + "_email", + "_label", + "_title", + "_description", + ) + return local_col in display_names or local_col.endswith(display_suffixes) + + @staticmethod + def _source_exposes_join_key( + source: SourceDefinition, target: SourceDefinition + ) -> bool: + source_columns = {c.name.lower() for c in source.columns} + target_name = target.name.lower() + target_name_singular = ( + target_name[:-1] if target_name.endswith("s") else target_name + ) + for grain_col in target.grain: + grain = grain_col.lower() + if grain in source_columns: + return True + if grain == "id": + candidates = { + f"{target_name}_id", + f"{target_name_singular}_id", + } + if source_columns.intersection(candidates): + return True + continue + if any(col.endswith(f"_{grain}") for col in source_columns): + return True + return False + def _check_sql_join_coverage( self, report: ValidationReport, @@ -135,6 +266,8 @@ class SemanticEngine: continue if hit_name.lower() in declared: continue + if not self._source_exposes_join_key(source, self.sources[hit_name]): + continue if hit_name not in missing: missing.append(hit_name) @@ -148,11 +281,12 @@ class SemanticEngine: ) msg = ( f"Source '{source.name}' SQL joins manifest table(s) [{ref_list}] " - f"that are not declared in joins[]. Add a join entry for each, " + f"that have projected key columns but are not declared in joins[]. " + f"Add a join entry for each, " f"e.g. {{to: {example}, on: '{source.name}. = " - f"{example}.{grain_col}', relationship: many_to_one}}. If a " - f"reference is intentionally absent, document it with a " - f"`unmapped-table-*` wiki note and remove the SQL reference." + f"{example}.{grain_col}', relationship: many_to_one}}. If the " + "SQL intentionally keeps a referenced table internal, omit " + "that table's key column from the SQL source output." ) report.errors.append(msg) diff --git a/python/ktx-sl/semantic_layer/sql_table_extractor.py b/python/ktx-sl/semantic_layer/sql_table_extractor.py index 008c53ad..9b9efa6e 100644 --- a/python/ktx-sl/semantic_layer/sql_table_extractor.py +++ b/python/ktx-sl/semantic_layer/sql_table_extractor.py @@ -70,3 +70,30 @@ def ref_matches_source_table(ref: tuple[str, ...], source_table: str) -> bool: if len(ref) > len(src): return False return src[-len(ref) :] == ref + + +def extract_projected_columns(sql: str, dialect: str = "postgres") -> set[str] | None: + """Return the set of output column names projected by `sql`. + + Returns None if the projection cannot be statically determined — when + SELECT * (or qualified `t.*`) is present, or when parsing fails. Callers + should treat None as "unknown projection" and skip projection-dependent + checks rather than reporting a false-positive error. + """ + try: + tree = sqlglot.parse_one(sql, read=dialect) + except Exception as e: + logger.debug("extract_projected_columns: parse failed (%s); skipping", e) + return None + + if not isinstance(tree, exp.Select): + return None + + for projection in tree.expressions: + # Bare `*` or `t.*` — projection list is opaque. + if isinstance(projection, exp.Star): + return None + if isinstance(projection, exp.Column) and isinstance(projection.this, exp.Star): + return None + + return {name for name in tree.named_selects if name} diff --git a/python/ktx-sl/tests/test_validator.py b/python/ktx-sl/tests/test_validator.py index 376b5381..065c6331 100644 --- a/python/ktx-sl/tests/test_validator.py +++ b/python/ktx-sl/tests/test_validator.py @@ -119,6 +119,275 @@ class TestInvalidGrain: assert not report.valid assert any("bad" in e and "nonexistent_col" in e for e in report.errors) + def test_qualified_grain_name_is_rejected(self): + bad = _src( + "activity", + columns=["account_id"], + grain=["activity.account_id"], + ) + engine = SemanticEngine.from_sources({"activity": bad}) + + report = engine.validate() + + assert not report.valid + assert any( + "activity" in e and "activity.account_id" in e and "qualified" in e + for e in report.errors + ) + + def test_qualified_column_name_is_rejected(self): + bad = SourceDefinition( + name="activity", + table="public.activity", + grain=["account_id"], + columns=[ + SourceColumn(name="account_id", type="number"), + SourceColumn(name="activity.user_id", type="number"), + ], + ) + engine = SemanticEngine.from_sources({"activity": bad}) + + report = engine.validate() + + assert not report.valid + assert any( + "activity" in e and "activity.user_id" in e and "unqualified" in e + for e in report.errors + ) + + def test_sql_source_grain_missing_from_projection(self): + bad = SourceDefinition( + name="large_contract_requesters", + sql=( + "select account.account_name, requester.email as requester_email " + "from orbit_raw.actions activity " + "join orbit_raw.accounts account " + " on account.account_id = activity.account_id " + "join orbit_raw.users requester " + " on requester.user_id = activity.user_id" + ), + grain=["account_id", "user_id"], + columns=[ + SourceColumn(name="account_id", type="number"), + SourceColumn(name="user_id", type="number"), + SourceColumn(name="account_name", type="string"), + SourceColumn(name="requester_email", type="string"), + ], + ) + engine = SemanticEngine.from_sources({"large_contract_requesters": bad}) + + report = engine.validate() + + assert not report.valid + assert any( + "large_contract_requesters" in e + and "account_id" in e + and "SELECT projection" in e + for e in report.errors + ) + + def test_sql_source_grain_in_projection_passes(self): + good = SourceDefinition( + name="contract_requesters", + sql=( + "select activity.account_id, activity.user_id, " + "account.account_name, requester.email as requester_email " + "from orbit_raw.actions activity " + "join orbit_raw.accounts account " + " on account.account_id = activity.account_id " + "join orbit_raw.users requester " + " on requester.user_id = activity.user_id" + ), + grain=["account_id", "user_id"], + columns=[ + SourceColumn(name="account_id", type="number"), + SourceColumn(name="user_id", type="number"), + SourceColumn(name="account_name", type="string"), + SourceColumn(name="requester_email", type="string"), + ], + ) + engine = SemanticEngine.from_sources({"contract_requesters": good}) + + report = engine.validate() + + # No grain-related errors. (Other validators may emit unrelated + # warnings — we just assert the grain check is clean.) + assert not any("grain" in e or "SELECT projection" in e for e in report.errors) + + def test_sql_source_with_select_star_skips_projection_check(self): + # SELECT * means we can't statically know projected columns; + # the projection check must skip rather than false-fail. + src = SourceDefinition( + name="opaque", + sql="select * from public.events", + grain=["event_id"], + columns=[SourceColumn(name="event_id", type="number")], + ) + engine = SemanticEngine.from_sources({"opaque": src}) + + report = engine.validate() + + assert not any("SELECT projection" in e for e in report.errors) + + +class TestJoinValidation: + def test_join_local_column_must_exist(self): + orders = _src( + "orders", + columns=["id"], + joins=[ + JoinDeclaration( + to="customers", + on="customer_id = customers.id", + relationship="many_to_one", + ) + ], + ) + customers = _src("customers") + engine = SemanticEngine.from_sources({"orders": orders, "customers": customers}) + + report = engine.validate() + + assert not report.valid + assert any( + "orders" in e and "customer_id" in e and "columns list" in e + for e in report.errors + ) + + def test_many_to_one_join_rejects_display_name_to_id_grain(self): + requesters = _src( + "large_contract_requesters", + columns=["account_name", "requester_email"], + grain=["requester_email"], + joins=[ + JoinDeclaration( + to="mart_account_segments", + on="account_name = mart_account_segments.account_id", + relationship="many_to_one", + ) + ], + ) + accounts = _src( + "mart_account_segments", + columns=["account_id", "account_name"], + grain=["account_id"], + ) + engine = SemanticEngine.from_sources( + { + "large_contract_requesters": requesters, + "mart_account_segments": accounts, + } + ) + + report = engine.validate() + + assert not report.valid + assert any( + "large_contract_requesters" in e + and "account_name" in e + and "mart_account_segments.account_id" in e + for e in report.errors + ) + + def test_sql_join_coverage_does_not_require_join_without_projected_key(self): + requesters = SourceDefinition( + name="large_contract_requesters", + sql=""" + select accounts.account_name, users.email as requester_email + from orbit_raw.requests requests + join public.mart_account_segments accounts + on requests.account_id = accounts.account_id + join orbit_raw.users users + on requests.user_id = users.user_id + """, + grain=["requester_email"], + columns=[ + SourceColumn(name="account_name", type="string"), + SourceColumn(name="requester_email", type="string"), + ], + joins=[], + ) + accounts = _src( + "mart_account_segments", + columns=["account_id", "account_name"], + grain=["account_id"], + ) + engine = SemanticEngine.from_sources( + { + "large_contract_requesters": requesters, + "mart_account_segments": accounts, + } + ) + + report = engine.validate(recently_touched={"large_contract_requesters"}) + + assert report.errors == [] + + def test_sql_join_coverage_does_not_treat_unrelated_id_suffix_as_id_key(self): + requesters = SourceDefinition( + name="large_contract_requesters", + sql=""" + select accounts.account_name, requests.user_id + from orbit_raw.requests requests + join public.accounts accounts + on requests.account_id = accounts.id + """, + grain=["user_id"], + columns=[ + SourceColumn(name="account_name", type="string"), + SourceColumn(name="user_id", type="string"), + ], + joins=[], + ) + accounts = _src("accounts", columns=["id", "account_name"], grain=["id"]) + engine = SemanticEngine.from_sources( + { + "large_contract_requesters": requesters, + "accounts": accounts, + } + ) + + report = engine.validate(recently_touched={"large_contract_requesters"}) + + assert report.errors == [] + + def test_sql_join_coverage_requires_join_when_projected_key_exists(self): + requesters = SourceDefinition( + name="large_contract_requesters", + sql=""" + select accounts.account_id, users.email as requester_email + from orbit_raw.requests requests + join public.mart_account_segments accounts + on requests.account_id = accounts.account_id + join orbit_raw.users users + on requests.user_id = users.user_id + """, + grain=["requester_email"], + columns=[ + SourceColumn(name="account_id", type="string"), + SourceColumn(name="requester_email", type="string"), + ], + joins=[], + ) + accounts = _src( + "mart_account_segments", + columns=["account_id", "account_name"], + grain=["account_id"], + ) + engine = SemanticEngine.from_sources( + { + "large_contract_requesters": requesters, + "mart_account_segments": accounts, + } + ) + + report = engine.validate(recently_touched={"large_contract_requesters"}) + + assert not report.valid + assert any( + "mart_account_segments" in e and "joins[]" in e for e in report.errors + ) + class TestDisconnectedComponents: def test_two_components_produce_warning_not_error(self): diff --git a/scripts/package-artifacts.mjs b/scripts/package-artifacts.mjs index c419f490..cc901658 100644 --- a/scripts/package-artifacts.mjs +++ b/scripts/package-artifacts.mjs @@ -1074,7 +1074,7 @@ try { requireStdout('ktx setup demo seeded', seeded, /Mode: seeded/); requireStdout('ktx setup demo seeded', seeded, /Source: packaged demo project/); requireStdout('ktx setup demo seeded', seeded, /LLM calls: none/); - requireStdout('ktx setup demo seeded', seeded, /ktx serve --mcp stdio/); + requireStdout('ktx setup demo seeded', seeded, /ktx agent context --json/); assert.doesNotMatch(seeded.stdout, new RegExp(['--mode', 'deterministic'].join(' '))); assert.doesNotMatch(seeded.stdout, /KTX memory flow/); requireProjectStderr('ktx setup demo seeded', seeded, projectDir); diff --git a/scripts/package-artifacts.test.mjs b/scripts/package-artifacts.test.mjs index 7e98020b..00830e0f 100644 --- a/scripts/package-artifacts.test.mjs +++ b/scripts/package-artifacts.test.mjs @@ -517,7 +517,7 @@ describe('verification snippets', () => { assert.match(source, /Mode: seeded/); assert.match(source, /Source: packaged demo project/); assert.match(source, /LLM calls: none/); - assert.match(source, /ktx serve --mcp stdio/); + assert.match(source, /ktx agent context --json/); assert.doesNotMatch(source, new RegExp(["'demo'", "'--mode'", "'deterministic'"].join(', '))); assert.match(source, /'dev', 'doctor', 'setup', '--no-input'/); assert.match(source, /'--plain'/); diff --git a/scripts/precommit-check.mjs b/scripts/precommit-check.mjs deleted file mode 100644 index 299db534..00000000 --- a/scripts/precommit-check.mjs +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/env node -import { spawnSync } from 'node:child_process'; -import { existsSync, readFileSync } from 'node:fs'; -import { dirname, join, resolve } from 'node:path'; -import { fileURLToPath } from 'node:url'; - -const scriptPath = fileURLToPath(import.meta.url); -const ktxRoot = dirname(dirname(scriptPath)); - -const packageNameByDir = new Map( - [ - 'cli', - 'connector-bigquery', - 'connector-clickhouse', - 'connector-mysql', - 'connector-postgres', - 'connector-snowflake', - 'connector-sqlite', - 'connector-sqlserver', - 'context', - 'llm', - ].map((packageDir) => { - const manifestPath = join(ktxRoot, 'packages', packageDir, 'package.json'); - const manifest = JSON.parse(readFileSync(manifestPath, 'utf8')); - return [packageDir, manifest.name]; - }), -); - -const packageCodePattern = /\.(?:ts|tsx|js|jsx|json)$/; -const scriptPattern = /\.(?:mjs|js|json)$/; -const pythonPackageTests = new Map([ - ['ktx-sl', 'python/ktx-sl/tests'], - ['ktx-daemon', 'python/ktx-daemon/tests'], -]); - -function normalizeFilePath(filePath) { - const normalized = filePath.replaceAll('\\', '/').replace(/^\.\//, ''); - return normalized.startsWith('ktx/') ? normalized.slice('ktx/'.length) : normalized; -} - -function stablePush(commands, key, cmd, args) { - if (commands.some((command) => command.key === key)) { - return; - } - - commands.push({ key, cmd, args }); -} - -function maybeScriptTest(scriptFile) { - if (scriptFile.endsWith('.test.mjs')) { - return scriptFile; - } - - if (!scriptFile.endsWith('.mjs')) { - return null; - } - - const testFile = scriptFile.replace(/\.mjs$/, '.test.mjs'); - return existsSync(join(ktxRoot, testFile)) ? testFile : null; -} - -export function planChecks(files) { - const commands = []; - const packageNames = new Set(); - const pythonPackages = new Set(); - let runBoundaryCheck = false; - let runAllTypeChecks = false; - let runAllPythonTests = false; - - for (const rawFile of files) { - const ktxFile = normalizeFilePath(rawFile); - - if (ktxFile.startsWith('packages/')) { - const [, packageDir, ...rest] = ktxFile.split('/'); - const packageName = packageNameByDir.get(packageDir); - const packageFile = rest.join('/'); - - if (packageName && packageCodePattern.test(packageFile)) { - packageNames.add(packageName); - runBoundaryCheck = true; - } - - continue; - } - - if (ktxFile.startsWith('scripts/') && scriptPattern.test(ktxFile)) { - const testFile = maybeScriptTest(ktxFile); - - if (testFile) { - stablePush(commands, `script-test:${testFile}`, 'node', ['--test', testFile]); - } - - continue; - } - - if (ktxFile.startsWith('python/')) { - const [, packageDir] = ktxFile.split('/'); - - if (pythonPackageTests.has(packageDir)) { - pythonPackages.add(packageDir); - } - - continue; - } - - if ( - ['package.json', 'pnpm-lock.yaml', 'pnpm-workspace.yaml', 'release-policy.json', 'tsconfig.base.json'].includes( - ktxFile, - ) - ) { - runBoundaryCheck = true; - runAllTypeChecks = true; - continue; - } - - if (['pyproject.toml', 'uv.lock', 'uv.toml'].includes(ktxFile)) { - runAllPythonTests = true; - } - } - - if (runBoundaryCheck) { - stablePush(commands, 'boundary-check', 'node', ['scripts/check-boundaries.mjs']); - } - - if (runAllTypeChecks) { - stablePush(commands, 'type-check:all', 'pnpm', ['--filter', './packages/*', 'run', 'type-check']); - } else { - for (const packageName of [...packageNames].sort()) { - stablePush(commands, `type-check:${packageName}`, 'pnpm', ['--filter', packageName, 'run', 'type-check']); - stablePush(commands, `build:${packageName}`, 'pnpm', ['--filter', `${packageName}...`, 'run', 'build']); - stablePush(commands, `test:${packageName}`, 'pnpm', ['--filter', packageName, 'run', 'test']); - } - } - - if (runAllPythonTests) { - stablePush(commands, 'pytest:all', 'uv', ['run', 'pytest']); - } else { - for (const packageDir of [...pythonPackages].sort()) { - stablePush(commands, `pytest:${packageDir}`, 'uv', [ - 'run', - '--package', - packageDir, - 'pytest', - pythonPackageTests.get(packageDir), - ]); - } - } - - return commands; -} - -function printCommand(command) { - console.log(`\n$ ${command.cmd} ${command.args.join(' ')}`); -} - -export function runChecks(files) { - const commands = planChecks(files); - - if (commands.length === 0) { - console.log('No KTX package checks needed for these files.'); - return 0; - } - - for (const command of commands) { - printCommand(command); - - const result = spawnSync(command.cmd, command.args, { - cwd: ktxRoot, - stdio: 'inherit', - env: process.env, - }); - - if (result.error) { - console.error(result.error.message); - return 1; - } - - if (result.status !== 0) { - return result.status ?? 1; - } - } - - return 0; -} - -if (process.argv[1] && resolve(process.argv[1]) === scriptPath) { - process.exitCode = runChecks(process.argv.slice(2)); -} diff --git a/scripts/precommit-check.test.mjs b/scripts/precommit-check.test.mjs deleted file mode 100644 index 40bd1716..00000000 --- a/scripts/precommit-check.test.mjs +++ /dev/null @@ -1,42 +0,0 @@ -import assert from 'node:assert/strict'; -import { describe, it } from 'node:test'; - -import { planChecks } from './precommit-check.mjs'; - -function commandKeys(files) { - return planChecks(files).map((command) => command.key); -} - -describe('precommit-check', () => { - it('skips files outside ktx', () => { - assert.deepEqual(commandKeys(['outside-workspace/src/app.ts']), []); - }); - - it('runs only the touched package checks for standalone package paths', () => { - assert.deepEqual(commandKeys(['packages/cli/src/index.ts']), [ - 'boundary-check', - 'type-check:@ktx/cli', - 'build:@ktx/cli', - 'test:@ktx/cli', - ]); - }); - - it('accepts legacy subtree-prefixed package paths', () => { - assert.deepEqual(commandKeys(['ktx/packages/cli/src/index.ts']), [ - 'boundary-check', - 'type-check:@ktx/cli', - 'build:@ktx/cli', - 'test:@ktx/cli', - ]); - }); - - it('runs the matching script test when a script changes', () => { - assert.deepEqual(commandKeys(['scripts/check-boundaries.mjs']), [ - 'script-test:scripts/check-boundaries.test.mjs', - ]); - }); - - it('runs the touched python package tests', () => { - assert.deepEqual(commandKeys(['python/ktx-sl/semantic_layer/parser.py']), ['pytest:ktx-sl']); - }); -});