diff --git a/docs-site/content/docs/cli-reference/ktx-ingest.mdx b/docs-site/content/docs/cli-reference/ktx-ingest.mdx index 7c87f14f..6bff774f 100644 --- a/docs-site/content/docs/cli-reference/ktx-ingest.mdx +++ b/docs-site/content/docs/cli-reference/ktx-ingest.mdx @@ -180,6 +180,21 @@ notion skipped skipped done done Use `--json` when a script or agent needs the selected plan and per-target results. +## Final validation pruning + +At the end of a context-source ingest, **ktx** validates the composed semantic +layer and wiki before saving it. If the final validation finds dangling +references, **ktx** removes those references instead of failing accepted work. This +can remove joins that point at missing semantic sources, as well as dangling wiki `refs`, wiki +`sl_refs`, and inline wiki body references. If a generated semantic source is +invalid, **ktx** drops that source from the final save. + +The stored ingest report records these changes as `finalGatePrunedReferences` +and `finalGateDroppedSources`. The trace emits `final_gate_reference_pruned`, +`final_gate_source_dropped`, `final_gate_prune_committed`, and +`final_gate_prune_finished` events when pruning runs. If validation still fails +after pruning, the ingest fails and the report keeps the final validation error. + ## Inspect context-source ingest traces Context-source ingest writes persistent JSONL traces for postmortem debugging. 
diff --git a/packages/cli/src/context/cache/content-result-cache.ts b/packages/cli/src/context/cache/content-result-cache.ts new file mode 100644 index 00000000..bc2f2573 --- /dev/null +++ b/packages/cli/src/context/cache/content-result-cache.ts @@ -0,0 +1,64 @@ +import { createHash } from 'node:crypto'; + +type ContentCacheMetadata = Record; + +export interface ContentResultCacheLookup { + namespace: string; + scopeKey: string; + inputHash: string; +} + +export interface ContentResultCacheCompleted extends ContentResultCacheLookup { + runId: string; + status: 'completed'; + output: TOutput; + errorMessage: null; + metadata: ContentCacheMetadata; + updatedAt: string; +} + +export interface ContentResultCacheFailed extends ContentResultCacheLookup { + runId: string; + status: 'failed'; + output: null; + errorMessage: string; + metadata: ContentCacheMetadata; + updatedAt: string; +} + +export type ContentResultCacheRecord = + | ContentResultCacheCompleted + | ContentResultCacheFailed; + +export interface ContentResultCache { + findCompletedResult( + input: ContentResultCacheLookup, + ): Promise | null>; + findLatestCompletedResult(input: { + namespace: string; + scopeKey: string; + }): Promise; + saveCompletedResult( + input: Omit, 'status' | 'errorMessage'>, + ): Promise; + saveFailedResult(input: Omit): Promise; + deleteResult(input: ContentResultCacheLookup): Promise; + listRunResults(runId: string): Promise; +} + +function stableJson(value: unknown): string { + if (Array.isArray(value)) { + return `[${value.map(stableJson).join(',')}]`; + } + if (value && typeof value === 'object') { + const entries = Object.entries(value as Record).sort(([left], [right]) => + left.localeCompare(right), + ); + return `{${entries.map(([key, item]) => `${JSON.stringify(key)}:${stableJson(item)}`).join(',')}}`; + } + return JSON.stringify(value) ?? 
'undefined'; +} + +export function stableContentHash(value: unknown): string { + return createHash('sha256').update(stableJson(value)).digest('hex'); +} diff --git a/packages/cli/src/context/cache/sqlite-content-result-cache.ts b/packages/cli/src/context/cache/sqlite-content-result-cache.ts new file mode 100644 index 00000000..529c0af9 --- /dev/null +++ b/packages/cli/src/context/cache/sqlite-content-result-cache.ts @@ -0,0 +1,281 @@ +import { mkdirSync } from 'node:fs'; +import { dirname } from 'node:path'; +import Database from 'better-sqlite3'; +import type { + ContentResultCache, + ContentResultCacheCompleted, + ContentResultCacheFailed, + ContentResultCacheLookup, + ContentResultCacheRecord, +} from './content-result-cache.js'; + +export interface SqliteContentResultCacheOptions { + dbPath: string; +} + +interface ResultRow { + run_id: string; + namespace: string; + scope_key: string; + input_hash: string; + status: 'completed' | 'failed'; + output_json: string | null; + error_message: string | null; + metadata_json: string; + updated_at: string; +} + +const RESULTS_TABLE = 'local_content_results'; +const RESULTS_PRIMARY_KEY = ['namespace', 'scope_key', 'input_hash'] as const; + +function isSafeRunId(runId: string): boolean { + return /^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/.test(runId); +} + +function parseResultRow(row: ResultRow): ContentResultCacheRecord { + const base = { + runId: row.run_id, + namespace: row.namespace, + scopeKey: row.scope_key, + inputHash: row.input_hash, + metadata: JSON.parse(row.metadata_json || '{}') as Record, + updatedAt: row.updated_at, + }; + if (row.status === 'completed') { + return { + ...base, + status: 'completed', + output: JSON.parse(row.output_json ?? 'null') as TOutput, + errorMessage: null, + }; + } + return { + ...base, + status: 'failed', + output: null, + errorMessage: row.error_message ?? 
'Unknown content result failure', + }; +} + +export class SqliteContentResultCache implements ContentResultCache { + private readonly db: Database.Database; + + constructor(options: SqliteContentResultCacheOptions) { + mkdirSync(dirname(options.dbPath), { recursive: true }); + this.db = new Database(options.dbPath); + this.db.pragma('journal_mode = WAL'); + this.db.exec('DROP TABLE IF EXISTS local_scan_enrichment_stages'); + this.dropResultsTableIfPrimaryKeyDiffers(); + this.db.exec(` + CREATE TABLE IF NOT EXISTS local_content_results ( + run_id TEXT NOT NULL, + namespace TEXT NOT NULL, + scope_key TEXT NOT NULL, + input_hash TEXT NOT NULL, + status TEXT NOT NULL, + output_json TEXT, + error_message TEXT, + metadata_json TEXT NOT NULL, + updated_at TEXT NOT NULL, + PRIMARY KEY (namespace, scope_key, input_hash) + ); + + CREATE INDEX IF NOT EXISTS local_content_results_lookup_idx + ON local_content_results (namespace, scope_key, input_hash, updated_at); + CREATE INDEX IF NOT EXISTS local_content_results_run_idx + ON local_content_results (run_id, updated_at, namespace); + `); + } + + private dropResultsTableIfPrimaryKeyDiffers(): void { + const columns = this.db.prepare(`PRAGMA table_info(${RESULTS_TABLE})`).all() as Array<{ + name: string; + pk: number; + }>; + if (columns.length === 0) { + return; + } + const primaryKey = columns + .filter((column) => column.pk > 0) + .sort((left, right) => left.pk - right.pk) + .map((column) => column.name); + const matches = + primaryKey.length === RESULTS_PRIMARY_KEY.length && + primaryKey.every((name, index) => name === RESULTS_PRIMARY_KEY[index]); + if (!matches) { + this.db.exec(`DROP TABLE ${RESULTS_TABLE}`); + } + } + + async findCompletedResult( + input: ContentResultCacheLookup, + ): Promise | null> { + const row = this.db + .prepare( + ` + SELECT * + FROM local_content_results + WHERE namespace = ? + AND scope_key = ? + AND input_hash = ? 
+ AND status = 'completed' + ORDER BY updated_at DESC + LIMIT 1 + `, + ) + .get(input.namespace, input.scopeKey, input.inputHash) as ResultRow | undefined; + if (!row) { + return null; + } + const parsed = parseResultRow(row); + return parsed.status === 'completed' ? parsed : null; + } + + async findLatestCompletedResult(input: { + namespace: string; + scopeKey: string; + }): Promise { + const row = this.db + .prepare( + ` + SELECT * + FROM local_content_results + WHERE namespace = ? + AND scope_key = ? + AND status = 'completed' + ORDER BY updated_at DESC + LIMIT 1 + `, + ) + .get(input.namespace, input.scopeKey) as ResultRow | undefined; + if (!row) { + return null; + } + const parsed = parseResultRow(row); + return parsed.status === 'completed' ? parsed : null; + } + + async saveCompletedResult( + input: Omit, 'status' | 'errorMessage'>, + ): Promise { + this.db + .prepare( + ` + INSERT INTO local_content_results ( + run_id, + namespace, + scope_key, + input_hash, + status, + output_json, + error_message, + metadata_json, + updated_at + ) + VALUES ( + @runId, + @namespace, + @scopeKey, + @inputHash, + 'completed', + @outputJson, + NULL, + @metadataJson, + @updatedAt + ) + ON CONFLICT(namespace, scope_key, input_hash) DO UPDATE SET + run_id = excluded.run_id, + status = excluded.status, + output_json = excluded.output_json, + error_message = excluded.error_message, + metadata_json = excluded.metadata_json, + updated_at = excluded.updated_at + `, + ) + .run({ + runId: input.runId, + namespace: input.namespace, + scopeKey: input.scopeKey, + inputHash: input.inputHash, + outputJson: JSON.stringify(input.output), + metadataJson: JSON.stringify(input.metadata), + updatedAt: input.updatedAt, + }); + } + + async saveFailedResult(input: Omit): Promise { + this.db + .prepare( + ` + INSERT INTO local_content_results ( + run_id, + namespace, + scope_key, + input_hash, + status, + output_json, + error_message, + metadata_json, + updated_at + ) + VALUES ( + @runId, + 
@namespace, + @scopeKey, + @inputHash, + 'failed', + NULL, + @errorMessage, + @metadataJson, + @updatedAt + ) + ON CONFLICT(namespace, scope_key, input_hash) DO UPDATE SET + run_id = excluded.run_id, + status = excluded.status, + output_json = excluded.output_json, + error_message = excluded.error_message, + metadata_json = excluded.metadata_json, + updated_at = excluded.updated_at + `, + ) + .run({ + runId: input.runId, + namespace: input.namespace, + scopeKey: input.scopeKey, + inputHash: input.inputHash, + errorMessage: input.errorMessage, + metadataJson: JSON.stringify(input.metadata), + updatedAt: input.updatedAt, + }); + } + + async deleteResult(input: ContentResultCacheLookup): Promise { + this.db + .prepare( + ` + DELETE FROM local_content_results + WHERE namespace = ? + AND scope_key = ? + AND input_hash = ? + `, + ) + .run(input.namespace, input.scopeKey, input.inputHash); + } + + async listRunResults(runId: string): Promise { + if (!isSafeRunId(runId)) { + return []; + } + const rows = this.db + .prepare( + ` + SELECT * + FROM local_content_results + WHERE run_id = ? 
+ ORDER BY updated_at ASC, namespace ASC + `, + ) + .all(runId) as ResultRow[]; + return rows.map((row) => parseResultRow(row)); + } +} diff --git a/packages/cli/src/context/ingest/artifact-gates.ts b/packages/cli/src/context/ingest/artifact-gates.ts index 52b2df85..d241f9b9 100644 --- a/packages/cli/src/context/ingest/artifact-gates.ts +++ b/packages/cli/src/context/ingest/artifact-gates.ts @@ -3,7 +3,7 @@ import type { TouchedSlSource } from '../../context/tools/touched-sl-sources.js' import type { KnowledgeWikiService } from '../../context/wiki/knowledge-wiki.service.js'; import { findMissingWikiRefs } from '../wiki/wiki-ref-validation.js'; import type { WuValidationResult } from './stages/validate-wu-sources.js'; -import { findInvalidWikiBodyRefs } from './wiki-body-refs.js'; +import { findInvalidWikiBodyRefIssues, type WikiBodyRefIssue } from './wiki-body-refs.js'; export interface FinalArtifactGateInput { connectionIds: string[]; @@ -21,6 +21,31 @@ export interface ProvenanceRawPathValidationInput { deletedRawPaths: Set; } +export type FinalArtifactGateFinding = + | { kind: 'invalid_source'; connectionId: string; sourceName: string; errors: string[] } + | { + kind: 'missing_join_target'; + ownerConnectionId: string; + ownerSourceName: string; + targetSourceName: string; + message: string; + } + | { kind: 'missing_wiki_ref'; pageKey: string; targetPageKey: string; message: string } + | { + kind: 'missing_wiki_sl_ref'; + pageKey: string; + ref: string; + sourceName: string; + entityName: string | null; + message: string; + } + | WikiBodyRefIssue; + +export interface FinalArtifactGateResult { + ok: boolean; + findings: FinalArtifactGateFinding[]; +} + function normalizeRawPath(path: string): string { return path.replace(/\\/g, '/').replace(/^\/+/, ''); } @@ -40,8 +65,8 @@ function slEntityNames(source: Awaited { - const errors: string[] = []; +async function validateWikiSlRefs(input: FinalArtifactGateInput): Promise { + const findings: FinalArtifactGateFinding[] 
= []; const sourcesByConnection = new Map>['sources']>(); for (const connectionId of input.connectionIds) { const { sources } = await input.semanticLayerService.loadAllSources(connectionId); @@ -64,19 +89,33 @@ async function validateWikiSlRefs(input: FinalArtifactGateInput): Promise { - const dangling: string[] = []; +async function validateWikiRefs(input: FinalArtifactGateInput): Promise { + const findings: FinalArtifactGateFinding[] = []; for (const pageKey of input.changedWikiPageKeys) { const page = await input.wikiService.readPage('GLOBAL', null, pageKey); if (!page) { @@ -91,33 +130,82 @@ async function validateWikiRefs(input: FinalArtifactGateInput): Promise ${missingRef}`); + findings.push({ + kind: 'missing_wiki_ref', + pageKey, + targetPageKey: missingRef, + message: `${pageKey} -> ${missingRef}`, + }); } } - return dangling; + return findings; } -export async function validateFinalIngestArtifacts(input: FinalArtifactGateInput): Promise { +export function formatFinalArtifactGateFindings(findings: FinalArtifactGateFinding[]): string { + const errors = findings.map((finding) => { + if (finding.kind === 'invalid_source') { + return `semantic-layer validation failed for ${finding.connectionId}:${finding.sourceName}: ${finding.errors.join('; ')}`; + } + if (finding.kind === 'missing_wiki_ref') { + return `wiki reference targets missing page: ${finding.message}`; + } + return finding.message; + }); + return `final artifact gates failed:\n${errors.join('\n')}`; +} + +export function isFinalArtifactGateFindingPruneable(finding: FinalArtifactGateFinding): boolean { + switch (finding.kind) { + case 'invalid_source': + case 'missing_join_target': + case 'missing_wiki_ref': + case 'missing_wiki_sl_ref': + case 'missing_wiki_body_sl_entity': + case 'missing_wiki_body_sl_source': + case 'missing_wiki_body_table': + return true; + default: { + const exhaustive: never = finding; + return exhaustive; + } + } +} + +export async function validateFinalIngestArtifacts(input: 
FinalArtifactGateInput): Promise { // Join-neighbor expansion happens inside validateTouchedSources so work-unit // validation and this gate check the same set — a source that passes one // passes the other. const validation = await input.validateTouchedSources(input.touchedSlSources); - const errors: string[] = validation.invalidSources.map( - (invalid) => `semantic-layer validation failed for ${invalid.source}: ${invalid.errors.join('; ')}`, - ); - errors.push(...(await validateWikiSlRefs(input))); - const danglingWikiRefs = await validateWikiRefs(input); - if (danglingWikiRefs.length > 0) { - errors.push(`wiki references target missing page(s): ${danglingWikiRefs.join(', ')}`); + const findings: FinalArtifactGateFinding[] = []; + for (const invalid of validation.invalidSources) { + const [connectionId = '', sourceName = ''] = invalid.source.split(':', 2); + const issues = invalid.issues ?? invalid.errors.map((message) => ({ kind: 'source_validation' as const, message })); + const sourceErrors = issues.filter((issue) => issue.kind === 'source_validation').map((issue) => issue.message); + if (sourceErrors.length > 0) { + findings.push({ kind: 'invalid_source', connectionId, sourceName, errors: sourceErrors }); + } + for (const issue of issues) { + if (issue.kind === 'missing_join_target') { + findings.push({ + kind: 'missing_join_target', + ownerConnectionId: connectionId, + ownerSourceName: sourceName, + targetSourceName: issue.targetSourceName, + message: issue.message, + }); + } + } } + findings.push(...(await validateWikiSlRefs(input))); + findings.push(...(await validateWikiRefs(input))); for (const pageKey of input.changedWikiPageKeys) { const page = await input.wikiService.readPage('GLOBAL', null, pageKey); if (!page) { continue; } - errors.push( - ...(await findInvalidWikiBodyRefs({ + findings.push( + ...(await findInvalidWikiBodyRefIssues({ pageKey, body: page.content, visibleConnectionIds: input.connectionIds, @@ -130,9 +218,7 @@ export async function 
validateFinalIngestArtifacts(input: FinalArtifactGateInput ); } - if (errors.length > 0) { - throw new Error(`final artifact gates failed:\n${errors.join('\n')}`); - } + return { ok: findings.length === 0, findings }; } export function validateProvenanceRawPaths(input: ProvenanceRawPathValidationInput): void { diff --git a/packages/cli/src/context/ingest/final-gate-prune.ts b/packages/cli/src/context/ingest/final-gate-prune.ts new file mode 100644 index 00000000..50d8d163 --- /dev/null +++ b/packages/cli/src/context/ingest/final-gate-prune.ts @@ -0,0 +1,330 @@ +import YAML from 'yaml'; +import type { KtxFileStorePort } from '../core/file-store.js'; +import { listSlSourceFiles, resolveSlSourceFile, slSourceNameForFile } from '../sl/source-files.js'; +import type { KnowledgeWikiService } from '../wiki/knowledge-wiki.service.js'; +import type { FinalArtifactGateFinding } from './artifact-gates.js'; +import type { IngestTraceWriter } from './ingest-trace.js'; + +type FinalGatePrunedReferenceKind = 'join' | 'wiki_ref' | 'wiki_sl_ref' | 'wiki_body_ref'; +type SemanticLayerFileStore = Pick; + +interface ResolvedYamlSource { + path: string; + source: Record; +} + +export interface FinalGatePrunedReference { + kind: FinalGatePrunedReferenceKind; + artifact: string; + removedRef: string; + absentTarget: string; +} + +export interface FinalGateDroppedSource { + connectionId: string; + sourceName: string; + reason: string; +} + +export interface FinalGatePruneResult { + prunedReferences: FinalGatePrunedReference[]; + droppedSources: FinalGateDroppedSource[]; +} + +interface PruneInput { + workdir: string; + semanticLayerFiles: SemanticLayerFileStore; + findings: FinalArtifactGateFinding[]; + droppedSources: FinalGateDroppedSource[]; + trace: IngestTraceWriter; + author: { name: string; email: string }; + wikiService?: KnowledgeWikiService; +} + +async function resolveYamlSource( + fileStore: SemanticLayerFileStore, + connectionId: string, + sourceName: string, +): Promise { + 
const file = await resolveSlSourceFile(fileStore, connectionId, sourceName); + if (!file) { + return null; + } + const parsed = YAML.parse(file.content); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error(`${file.path}: expected semantic-layer source YAML object`); + } + return { path: file.path, source: parsed as Record }; +} + +async function writeYamlSource(input: { + fileStore: SemanticLayerFileStore; + path: string; + source: Record; + author: { name: string; email: string }; +}): Promise { + await input.fileStore.writeFile( + input.path, + YAML.stringify(input.source, { indent: 2, lineWidth: 0, version: '1.1' }), + input.author.name, + input.author.email, + `Prune dangling joins from ${input.path}`, + { skipLock: true }, + ); +} + +function removeInlineToken(content: string, rawToken: string): string { + return content.replaceAll(`\`${rawToken}\``, '').replace(/[ \t]+([.,;:!?])/g, '$1'); +} + +function escapeRegExp(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function removeWikiRefToken(content: string, targetPageKey: string): string { + const pattern = new RegExp(`\\[\\[\\s*${escapeRegExp(targetPageKey)}(?:\\|[^\\]\\n]+)?\\s*\\]\\]`, 'g'); + return content.replace(pattern, '').replace(/[ \t]+([.,;:!?])/g, '$1'); +} + +function wikiBodyAbsentTarget(finding: FinalArtifactGateFinding): string { + if (finding.kind === 'missing_wiki_body_table') { + return finding.tableRef; + } + if (finding.kind === 'missing_wiki_body_sl_source') { + return finding.sourceName; + } + if (finding.kind === 'missing_wiki_body_sl_entity') { + return `${finding.sourceName}.${finding.entityName}`; + } + return ''; +} + +/** Remove every join whose target matches `shouldRemove`, write the file back, and + * emit one pruned-reference record per distinct removed target. 
*/ +async function pruneJoinsFromSource(input: { + fileStore: SemanticLayerFileStore; + connectionId: string; + ownerSourceName: string; + resolved: ResolvedYamlSource; + shouldRemove: (target: string) => boolean; + author: { name: string; email: string }; + trace: IngestTraceWriter; +}): Promise { + if (!Array.isArray(input.resolved.source.joins)) { + return []; + } + const removed = new Set(); + const nextJoins = input.resolved.source.joins.filter((entry) => { + const to = entry && typeof entry === 'object' && 'to' in entry ? (entry as { to: unknown }).to : undefined; + if (typeof to === 'string' && input.shouldRemove(to)) { + removed.add(to); + return false; + } + return true; + }); + if (removed.size === 0) { + return []; + } + input.resolved.source.joins = nextJoins; + await writeYamlSource({ + fileStore: input.fileStore, + path: input.resolved.path, + source: input.resolved.source, + author: input.author, + }); + const records: FinalGatePrunedReference[] = []; + for (const target of removed) { + const record = { + kind: 'join' as const, + artifact: `semantic-layer/${input.connectionId}/${input.ownerSourceName}`, + removedRef: target, + absentTarget: target, + }; + records.push(record); + await input.trace.event('info', 'final_gates', 'final_gate_reference_pruned', record); + } + return records; +} + +export async function pruneFinalGateFindings(input: PruneInput): Promise { + const droppedSources = [...input.droppedSources]; + const prunedReferences: FinalGatePrunedReference[] = []; + const droppedKey = new Set(droppedSources.map((source) => `${source.connectionId}:${source.sourceName}`)); + + for (const finding of input.findings) { + if (finding.kind !== 'invalid_source') { + continue; + } + const key = `${finding.connectionId}:${finding.sourceName}`; + if (droppedKey.has(key)) { + continue; + } + const file = await resolveSlSourceFile(input.semanticLayerFiles, finding.connectionId, finding.sourceName); + if (!file) { + continue; + } + const deleted = await 
input.semanticLayerFiles.deleteFile( + file.path, + input.author.name, + input.author.email, + `Drop invalid source ${finding.connectionId}:${finding.sourceName}`, + { skipLock: true }, + ); + if (!deleted) { + continue; + } + const dropped = { + connectionId: finding.connectionId, + sourceName: finding.sourceName, + reason: finding.errors.join('; '), + }; + droppedSources.push(dropped); + droppedKey.add(key); + await input.trace.event('info', 'final_gates', 'final_gate_source_dropped', dropped); + } + + // A dropped node can leave a join dangling on any owner — including sources + // untouched by this run, which the touched-scoped gate (and the confirm gate + // after it) never revisit. Prune those edges directly (D5), or the committed + // orphan join breaks every SL query on the connection. + const droppedByConnection = new Map>(); + for (const dropped of droppedSources) { + const names = droppedByConnection.get(dropped.connectionId) ?? new Set(); + names.add(dropped.sourceName); + droppedByConnection.set(dropped.connectionId, names); + } + for (const [connectionId, droppedNames] of droppedByConnection) { + for (const file of await listSlSourceFiles(input.semanticLayerFiles, connectionId)) { + let parsed: unknown; + try { + parsed = YAML.parse(file.content); + } catch { + continue; + } + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + continue; + } + prunedReferences.push( + ...(await pruneJoinsFromSource({ + fileStore: input.semanticLayerFiles, + connectionId, + ownerSourceName: slSourceNameForFile(file.path, file.content), + resolved: { path: file.path, source: parsed as Record }, + shouldRemove: (target) => droppedNames.has(target), + author: input.author, + trace: input.trace, + })), + ); + } + } + + for (const finding of input.findings) { + if (finding.kind !== 'missing_join_target') { + continue; + } + const resolved = await resolveYamlSource( + input.semanticLayerFiles, + finding.ownerConnectionId, + finding.ownerSourceName, + ); + 
if (!resolved) { + continue; + } + prunedReferences.push( + ...(await pruneJoinsFromSource({ + fileStore: input.semanticLayerFiles, + connectionId: finding.ownerConnectionId, + ownerSourceName: finding.ownerSourceName, + resolved, + shouldRemove: (target) => target === finding.targetSourceName, + author: input.author, + trace: input.trace, + })), + ); + } + + const wikiFindings = input.findings.filter( + (finding) => + finding.kind === 'missing_wiki_ref' || + finding.kind === 'missing_wiki_sl_ref' || + finding.kind === 'missing_wiki_body_sl_source' || + finding.kind === 'missing_wiki_body_sl_entity' || + finding.kind === 'missing_wiki_body_table', + ); + const pageKeys = [...new Set(wikiFindings.map((finding) => finding.pageKey))].sort(); + for (const pageKey of pageKeys) { + const page = input.wikiService ? await input.wikiService.readPage('GLOBAL', null, pageKey) : null; + if (!page) { + continue; + } + const frontmatter = { ...page.frontmatter }; + let content = page.content; + let changed = false; + for (const finding of wikiFindings.filter((candidate) => candidate.pageKey === pageKey)) { + if (finding.kind === 'missing_wiki_ref') { + const refs = Array.isArray(frontmatter.refs) ? frontmatter.refs.filter((ref) => ref !== finding.targetPageKey) : []; + const nextContent = removeWikiRefToken(content, finding.targetPageKey); + if ((Array.isArray(frontmatter.refs) && refs.length !== frontmatter.refs.length) || nextContent !== content) { + if (Array.isArray(frontmatter.refs)) { + frontmatter.refs = refs; + } + content = nextContent; + changed = true; + const record = { + kind: 'wiki_ref' as const, + artifact: `wiki/global/${pageKey}`, + removedRef: finding.targetPageKey, + absentTarget: finding.targetPageKey, + }; + prunedReferences.push(record); + await input.trace.event('info', 'final_gates', 'final_gate_reference_pruned', record); + } + } else if (finding.kind === 'missing_wiki_sl_ref') { + const slRefs = Array.isArray(frontmatter.sl_refs) + ? 
frontmatter.sl_refs.filter((ref) => ref !== finding.ref) + : []; + if (Array.isArray(frontmatter.sl_refs) && slRefs.length !== frontmatter.sl_refs.length) { + frontmatter.sl_refs = slRefs; + changed = true; + const record = { + kind: 'wiki_sl_ref' as const, + artifact: `wiki/global/${pageKey}`, + removedRef: finding.ref, + absentTarget: finding.sourceName, + }; + prunedReferences.push(record); + await input.trace.event('info', 'final_gates', 'final_gate_reference_pruned', record); + } + } else { + const nextContent = removeInlineToken(content, finding.rawToken); + if (nextContent !== content) { + content = nextContent; + changed = true; + const record = { + kind: 'wiki_body_ref' as const, + artifact: `wiki/global/${pageKey}`, + removedRef: finding.rawToken, + absentTarget: wikiBodyAbsentTarget(finding), + }; + prunedReferences.push(record); + await input.trace.event('info', 'final_gates', 'final_gate_reference_pruned', record); + } + } + } + if (changed && input.wikiService) { + await input.wikiService.writePage( + 'GLOBAL', + null, + pageKey, + frontmatter, + content, + input.author.name, + input.author.email, + `Prune dangling refs from ${pageKey}`, + { skipLock: true }, + ); + } + } + + return { prunedReferences, droppedSources }; +} diff --git a/packages/cli/src/context/ingest/final-gate-repair.ts b/packages/cli/src/context/ingest/final-gate-repair.ts deleted file mode 100644 index ff2d1a9a..00000000 --- a/packages/cli/src/context/ingest/final-gate-repair.ts +++ /dev/null @@ -1,136 +0,0 @@ -import { z } from 'zod'; -import type { AgentRunnerPort, KtxRuntimeToolSet } from '../../context/llm/runtime-port.js'; -import type { ConstrainedRepairResult, RepairVerification } from './constrained-repair.js'; -import { runConstrainedRepairLoop } from './constrained-repair.js'; -import type { IngestTraceWriter } from './ingest-trace.js'; - -type FinalGateRepairKind = 'patch_semantic_gate' | 'final_artifact_gate'; - -export type FinalGateRepairResult = 
ConstrainedRepairResult; - -export interface RepairFinalGateFailureInput { - agentRunner: AgentRunnerPort; - workdir: string; - gateError: string; - allowedPaths: string[]; - trace: IngestTraceWriter; - repairKind: FinalGateRepairKind; - /** - * Re-runs the failed gate against the current worktree. The repair counts - * as successful only when this passes — editing files is not the success - * signal. - */ - verify(changedPaths: string[]): Promise; - maxAttempts?: number; - stepBudget?: number; - abortSignal?: AbortSignal; -} - -function buildGateRepairSystemPrompt(): string { - return ` -You repair one ktx isolated-diff artifact gate failure inside the integration worktree. - - - -- Use read_gate_error first. -- Read only files exposed by read_repair_file. -- Edit only paths exposed by write_repair_file. -- Prefer the smallest text edit that makes the gate pass. -- Preserve accepted work-unit, reconciliation, and deterministic projection content. -- Do not invent warehouse facts, business definitions, or semantic-layer entities. -- If the gate error requires choosing between conflicting facts without evidence, stop without editing. -`; -} - -function buildGateRepairUserPrompt(input: { - gateError: string; - allowedPaths: string[]; - repairKind: FinalGateRepairKind; - attempt: number; - maxAttempts: number; - previousFailure: string | null; -}): string { - const previousFailureBlock = input.previousFailure - ? `\nPrevious attempt did not pass the gate:\n${input.previousFailure}\n` - : ''; - return `Repair isolated-diff artifact gates. - -Repair kind: ${input.repairKind} -Attempt: ${input.attempt} of ${input.maxAttempts} - -Allowed files: -${input.allowedPaths.map((path) => `- ${path}`).join('\n')} - -Gate error: -${input.gateError} -${previousFailureBlock} -Use read_gate_error first. 
Then inspect only the allowed files, write the -minimal repaired content, and stop.`; -} - -function buildReadGateErrorTool(gateError: string): KtxRuntimeToolSet { - return { - read_gate_error: { - name: 'read_gate_error', - description: 'Read the artifact gate failure that must be repaired.', - inputSchema: z.object({}), - execute: async () => ({ - markdown: gateError, - structured: { gateError }, - }), - }, - }; -} - -export function finalGateRepairPaths(input: { - changedWikiPageKeys: string[]; - // Resolved by the caller: SL filenames are derived labels, so the repair - // allowlist must carry the real on-disk paths, not name-interpolated ones. - touchedSlSourcePaths: string[]; -}): string[] { - return [ - ...new Set([ - ...input.touchedSlSourcePaths, - ...input.changedWikiPageKeys.map((pageKey) => `wiki/global/${pageKey}.md`), - ]), - ].sort(); -} - -export async function repairFinalGateFailure( - input: RepairFinalGateFailureInput, -): Promise { - return runConstrainedRepairLoop({ - agentRunner: input.agentRunner, - workdir: input.workdir, - allowedPaths: input.allowedPaths, - trace: input.trace, - tracePhase: 'gate_repair', - traceEventName: 'gate_repair', - traceData: { - repairKind: input.repairKind, - gateError: input.gateError, - }, - systemPrompt: buildGateRepairSystemPrompt(), - buildUserPrompt: ({ attempt, maxAttempts, previousFailure }) => - buildGateRepairUserPrompt({ - gateError: input.gateError, - allowedPaths: [...input.allowedPaths].sort(), - repairKind: input.repairKind, - attempt, - maxAttempts, - previousFailure, - }), - buildExtraTools: () => buildReadGateErrorTool(input.gateError), - verify: input.verify, - noChangeFailureReason: 'gate repair completed without editing an allowed path', - telemetryTags: { - operationName: 'ingest-isolated-diff-gate-repair', - source: input.trace.context.sourceKey, - jobId: input.trace.context.jobId, - repairKind: input.repairKind, - }, - maxAttempts: input.maxAttempts, - stepBudget: input.stepBudget ?? 
16, - abortSignal: input.abortSignal, - }); -} diff --git a/packages/cli/src/context/ingest/ingest-bundle.runner.ts b/packages/cli/src/context/ingest/ingest-bundle.runner.ts index e054fce8..53503738 100644 --- a/packages/cli/src/context/ingest/ingest-bundle.runner.ts +++ b/packages/cli/src/context/ingest/ingest-bundle.runner.ts @@ -6,6 +6,7 @@ import { type KtxLogger, noopLogger } from '../../context/core/config.js'; import type { RateLimitWaitState } from '../../context/llm/rate-limit-governor.js'; import { createRuntimeToolDescriptorFromAiTool } from '../../context/llm/runtime-tools.js'; import type { KtxRuntimeToolSet } from '../../context/llm/runtime-port.js'; +import type { KtxModelRole } from '../../llm/types.js'; import type { CaptureSession, MemoryAction } from '../../context/memory/types.js'; import type { SemanticLayerService } from '../../context/sl/semantic-layer.service.js'; import { isSlYamlPath, slSourceFilePath, slSourceNameForFile, sourceNameFromPath } from '../../context/sl/source-files.js'; @@ -18,19 +19,28 @@ import type { KnowledgeWikiService } from '../../context/wiki/knowledge-wiki.ser import { findDanglingWikiRefsForActions } from '../wiki/wiki-ref-validation.js'; import { actionTargetConnectionId } from './action-identity.js'; import { NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN } from './adapters/notion/types.js'; -import { validateFinalIngestArtifacts, validateProvenanceRawPaths } from './artifact-gates.js'; +import { + formatFinalArtifactGateFindings, + isFinalArtifactGateFindingPruneable, + validateFinalIngestArtifacts, + validateProvenanceRawPaths, +} from './artifact-gates.js'; import { selectRelevantCanonicalPins } from './canonical-pins.js'; -import { finalGateRepairPaths, repairFinalGateFailure } from './final-gate-repair.js'; import { compareFinalizationDeclarations, deriveFinalizationTouchedSources, deriveFinalizationWikiPageKeys, } from './finalization-scope.js'; +import { + pruneFinalGateFindings, + type 
FinalGateDroppedSource, + type FinalGatePrunedReference, +} from './final-gate-prune.js'; import { FileIngestTraceWriter, ingestTracePathForJob, type IngestTraceWriter, traceTimed } from './ingest-trace.js'; import { formatIngestProfile, formatIngestProfileJson, readIngestProfile, resolveIngestProfileMode } from './ingest-profile.js'; import { integrateWorkUnitPatch } from './isolated-diff/patch-integrator.js'; import { resolveTextualConflict } from './isolated-diff/textual-conflict-resolver.js'; -import { runIsolatedWorkUnit } from './isolated-diff/work-unit-executor.js'; +import { runIsolatedWorkUnit, workUnitPatchFileName } from './isolated-diff/work-unit-executor.js'; import { sanitizeMemoryFlowError } from './memory-flow/live-buffer.js'; import type { CanonicalPin } from './canonical-pins.js'; import type { MemoryFlowEvent, MemoryFlowEventSink, MemoryFlowPlannedWorkUnit } from './memory-flow/types.js'; @@ -64,6 +74,14 @@ import { runReconciliationStage4 } from './stages/stage-4-reconciliation.js'; import type { StageIndex } from './stages/stage-index.types.js'; import { validateWuTouchedSources } from './stages/validate-wu-sources.js'; import { assertSemanticLayerTargetPathsAllowed } from './semantic-layer-target-policy.js'; +import { + computeIngestWorkUnitInputHash, + computeIngestWorkUnitPromptFingerprint, + INGEST_WORK_UNIT_CACHE_NAMESPACE, + ingestWorkUnitCacheScopeKey, + materializeCachedWorkUnitReplayPatch, + type IngestWorkUnitCachePayload, +} from './work-unit-cache.js'; import { createEmitArtifactResolutionTool } from './tools/emit-artifact-resolution.tool.js'; import { createEmitConflictResolutionTool } from './tools/emit-conflict-resolution.tool.js'; import { createEmitEvictionDecisionTool } from './tools/emit-eviction-decision.tool.js'; @@ -212,6 +230,15 @@ interface ProvenancePlan { diagnostics: ProvenanceRowDiagnostic[]; } +type CachedWorkUnitOutcome = WorkUnitOutcome & { + cacheInputHash: string; + cacheHit: true; +}; + +function 
isCachedWorkUnitOutcome(outcome: WorkUnitOutcome | CachedWorkUnitOutcome): outcome is CachedWorkUnitOutcome { + return 'cacheHit' in outcome && outcome.cacheHit === true; +} + export class IngestBundleRunner { private readonly logger: KtxLogger; private readonly chainByConnection = new Map>(); @@ -220,6 +247,137 @@ export class IngestBundleRunner { this.logger = deps.logger ?? noopLogger; } + private async cachedWorkUnitOutcome(input: { + runId: string; + syncId: string; + connectionId: string; + sourceKey: string; + stagedDir: string; + unit: WorkUnit; + unitIndex: number; + patchDir: string; + ingestionBaseSha: string; + promptFingerprint: string; + modelRole: KtxModelRole; + trace: IngestTraceWriter; + }): Promise { + const inputHash = await computeIngestWorkUnitInputHash({ + stagedDir: input.stagedDir, + connectionId: input.connectionId, + sourceKey: input.sourceKey, + unit: input.unit, + cliVersion: this.deps.settings.cliVersion, + promptFingerprint: input.promptFingerprint, + modelRole: input.modelRole, + }); + const cached = await this.deps.contentCache.findCompletedResult({ + namespace: INGEST_WORK_UNIT_CACHE_NAMESPACE, + scopeKey: ingestWorkUnitCacheScopeKey(input), + inputHash, + }); + if (!cached) { + await input.trace.event('trace', 'work_unit', 'work_unit_cache_miss', { + unitKey: input.unit.unitKey, + inputHash, + }); + return { cacheInputHash: inputHash, cacheHit: false }; + } + + await mkdir(input.patchDir, { recursive: true }); + const patchPath = join(input.patchDir, workUnitPatchFileName(input.unitIndex, input.unit.unitKey)); + if (cached.output.schemaVersion !== 2 || !Array.isArray(cached.output.artifactFiles)) { + await this.deps.contentCache.deleteResult({ + namespace: INGEST_WORK_UNIT_CACHE_NAMESPACE, + scopeKey: ingestWorkUnitCacheScopeKey(input), + inputHash, + }); + return { cacheInputHash: inputHash, cacheHit: false }; + } + const materialized = await materializeCachedWorkUnitReplayPatch({ + sessionWorktreeService: 
this.deps.sessionWorktreeService, + baseSha: input.ingestionBaseSha, + jobId: input.runId, + unitKey: input.unit.unitKey, + patchPath, + artifactFiles: cached.output.artifactFiles, + author: this.deps.storage.systemGitAuthor, + trace: input.trace, + }); + if (materialized === 'unsafe_drift') { + await this.deps.contentCache.deleteResult({ + namespace: INGEST_WORK_UNIT_CACHE_NAMESPACE, + scopeKey: ingestWorkUnitCacheScopeKey(input), + inputHash, + }); + await input.trace.event('debug', 'work_unit', 'work_unit_cache_unsafe_drift', { + unitKey: input.unit.unitKey, + inputHash, + }); + return { cacheInputHash: inputHash, cacheHit: false }; + } + await input.trace.event('debug', 'work_unit', 'work_unit_cache_hit', { + unitKey: input.unit.unitKey, + inputHash, + producerRunId: cached.runId, + artifactFileCount: cached.output.artifactFiles.length, + }); + await input.trace.event('debug', 'work_unit', 'work_unit_cache_replayed', { + unitKey: input.unit.unitKey, + patchPath, + inputHash, + }); + return { + unitKey: input.unit.unitKey, + status: 'success', + preSha: '', + postSha: '', + actions: cached.output.actions, + touchedSlSources: cached.output.touchedSlSources, + slDisallowed: cached.output.slDisallowed, + slDisallowedReason: cached.output.slDisallowedReason, + patchPath, + patchTouchedPaths: cached.output.patchTouchedPaths, + artifactFiles: cached.output.artifactFiles, + cacheInputHash: inputHash, + cacheHit: true, + }; + } + + private async saveSuccessfulWorkUnitCache(input: { + runId: string; + syncId: string; + connectionId: string; + sourceKey: string; + inputHash: string; + outcome: WorkUnitOutcome; + }): Promise { + if (input.outcome.status !== 'success' || !input.outcome.patchPath) { + return; + } + await this.deps.contentCache.saveCompletedResult({ + runId: input.runId, + namespace: INGEST_WORK_UNIT_CACHE_NAMESPACE, + scopeKey: ingestWorkUnitCacheScopeKey(input), + inputHash: input.inputHash, + output: { + schemaVersion: 2, + unitKey: input.outcome.unitKey, 
+ patchTouchedPaths: input.outcome.patchTouchedPaths ?? [], + artifactFiles: input.outcome.artifactFiles ?? [], + actions: input.outcome.actions, + touchedSlSources: input.outcome.touchedSlSources, + slDisallowed: input.outcome.slDisallowed, + slDisallowedReason: input.outcome.slDisallowedReason, + }, + metadata: { + syncId: input.syncId, + connectionId: input.connectionId, + sourceKey: input.sourceKey, + }, + updatedAt: new Date().toISOString(), + }); + } + async run(job: IngestBundleJob, ctx?: IngestJobContext): Promise { const unsubscribeRateLimitGovernor = this.subscribeRateLimitGovernor({ trace: this.createTrace(job), @@ -880,6 +1038,52 @@ export class IngestBundleRunner { ); } + private markFinalGateDroppedSourceWorkUnits(input: { + stageIndex: StageIndex; + workUnitOutcomes: WorkUnitOutcome[]; + failedWorkUnits: string[]; + droppedSources: FinalGateDroppedSource[]; + fallbackConnectionId: string; + }): void { + const unitFailures = new Map(); + for (const dropped of input.droppedSources) { + for (const workUnit of input.stageIndex.workUnits) { + const producedByAction = workUnit.actions.some( + (action) => + action.target === 'sl' && + action.key === dropped.sourceName && + actionTargetConnectionId(action, input.fallbackConnectionId) === dropped.connectionId, + ); + const producedByTouchedSource = workUnit.touchedSlSources.some( + (source) => source.connectionId === dropped.connectionId && source.sourceName === dropped.sourceName, + ); + if (!producedByAction && !producedByTouchedSource) { + continue; + } + const reasons = unitFailures.get(workUnit.unitKey) ?? 
[]; + reasons.push(`${dropped.connectionId}:${dropped.sourceName} (${dropped.reason})`); + unitFailures.set(workUnit.unitKey, reasons); + } + } + + for (const [unitKey, reasons] of unitFailures) { + const reason = `final artifact gate dropped invalid source(s): ${reasons.join(', ')}`; + const reportWorkUnit = input.stageIndex.workUnits.find((workUnit) => workUnit.unitKey === unitKey); + if (reportWorkUnit) { + reportWorkUnit.status = 'failed'; + reportWorkUnit.reason = reason; + } + const outcome = input.workUnitOutcomes.find((workUnit) => workUnit.unitKey === unitKey); + if (outcome) { + outcome.status = 'failed'; + outcome.reason = reason; + } + if (!input.failedWorkUnits.includes(unitKey)) { + input.failedWorkUnits.push(unitKey); + } + } + } + private finalGateActionOrigins(input: { stageIndex: StageIndex; reconcileActions: MemoryAction[]; @@ -1096,13 +1300,6 @@ export class IngestBundleRunner { agentRunner: this.deps.agentRunner, validateTouchedSources: (touched) => validateWuTouchedSources({ ...slValidationDeps, slValidator: this.deps.slValidator }, touched), - validateWikiRefs: (actions) => - findDanglingWikiRefsForActions({ - wikiService: input.scopedWikiService, - scope: 'GLOBAL', - scopeId: null, - actions, - }), resetHardTo: (targetSha) => input.worktree.git.resetHardTo(targetSha), buildSystemPrompt: () => systemPrompt, buildUserPrompt: (wuInner) => @@ -1514,6 +1711,16 @@ export class IngestBundleRunner { const wuSkills = await this.deps.skillsRegistry.listSkills(wuSkillNames, 'memory_agent'); const skillsPrompt = this.deps.skillsRegistry.buildSkillsPrompt(wuSkills, 'memory_agent'); const canonicalPins = await this.deps.canonicalPins.listPins(slConnectionIds); + const workUnitModelRole = 'candidateExtraction' as const; + const workUnitPromptFingerprint = computeIngestWorkUnitPromptFingerprint({ + cliVersion: this.deps.settings.cliVersion, + baseFraming, + skillsPrompt, + canonicalPins, + sourceKey: job.sourceKey, + connectionId: job.connectionId, + 
skillNames: wuSkillNames, + }); const workUnitOutcomes: WorkUnitOutcome[] = []; const failedWorkUnits: string[] = []; @@ -1550,9 +1757,6 @@ export class IngestBundleRunner { resolverAttempts: 0, resolverRepairs: 0, resolverFailures: 0, - gateRepairAttempts: 0, - gateRepairs: 0, - gateRepairFailures: 0, }; latestIsolatedDiffSummary = isolatedDiffSummary; @@ -1643,79 +1847,106 @@ export class IngestBundleRunner { await stage3?.updateProgress(1.0, '0 of 0 work units complete'); } + const runFreshIsolatedWorkUnit = async (wu: WorkUnit, index: number): Promise => + runIsolatedWorkUnit({ + unitIndex: index, + ingestionBaseSha, + sessionWorktreeService: this.deps.sessionWorktreeService, + patchDir, + trace: runTrace, + workUnit: wu, + abortSignal: ctx?.abortSignal, + afterSuccess: (child) => copyTransientIngestEvidence(child.workdir, sessionWorktree.workdir), + run: async (child) => { + const scopedWikiService = this.deps.wikiService.forWorktree(child.workdir); + const scopedSemanticLayerService = this.deps.semanticLayerService.forWorktree(child.workdir); + return this.runWorkUnitInWorktree({ + job, + syncId, + wu, + worktree: child, + stagedDir, + contextReport, + ingestToolMetadata, + slConnectionIds, + wikiIndex, + slIndex, + priorProvenance: await this.deps.provenance.findLatestArtifactsForRawPaths( + job.connectionId, + job.sourceKey, + wu.rawFiles, + ), + scopedWikiService, + scopedSemanticLayerService, + baseFraming, + skillsPrompt, + canonicalPins, + workUnitSettings, + transcriptDir, + transcriptSummaries, + recordTranscriptEntry, + stageIndex, + includeContextEvidenceTools: adapter.evidenceIndexing === 'documents' && !!contextReport, + currentTableExists: (tableRef) => + this.tableRefExistsInSemanticLayer(scopedSemanticLayerService, slConnectionIds, tableRef), + abortSignal: ctx?.abortSignal, + memoryFlow, + wuSkillNames, + }); + }, + }); + try { await Promise.all( workUnits.map((wu, index) => limitWorkUnit(() => this.withRateLimitWorkSlot(ctx?.abortSignal, 
async () => { - const outcome = await runIsolatedWorkUnit({ - unitIndex: index, - ingestionBaseSha, - sessionWorktreeService: this.deps.sessionWorktreeService, - patchDir, - trace: runTrace, - workUnit: wu, - abortSignal: ctx?.abortSignal, - afterSuccess: (child) => copyTransientIngestEvidence(child.workdir, sessionWorktree.workdir), - run: async (child) => { - const scopedWikiService = this.deps.wikiService.forWorktree(child.workdir); - const scopedSemanticLayerService = this.deps.semanticLayerService.forWorktree(child.workdir); - return this.runWorkUnitInWorktree({ - job, - syncId, - wu, - worktree: child, - stagedDir, - contextReport, - ingestToolMetadata, - slConnectionIds, - wikiIndex, - slIndex, - priorProvenance: await this.deps.provenance.findLatestArtifactsForRawPaths( - job.connectionId, - job.sourceKey, - wu.rawFiles, - ), - scopedWikiService, - scopedSemanticLayerService, - baseFraming, - skillsPrompt, - canonicalPins, - workUnitSettings, - transcriptDir, - transcriptSummaries, - recordTranscriptEntry, - stageIndex, - includeContextEvidenceTools: adapter.evidenceIndexing === 'documents' && !!contextReport, - currentTableExists: (tableRef) => - this.tableRefExistsInSemanticLayer(scopedSemanticLayerService, slConnectionIds, tableRef), - abortSignal: ctx?.abortSignal, - memoryFlow, - wuSkillNames, - }); - }, - }); - workUnitOutcomesByIndex[index] = outcome; - for (const action of outcome.actions) { - memoryFlow?.emit({ - type: 'candidate_action', - unitKey: outcome.unitKey, - target: action.target, - action: action.type, - key: action.key, + const cached = await this.cachedWorkUnitOutcome({ + runId: createdRunRow.id, + syncId, + connectionId: job.connectionId, + sourceKey: job.sourceKey, + stagedDir, + unit: wu, + unitIndex: index, + patchDir, + ingestionBaseSha, + promptFingerprint: workUnitPromptFingerprint, + modelRole: workUnitModelRole, + trace: runTrace, }); - } - memoryFlow?.emit({ - type: 'work_unit_finished', - unitKey: outcome.unitKey, - status: 
outcome.status, - ...(outcome.reason ? { reason: outcome.reason } : {}), - }); - completedWorkUnits += 1; - await stage3?.updateProgress( - completedWorkUnits / workUnits.length, - `${completedWorkUnits} of ${workUnits.length} work units complete`, - ); + const outcome = cached.cacheHit ? cached : await runFreshIsolatedWorkUnit(wu, index); + if (!cached.cacheHit) { + await this.saveSuccessfulWorkUnitCache({ + runId: createdRunRow.id, + syncId, + connectionId: job.connectionId, + sourceKey: job.sourceKey, + inputHash: cached.cacheInputHash, + outcome, + }); + } + workUnitOutcomesByIndex[index] = outcome; + for (const action of outcome.actions) { + memoryFlow?.emit({ + type: 'candidate_action', + unitKey: outcome.unitKey, + target: action.target, + action: action.type, + key: action.key, + }); + } + memoryFlow?.emit({ + type: 'work_unit_finished', + unitKey: outcome.unitKey, + status: outcome.status, + ...(outcome.reason ? { reason: outcome.reason } : {}), + }); + completedWorkUnits += 1; + await stage3?.updateProgress( + completedWorkUnits / workUnits.length, + `${completedWorkUnits} of ${workUnits.length} work units complete`, + ); }), ), ), @@ -1725,24 +1956,29 @@ export class IngestBundleRunner { throw error; } - workUnitOutcomes.push( - ...workUnitOutcomesByIndex.filter((outcome): outcome is WorkUnitOutcome => Boolean(outcome)), - ); - failedWorkUnits.push( - ...workUnitOutcomes.filter((outcome) => outcome.status === 'failed').map((outcome) => outcome.unitKey), - ); - latestWorkUnits = workUnitOutcomes; - latestFailedWorkUnits = failedWorkUnits; - stageIndex.workUnits = workUnitOutcomes.map((o) => ({ - unitKey: o.unitKey, - rawFiles: workUnits.find((w) => w.unitKey === o.unitKey)?.rawFiles ?? 
[], - status: o.status, - reason: o.reason, - actions: o.actions, - touchedSlSources: o.touchedSlSources, - slDisallowed: o.slDisallowed, - slDisallowedReason: o.slDisallowedReason, - })); + const refreshWorkUnitState = () => { + workUnitOutcomes.length = 0; + workUnitOutcomes.push( + ...workUnitOutcomesByIndex.filter((outcome): outcome is WorkUnitOutcome => Boolean(outcome)), + ); + failedWorkUnits.length = 0; + failedWorkUnits.push( + ...workUnitOutcomes.filter((outcome) => outcome.status === 'failed').map((outcome) => outcome.unitKey), + ); + latestWorkUnits = workUnitOutcomes; + latestFailedWorkUnits = failedWorkUnits; + stageIndex.workUnits = workUnitOutcomes.map((o) => ({ + unitKey: o.unitKey, + rawFiles: workUnits.find((w) => w.unitKey === o.unitKey)?.rawFiles ?? [], + status: o.status, + reason: o.reason, + actions: o.actions, + touchedSlSources: o.touchedSlSources, + slDisallowed: o.slDisallowed, + slDisallowedReason: o.slDisallowedReason, + })); + }; + refreshWorkUnitState(); activePhase = 'integration'; const integrablePatchCount = workUnitOutcomesByIndex.filter( @@ -1757,155 +1993,177 @@ export class IngestBundleRunner { if (!wu) { continue; } - const integrationFailureDetails = { - unitKey: outcome.unitKey, - patchPath: outcome.patchPath, - allowedTargetConnectionIds: slConnectionIds, - }; - activeFailureDetails = integrationFailureDetails; - emitStageProgress( - 'integration', - 80, - `Integrating ${integratedPatchCount + 1}/${integrablePatchCount} patches: ${outcome.unitKey}`, - ); - const integration = await integrateWorkUnitPatch({ - unitKey: outcome.unitKey, - patchPath: outcome.patchPath, - integrationGit: sessionWorktree.git, - trace: runTrace, - author: this.deps.storage.systemGitAuthor, - slDisallowed: wu.slDisallowed === true, - allowedTargetConnectionIds: new Set(slConnectionIds), - validateAppliedTree: async (touchedPaths) => { - await validateFinalIngestArtifacts({ - connectionIds: slConnectionIds, - changedWikiPageKeys: 
this.wikiPageKeysFromPaths(touchedPaths), - touchedSlSources: await this.touchedSlSourcesFromPaths( - sessionWorktree, - touchedPaths, - await sessionWorktree.git.revParseHead(), - ), - wikiService: this.deps.wikiService.forWorktree(sessionWorktree.workdir), - semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), - validateTouchedSources: (touched) => - validateWuTouchedSources( - { - semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), - connections: this.deps.connections, - configService: sessionWorktree.config, - gitService: sessionWorktree.git, - slSourcesRepository: this.deps.slSourcesRepository, - probeRowCount: this.deps.settings.probeRowCount, - slValidator: this.deps.slValidator, - }, - touched, - ), - tableExists: (connectionId, tableRef) => - this.tableRefExistsInSemanticLayer( - this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), - [connectionId], - tableRef, + let outcomeForIntegration: WorkUnitOutcome | CachedWorkUnitOutcome = outcome; + let recomputedCachedPatch = false; + while (true) { + const patchPath = outcomeForIntegration.patchPath; + if (!patchPath) { + activeFailureDetails = undefined; + break; + } + const integrationFailureDetails = { + unitKey: outcomeForIntegration.unitKey, + patchPath, + allowedTargetConnectionIds: slConnectionIds, + }; + activeFailureDetails = integrationFailureDetails; + emitStageProgress( + 'integration', + 80, + `Integrating ${integratedPatchCount + 1}/${integrablePatchCount} patches: ${outcomeForIntegration.unitKey}`, + ); + const integration = await integrateWorkUnitPatch({ + unitKey: outcomeForIntegration.unitKey, + patchPath, + integrationGit: sessionWorktree.git, + trace: runTrace, + author: this.deps.storage.systemGitAuthor, + slDisallowed: wu.slDisallowed === true, + allowedTargetConnectionIds: new Set(slConnectionIds), + validateAppliedTree: async (touchedPaths) => { + const gate = await validateFinalIngestArtifacts({ 
+ connectionIds: slConnectionIds, + changedWikiPageKeys: this.wikiPageKeysFromPaths(touchedPaths), + touchedSlSources: await this.touchedSlSourcesFromPaths( + sessionWorktree, + touchedPaths, + await sessionWorktree.git.revParseHead(), ), + wikiService: this.deps.wikiService.forWorktree(sessionWorktree.workdir), + semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), + validateTouchedSources: (touched) => + validateWuTouchedSources( + { + semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), + connections: this.deps.connections, + configService: sessionWorktree.config, + gitService: sessionWorktree.git, + slSourcesRepository: this.deps.slSourcesRepository, + probeRowCount: this.deps.settings.probeRowCount, + slValidator: this.deps.slValidator, + }, + touched, + ), + tableExists: (connectionId, tableRef) => + this.tableRefExistsInSemanticLayer( + this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), + [connectionId], + tableRef, + ), + }); + if (!gate.ok) { + const blocking = gate.findings.filter((finding) => !isFinalArtifactGateFindingPruneable(finding)); + if (blocking.length > 0) { + throw new Error(formatFinalArtifactGateFindings(blocking)); + } + await runTrace.event('debug', 'integration', 'patch_semantic_gate_deferred_to_final_prune', { + unitKey: outcomeForIntegration.unitKey, + findings: gate.findings, + }); + } + }, + resolveTextualConflict: async (context) => { + emitStageProgress('integration', 81, `Resolving text conflict for ${context.unitKey}`); + const result = await resolveTextualConflict({ + agentRunner: this.deps.agentRunner, + workdir: sessionWorktree.workdir, + unitKey: context.unitKey, + patchPath: context.patchPath, + touchedPaths: context.touchedPaths, + trace: runTrace, + reason: context.reason, + verify: context.verify, + maxAttempts: 2, + stepBudget: 12, + abortSignal: ctx?.abortSignal, + }); + emitStageProgress( + 'integration', + 82, + result.status 
=== 'repaired' + ? `Resolved text conflict for ${context.unitKey}` + : `Text conflict resolver failed for ${context.unitKey}`, + ); + return result; + }, + }); + if (integration.textualResolution) { + isolatedDiffSummary.resolverAttempts += integration.textualResolution.attempts; + if (integration.textualResolution.status === 'repaired') { + isolatedDiffSummary.textualConflicts += 1; + isolatedDiffSummary.resolverRepairs += 1; + } else { + isolatedDiffSummary.resolverFailures += 1; + } + } + if ( + integration.status !== 'accepted' && + isCachedWorkUnitOutcome(outcomeForIntegration) && + !recomputedCachedPatch + ) { + await this.deps.contentCache.deleteResult({ + namespace: INGEST_WORK_UNIT_CACHE_NAMESPACE, + scopeKey: ingestWorkUnitCacheScopeKey({ connectionId: job.connectionId, sourceKey: job.sourceKey }), + inputHash: outcomeForIntegration.cacheInputHash, }); - }, - resolveTextualConflict: async (context) => { - emitStageProgress('integration', 81, `Resolving text conflict for ${context.unitKey}`); - const result = await resolveTextualConflict({ - agentRunner: this.deps.agentRunner, - workdir: sessionWorktree.workdir, - unitKey: context.unitKey, - patchPath: context.patchPath, - touchedPaths: context.touchedPaths, - trace: runTrace, - reason: context.reason, - verify: context.verify, - maxAttempts: 2, - stepBudget: 12, - abortSignal: ctx?.abortSignal, + await runTrace.event('debug', 'integration', 'work_unit_cache_stale_recompute', { + unitKey: outcomeForIntegration.unitKey, + inputHash: outcomeForIntegration.cacheInputHash, + reason: integration.reason, }); - emitStageProgress( - 'integration', - 82, - result.status === 'repaired' - ? 
`Resolved text conflict for ${context.unitKey}` - : `Text conflict resolver failed for ${context.unitKey}`, - ); - return result; - }, - repairGateFailure: async (context) => { - emitStageProgress('integration', 82, `Repairing semantic gate for ${context.unitKey}`); - const result = await repairFinalGateFailure({ - agentRunner: this.deps.agentRunner, - workdir: sessionWorktree.workdir, - gateError: context.reason, - allowedPaths: context.touchedPaths, - trace: runTrace, - repairKind: 'patch_semantic_gate', - verify: context.verify, - maxAttempts: 2, - stepBudget: 16, - abortSignal: ctx?.abortSignal, + const recomputed = await runFreshIsolatedWorkUnit(wu, index); + workUnitOutcomesByIndex[index] = recomputed; + await this.saveSuccessfulWorkUnitCache({ + runId: createdRunRow.id, + syncId, + connectionId: job.connectionId, + sourceKey: job.sourceKey, + inputHash: outcomeForIntegration.cacheInputHash, + outcome: recomputed, }); - emitStageProgress( - 'integration', - 83, - result.status === 'repaired' - ? 
`Repaired semantic gate for ${context.unitKey}` - : `Semantic gate repair failed for ${context.unitKey}`, - ); - return result; - }, - }); - if (integration.textualResolution) { - isolatedDiffSummary.resolverAttempts += integration.textualResolution.attempts; - if (integration.textualResolution.status === 'repaired') { + refreshWorkUnitState(); + if (recomputed.status !== 'success' || !recomputed.patchPath) { + activeFailureDetails = undefined; + break; + } + outcomeForIntegration = recomputed; + recomputedCachedPatch = true; + continue; + } + if (integration.status === 'textual_conflict') { isolatedDiffSummary.textualConflicts += 1; - isolatedDiffSummary.resolverRepairs += 1; - } else { - isolatedDiffSummary.resolverFailures += 1; + await this.deps.runs.markFailed(runRow.id); + cleanupOutcome = 'conflict'; + activeFailureDetails = { + ...integrationFailureDetails, + touchedPaths: integration.touchedPaths, + reason: integration.reason, + }; + throw new Error(`isolated diff textual conflict in ${outcomeForIntegration.unitKey}: ${integration.reason}`); } - } - if (integration.gateRepair) { - isolatedDiffSummary.gateRepairAttempts += integration.gateRepair.attempts; - if (integration.gateRepair.status === 'repaired') { + if (integration.status === 'semantic_conflict') { isolatedDiffSummary.semanticConflicts += 1; - isolatedDiffSummary.gateRepairs += 1; - } else { - isolatedDiffSummary.gateRepairFailures += 1; + await this.deps.runs.markFailed(runRow.id); + cleanupOutcome = 'conflict'; + activeFailureDetails = { + ...integrationFailureDetails, + touchedPaths: integration.touchedPaths, + reason: integration.reason, + }; + throw new Error(`isolated diff semantic conflict in ${outcomeForIntegration.unitKey}: ${integration.reason}`); } + activeFailureDetails = undefined; + if (integration.touchedPaths.length > 0) { + isolatedDiffSummary.acceptedPatches += 1; + integratedPatchCount += 1; + } + emitStageProgress( + 'integration', + 83, + `Integrated 
${integratedPatchCount}/${integrablePatchCount} patches`, + ); + break; } - if (integration.status === 'textual_conflict') { - isolatedDiffSummary.textualConflicts += 1; - await this.deps.runs.markFailed(runRow.id); - cleanupOutcome = 'conflict'; - activeFailureDetails = { - ...integrationFailureDetails, - touchedPaths: integration.touchedPaths, - reason: integration.reason, - }; - throw new Error(`isolated diff textual conflict in ${outcome.unitKey}: ${integration.reason}`); - } - if (integration.status === 'semantic_conflict') { - isolatedDiffSummary.semanticConflicts += 1; - await this.deps.runs.markFailed(runRow.id); - cleanupOutcome = 'conflict'; - activeFailureDetails = { - ...integrationFailureDetails, - touchedPaths: integration.touchedPaths, - reason: integration.reason, - }; - throw new Error(`isolated diff semantic conflict in ${outcome.unitKey}: ${integration.reason}`); - } - activeFailureDetails = undefined; - if (integration.touchedPaths.length > 0) { - isolatedDiffSummary.acceptedPatches += 1; - integratedPatchCount += 1; - } - emitStageProgress( - 'integration', - 83, - `Integrated ${integratedPatchCount}/${integrablePatchCount} patches`, - ); } } @@ -2457,6 +2715,22 @@ export class IngestBundleRunner { ...finalizationTouchedSources.map((source) => source.connectionId), ]), ].sort(); + const preWikiSlRefRepairSha = await sessionWorktree.git.revParseHead(); + const preWikiSlRefRepairPaths = + preReconciliationSha && preWikiSlRefRepairSha && preReconciliationSha !== preWikiSlRefRepairSha + ? (await sessionWorktree.git.diffNameStatus(preReconciliationSha, preWikiSlRefRepairSha)).map( + (entry) => entry.path, + ) + : []; + const wikiPageKeysForFinalPrune = this.uniqueWikiPageKeys([ + ...(isolatedDiffEnabled ? projectionChangedWikiPageKeys : []), + ...workUnitOutcomes + .flatMap((outcome) => outcome.patchTouchedPaths ?? 
[]) + .flatMap((path) => this.wikiPageKeysFromPaths([path])), + ...this.wikiPageKeysFromActions(reconcileActions), + ...finalizationChangedWikiPageKeys, + ...preWikiSlRefRepairPaths.flatMap((path) => this.wikiPageKeysFromPaths([path])), + ]); activePhase = 'wiki_sl_ref_repair'; emitStageProgress('wiki_sl_ref_repair', 88, 'Repairing wiki semantic-layer references'); wikiSlRefRepairResult = await traceTimed( @@ -2470,6 +2744,7 @@ export class IngestBundleRunner { semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), configService: sessionWorktree.config, connectionIds: repairConnectionIds, + deferGlobalPageKeys: wikiPageKeysForFinalPrune, }), ); await runTrace.event('debug', 'wiki_sl_ref_repair', 'wiki_sl_refs_repaired', { @@ -2549,11 +2824,13 @@ export class IngestBundleRunner { activePhase = 'final_gates'; activeFailureDetails = finalArtifactGateTraceData; emitStageProgress('final_gates', 89, 'Running final artifact gates'); - const runFinalArtifactGates = async () => { - await validateFinalIngestArtifacts({ + let finalGatePrunedReferences: FinalGatePrunedReference[] = []; + let finalGateDroppedSources: FinalGateDroppedSource[] = []; + const runFinalArtifactGates = async (touchedSources = finalTouchedSlSources) => + validateFinalIngestArtifacts({ connectionIds: repairConnectionIds, changedWikiPageKeys: finalChangedWikiPageKeys, - touchedSlSources: finalTouchedSlSources, + touchedSlSources: touchedSources, wikiService: this.deps.wikiService.forWorktree(sessionWorktree.workdir), semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), validateTouchedSources: (touched) => @@ -2576,70 +2853,74 @@ export class IngestBundleRunner { tableRef, ), }); - }; - try { - await traceTimed( - runTrace, - 'final_gates', - 'final_artifact_gates', - finalArtifactGateTraceData, - runFinalArtifactGates, - ); - } catch (error) { - const gateError = this.errorMessage(error); - const repairPaths = 
finalGateRepairPaths({ - changedWikiPageKeys: finalChangedWikiPageKeys, - touchedSlSourcePaths: await this.touchedSlSourcePaths(sessionWorktree.workdir, finalTouchedSlSources), - }); - emitStageProgress('final_gates', 89, 'Repairing final artifact gates'); - const gateRepair = await repairFinalGateFailure({ - agentRunner: this.deps.agentRunner, + + const firstGate = await traceTimed( + runTrace, + 'final_gates', + 'final_artifact_gates', + finalArtifactGateTraceData, + () => runFinalArtifactGates(), + ); + if (!firstGate.ok) { + emitStageProgress('final_gates', 89, 'Pruning final artifact gates'); + const firstPrune = await pruneFinalGateFindings({ workdir: sessionWorktree.workdir, - gateError, - allowedPaths: repairPaths, + semanticLayerFiles: sessionWorktree.config, + findings: firstGate.findings, + droppedSources: [], trace: runTrace, - repairKind: 'final_artifact_gate', - verify: async () => { - try { - await runFinalArtifactGates(); - return { ok: true }; - } catch (verifyError) { - return { ok: false, reason: this.errorMessage(verifyError) }; - } - }, - maxAttempts: 2, - stepBudget: 16, - abortSignal: ctx?.abortSignal, + author: this.deps.storage.systemGitAuthor, + wikiService: this.deps.wikiService.forWorktree(sessionWorktree.workdir), }); - - isolatedDiffSummary.gateRepairAttempts += gateRepair.attempts; - if (gateRepair.status === 'failed') { - isolatedDiffSummary.gateRepairFailures += 1; - activeFailureDetails = { - ...finalArtifactGateTraceData, - gateRepair, - gateError, - }; - throw new Error(`${gateError}\ngate repair failed: ${gateRepair.reason}`); - } - - // The repair loop re-ran the gates via `verify` before reporting - // success, so a repaired status here means the tree already passed. 
- isolatedDiffSummary.gateRepairs += 1; - - const repairCommit = await sessionWorktree.git.commitFiles( - gateRepair.changedPaths, - `ingest(${job.sourceKey}): repair final gates syncId=${syncId}`, - this.deps.storage.systemGitAuthor.name, - this.deps.storage.systemGitAuthor.email, + finalGateDroppedSources = firstPrune.droppedSources; + finalGatePrunedReferences = firstPrune.prunedReferences; + const droppedKeys = new Set( + finalGateDroppedSources.map((source) => `${source.connectionId}:${source.sourceName}`), ); - if (!repairCommit.created) { - isolatedDiffSummary.gateRepairFailures += 1; - throw new Error('final gate repair produced no committable changes'); + const touchedAfterDrop = finalTouchedSlSources.filter( + (source) => !droppedKeys.has(`${source.connectionId}:${source.sourceName}`), + ); + const secondGate = await runFinalArtifactGates(touchedAfterDrop); + if (!secondGate.ok) { + const secondPrune = await pruneFinalGateFindings({ + workdir: sessionWorktree.workdir, + semanticLayerFiles: sessionWorktree.config, + findings: secondGate.findings.filter((finding) => finding.kind !== 'invalid_source'), + droppedSources: finalGateDroppedSources, + trace: runTrace, + author: this.deps.storage.systemGitAuthor, + wikiService: this.deps.wikiService.forWorktree(sessionWorktree.workdir), + }); + finalGateDroppedSources = secondPrune.droppedSources; + finalGatePrunedReferences = [...finalGatePrunedReferences, ...secondPrune.prunedReferences]; } - await runTrace.event('debug', 'final_gates', 'final_gate_repair_committed', { - commitSha: repairCommit.commitHash, - repairedPaths: gateRepair.changedPaths, + const pruneTouchedPaths = await sessionWorktree.git.changedPaths(); + if (pruneTouchedPaths.length > 0) { + const pruneCommit = await sessionWorktree.git.commitFiles( + pruneTouchedPaths, + `ingest(${job.sourceKey}): prune final gate findings syncId=${syncId}`, + this.deps.storage.systemGitAuthor.name, + this.deps.storage.systemGitAuthor.email, + ); + await 
runTrace.event('debug', 'final_gates', 'final_gate_prune_committed', { + commitSha: pruneCommit.created ? pruneCommit.commitHash : null, + touchedPaths: pruneTouchedPaths, + }); + } + const confirmGate = await runFinalArtifactGates(touchedAfterDrop); + if (!confirmGate.ok) { + throw new Error(`final artifact gates still failed after prune:\n${formatFinalArtifactGateFindings(confirmGate.findings)}`); + } + await runTrace.event('info', 'final_gates', 'final_gate_prune_finished', { + prunedReferences: finalGatePrunedReferences, + droppedSources: finalGateDroppedSources, + }); + this.markFinalGateDroppedSourceWorkUnits({ + stageIndex, + workUnitOutcomes, + failedWorkUnits, + droppedSources: finalGateDroppedSources, + fallbackConnectionId: job.connectionId, }); } activeFailureDetails = undefined; @@ -2866,6 +3147,8 @@ export class IngestBundleRunner { provenanceRows: reportProvenanceRows, toolTranscripts: reportToolTranscripts, finalization: finalizationOutcome, + finalGatePrunedReferences, + finalGateDroppedSources, wikiSlRefRepairs: wikiSlRefRepairResult.repairs, wikiSlRefRepairWarnings: wikiSlRefRepairResult.warnings, ...(reportMemoryFlow ? 
{ memoryFlow: reportMemoryFlow } : {}), @@ -2977,6 +3260,8 @@ export class IngestBundleRunner { failedWorkUnits, artifactsWritten: provenanceRows.filter((r) => r.actionType !== 'skipped').length, commitSha, + finalGatePrunedReferences, + finalGateDroppedSources, }; } finally { await this.deps.sessionWorktreeService.cleanup(sessionWorktree, cleanupOutcome); diff --git a/packages/cli/src/context/ingest/isolated-diff/patch-integrator.ts b/packages/cli/src/context/ingest/isolated-diff/patch-integrator.ts index 04cc099b..af0e60dd 100644 --- a/packages/cli/src/context/ingest/isolated-diff/patch-integrator.ts +++ b/packages/cli/src/context/ingest/isolated-diff/patch-integrator.ts @@ -1,7 +1,6 @@ import { readFile } from 'node:fs/promises'; import type { GitService } from '../../../context/core/git.service.js'; import type { RepairVerification } from '../constrained-repair.js'; -import type { FinalGateRepairResult } from '../final-gate-repair.js'; import type { IngestTraceWriter } from '../ingest-trace.js'; import { traceTimed } from '../ingest-trace.js'; import { assertPatchAllowedForWorkUnit, parsePatchTouchedPaths } from './git-patch.js'; @@ -13,21 +12,18 @@ export type PatchIntegrationResult = commitSha: string; touchedPaths: string[]; textualResolution?: TextualConflictResolutionResult; - gateRepair?: FinalGateRepairResult; } | { status: 'textual_conflict'; reason: string; touchedPaths: string[]; textualResolution?: TextualConflictResolutionResult; - gateRepair?: FinalGateRepairResult; } | { status: 'semantic_conflict'; reason: string; touchedPaths: string[]; textualResolution?: TextualConflictResolutionResult; - gateRepair?: FinalGateRepairResult; }; export interface IntegrateWorkUnitPatchInput { @@ -46,13 +42,6 @@ export interface IntegrateWorkUnitPatchInput { reason: string; verify(changedPaths: string[]): Promise; }): Promise; - repairGateFailure?(input: { - unitKey: string; - patchPath: string; - touchedPaths: string[]; - reason: string; - verify(changedPaths: 
string[]): Promise; - }): Promise; } function errorMessage(error: unknown): string { @@ -225,59 +214,6 @@ export async function integrateWorkUnitPatch(input: IntegrateWorkUnitPatchInput) reason, }); - if (input.repairGateFailure) { - const gateRepair = await input.repairGateFailure({ - unitKey: input.unitKey, - patchPath: input.patchPath, - touchedPaths, - reason, - verify: verifyAppliedTree, - }); - - if (gateRepair.status === 'failed') { - if (preApplyHead) { - await input.integrationGit.resetHardTo(preApplyHead); - } - return { - status: 'semantic_conflict', - reason: gateRepair.reason, - touchedPaths, - gateRepair, - }; - } - - const commit = await input.integrationGit.commitFiles( - gateRepair.changedPaths, - `ingest: repair WorkUnit ${input.unitKey} gates`, - input.author.name, - input.author.email, - ); - if (!commit.created) { - if (preApplyHead) { - await input.integrationGit.resetHardTo(preApplyHead); - } - return { - status: 'semantic_conflict', - reason: 'gate repair produced no committable changes', - touchedPaths: gateRepair.changedPaths, - gateRepair, - }; - } - - await input.trace.event('debug', 'integration', 'patch_accepted_after_gate_repair', { - unitKey: input.unitKey, - commitSha: commit.commitHash, - touchedPaths: gateRepair.changedPaths, - attempts: gateRepair.attempts, - }); - return { - status: 'accepted', - commitSha: commit.commitHash, - touchedPaths: gateRepair.changedPaths, - gateRepair, - }; - } - if (preApplyHead) { await input.integrationGit.resetHardTo(preApplyHead); } diff --git a/packages/cli/src/context/ingest/isolated-diff/work-unit-executor.ts b/packages/cli/src/context/ingest/isolated-diff/work-unit-executor.ts index 5ab52102..50811522 100644 --- a/packages/cli/src/context/ingest/isolated-diff/work-unit-executor.ts +++ b/packages/cli/src/context/ingest/isolated-diff/work-unit-executor.ts @@ -5,6 +5,7 @@ import type { IngestSessionWorktree, IngestSessionWorktreePort } from '../ports. 
import type { WorkUnit } from '../types.js'; import type { IngestTraceWriter } from '../ingest-trace.js'; import type { WorkUnitOutcome } from '../stages/stage-3-work-units.js'; +import { captureIngestWorkUnitCachedArtifactFiles } from '../work-unit-cache.js'; import { parsePatchTouchedPaths } from './git-patch.js'; export interface RunIsolatedWorkUnitInput { @@ -19,7 +20,7 @@ export interface RunIsolatedWorkUnitInput { afterSuccess?(child: IngestSessionWorktree): Promise; } -function patchFileName(unitIndex: number, unitKey: string): string { +export function workUnitPatchFileName(unitIndex: number, unitKey: string): string { const safeKey = unitKey.replace(/[^a-zA-Z0-9_.-]+/g, '-'); return `${String(unitIndex).padStart(4, '0')}-${safeKey}.patch`; } @@ -84,21 +85,29 @@ export async function runIsolatedWorkUnit(input: RunIsolatedWorkUnitInput): Prom await input.afterSuccess?.(child); await mkdir(input.patchDir, { recursive: true }); - const patchPath = join(input.patchDir, patchFileName(input.unitIndex, input.workUnit.unitKey)); + const patchPath = join(input.patchDir, workUnitPatchFileName(input.unitIndex, input.workUnit.unitKey)); await child.git.writeBinaryNoRenamePatch(input.ingestionBaseSha, 'HEAD', patchPath); const patch = await readFile(patchPath, 'utf-8'); const touched = parsePatchTouchedPaths(patch); + const patchTouchedPaths = touched.map((entry) => entry.path); + const artifactFiles = await captureIngestWorkUnitCachedArtifactFiles({ + git: child.git, + workdir: child.workdir, + baseSha: input.ingestionBaseSha, + patchTouchedPaths, + }); cleanupOutcome = 'success'; await input.trace.event('debug', 'work_unit', 'work_unit_patch_collected', { unitKey: input.workUnit.unitKey, patchPath, - touchedPaths: touched.map((entry) => entry.path), + touchedPaths: patchTouchedPaths, patchBytes: Buffer.byteLength(patch), }); return { ...outcome, patchPath, - patchTouchedPaths: touched.map((entry) => entry.path), + patchTouchedPaths, + artifactFiles, childWorktreePath: 
child.workdir, }; } catch (error) { diff --git a/packages/cli/src/context/ingest/local-bundle-runtime.ts b/packages/cli/src/context/ingest/local-bundle-runtime.ts index 69f0baa5..c8c118e4 100644 --- a/packages/cli/src/context/ingest/local-bundle-runtime.ts +++ b/packages/cli/src/context/ingest/local-bundle-runtime.ts @@ -15,6 +15,7 @@ import { createLocalKtxLlmRuntimeFromConfig } from '../../context/llm/local-conf import { KtxIngestEmbeddingPortAdapter } from '../../context/llm/embedding-port.js'; import { createRateLimitGovernorConfig, RateLimitGovernor } from '../../context/llm/rate-limit-governor.js'; import { RuntimeAgentRunner, type AgentRunnerPort, type KtxLlmRuntimePort, type KtxRuntimeToolSet } from '../../context/llm/runtime-port.js'; +import { getKtxCliPackageInfo } from '../../cli-runtime.js'; import type { KtxEmbeddingProvider } from '../../llm/types.js'; import type { KtxLocalProject } from '../../context/project/project.js'; import { ktxLocalStateDbPath } from '../../context/project/local-state-db.js'; @@ -54,6 +55,7 @@ import { WikiWriteTool } from '../../context/wiki/tools/wiki-write.tool.js'; import { CandidateDedupService } from '../../context/ingest/context-candidates/candidate-dedup.service.js'; import { ContextCandidateCarryforwardService } from '../../context/ingest/context-candidates/context-candidate-carryforward.service.js'; import { CuratorPaginationService } from '../../context/ingest/context-candidates/curator-pagination.service.js'; +import { SqliteContentResultCache } from '../cache/sqlite-content-result-cache.js'; import { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js'; import { ContextEvidenceIndexService } from '../../context/ingest/context-evidence/context-evidence-index.service.js'; import { SqliteContextEvidenceStore } from '../../context/ingest/context-evidence/sqlite-context-evidence-store.js'; @@ -657,6 +659,7 @@ export function createLocalBundleIngestRuntime( 
mkdirSync(join(options.project.projectDir, '.ktx/cache/local-ingest'), { recursive: true }); const store = new SqliteBundleIngestStore({ dbPath }); const contextStore = new SqliteContextEvidenceStore({ dbPath }); + const contentCache = new SqliteContentResultCache({ dbPath }); const embeddingProvider = options.embeddingProvider ?? null; if (!embeddingProvider && options.project.config.ingest.embeddings.backend !== 'none') { // Embedding-dependent stages (CandidateDedup clustering, ContextEvidenceIndex @@ -711,6 +714,7 @@ export function createLocalBundleIngestRuntime( provenance: store, reports: store, canonicalPins: store, + contentCache, registry, diffSetService: new DiffSetService(store), sessionWorktreeService: new SessionWorktreeService({ @@ -724,6 +728,7 @@ export function createLocalBundleIngestRuntime( storage, settings: { memoryIngestionModel: options.project.config.llm.models.default ?? 'local-ingest-model', + cliVersion: getKtxCliPackageInfo().version, probeRowCount: 0, workUnitMaxConcurrency: options.project.config.ingest.workUnits.maxConcurrency, workUnitStepBudget: options.project.config.ingest.workUnits.stepBudget, diff --git a/packages/cli/src/context/ingest/ports.ts b/packages/cli/src/context/ingest/ports.ts index 88294f59..4f80a0d8 100644 --- a/packages/cli/src/context/ingest/ports.ts +++ b/packages/cli/src/context/ingest/ports.ts @@ -18,6 +18,7 @@ import type { ToolContext } from '../../context/tools/base-tool.js'; import type { ToolSession } from '../../context/tools/tool-session.js'; import type { KnowledgeIndexPort } from '../../context/wiki/ports.js'; import type { KnowledgeWikiService } from '../../context/wiki/knowledge-wiki.service.js'; +import type { ContentResultCache } from '../cache/content-result-cache.js'; import type { CanonicalPin } from './canonical-pins.js'; import type { IngestTraceLevel } from './ingest-trace.js'; import type { IngestReportSnapshot } from './reports.js'; @@ -141,6 +142,7 @@ export interface 
IngestSessionWorktreePort { interface IngestSettingsPort { memoryIngestionModel: string; + cliVersion: string; probeRowCount: number; workUnitMaxConcurrency?: number; workUnitStepBudget?: number; @@ -333,6 +335,7 @@ export interface IngestBundleRunnerDeps { provenance: IngestProvenancePort; reports: IngestReportsPort; canonicalPins: IngestCanonicalPinsPort; + contentCache: ContentResultCache; registry: SourceAdapterRegistryPort; diffSetService: DiffSetComputerPort; sessionWorktreeService: IngestSessionWorktreePort; diff --git a/packages/cli/src/context/ingest/report-snapshot.ts b/packages/cli/src/context/ingest/report-snapshot.ts index d91f374b..8b1522fb 100644 --- a/packages/cli/src/context/ingest/report-snapshot.ts +++ b/packages/cli/src/context/ingest/report-snapshot.ts @@ -158,6 +158,19 @@ const finalizationOutcomeSchema = z.object({ provenanceExclusions: z.array(finalizationProvenanceExclusionSchema).default([]), }); +const finalGatePrunedReferenceSchema = z.object({ + kind: z.enum(['join', 'wiki_ref', 'wiki_sl_ref', 'wiki_body_ref']), + artifact: z.string(), + removedRef: z.string(), + absentTarget: z.string(), +}); + +const finalGateDroppedSourceSchema = z.object({ + connectionId: z.string(), + sourceName: z.string(), + reason: z.string(), +}); + const ingestReportSnapshotSchema = z .object({ id: z.string().min(1), @@ -187,9 +200,6 @@ const ingestReportSnapshotSchema = z resolverAttempts: z.number().int().min(0).default(0), resolverRepairs: z.number().int().min(0).default(0), resolverFailures: z.number().int().min(0).default(0), - gateRepairAttempts: z.number().int().min(0).default(0), - gateRepairs: z.number().int().min(0).default(0), - gateRepairFailures: z.number().int().min(0).default(0), }) .optional(), workUnits: z.array( @@ -218,6 +228,8 @@ const ingestReportSnapshotSchema = z provenanceRows: z.array(provenanceDetailSchema).default([]), toolTranscripts: z.array(toolTranscriptSummarySchema).default([]), finalization: 
finalizationOutcomeSchema.optional(), + finalGatePrunedReferences: z.array(finalGatePrunedReferenceSchema).default([]), + finalGateDroppedSources: z.array(finalGateDroppedSourceSchema).default([]), memoryFlow: memoryFlowReplayInputSchema.optional(), }) .passthrough(), diff --git a/packages/cli/src/context/ingest/reports.ts b/packages/cli/src/context/ingest/reports.ts index 09f92170..9f16aac3 100644 --- a/packages/cli/src/context/ingest/reports.ts +++ b/packages/cli/src/context/ingest/reports.ts @@ -1,6 +1,7 @@ import type { MemoryAction } from '../../context/memory/types.js'; import type { TouchedSlSource } from '../../context/tools/touched-sl-sources.js'; import type { MemoryFlowReplayInput } from './memory-flow/types.js'; +import type { FinalGateDroppedSource, FinalGatePrunedReference } from './final-gate-prune.js'; import type { IngestProvenanceInsert } from './ports.js'; import type { ArtifactResolutionRecord, @@ -93,9 +94,6 @@ export interface IngestReportBody { resolverAttempts?: number; resolverRepairs?: number; resolverFailures?: number; - gateRepairAttempts?: number; - gateRepairs?: number; - gateRepairFailures?: number; }; workUnits: IngestReportWorkUnit[]; failedWorkUnits: string[]; @@ -115,6 +113,8 @@ export interface IngestReportBody { provenanceRows: IngestReportProvenanceDetail[]; toolTranscripts: IngestReportToolTranscriptSummary[]; finalization?: IngestReportFinalizationOutcome; + finalGatePrunedReferences?: FinalGatePrunedReference[]; + finalGateDroppedSources?: FinalGateDroppedSource[]; wikiSlRefRepairs?: WikiSlRefRepair[]; wikiSlRefRepairWarnings?: string[]; memoryFlow?: MemoryFlowReplayInput; @@ -153,7 +153,10 @@ export function ingestReportOutcome(report: IngestReportSnapshot): IngestReportO if (report.body.status === 'failed') { return 'error'; } - if (report.body.failedWorkUnits.length === 0) { + const hasPruneOrDrop = + (report.body.finalGatePrunedReferences?.length ?? 0) > 0 || + (report.body.finalGateDroppedSources?.length ?? 
0) > 0; + if (report.body.failedWorkUnits.length === 0 && !hasPruneOrDrop) { return 'done'; } const { wikiCount, slCount } = savedMemoryCountsForReport(report); diff --git a/packages/cli/src/context/ingest/stages/stage-3-work-units.ts b/packages/cli/src/context/ingest/stages/stage-3-work-units.ts index 91f8b24b..4d4f4149 100644 --- a/packages/cli/src/context/ingest/stages/stage-3-work-units.ts +++ b/packages/cli/src/context/ingest/stages/stage-3-work-units.ts @@ -3,7 +3,12 @@ import { isAbortError } from '../../core/abort.js'; import type { AgentRunnerPort, KtxRuntimeToolSet, RunLoopMetrics } from '../../../context/llm/runtime-port.js'; import type { CaptureSession, MemoryAction } from '../../../context/memory/types.js'; import { listTouchedSlSources, type TouchedSlSource } from '../../../context/tools/touched-sl-sources.js'; -import { formatInvalidWuSources, type WuValidationResult } from './validate-wu-sources.js'; +import { + formatInvalidWuSources, + hasBlockingWuSourceIssue, + type WuValidationResult, +} from './validate-wu-sources.js'; +import type { IngestWorkUnitCachedArtifactFile } from '../work-unit-cache.js'; import type { WorkUnit } from '../types.js'; const MAX_WORK_UNIT_PROMPT_CHARS = 240_000; @@ -11,7 +16,6 @@ const MAX_WORK_UNIT_PROMPT_CHARS = 240_000; export interface WorkUnitExecutionDeps { sessionWorktreeGit: { revParseHead(): Promise }; agentRunner: AgentRunnerPort; - validateWikiRefs?: (actions: MemoryAction[]) => Promise; validateTouchedSources: (touched: TouchedSlSource[]) => Promise; resetHardTo: (targetSha: string) => Promise; buildSystemPrompt: (wu: WorkUnit) => string; @@ -40,6 +44,7 @@ export interface WorkUnitOutcome { slDisallowedReason?: 'lookml_connection_mismatch'; patchPath?: string; patchTouchedPaths?: string[]; + artifactFiles?: IngestWorkUnitCachedArtifactFile[]; childWorktreePath?: string; /** Timing and token metrics for the work-unit agent loop, used for ingest profiling. 
*/ metrics?: RunLoopMetrics; @@ -140,19 +145,12 @@ export async function executeWorkUnit(deps: WorkUnitExecutionDeps, wu: WorkUnit) return failWithReset(`${toolFailureCount} tool call(s) failed during WorkUnit ${wu.unitKey}`); } - const danglingWikiRefs = (await deps.validateWikiRefs?.(deps.sessionActions)) ?? []; - if (danglingWikiRefs.length > 0) { - return failWithReset(`wiki references target missing page(s): ${danglingWikiRefs.join(', ')}`); - } - const touched = listTouchedSlSources(deps.captureSession.touchedSlSources); if (touched.length > 0) { const validation = await deps.validateTouchedSources(touched); - if (validation.invalidSources.length > 0) { - // Spec: invalid SL writes reset the session worktree to the WU's pre-state, WU is marked failed, - // its files are absent from the Stage Index. Per-source surgical revert is the - // memory-agent pattern — NOT the bundle-ingest pattern. - return failWithReset(`sl_validate failed for: ${formatInvalidWuSources(validation.invalidSources)}`); + const blockingInvalidSources = validation.invalidSources.filter(hasBlockingWuSourceIssue); + if (blockingInvalidSources.length > 0) { + return failWithReset(`sl_validate failed for: ${formatInvalidWuSources(blockingInvalidSources)}`); } } diff --git a/packages/cli/src/context/ingest/stages/validate-wu-sources.ts b/packages/cli/src/context/ingest/stages/validate-wu-sources.ts index f89e5730..c2c12388 100644 --- a/packages/cli/src/context/ingest/stages/validate-wu-sources.ts +++ b/packages/cli/src/context/ingest/stages/validate-wu-sources.ts @@ -7,6 +7,7 @@ export interface InvalidWuSource { /** `${connectionId}:${sourceName}` */ source: string; errors: string[]; + issues?: WuValidationIssue[]; } export interface WuValidationResult { @@ -14,10 +15,24 @@ export interface WuValidationResult { invalidSources: InvalidWuSource[]; } +type WuValidationIssue = + | { kind: 'source_validation'; message: string } + | { kind: 'missing_join_target'; targetSourceName: string; 
caseMismatch: string | null; message: string }; + export function formatInvalidWuSources(invalid: InvalidWuSource[]): string { return invalid.map((entry) => `${entry.source} (${entry.errors.join('; ')})`).join(', '); } +export function hasBlockingWuSourceIssue(source: InvalidWuSource): boolean { + const issues = + source.issues ?? + source.errors.map((message) => ({ + kind: 'source_validation' as const, + message, + })); + return issues.some((issue) => issue.kind === 'source_validation'); +} + type LoadedSource = Awaited>['sources'][number]; function uniqueTouchedSources(sources: TouchedSlSource[]): TouchedSlSource[] { @@ -86,11 +101,11 @@ function expandWithExistingJoinNeighbors( * are out of scope — they must not block unrelated work. Resolution is the * Python engine's: exact source-name match within the connection. */ -function findJoinTargetErrors( +function findJoinTargetIssues( touched: TouchedSlSource[], sourcesByConnection: Map, -): Map { - const errorsBySource = new Map(); +): Map { + const issuesBySource = new Map(); const touchedByConnection = new Map>(); for (const source of touched) { const bucket = touchedByConnection.get(source.connectionId) ?? new Set(); @@ -114,11 +129,16 @@ function findJoinTargetErrors( continue; } const key = `${connectionId}:${source.name}`; - const messages = missing.map(formatMissingJoinTarget); - errorsBySource.set(key, [...(errorsBySource.get(key) ?? []), ...messages]); + const issues = missing.map((entry) => ({ + kind: 'missing_join_target' as const, + targetSourceName: entry.to, + caseMismatch: entry.caseMismatch, + message: formatMissingJoinTarget(entry), + })); + issuesBySource.set(key, [...(issuesBySource.get(key) ?? 
[]), ...issues]); } } - return errorsBySource; + return issuesBySource; } export async function validateWuTouchedSources( @@ -136,18 +156,20 @@ export async function validateWuTouchedSources( } const expanded = expandWithExistingJoinNeighbors(touched, sourcesByConnection); - const joinTargetErrors = findJoinTargetErrors(touched, sourcesByConnection); + const joinTargetIssues = findJoinTargetIssues(touched, sourcesByConnection); const valid: string[] = []; const invalid: InvalidWuSource[] = []; for (const source of expanded) { const key = `${source.connectionId}:${source.sourceName}`; const result = await deps.slValidator.validateSingleSource(deps, source.connectionId, source.sourceName); - const errors = [...result.errors, ...(joinTargetErrors.get(key) ?? [])]; + const sourceIssues: WuValidationIssue[] = result.errors.map((message) => ({ kind: 'source_validation', message })); + const issues = [...sourceIssues, ...(joinTargetIssues.get(key) ?? [])]; + const errors = issues.map((issue) => issue.message); if (errors.length === 0) { valid.push(key); } else { - invalid.push({ source: key, errors }); + invalid.push({ source: key, errors, issues }); } } return { validSources: valid, invalidSources: invalid }; diff --git a/packages/cli/src/context/ingest/types.ts b/packages/cli/src/context/ingest/types.ts index 925f3d82..180bfd3e 100644 --- a/packages/cli/src/context/ingest/types.ts +++ b/packages/cli/src/context/ingest/types.ts @@ -6,6 +6,7 @@ import type { KtxTableRefKey } from '../scan/table-ref.js'; import type { MemoryFlowEventSink } from './memory-flow/types.js'; import type { StageIndex } from './stages/stage-index.types.js'; import type { WorkUnitOutcome } from './stages/stage-3-work-units.js'; +import type { FinalGateDroppedSource, FinalGatePrunedReference } from './final-gate-prune.js'; export type IngestTrigger = 'upload' | 'scheduled_pull' | 'manual_resync' | 'manual_override'; @@ -210,6 +211,8 @@ export interface IngestBundleResult { failedWorkUnits: 
string[]; artifactsWritten: number; commitSha: string | null; + finalGatePrunedReferences?: FinalGatePrunedReference[]; + finalGateDroppedSources?: FinalGateDroppedSource[]; } export interface IngestJobPhase { diff --git a/packages/cli/src/context/ingest/wiki-body-refs.ts b/packages/cli/src/context/ingest/wiki-body-refs.ts index fa62aefa..161e8286 100644 --- a/packages/cli/src/context/ingest/wiki-body-refs.ts +++ b/packages/cli/src/context/ingest/wiki-body-refs.ts @@ -2,9 +2,9 @@ import type { SemanticLayerSource } from '../../context/sl/types.js'; /** @internal */ export type WikiBodyRef = - | { kind: 'sl_entity'; connectionId: string | null; sourceName: string; entityName: string } - | { kind: 'sl_source'; connectionId: string | null; sourceName: string } - | { kind: 'table'; connectionId: string | null; tableRef: string }; + | { kind: 'sl_entity'; connectionId: string | null; sourceName: string; entityName: string; rawToken: string } + | { kind: 'sl_source'; connectionId: string | null; sourceName: string; rawToken: string } + | { kind: 'table'; connectionId: string | null; tableRef: string; rawToken: string }; export interface WikiBodyRefValidationInput { pageKey: string; @@ -14,6 +14,33 @@ export interface WikiBodyRefValidationInput { tableExists(connectionId: string, tableRef: string): Promise; } +export type WikiBodyRefIssue = + | { + kind: 'missing_wiki_body_sl_entity'; + pageKey: string; + rawToken: string; + connectionId?: string; + sourceName: string; + entityName: string; + message: string; + } + | { + kind: 'missing_wiki_body_sl_source'; + pageKey: string; + rawToken: string; + connectionId?: string; + sourceName: string; + message: string; + } + | { + kind: 'missing_wiki_body_table'; + pageKey: string; + rawToken: string; + connectionId?: string; + tableRef: string; + message: string; + }; + const inlineCodePattern = /`([^`\n]+)`/g; function visibleLinesOutsideFences(body: string): string[] { @@ -56,14 +83,14 @@ export function parseWikiBodyRefs(body: 
string): WikiBodyRef[] { if (scoped.body.startsWith('source:')) { const sourceName = scoped.body.slice('source:'.length).trim(); if (sourceName) { - refs.push({ kind: 'sl_source', connectionId: scoped.connectionId, sourceName }); + refs.push({ kind: 'sl_source', connectionId: scoped.connectionId, sourceName, rawToken: token }); } continue; } if (scoped.body.startsWith('table:')) { const tableRef = scoped.body.slice('table:'.length).trim(); if (tableRef) { - refs.push({ kind: 'table', connectionId: scoped.connectionId, tableRef }); + refs.push({ kind: 'table', connectionId: scoped.connectionId, tableRef, rawToken: token }); } continue; } @@ -74,6 +101,7 @@ export function parseWikiBodyRefs(body: string): WikiBodyRef[] { connectionId: scoped.connectionId, sourceName: parts[0], entityName: parts[1], + rawToken: token, }); } } @@ -89,8 +117,8 @@ function entityNames(source: SemanticLayerSource): Set { ]); } -export async function findInvalidWikiBodyRefs(input: WikiBodyRefValidationInput): Promise { - const errors: string[] = []; +export async function findInvalidWikiBodyRefIssues(input: WikiBodyRefValidationInput): Promise { + const issues: WikiBodyRefIssue[] = []; const sourceCache = new Map(); const loadSources = async (connectionId: string): Promise => { const cached = sourceCache.get(connectionId); @@ -120,7 +148,15 @@ export async function findInvalidWikiBodyRefs(input: WikiBodyRefValidationInput) if (ref.kind === 'table') { const found = await Promise.all(connectionIds.map((connectionId) => input.tableExists(connectionId, ref.tableRef))); if (!found.some(Boolean)) { - errors.push(`${input.pageKey}: unknown raw table ${ref.connectionId ? `${ref.connectionId}/` : ''}${ref.tableRef}`); + const target = `${ref.connectionId ? `${ref.connectionId}/` : ''}${ref.tableRef}`; + issues.push({ + kind: 'missing_wiki_body_table', + pageKey: input.pageKey, + rawToken: ref.rawToken, + ...(ref.connectionId ? 
{ connectionId: ref.connectionId } : {}), + tableRef: ref.tableRef, + message: `${input.pageKey}: unknown raw table ${target}`, + }); } continue; } @@ -128,16 +164,35 @@ export async function findInvalidWikiBodyRefs(input: WikiBodyRefValidationInput) const found = await findSource(connectionIds, ref.sourceName); if (!found) { if (ref.kind === 'sl_source') { - errors.push( - `${input.pageKey}: unknown semantic-layer source ${ref.connectionId ? `${ref.connectionId}/` : ''}${ref.sourceName}`, - ); + const target = `${ref.connectionId ? `${ref.connectionId}/` : ''}${ref.sourceName}`; + issues.push({ + kind: 'missing_wiki_body_sl_source', + pageKey: input.pageKey, + rawToken: ref.rawToken, + ...(ref.connectionId ? { connectionId: ref.connectionId } : {}), + sourceName: ref.sourceName, + message: `${input.pageKey}: unknown semantic-layer source ${target}`, + }); } continue; } if (ref.kind === 'sl_entity' && !entityNames(found.source).has(ref.entityName)) { - errors.push(`${input.pageKey}: unknown semantic-layer entity ${ref.sourceName}.${ref.entityName}`); + issues.push({ + kind: 'missing_wiki_body_sl_entity', + pageKey: input.pageKey, + rawToken: ref.rawToken, + ...(ref.connectionId ? 
{ connectionId: ref.connectionId } : {}), + sourceName: ref.sourceName, + entityName: ref.entityName, + message: `${input.pageKey}: unknown semantic-layer entity ${ref.sourceName}.${ref.entityName}`, + }); } } - return errors; + return issues; +} + +/** @internal */ +export async function findInvalidWikiBodyRefs(input: WikiBodyRefValidationInput): Promise { + return (await findInvalidWikiBodyRefIssues(input)).map((issue) => issue.message); } diff --git a/packages/cli/src/context/ingest/wiki-sl-ref-repair.ts b/packages/cli/src/context/ingest/wiki-sl-ref-repair.ts index 7dcd1bd1..01602bf9 100644 --- a/packages/cli/src/context/ingest/wiki-sl-ref-repair.ts +++ b/packages/cli/src/context/ingest/wiki-sl-ref-repair.ts @@ -82,6 +82,7 @@ export async function repairWikiSlRefs(input: { semanticLayerService: SemanticLayerService; configService: KtxFileStorePort; connectionIds: string[]; + deferGlobalPageKeys?: string[]; }): Promise { const { refs: validRefs, warnings } = await loadVisibleSlRefs(input.semanticLayerService, input.connectionIds); const listFiles = @@ -96,12 +97,16 @@ export async function repairWikiSlRefs(input: { } const listed = await listFiles('wiki', true); const repairs: WikiSlRefRepair[] = []; + const deferredGlobalPageKeys = new Set(input.deferGlobalPageKeys ?? 
[]); for (const file of listed.files.sort()) { const parsedPath = parseKnowledgeFilePath(file); if (!parsedPath) { continue; } + if (parsedPath.scope === 'GLOBAL' && deferredGlobalPageKeys.has(parsedPath.pageKey)) { + continue; + } const page = await input.wikiService.readPage(parsedPath.scope, parsedPath.scopeId, parsedPath.pageKey); const refs = uniqueStringArray(page?.frontmatter.sl_refs); if (!page || refs.length === 0) { diff --git a/packages/cli/src/context/ingest/work-unit-cache.ts b/packages/cli/src/context/ingest/work-unit-cache.ts new file mode 100644 index 00000000..42dc2ba7 --- /dev/null +++ b/packages/cli/src/context/ingest/work-unit-cache.ts @@ -0,0 +1,314 @@ +import { mkdir, readFile, rm, writeFile } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import { isDeepStrictEqual } from 'node:util'; +import YAML from 'yaml'; +import type { KtxModelRole } from '../../llm/types.js'; +import { stableContentHash } from '../cache/content-result-cache.js'; +import type { GitService } from '../core/git.service.js'; +import type { MemoryAction } from '../memory/types.js'; +import type { TouchedSlSource } from '../tools/touched-sl-sources.js'; +import type { IngestTraceWriter } from './ingest-trace.js'; +import type { IngestSessionWorktreePort } from './ports.js'; +import type { WorkUnit } from './types.js'; + +export const INGEST_WORK_UNIT_CACHE_NAMESPACE = 'ingest:work-unit'; + +export interface IngestWorkUnitCachedArtifactFile { + path: string; + beforeBase64: string | null; + afterBase64: string | null; +} + +export interface IngestWorkUnitCachePayload { + schemaVersion: 2; + unitKey: string; + patchTouchedPaths: string[]; + // Replay re-derives the patch from these before/after snapshots; the diff text + // itself is never stored, so the payload carries each touched file only once. 
+ artifactFiles: IngestWorkUnitCachedArtifactFile[]; + actions: MemoryAction[]; + touchedSlSources: TouchedSlSource[]; + slDisallowed?: boolean; + slDisallowedReason?: 'lookml_connection_mismatch'; +} + +export interface ComputeIngestWorkUnitInputHashInput { + stagedDir: string; + connectionId: string; + sourceKey: string; + unit: WorkUnit; + cliVersion: string; + promptFingerprint: string; + modelRole: KtxModelRole; +} + +async function fileDigest( + stagedDir: string, + path: string, +): Promise<{ path: string; status: 'present' | 'missing'; hash: string | null }> { + try { + const bytes = await readFile(join(stagedDir, path)); + return { path, status: 'present', hash: stableContentHash(bytes.toString('base64')) }; + } catch (error) { + if (error && typeof error === 'object' && 'code' in error && error.code === 'ENOENT') { + return { path, status: 'missing', hash: null }; + } + throw error; + } +} + +export async function computeIngestWorkUnitInputHash(input: ComputeIngestWorkUnitInputHashInput): Promise { + const rawFiles = [...input.unit.rawFiles].sort(); + const dependencyPaths = [...input.unit.dependencyPaths].sort(); + const [raw, dependencies] = await Promise.all([ + Promise.all(rawFiles.map((path) => fileDigest(input.stagedDir, path))), + Promise.all(dependencyPaths.map((path) => fileDigest(input.stagedDir, path))), + ]); + + return stableContentHash({ + schemaVersion: 2, + connectionId: input.connectionId, + sourceKey: input.sourceKey, + unitKey: input.unit.unitKey, + rawFiles: raw, + dependencyPaths: dependencies, + slDisallowed: input.unit.slDisallowed === true, + slDisallowedReason: input.unit.slDisallowedReason ?? 
null, + cliVersion: input.cliVersion, + promptFingerprint: input.promptFingerprint, + modelRole: input.modelRole, + }); +} + +async function readFileBase64(workdir: string, path: string): Promise { + try { + return (await readFile(join(workdir, path))).toString('base64'); + } catch (error) { + if (error && typeof error === 'object' && 'code' in error && error.code === 'ENOENT') { + return null; + } + throw error; + } +} + +async function readGitFileBase64(git: GitService, path: string, commitSha: string): Promise { + try { + return Buffer.from(await git.getFileAtCommit(path, commitSha), 'utf-8').toString('base64'); + } catch { + return null; + } +} + +function decodeBase64(value: string | null): string | null { + return value === null ? null : Buffer.from(value, 'base64').toString('utf-8'); +} + +function parseYamlObject(content: string): Record | null { + const parsed = YAML.parse(content); + return parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? (parsed as Record) : null; +} + +function isSubsequenceOfDeepValues(current: unknown[], output: unknown[]): boolean { + let outputIndex = 0; + for (const item of current) { + while (outputIndex < output.length && !isDeepStrictEqual(item, output[outputIndex])) { + outputIndex += 1; + } + if (outputIndex >= output.length) { + return false; + } + outputIndex += 1; + } + return true; +} + +function isSemanticLayerPruneShape(current: string, output: string): boolean { + const currentYaml = parseYamlObject(current); + const outputYaml = parseYamlObject(output); + if (!currentYaml || !outputYaml) { + return false; + } + const currentJoins = Array.isArray(currentYaml.joins) ? currentYaml.joins : []; + const outputJoins = Array.isArray(outputYaml.joins) ? 
outputYaml.joins : []; + if (currentJoins.length >= outputJoins.length) { + return false; + } + if (!isSubsequenceOfDeepValues(currentJoins, outputJoins)) { + return false; + } + const normalizedOutput = { ...outputYaml, joins: currentJoins }; + return isDeepStrictEqual(currentYaml, normalizedOutput); +} + +function parseWikiPage(raw: string): { frontmatter: Record; content: string } | null { + const match = raw.match(/^---\n([\s\S]*?)\n---\n?([\s\S]*)$/); + if (!match) { + return null; + } + const frontmatter = YAML.parse(match[1] ?? '') as Record; + return { frontmatter, content: (match[2] ?? '').trim() }; +} + +function withoutRemovedWikiTokens(output: string, current: string): string { + let projected = output; + for (const match of output.matchAll(/\[\[\s*([^|\]\n]+)(?:\|[^\]\n]+)?\s*\]\]/g)) { + const token = match[0] ?? ''; + if (token && !current.includes(token)) { + projected = projected.replaceAll(token, '').replace(/[ \t]+([.,;:!?])/g, '$1'); + } + } + for (const match of output.matchAll(/`([^`\n]+)`/g)) { + const token = match[0] ?? ''; + if (token && !current.includes(token)) { + projected = projected.replaceAll(token, '').replace(/[ \t]+([.,;:!?])/g, '$1'); + } + } + projected = projected.replace(/,\s*,/g, ',').replace(/[ \t]+([.,;:!?])/g, '$1'); + return projected.trim(); +} + +function isWikiPruneShape(current: string, output: string): boolean { + const currentPage = parseWikiPage(current); + const outputPage = parseWikiPage(output); + if (!currentPage || !outputPage) { + return false; + } + const currentRefs = Array.isArray(currentPage.frontmatter.refs) ? currentPage.frontmatter.refs : []; + const outputRefs = Array.isArray(outputPage.frontmatter.refs) ? outputPage.frontmatter.refs : []; + const currentSlRefs = Array.isArray(currentPage.frontmatter.sl_refs) ? currentPage.frontmatter.sl_refs : []; + const outputSlRefs = Array.isArray(outputPage.frontmatter.sl_refs) ? 
outputPage.frontmatter.sl_refs : []; + if (currentRefs.length > outputRefs.length || currentSlRefs.length > outputSlRefs.length) { + return false; + } + if (!isSubsequenceOfDeepValues(currentRefs, outputRefs) || !isSubsequenceOfDeepValues(currentSlRefs, outputSlRefs)) { + return false; + } + const normalizedOutputFrontmatter = { + ...outputPage.frontmatter, + refs: currentRefs, + sl_refs: currentSlRefs, + }; + if (!isDeepStrictEqual(currentPage.frontmatter, normalizedOutputFrontmatter)) { + return false; + } + return withoutRemovedWikiTokens(outputPage.content, currentPage.content) === currentPage.content.trim(); +} + +/** @internal */ +export function isPruneShapedCachedReplayBase(path: string, currentContent: string, outputContent: string): boolean { + if (path.startsWith('semantic-layer/') && path.endsWith('.yaml')) { + return isSemanticLayerPruneShape(currentContent, outputContent); + } + if (path.startsWith('wiki/') && path.endsWith('.md')) { + return isWikiPruneShape(currentContent, outputContent); + } + return false; +} + +export async function captureIngestWorkUnitCachedArtifactFiles(input: { + git: GitService; + workdir: string; + baseSha: string; + patchTouchedPaths: string[]; +}): Promise { + const paths = [...new Set(input.patchTouchedPaths)].sort(); + return Promise.all( + paths.map(async (path) => ({ + path, + beforeBase64: await readGitFileBase64(input.git, path, input.baseSha), + afterBase64: await readFileBase64(input.workdir, path), + })), + ); +} + +async function writeCachedFile(workdir: string, file: IngestWorkUnitCachedArtifactFile): Promise { + const target = join(workdir, file.path); + if (file.afterBase64 === null) { + await rm(target, { force: true }); + return; + } + await mkdir(dirname(target), { recursive: true }); + await writeFile(target, Buffer.from(file.afterBase64, 'base64')); +} + +function cacheFileCanReplayFromCurrentBase(file: IngestWorkUnitCachedArtifactFile, currentBase64: string | null): boolean { + if (currentBase64 === 
file.beforeBase64 || currentBase64 === file.afterBase64) { + return true; + } + const current = decodeBase64(currentBase64); + const output = decodeBase64(file.afterBase64); + if (current === null || output === null) { + return false; + } + return isPruneShapedCachedReplayBase(file.path, current, output); +} + +export async function materializeCachedWorkUnitReplayPatch(input: { + sessionWorktreeService: IngestSessionWorktreePort; + baseSha: string; + jobId: string; + unitKey: string; + patchPath: string; + artifactFiles: IngestWorkUnitCachedArtifactFile[]; + author: { name: string; email: string }; + trace: IngestTraceWriter; +}): Promise<'materialized' | 'unsafe_drift'> { + const replay = await input.sessionWorktreeService.create(`${input.jobId}-${input.unitKey}-cache-replay`, input.baseSha); + let cleanup: 'success' | 'crash' = 'crash'; + try { + for (const file of input.artifactFiles) { + const currentBase64 = await readFileBase64(replay.workdir, file.path); + if (!cacheFileCanReplayFromCurrentBase(file, currentBase64)) { + cleanup = 'success'; + return 'unsafe_drift'; + } + } + for (const file of input.artifactFiles) { + await writeCachedFile(replay.workdir, file); + } + const changedPaths = await replay.git.changedPaths(); + if (changedPaths.length > 0) { + await replay.git.commitFiles( + changedPaths, + `ingest: materialize cached WorkUnit ${input.unitKey}`, + input.author.name, + input.author.email, + ); + } + await replay.git.writeBinaryNoRenamePatch(input.baseSha, 'HEAD', input.patchPath); + await input.trace.event('debug', 'work_unit', 'work_unit_cache_patch_materialized', { + unitKey: input.unitKey, + patchPath: input.patchPath, + touchedPaths: changedPaths, + }); + cleanup = 'success'; + return 'materialized'; + } finally { + await input.sessionWorktreeService.cleanup(replay, cleanup); + } +} + +export function ingestWorkUnitCacheScopeKey(input: { connectionId: string; sourceKey: string }): string { + return `${input.connectionId}:${input.sourceKey}`; 
+} + +export function computeIngestWorkUnitPromptFingerprint(input: { + cliVersion: string; + baseFraming: string; + skillsPrompt: string; + canonicalPins: unknown[]; + sourceKey: string; + connectionId: string; + skillNames: string[]; +}): string { + return stableContentHash({ + schemaVersion: 1, + cliVersion: input.cliVersion, + baseFraming: input.baseFraming, + skillsPrompt: input.skillsPrompt, + canonicalPins: input.canonicalPins, + sourceKey: input.sourceKey, + connectionId: input.connectionId, + skillNames: [...input.skillNames].sort(), + }); +} diff --git a/packages/cli/src/context/scan/enrichment-state.ts b/packages/cli/src/context/scan/enrichment-state.ts index 40975003..84b6224d 100644 --- a/packages/cli/src/context/scan/enrichment-state.ts +++ b/packages/cli/src/context/scan/enrichment-state.ts @@ -1,4 +1,4 @@ -import { createHash } from 'node:crypto'; +import { stableContentHash } from '../cache/content-result-cache.js'; import type { KtxScanRelationshipConfig } from '../project/config.js'; import type { KtxScanEnrichmentStage, KtxScanEnrichmentStateSummary, KtxScanMode, KtxSchemaSnapshot } from './types.js'; @@ -99,29 +99,12 @@ export interface KtxRelationshipsStageHashInput { llmIdentity: KtxScanLlmIdentity; } -function stableJson(value: unknown): string { - if (Array.isArray(value)) { - return `[${value.map(stableJson).join(',')}]`; - } - if (value && typeof value === 'object') { - const entries = Object.entries(value as Record).sort(([left], [right]) => - left.localeCompare(right), - ); - return `{${entries.map(([key, item]) => `${JSON.stringify(key)}:${stableJson(item)}`).join(',')}}`; - } - return JSON.stringify(value); -} - -function sha256(value: unknown): string { - return createHash('sha256').update(stableJson(value)).digest('hex'); -} - export function computeKtxDescriptionsStageHash(input: KtxDescriptionsStageHashInput): string { - return sha256({ snapshot: input.snapshot, llmIdentity: input.llmIdentity }); + return stableContentHash({ 
snapshot: input.snapshot, llmIdentity: input.llmIdentity }); } export function computeKtxEmbeddingsStageHash(input: KtxEmbeddingsStageHashInput): string { - return sha256({ + return stableContentHash({ snapshot: input.snapshot, embeddingIdentity: input.embeddingIdentity, descriptionDigest: input.descriptionDigest, @@ -129,7 +112,7 @@ export function computeKtxEmbeddingsStageHash(input: KtxEmbeddingsStageHashInput } export function computeKtxRelationshipsStageHash(input: KtxRelationshipsStageHashInput): string { - return sha256({ + return stableContentHash({ snapshot: input.snapshot, relationshipSettings: input.relationshipSettings, llmIdentity: input.llmIdentity, @@ -143,7 +126,7 @@ export function computeKtxRelationshipsStageHash(input: KtxRelationshipsStageHas * that depend on the changed text (D4 self-healing). */ export function computeKtxScanDescriptionDigest(texts: readonly string[]): string { - return sha256(texts); + return stableContentHash(texts); } function uniqueStages(stages: KtxScanEnrichmentStage[]): KtxScanEnrichmentStage[] { diff --git a/packages/cli/src/context/scan/local-enrichment.ts b/packages/cli/src/context/scan/local-enrichment.ts index f391a6c2..8b4da128 100644 --- a/packages/cli/src/context/scan/local-enrichment.ts +++ b/packages/cli/src/context/scan/local-enrichment.ts @@ -809,17 +809,21 @@ export async function runLocalScanEnrichment( let relationshipPartial: { reason: KtxRelationshipDetectionStopReason } | null = null; let relationships: KtxScanRelationshipSummary = { accepted: 0, review: 0, rejected: 0, skipped: 0 }; - // Promote the paid descriptions + embeddings to the queryable layer at the - // cost boundary, before the slow, kill-prone relationship stage — so an - // interrupted relationship stage degrades to "no joins," never "no descriptions." 
- if (shouldDetectRelationships && summary.tableDescriptions === 'completed' && input.onCheckpoint) { + // Promote any non-relationship stage that ran this invocation (descriptions or + // a `--stages embeddings,relationships` re-embed) before the slow, kill-prone + // relationship stage, so an interruption degrades to "no joins," never lost + // enrichment. descriptionUpdates uses the best-available set (D3): the manifest + // merge overwrites scan-managed descriptions, so the empty this-invocation set + // would delete prior on-disk ones. + const checkpointablePaidWork = summary.tableDescriptions === 'completed' || summary.embeddings === 'completed'; + if (shouldDetectRelationships && checkpointablePaidWork && input.onCheckpoint) { await input.onCheckpoint({ snapshot, summary: { ...summary }, relationships, state: summarizeKtxScanEnrichmentState(state), warnings: [...warnings], - descriptionUpdates: descriptions, + descriptionUpdates: await resolveDownstreamDescriptions(), embeddingUpdates, relationshipUpdate: null, relationshipProfile: null, diff --git a/packages/cli/src/context/scan/sqlite-local-enrichment-state-store.ts b/packages/cli/src/context/scan/sqlite-local-enrichment-state-store.ts index 50f649d4..e80c1e57 100644 --- a/packages/cli/src/context/scan/sqlite-local-enrichment-state-store.ts +++ b/packages/cli/src/context/scan/sqlite-local-enrichment-state-store.ts @@ -1,6 +1,5 @@ -import { mkdirSync } from 'node:fs'; -import { dirname } from 'node:path'; -import Database from 'better-sqlite3'; +import type { ContentResultCache, ContentResultCacheRecord } from '../cache/content-result-cache.js'; +import { SqliteContentResultCache } from '../cache/sqlite-content-result-cache.js'; import type { KtxScanEnrichmentCompletedStage, KtxScanEnrichmentFailedStage, @@ -8,281 +7,142 @@ import type { KtxScanEnrichmentStageRecord, KtxScanEnrichmentStateStore, } from './enrichment-state.js'; +import { KTX_SCAN_ENRICHMENT_STAGES } from './enrichment-state.js'; +import { 
KTX_SCAN_MODES } from './types.js'; import type { KtxScanEnrichmentStage, KtxScanMode } from './types.js'; export interface SqliteLocalScanEnrichmentStateStoreOptions { dbPath: string; + cache?: ContentResultCache; } -interface StageRow { - run_id: string; - connection_id: string; - sync_id: string; +interface ScanStageMetadata { + connectionId: string; + syncId: string; mode: KtxScanMode; stage: KtxScanEnrichmentStage; - input_hash: string; - status: 'completed' | 'failed'; - output_json: string | null; - error_message: string | null; - updated_at: string; } -function parseStageRow(row: StageRow): KtxScanEnrichmentStageRecord { - if (row.status === 'completed') { - return { - runId: row.run_id, - connectionId: row.connection_id, - syncId: row.sync_id, - mode: row.mode, - stage: row.stage, - inputHash: row.input_hash, - status: 'completed', - output: JSON.parse(row.output_json ?? 'null') as TOutput, - errorMessage: null, - updatedAt: row.updated_at, - }; - } +function namespace(stage: KtxScanEnrichmentStage): string { + return `scan:${stage}`; +} +function metadataFor(input: { + connectionId: string; + syncId: string; + mode: KtxScanMode; + stage: KtxScanEnrichmentStage; +}): Record { return { - runId: row.run_id, - connectionId: row.connection_id, - syncId: row.sync_id, - mode: row.mode, - stage: row.stage, - inputHash: row.input_hash, - status: 'failed', - output: null, - errorMessage: row.error_message ?? 
'Unknown enrichment stage failure', - updatedAt: row.updated_at, + connectionId: input.connectionId, + syncId: input.syncId, + mode: input.mode, + stage: input.stage, }; } -function isSafeRunId(runId: string): boolean { - return /^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/.test(runId); +function isScanMode(value: unknown): value is KtxScanMode { + return typeof value === 'string' && (KTX_SCAN_MODES as readonly string[]).includes(value); } -const STAGES_TABLE = 'local_scan_enrichment_stages'; -const STAGES_PRIMARY_KEY = ['connection_id', 'stage', 'input_hash'] as const; +function isScanEnrichmentStage(value: unknown): value is KtxScanEnrichmentStage { + return typeof value === 'string' && (KTX_SCAN_ENRICHMENT_STAGES as readonly string[]).includes(value); +} + +function parseMetadata(record: ContentResultCacheRecord): ScanStageMetadata { + const { connectionId, syncId, mode, stage } = record.metadata as Partial; + if (typeof connectionId !== 'string' || typeof syncId !== 'string' || !isScanMode(mode) || !isScanEnrichmentStage(stage)) { + throw new Error(`Invalid scan enrichment cache metadata for ${record.namespace}/${record.scopeKey}`); + } + return { connectionId, syncId, mode, stage }; +} + +function toScanRecord(record: ContentResultCacheRecord): KtxScanEnrichmentStageRecord { + const metadata = parseMetadata(record); + const base = { + runId: record.runId, + connectionId: metadata.connectionId, + syncId: metadata.syncId, + mode: metadata.mode, + stage: metadata.stage, + inputHash: record.inputHash, + updatedAt: record.updatedAt, + }; + if (record.status === 'completed') { + return { + ...base, + status: 'completed', + output: record.output, + errorMessage: null, + }; + } + return { + ...base, + status: 'failed', + output: null, + errorMessage: record.errorMessage, + }; +} export class SqliteLocalScanEnrichmentStateStore implements KtxScanEnrichmentStateStore { - private readonly db: Database.Database; + private readonly cache: ContentResultCache; constructor(options: 
SqliteLocalScanEnrichmentStateStoreOptions) { - mkdirSync(dirname(options.dbPath), { recursive: true }); - this.db = new Database(options.dbPath); - this.db.pragma('journal_mode = WAL'); - // Disposable local resume cache: if a prior ktx wrote the table with a - // different primary key, drop it rather than migrate. Losing it only means - // one ingest cannot resume; it never corrupts a queryable artifact. - this.dropStagesTableIfPrimaryKeyDiffers(); - this.db.exec(` - CREATE TABLE IF NOT EXISTS local_scan_enrichment_stages ( - run_id TEXT NOT NULL, - stage TEXT NOT NULL, - input_hash TEXT NOT NULL, - connection_id TEXT NOT NULL, - sync_id TEXT NOT NULL, - mode TEXT NOT NULL, - status TEXT NOT NULL, - output_json TEXT, - error_message TEXT, - updated_at TEXT NOT NULL, - PRIMARY KEY (connection_id, stage, input_hash) - ); - - CREATE INDEX IF NOT EXISTS local_scan_enrichment_stages_content_idx - ON local_scan_enrichment_stages (connection_id, stage, input_hash, updated_at); - CREATE INDEX IF NOT EXISTS local_scan_enrichment_stages_run_idx - ON local_scan_enrichment_stages (run_id, updated_at, stage); - `); - } - - private dropStagesTableIfPrimaryKeyDiffers(): void { - const columns = this.db.prepare(`PRAGMA table_info(${STAGES_TABLE})`).all() as Array<{ - name: string; - pk: number; - }>; - if (columns.length === 0) { - return; - } - const primaryKey = columns - .filter((column) => column.pk > 0) - .sort((left, right) => left.pk - right.pk) - .map((column) => column.name); - const matches = - primaryKey.length === STAGES_PRIMARY_KEY.length && - primaryKey.every((name, index) => name === STAGES_PRIMARY_KEY[index]); - if (!matches) { - this.db.exec(`DROP TABLE ${STAGES_TABLE}`); - } + this.cache = options.cache ?? new SqliteContentResultCache({ dbPath: options.dbPath }); } async findCompletedStage( input: KtxScanEnrichmentStageLookup, ): Promise | null> { - const row = this.db - .prepare( - ` - SELECT * - FROM local_scan_enrichment_stages - WHERE connection_id = ? 
- AND stage = ? - AND input_hash = ? - AND status = 'completed' - ORDER BY updated_at DESC - LIMIT 1 - `, - ) - .get(input.connectionId, input.stage, input.inputHash) as StageRow | undefined; - - if (!row) { - return null; - } - const parsed = parseStageRow(row); - return parsed.status === 'completed' ? parsed : null; + const record = await this.cache.findCompletedResult({ + namespace: namespace(input.stage), + scopeKey: input.connectionId, + inputHash: input.inputHash, + }); + return record ? (toScanRecord(record) as KtxScanEnrichmentCompletedStage) : null; } async findLatestCompletedStage(input: { connectionId: string; stage: KtxScanEnrichmentStage; }): Promise { - const row = this.db - .prepare( - ` - SELECT * - FROM local_scan_enrichment_stages - WHERE connection_id = ? - AND stage = ? - AND status = 'completed' - ORDER BY updated_at DESC - LIMIT 1 - `, - ) - .get(input.connectionId, input.stage) as StageRow | undefined; - - if (!row) { - return null; - } - const parsed = parseStageRow(row); - return parsed.status === 'completed' ? parsed : null; + const record = await this.cache.findLatestCompletedResult({ + namespace: namespace(input.stage), + scopeKey: input.connectionId, + }); + return record ? 
(toScanRecord(record) as KtxScanEnrichmentCompletedStage) : null; } async saveCompletedStage( input: Omit, 'status' | 'errorMessage'>, ): Promise { - this.db - .prepare( - ` - INSERT INTO local_scan_enrichment_stages ( - run_id, - stage, - input_hash, - connection_id, - sync_id, - mode, - status, - output_json, - error_message, - updated_at - ) - VALUES ( - @runId, - @stage, - @inputHash, - @connectionId, - @syncId, - @mode, - 'completed', - @outputJson, - NULL, - @updatedAt - ) - ON CONFLICT(connection_id, stage, input_hash) DO UPDATE SET - run_id = excluded.run_id, - sync_id = excluded.sync_id, - mode = excluded.mode, - status = excluded.status, - output_json = excluded.output_json, - error_message = excluded.error_message, - updated_at = excluded.updated_at - `, - ) - .run({ - runId: input.runId, - stage: input.stage, - inputHash: input.inputHash, - connectionId: input.connectionId, - syncId: input.syncId, - mode: input.mode, - outputJson: JSON.stringify(input.output), - updatedAt: input.updatedAt, - }); + await this.cache.saveCompletedResult({ + runId: input.runId, + namespace: namespace(input.stage), + scopeKey: input.connectionId, + inputHash: input.inputHash, + output: input.output, + metadata: metadataFor(input), + updatedAt: input.updatedAt, + }); } async saveFailedStage(input: Omit): Promise { - this.db - .prepare( - ` - INSERT INTO local_scan_enrichment_stages ( - run_id, - stage, - input_hash, - connection_id, - sync_id, - mode, - status, - output_json, - error_message, - updated_at - ) - VALUES ( - @runId, - @stage, - @inputHash, - @connectionId, - @syncId, - @mode, - 'failed', - NULL, - @errorMessage, - @updatedAt - ) - ON CONFLICT(connection_id, stage, input_hash) DO UPDATE SET - run_id = excluded.run_id, - sync_id = excluded.sync_id, - mode = excluded.mode, - status = excluded.status, - output_json = excluded.output_json, - error_message = excluded.error_message, - updated_at = excluded.updated_at - `, - ) - .run({ - runId: input.runId, - stage: 
input.stage, - inputHash: input.inputHash, - connectionId: input.connectionId, - syncId: input.syncId, - mode: input.mode, - errorMessage: input.errorMessage, - updatedAt: input.updatedAt, - }); + await this.cache.saveFailedResult({ + runId: input.runId, + namespace: namespace(input.stage), + scopeKey: input.connectionId, + inputHash: input.inputHash, + errorMessage: input.errorMessage, + metadata: metadataFor(input), + updatedAt: input.updatedAt, + }); } async listRunStages(runId: string): Promise { - if (!isSafeRunId(runId)) { - return []; - } - const rows = this.db - .prepare( - ` - SELECT * - FROM local_scan_enrichment_stages - WHERE run_id = ? - ORDER BY updated_at ASC, stage ASC - `, - ) - .all(runId) as StageRow[]; - return rows.map((row) => parseStageRow(row)); + const records = await this.cache.listRunResults(runId); + return records + .filter((record) => record.namespace.startsWith('scan:')) + .map((record) => toScanRecord(record)); } } diff --git a/packages/cli/src/context/scan/types.ts b/packages/cli/src/context/scan/types.ts index 0d269c37..9c6010c5 100644 --- a/packages/cli/src/context/scan/types.ts +++ b/packages/cli/src/context/scan/types.ts @@ -10,7 +10,9 @@ export type KtxConnectionDriver = | 'clickhouse' | 'mongodb'; -export type KtxScanMode = 'structural' | 'relationships' | 'enriched'; +/** Canonical scan-mode registry. Runtime validation derives its allowlist here. 
*/ +export const KTX_SCAN_MODES = ['structural', 'relationships', 'enriched'] as const; +export type KtxScanMode = (typeof KTX_SCAN_MODES)[number]; export type KtxScanTrigger = 'cli' | 'mcp' | 'schema_scan' | 'scheduled' | 'manual'; diff --git a/packages/cli/src/context/sl/source-files.ts b/packages/cli/src/context/sl/source-files.ts index 02a5cd48..0c0e2c1d 100644 --- a/packages/cli/src/context/sl/source-files.ts +++ b/packages/cli/src/context/sl/source-files.ts @@ -135,6 +135,27 @@ export function slDeclaredSourceName(content: string): string | null { return typeof name === 'string' && name.length > 0 ? name : null; } +/** + * Every standalone/overlay source file for a connection (excludes the `_schema/` + * manifest). The one listing entry points share so a file is visible to all. + */ +export async function listSlSourceFiles( + fileStore: Pick, + connectionId: string, +): Promise { + const dir = `semantic-layer/${assertSafeConnectionId(connectionId)}`; + const schemaDir = `${dir}/_schema`; + const listed = await fileStore.listFiles(dir); + const paths = listed.files.filter((file) => isSlYamlPath(file) && !file.startsWith(`${schemaDir}/`)).sort(); + + const files: SlSourceFile[] = []; + for (const path of paths) { + const raw = await fileStore.readFile(path); + files.push({ path, content: raw.content }); + } + return files; +} + /** * Find the standalone/overlay file that defines `sourceName` for a connection. 
* Returns null when no file declares the name (the source may still exist as a @@ -147,18 +168,9 @@ export async function resolveSlSourceFile( connectionId: string, sourceName: string, ): Promise { - const dir = `semantic-layer/${assertSafeConnectionId(connectionId)}`; - const schemaDir = `${dir}/_schema`; - const listed = await fileStore.listFiles(dir); - const paths = listed.files.filter((file) => isSlYamlPath(file) && !file.startsWith(`${schemaDir}/`)).sort(); - - const matches: SlSourceFile[] = []; - for (const path of paths) { - const raw = await fileStore.readFile(path); - if (slSourceNameForFile(path, raw.content) === sourceName) { - matches.push({ path, content: raw.content }); - } - } + const matches = (await listSlSourceFiles(fileStore, connectionId)).filter( + (file) => slSourceNameForFile(file.path, file.content) === sourceName, + ); if (matches.length > 1) { throw new Error( `Multiple semantic-layer files declare source "${sourceName}": ${matches.map((match) => match.path).join(', ')}`, diff --git a/packages/cli/test/context/cache/content-result-cache.test.ts b/packages/cli/test/context/cache/content-result-cache.test.ts new file mode 100644 index 00000000..ca85c072 --- /dev/null +++ b/packages/cli/test/context/cache/content-result-cache.test.ts @@ -0,0 +1,145 @@ +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import Database from 'better-sqlite3'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { stableContentHash } from '../../../src/context/cache/content-result-cache.js'; +import { SqliteContentResultCache } from '../../../src/context/cache/sqlite-content-result-cache.js'; + +describe('content result cache', () => { + let tempDir: string; + let cache: SqliteContentResultCache; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-content-result-cache-')); + cache = new SqliteContentResultCache({ dbPath: join(tempDir, 
'db.sqlite') }); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('computes stable hashes independent of object key order', () => { + const first = stableContentHash({ b: ['two', { z: 1, a: true }], a: 'one' }); + const second = stableContentHash({ a: 'one', b: ['two', { a: true, z: 1 }] }); + + expect(first).toMatch(/^[a-f0-9]{64}$/); + expect(second).toBe(first); + }); + + it('persists completed results by namespace, scope, and input hash', async () => { + await cache.saveCompletedResult({ + runId: 'run-1', + namespace: 'scan:descriptions', + scopeKey: 'warehouse', + inputHash: 'hash-1', + output: { rows: 3 }, + metadata: { syncId: 'sync-1', mode: 'enriched' }, + updatedAt: '2026-06-25T10:00:00.000Z', + }); + + await expect( + cache.findCompletedResult<{ rows: number }>({ + namespace: 'scan:descriptions', + scopeKey: 'warehouse', + inputHash: 'hash-1', + }), + ).resolves.toMatchObject({ + runId: 'run-1', + namespace: 'scan:descriptions', + scopeKey: 'warehouse', + inputHash: 'hash-1', + status: 'completed', + output: { rows: 3 }, + metadata: { syncId: 'sync-1', mode: 'enriched' }, + }); + + await expect( + cache.findCompletedResult({ + namespace: 'scan:descriptions', + scopeKey: 'warehouse', + inputHash: 'hash-2', + }), + ).resolves.toBeNull(); + }); + + it('records failed results without making them reusable', async () => { + await cache.saveFailedResult({ + runId: 'run-2', + namespace: 'scan:embeddings', + scopeKey: 'warehouse', + inputHash: 'hash-2', + errorMessage: 'embedding service timed out', + metadata: { syncId: 'sync-2', mode: 'enriched' }, + updatedAt: '2026-06-25T10:01:00.000Z', + }); + + await expect( + cache.findCompletedResult({ + namespace: 'scan:embeddings', + scopeKey: 'warehouse', + inputHash: 'hash-2', + }), + ).resolves.toBeNull(); + + await expect(cache.listRunResults('run-2')).resolves.toEqual([ + expect.objectContaining({ + runId: 'run-2', + namespace: 'scan:embeddings', + status: 
'failed', + errorMessage: 'embedding service timed out', + }), + ]); + }); + + it('drops the obsolete scan-specific cache table when opening the shared cache', async () => { + const dbPath = join(tempDir, 'legacy.sqlite'); + const legacy = new Database(dbPath); + legacy.exec(` + CREATE TABLE local_scan_enrichment_stages ( + run_id TEXT NOT NULL, + stage TEXT NOT NULL, + input_hash TEXT NOT NULL, + connection_id TEXT NOT NULL, + sync_id TEXT NOT NULL, + mode TEXT NOT NULL, + status TEXT NOT NULL, + output_json TEXT, + error_message TEXT, + updated_at TEXT NOT NULL, + PRIMARY KEY (connection_id, stage, input_hash) + ); + INSERT INTO local_scan_enrichment_stages + VALUES ('old-run', 'descriptions', 'hash', 'warehouse', 'sync', 'enriched', 'completed', 'null', NULL, '2026-01-01T00:00:00.000Z'); + `); + legacy.close(); + + const reopened = new SqliteContentResultCache({ dbPath }); + await reopened.saveCompletedResult({ + runId: 'new-run', + namespace: 'scan:descriptions', + scopeKey: 'warehouse', + inputHash: 'hash', + output: ['fresh'], + metadata: { syncId: 'sync', mode: 'enriched' }, + updatedAt: '2026-06-25T10:02:00.000Z', + }); + + const db = new Database(dbPath, { readonly: true }); + const legacyRow = db + .prepare( + "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'local_scan_enrichment_stages'", + ) + .get(); + db.close(); + + expect(legacyRow).toBeUndefined(); + await expect( + reopened.findCompletedResult({ + namespace: 'scan:descriptions', + scopeKey: 'warehouse', + inputHash: 'hash', + }), + ).resolves.toMatchObject({ runId: 'new-run', output: ['fresh'] }); + }); +}); diff --git a/packages/cli/test/context/ingest/artifact-gates.test.ts b/packages/cli/test/context/ingest/artifact-gates.test.ts index 491a7feb..c81f1e06 100644 --- a/packages/cli/test/context/ingest/artifact-gates.test.ts +++ b/packages/cli/test/context/ingest/artifact-gates.test.ts @@ -59,7 +59,15 @@ describe('artifact gates', () => { validateTouchedSources: async () => ({ 
invalidSources: [], validSources: ['mart_account_segments'] }), tableExists: async () => true, }), - ).rejects.toThrow(/unknown semantic-layer entity mart_account_segments\.total_contract_arr_cents/); + ).resolves.toMatchObject({ + ok: false, + findings: [ + { + kind: 'missing_wiki_body_sl_entity', + message: 'account-segments: unknown semantic-layer entity mart_account_segments.total_contract_arr_cents', + }, + ], + }); }); it('fails before provenance insertion when a raw path cannot be tied to the current snapshot or eviction set', () => { @@ -118,7 +126,15 @@ describe('artifact gates', () => { validateTouchedSources: async () => ({ invalidSources: [], validSources: ['warehouse:mart_account_segments'] }), tableExists: async () => true, }), - ).rejects.toThrow(/unknown sl_refs entity mart_account_segments\.total_contract_arr_cents/); + ).resolves.toMatchObject({ + ok: false, + findings: [ + { + kind: 'missing_wiki_sl_ref', + message: 'account-segments: unknown sl_refs entity mart_account_segments.total_contract_arr_cents', + }, + ], + }); }); it('passes touched sources to the shared validation path and surfaces its reasons', async () => { @@ -148,9 +164,17 @@ describe('artifact gates', () => { validateTouchedSources, tableExists: async () => true, }), - ).rejects.toThrow( - /semantic-layer validation failed for warehouse:mart_account_segments: join target "accounts" does not exist/, - ); + ).resolves.toMatchObject({ + ok: false, + findings: [ + { + kind: 'invalid_source', + connectionId: 'warehouse', + sourceName: 'mart_account_segments', + errors: ['join target "accounts" does not exist'], + }, + ], + }); expect(validateTouchedSources).toHaveBeenCalledWith([ { connectionId: 'warehouse', sourceName: 'mart_account_segments' }, @@ -178,8 +202,87 @@ describe('artifact gates', () => { validateTouchedSources: async () => ({ invalidSources: [], validSources: [] }), tableExists: async () => true, }), - ).rejects.toThrow( - /wiki references target missing page\(s\): 
account-segments -> missing-frontmatter-page, account-segments -> missing-inline-page/, - ); + ).resolves.toMatchObject({ + ok: false, + findings: [ + { kind: 'missing_wiki_ref', targetPageKey: 'missing-frontmatter-page' }, + { kind: 'missing_wiki_ref', targetPageKey: 'missing-inline-page' }, + ], + }); + }); + + it('returns structured final gate findings instead of throwing', async () => { + const wikiService = wikiServiceWithPages({ + 'account-segments': { + refs: ['missing-page'], + slRefs: ['missing_source'], + content: 'Revenue depends on `source:missing_source`.', + }, + }); + const semanticLayerService = { + loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }), + }; + + const result = await validateFinalIngestArtifacts({ + connectionIds: ['warehouse'], + changedWikiPageKeys: ['account-segments'], + touchedSlSources: [{ connectionId: 'warehouse', sourceName: 'orders' }], + wikiService: wikiService as never, + semanticLayerService: semanticLayerService as never, + validateTouchedSources: async () => ({ + validSources: [], + invalidSources: [ + { + source: 'warehouse:orders', + errors: ['dry run failed', 'join target "customers" does not exist'], + issues: [ + { kind: 'source_validation', message: 'dry run failed' }, + { + kind: 'missing_join_target', + targetSourceName: 'customers', + caseMismatch: null, + message: 'join target "customers" does not exist', + }, + ], + }, + ], + }), + tableExists: async () => true, + }); + + expect(result).toEqual({ + ok: false, + findings: [ + { kind: 'invalid_source', connectionId: 'warehouse', sourceName: 'orders', errors: ['dry run failed'] }, + { + kind: 'missing_join_target', + ownerConnectionId: 'warehouse', + ownerSourceName: 'orders', + targetSourceName: 'customers', + message: 'join target "customers" does not exist', + }, + { + kind: 'missing_wiki_sl_ref', + pageKey: 'account-segments', + ref: 'missing_source', + sourceName: 'missing_source', + entityName: null, + message: 'account-segments: 
unknown sl_refs entry missing_source', + }, + { + kind: 'missing_wiki_ref', + pageKey: 'account-segments', + targetPageKey: 'missing-page', + message: 'account-segments -> missing-page', + }, + { + kind: 'missing_wiki_body_sl_source', + pageKey: 'account-segments', + rawToken: 'source:missing_source', + sourceName: 'missing_source', + message: 'account-segments: unknown semantic-layer source missing_source', + }, + ], + }); }); }); diff --git a/packages/cli/test/context/ingest/final-gate-prune.test.ts b/packages/cli/test/context/ingest/final-gate-prune.test.ts new file mode 100644 index 00000000..e9e37cf2 --- /dev/null +++ b/packages/cli/test/context/ingest/final-gate-prune.test.ts @@ -0,0 +1,294 @@ +import { mkdir, mkdtemp, readFile, rm, unlink, writeFile } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import type { KtxFileStorePort } from '../../../src/context/core/file-store.js'; +import { pruneFinalGateFindings } from '../../../src/context/ingest/final-gate-prune.js'; +import { slSourceFilePath } from '../../../src/context/sl/source-files.js'; +import { KnowledgeWikiService } from '../../../src/context/wiki/knowledge-wiki.service.js'; + +describe('final gate prune', () => { + let tempDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-final-gate-prune-')); + await mkdir(join(tempDir, 'semantic-layer/warehouse'), { recursive: true }); + await mkdir(join(tempDir, 'wiki/global'), { recursive: true }); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + function tempFileStore(): KtxFileStorePort { + const absolute = (path: string) => join(tempDir, path); + const walk = async (root: string): Promise => { + const { readdir, stat } = await import('node:fs/promises'); + const entries = await readdir(root).catch((error: NodeJS.ErrnoException) => { + if 
(error.code === 'ENOENT') { + return []; + } + throw error; + }); + const files: string[] = []; + for (const entry of entries) { + const path = join(root, entry); + const info = await stat(path); + if (info.isDirectory()) { + files.push(...(await walk(path))); + } else { + files.push(path); + } + } + return files; + }; + + return { + writeFile: async (path, content) => { + await mkdir(dirname(absolute(path)), { recursive: true }); + await writeFile(absolute(path), content, 'utf-8'); + return { success: true, commitHash: null, path }; + }, + readFile: async (path) => ({ content: await readFile(absolute(path), 'utf-8') }), + deleteFile: async (path) => { + await unlink(absolute(path)).catch((error: NodeJS.ErrnoException) => { + if (error.code !== 'ENOENT') { + throw error; + } + }); + return { success: true, commitHash: null, path }; + }, + listFiles: async (path) => { + const root = absolute(path); + const files = await walk(root); + return { files: files.map((file) => file.slice(tempDir.length + 1).replaceAll('\\', '/')).sort() }; + }, + getFileHistory: vi.fn(), + forWorktree: vi.fn(), + }; + } + + it('drops invalid sources and prunes dangling joins from surviving sources', async () => { + await writeFile( + join(tempDir, 'semantic-layer/warehouse/orders.yaml'), + 'name: orders\ngrain: [id]\ncolumns: [{name: id, type: number}]\njoins:\n - to: missing_customers\n on: orders.customer_id = missing_customers.id\nmeasures: []\n', + 'utf-8', + ); + await writeFile( + join(tempDir, 'semantic-layer/warehouse/bad.yaml'), + 'name: bad\ngrain: [id]\ncolumns: [{name: id, type: number}]\njoins: []\nmeasures: []\n', + 'utf-8', + ); + + const result = await pruneFinalGateFindings({ + workdir: tempDir, + semanticLayerFiles: tempFileStore(), + findings: [ + { kind: 'invalid_source', connectionId: 'warehouse', sourceName: 'bad', errors: ['dry run failed'] }, + { + kind: 'missing_join_target', + ownerConnectionId: 'warehouse', + ownerSourceName: 'orders', + targetSourceName: 
'missing_customers', + message: 'join target "missing_customers" does not exist', + }, + ], + droppedSources: [], + trace: { event: vi.fn() } as never, + author: { name: 'ktx Test', email: 'system@ktx.local' }, + }); + + await expect(readFile(join(tempDir, 'semantic-layer/warehouse/bad.yaml'), 'utf-8')).rejects.toThrow(); + await expect(readFile(join(tempDir, 'semantic-layer/warehouse/orders.yaml'), 'utf-8')).resolves.not.toContain( + 'missing_customers', + ); + expect(result.droppedSources).toEqual([ + { connectionId: 'warehouse', sourceName: 'bad', reason: 'dry run failed' }, + ]); + expect(result.prunedReferences).toEqual([ + { + kind: 'join', + artifact: 'semantic-layer/warehouse/orders', + removedRef: 'missing_customers', + absentTarget: 'missing_customers', + }, + ]); + }); + + it('prunes a dangling join from an untouched sibling that points at a dropped source', async () => { + // The gate only flags joins owned by re-ingested (touched) sources, so a + // pre-existing sibling joining to a just-dropped source produces no + // missing_join_target finding. The drop must still prune that edge (D5), + // or the committed orphan join breaks every SL query on the connection. 
+ await writeFile( + join(tempDir, 'semantic-layer/warehouse/orders.yaml'), + 'name: orders\ngrain: [id]\ncolumns: [{name: id, type: number}]\njoins:\n - to: customers\n on: orders.customer_id = customers.id\nmeasures: []\n', + 'utf-8', + ); + await writeFile( + join(tempDir, 'semantic-layer/warehouse/customers.yaml'), + 'name: customers\ngrain: [id]\ncolumns: [{name: id, type: number}]\njoins: []\nmeasures: []\n', + 'utf-8', + ); + + const result = await pruneFinalGateFindings({ + workdir: tempDir, + semanticLayerFiles: tempFileStore(), + findings: [ + { kind: 'invalid_source', connectionId: 'warehouse', sourceName: 'customers', errors: ['dry run failed'] }, + ], + droppedSources: [], + trace: { event: vi.fn() } as never, + author: { name: 'ktx Test', email: 'system@ktx.local' }, + }); + + await expect(readFile(join(tempDir, 'semantic-layer/warehouse/customers.yaml'), 'utf-8')).rejects.toThrow(); + await expect(readFile(join(tempDir, 'semantic-layer/warehouse/orders.yaml'), 'utf-8')).resolves.not.toContain( + 'customers', + ); + expect(result.droppedSources).toEqual([ + { connectionId: 'warehouse', sourceName: 'customers', reason: 'dry run failed' }, + ]); + expect(result.prunedReferences).toEqual([ + { + kind: 'join', + artifact: 'semantic-layer/warehouse/orders', + removedRef: 'customers', + absentTarget: 'customers', + }, + ]); + }); + + it('resolves semantic-layer source files by declared source name before pruning or dropping', async () => { + const ordersPath = slSourceFilePath('warehouse', 'ORDERS'); + const customersPath = slSourceFilePath('warehouse', 'CUSTOMERS'); + await mkdir(dirname(join(tempDir, ordersPath)), { recursive: true }); + await writeFile( + join(tempDir, ordersPath), + [ + 'name: ORDERS', + 'grain: [ORDER_ID]', + 'columns: [{name: ORDER_ID, type: number}, {name: CUSTOMER_ID, type: number}]', + 'joins:', + ' - to: CUSTOMERS', + ' on: ORDERS.CUSTOMER_ID = CUSTOMERS.CUSTOMER_ID', + 'measures: []', + '', + ].join('\n'), + 'utf-8', + ); + await 
writeFile( + join(tempDir, customersPath), + 'name: CUSTOMERS\ngrain: [CUSTOMER_ID]\ncolumns: [{name: CUSTOMER_ID, type: number}]\njoins: []\nmeasures: []\n', + 'utf-8', + ); + + const result = await pruneFinalGateFindings({ + workdir: tempDir, + semanticLayerFiles: tempFileStore(), + findings: [ + { + kind: 'invalid_source', + connectionId: 'warehouse', + sourceName: 'CUSTOMERS', + errors: ['dry run failed'], + }, + { + kind: 'missing_join_target', + ownerConnectionId: 'warehouse', + ownerSourceName: 'ORDERS', + targetSourceName: 'CUSTOMERS', + message: 'join target "CUSTOMERS" does not exist', + }, + ], + droppedSources: [], + trace: { event: vi.fn() } as never, + author: { name: 'ktx Test', email: 'system@ktx.local' }, + }); + + await expect(readFile(join(tempDir, customersPath), 'utf-8')).rejects.toThrow(); + await expect(readFile(join(tempDir, ordersPath), 'utf-8')).resolves.not.toContain('to: CUSTOMERS'); + await expect(readFile(join(tempDir, 'semantic-layer/warehouse/CUSTOMERS.yaml'), 'utf-8')).rejects.toThrow(); + await expect(readFile(join(tempDir, 'semantic-layer/warehouse/ORDERS.yaml'), 'utf-8')).rejects.toThrow(); + expect(result.droppedSources).toEqual([ + { connectionId: 'warehouse', sourceName: 'CUSTOMERS', reason: 'dry run failed' }, + ]); + expect(result.prunedReferences).toEqual([ + { + kind: 'join', + artifact: 'semantic-layer/warehouse/ORDERS', + removedRef: 'CUSTOMERS', + absentTarget: 'CUSTOMERS', + }, + ]); + }); + + it('prunes wiki refs, wiki sl_refs, and body ref tokens from owning pages', async () => { + await writeFile( + join(tempDir, 'wiki/global/revenue.md'), + '---\nsummary: Revenue\nusage_mode: auto\nrefs:\n - missing-page\nsl_refs:\n - missing_source\n---\n\nRevenue uses [[missing-page]], `source:missing_source`, and `orders.missing_measure`.\n', + 'utf-8', + ); + const wikiService = new KnowledgeWikiService( + { + readFile: async (path: string) => ({ content: await readFile(join(tempDir, path), 'utf-8'), hash: 'h' }), + writeFile: 
async (path: string, content: string) => { + await writeFile(join(tempDir, path), content, 'utf-8'); + return { commitHash: 'c', path }; + }, + deleteFile: vi.fn(), + listFiles: vi.fn(), + forWorktree: vi.fn(), + } as never, + { computeEmbedding: vi.fn(), computeEmbeddingsBulk: vi.fn(), maxBatchSize: 1 } as never, + { upsertPage: vi.fn(), deletePage: vi.fn(), listPagesForUser: vi.fn() } as never, + {} as never, + ); + + const result = await pruneFinalGateFindings({ + workdir: tempDir, + semanticLayerFiles: tempFileStore(), + findings: [ + { kind: 'missing_wiki_ref', pageKey: 'revenue', targetPageKey: 'missing-page', message: 'revenue -> missing-page' }, + { + kind: 'missing_wiki_sl_ref', + pageKey: 'revenue', + ref: 'missing_source', + sourceName: 'missing_source', + entityName: null, + message: 'revenue: unknown sl_refs entry missing_source', + }, + { + kind: 'missing_wiki_body_sl_source', + pageKey: 'revenue', + rawToken: 'source:missing_source', + sourceName: 'missing_source', + message: 'revenue: unknown semantic-layer source missing_source', + }, + { + kind: 'missing_wiki_body_sl_entity', + pageKey: 'revenue', + rawToken: 'orders.missing_measure', + sourceName: 'orders', + entityName: 'missing_measure', + message: 'revenue: unknown semantic-layer entity orders.missing_measure', + }, + ], + droppedSources: [], + trace: { event: vi.fn() } as never, + author: { name: 'ktx Test', email: 'system@ktx.local' }, + wikiService, + }); + + const page = await readFile(join(tempDir, 'wiki/global/revenue.md'), 'utf-8'); + expect(page).not.toContain('missing-page'); + expect(page).not.toContain('missing_source'); + expect(page).not.toContain('orders.missing_measure'); + expect(result.prunedReferences.map((ref) => ref.kind)).toEqual([ + 'wiki_ref', + 'wiki_sl_ref', + 'wiki_body_ref', + 'wiki_body_ref', + ]); + }); +}); diff --git a/packages/cli/test/context/ingest/final-gate-repair.test.ts b/packages/cli/test/context/ingest/final-gate-repair.test.ts deleted file mode 100644 
index d711c5bd..00000000 --- a/packages/cli/test/context/ingest/final-gate-repair.test.ts +++ /dev/null @@ -1,187 +0,0 @@ -import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; -import { join } from 'node:path'; -import { describe, expect, it, vi } from 'vitest'; -import { finalGateRepairPaths, repairFinalGateFailure } from '../../../src/context/ingest/final-gate-repair.js'; -import { FileIngestTraceWriter } from '../../../src/context/ingest/ingest-trace.js'; - -async function makeHarness() { - const root = await mkdtemp(join(tmpdir(), 'ktx-final-gate-repair-')); - const workdir = join(root, 'workdir'); - await mkdir(join(workdir, 'wiki/global'), { recursive: true }); - await mkdir(join(workdir, 'semantic-layer/warehouse'), { recursive: true }); - await writeFile( - join(workdir, 'wiki/global/account-segments.md'), - '---\nsummary: Account segments\nusage_mode: auto\n---\n\nARR uses `mart_account_segments.total_contract_arr_cents`.\n', - 'utf-8', - ); - await writeFile( - join(workdir, 'semantic-layer/warehouse/mart_account_segments.yaml'), - 'name: mart_account_segments\ncolumns: [{name: account_id, type: string}]\njoins: []\nmeasures:\n - name: total_contract_arr\n expr: sum(contract_arr)\n', - 'utf-8', - ); - const trace = new FileIngestTraceWriter({ - tracePath: join(root, 'trace.jsonl'), - jobId: 'job-1', - connectionId: 'warehouse', - sourceKey: 'metabase', - runId: 'run-1', - syncId: 'sync-1', - level: 'trace', - }); - return { root, workdir, trace }; -} - -describe('finalGateRepairPaths', () => { - it('derives sorted, deduplicated wiki and semantic-layer file paths', () => { - expect( - finalGateRepairPaths({ - changedWikiPageKeys: ['account-segments', 'overview', 'account-segments'], - touchedSlSourcePaths: [ - 'semantic-layer/warehouse/mart_account_segments.yaml', - 'semantic-layer/warehouse/orders.yaml', - 'semantic-layer/warehouse/orders.yaml', - ], - }), - ).toEqual([ - 
'semantic-layer/warehouse/mart_account_segments.yaml', - 'semantic-layer/warehouse/orders.yaml', - 'wiki/global/account-segments.md', - 'wiki/global/overview.md', - ]); - }); -}); - -describe('repairFinalGateFailure', () => { - it('lets the repair agent read gate errors, edit only allowed files, and verifies the gate', async () => { - const { workdir, trace } = await makeHarness(); - const agentRunner = { - runLoop: vi.fn(async (params: any) => { - const error = await params.toolSet.read_gate_error.execute({}); - expect(error.markdown).toContain('total_contract_arr_cents'); - - const page = await params.toolSet.read_repair_file.execute({ - path: 'wiki/global/account-segments.md', - }); - expect(page.markdown).toContain('total_contract_arr_cents'); - - await expect( - params.toolSet.write_repair_file.execute({ - path: 'wiki/global/other.md', - content: 'not allowed', - }), - ).rejects.toThrow(/repair path not allowed/); - - await params.toolSet.write_repair_file.execute({ - path: 'wiki/global/account-segments.md', - content: page.markdown.replace('total_contract_arr_cents', 'total_contract_arr'), - }); - return { stopReason: 'natural' as const }; - }), - }; - const verify = vi.fn(async () => ({ ok: true as const })); - - const result = await repairFinalGateFailure({ - agentRunner, - workdir, - gateError: - 'final artifact gates failed:\naccount-segments: unknown semantic-layer entity mart_account_segments.total_contract_arr_cents', - allowedPaths: ['wiki/global/account-segments.md'], - trace, - repairKind: 'final_artifact_gate', - verify, - maxAttempts: 1, - stepBudget: 8, - }); - - expect(result).toEqual({ - status: 'repaired', - attempts: 1, - changedPaths: ['wiki/global/account-segments.md'], - }); - expect(verify).toHaveBeenCalledWith(['wiki/global/account-segments.md']); - await expect(readFile(join(workdir, 'wiki/global/account-segments.md'), 'utf-8')).resolves.toContain( - 'total_contract_arr', - ); - await expect(readFile(trace.tracePath, 
'utf-8')).resolves.toContain('gate_repair_repaired'); - expect(agentRunner.runLoop).toHaveBeenCalledWith( - expect.objectContaining({ - modelRole: 'repair', - stepBudget: 8, - telemetryTags: expect.objectContaining({ - operationName: 'ingest-isolated-diff-gate-repair', - repairKind: 'final_artifact_gate', - }), - }), - ); - }); - - it('returns failed when the repair agent edits no allowed file', async () => { - const { workdir, trace } = await makeHarness(); - const verify = vi.fn(async () => ({ ok: true as const })); - const result = await repairFinalGateFailure({ - agentRunner: { runLoop: vi.fn(async () => ({ stopReason: 'natural' as const })) }, - workdir, - gateError: 'final artifact gates failed:\naccount-segments: unknown semantic-layer entity', - allowedPaths: ['wiki/global/account-segments.md'], - trace, - repairKind: 'final_artifact_gate', - verify, - maxAttempts: 1, - stepBudget: 8, - }); - - expect(result).toEqual({ - status: 'failed', - attempts: 1, - reason: 'gate repair completed without editing an allowed path', - }); - expect(verify).not.toHaveBeenCalled(); - await expect(readFile(trace.tracePath, 'utf-8')).resolves.toContain('gate_repair_failed'); - }); - - it('does not report repaired when edits fail gate verification', async () => { - // Regression: the repair agent edited allowed files but left a dangling - // join in place. The old loop reported "repaired" because a file changed; - // success must come from the gate re-check instead. 
- const { workdir, trace } = await makeHarness(); - const agentRunner = { - runLoop: vi.fn(async (params: any) => { - await params.toolSet.write_repair_file.execute({ - path: 'wiki/global/account-segments.md', - content: 'an edit that does not fix the gate\n', - }); - return { stopReason: 'natural' as const }; - }), - }; - const verify = vi - .fn() - .mockResolvedValueOnce({ - ok: false, - reason: 'final artifact gates failed:\nsemantic-layer validation failed for warehouse:accounts', - }) - .mockResolvedValueOnce({ ok: true }); - - const result = await repairFinalGateFailure({ - agentRunner, - workdir, - gateError: 'final artifact gates failed:\nsemantic-layer validation failed for warehouse:accounts', - allowedPaths: ['wiki/global/account-segments.md'], - trace, - repairKind: 'patch_semantic_gate', - verify, - maxAttempts: 2, - stepBudget: 8, - }); - - expect(result).toEqual({ - status: 'repaired', - attempts: 2, - changedPaths: ['wiki/global/account-segments.md'], - }); - expect(verify).toHaveBeenCalledTimes(2); - const secondPrompt = agentRunner.runLoop.mock.calls[1][0].userPrompt as string; - expect(secondPrompt).toContain('semantic-layer validation failed for warehouse:accounts'); - expect(secondPrompt).toContain('Previous attempt did not pass the gate'); - }); -}); diff --git a/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts b/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts index b9485684..bb3313a8 100644 --- a/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts +++ b/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts @@ -1,10 +1,13 @@ import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; +import YAML from 'yaml'; import { describe, expect, it, vi } from 'vitest'; import { GitService } from '../../../src/context/core/git.service.js'; import { 
SessionWorktreeService } from '../../../src/context/core/session-worktree.service.js'; import { LocalGitFileStore } from '../../../src/context/project/local-git-file-store.js'; +import { SqliteContentResultCache } from '../../../src/context/cache/sqlite-content-result-cache.js'; +import { slSourceFilePath } from '../../../src/context/sl/source-files.js'; import { addTouchedSlSource } from '../../../src/context/tools/touched-sl-sources.js'; import { IngestBundleRunner } from '../../../src/context/ingest/ingest-bundle.runner.js'; import type { IngestBundleRunnerDeps } from '../../../src/context/ingest/ports.js'; @@ -46,26 +49,30 @@ function rootOfConfig(configService: unknown, fallback: string): string { return typeof rootDir === 'string' ? rootDir : fallback; } -async function loadSourcesFromRoot(root: string) { - const raw = await readFile(join(root, 'semantic-layer/warehouse/mart_account_segments.yaml'), 'utf-8').catch( - () => '', +async function loadSourcesFromRoot(root: string, connectionId = 'warehouse') { + const dir = join(root, 'semantic-layer', connectionId); + const entries = await readdir(dir).catch(() => []); + const sources = await Promise.all( + entries + .filter((entry) => entry.endsWith('.yaml') || entry.endsWith('.yml')) + .sort() + .map(async (entry) => { + const parsed = YAML.parse(await readFile(join(dir, entry), 'utf-8')) as Record | null; + return parsed && typeof parsed.name === 'string' + ? { + name: parsed.name, + grain: Array.isArray(parsed.grain) ? parsed.grain : [], + columns: Array.isArray(parsed.columns) ? parsed.columns : [], + joins: Array.isArray(parsed.joins) ? parsed.joins : [], + measures: Array.isArray(parsed.measures) ? parsed.measures : [], + segments: Array.isArray(parsed.segments) ? parsed.segments : [], + table: parsed.table, + } + : null; + }), ); - const hasCents = raw.includes('total_contract_arr_cents'); - const hasDollars = raw.includes('total_contract_arr'); return { - sources: - hasCents || hasDollars - ? 
[ - { - name: 'mart_account_segments', - grain: ['account_id'], - columns: [{ name: 'account_id', type: 'string' }], - joins: [], - measures: [{ name: hasCents ? 'total_contract_arr_cents' : 'total_contract_arr', expr: 'sum(contract_arr)' }], - table: 'analytics.mart_account_segments', - }, - ] - : [], + sources: sources.filter((source): source is NonNullable => source !== null), loadErrors: [], }; } @@ -107,6 +114,12 @@ function legacySharedTraceEvent(): string { return ['shared', 'worktree', 'path', 'enabled'].join('_'); } +function workUnitRunLoopCalls(deps: IngestBundleRunnerDeps) { + return vi + .mocked(deps.agentRunner.runLoop) + .mock.calls.filter(([params]: any[]) => params.telemetryTags?.operationName === 'ingest-bundle-wu'); +} + function makeWikiService(root: string) { return { listPageKeys: vi.fn(async (scope: string) => (scope === 'GLOBAL' ? listGlobalWikiPageKeys(root) : [])), @@ -179,7 +192,7 @@ function makeDeps( }; const wikiService = makeWikiService(runtime.configDir); const semanticLayerService: any = { - loadAllSources: vi.fn(async () => loadSourcesFromRoot(runtime.configDir)), + loadAllSources: vi.fn(async (connectionId: string) => loadSourcesFromRoot(runtime.configDir, connectionId)), listFilesForConnection: vi.fn().mockResolvedValue(['mart_account_segments.yaml']), readSourceFile: vi.fn((connectionId: string, sourceName: string) => readSourceFileFromRoot(runtime.configDir, connectionId, sourceName), @@ -187,7 +200,7 @@ function makeDeps( }; semanticLayerService.forWorktree = vi.fn((workdir: string) => ({ ...semanticLayerService, - loadAllSources: vi.fn(async () => loadSourcesFromRoot(workdir)), + loadAllSources: vi.fn(async (connectionId: string) => loadSourcesFromRoot(workdir, connectionId)), listFilesForConnection: vi.fn().mockResolvedValue(['mart_account_segments.yaml']), readSourceFile: vi.fn((connectionId: string, sourceName: string) => readSourceFileFromRoot(workdir, connectionId, sourceName), @@ -203,6 +216,7 @@ function makeDeps( }, 
reports: { create: vi.fn().mockResolvedValue({ id: 'report-1' }), findByJobId: vi.fn().mockResolvedValue(null), markSuperseded: vi.fn() }, canonicalPins: { listPins: vi.fn().mockResolvedValue([]) }, + contentCache: new SqliteContentResultCache({ dbPath: join(runtime.homeDir, 'cache.sqlite') }), registry: { get: vi.fn().mockReturnValue(adapter), register: vi.fn(), has: vi.fn(), list: vi.fn() }, diffSetService: { compute: vi.fn().mockResolvedValue({ added: ['cards/wiki.json', 'cards/source.json'], modified: [], deleted: [], unchanged: [] }), @@ -221,6 +235,7 @@ function makeDeps( }, settings: { memoryIngestionModel: 'test', + cliVersion: '0.0.0-test', probeRowCount: 1, ingestTraceLevel: 'trace', ...settings, @@ -256,10 +271,14 @@ async function mockStageRawFiles( (runner as any).resolveStagedDir = vi.fn().mockResolvedValue(join(runtime.homeDir, 'stage')); (runner as any).stageRawFilesStage1 = vi.fn(async ({ worktreeRoot }: any) => { const rawDir = join(worktreeRoot, 'raw-sources/warehouse', sourceKey, 's'); + const stagedDir = join(runtime.homeDir, 'stage'); await mkdir(rawDir, { recursive: true }); - for (const [rawPath] of hashes) { + for (const [rawPath, rawHash] of hashes) { + await mkdir(join(stagedDir, rawPath.split('/').slice(0, -1).join('/')), { recursive: true }); await mkdir(join(rawDir, rawPath.split('/').slice(0, -1).join('/')), { recursive: true }); - await writeFile(join(rawDir, rawPath), '{}'); + const content = JSON.stringify({ rawHash }); + await writeFile(join(stagedDir, rawPath), content); + await writeFile(join(rawDir, rawPath), content); } return { currentHashes: new Map(hashes), rawDirInWorktree: `raw-sources/warehouse/${sourceKey}/s` }; }); @@ -556,6 +575,600 @@ describe('IngestBundleRunner isolated diff path', () => { } }); + it('replays completed work units on a second identical run without an agent loop', async () => { + const runtime = await makeRealGitRuntime(); + try { + const { deps, adapter } = makeDeps(runtime, 'dbt'); + 
adapter.chunk.mockResolvedValue({ + workUnits: [ + { unitKey: 'orders', rawFiles: ['models/orders.sql'], peerFileIndex: [], dependencyPaths: [] }, + { unitKey: 'customers', rawFiles: ['models/customers.sql'], peerFileIndex: [], dependencyPaths: [] }, + ], + }); + let currentSession: any = null; + deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { + currentSession = toolSession; + return { toRuntimeTools: vi.fn(() => ({})) }; + }); + deps.agentRunner.runLoop = vi.fn(async (params: any) => { + const unitKey = params.telemetryTags.unitKey; + const root = rootOfConfig(currentSession.configService, runtime.configDir); + await mkdir(join(root, 'wiki/global'), { recursive: true }); + await writeFile( + join(root, `wiki/global/${unitKey}.md`), + `---\nsummary: ${unitKey}\nusage_mode: auto\n---\n\n${unitKey}\n`, + 'utf-8', + ); + currentSession.actions.push({ target: 'wiki', type: 'created', key: unitKey, detail: unitKey }); + await currentSession.gitService.commitFiles([`wiki/global/${unitKey}.md`], `wu ${unitKey}`, 'ktx Test', 'system@ktx.local'); + return { stopReason: 'natural' }; + }) as never; + + const runner = new IngestBundleRunner(deps); + await mockStageRawFiles(runner, runtime, [ + ['models/orders.sql', 'orders-hash'], + ['models/customers.sql', 'customers-hash'], + ], 'dbt'); + + await expect( + runner.run({ jobId: 'job-resume-1', connectionId: 'warehouse', sourceKey: 'dbt', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }), + ).resolves.toMatchObject({ failedWorkUnits: [] }); + expect(workUnitRunLoopCalls(deps)).toHaveLength(2); + + await expect( + runner.run({ jobId: 'job-resume-2', connectionId: 'warehouse', sourceKey: 'dbt', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }), + ).resolves.toMatchObject({ failedWorkUnits: [] }); + expect(workUnitRunLoopCalls(deps)).toHaveLength(2); + + const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-resume-2/trace.jsonl'), 
'utf-8'); + expect(trace).toContain('work_unit_cache_hit'); + expect(trace.match(/work_unit_cache_replayed/g)).toHaveLength(2); + } finally { + await rm(runtime.homeDir, { recursive: true, force: true }); + } + }); + + it('recomputes only the changed work unit after an input byte changes', async () => { + const runtime = await makeRealGitRuntime(); + try { + const { deps, adapter } = makeDeps(runtime, 'dbt'); + adapter.chunk.mockResolvedValue({ + workUnits: [ + { unitKey: 'orders', rawFiles: ['models/orders.sql'], peerFileIndex: [], dependencyPaths: [] }, + { unitKey: 'customers', rawFiles: ['models/customers.sql'], peerFileIndex: [], dependencyPaths: [] }, + ], + }); + let currentSession: any = null; + deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { + currentSession = toolSession; + return { toRuntimeTools: vi.fn(() => ({})) }; + }); + deps.agentRunner.runLoop = vi.fn(async (params: any) => { + const unitKey = params.telemetryTags.unitKey; + const root = rootOfConfig(currentSession.configService, runtime.configDir); + await mkdir(join(root, 'wiki/global'), { recursive: true }); + await writeFile( + join(root, `wiki/global/${unitKey}.md`), + `---\nsummary: ${unitKey}\nusage_mode: auto\n---\n\n${unitKey}\n`, + 'utf-8', + ); + currentSession.actions.push({ target: 'wiki', type: 'updated', key: unitKey, detail: unitKey }); + await currentSession.gitService.commitFiles([`wiki/global/${unitKey}.md`], `wu ${unitKey}`, 'ktx Test', 'system@ktx.local'); + return { stopReason: 'natural' }; + }) as never; + + const runner = new IngestBundleRunner(deps); + await mockStageRawFiles(runner, runtime, [ + ['models/orders.sql', 'orders-hash'], + ['models/customers.sql', 'customers-hash'], + ], 'dbt'); + await runner.run({ jobId: 'job-input-1', connectionId: 'warehouse', sourceKey: 'dbt', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }); + + await mockStageRawFiles(runner, runtime, [ + ['models/orders.sql', 'orders-hash-changed'], + 
['models/customers.sql', 'customers-hash'], + ], 'dbt'); + await runner.run({ jobId: 'job-input-2', connectionId: 'warehouse', sourceKey: 'dbt', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }); + + const wuCalls = workUnitRunLoopCalls(deps); + expect(wuCalls).toHaveLength(3); + const secondRunUnitKeys = wuCalls.slice(2).map(([params]: any[]) => params.telemetryTags.unitKey); + expect(secondRunUnitKeys).toEqual(['orders']); + } finally { + await rm(runtime.homeDir, { recursive: true, force: true }); + } + }); + + it('does not cache failed work units and retries them on the next run', async () => { + const runtime = await makeRealGitRuntime(); + try { + const { deps, adapter } = makeDeps(runtime, 'dbt'); + adapter.chunk.mockResolvedValue({ + workUnits: [{ unitKey: 'orders', rawFiles: ['models/orders.sql'], peerFileIndex: [], dependencyPaths: [] }], + }); + let currentSession: any = null; + let attempt = 0; + deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { + currentSession = toolSession; + return { toRuntimeTools: vi.fn(() => ({})) }; + }); + deps.agentRunner.runLoop = vi.fn(async () => { + attempt += 1; + if (attempt === 1) { + return { stopReason: 'error', error: new Error('provider disconnected') }; + } + const root = rootOfConfig(currentSession.configService, runtime.configDir); + await mkdir(join(root, 'wiki/global'), { recursive: true }); + await writeFile(join(root, 'wiki/global/orders.md'), '---\nsummary: orders\nusage_mode: auto\n---\n\norders\n', 'utf-8'); + currentSession.actions.push({ target: 'wiki', type: 'created', key: 'orders', detail: 'orders' }); + await currentSession.gitService.commitFiles(['wiki/global/orders.md'], 'wu orders', 'ktx Test', 'system@ktx.local'); + return { stopReason: 'natural' }; + }) as never; + + const runner = new IngestBundleRunner(deps); + await mockStageRawFiles(runner, runtime, [['models/orders.sql', 'orders-hash']], 'dbt'); + + await expect( + runner.run({ jobId: 
'job-failed-cache-1', connectionId: 'warehouse', sourceKey: 'dbt', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }), + ).resolves.toMatchObject({ failedWorkUnits: ['orders'] }); + await expect( + runner.run({ jobId: 'job-failed-cache-2', connectionId: 'warehouse', sourceKey: 'dbt', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }), + ).resolves.toMatchObject({ failedWorkUnits: [] }); + expect(workUnitRunLoopCalls(deps)).toHaveLength(2); + } finally { + await rm(runtime.homeDir, { recursive: true, force: true }); + } + }); + + it('prunes a missing sibling join, then self-heals from the cached owner patch without rerunning it', async () => { + const runtime = await makeRealGitRuntime(); + try { + const { deps, adapter } = makeDeps(runtime, 'dbt'); + adapter.chunk.mockResolvedValue({ + workUnits: [ + { unitKey: 'orders', rawFiles: ['models/orders.sql'], peerFileIndex: [], dependencyPaths: [] }, + { unitKey: 'customers', rawFiles: ['models/customers.sql'], peerFileIndex: [], dependencyPaths: [] }, + ], + }); + + let currentSession: any = null; + let customersAttempt = 0; + deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { + currentSession = toolSession; + return { toRuntimeTools: vi.fn(() => ({})) }; + }); + deps.agentRunner.runLoop = vi.fn(async (params: any) => { + if (params.telemetryTags?.operationName !== 'ingest-bundle-wu') { + return { stopReason: 'natural' }; + } + const unitKey = params.telemetryTags.unitKey; + const root = rootOfConfig(currentSession.configService, runtime.configDir); + await mkdir(join(root, 'semantic-layer/warehouse'), { recursive: true }); + if (unitKey === 'orders') { + await writeFile( + join(root, 'semantic-layer/warehouse/orders.yaml'), + [ + 'name: orders', + 'grain: [order_id]', + 'columns: [{name: order_id, type: string}, {name: customer_id, type: string}]', + 'joins:', + ' - to: customers', + ' on: orders.customer_id = customers.customer_id', + 'measures: []', + 
'', + ].join('\n'), + 'utf-8', + ); + addTouchedSlSource(currentSession.touchedSlSources, 'warehouse', 'orders'); + currentSession.actions.push({ + target: 'sl', + type: 'created', + key: 'orders', + detail: 'orders with customer join', + targetConnectionId: 'warehouse', + rawPaths: ['models/orders.sql'], + }); + await currentSession.gitService.commitFiles( + ['semantic-layer/warehouse/orders.yaml'], + 'wu orders', + 'ktx Test', + 'system@ktx.local', + ); + return { stopReason: 'natural' }; + } + + customersAttempt += 1; + if (customersAttempt === 1) { + return { stopReason: 'error', error: new Error('provider disconnected') }; + } + await writeFile( + join(root, 'semantic-layer/warehouse/customers.yaml'), + 'name: customers\ngrain: [customer_id]\ncolumns: [{name: customer_id, type: string}]\njoins: []\nmeasures: []\n', + 'utf-8', + ); + addTouchedSlSource(currentSession.touchedSlSources, 'warehouse', 'customers'); + currentSession.actions.push({ + target: 'sl', + type: 'created', + key: 'customers', + detail: 'customers source', + targetConnectionId: 'warehouse', + rawPaths: ['models/customers.sql'], + }); + await currentSession.gitService.commitFiles( + ['semantic-layer/warehouse/customers.yaml'], + 'wu customers', + 'ktx Test', + 'system@ktx.local', + ); + return { stopReason: 'natural' }; + }) as never; + + const runner = new IngestBundleRunner(deps); + await mockStageRawFiles( + runner, + runtime, + [ + ['models/orders.sql', 'orders-hash'], + ['models/customers.sql', 'customers-hash'], + ], + 'dbt', + ); + + const first = await runner.run({ + jobId: 'job-join-prune-1', + connectionId: 'warehouse', + sourceKey: 'dbt', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); + expect(first.commitSha).toBeTruthy(); + expect(first.failedWorkUnits).toEqual(['customers']); + expect(first.finalGatePrunedReferences).toContainEqual({ + kind: 'join', + artifact: 'semantic-layer/warehouse/orders', + removedRef: 'customers', + absentTarget: 
'customers', + }); + await expect(readFile(join(runtime.configDir, 'semantic-layer/warehouse/orders.yaml'), 'utf-8')).resolves.not.toContain( + 'to: customers', + ); + + const second = await runner.run({ + jobId: 'job-join-prune-2', + connectionId: 'warehouse', + sourceKey: 'dbt', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); + expect(second.failedWorkUnits).toEqual([]); + expect(second.finalGatePrunedReferences ?? []).toEqual([]); + expect(workUnitRunLoopCalls(deps).map(([params]: any[]) => params.telemetryTags.unitKey)).toEqual([ + 'orders', + 'customers', + 'customers', + ]); + await expect(readFile(join(runtime.configDir, 'semantic-layer/warehouse/orders.yaml'), 'utf-8')).resolves.toContain( + 'to: customers', + ); + await expect(readFile(join(runtime.configDir, 'semantic-layer/warehouse/customers.yaml'), 'utf-8')).resolves.toContain( + 'name: customers', + ); + const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-join-prune-2/trace.jsonl'), 'utf-8'); + expect(trace).toContain('work_unit_cache_hit'); + expect(trace).toContain('work_unit_cache_replayed'); + expect(trace).not.toContain('work_unit_cache_stale_recompute'); + } finally { + await rm(runtime.homeDir, { recursive: true, force: true }); + } + }); + + it('prunes a failed sibling join without pruning a valid surviving join', async () => { + const runtime = await makeRealGitRuntime(); + try { + const { deps, adapter } = makeDeps(runtime, 'dbt'); + adapter.chunk.mockResolvedValue({ + workUnits: [ + { unitKey: 'orders', rawFiles: ['models/orders.sql'], peerFileIndex: [], dependencyPaths: [] }, + { unitKey: 'customers', rawFiles: ['models/customers.sql'], peerFileIndex: [], dependencyPaths: [] }, + { unitKey: 'products', rawFiles: ['models/products.sql'], peerFileIndex: [], dependencyPaths: [] }, + ], + }); + + let currentSession: any = null; + deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { + currentSession = toolSession; + 
return { toRuntimeTools: vi.fn(() => ({})) }; + }); + deps.agentRunner.runLoop = vi.fn(async (params: any) => { + if (params.telemetryTags?.operationName !== 'ingest-bundle-wu') { + return { stopReason: 'natural' }; + } + const unitKey = params.telemetryTags.unitKey; + const root = rootOfConfig(currentSession.configService, runtime.configDir); + await mkdir(join(root, 'semantic-layer/warehouse'), { recursive: true }); + + if (unitKey === 'orders') { + await writeFile( + join(root, 'semantic-layer/warehouse/orders.yaml'), + [ + 'name: orders', + 'grain: [order_id]', + 'columns: [{name: order_id, type: string}, {name: customer_id, type: string}, {name: product_id, type: string}]', + 'joins:', + ' - to: customers', + ' on: orders.customer_id = customers.customer_id', + ' - to: products', + ' on: orders.product_id = products.product_id', + 'measures: []', + '', + ].join('\n'), + 'utf-8', + ); + addTouchedSlSource(currentSession.touchedSlSources, 'warehouse', 'orders'); + currentSession.actions.push({ + target: 'sl', + type: 'created', + key: 'orders', + detail: 'orders with customer and product joins', + targetConnectionId: 'warehouse', + rawPaths: ['models/orders.sql'], + }); + await currentSession.gitService.commitFiles( + ['semantic-layer/warehouse/orders.yaml'], + 'wu orders', + 'ktx Test', + 'system@ktx.local', + ); + return { stopReason: 'natural' }; + } + + if (unitKey === 'customers') { + return { stopReason: 'error', error: new Error('provider disconnected') }; + } + + await writeFile( + join(root, 'semantic-layer/warehouse/products.yaml'), + 'name: products\ngrain: [product_id]\ncolumns: [{name: product_id, type: string}]\njoins: []\nmeasures: []\n', + 'utf-8', + ); + addTouchedSlSource(currentSession.touchedSlSources, 'warehouse', 'products'); + currentSession.actions.push({ + target: 'sl', + type: 'created', + key: 'products', + detail: 'products source', + targetConnectionId: 'warehouse', + rawPaths: ['models/products.sql'], + }); + await 
currentSession.gitService.commitFiles( + ['semantic-layer/warehouse/products.yaml'], + 'wu products', + 'ktx Test', + 'system@ktx.local', + ); + return { stopReason: 'natural' }; + }) as never; + + const runner = new IngestBundleRunner(deps); + await mockStageRawFiles( + runner, + runtime, + [ + ['models/orders.sql', 'orders-hash'], + ['models/customers.sql', 'customers-hash'], + ['models/products.sql', 'products-hash'], + ], + 'dbt', + ); + + const result = await runner.run({ + jobId: 'job-join-prune-no-cascade', + connectionId: 'warehouse', + sourceKey: 'dbt', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); + + expect(result.commitSha).toBeTruthy(); + expect(result.failedWorkUnits).toEqual(['customers']); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'join', + artifact: 'semantic-layer/warehouse/orders', + removedRef: 'customers', + absentTarget: 'customers', + }); + expect(result.finalGatePrunedReferences).not.toContainEqual({ + kind: 'join', + artifact: 'semantic-layer/warehouse/orders', + removedRef: 'products', + absentTarget: 'products', + }); + const ordersYaml = await readFile(join(runtime.configDir, 'semantic-layer/warehouse/orders.yaml'), 'utf-8'); + expect(ordersYaml).not.toContain('to: customers'); + expect(ordersYaml).toContain('to: products'); + await expect(readFile(join(runtime.configDir, 'semantic-layer/warehouse/products.yaml'), 'utf-8')).resolves.toContain( + 'name: products', + ); + } finally { + await rm(runtime.homeDir, { recursive: true, force: true }); + } + }); + + it('drops an intrinsically invalid uppercase source at the final gate and reports the producing work unit', async () => { + const runtime = await makeRealGitRuntime(); + try { + const { deps, adapter } = makeDeps(runtime, 'dbt'); + adapter.chunk.mockResolvedValue({ + workUnits: [{ unitKey: 'signed-up', rawFiles: ['models/signed_up.sql'], peerFileIndex: [], dependencyPaths: [] }], + }); + + const sourceName = 'SIGNED_UP'; + 
const sourcePath = slSourceFilePath('warehouse', sourceName); + let currentSession: any = null; + deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { + currentSession = toolSession; + return { toRuntimeTools: vi.fn(() => ({})) }; + }); + let signedUpValidationCount = 0; + deps.slValidator.validateSingleSource = vi.fn( + async (_validationDeps: any, _connectionId: string, validatedSourceName: string) => { + if (validatedSourceName === sourceName) { + signedUpValidationCount += 1; + if (signedUpValidationCount > 1) { + return { errors: ['intrinsic final validation failed'], warnings: [] }; + } + } + return { errors: [], warnings: [] }; + }, + ) as never; + deps.agentRunner.runLoop = vi.fn(async (params: any) => { + if (params.telemetryTags?.operationName !== 'ingest-bundle-wu') { + return { stopReason: 'natural' }; + } + const root = rootOfConfig(currentSession.configService, runtime.configDir); + await mkdir(join(root, 'semantic-layer/warehouse'), { recursive: true }); + await writeFile( + join(root, sourcePath), + 'name: SIGNED_UP\ngrain: [USER_ID]\ncolumns: [{name: USER_ID, type: string}]\njoins: []\nmeasures: []\n', + 'utf-8', + ); + addTouchedSlSource(currentSession.touchedSlSources, 'warehouse', sourceName); + currentSession.actions.push({ + target: 'sl', + type: 'created', + key: sourceName, + detail: 'uppercase signed up source', + targetConnectionId: 'warehouse', + rawPaths: ['models/signed_up.sql'], + }); + await currentSession.gitService.commitFiles([sourcePath], 'wu signed up', 'ktx Test', 'system@ktx.local'); + return { stopReason: 'natural' }; + }) as never; + + const runner = new IngestBundleRunner(deps); + await mockStageRawFiles(runner, runtime, [['models/signed_up.sql', 'signed-up-hash']], 'dbt'); + + const result = await runner.run({ + jobId: 'job-final-gate-intrinsic-drop', + connectionId: 'warehouse', + sourceKey: 'dbt', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); + + 
expect(result.commitSha).toBeTruthy(); + expect(result.failedWorkUnits).toEqual(['signed-up']); + expect(result.finalGateDroppedSources).toContainEqual({ + connectionId: 'warehouse', + sourceName, + reason: 'intrinsic final validation failed', + }); + await expect(readFile(join(runtime.configDir, sourcePath), 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' }); + } finally { + await rm(runtime.homeDir, { recursive: true, force: true }); + } + }); + + it('recomputes a stale cached patch and reports recomputed metadata', async () => { + const runtime = await makeRealGitRuntime(); + try { + const { deps, adapter } = makeDeps(runtime, 'dbt'); + adapter.chunk.mockResolvedValue({ + workUnits: [{ unitKey: 'orders', rawFiles: ['models/orders.sql'], peerFileIndex: [], dependencyPaths: [] }], + }); + let currentSession: any = null; + let agentAttempt = 0; + deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { + return { + toRuntimeTools: vi.fn(() => { + currentSession = toolSession; + return {}; + }), + }; + }); + deps.agentRunner.runLoop = vi.fn(async (params: any) => { + if (params.telemetryTags?.operationName !== 'ingest-bundle-wu') { + return { stopReason: 'natural' }; + } + agentAttempt += 1; + const root = rootOfConfig(currentSession.configService, runtime.configDir); + await mkdir(join(root, 'wiki/global'), { recursive: true }); + const detail = agentAttempt === 1 ? 'cached first output' : 'fresh recompute output'; + const body = agentAttempt === 1 ? 'orders cached' : 'orders recomputed'; + await writeFile( + join(root, 'wiki/global/orders.md'), + `---\nsummary: orders\nusage_mode: auto\n---\n\n${body}\n`, + 'utf-8', + ); + currentSession.actions.push({ + target: 'wiki', + type: agentAttempt === 1 ? 
'created' : 'updated', + key: 'orders', + detail, + }); + await currentSession.gitService.commitFiles( + ['wiki/global/orders.md'], + `wu orders ${agentAttempt}`, + 'ktx Test', + 'system@ktx.local', + ); + return { stopReason: 'natural' }; + }) as never; + + const runner = new IngestBundleRunner(deps); + await mockStageRawFiles(runner, runtime, [['models/orders.sql', 'orders-hash']], 'dbt'); + + await expect( + runner.run({ + jobId: 'job-stale-cache-1', + connectionId: 'warehouse', + sourceKey: 'dbt', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }), + ).resolves.toMatchObject({ failedWorkUnits: [] }); + expect(workUnitRunLoopCalls(deps)).toHaveLength(1); + + await writeFile( + join(runtime.configDir, 'wiki/global/orders.md'), + '---\nsummary: orders\nusage_mode: auto\n---\n\noperator drift\n', + 'utf-8', + ); + await runtime.git.commitFiles(['wiki/global/orders.md'], 'manual drift', 'ktx Test', 'system@ktx.local'); + + await expect( + runner.run({ + jobId: 'job-stale-cache-2', + connectionId: 'warehouse', + sourceKey: 'dbt', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }), + ).resolves.toMatchObject({ failedWorkUnits: [] }); + + expect(workUnitRunLoopCalls(deps)).toHaveLength(2); + await expect(readFile(join(runtime.configDir, 'wiki/global/orders.md'), 'utf-8')).resolves.toContain( + 'orders recomputed', + ); + + const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-stale-cache-2/trace.jsonl'), 'utf-8'); + expect(trace).toContain('work_unit_cache_unsafe_drift'); + expect(trace).not.toContain('work_unit_cache_hit'); + expect(trace).not.toContain('work_unit_cache_stale_recompute'); + + const reportCreate = vi.mocked(deps.reports.create).mock.calls.at(-1)?.[0] as any; + expect(reportCreate.body.workUnits).toContainEqual( + expect.objectContaining({ + unitKey: 'orders', + actions: [expect.objectContaining({ type: 'updated', detail: 'fresh recompute output' })], + }), + ); + } finally { 
+ await rm(runtime.homeDir, { recursive: true, force: true }); + } + }); + it.each(['notion', 'lookml', 'looker', 'dbt', 'metricflow'] as const)( 'routes %s direct writes through isolated child worktrees', async (sourceKey) => { @@ -651,7 +1264,7 @@ describe('IngestBundleRunner isolated diff path', () => { }, ); - it('rejects the Metabase stale-measure wiki body regression before squash', async () => { + it('prunes the Metabase stale-measure wiki body regression before squash', async () => { const runtime = await makeRealGitRuntime(); try { const { deps, adapter } = makeDeps(runtime); @@ -708,23 +1321,35 @@ describe('IngestBundleRunner isolated diff path', () => { ['cards/source.json', 'h2'], ]); - await expect( - runner.run({ jobId: 'job-1', connectionId: 'warehouse', sourceKey: 'metabase', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }), - ).rejects.toThrow(/total_contract_arr_cents/); + const result = await runner.run({ + jobId: 'job-1', + connectionId: 'warehouse', + sourceKey: 'metabase', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_body_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'mart_account_segments.total_contract_arr_cents', + absentTarget: 'mart_account_segments.total_contract_arr_cents', + }); const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-1/trace.jsonl'), 'utf-8'); expect(trace).toContain('input_snapshot'); expect(trace).toContain('isolated_diff_enabled'); expect(trace).toContain('work_unit_child_created'); expect(trace).toContain('work_unit_patch_collected'); expect(trace).toContain('patch_apply_started'); - expect(trace).toContain('final_artifact_gates_failed'); - expect(trace).toContain('ingest_failed'); + expect(trace).toContain('final_artifact_gates_finished'); + expect(trace).toContain('final_gate_prune_finished'); + expect(trace).toContain('squash_finished'); + 
expect(trace).not.toContain('ingest_failed'); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } }); - it('rejects unchanged wiki body refs made stale by isolated semantic-layer changes', async () => { + it('prunes unchanged wiki body refs made stale by isolated semantic-layer changes', async () => { const runtime = await makeRealGitRuntime(); try { await mkdir(join(runtime.configDir, 'semantic-layer/warehouse'), { recursive: true }); @@ -782,17 +1407,24 @@ describe('IngestBundleRunner isolated diff path', () => { const runner = new IngestBundleRunner(deps); await mockStageRawFiles(runner, runtime, [['cards/source.json', 'h1']]); - await expect( - runner.run({ - jobId: 'job-existing-body-stale', - connectionId: 'warehouse', - sourceKey: 'metabase', - trigger: 'upload', - bundleRef: { kind: 'upload', uploadId: 'upload' }, - }), - ).rejects.toThrow(/total_contract_arr_cents/); + const result = await runner.run({ + jobId: 'job-existing-body-stale', + connectionId: 'warehouse', + sourceKey: 'metabase', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); - expect(await runtime.git.revParseHead()).toBe(preRunHead); + expect(await runtime.git.revParseHead()).not.toBe(preRunHead); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_body_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'mart_account_segments.total_contract_arr_cents', + absentTarget: 'mart_account_segments.total_contract_arr_cents', + }); + await expect(readFile(join(runtime.configDir, 'wiki/global/account-segments.md'), 'utf-8')).resolves.not.toContain( + 'total_contract_arr_cents', + ); const events = (await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-existing-body-stale/trace.jsonl'), 'utf-8')) .trim() .split('\n') @@ -800,83 +1432,14 @@ describe('IngestBundleRunner isolated diff path', () => { expect(events.map((event) => event.event)).toEqual( expect.arrayContaining([ 'final_artifact_gates_started', - 
'final_artifact_gates_failed', - 'ingest_failed', - 'failure_report_created', - ]), - ); - expect(events.map((event) => event.event)).not.toContain('squash_finished'); - const gateFailure = events.find((event) => event.event === 'final_artifact_gates_failed'); - expect(gateFailure).toMatchObject({ - data: { - wikiReferenceGateScope: { - global: true, - reasons: expect.arrayContaining(['semantic_layer_changed']), - pageKeysValidated: expect.arrayContaining(['account-segments']), - }, - actionOrigins: expect.arrayContaining([ - expect.objectContaining({ - source: 'work_unit_action', - unitKey: 'source-only', - unitRawFiles: ['cards/source.json'], - action: expect.objectContaining({ - target: 'sl', - type: 'updated', - key: 'mart_account_segments', - rawPaths: ['cards/source.json'], - targetConnectionId: 'warehouse', - }), - }), - ]), - }, - error: { message: expect.stringContaining('total_contract_arr_cents') }, - }); - - const failureReport = (deps.reports.create as any).mock.calls - .map((call: any[]) => call[0]) - .find((report: any) => report.body.status === 'failed'); - expect(failureReport.body.failure).toMatchObject({ - phase: 'final_gates', - message: expect.stringContaining('total_contract_arr_cents'), - details: expect.objectContaining({ - wikiReferenceGateScope: expect.objectContaining({ - global: true, - reasons: expect.arrayContaining(['semantic_layer_changed']), - pageKeysValidated: expect.arrayContaining(['account-segments']), - }), - touchedSlSources: expect.arrayContaining([ - expect.objectContaining({ connectionId: 'warehouse', sourceName: 'mart_account_segments' }), - ]), - actionOrigins: expect.arrayContaining([ - expect.objectContaining({ - source: 'work_unit_action', - unitKey: 'source-only', - action: expect.objectContaining({ - target: 'sl', - type: 'updated', - key: 'mart_account_segments', - rawPaths: ['cards/source.json'], - targetConnectionId: 'warehouse', - }), - }), - ]), - }), - }); - expect(failureReport.body.workUnits).toEqual( - 
expect.arrayContaining([ - expect.objectContaining({ - unitKey: 'source-only', - actions: expect.arrayContaining([ - expect.objectContaining({ - target: 'sl', - type: 'updated', - key: 'mart_account_segments', - rawPaths: ['cards/source.json'], - }), - ]), - }), + 'final_artifact_gates_finished', + 'final_gate_reference_pruned', + 'final_gate_prune_committed', + 'final_gate_prune_finished', + 'squash_finished', ]), ); + expect(events.map((event) => event.event)).not.toContain('ingest_failed'); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } @@ -1022,7 +1585,7 @@ describe('IngestBundleRunner isolated diff path', () => { } }); - it('rejects Notion-style changed wiki pages with invalid sl_refs', async () => { + it('prunes direct missing wiki sl_refs instead of rejecting the work unit', async () => { const runtime = await makeRealGitRuntime(); try { const { deps, adapter } = makeDeps(runtime); @@ -1035,12 +1598,16 @@ describe('IngestBundleRunner isolated diff path', () => { return { toRuntimeTools: vi.fn(() => ({})) }; }); deps.agentRunner.runLoop = vi.fn(async (params: any) => { - if (params.telemetryTags.operationName === 'ingest-isolated-diff-gate-repair') { - return { stopReason: 'natural' as const }; + if (params.telemetryTags?.operationName !== 'ingest-bundle-wu') { + return { stopReason: 'natural' }; } const root = rootOfConfig(currentSession.configService, runtime.configDir); await mkdir(join(root, 'wiki/global'), { recursive: true }); - await writeFile(join(root, 'wiki/global/notion-page.md'), '---\nsummary: Notion page\nusage_mode: auto\nsl_refs:\n - missing_source\n---\n\nBody\n'); + await writeFile( + join(root, 'wiki/global/notion-page.md'), + '---\nsummary: Notion page\nusage_mode: auto\nsl_refs:\n - missing_source\n---\n\nBody\n', + 'utf-8', + ); currentSession.actions.push({ target: 'wiki', type: 'created', key: 'notion-page', detail: 'Notion page' }); await currentSession.gitService.commitFiles(['wiki/global/notion-page.md'], 
'wu notion', 'ktx Test', 'system@ktx.local'); return { stopReason: 'natural' }; @@ -1048,9 +1615,24 @@ describe('IngestBundleRunner isolated diff path', () => { const runner = new IngestBundleRunner(deps); await mockStageRawFiles(runner, runtime, [['pages/notion.json', 'h1']]); - await expect( - runner.run({ jobId: 'job-invalid-slrefs', connectionId: 'warehouse', sourceKey: 'metabase', trigger: 'upload', bundleRef: { kind: 'upload', uploadId: 'upload' } }), - ).rejects.toThrow(/gate repair completed without editing an allowed path/); + const result = await runner.run({ + jobId: 'job-invalid-slrefs', + connectionId: 'warehouse', + sourceKey: 'metabase', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); + + expect(result.commitSha).toBeTruthy(); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_sl_ref', + artifact: 'wiki/global/notion-page', + removedRef: 'missing_source', + absentTarget: 'missing_source', + }); + await expect(readFile(join(runtime.configDir, 'wiki/global/notion-page.md'), 'utf-8')).resolves.not.toContain( + 'missing_source', + ); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } @@ -1112,27 +1694,35 @@ describe('IngestBundleRunner isolated diff path', () => { const runner = new IngestBundleRunner(deps); await mockStageRawFiles(runner, runtime, [['cards/source.json', 'h1']]); - await expect( - runner.run({ - jobId: 'job-reconcile-stale', - connectionId: 'warehouse', - sourceKey: 'metabase', - trigger: 'upload', - bundleRef: { kind: 'upload', uploadId: 'upload' }, - }), - ).rejects.toThrow(/total_contract_arr_cents/); + const result = await runner.run({ + jobId: 'job-reconcile-stale', + connectionId: 'warehouse', + sourceKey: 'metabase', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-reconcile-stale/trace.jsonl'), 'utf-8'); 
expect(trace).toContain('reconciliation_finished'); - expect(trace).toContain('final_artifact_gates_failed'); - expect(trace).toContain('ingest_failed'); - expect(await runtime.git.revParseHead()).not.toContain('reconcile wiki'); + expect(trace).toContain('final_artifact_gates_finished'); + expect(trace).toContain('final_gate_prune_finished'); + expect(trace).toContain('squash_finished'); + expect(trace).not.toContain('ingest_failed'); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_body_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'mart_account_segments.total_contract_arr_cents', + absentTarget: 'mart_account_segments.total_contract_arr_cents', + }); + await expect(readFile(join(runtime.configDir, 'wiki/global/account-segments.md'), 'utf-8')).resolves.not.toContain( + 'total_contract_arr_cents', + ); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } }); - it('stores a failure report and postmortem trace for final gate failures', async () => { + it('stores final gate prune details in the success report and trace', async () => { const runtime = await makeRealGitRuntime(); try { const { deps, adapter } = makeDeps(runtime); @@ -1200,19 +1790,28 @@ describe('IngestBundleRunner isolated diff path', () => { ['cards/source.json', 'h2'], ]); - await expect( - runner.run({ - jobId: 'job-trace-failure', - connectionId: 'warehouse', - sourceKey: 'metabase', - trigger: 'upload', - bundleRef: { kind: 'upload', uploadId: 'upload' }, - }), - ).rejects.toThrow(/total_contract_arr_cents/); + const result = await runner.run({ + jobId: 'job-trace-failure', + connectionId: 'warehouse', + sourceKey: 'metabase', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); - const failureReport = createdReports.find((report) => report.body.status === 'failed'); - expect(failureReport.body.tracePath).toContain('job-trace-failure/trace.jsonl'); - expect(failureReport.body.failure).toMatchObject({ 
phase: 'final_gates' }); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_body_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'mart_account_segments.total_contract_arr_cents', + absentTarget: 'mart_account_segments.total_contract_arr_cents', + }); + const successReport = createdReports.find((report) => report.body.status === 'completed'); + expect(successReport.body.tracePath).toContain('job-trace-failure/trace.jsonl'); + expect(successReport.body.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_body_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'mart_account_segments.total_contract_arr_cents', + absentTarget: 'mart_account_segments.total_contract_arr_cents', + }); const events = (await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-trace-failure/trace.jsonl'), 'utf-8')) .trim() @@ -1229,18 +1828,14 @@ describe('IngestBundleRunner isolated diff path', () => { 'patch_apply_started', 'patch_accepted', 'reconciliation_finished', - 'final_artifact_gates_failed', - 'ingest_failed', - 'failure_report_created', + 'final_artifact_gates_finished', + 'final_gate_reference_pruned', + 'final_gate_prune_committed', + 'final_gate_prune_finished', + 'squash_finished', ]), ); - const failed = events.find((event) => event.event === 'ingest_failed'); - expect(failed).toMatchObject({ - runId: 'run-1', - syncId: expect.any(String), - data: { phase: 'final_gates', tracePath: expect.stringContaining('trace.jsonl') }, - error: { message: expect.stringContaining('total_contract_arr_cents') }, - }); + expect(events.map((event) => event.event)).not.toContain('ingest_failed'); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } @@ -1461,7 +2056,7 @@ describe('IngestBundleRunner isolated diff path', () => { } }); - it('rejects final wiki refs broken by another accepted WorkUnit before squash', async () => { + it('prunes final wiki refs broken by another accepted WorkUnit before squash', async 
() => { const runtime = await makeRealGitRuntime(); try { await mkdir(join(runtime.configDir, 'wiki/global'), { recursive: true }); @@ -1530,44 +2125,35 @@ describe('IngestBundleRunner isolated diff path', () => { ['pages/delete.json', 'h2'], ]); - await expect( - runner.run({ - jobId: 'job-wiki-ref-conflict', - connectionId: 'warehouse', - sourceKey: 'metabase', - trigger: 'upload', - bundleRef: { kind: 'upload', uploadId: 'upload' }, - }), - ).rejects.toThrow(/wiki references target missing page\(s\): account-segments -> source-page/); - - expect(await runtime.git.revParseHead()).toBe(preRunHead); - const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-wiki-ref-conflict/trace.jsonl'), 'utf-8'); - expect(trace).toContain('final_artifact_gates_failed'); - expect(trace).toContain('account-segments -> source-page'); - expect(trace).toContain('ingest_failed'); - expect(trace).toContain('failure_report_created'); - expect(trace).not.toContain('squash_finished'); - - const failureReport = (deps.reports.create as any).mock.calls - .map((call: any[]) => call[0]) - .find((report: any) => report.body.status === 'failed'); - expect(failureReport.body.failure).toMatchObject({ - phase: 'final_gates', - message: expect.stringContaining('account-segments -> source-page'), - details: expect.objectContaining({ - changedWikiPageKeys: expect.arrayContaining(['account-segments']), - workUnitPatchTouchedPaths: expect.arrayContaining([ - 'wiki/global/account-segments.md', - 'wiki/global/source-page.md', - ]), - }), + const result = await runner.run({ + jobId: 'job-wiki-ref-conflict', + connectionId: 'warehouse', + sourceKey: 'metabase', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, }); + + expect(await runtime.git.revParseHead()).not.toBe(preRunHead); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'source-page', + absentTarget: 'source-page', + 
}); + await expect(readFile(join(runtime.configDir, 'wiki/global/account-segments.md'), 'utf-8')).resolves.not.toContain( + 'source-page', + ); + const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-wiki-ref-conflict/trace.jsonl'), 'utf-8'); + expect(trace).toContain('final_artifact_gates_finished'); + expect(trace).toContain('final_gate_reference_pruned'); + expect(trace).toContain('squash_finished'); + expect(trace).not.toContain('ingest_failed'); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } }); - it('rejects unchanged inbound wiki refs broken by an isolated wiki deletion', async () => { + it('prunes unchanged inbound wiki refs broken by an isolated wiki deletion', async () => { const runtime = await makeRealGitRuntime(); try { await mkdir(join(runtime.configDir, 'wiki/global'), { recursive: true }); @@ -1622,17 +2208,24 @@ describe('IngestBundleRunner isolated diff path', () => { const runner = new IngestBundleRunner(deps); await mockStageRawFiles(runner, runtime, [['pages/delete.json', 'h1']]); - await expect( - runner.run({ - jobId: 'job-existing-wiki-ref-stale', - connectionId: 'warehouse', - sourceKey: 'metabase', - trigger: 'upload', - bundleRef: { kind: 'upload', uploadId: 'upload' }, - }), - ).rejects.toThrow(/wiki references target missing page\(s\): account-segments -> source-page/); + const result = await runner.run({ + jobId: 'job-existing-wiki-ref-stale', + connectionId: 'warehouse', + sourceKey: 'metabase', + trigger: 'upload', + bundleRef: { kind: 'upload', uploadId: 'upload' }, + }); - expect(await runtime.git.revParseHead()).toBe(preRunHead); + expect(await runtime.git.revParseHead()).not.toBe(preRunHead); + expect(result.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'source-page', + absentTarget: 'source-page', + }); + await expect(readFile(join(runtime.configDir, 'wiki/global/account-segments.md'), 
'utf-8')).resolves.not.toContain( + 'source-page', + ); const events = (await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-existing-wiki-ref-stale/trace.jsonl'), 'utf-8')) .trim() .split('\n') @@ -1640,81 +2233,14 @@ describe('IngestBundleRunner isolated diff path', () => { expect(events.map((event) => event.event)).toEqual( expect.arrayContaining([ 'final_artifact_gates_started', - 'final_artifact_gates_failed', - 'ingest_failed', - 'failure_report_created', - ]), - ); - expect(events.map((event) => event.event)).not.toContain('squash_finished'); - const gateFailure = events.find((event) => event.event === 'final_artifact_gates_failed'); - expect(gateFailure).toMatchObject({ - data: { - wikiReferenceGateScope: { - global: true, - reasons: expect.arrayContaining(['wiki_page_removed']), - removedWikiPageKeys: expect.arrayContaining(['source-page']), - pageKeysValidated: expect.arrayContaining(['account-segments']), - }, - actionOrigins: expect.arrayContaining([ - expect.objectContaining({ - source: 'work_unit_action', - unitKey: 'delete-target-page', - unitRawFiles: ['pages/delete.json'], - action: expect.objectContaining({ - target: 'wiki', - type: 'removed', - key: 'source-page', - rawPaths: ['pages/delete.json'], - }), - }), - ]), - }, - error: { message: expect.stringContaining('account-segments -> source-page') }, - }); - - const failureReport = (deps.reports.create as any).mock.calls - .map((call: any[]) => call[0]) - .find((report: any) => report.body.status === 'failed'); - expect(failureReport.body.failure).toMatchObject({ - phase: 'final_gates', - message: expect.stringContaining('account-segments -> source-page'), - details: expect.objectContaining({ - wikiReferenceGateScope: expect.objectContaining({ - global: true, - reasons: expect.arrayContaining(['wiki_page_removed']), - removedWikiPageKeys: expect.arrayContaining(['source-page']), - pageKeysValidated: expect.arrayContaining(['account-segments']), - }), - changedWikiPageKeys: 
expect.arrayContaining(['source-page']), - actionOrigins: expect.arrayContaining([ - expect.objectContaining({ - source: 'work_unit_action', - unitKey: 'delete-target-page', - action: expect.objectContaining({ - target: 'wiki', - type: 'removed', - key: 'source-page', - rawPaths: ['pages/delete.json'], - }), - }), - ]), - }), - }); - expect(failureReport.body.workUnits).toEqual( - expect.arrayContaining([ - expect.objectContaining({ - unitKey: 'delete-target-page', - actions: expect.arrayContaining([ - expect.objectContaining({ - target: 'wiki', - type: 'removed', - key: 'source-page', - rawPaths: ['pages/delete.json'], - }), - ]), - }), + 'final_artifact_gates_finished', + 'final_gate_reference_pruned', + 'final_gate_prune_committed', + 'final_gate_prune_finished', + 'squash_finished', ]), ); + expect(events.map((event) => event.event)).not.toContain('ingest_failed'); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } @@ -2009,7 +2535,7 @@ describe('IngestBundleRunner isolated diff path', () => { } }); - it('repairs final wiki body refs before squash when the repair agent edits the scoped page', async () => { + it('prunes final wiki body refs before squash', async () => { const runtime = await makeRealGitRuntime(); try { await mkdir(join(runtime.configDir, 'semantic-layer/warehouse'), { recursive: true }); @@ -2040,18 +2566,6 @@ describe('IngestBundleRunner isolated diff path', () => { return { toRuntimeTools: vi.fn(() => ({})) }; }); deps.agentRunner.runLoop = vi.fn(async (params: any) => { - if (params.telemetryTags.operationName === 'ingest-isolated-diff-gate-repair') { - const gateError = await params.toolSet.read_gate_error.execute({}); - expect(gateError.markdown).toContain('total_contract_arr_cents'); - const page = await params.toolSet.read_repair_file.execute({ - path: 'wiki/global/account-segments.md', - }); - await params.toolSet.write_repair_file.execute({ - path: 'wiki/global/account-segments.md', - content: 
page.markdown.replace('total_contract_arr_cents', 'total_contract_arr'), - }); - return { stopReason: 'natural' as const }; - } if (params.modelRole === 'reconcile') { return { stopReason: 'natural' as const }; } @@ -2083,7 +2597,7 @@ describe('IngestBundleRunner isolated diff path', () => { await mockStageRawFiles(runner, runtime, [['cards/source.json', 'h1']]); const result = await runner.run({ - jobId: 'job-final-gate-repair', + jobId: 'job-final-gate-prune', connectionId: 'warehouse', sourceKey: 'metabase', trigger: 'upload', @@ -2091,116 +2605,22 @@ describe('IngestBundleRunner isolated diff path', () => { }); expect(result.commitSha).toBeTruthy(); - await expect(readFile(join(runtime.configDir, 'wiki/global/account-segments.md'), 'utf-8')).resolves.toContain( - 'mart_account_segments.total_contract_arr', - ); await expect(readFile(join(runtime.configDir, 'wiki/global/account-segments.md'), 'utf-8')).resolves.not.toContain( 'total_contract_arr_cents', ); const reportCreate = vi.mocked(deps.reports.create).mock.calls.at(-1)?.[0] as any; - expect(reportCreate.body.isolatedDiff).toMatchObject({ - gateRepairAttempts: 1, - gateRepairs: 1, - gateRepairFailures: 0, - }); - const trace = await readFile(join(runtime.configDir, '.ktx/ingest-traces/job-final-gate-repair/trace.jsonl'), 'utf-8'); - expect(trace).toContain('gate_repair_repaired'); - expect(trace).toContain('final_gate_repair_committed'); - } finally { - await rm(runtime.homeDir, { recursive: true, force: true }); - } - }); - - it('fails before squash when final gate repair makes no edit', async () => { - const runtime = await makeRealGitRuntime(); - try { - await mkdir(join(runtime.configDir, 'semantic-layer/warehouse'), { recursive: true }); - await mkdir(join(runtime.configDir, 'wiki/global'), { recursive: true }); - await writeFile( - join(runtime.configDir, 'semantic-layer/warehouse/mart_account_segments.yaml'), - 'name: mart_account_segments\ngrain: [account_id]\ncolumns: [{name: account_id, type: 
string}]\njoins: []\nmeasures:\n - name: total_contract_arr_cents\n expr: sum(contract_arr)\n', - ); - await writeFile( - join(runtime.configDir, 'wiki/global/account-segments.md'), - '---\nsummary: Account segments\nusage_mode: auto\n---\n\nExisting ARR uses `mart_account_segments.total_contract_arr_cents`.\n', - ); - await runtime.git.commitFiles( - ['semantic-layer/warehouse/mart_account_segments.yaml', 'wiki/global/account-segments.md'], - 'seed stale wiki body ref', - 'ktx Test', - 'system@ktx.local', - ); - const preRunHead = await runtime.git.revParseHead(); - - const { deps, adapter } = makeDeps(runtime); - adapter.chunk.mockResolvedValue({ - workUnits: [{ unitKey: 'source-only', rawFiles: ['cards/source.json'], peerFileIndex: [], dependencyPaths: [] }], - }); - - let currentSession: any = null; - deps.toolsetFactory.createIngestWuToolset = vi.fn((toolSession: any) => { - currentSession = toolSession; - return { toRuntimeTools: vi.fn(() => ({})) }; - }); - deps.agentRunner.runLoop = vi.fn(async (params: any) => { - if (params.telemetryTags.operationName === 'ingest-isolated-diff-gate-repair') { - return { stopReason: 'natural' as const }; - } - if (params.modelRole === 'reconcile') { - return { stopReason: 'natural' as const }; - } - - const root = rootOfConfig(currentSession.configService, runtime.configDir); - await writeFile( - join(root, 'semantic-layer/warehouse/mart_account_segments.yaml'), - 'name: mart_account_segments\ngrain: [account_id]\ncolumns: [{name: account_id, type: string}]\njoins: []\nmeasures:\n - name: total_contract_arr\n expr: sum(contract_arr)\n', - ); - addTouchedSlSource(currentSession.touchedSlSources, 'warehouse', 'mart_account_segments'); - currentSession.actions.push({ - target: 'sl', - type: 'updated', - key: 'mart_account_segments', - detail: 'Rename ARR measure', - targetConnectionId: 'warehouse', - rawPaths: ['cards/source.json'], - }); - await currentSession.gitService.commitFiles( - 
['semantic-layer/warehouse/mart_account_segments.yaml'], - 'wu source rename', - 'ktx Test', - 'system@ktx.local', - ); - return { stopReason: 'natural' as const }; - }) as never; - - const runner = new IngestBundleRunner(deps); - await mockStageRawFiles(runner, runtime, [['cards/source.json', 'h1']]); - - await expect( - runner.run({ - jobId: 'job-final-gate-repair-fails', - connectionId: 'warehouse', - sourceKey: 'metabase', - trigger: 'upload', - bundleRef: { kind: 'upload', uploadId: 'upload' }, - }), - ).rejects.toThrow(/gate repair completed without editing an allowed path/); - - expect(await runtime.git.revParseHead()).toBe(preRunHead); - const reportCreate = vi.mocked(deps.reports.create).mock.calls.at(-1)?.[0] as any; - expect(reportCreate.body.status).toBe('failed'); - expect(reportCreate.body.isolatedDiff).toMatchObject({ - // Both attempts of the verify-based repair loop ran without an edit. - gateRepairAttempts: 2, - gateRepairs: 0, - gateRepairFailures: 1, + expect(reportCreate.body.finalGatePrunedReferences).toContainEqual({ + kind: 'wiki_body_ref', + artifact: 'wiki/global/account-segments', + removedRef: 'mart_account_segments.total_contract_arr_cents', + absentTarget: 'mart_account_segments.total_contract_arr_cents', }); const trace = await readFile( - join(runtime.configDir, '.ktx/ingest-traces/job-final-gate-repair-fails/trace.jsonl'), + join(runtime.configDir, '.ktx/ingest-traces/job-final-gate-prune/trace.jsonl'), 'utf-8', ); - expect(trace).toContain('gate_repair_failed'); - expect(trace).not.toContain('squash_finished'); + expect(trace).toContain('final_gate_reference_pruned'); + expect(trace).toContain('final_gate_prune_finished'); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); } diff --git a/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts b/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts index 105d8ad1..0a30bae0 100644 --- a/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts 
+++ b/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts @@ -218,11 +218,20 @@ const makeDeps = () => { const configService = { enqueueCommitMessageJobForExternalCommit: vi.fn().mockResolvedValue(undefined), }; + const contentCache = { + findCompletedResult: vi.fn().mockResolvedValue(null), + findLatestCompletedResult: vi.fn().mockResolvedValue(null), + saveCompletedResult: vi.fn().mockResolvedValue(undefined), + saveFailedResult: vi.fn().mockResolvedValue(undefined), + deleteResult: vi.fn().mockResolvedValue(undefined), + listRunResults: vi.fn().mockResolvedValue([]), + }; return { runsRepo, provenanceRepo, reportsRepo, canonicalPins, + contentCache, adapter, registry, diffSetService, @@ -270,6 +279,7 @@ const buildRunner = (deps: ReturnType = makeDeps(), overrides: settings: { probeRowCount: 1, memoryIngestionModel: 'test-model', + cliVersion: '0.0.0-test', }, skillsRegistry: deps.skillsRegistry as any, promptService: deps.promptService as any, @@ -286,6 +296,7 @@ const buildRunner = (deps: ReturnType = makeDeps(), overrides: }, reports: deps.reportsRepo as any, canonicalPins: deps.canonicalPins, + contentCache: deps.contentCache, slValidator: deps.slValidator as any, toolsetFactory: deps.toolsetFactory as any, commitMessages: { @@ -433,6 +444,7 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { settings: { probeRowCount: 1, memoryIngestionModel: 'test-model', + cliVersion: '0.0.0-test', workUnitMaxConcurrency: 2, rateLimitGovernor: { acquireWorkSlot, subscribe: vi.fn(() => vi.fn()) } as never, }, @@ -471,6 +483,7 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { settings: { probeRowCount: 1, memoryIngestionModel: 'test-model', + cliVersion: '0.0.0-test', workUnitMaxConcurrency: 1, rateLimitGovernor: { acquireWorkSlot, subscribe: vi.fn(() => vi.fn()) } as never, }, @@ -506,6 +519,7 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { settings: { probeRowCount: 1, memoryIngestionModel: 'test-model', + cliVersion: '0.0.0-test', 
workUnitMaxConcurrency: 1, }, }); @@ -546,6 +560,7 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { settings: { probeRowCount: 1, memoryIngestionModel: 'test-model', + cliVersion: '0.0.0-test', rateLimitGovernor: { acquireWorkSlot: vi.fn(async () => vi.fn()), subscribe: vi.fn((cb: (state: any) => void) => { diff --git a/packages/cli/test/context/ingest/isolated-diff/patch-integrator.test.ts b/packages/cli/test/context/ingest/isolated-diff/patch-integrator.test.ts index 43590bc7..0b12a6cf 100644 --- a/packages/cli/test/context/ingest/isolated-diff/patch-integrator.test.ts +++ b/packages/cli/test/context/ingest/isolated-diff/patch-integrator.test.ts @@ -300,110 +300,6 @@ describe('integrateWorkUnitPatch', () => { await expect(readFile(join(configDir, 'wiki/global/a.md'), 'utf-8')).resolves.toBe('accepted\n'); }); - it('repairs semantic gate failures after a patch applies cleanly', async () => { - const { homeDir, configDir, git, baseSha } = await makeRepo(); - const childDir = join(homeDir, 'child-semantic-repair'); - await git.addWorktree(childDir, 'child-semantic-repair', baseSha); - const childGit = git.forWorktree(childDir); - await writeFile(join(childDir, 'wiki/global/a.md'), 'bad semantic ref\n'); - await childGit.commitFiles(['wiki/global/a.md'], 'bad semantic edit', 'System User', 'system@example.com'); - const patchPath = join(homeDir, 'patches/semantic-repair.patch'); - await childGit.writeBinaryNoRenamePatch(baseSha, 'HEAD', patchPath); - const trace = new FileIngestTraceWriter({ - tracePath: join(homeDir, '.ktx/ingest-traces/job-semantic-repair/trace.jsonl'), - jobId: 'job-semantic-repair', - connectionId: 'c1', - sourceKey: 'fake', - level: 'trace', - }); - const validateAppliedTree = vi - .fn() - .mockRejectedValueOnce(new Error('final artifact gates failed:\na: unknown semantic-layer entity')) - .mockResolvedValueOnce(undefined); - - const result = await integrateWorkUnitPatch({ - unitKey: 'wu-repairable', - patchPath, - integrationGit: git, - 
trace, - author: { name: 'ktx Test', email: 'system@ktx.local' }, - validateAppliedTree, - slDisallowed: false, - allowedTargetConnectionIds: new Set(['c1']), - repairGateFailure: vi.fn(async (context) => { - expect(context).toMatchObject({ - unitKey: 'wu-repairable', - patchPath, - touchedPaths: ['wiki/global/a.md'], - }); - await writeFile(join(configDir, 'wiki/global/a.md'), 'repaired semantic ref\n', 'utf-8'); - await expect(context.verify(['wiki/global/a.md'])).resolves.toEqual({ ok: true }); - return { - status: 'repaired' as const, - attempts: 1, - changedPaths: ['wiki/global/a.md'], - }; - }), - }); - - expect(result).toMatchObject({ - status: 'accepted', - touchedPaths: ['wiki/global/a.md'], - gateRepair: { - status: 'repaired', - attempts: 1, - changedPaths: ['wiki/global/a.md'], - }, - }); - expect(validateAppliedTree).toHaveBeenCalledTimes(2); - await expect(readFile(join(configDir, 'wiki/global/a.md'), 'utf-8')).resolves.toBe('repaired semantic ref\n'); - await expect(readFile(trace.tracePath, 'utf-8')).resolves.toContain('patch_accepted_after_gate_repair'); - }); - - it('keeps the pre-apply tree when semantic gate repair fails', async () => { - const { homeDir, configDir, git, baseSha } = await makeRepo(); - const childDir = join(homeDir, 'child-semantic-repair-fails'); - await git.addWorktree(childDir, 'child-semantic-repair-fails', baseSha); - const childGit = git.forWorktree(childDir); - await writeFile(join(childDir, 'wiki/global/a.md'), 'bad semantic ref\n'); - await childGit.commitFiles(['wiki/global/a.md'], 'bad semantic edit', 'System User', 'system@example.com'); - const patchPath = join(homeDir, 'patches/semantic-repair-fails.patch'); - await childGit.writeBinaryNoRenamePatch(baseSha, 'HEAD', patchPath); - const trace = new FileIngestTraceWriter({ - tracePath: join(homeDir, '.ktx/ingest-traces/job-semantic-repair-fails/trace.jsonl'), - jobId: 'job-semantic-repair-fails', - connectionId: 'c1', - sourceKey: 'fake', - level: 'trace', - }); - - 
const result = await integrateWorkUnitPatch({ - unitKey: 'wu-not-repaired', - patchPath, - integrationGit: git, - trace, - author: { name: 'ktx Test', email: 'system@ktx.local' }, - validateAppliedTree: vi.fn().mockRejectedValue(new Error('final artifact gates failed')), - slDisallowed: false, - allowedTargetConnectionIds: new Set(['c1']), - repairGateFailure: vi.fn(async () => ({ - status: 'failed' as const, - attempts: 1, - reason: 'gate repair completed without editing an allowed path', - })), - }); - - expect(result).toMatchObject({ - status: 'semantic_conflict', - gateRepair: { - status: 'failed', - attempts: 1, - reason: 'gate repair completed without editing an allowed path', - }, - }); - await expect(readFile(join(configDir, 'wiki/global/a.md'), 'utf-8')).resolves.toBe('old\n'); - }); - it('accepts a redundant duplicate-creation patch as subsumed without committing', async () => { // Regression: two work units each emitted a creation patch for the same // wiki page. The second creation patch conflicts with the page already in diff --git a/packages/cli/test/context/ingest/local-bundle-runtime.test.ts b/packages/cli/test/context/ingest/local-bundle-runtime.test.ts index 6f7c99dd..53349555 100644 --- a/packages/cli/test/context/ingest/local-bundle-runtime.test.ts +++ b/packages/cli/test/context/ingest/local-bundle-runtime.test.ts @@ -298,6 +298,7 @@ describe('createLocalBundleIngestRuntime', () => { expect(settings).not.toHaveProperty(fallbackSettingKey); expect(Object.keys(settings).sort()).toEqual([ + 'cliVersion', 'ingestTraceLevel', 'memoryIngestionModel', 'probeRowCount', diff --git a/packages/cli/test/context/ingest/report-snapshot.test.ts b/packages/cli/test/context/ingest/report-snapshot.test.ts index 36f822e1..ea1f8c13 100644 --- a/packages/cli/test/context/ingest/report-snapshot.test.ts +++ b/packages/cli/test/context/ingest/report-snapshot.test.ts @@ -106,6 +106,26 @@ describe('parseIngestReportSnapshot', () => { 
expect(snapshot.body.toolTranscripts).toHaveLength(1); }); + it('parses final gate prune and drop arrays', () => { + const report: any = validReportSnapshot(); + report.body.finalGatePrunedReferences = [ + { + kind: 'join', + artifact: 'semantic-layer/warehouse/orders', + removedRef: 'customers', + absentTarget: 'customers', + }, + ] as never; + report.body.finalGateDroppedSources = [ + { connectionId: 'warehouse', sourceName: 'bad', reason: 'dry run failed' }, + ] as never; + + const snapshot = parseIngestReportSnapshot(report); + + expect(snapshot.body.finalGatePrunedReferences).toEqual(report.body.finalGatePrunedReferences); + expect(snapshot.body.finalGateDroppedSources).toEqual(report.body.finalGateDroppedSources); + }); + it('parses target-aware actions and touched source objects', () => { const report = validReportSnapshot(); report.body.workUnits[0] = { @@ -302,47 +322,4 @@ describe('parseIngestReportSnapshot', () => { }); }); - it('parses isolated-diff gate repair counters', () => { - const snapshot = parseIngestReportSnapshot({ - id: 'report-1', - runId: 'run-1', - jobId: 'job-1', - connectionId: 'warehouse', - sourceKey: 'metabase', - createdAt: '2026-05-18T00:00:00.000Z', - body: { - status: 'completed', - syncId: 'sync-1', - diffSummary: { added: 1, modified: 0, deleted: 0, unchanged: 0 }, - commitSha: 'abc123', - isolatedDiff: { - enabled: true, - acceptedPatches: 1, - textualConflicts: 0, - semanticConflicts: 1, - gateRepairAttempts: 1, - gateRepairs: 1, - gateRepairFailures: 0, - }, - workUnits: [], - failedWorkUnits: [], - reconciliationSkipped: true, - conflictsResolved: [], - evictionsApplied: [], - unmappedFallbacks: [], - evictionInputs: [], - unresolvedCards: [], - supersededBy: null, - overrideOf: null, - provenanceRows: [], - toolTranscripts: [], - }, - }); - - expect(snapshot.body.isolatedDiff).toMatchObject({ - gateRepairAttempts: 1, - gateRepairs: 1, - gateRepairFailures: 0, - }); - }); }); diff --git 
a/packages/cli/test/context/ingest/reports.test.ts b/packages/cli/test/context/ingest/reports.test.ts index 5fc24f6d..f8217859 100644 --- a/packages/cli/test/context/ingest/reports.test.ts +++ b/packages/cli/test/context/ingest/reports.test.ts @@ -59,6 +59,24 @@ describe('ingestReportOutcome', () => { ).toBe('partial'); }); + it('returns partial when final gates pruned or dropped artifacts from a saved run', () => { + expect( + ingestReportOutcome( + report({ + workUnits: [savingWorkUnit], + finalGatePrunedReferences: [ + { + kind: 'join', + artifact: 'semantic-layer/warehouse/orders', + removedRef: 'customers', + absentTarget: 'customers', + }, + ], + }), + ), + ).toBe('partial'); + }); + it('returns error when failed work units produced no saved memory', () => { expect(ingestReportOutcome(report({ workUnits: [failedWorkUnit], failedWorkUnits: ['bad'] }))).toBe('error'); }); diff --git a/packages/cli/test/context/ingest/stages/stage-3-work-units.test.ts b/packages/cli/test/context/ingest/stages/stage-3-work-units.test.ts index f4ec9ae9..6cc71fbe 100644 --- a/packages/cli/test/context/ingest/stages/stage-3-work-units.test.ts +++ b/packages/cli/test/context/ingest/stages/stage-3-work-units.test.ts @@ -122,22 +122,20 @@ describe('Stage 3 — executeWorkUnit', () => { expect(deps.resetHardTo).toHaveBeenCalledWith('pre'); }); - it('dangling wiki refs reset to the pre-WU SHA and mark WU failed after the agent loop', async () => { + it('dangling wiki refs are deferred to the final gate and preserve actions', async () => { const deps = makeDeps(); deps.sessionWorktreeGit.revParseHead = vi.fn().mockResolvedValueOnce('pre').mockResolvedValueOnce('post'); deps.agentRunner.runLoop = vi.fn().mockImplementation(() => { deps.sessionActions.push({ target: 'wiki', type: 'created', key: 'page-a', detail: 'Page A' }); return Promise.resolve({ stopReason: 'natural' }); }); - (deps as any).validateWikiRefs = vi.fn().mockResolvedValue(['page-a -> page-b']); const outcome = await 
executeWorkUnit(deps, makeWu()); - expect(outcome.status).toBe('failed'); - expect(outcome.reason).toContain('wiki references target missing page(s): page-a -> page-b'); - expect(outcome.actions).toEqual([]); + expect(outcome.status).toBe('success'); + expect(outcome.actions.map((action) => action.key)).toEqual(['page-a']); expect(outcome.touchedSlSources).toEqual([]); - expect(deps.resetHardTo).toHaveBeenCalledWith('pre'); + expect(deps.resetHardTo).not.toHaveBeenCalled(); }); it('resolved wiki refs pass post-WU validation and preserve actions', async () => { @@ -148,7 +146,6 @@ describe('Stage 3 — executeWorkUnit', () => { deps.sessionActions.push({ target: 'wiki', type: 'created', key: 'page-b', detail: 'Page B' }); return Promise.resolve({ stopReason: 'natural' }); }); - (deps as any).validateWikiRefs = vi.fn().mockResolvedValue([]); const outcome = await executeWorkUnit(deps, makeWu()); diff --git a/packages/cli/test/context/ingest/stages/validate-wu-sources.test.ts b/packages/cli/test/context/ingest/stages/validate-wu-sources.test.ts index 51508510..e40393d2 100644 --- a/packages/cli/test/context/ingest/stages/validate-wu-sources.test.ts +++ b/packages/cli/test/context/ingest/stages/validate-wu-sources.test.ts @@ -35,7 +35,7 @@ describe('validateWuTouchedSources', () => { ]); expect(result.validSources).toEqual(['warehouse-a:good']); - expect(result.invalidSources).toEqual([{ source: 'warehouse-b:bad', errors: ['invalid measure'] }]); + expect(result.invalidSources).toMatchObject([{ source: 'warehouse-b:bad', errors: ['invalid measure'] }]); }); it('returns empty arrays when no sources are touched', async () => { @@ -85,7 +85,7 @@ describe('validateWuTouchedSources', () => { ]); expect(result.validSources).toEqual([]); - expect(result.invalidSources).toEqual([ + expect(result.invalidSources).toMatchObject([ { source: 'warehouse:mart_account_segments', errors: ['join target "accounts" does not exist'], @@ -108,10 +108,12 @@ describe('validateWuTouchedSources', 
() => { const result = await validateWuTouchedSources(deps, [{ connectionId: 'warehouse', sourceName: 'accounts' }]); - expect(result.invalidSources).toContainEqual({ - source: 'warehouse:orders', - errors: ['join target "accounts" does not exist'], - }); + expect(result.invalidSources).toContainEqual( + expect.objectContaining({ + source: 'warehouse:orders', + errors: ['join target "accounts" does not exist'], + }), + ); }); it('rejects join targets that match a source name only case-insensitively', async () => { @@ -127,7 +129,7 @@ describe('validateWuTouchedSources', () => { const result = await validateWuTouchedSources(deps, [{ connectionId: 'warehouse', sourceName: 'orders' }]); - expect(result.invalidSources).toEqual([ + expect(result.invalidSources).toMatchObject([ { source: 'warehouse:orders', errors: [ @@ -154,6 +156,50 @@ describe('validateWuTouchedSources', () => { expect(result.invalidSources).toEqual([]); expect(result.validSources).toEqual(['warehouse:touched_source']); }); + + it('preserves structured source-validation and missing-join issues', async () => { + const result = await validateWuTouchedSources( + { + semanticLayerService: { + loadAllSources: vi.fn().mockResolvedValue({ + sources: [ + { + name: 'orders', + columns: [], + joins: [{ to: 'customers_missing', on: 'orders.customer_id = customers_missing.id' }], + measures: [], + segments: [], + }, + ], + loadErrors: [], + }), + } as never, + connections: {} as never, + configService: {} as never, + gitService: {} as never, + slSourcesRepository: {} as never, + probeRowCount: 0, + slValidator: { validateSingleSource: vi.fn().mockResolvedValue({ errors: ['dry run failed'], warnings: [] }) }, + }, + [{ connectionId: 'warehouse', sourceName: 'orders' }], + ); + + expect(result.invalidSources).toEqual([ + { + source: 'warehouse:orders', + errors: ['dry run failed', 'join target "customers_missing" does not exist'], + issues: [ + { kind: 'source_validation', message: 'dry run failed' }, + { + kind: 
'missing_join_target', + targetSourceName: 'customers_missing', + caseMismatch: null, + message: 'join target "customers_missing" does not exist', + }, + ], + }, + ]); + }); }); describe('formatInvalidWuSources', () => { diff --git a/packages/cli/test/context/ingest/wiki-body-refs.test.ts b/packages/cli/test/context/ingest/wiki-body-refs.test.ts index 578dc600..246db5c9 100644 --- a/packages/cli/test/context/ingest/wiki-body-refs.test.ts +++ b/packages/cli/test/context/ingest/wiki-body-refs.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from 'vitest'; -import { findInvalidWikiBodyRefs, parseWikiBodyRefs } from '../../../src/context/ingest/wiki-body-refs.js'; +import { + findInvalidWikiBodyRefIssues, + findInvalidWikiBodyRefs, + parseWikiBodyRefs, +} from '../../../src/context/ingest/wiki-body-refs.js'; const sources = [ { @@ -31,10 +35,32 @@ describe('wiki body refs', () => { ].join('\n'); expect(parseWikiBodyRefs(body)).toEqual([ - { kind: 'sl_entity', connectionId: null, sourceName: 'mart_account_segments', entityName: 'total_contract_arr' }, - { kind: 'sl_source', connectionId: null, sourceName: 'mart_account_segments' }, - { kind: 'sl_entity', connectionId: 'warehouse', sourceName: 'mart_account_segments', entityName: 'segment' }, - { kind: 'table', connectionId: null, tableRef: 'analytics.mart_account_segments' }, + { + kind: 'sl_entity', + connectionId: null, + sourceName: 'mart_account_segments', + entityName: 'total_contract_arr', + rawToken: 'mart_account_segments.total_contract_arr', + }, + { + kind: 'sl_source', + connectionId: null, + sourceName: 'mart_account_segments', + rawToken: 'source:mart_account_segments', + }, + { + kind: 'sl_entity', + connectionId: 'warehouse', + sourceName: 'mart_account_segments', + entityName: 'segment', + rawToken: 'warehouse/mart_account_segments.segment', + }, + { + kind: 'table', + connectionId: null, + tableRef: 'analytics.mart_account_segments', + rawToken: 'table:analytics.mart_account_segments', + }, ]); 
}); @@ -150,4 +176,41 @@ describe('wiki body refs', () => { 'account-segments: unknown raw table analytics.missing_table', ]); }); + + it('returns structured body ref issues with raw tokens', async () => { + const issues = await findInvalidWikiBodyRefIssues({ + pageKey: 'revenue', + body: '`orders.missing_measure`\n`source:missing_source`\n`table:analytics.missing_table`', + visibleConnectionIds: ['warehouse'], + loadSources: async () => [ + { name: 'orders', columns: [{ name: 'id', type: 'number' }], measures: [], segments: [], joins: [] }, + ] as never, + tableExists: async () => false, + }); + + expect(issues).toEqual([ + { + kind: 'missing_wiki_body_sl_entity', + pageKey: 'revenue', + rawToken: 'orders.missing_measure', + sourceName: 'orders', + entityName: 'missing_measure', + message: 'revenue: unknown semantic-layer entity orders.missing_measure', + }, + { + kind: 'missing_wiki_body_sl_source', + pageKey: 'revenue', + rawToken: 'source:missing_source', + sourceName: 'missing_source', + message: 'revenue: unknown semantic-layer source missing_source', + }, + { + kind: 'missing_wiki_body_table', + pageKey: 'revenue', + rawToken: 'table:analytics.missing_table', + tableRef: 'analytics.missing_table', + message: 'revenue: unknown raw table analytics.missing_table', + }, + ]); + }); }); diff --git a/packages/cli/test/context/ingest/work-unit-cache.test.ts b/packages/cli/test/context/ingest/work-unit-cache.test.ts new file mode 100644 index 00000000..83653132 --- /dev/null +++ b/packages/cli/test/context/ingest/work-unit-cache.test.ts @@ -0,0 +1,181 @@ +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { + computeIngestWorkUnitInputHash, + isPruneShapedCachedReplayBase, +} from '../../../src/context/ingest/work-unit-cache.js'; +import type { WorkUnit } from '../../../src/context/ingest/types.js'; + 
+describe('ingest work-unit cache', () => {
+  let tempDir: string;
+
+  beforeEach(async () => {
+    tempDir = await mkdtemp(join(tmpdir(), 'ktx-wu-cache-'));
+    await mkdir(join(tempDir, 'models'), { recursive: true });
+    await writeFile(join(tempDir, 'models/orders.sql'), 'select * from raw.orders\n', 'utf-8');
+    await writeFile(join(tempDir, 'models/customers.sql'), 'select * from raw.customers\n', 'utf-8');
+  });
+
+  afterEach(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  // Default unit: `orders` reads its own model and depends on `customers`; tests override fields as needed.
+  function unit(overrides: Partial<WorkUnit> = {}): WorkUnit {
+    return {
+      unitKey: 'orders',
+      rawFiles: ['models/orders.sql'],
+      peerFileIndex: [],
+      dependencyPaths: ['models/customers.sql'],
+      ...overrides,
+    };
+  }
+
+  it('hashes raw and dependency file bytes with stable source identity', async () => {
+    const first = await computeIngestWorkUnitInputHash({
+      stagedDir: tempDir,
+      connectionId: 'warehouse',
+      sourceKey: 'dbt',
+      unit: unit(),
+      cliVersion: '0.13.1',
+      promptFingerprint: 'prompt-v1',
+      modelRole: 'default',
+    });
+    const second = await computeIngestWorkUnitInputHash({
+      stagedDir: tempDir,
+      connectionId: 'warehouse',
+      sourceKey: 'dbt',
+      unit: unit(),
+      cliVersion: '0.13.1',
+      promptFingerprint: 'prompt-v1',
+      modelRole: 'default',
+    });
+
+    expect(first).toMatch(/^[a-f0-9]{64}$/);
+    expect(second).toBe(first);
+  });
+
+  it('changes when one raw file changes and keeps unrelated units stable', async () => {
+    // Identity/version inputs are held constant so only the unit's file bytes vary.
+    const hashOf = (target: WorkUnit) =>
+      computeIngestWorkUnitInputHash({
+        stagedDir: tempDir,
+        connectionId: 'warehouse',
+        sourceKey: 'dbt',
+        unit: target,
+        cliVersion: '0.13.1',
+        promptFingerprint: 'prompt-v1',
+        modelRole: 'default',
+      });
+    // `customers` neither reads nor depends on models/orders.sql.
+    const customers = unit({ unitKey: 'customers', rawFiles: ['models/customers.sql'], dependencyPaths: [] });
+
+    const before = await hashOf(unit());
+    const unrelatedBefore = await hashOf(customers);
+
+    await writeFile(join(tempDir, 'models/orders.sql'), 'select id from raw.orders\n', 'utf-8');
+
+    const after = await hashOf(unit());
+    const unrelated = await hashOf(customers);
+
+    expect(after).not.toBe(before);
+    expect(unrelated).not.toBe(after);
+    // Stability is what the title promises: editing one model must not bust
+    // the cache entry of a unit that never read it.
+    expect(unrelated).toBe(unrelatedBefore);
+  });
+
+  it('changes when version, prompt fingerprint, or model role changes', async () => {
+    const base = {
+      stagedDir: tempDir,
+      connectionId: 'warehouse',
+      sourceKey: 'dbt',
+      unit: unit(),
+      cliVersion: '0.13.1',
+      promptFingerprint: 'prompt-v1',
+      modelRole: 'default' as const,
+    };
+    const hash = await computeIngestWorkUnitInputHash(base);
+
+    await expect(computeIngestWorkUnitInputHash({ ...base, cliVersion: '0.13.2' })).resolves.not.toBe(hash);
+    await expect(computeIngestWorkUnitInputHash({ ...base, promptFingerprint: 'prompt-v2' })).resolves.not.toBe(hash);
+    await expect(computeIngestWorkUnitInputHash({ ...base, modelRole: 'repair' })).resolves.not.toBe(hash);
+  });
+
+  it('hashes a missing dependency as a stable missing marker', async () => {
+    const input = {
+      stagedDir: tempDir,
+      connectionId: 'warehouse',
+      sourceKey: 'dbt',
+      unit: unit({ dependencyPaths: ['models/missing.sql'] }),
+      cliVersion: '0.13.1',
+      promptFingerprint: 'prompt-v1',
+      modelRole: 'default' as const,
+    };
+    const hash = await computeIngestWorkUnitInputHash(input);
+
+    expect(hash).toMatch(/^[a-f0-9]{64}$/);
+    // "Stable": a dependency that is still missing hashes identically on a re-run ...
+    await expect(computeIngestWorkUnitInputHash(input)).resolves.toBe(hash);
+    // ... and must not collide with the hash taken when the dependency exists.
+    await expect(computeIngestWorkUnitInputHash({ ...input, unit: unit() })).resolves.not.toBe(hash);
+  });
+
+  it('recognizes a semantic-layer file that differs only by pruned joins', () => {
+    const output = [
+      'name: orders',
+      'grain: [order_id]',
+      'columns: [{name: order_id, type: string}, {name: customer_id, type: string}]',
+      'joins:',
+      '  - to: customers',
+      '    on: orders.customer_id = customers.customer_id',
+      'measures: []',
+      '',
+    ].join('\n');
+    const current = [
+      'name: orders',
+      'grain: [order_id]',
+      'columns: [{name: order_id, type: string}, {name: customer_id, type: string}]',
+      'joins: []',
+      'measures: []',
+      '',
+    ].join('\n');
+
+    expect(isPruneShapedCachedReplayBase('semantic-layer/warehouse/orders.yaml', current, output)).toBe(true);
+    expect(isPruneShapedCachedReplayBase('semantic-layer/warehouse/orders.yaml', current.replace('order_id', 'id'), output)).toBe(
+      false,
+    );
+  });
+
+  it('recognizes a wiki page that differs only by pruned refs and inline body refs', () => {
+    const output = [
+      '---',
+      'summary: Revenue',
+      'usage_mode: auto',
+      'refs:',
+      '  - missing-page',
+      'sl_refs:',
+      '  - missing_source',
+      '---',
+      '',
+      'Revenue uses [[missing-page]], `source:missing_source`, and `orders.missing_measure`.',
+      '',
+    ].join('\n');
+    const current = [
+      '---',
+      'summary: Revenue',
+      'usage_mode: auto',
+      'refs: []',
+      'sl_refs: []',
+      '---',
+      '',
+      'Revenue uses, and.',
+      '',
+    ].join('\n');
+
+    expect(isPruneShapedCachedReplayBase('wiki/global/revenue.md', current, output)).toBe(true);
+    expect(isPruneShapedCachedReplayBase('wiki/global/revenue.md', current.replace('Revenue', 'ARR'), output)).toBe(false);
+  });
+});
diff --git a/packages/cli/test/context/scan/enrichment-state.test.ts b/packages/cli/test/context/scan/enrichment-state.test.ts
index d2c37b39..797a74d5 100644
--- a/packages/cli/test/context/scan/enrichment-state.test.ts
+++ b/packages/cli/test/context/scan/enrichment-state.test.ts
@@ -228,8 +228,34 @@ describe('scan enrichment state', () => {
     ]);
   });
 
-  it('recreates the resume cache when an older primary key shape is found', async () => {
-    const dbPath = join(tempDir, 'legacy.sqlite');
+  it('round-trips a relationships-mode stage through listRunStages', async () => {
+    // A relationships-mode scan persists the relationships stage with
+    // mode 'relationships'; listRunStages must accept it, not reject it as
+    // invalid metadata (the mode allowlist once omitted 'relationships').
+    await store.saveCompletedStage({
+      runId: 'scan-run-rel',
+      connectionId: 'warehouse',
+      syncId: 'sync-rel',
+      mode: 'relationships',
+      stage: 'relationships',
+      inputHash: 'rel-hash',
+      output: { relationshipUpdate: null },
+      updatedAt: '2026-04-29T12:03:00.000Z',
+    });
+
+    await expect(store.listRunStages('scan-run-rel')).resolves.toEqual([
+      expect.objectContaining({
+        runId: 'scan-run-rel',
+        syncId: 'sync-rel',
+        mode: 'relationships',
+        stage: 'relationships',
+        status: 'completed',
+      }),
+    ]);
+  });
+
+  it('uses the shared content-result cache and ignores the obsolete scan-specific table', async () => {
+    const dbPath = join(tempDir, 'shared.sqlite');
     const legacy = new Database(dbPath);
     legacy.exec(`
       CREATE TABLE local_scan_enrichment_stages (
@@ -243,7 +269,7 @@ describe('scan enrichment state', () => {
         output_json TEXT,
         error_message TEXT,
         updated_at TEXT NOT NULL,
-        PRIMARY KEY (run_id, stage)
+        PRIMARY KEY (connection_id, stage, input_hash)
       );
       INSERT INTO local_scan_enrichment_stages VALUES
         ('old-run', 'descriptions', 'hash', 'warehouse', 'sync', 'enriched', 'completed', 'null', NULL, '2026-01-01T00:00:00.000Z');
@@ -251,8 +277,6 @@ describe('scan enrichment state', () => {
     legacy.close();
 
     const recreated = new SqliteLocalScanEnrichmentStateStore({ dbPath });
-    // The legacy row is dropped with the old table; the new key shape is in
-    // force, so a fresh save + lookup round-trips cleanly.
     await recreated.saveCompletedStage({
       runId: 'new-run',
       connectionId: 'warehouse',
@@ -261,14 +285,41 @@ describe('scan enrichment state', () => {
       stage: 'descriptions',
       inputHash: 'hash',
       output: ['fresh'],
-      updatedAt: '2026-02-01T00:00:00.000Z',
+      updatedAt: '2026-06-25T00:00:00.000Z',
     });
+
     await expect(
       recreated.findCompletedStage({ connectionId: 'warehouse', stage: 'descriptions', inputHash: 'hash' }),
     ).resolves.toMatchObject({ runId: 'new-run', output: ['fresh'] });
     await expect(recreated.listRunStages('old-run')).resolves.toEqual([]);
   });
 
+  it('lists scan stages through the shared cache metadata', async () => {
+    await store.saveCompletedStage({
+      runId: 'scan-run-shared',
+      connectionId: 'warehouse',
+      syncId: 'sync-shared',
+      mode: 'enriched',
+      stage: 'relationships',
+      inputHash: 'relationship-hash',
+      output: { accepted: [] },
+      updatedAt: '2026-06-25T00:00:00.000Z',
+    });
+
+    await expect(store.listRunStages('scan-run-shared')).resolves.toEqual([
+      expect.objectContaining({
+        runId: 'scan-run-shared',
+        connectionId: 'warehouse',
+        syncId: 'sync-shared',
+        mode: 'enriched',
+        stage: 'relationships',
+        inputHash: 'relationship-hash',
+        status: 'completed',
+        output: { accepted: [] },
+      }),
+    ]);
+  });
+
   it('summarizes resumed, completed, and failed stages for reports', () => {
     expect(
       summarizeKtxScanEnrichmentState({
diff --git a/packages/cli/test/context/scan/local-enrichment.test.ts b/packages/cli/test/context/scan/local-enrichment.test.ts
index 2db86ac3..61cdbcbc 100644
--- a/packages/cli/test/context/scan/local-enrichment.test.ts
+++ b/packages/cli/test/context/scan/local-enrichment.test.ts
@@ -968,6 +968,76 @@ describe('local scan enrichment', () => {
     }
   });
 
+  it('checkpoints recomputed embeddings before relationships even when descriptions load from disk', async () => {
+    const executor = new InMemorySqliteExecutor();
+    try {
+      executor.db.exec(`
+        CREATE TABLE accounts (id INTEGER NOT NULL);
+        CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
+        INSERT INTO accounts (id) VALUES (1), (2);
+        INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
+      `);
+      const priorDescriptions: Array<{
+        table: { catalog: null; db: null; name: string };
+        tableDescription: string | null;
+        columnDescriptions: Record<string, string>;
+      }> = [
+        {
+          table: { catalog: null, db: null, name: 'orders' },
+          tableDescription: 'Customer purchase orders',
+          columnDescriptions: { id: 'Order identifier', account_id: 'The owning account reference' },
+        },
+        {
+          table: { catalog: null, db: null, name: 'accounts' },
+          tableDescription: 'Account records',
+          columnDescriptions: { id: 'Account identifier' },
+        },
+      ];
+      const scanConnector = {
+        ...connector(),
+        driver: 'sqlite' as const,
+        capabilities: createKtxConnectorCapabilities({ readOnlySql: true, columnStats: true }),
+        introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
+        executeReadOnly: executor.executeReadOnly.bind(executor),
+      };
+      const checkpoints: Array<Parameters<NonNullable<Parameters<typeof runLocalScanEnrichment>[0]['onCheckpoint']>>[0]> = [];
+
+      await runLocalScanEnrichment({
+        connectionId: 'warehouse',
+        mode: 'enriched',
+        detectRelationships: true,
+        connector: scanConnector,
+        context: { runId: 'embeddings-rel-checkpoint' },
+        providers: {
+          ...createDeterministicLocalScanEnrichmentProviders(),
+          embedding: fakeScanEmbedding({ dimensions: 6 }),
+        },
+        stages: ['embeddings', 'relationships'],
+        loadPriorDescriptions: async () => priorDescriptions,
+        onCheckpoint: async (checkpoint) => {
+          checkpoints.push(checkpoint);
+        },
+      });
+
+      expect(checkpoints).toHaveLength(1);
+      const checkpoint = checkpoints[0];
+      if (!checkpoint) {
+        throw new Error('Expected a checkpoint');
+      }
+      // Descriptions were loaded from disk (not re-run), but the recomputed
+      // embeddings are promoted before the kill-prone relationship stage.
+      expect(checkpoint.summary.tableDescriptions).toBe('skipped');
+      expect(checkpoint.summary.embeddings).toBe('completed');
+      expect(checkpoint.embeddingUpdates.length).toBeGreaterThan(0);
+      // The checkpoint must carry the on-disk descriptions, not an empty set, or
+      // the manifest merge would delete them (D3).
+      expect(checkpoint.descriptionUpdates.length).toBeGreaterThan(0);
+      expect(checkpoint.relationshipUpdate).toBeNull();
+    } finally {
+      executor.close();
+    }
+  });
+
   it('does not checkpoint when relationship detection is skipped', async () => {
     const onCheckpoint = vi.fn(async () => {});
     await runLocalScanEnrichment({
diff --git a/spider2-specs/specs/22-resumable-and-fault-tolerant-source-ingest.md b/spider2-specs/specs/22-resumable-and-fault-tolerant-source-ingest.md
new file mode 100644
index 00000000..15d1a861
--- /dev/null
+++ b/spider2-specs/specs/22-resumable-and-fault-tolerant-source-ingest.md
@@ -0,0 +1,463 @@
+# Resumable and fault-tolerant source ingest
+
+> Refined spec. No intake draft — surfaced by a real user report, not the
+> playground agent (see Motivation). Lives beside the analogous scan-durability
+> specs 19/20.
+>
+> **Scope: make `ktx ingest` (the source-ingest work-unit pipeline behind dbt /
+> Metabase / Notion) survive interruption and partial failure on large
+> projects.** Two compounding gaps live on the source-ingest path: (1) an
+> interrupted run restarts every work unit from scratch — there is no cross-run
+> reuse of already-generated work-unit output, so a multi-day dbt ingest loses
+> *all* progress to a single VPN/network blip; (2) the final integration gate is
+> all-or-nothing — one artifact that cannot pass it (after LLM repair) discards
+> the **entire** run with nothing committed. This is the source-ingest analog of
+> spec 19 (move the durability boundary to the cost boundary so expensive LLM
+> work is not lost) and spec 20 (a stage survives an interruption with per-item
+> durability).
It **reuses** the same content-keyed durability primitive those +> specs established rather than copying it. + +## Problem + +Two independent failure modes on the source-ingest work-unit (WU) pipeline, +both confirmed in the current code, both observed by a user on a ~2-day dbt +ingest. Their union makes large-project ingest brittle: any interruption is +total loss, and any single unfixable artifact at the end is total loss. + +### 1. An interrupted run resumes nothing — every work unit re-runs + +`IngestBundleRunner` (`context/ingest/ingest-bundle.runner.ts`) executes a run as +a sequence of stages: fetch → parse/extract into **work units** → run each WU as +an isolated agent loop in a child worktree (`runIsolatedWorkUnit` → +`executeWorkUnit`, `stages/stage-3-work-units.ts`) → integrate the successful WU +patches → reconcile → finalize → final gates → one atomic squash commit +(`squashMergeIntoMain`, ~2716). The WU stage is where the LLM cost lives: each WU +is an agent loop that reads its `rawFiles`/`dependencyPaths` and writes SL/wiki +artifacts, producing a git patch (`WorkUnitOutcome.patchPath` / +`patchTouchedPaths`, `stage-3-work-units.ts:31-46`). + +The only persisted cross-run state is `SqliteBundleIngestStore` +(`context/ingest/sqlite-bundle-ingest-store.ts`): run metadata, the final report, +and provenance — all written at or near **run completion**. There is **no +checkpoint of completed WU output**. A run that dies mid-flight (the user's +VPN/network drop) leaves nothing reusable: the next `ktx ingest` re-fetches, +re-parses, and **re-executes every WU from scratch**, re-paying the entire LLM +cost. The store even keys `job_id` UNIQUE, so a re-run is a brand-new job with no +relationship to the interrupted one. + +> Observed (user report, large dbt project): a run reached deep into its +> work-unit progress and was lost to a network blip; the follow-up run started +> over from zero. 
On a ~2-day ingest this is the difference between a 5-minute +> resume and a 2-day redo. + +### 2. The final integration gate is all-or-nothing + +After all surviving WUs are integrated, `validateFinalIngestArtifacts` +(`context/ingest/artifact-gates.ts:96`) runs the final gate. It checks, across +the *integrated* tree: + +- **intrinsic source validity** — `validateTouchedSources` → + `validateWuTouchedSources` (`stages/validate-wu-sources.ts:124`) → + `validateSingleSource` (`context/sl/tools/sl-warehouse-validation.ts:56`), + which runs a **live warehouse dry-run** (`SELECT * FROM (sql) LIMIT 1`); +- **cross-artifact references** — dangling join targets + (`findJoinTargetErrors`, `validate-wu-sources.ts:89`), dangling `wiki→wiki` + refs (`validateWikiRefs` → `findMissingWikiRefs`), broken `wiki→sl_ref`s + (`validateWikiSlRefs`, `artifact-gates.ts:39`), and broken wiki body refs + (`findInvalidWikiBodyRefs`). + +On any error it **`throw`s a single concatenated string** (`artifact-gates.ts:129`). +The runner catches it, runs the LLM repair `repairFinalGateFailure` +(`runner.ts:2595`, `maxAttempts: 2`), and if repair still fails, **re-throws** +(`runner.ts:2623`) → `markFailed` → the squash never runs → `commitSha: null` +(`runner.ts:2729`) → **the whole run is discarded, nothing committed.** + +The crucial asymmetry: a WU that fails *on its own terms* never reaches this gate +— `executeWorkUnit` already validates each WU in isolation (`validateWikiRefs` +~143, `validateTouchedSources` ~150) and **soft-fails** it (`failWithReset`, +~155: the WU resets, is excluded from integration, and the run continues). So by +the time the final gate runs, intrinsic single-source failures are rare. The +gate fails predominantly on **cross-artifact dangling references**: WU-A's source +joins to a source WU-B was meant to create, but WU-B failed/was-excluded, so +A's join now points at nothing. 
Each WU passed *alone*; the break only appears +once the survivors are integrated — and that break currently nukes the run. + +> Observed (user report): a run completed all task generation and then failed at +> the final integration gate on a **single model**; because the gate is +> all-or-nothing, that one failure discarded an ~18h run with nothing committed. + +## Generic use case (independent of any benchmark) + +Anyone ingesting a large warehouse/BI/dbt project with an LLM pipeline will hit +both failures. Large ingests run long enough that an interruption is a *when*, +not an *if* (laptop sleep, VPN reconnect, transient provider error, an operator +ctrl-C on an apparently-stuck run), and a large artifact set makes it +near-certain that *some* model lands a cross-reference its sibling didn't +produce. Without cross-run reuse, every interruption is a from-scratch redo of +the dominant (LLM) cost; without partial commit, one unfixable artifact throws +away every good one. Both fixes make large-project ingest **resilient and +resumable**: an interruption costs only the unfinished work, and a single bad +model costs only that model — not the run. This is core robustness for a +general-purpose ingestion product. + +## Design decisions (resolved during refinement) + +These resolve the design space explored during refinement. They constrain the +implementer; the exact code is theirs (requirement-level, per the specs README). + +### D1 — Resume is automatic and content-keyed at the work-unit level + +A successful WU's output is cached across runs, keyed by a **content hash of its +inputs**, with **no `--resume` flag**. Re-running the same `ktx ingest` +transparently replays any WU whose inputs are byte-identical to a cached success +and re-runs only the changed, failed, or missing WUs. 
The key is computed over: +the contents of the WU's `rawFiles` + `dependencyPaths` (the bytes the WU reads, +`types.ts:19-28`), the adapter/source identity, and a **version/prompt +fingerprint** (ktx version + the WU system/user prompt + model role). A changed +dbt model busts only that model's entry; everything unchanged replays for free. + +> No flag, no config knob. Content-keying makes resume automatic; a flag would +> double the state space for no benefit. This is the same shape scan uses +> (`computeKtxScanEnrichmentInputHash`, spec 19), reached here for the WU +> pipeline. + +### D2 — The cached unit is the successful WU's patch; replay verifies or recomputes + +The cache stores a successful WU's **output artifacts**: its git patch +(`patchPath` content / `patchTouchedPaths`) plus the metadata integration needs +(`actions`, `touchedSlSources`, `slDisallowed`). On a cache hit, the runner +**replays the patch** into the session worktree — no agent loop, no LLM — exactly +where it would have integrated a freshly-run WU. If a cached patch **fails to +apply** (the surrounding tree drifted), the entry is discarded and the WU +**recomputes**. So a stale hit degrades to "recompute," never to a corrupt tree: +the cache can only make a run faster, never wrong. + +### D3 — One durability primitive, shared by scan and ingest + +Per the "one capability, one implementation" rule, the content-keyed store is +**extracted** into a shared primitive and **both** scan and ingest route through +it — not copied. Scan's `sqlite-local-enrichment-state-store.ts` (PK +`(connection_id, stage, input_hash)`, `findCompletedStage` / `saveCompletedStage`) +and its `inputHash` computation (`enrichment-state.ts`) are generalized to a +content-keyed result cache; scan is migrated onto the shared primitive **in the +same change** so no second copy exists even transiently. The ingest cache is a +new logical namespace (e.g. 
keyed `(connectionId, sourceKey, workUnitInputHash)`) +on that one store. + +> Extract-and-share in one PR, not "build a copy for ingest now, unify later." +> A temporary fork is exactly the divergence the rule forbids; the one-time +> extraction cost is paid once and both paths benefit from every later fix. + +### D4 — Only successes are cached; failures retry on the next run + +A failed WU is **not** recorded as terminal — the next run retries it. WU +failures on this path are dominantly transient (network, provider stall, an LLM +slip), and the user's explicit ask is "resume and finish the rest," so a failure +must not be sticky. This deliberately differs from scan's stage store (which +caches failed stages and re-throws): there the failure is the stage's +deterministic verdict; here a WU failure is usually a blip to retry. Caching only +successes also keeps the invariant simple — a cache entry always means "this +exact input already produced this exact good output." + +### D5 — The final gate becomes non-fatal: deterministic dangling-edge prune + +Replace the gate's fatal `throw`-after-repair with a deterministic reconciliation +that always yields a committable, internally-consistent tree: + +1. `validateFinalIngestArtifacts` is refactored to **return structured findings** + (the danglers it already computes internally — join targets, `wiki→wiki`, + `wiki→sl_ref`, wiki body refs — plus any intrinsic source failure) instead of + flattening them into a thrown string. +2. **Drop the rare self-invalid source first.** A source that fails its *own* + validation at the final gate (intrinsic — rare, since stage 3 already filters + these) is removed, establishing the surviving artifact set. +3. **Prune the dead edges in a single pass** over that surviving set. 
For each + dangling reference — whether it pointed at an absent sibling or at a + just-dropped source — **remove that reference from its owner** (drop the join + entry, remove the `wiki ref` / `sl_ref`, remove the broken body link), keeping + the owning artifact. Because nodes are dropped first (step 2) and pruning only + removes edges, pruning **cannot create a new dangling edge, so one pass + suffices; no fixpoint.** +4. Re-run the gate to **confirm** the remainder is clean (warehouse dry-runs are + cached per D6/D2, ref checks are in-memory, so this is cheap), then squash-commit + the remainder. If the confirm pass *still* fails, that is a real bug — fail the + run loudly rather than commit a dirty tree. + +`repairFinalGateFailure` (the LLM repair, `runner.ts:2595` / `final-gate-repair.ts`) +is **removed**. The deterministic prune supersedes it for the referential class, +and the rare intrinsic case is handled by drop. + +> **Prune the edge, do not cascade the node.** The rejected alternative drops the +> *referencing artifact* and, transitively, everything that referenced *it* — a +> node-quarantine fixpoint that cascades healthy artifacts and needs a closure +> search, a confirm loop, and an un-apply step. Pruning the dead edge keeps the +> dependent intact (minus one pointer that never resolved anyway), needs no +> fixpoint, and acts on findings the gate already produces. +> +> **Why remove the LLM repair rather than keep it as a pre-prune step.** Repair +> can occasionally *fix* a ref (e.g. correct a typo'd source name) where prune +> merely deletes it, preserving marginally more content. We drop it anyway: +> determinism beats an LLM round-trip with variance on the commit path, prune +> guarantees a commit where repair could only `throw`, and deleting it is a net +> maintenance reduction. The decision is reversible — repair could later run as a +> best-effort pass *before* prune — but the default is prune-only. 
+ +### D6 — Prune runs on the integrated tree, never poisons the cache (resume ∘ prune compose) + +Pruning is applied to the **integrated session worktree** at gate time and is +**re-derived from the current survivor set on every run**. It MUST NOT mutate the +cached WU patches (D2). This makes resume and prune compose correctly and +**self-heal**: + +- Run 1: WU-A (joins to B) succeeds and is cached *with its join intact*; WU-B + fails; the gate prunes A's join-to-B from the integrated tree and commits A + without it. +- Run 2 (after the root cause is fixed): A's input is unchanged → A **replays + from cache with its join restored**; B now succeeds and exists; the gate finds + no dangler and commits both, fully linked. + +So a ref pruned because of a sibling's failure costs nothing permanent: fixing +the sibling and re-running restores the link for free. The cache stores +intent (the WU's real output); prune is a per-run consistency projection over +whatever survived. + +### D7 — Pruning is faithful and never silent + +A pruned reference was, by definition, non-functional (its target was absent), so +removing it loses nothing executable — and removing dangling SL joins is already +the established fix for the SL engine's eager orphan-join rejection. Every prune +and every drop MUST be **recorded in the run report and a trace event** naming +the artifact, the removed reference, and the absent target. The report status +MUST reflect partial completion (extend the existing `failedWorkUnits` +mechanism, `IngestBundleResult`, `types.ts:204-213`, with the pruned-refs / +dropped-sources detail) so a partial run is visibly partial, never a silent +"success." + +### D8 — Cache state is regenerable; no migration bridge + +The WU cache is regenerable local state under `.ktx/`. Its on-disk/SQLite shape +may change with **no migration bridge** — a stale-shaped or absent cache simply +forces a full (non-resumed) run, exactly today's behavior. 
Consistent with ktx's +no-backward-compatibility policy; the cache is an optimization, never a source of +truth. + +## Requirements + +1. **Cross-run WU resume, automatic and content-keyed.** A successful WU's output + MUST be cached keyed by a content hash over its input bytes + (`rawFiles` + `dependencyPaths`), the adapter/source identity, and a + version/prompt fingerprint (ktx version + WU prompt + model role). Re-running + `ktx ingest` MUST replay cached successes without an agent loop / LLM call and + re-run only changed, failed, or missing WUs. No `--resume` flag and no config + knob is added. +2. **Replay verifies or recomputes.** On a cache hit the runner MUST replay the + stored patch into the session worktree; if the patch does not apply cleanly the + entry MUST be discarded and the WU recomputed. A cache hit MUST NOT be able to + produce a tree different from what a fresh run of that WU would have integrated. +3. **Only successes are cached.** A failed WU MUST NOT be recorded as terminal; it + MUST be retried on the next run. +4. **Conservative invalidation.** The input hash MUST change when the ktx version, + the WU prompt, or the model role changes (bias toward recompute). Under-keying + (stale reuse) is a correctness bug; over-keying (an unnecessary recompute) is + acceptable. +5. **The final gate is non-fatal.** A final-gate failure MUST NOT discard the run. + `validateFinalIngestArtifacts` MUST return structured findings; the runner MUST + deterministically **prune** every dangling reference from its owning artifact + and **drop** any source that fails its own validation, then commit the + remaining internally-consistent tree. +6. **Single-pass prune, dependents survive.** Pruning MUST remove dead *edges*, not + cascade-drop owning artifacts; it MUST complete in a single pass (no fixpoint) + because edge removal cannot create new dangling edges. A dependent that loses + one dangling ref MUST otherwise be committed intact. +7. 
**Prune composes with resume.** Pruning MUST operate on the integrated tree and + MUST NOT mutate cached WU patches. A reference pruned in one run because its + target was absent MUST be restored automatically on a later run once the target + exists (resume replays the owner's intact patch). +8. **Confirm before commit.** After pruning/dropping, the gate MUST be re-run on + the remainder and MUST pass before the squash; if it still fails the run MUST + fail loudly rather than commit a dirty tree. +9. **`repairFinalGateFailure` is removed.** The LLM final-gate repair path and its + obsolete tests/branches MUST be deleted (no dormant compatibility path). +10. **Every prune/drop is reported.** Each pruned reference and dropped source MUST + be recorded in the run report and a trace event (artifact, removed ref, absent + target). A run that pruned or dropped anything MUST report as partial, never as + an unqualified success. +11. **One shared durability primitive.** The content-keyed store MUST be a single + implementation used by both scan and ingest; scan MUST be migrated onto it in + the same change. No second copy may exist, even transiently. +12. **No regression for clean runs.** A small, uninterrupted run whose every WU + passes and whose final gate is clean MUST produce byte-identical artifacts and + the same `commitSha`/report shape (modulo new, empty pruned/dropped fields) as + today. + +## Acceptance criteria + +- **Resume skips completed work:** interrupt an ingest after K of N WUs have + succeeded; re-run the same command (unchanged inputs); the run issues **zero** + agent loops / LLM calls for the K cached WUs, runs only the remaining N−K, and + produces the same final artifacts as an uninterrupted run. +- **Changed model busts only its entry:** edit one dbt model between runs; the + re-run re-executes **only** the WU(s) whose input bytes changed and replays the + rest from cache. 
+- **Stale patch self-corrects:** a cached patch that no longer applies (forced + drift in a test) causes that WU to recompute, not a corrupt tree or a crash. +- **Failures retry:** a WU that fails in run 1 (transient error) is **not** cached; + run 2 retries it and, on success, integrates it. +- **One bad model no longer nukes the run:** a run where WU-B fails so WU-A's join + to B dangles **commits** — A is committed with the dangling join **pruned**, the + report lists the pruned ref, and `commitSha` is non-null (contrast: today this + throws and commits nothing). +- **No cascade:** in that scenario A (and any other artifact that only referenced + B) is committed intact except for the single pruned reference; nothing healthy + is dropped. +- **Self-heal:** fix B's root cause and re-run; A replays from cache with its join + intact, B succeeds, and the final tree commits both fully linked with no prune. +- **Intrinsic drop:** a source that fails its own warehouse dry-run at the final + gate (forced) is dropped, refs to it are pruned, and the rest commits; the drop + is reported. +- **Repair is gone:** `repairFinalGateFailure` and its tests no longer exist; the + gate path has no LLM call. +- **One store:** scan and ingest both resume through the same content-keyed + primitive (one implementation; scan's behavior is unchanged by the migration — + spec 19/20 acceptance still passes). +- **Clean-run regression:** a small uninterrupted all-passing ingest yields + identical artifacts, `commitSha`, and report (empty pruned/dropped fields) to + today. + +## Non-goals + +- **Resuming the cross-WU stages.** Reconciliation, finalization, and the final + gate re-run every time; their inputs depend on the full survivor set and their + cost is small relative to WU generation. Only WU generation is cached. +- **A `--resume` flag or any timeout/cache config knob.** Content-keying makes + resume automatic (D1); one opinionated default is the canonical ktx shape. 
+- **Caching failed WUs as terminal.** Failures retry (D4). +- **Node-cascade quarantine of the final gate.** Prune edges, do not drop + dependents (D5). No closure search, confirm-loop-over-nodes, or un-apply step. +- **Tolerating dangling references (warn instead of remove).** Unsafe — the SL + engine eagerly rejects orphan joins — so dead edges must be removed, not kept. +- **Keeping the LLM final-gate repair.** Removed (D5/req 9). +- **A general per-stage resume framework beyond the shared content-keyed store.** + The store is the one shared primitive (D3); this spec does not abstract every + ingest stage into a resumable framework. +- **Re-implementing spec 19/20 (scan durability).** This spec composes the same + primitive onto the source-ingest WU pipeline. + +## Implementation orientation + +Line numbers drift; treat these as anchors, not addresses. The implementer owns +the design. + +- **Run flow + the all-or-nothing seam** — `context/ingest/ingest-bundle.runner.ts`: + WU run + integration of successful patches (~1600–1900), the final-gate block + (~2549–2587, `runFinalArtifactGates`), the repair-then-rethrow that must be + replaced by prune (~2588–2644; the fatal `throw` ~2623), and the atomic squash + (~2701–2729; `commitSha: null` when nothing is touched ~2729). The prune step + slots between the gate findings and the squash, operating on `sessionWorktree`. +- **Work units & cacheable output** — `context/ingest/types.ts` (`WorkUnit` + ~19–28: `rawFiles`/`peerFileIndex`/`dependencyPaths`; `IngestBundleResult` + ~204–213: extend with pruned/dropped detail); + `context/ingest/stages/stage-3-work-units.ts` (`executeWorkUnit`; the per-WU + validation + `failWithReset` ~134–157 that already soft-fails a WU; + `WorkUnitOutcome` ~31–46 with `patchPath`/`patchTouchedPaths`/`actions`/ + `touchedSlSources` — the cache payload). The cache lookup/replay wraps the + per-WU execution; only the agent-loop branch is skipped on a hit. 
+- **The gate (make it return findings)** — `context/ingest/artifact-gates.ts` + (`validateFinalIngestArtifacts` ~96; the internal per-artifact danglers from + `validateWikiSlRefs` ~39, `validateWikiRefs` ~74, `findInvalidWikiBodyRefs`; + the concatenated `throw` ~129 to replace with a structured return); + `context/ingest/stages/validate-wu-sources.ts` (`validateWuTouchedSources` ~124; + `findJoinTargetErrors` ~89 already returns missing join targets per source — + the join-edge danglers to prune); `context/sl/tools/sl-warehouse-validation.ts` + (`validateSingleSource` ~56 — the intrinsic warehouse dry-run; its failures are + the drop set, not the prune set). +- **Per-ref-type pruners (pair 1:1 with the validators)** — join: remove the + offending `joins[]` entry from the source YAML; `wiki refs`/`sl_refs`: remove + the entry from page frontmatter (`context/wiki/wiki-ref-validation.ts` + `findMissingWikiRefs`); wiki body refs: remove the broken link token + (`context/ingest/wiki-body-refs.ts` `findInvalidWikiBodyRefs`). Each pruner is + deterministic and edits the integrated worktree only. +- **Remove the LLM repair** — `context/ingest/final-gate-repair.ts` + (`repairFinalGateFailure`) and the `constrained-repair.ts` usage for + `final_artifact_gate`; delete the call site (~2595) and its tests. +- **Durability primitive to extract & share** — + `context/scan/sqlite-local-enrichment-state-store.ts` (`local_scan_enrichment_stages`, + PK `(connection_id, stage, input_hash)`, `findCompletedStage`/`saveCompletedStage`), + `context/scan/enrichment-state.ts` (`computeKtxScanEnrichmentInputHash` ~78), and + the resume wrapper `runEnrichmentStage` (`context/scan/local-enrichment.ts`). + Generalize to a content-keyed result cache; migrate scan onto it; add the ingest + namespace. The existing ingest store + `context/ingest/sqlite-bundle-ingest-store.ts` (`SqliteBundleIngestStore`) is + where ingest-side persistence lives — the WU cache sits alongside it under + `.ktx/`. 
+- **Tests** — resume: run an ingest against a real git-backed project with a fake + agent runner, interrupt after K WUs, assert the re-run issues no agent loops for + the K and the same artifacts result; changed-input bust; stale-patch recompute; + failed-WU retry. Prune: a fixture where one WU fails so a sibling's join/wiki + ref dangles → assert the run commits the sibling with the ref pruned, reports the + prune, and `commitSha` is non-null; assert no cascade; assert self-heal on a + follow-up run; assert intrinsic drop. Migration: spec 19/20 scan acceptance still + green on the shared primitive. Regression: a small uninterrupted all-passing + ingest is byte-identical to today. +- After implementing, rebuild and re-link so the playground picks it up: + `pnpm run build && pnpm run link:dev`. + +## Motivation (the real report, not a benchmark) + +A user ingesting a fairly large dbt project (~2-day run) hit both gaps together. +First, an interruption — a VPN drop / network blip — lost all progress because +ingest cannot resume; they had to restart from scratch. Second, on a later run +that completed all task generation, a **single model** failed the final +integration gate, and because the gate is all-or-nothing the one failure +discarded an ~18h run with nothing committed. Their ask: "some form of resume or +checkpoint (or at least reusing the patches that were already generated), and a +way to skip or quarantine a single failing model instead of failing the entire +run." This spec delivers both — resume via the content-keyed WU cache, and +partial commit via deterministic dangling-edge pruning. Unlike specs 19/20 this +gap was surfaced by a real user on a real warehouse, not by the benchmark; the +fix is generic production hygiene for any large ingest. + +## Implementation notes + +Shipped on branch `write-feature-spec-wiki` (squash-merge target). 
All 12 +requirements and every acceptance criterion are covered by committed code and +tests; the full `@kaelio/ktx` package suite is green. + +What was built and where: + +- **Shared content-keyed durability primitive** — `context/cache/content-result-cache.ts` + + `sqlite-content-result-cache.ts` (`SqliteContentResultCache`, `local_content_results`). + Scan was migrated onto it in the same change (`context/scan/sqlite-local-enrichment-state-store.ts` + is now a thin adapter; the old `local_scan_enrichment_stages` table is dropped), + so no second copy exists (D3 / req 11). +- **Content-keyed WU cache + replay** — `context/ingest/work-unit-cache.ts` + (`computeIngestWorkUnitInputHash` over raw/dependency bytes + source identity + + CLI version + prompt fingerprint + model role; success-only `saveSuccessfulWorkUnitCache`). + Replay/recompute and stale-recompute state refresh wrap the WU loop in + `ingest-bundle.runner.ts` (D1/D2/D4 / reqs 1–4). +- **Non-fatal final gate** — `artifact-gates.ts` `validateFinalIngestArtifacts` + returns structured findings; `context/ingest/final-gate-prune.ts` deterministically + drops self-invalid sources and prunes dangling edges in a single pass, then a + confirm gate runs before squash (D5/D6 / reqs 5–8). `finalGatePrunedReferences` + / `finalGateDroppedSources` are recorded in the report + trace and surface as a + `partial` outcome (D7 / req 10). `repairFinalGateFailure` and its tests are + deleted (req 9). + +Deviations / decisions worth noting (all preserve spec intent): + +- **Cache stores artifact content snapshots (payload schema v2), not just a raw + git patch.** Replay materializes the owner's artifacts against the *current* + base, so a ref pruned in one run because a sibling failed is restored for free + on a later run once the sibling exists — without re-running the owner's agent + loop (D2/D6 / req 7 self-heal). A drifted/stale snapshot degrades to recompute. 
+- **Final-gate prune/drop resolves sources through the canonical + `resolveSlSourceFile` resolver**, not a derived `semantic-layer/<connection>/<source>.yaml` + path, so it works for uppercase / hash-derived source filenames (not only + lowercase demo names). +- **`executeWorkUnit` defers pruneable cross-artifact findings** (missing join + target / wiki ref / sl_ref) to the final gate instead of soft-failing the WU; + only intrinsic `source_validation` failures remain fatal at the WU level. This + is what lets a sibling-failed WU's owner survive to be pruned rather than be + excluded upstream (reqs 5–7, "no cascade"). +- The raw report record keeps `status: 'completed'`; partial completion is derived + by `ingestReportOutcome` from the populated prune/drop fields.