fix(ingest): validate scan sources and wiki refs

This commit is contained in:
Andrey Avtomonov 2026-05-18 00:11:11 +02:00
parent 199c916549
commit 7cc9f0e70a
6 changed files with 300 additions and 46 deletions

View file

@ -168,6 +168,7 @@ const makeDeps = () => {
};
const wikiService = {
forWorktree: vi.fn(),
listPageKeys: vi.fn().mockResolvedValue([]),
readPage: vi.fn().mockResolvedValue(null),
syncFromCommit: vi.fn().mockResolvedValue(undefined),
};
@ -1573,6 +1574,7 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
workUnits: [{ unitKey: 'u1', rawFiles: ['semantic_models.yml'], peerFileIndex: [], dependencyPaths: [] }],
parseArtifacts: { semanticModels: [{ name: 'orders' }] },
});
deps.adapter.listTargetConnectionIds = vi.fn().mockResolvedValue(['warehouse-2']);
deps.semanticLayerService.loadAllSources.mockImplementation((connectionId: string) =>
Promise.resolve({ sources: [{ name: `${connectionId}_source` }], loadErrors: [] }),
);

View file

@ -17,6 +17,18 @@ type RuntimeWithConnectionDeps = {
};
};
type RuntimeWithSlValidationDeps = {
deps: {
slValidator: {
validateSingleSource(
deps: unknown,
connectionId: string,
sourceName: string,
): Promise<{ errors: string[]; warnings: string[] }>;
};
};
};
function testAgentRunner(): AgentRunnerPort {
return { runLoop: vi.fn().mockResolvedValue({ stopReason: 'natural' as const }) };
}
@ -144,6 +156,77 @@ describe('createLocalBundleIngestRuntime', () => {
]);
});
it('validates manifest-backed scan sources during local ingest gates', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
[
'tables:',
' payments:',
' table: public.payments',
' columns:',
' - name: payment_id',
' type: string',
' - name: amount',
' type: number',
'',
].join('\n'),
'ktx',
'ktx@example.com',
'Add warehouse manifest',
);
const agentRunner = testAgentRunner();
const runtime = createLocalBundleIngestRuntime({
project,
adapters: [new FakeSourceAdapter()],
agentRunner,
});
const deps = (runtime.runner as unknown as RuntimeWithSlValidationDeps).deps;
await expect(deps.slValidator.validateSingleSource(deps, 'warehouse', 'payments')).resolves.toEqual({
errors: [],
warnings: expect.any(Array),
});
});
it('does not mask malformed direct overlays with manifest-backed fallback validation', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
[
'tables:',
' payments:',
' table: public.payments',
' columns:',
' - name: payment_id',
' type: string',
'',
].join('\n'),
'ktx',
'ktx@example.com',
'Add warehouse manifest',
);
await project.fileStore.writeFile(
'semantic-layer/warehouse/payments.yaml',
['name: payments', 'columns:', ' - [', ''].join('\n'),
'ktx',
'ktx@example.com',
'Add malformed overlay',
);
const agentRunner = testAgentRunner();
const runtime = createLocalBundleIngestRuntime({
project,
adapters: [new FakeSourceAdapter()],
agentRunner,
});
const deps = (runtime.runner as unknown as RuntimeWithSlValidationDeps).deps;
await expect(deps.slValidator.validateSingleSource(deps, 'warehouse', 'payments')).resolves.toEqual({
errors: [expect.stringContaining('invalid YAML')],
warnings: [],
});
});
it('passes project connection config to local ingest query executors', async () => {
const agentRunner = testAgentRunner();
const queryExecutor = {

View file

@ -24,7 +24,6 @@ import {
type KtxConnectionInfo,
type KtxQueryResult,
SemanticLayerService,
type SemanticLayerSource,
type SlConnectionCatalogPort,
SlDiscoverTool,
SlEditSourceTool,
@ -248,22 +247,63 @@ class LocalSlPythonPort implements SlPythonPort {
}
class LocalShapeOnlySlValidator implements SlValidatorPort<SlValidationDeps> {
private validateParsedSource(sourceName: string, parsed: Record<string, unknown>) {
const isOverlay = parsed.table == null && parsed.sql == null;
const result = (isOverlay ? sourceOverlaySchema : sourceDefinitionSchema).safeParse(parsed);
return result.success
? { errors: [], warnings: [LOCAL_SHAPE_WARNING] }
: {
errors: result.error.issues.map(
(issue) => `${sourceName}: ${issue.path.join('.') || 'source'} ${issue.message}`,
),
warnings: [],
};
}
private async validateComposedSource(
deps: SlValidationDeps,
connectionId: string,
sourceName: string,
readError: unknown,
) {
try {
const { sources, loadErrors } = await deps.semanticLayerService.loadAllSources(connectionId);
const source = sources.find((candidate) => candidate.name === sourceName);
if (source) {
return this.validateParsedSource(sourceName, source as unknown as Record<string, unknown>);
}
const detail =
loadErrors.length > 0
? loadErrors.join('; ')
: readError instanceof Error
? readError.message
: String(readError);
return { errors: [`${sourceName}: ${detail}`], warnings: [] };
} catch (fallbackError) {
return {
errors: [`${sourceName}: ${fallbackError instanceof Error ? fallbackError.message : String(fallbackError)}`],
warnings: [],
};
}
}
async validateSingleSource(deps: SlValidationDeps, connectionId: string, sourceName: string) {
let content: string;
try {
const file = await deps.semanticLayerService.readSourceFile(connectionId, sourceName);
const parsed = YAML.parse(file.content) as SemanticLayerSource;
const isOverlay = parsed.table == null && parsed.sql == null;
const result = (isOverlay ? sourceOverlaySchema : sourceDefinitionSchema).safeParse(parsed);
return result.success
? { errors: [], warnings: [LOCAL_SHAPE_WARNING] }
: {
errors: result.error.issues.map(
(issue) => `${sourceName}: ${issue.path.join('.') || 'source'} ${issue.message}`,
),
warnings: [],
};
content = file.content;
} catch (error) {
return { errors: [`${sourceName}: ${error instanceof Error ? error.message : String(error)}`], warnings: [] };
return this.validateComposedSource(deps, connectionId, sourceName, error);
}
try {
const parsed = YAML.parse(content) as unknown as Record<string, unknown>;
return this.validateParsedSource(sourceName, parsed);
} catch (error) {
return {
errors: [`${sourceName}: invalid YAML — ${error instanceof Error ? error.message : String(error)}`],
warnings: [],
};
}
}
}

View file

@ -23,6 +23,8 @@ describe('wiki body refs', () => {
'Also `warehouse/mart_account_segments.segment` and `table:analytics.mart_account_segments`.',
'Ignore prose mart_account_segments.total_contract_arr_cents.',
'Ignore `single_token`.',
'Ignore wildcard pattern `mart_nrr_quarterly.*_arr_cents`.',
'Ignore condition `users.is_internal = false`.',
'```sql',
'select `mart_account_segments.total_contract_arr_cents`',
'```',
@ -50,6 +52,52 @@ describe('wiki body refs', () => {
]);
});
it('does not treat wildcard inline-code patterns as exact semantic-layer entity references', async () => {
const invalid = await findInvalidWikiBodyRefs({
pageKey: 'revenue-metrics-encoding',
body: 'Cents columns include `mart_nrr_quarterly.*_arr_cents` and `mart_retention_movement_breakout.*_arr_cents`.',
visibleConnectionIds: ['warehouse'],
loadSources: async () => [
{ name: 'mart_nrr_quarterly', grain: [], columns: [], joins: [], measures: [], table: 'analytics.mart_nrr_quarterly' },
{
name: 'mart_retention_movement_breakout',
grain: [],
columns: [],
joins: [],
measures: [],
table: 'analytics.mart_retention_movement_breakout',
},
],
tableExists: async () => true,
});
expect(invalid).toEqual([]);
});
it('does not treat inline-code SQL predicates as exact semantic-layer entity references', async () => {
const invalid = await findInvalidWikiBodyRefs({
pageKey: 'account-reporting-exclusions',
body: 'Exclude internal users with `users.is_internal = false` and test users with `users.is_test = false`.',
visibleConnectionIds: ['warehouse'],
loadSources: async () => [
{
name: 'users',
grain: [],
columns: [
{ name: 'is_internal', type: 'boolean' },
{ name: 'is_test', type: 'boolean' },
],
joins: [],
measures: [],
table: 'analytics.users',
},
],
tableExists: async () => true,
});
expect(invalid).toEqual([]);
});
it('validates source, dimension, segment, measure, and table references', async () => {
const invalid = await findInvalidWikiBodyRefs({
pageKey: 'account-segments',

View file

@ -38,6 +38,10 @@ function parseConnectionScoped(value: string): { connectionId: string | null; bo
return { connectionId: value.slice(0, slash), body: value.slice(slash + 1) };
}
function isIdentifierToken(value: string): boolean {
return /^[A-Za-z_][A-Za-z0-9_]*$/.test(value);
}
export function parseWikiBodyRefs(body: string): WikiBodyRef[] {
const refs: WikiBodyRef[] = [];
for (const line of visibleLinesOutsideFences(body)) {
@ -62,7 +66,7 @@ export function parseWikiBodyRefs(body: string): WikiBodyRef[] {
continue;
}
const parts = scoped.body.split('.');
if (parts.length === 2 && parts[0] && parts[1]) {
if (parts.length === 2 && isIdentifierToken(parts[0] ?? '') && isIdentifierToken(parts[1] ?? '')) {
refs.push({
kind: 'sl_entity',
connectionId: scoped.connectionId,