From ba76f2e5f0b7d905450f35b712528c6a2a5dea1c Mon Sep 17 00:00:00 2001
From: Andrey Avtomonov
Date: Sun, 17 May 2026 21:22:43 +0200
Subject: [PATCH] feat: validate wiki body semantic references

---
 .../context/src/ingest/wiki-body-refs.test.ts |  70 ++++++++
 packages/context/src/ingest/wiki-body-refs.ts | 165 ++++++++++++++++++
 2 files changed, 235 insertions(+)
 create mode 100644 packages/context/src/ingest/wiki-body-refs.test.ts
 create mode 100644 packages/context/src/ingest/wiki-body-refs.ts

diff --git a/packages/context/src/ingest/wiki-body-refs.test.ts b/packages/context/src/ingest/wiki-body-refs.test.ts
new file mode 100644
index 00000000..a412b116
--- /dev/null
+++ b/packages/context/src/ingest/wiki-body-refs.test.ts
@@ -0,0 +1,70 @@
+import { describe, expect, it } from 'vitest';
+import { findInvalidWikiBodyRefs, parseWikiBodyRefs } from './wiki-body-refs.js';
+
+const sources = [
+  {
+    name: 'mart_account_segments',
+    grain: ['account_id'],
+    columns: [
+      { name: 'account_id', type: 'string' },
+      { name: 'segment', type: 'string' },
+    ],
+    joins: [],
+    measures: [{ name: 'total_contract_arr', expr: 'sum(contract_arr)' }],
+    segments: [{ name: 'enterprise', expr: "segment = 'enterprise'" }],
+    table: 'analytics.mart_account_segments',
+  },
+];
+
+describe('wiki body refs', () => {
+  it('parses only explicit inline-code body references outside fenced blocks', () => {
+    const body = [
+      'Valid `mart_account_segments.total_contract_arr` and `source:mart_account_segments`.',
+      'Also `warehouse/mart_account_segments.segment` and `table:analytics.mart_account_segments`.',
+      'Ignore prose mart_account_segments.total_contract_arr_cents.',
+      'Ignore `single_token`.',
+      '```sql',
+      'select `mart_account_segments.total_contract_arr_cents`',
+      '```',
+    ].join('\n');
+
+    expect(parseWikiBodyRefs(body)).toEqual([
+      { kind: 'sl_entity', connectionId: null, sourceName: 'mart_account_segments', entityName: 'total_contract_arr' },
+      { kind: 'sl_source', connectionId: null, sourceName: 'mart_account_segments' },
+      { kind: 'sl_entity', connectionId: 'warehouse', sourceName: 'mart_account_segments', entityName: 'segment' },
+      { kind: 'table', connectionId: null, tableRef: 'analytics.mart_account_segments' },
+    ]);
+  });
+
+  it('rejects stale inline-code semantic-layer references', async () => {
+    const invalid = await findInvalidWikiBodyRefs({
+      pageKey: 'account-segments',
+      body: 'ARR is documented as `mart_account_segments.total_contract_arr_cents`.',
+      visibleConnectionIds: ['warehouse'],
+      loadSources: async () => sources,
+      tableExists: async () => true,
+    });
+
+    expect(invalid).toEqual([
+      'account-segments: unknown semantic-layer entity mart_account_segments.total_contract_arr_cents',
+    ]);
+  });
+
+  it('validates source, dimension, segment, measure, and table references', async () => {
+    const invalid = await findInvalidWikiBodyRefs({
+      pageKey: 'account-segments',
+      body: [
+        '`mart_account_segments.total_contract_arr`',
+        '`mart_account_segments.segment`',
+        '`mart_account_segments.enterprise`',
+        '`source:mart_account_segments`',
+        '`table:analytics.mart_account_segments`',
+      ].join('\n'),
+      visibleConnectionIds: ['warehouse'],
+      loadSources: async () => sources,
+      tableExists: async (_connectionId, tableRef) => tableRef === 'analytics.mart_account_segments',
+    });
+
+    expect(invalid).toEqual([]);
+  });
+});
diff --git a/packages/context/src/ingest/wiki-body-refs.ts b/packages/context/src/ingest/wiki-body-refs.ts
new file mode 100644
index 00000000..598cc59d
--- /dev/null
+++ b/packages/context/src/ingest/wiki-body-refs.ts
@@ -0,0 +1,165 @@
+import type { SemanticLayerSource } from '../sl/index.js';
+
+/**
+ * A reference parsed from an inline-code span of a wiki page body.
+ *
+ * - `sl_entity` is written as `source.entity`.
+ * - `sl_source` is written as `source:name`.
+ * - `table` is written as `table:ref`.
+ *
+ * Each form may carry a `connection/` prefix; `connectionId` is `null` without one.
+ */
+export type WikiBodyRef =
+  | { kind: 'sl_entity'; connectionId: string | null; sourceName: string; entityName: string }
+  | { kind: 'sl_source'; connectionId: string | null; sourceName: string }
+  | { kind: 'table'; connectionId: string | null; tableRef: string };
+
+/** Input for `findInvalidWikiBodyRefs`. */
+export interface WikiBodyRefValidationInput {
+  /** Prefix of every reported error message. */
+  pageKey: string;
+  /** Wiki page body that is scanned for references. */
+  body: string;
+  /** Connections searched when a reference has no `connection/` prefix. */
+  visibleConnectionIds: string[];
+  /** Loads the semantic-layer sources of one connection. */
+  loadSources(connectionId: string): Promise<SemanticLayerSource[]>;
+  /** Resolves to whether `tableRef` exists in the given connection. */
+  tableExists(connectionId: string, tableRef: string): Promise<boolean>;
+}
+
+/** Matches one inline-code span on a single line and captures the text between the backticks. */
+const inlineCodePattern = /`([^`\n]+)`/g;
+
+/** Returns the lines of `body` outside triple-backtick fences; the fence lines themselves are dropped. */
+function visibleLinesOutsideFences(body: string): string[] {
+  const lines: string[] = [];
+  let fenced = false;
+  for (const line of body.split('\n')) {
+    if (/^\s*```/.test(line)) {
+      fenced = !fenced;
+      continue;
+    }
+    if (!fenced) {
+      lines.push(line);
+    }
+  }
+  return lines;
+}
+
+/**
+ * Splits an optional `connection/` prefix off a token at its first slash.
+ * A token without a slash, or starting with one, has no connection.
+ */
+function parseConnectionScoped(value: string): { connectionId: string | null; body: string } {
+  const slash = value.indexOf('/');
+  if (slash <= 0) {
+    return { connectionId: null, body: value };
+  }
+  return { connectionId: value.slice(0, slash), body: value.slice(slash + 1) };
+}
+
+/**
+ * Extracts references from the inline-code spans of `body`, in document order.
+ *
+ * Lines inside fenced blocks are skipped. After an optional `connection/` prefix,
+ * a span is a reference when it starts with `source:` or `table:` followed by a
+ * name, or when it is exactly two non-empty dot-separated parts; others are ignored.
+ */
+export function parseWikiBodyRefs(body: string): WikiBodyRef[] {
+  const refs: WikiBodyRef[] = [];
+  for (const line of visibleLinesOutsideFences(body)) {
+    for (const match of line.matchAll(inlineCodePattern)) {
+      const token = (match[1] ?? '').trim();
+      if (!token) {
+        continue;
+      }
+      const scoped = parseConnectionScoped(token);
+      if (scoped.body.startsWith('source:')) {
+        const sourceName = scoped.body.slice('source:'.length).trim();
+        if (sourceName) {
+          refs.push({ kind: 'sl_source', connectionId: scoped.connectionId, sourceName });
+        }
+        continue;
+      }
+      if (scoped.body.startsWith('table:')) {
+        const tableRef = scoped.body.slice('table:'.length).trim();
+        if (tableRef) {
+          refs.push({ kind: 'table', connectionId: scoped.connectionId, tableRef });
+        }
+        continue;
+      }
+      const parts = scoped.body.split('.');
+      if (parts.length === 2 && parts[0] && parts[1]) {
+        refs.push({
+          kind: 'sl_entity',
+          connectionId: scoped.connectionId,
+          sourceName: parts[0],
+          entityName: parts[1],
+        });
+      }
+    }
+  }
+  return refs;
+}
+
+/** Collects the names of the measures, columns, and segments of `source`. */
+function entityNames(source: SemanticLayerSource): Set<string> {
+  return new Set([
+    ...(source.measures ?? []).map((measure) => measure.name),
+    ...(source.columns ?? []).map((column) => column.name),
+    ...(source.segments ?? []).map((segment) => segment.name),
+  ]);
+}
+
+/**
+ * Validates every reference in `input.body` and returns one message per invalid one.
+ *
+ * A reference with a `connection/` prefix is checked in that connection only, whether
+ * or not it is visible; any other reference is checked in `input.visibleConnectionIds`.
+ *
+ * @returns Error messages prefixed with `input.pageKey`; empty when all references resolve.
+ */
+export async function findInvalidWikiBodyRefs(input: WikiBodyRefValidationInput): Promise<string[]> {
+  const errors: string[] = [];
+  // Memoizes loaded sources per connection for the duration of this call.
+  const sourceCache = new Map<string, SemanticLayerSource[]>();
+  const loadSources = async (connectionId: string): Promise<SemanticLayerSource[]> => {
+    const cached = sourceCache.get(connectionId);
+    if (cached) {
+      return cached;
+    }
+    const sources = await input.loadSources(connectionId);
+    sourceCache.set(connectionId, sources);
+    return sources;
+  };
+
+  for (const ref of parseWikiBodyRefs(input.body)) {
+    const connectionIds = ref.connectionId ? [ref.connectionId] : input.visibleConnectionIds;
+    if (ref.kind === 'table') {
+      const found = await Promise.all(connectionIds.map((connectionId) => input.tableExists(connectionId, ref.tableRef)));
+      if (!found.some(Boolean)) {
+        errors.push(`${input.pageKey}: unknown raw table ${ref.connectionId ? `${ref.connectionId}/` : ''}${ref.tableRef}`);
+      }
+      continue;
+    }
+
+    // The first connection that defines a source with this name wins.
+    let source: SemanticLayerSource | undefined;
+    for (const connectionId of connectionIds) {
+      source = (await loadSources(connectionId)).find((candidate) => candidate.name === ref.sourceName);
+      if (source) {
+        break;
+      }
+    }
+    if (!source) {
+      errors.push(`${input.pageKey}: unknown semantic-layer source ${ref.sourceName}`);
+      continue;
+    }
+    if (ref.kind === 'sl_entity' && !entityNames(source).has(ref.entityName)) {
+      errors.push(`${input.pageKey}: unknown semantic-layer entity ${ref.sourceName}.${ref.entityName}`);
+    }
+  }
+
+  return errors;
+}