feat: carry historic sql usage in semantic sources

This commit is contained in:
Andrey Avtomonov 2026-05-11 16:47:28 +02:00
parent d73a54e8c7
commit f17053061d
4 changed files with 103 additions and 0 deletions

View file

@ -1,4 +1,5 @@
import { z } from 'zod';
import { tableUsageOutputSchema } from '../ingest/adapters/historic-sql/skill-schemas.js';
// Literal vocabularies — kept in lockstep with the Python Pydantic model at
// python/ktx-sl/semantic_layer/models.py (SourceColumn / ColumnRole /
@ -125,6 +126,7 @@ export const sourceDefinitionSchema = z
default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
tags: sourceKeyedStringArraySchema.optional(),
freshness: sourceFreshnessSchema.optional(),
usage: tableUsageOutputSchema.optional(),
})
.strict()
.refine((s) => (s.table || s.sql) && !(s.table && s.sql), {
@ -145,6 +147,7 @@ export const sourceOverlaySchema = z
exclude_columns: z.array(z.string()).optional(),
disable_joins: z.array(z.string()).optional(),
default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
usage: tableUsageOutputSchema.optional(),
})
.strict();

View file

@ -5,6 +5,7 @@ import {
composeOverlay,
enrichColumnsFromManifest,
findDanglingSegmentRefs,
projectManifestEntry,
SemanticLayerService,
} from './semantic-layer.service.js';
import { sourceDefinitionSchema } from './schemas.js';
@ -129,6 +130,39 @@ describe('composeOverlay', () => {
dbt: 'dbt description',
});
});
it('replaces manifest usage only when an overlay explicitly provides usage', () => {
  // Usage already present on the base source before any overlay is applied.
  const manifestUsage: NonNullable<SemanticLayerSource['usage']> = {
    narrative: 'Orders are commonly queried by lifecycle status.',
    frequencyTier: 'high',
    commonFilters: ['status'],
    commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
  };
  const baseWithUsage: SemanticLayerSource = { ...baseTable, usage: manifestUsage };

  // Case 1: the overlay has no `usage` key, so the base usage must come through unchanged.
  const withoutOverlayUsage = composeOverlay(baseWithUsage, { name: 'fct_labs', measures: [] });
  expect(withoutOverlayUsage.usage).toEqual(baseWithUsage.usage);

  // Case 2: the overlay carries its own `usage`, so the composed source must report it.
  const withOverlayUsage = composeOverlay(baseWithUsage, {
    name: 'fct_labs',
    usage: {
      narrative: 'Overlay-curated usage note.',
      frequencyTier: 'mid',
      commonFilters: ['created_at'],
      commonGroupBys: ['created_at'],
      commonJoins: [],
    },
  });

  // Expected value is spelled out independently of the overlay literal above.
  expect(withOverlayUsage.usage).toEqual({
    narrative: 'Overlay-curated usage note.',
    frequencyTier: 'mid',
    commonFilters: ['created_at'],
    commonGroupBys: ['created_at'],
    commonJoins: [],
  });
});
});
describe('enrichColumnsFromManifest', () => {
@ -299,6 +333,61 @@ describe('sourceDefinitionSchema', () => {
dbt: { loaded_at_field: 'updated_at', raw: { warn_after: { count: 12, period: 'hour' } } },
});
});
it('accepts historic SQL usage on standalone sources', () => {
  // A table-backed standalone source definition that also carries a `usage` block.
  const candidate = {
    name: 'orders',
    table: 'public.orders',
    grain: ['id'],
    columns: [{ name: 'id', type: 'string' }],
    joins: [],
    measures: [],
    usage: {
      narrative: 'Orders are queried for fulfillment and revenue analysis.',
      frequencyTier: 'high',
      commonFilters: ['status', 'created_at'],
      commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
      externalOwner: 'analytics',
    },
  };

  const parsed = sourceDefinitionSchema.safeParse(candidate);
  expect(parsed.success).toBe(true);

  // The guard narrows the safeParse result so `.data` is accessible; on a
  // failed parse the assertion above has already failed the test.
  if (parsed.success) {
    expect(parsed.data.usage).toMatchObject({
      narrative: 'Orders are queried for fulfillment and revenue analysis.',
      frequencyTier: 'high',
      commonFilters: ['status', 'created_at'],
      commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
      externalOwner: 'analytics',
    });
  }
});
});
describe('projectManifestEntry', () => {
  it('projects manifest usage onto the semantic-layer source', () => {
    // Typed through the function's own second parameter so the literal is
    // still checked against the manifest entry shape.
    const manifestEntry: Parameters<typeof projectManifestEntry>[1] = {
      table: 'public.orders',
      usage: {
        narrative: 'Orders are frequently filtered by status.',
        frequencyTier: 'high',
        commonFilters: ['status'],
        commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
      },
      columns: [
        { name: 'id', type: 'string', pk: true },
        { name: 'status', type: 'string' },
      ],
    };

    const projected = projectManifestEntry('orders', manifestEntry);

    // The projected source must expose the same usage payload the manifest carried.
    expect(projected.usage).toEqual({
      narrative: 'Orders are frequently filtered by status.',
      frequencyTier: 'high',
      commonFilters: ['status'],
      commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
    });
  });
});
describe('findManifestEntryByTableRef', () => {

View file

@ -1,6 +1,7 @@
import YAML from 'yaml';
import type { KtxFileStorePort, KtxLogger } from '../core/index.js';
import { noopLogger } from '../core/index.js';
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js';
import { normalizeSemanticLayerDescriptions } from './description-normalization.js';
import { isOverlaySource, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
@ -884,6 +885,7 @@ export interface ManifestTableEntry {
joins?: ManifestJoinEntry[];
tags?: { dbt?: string[] };
freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } };
usage?: TableUsageOutput;
}
/** Migrate legacy flat description/db_description fields to a descriptions map. */
@ -930,6 +932,7 @@ export function projectManifestEntry(name: string, entry: ManifestTableEntry): S
measures: [],
...(entry.tags?.dbt?.length ? { tags: entry.tags } : {}),
...(entry.freshness?.dbt ? { freshness: entry.freshness } : {}),
...(entry.usage ? { usage: entry.usage } : {}),
};
}
@ -1005,6 +1008,7 @@ const COMPOSE_KNOWN_KEYS = new Set([
'exclude_columns',
'disable_joins',
'default_time_dimension',
'usage',
]);
export function composeOverlay(base: SemanticLayerSource, overlay: Record<string, unknown>): SemanticLayerSource {
@ -1028,6 +1032,10 @@ export function composeOverlay(base: SemanticLayerSource, overlay: Record<string
};
}
if (normalizedOverlay.usage !== undefined) {
result.usage = normalizedOverlay.usage as SemanticLayerSource['usage'];
}
// Filter out excluded columns
const excluded = new Set((normalizedOverlay.exclude_columns as string[] | undefined) ?? []);
let columns = result.columns.filter((c) => !excluded.has(c.name));

View file

@ -1,3 +1,5 @@
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
export interface SemanticLayerSource {
name: string;
descriptions?: Record<string, string>;
@ -42,6 +44,7 @@ export interface SemanticLayerSource {
default_time_dimension?: { dbt?: string };
tags?: { dbt?: string[] };
freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } };
usage?: TableUsageOutput;
}
export interface SemanticLayerQueryInput {