fix(context): normalize manifest column identifiers

This commit is contained in:
Andrey Avtomonov 2026-05-18 22:55:00 +02:00
parent cf93ee53bd
commit 9938469b2d
7 changed files with 477 additions and 29 deletions

View file

@ -11,6 +11,78 @@ function shardObject(shards: Map<string, LiveDatabaseManifestShard>): Record<str
}
describe('buildLiveDatabaseManifestShards', () => {
it('normalizes unsafe physical column names while preserving quoted expressions', () => {
const existingDescriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>([
[
'npi',
{
columns: new Map([
['provider_business_mailing_address_country_code_if_outside_u_s', { user: 'Pinned normalized description' }],
['provider_business_practice_location_address_country_code_(if_outside_u.s.)', { user: 'Pinned raw description' }],
]),
},
],
]);
const result = buildLiveDatabaseManifestShards({
connectionType: 'DUCKDB',
mapColumnType: (nativeType) => nativeType.toLowerCase(),
existingDescriptions,
tables: [
{
name: 'npi',
catalog: 'provider',
db: 'main',
columns: [
{ name: 'npi', type: 'INTEGER', pk: true },
{ name: 'provider_business_mailing_address_country_code_(if_outside_u.s.)', type: 'VARCHAR' },
{ name: 'provider_business_practice_location_address_country_code_(if_outside_u.s.)', type: 'VARCHAR' },
{ name: 'Display Name', type: 'VARCHAR' },
{ name: 'display_name', type: 'VARCHAR' },
{ name: '123 code', type: 'VARCHAR' },
],
},
],
joins: [],
});
expect(shardObject(result.shards)).toEqual({
main: {
tables: {
npi: {
table: 'provider.main.npi',
columns: [
{ name: 'npi', type: 'integer', pk: true },
{
name: 'provider_business_mailing_address_country_code_if_outside_u_s',
type: 'varchar',
expr: 'npi."provider_business_mailing_address_country_code_(if_outside_u.s.)"',
descriptions: { user: 'Pinned normalized description' },
},
{
name: 'provider_business_practice_location_address_country_code_if_outside_u_s',
type: 'varchar',
expr: 'npi."provider_business_practice_location_address_country_code_(if_outside_u.s.)"',
descriptions: { user: 'Pinned raw description' },
},
{
name: 'display_name_2',
type: 'varchar',
expr: 'npi."Display Name"',
},
{ name: 'display_name', type: 'varchar' },
{
name: 'column_123_code',
type: 'varchar',
expr: 'npi."123 code"',
},
],
},
},
},
});
});
it('builds shard objects with generated joins and preserved external descriptions', () => {
const existingDescriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>([
[

View file

@ -22,9 +22,74 @@ const HISTORIC_SQL_MANAGED_USAGE_KEYS = new Set([
'staleSince',
]);
const SAFE_SEMANTIC_COLUMN_NAME = /^[A-Za-z_][A-Za-z0-9_]*$/;
const SQL_RESERVED_WORDS = new Set([
'all',
'and',
'as',
'between',
'by',
'case',
'column',
'constraint',
'create',
'default',
'delete',
'distinct',
'drop',
'else',
'end',
'except',
'exists',
'false',
'fetch',
'for',
'from',
'full',
'grant',
'group',
'having',
'in',
'index',
'inner',
'insert',
'intersect',
'is',
'join',
'key',
'left',
'like',
'limit',
'natural',
'not',
'null',
'on',
'or',
'order',
'outer',
'primary',
'references',
'revoke',
'right',
'select',
'set',
'table',
'then',
'true',
'union',
'update',
'using',
'values',
'view',
'when',
'where',
'with',
]);
export interface LiveDatabaseManifestColumn {
name: string;
type: string;
expr?: string;
pk?: boolean;
nullable?: boolean;
descriptions?: Record<string, string>;
@ -93,6 +158,64 @@ export interface BuildLiveDatabaseManifestShardsResult {
tablesProcessed: number;
}
function isSafeSemanticColumnName(name: string): boolean {
return SAFE_SEMANTIC_COLUMN_NAME.test(name) && !SQL_RESERVED_WORDS.has(name.toLowerCase());
}
export function normalizeManifestColumnName(rawName: string): string {
const trimmed = rawName.trim();
if (isSafeSemanticColumnName(trimmed)) {
return trimmed;
}
const normalized = trimmed
.replace(/[^A-Za-z0-9]+/g, '_')
.replace(/_+/g, '_')
.replace(/^_+|_+$/g, '')
.toLowerCase();
const withFallback = normalized.length > 0 ? normalized : 'column';
const withSafePrefix = /^[0-9]/.test(withFallback) ? `column_${withFallback}` : withFallback;
return SQL_RESERVED_WORDS.has(withSafePrefix.toLowerCase()) ? `${withSafePrefix}_column` : withSafePrefix;
}
export function normalizeManifestColumnNames(rawNames: readonly string[]): string[] {
const safeNameCounts = new Map<string, number>();
for (const rawName of rawNames) {
const trimmed = rawName.trim();
if (isSafeSemanticColumnName(trimmed)) {
safeNameCounts.set(trimmed, (safeNameCounts.get(trimmed) ?? 0) + 1);
}
}
const reservedSafeNames = new Set([...safeNameCounts.entries()].filter(([, count]) => count === 1).map(([name]) => name));
const used = new Set<string>();
return rawNames.map((rawName) => {
const trimmed = rawName.trim();
if (isSafeSemanticColumnName(trimmed) && !used.has(trimmed)) {
used.add(trimmed);
return trimmed;
}
const base = normalizeManifestColumnName(rawName);
let candidate = base;
let suffix = 2;
while (used.has(candidate) || reservedSafeNames.has(candidate)) {
candidate = `${base}_${suffix}`;
suffix += 1;
}
used.add(candidate);
return candidate;
});
}
function quoteManifestIdentifier(identifier: string): string {
return `"${identifier.replace(/"/g, '""')}"`;
}
export function manifestColumnPhysicalExpression(sourceName: string, physicalColumnName: string): string {
return `${sourceName}.${quoteManifestIdentifier(physicalColumnName)}`;
}
function mergeDescriptionsPreservingExternal(
existing: Record<string, string> | undefined,
incoming: Record<string, string> | undefined,
@ -189,6 +312,7 @@ function joinCondition(
leftColumns: readonly string[],
rightTable: string,
rightColumns: readonly string[],
columnNameMaps: Map<string, Map<string, string>>,
): string {
if (leftColumns.length === 0 || leftColumns.length !== rightColumns.length) {
throw new Error(`Invalid relationship join from ${leftTable} to ${rightTable}: column tuple widths differ`);
@ -199,7 +323,9 @@ function joinCondition(
if (!rightColumn) {
throw new Error(`Invalid relationship join from ${leftTable} to ${rightTable}: missing target column`);
}
return `${leftTable}.${leftColumn} = ${rightTable}.${rightColumn}`;
const normalizedLeftColumn = columnNameMaps.get(leftTable)?.get(leftColumn) ?? normalizeManifestColumnName(leftColumn);
const normalizedRightColumn = columnNameMaps.get(rightTable)?.get(rightColumn) ?? normalizeManifestColumnName(rightColumn);
return `${leftTable}.${normalizedLeftColumn} = ${rightTable}.${normalizedRightColumn}`;
})
.join(' AND ');
}
@ -208,6 +334,7 @@ function buildJoinsByTable(
tableNames: Set<string>,
joins: LiveDatabaseManifestJoinData[],
preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>,
columnNameMaps: Map<string, Map<string, string>>,
): Map<string, LiveDatabaseManifestJoinEntry[]> {
const joinsByTable = new Map<string, LiveDatabaseManifestJoinEntry[]>();
@ -218,7 +345,7 @@ function buildJoinsByTable(
const relationship = RELATIONSHIP_MAP[join.relationship] ?? join.relationship;
addJoinOnce(joinsByTable, join.fromTable, {
to: join.toTable,
on: joinCondition(join.fromTable, join.fromColumns, join.toTable, join.toColumns),
on: joinCondition(join.fromTable, join.fromColumns, join.toTable, join.toColumns, columnNameMaps),
relationship,
source: join.source,
});
@ -226,7 +353,7 @@ function buildJoinsByTable(
const reverseRelationship = RELATIONSHIP_INVERSE[relationship] ?? 'one_to_many';
addJoinOnce(joinsByTable, join.toTable, {
to: join.fromTable,
on: joinCondition(join.toTable, join.toColumns, join.fromTable, join.fromColumns),
on: joinCondition(join.toTable, join.toColumns, join.fromTable, join.fromColumns, columnNameMaps),
relationship: reverseRelationship,
source: join.source,
});
@ -250,19 +377,30 @@ export function buildLiveDatabaseManifestShards(
input: BuildLiveDatabaseManifestShardsInput,
): BuildLiveDatabaseManifestShardsResult {
const tableNames = new Set(input.tables.map((table) => table.name));
const joinsByTable = buildJoinsByTable(tableNames, input.joins, input.existingPreservedJoins ?? new Map());
const columnNameMaps = new Map(
input.tables.map((table) => {
const normalizedNames = normalizeManifestColumnNames(table.columns.map((column) => column.name));
return [table.name, new Map(table.columns.map((column, index) => [column.name, normalizedNames[index] ?? column.name]))] as const;
}),
);
const joinsByTable = buildJoinsByTable(tableNames, input.joins, input.existingPreservedJoins ?? new Map(), columnNameMaps);
const shards = new Map<string, LiveDatabaseManifestShard>();
for (const table of input.tables) {
const shardKey = getShardKey(input.connectionType, table.catalog, table.db);
const shard = shards.get(shardKey) ?? { tables: {} };
const existingDescriptions = input.existingDescriptions?.get(table.name);
const normalizedNames = normalizeManifestColumnNames(table.columns.map((column) => column.name));
const columns: LiveDatabaseManifestColumn[] = table.columns.map((column) => {
const columns: LiveDatabaseManifestColumn[] = table.columns.map((column, index) => {
const name = normalizedNames[index] ?? normalizeManifestColumnName(column.name);
const manifestColumn: LiveDatabaseManifestColumn = {
name: column.name,
name,
type: input.mapColumnType(column.type),
};
if (name !== column.name) {
manifestColumn.expr = manifestColumnPhysicalExpression(table.name, column.name);
}
if (column.pk) {
manifestColumn.pk = true;
}
@ -270,7 +408,7 @@ export function buildLiveDatabaseManifestShards(
manifestColumn.nullable = false;
}
const descriptions = mergeDescriptionsPreservingExternal(
existingDescriptions?.columns.get(column.name),
existingDescriptions?.columns.get(name) ?? existingDescriptions?.columns.get(column.name),
column.descriptions,
);
if (descriptions) {

View file

@ -182,6 +182,71 @@ grain: []
});
});
it('sends normalized legacy manifest columns with physical expressions to compute', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/main.yaml',
`tables:
npi:
table: provider.main.npi
columns:
- name: npi
type: number
- name: provider_business_mailing_address_country_code_(if_outside_u.s.)
type: string
- name: Display Name
type: string
- name: display_name
type: string
`,
'ktx',
'ktx@example.com',
'Add legacy manifest shard',
);
await compileLocalSlQuery(project, {
connectionId: 'warehouse',
query: {
measures: [],
dimensions: ['npi.provider_business_mailing_address_country_code_if_outside_u_s'],
},
compute,
});
expect(compute.query).toHaveBeenLastCalledWith({
sources: expect.arrayContaining([
expect.objectContaining({
name: 'npi',
table: 'provider.main.npi',
grain: [
'npi',
'provider_business_mailing_address_country_code_if_outside_u_s',
'display_name_2',
'display_name',
],
columns: [
expect.objectContaining({ name: 'npi', type: 'number' }),
expect.objectContaining({
name: 'provider_business_mailing_address_country_code_if_outside_u_s',
type: 'string',
expr: 'npi."provider_business_mailing_address_country_code_(if_outside_u.s.)"',
}),
expect.objectContaining({
name: 'display_name_2',
type: 'string',
expr: 'npi."Display Name"',
}),
expect.objectContaining({ name: 'display_name', type: 'string' }),
],
}),
]),
dialect: 'postgres',
query: {
measures: [],
dimensions: ['npi.provider_business_mailing_address_country_code_if_outside_u_s'],
},
});
});
it('strips authoring-only fields (usage, inherits_columns_from) before sending sources to the daemon', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',

View file

@ -165,6 +165,45 @@ describe('local semantic-layer helpers', () => {
);
});
it('normalizes unsafe column names from existing manifest-backed scan sources', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/main.yaml',
`tables:
npi:
table: provider.main.npi
columns:
- name: npi
type: number
- name: provider_business_mailing_address_country_code_(if_outside_u.s.)
type: string
- name: Display Name
type: string
- name: display_name
type: string
`,
'ktx',
'ktx@example.com',
'Add legacy manifest shard',
);
await expect(listLocalSlSources(project, { connectionId: 'warehouse' })).resolves.toEqual([
{
columnCount: 4,
connectionId: 'warehouse',
joinCount: 0,
measureCount: 0,
name: 'npi',
path: 'semantic-layer/warehouse/_schema/main.yaml#npi',
},
]);
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'npi' })).resolves.toEqual(
expect.objectContaining({
yaml: expect.stringContaining('provider_business_mailing_address_country_code_if_outside_u_s'),
}),
);
});
it('expands manifest-backed scan sources when listing all connections', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',

View file

@ -1127,6 +1127,56 @@ describe('validateWithProposedSource', () => {
expect(result.errors).toEqual([]);
});
it('allows generated manifest columns to reference quoted physical identifiers', async () => {
const schemaPath = 'semantic-layer/warehouse/_schema/main.yaml';
configService.listFiles.mockImplementation((dir: string) => {
if (dir === 'semantic-layer/warehouse') {
return Promise.resolve({ files: [] });
}
if (dir === 'semantic-layer') {
return Promise.resolve({ files: [schemaPath] });
}
if (dir === 'semantic-layer/warehouse/_schema') {
return Promise.resolve({ files: [schemaPath] });
}
return Promise.resolve({ files: [] });
});
configService.readFile.mockResolvedValue({
content: [
'tables:',
' npi:',
' table: provider.main.npi',
' columns:',
' - name: npi',
' type: number',
' - name: provider_business_mailing_address_country_code_if_outside_u_s',
' type: string',
' expr: npi."provider_business_mailing_address_country_code_(if_outside_u.s.)"',
].join('\n'),
});
pythonPort.validateSources.mockResolvedValue({
data: { errors: [], warnings: [] },
});
const result = await service.validateWithProposedSource('warehouse', {
name: 'npi',
table: 'provider.main.npi',
grain: ['npi'],
columns: [
{ name: 'npi', type: 'number' },
{
name: 'provider_business_mailing_address_country_code_if_outside_u_s',
type: 'string',
expr: 'npi."provider_business_mailing_address_country_code_(if_outside_u.s.)"',
},
],
joins: [],
measures: [],
});
expect(result.errors).toEqual([]);
});
it('rejects join keys that are absent from matched physical sources', async () => {
const schemaPath = 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml';
configService.listFiles.mockImplementation((dir: string) => {

View file

@ -2,6 +2,11 @@ import YAML from 'yaml';
import type { KtxFileStorePort, KtxLogger } from '../core/index.js';
import { noopLogger } from '../core/index.js';
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
import {
manifestColumnPhysicalExpression,
normalizeManifestColumnName,
normalizeManifestColumnNames,
} from '../ingest/adapters/live-database/manifest.js';
import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js';
import { normalizeSemanticLayerDescriptions } from './description-normalization.js';
import { isOverlaySource, resolvedSourceSchema, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
@ -495,6 +500,17 @@ export class SemanticLayerService {
const manifestSource = manifestMatch.source;
const manifestColumns = new Map(manifestSource.columns.map((c) => [c.name.toLowerCase(), c.name]));
const manifestPhysicalColumns = new Set<string>();
for (const manifestColumn of manifestSource.columns) {
if (!manifestColumn.expr) {
continue;
}
for (const ref of extractSqlIdentifierRefs(manifestColumn.expr)) {
if (refBelongsToSource(ref, manifestSource.name, sourceNames)) {
manifestPhysicalColumns.add(ref.name.toLowerCase());
}
}
}
const declaredColumns = source.columns ?? [];
const declaredByLower = new Map(declaredColumns.map((c) => [c.name.toLowerCase(), c]));
const validOutputColumns = new Set(
@ -537,7 +553,7 @@ export class SemanticLayerService {
expr: column.expr,
sourceName: source.name,
sourceNames,
validColumns: new Set([...manifestColumns.keys(), ...validOutputColumns]),
validColumns: new Set([...manifestColumns.keys(), ...manifestPhysicalColumns, ...validOutputColumns]),
validMeasures: new Set(),
});
if (missing.length > 0) {
@ -1137,6 +1153,7 @@ export class SemanticLayerService {
interface ManifestColumnEntry {
name: string;
type: string;
expr?: string;
pk?: boolean;
nullable?: boolean;
descriptions?: Record<string, string>;
@ -1166,18 +1183,26 @@ export interface ManifestTableEntry {
}
export function projectManifestEntry(name: string, entry: ManifestTableEntry): SemanticLayerSource {
const columns = entry.columns.map((c) => ({
name: c.name,
type: c.type,
role: c.type === 'time' ? 'time' : undefined,
descriptions: c.descriptions,
constraints: c.constraints,
enum_values: c.enum_values,
tests: c.tests,
}));
const columnNames = normalizeManifestColumnNames(entry.columns.map((c) => c.name));
const columns = entry.columns.map((c, index) => {
const columnName = columnNames[index] ?? normalizeManifestColumnName(c.name);
const expr = c.expr ?? (columnName !== c.name ? manifestColumnPhysicalExpression(name, c.name) : undefined);
return {
name: columnName,
type: c.type,
role: c.type === 'time' ? 'time' : undefined,
descriptions: c.descriptions,
...(expr ? { expr } : {}),
constraints: c.constraints,
enum_values: c.enum_values,
tests: c.tests,
};
});
const pkColumns = entry.columns.filter((c) => c.pk).map((c) => c.name);
const grain = pkColumns.length > 0 ? pkColumns : entry.columns.map((c) => c.name);
const pkColumns = entry.columns.flatMap((c, index) =>
c.pk ? [columnNames[index] ?? normalizeManifestColumnName(c.name)] : [],
);
const grain = pkColumns.length > 0 ? pkColumns : columnNames;
// Table-level dbt config from manifest shards is surfaced on the source for search / tools.
const source: SemanticLayerSource = {
@ -1276,9 +1301,7 @@ const SQL_KEYWORDS = new Set([
]);
function extractColumnReferences(expr: string): string[] {
const cleaned = expr.replace(/'[^']*'/g, '').replace(/\b\d+(\.\d+)?\b/g, '');
const tokens = cleaned.match(/\b[a-zA-Z_]\w*\b/g) ?? [];
return [...new Set(tokens.filter((t) => !SQL_KEYWORDS.has(t.toLowerCase())))];
return [...new Set(extractSqlIdentifierRefs(expr).map((ref) => ref.name))];
}
function manifestEntryMatchesRef(source: SemanticLayerSource, ref: string): boolean {
@ -1295,19 +1318,23 @@ function normalizeSqlExpressionForIdentifierScan(expr: string): string {
.replace(/--.*$/gm, ' ')
.replace(/\/\*[\s\S]*?\*\//g, ' ')
.replace(/'([^']|'')*'/g, ' ')
.replace(/"([^"]+)"/g, '$1')
.replace(/`([^`]+)`/g, '$1')
.replace(/\[([^\]]+)\]/g, '$1')
.replace(/::\s*[A-Za-z_][\w$]*(?:\s*\([^)]*\))?/g, ' ');
}
function extractSqlIdentifierRefs(expr: string): Array<{ qualifier?: string; name: string }> {
const normalized = normalizeSqlExpressionForIdentifierScan(expr);
const refs = new Map<string, { qualifier?: string; name: string }>();
const re = /(?:\b([A-Za-z_][\w$]*)\s*\.\s*)?(\b[A-Za-z_][\w$]*)\b/g;
const bareIdentifier = String.raw`[A-Za-z_][\w$]*`;
const quotedIdentifier = String.raw`"(?:""|[^"])*"|` + String.raw`\`(?:\`\`|[^\`])*\`|\[(?:[^\]])+\]`;
const identifier = String.raw`(?:${quotedIdentifier}|${bareIdentifier})`;
const re = new RegExp(String.raw`(${identifier})\s*\.\s*(${identifier})|(${identifier})`, 'g');
for (const match of normalized.matchAll(re)) {
const qualifier = match[1];
const name = match[2];
const rawQualifier = match[1];
const rawName = match[2] ?? match[3] ?? '';
const qualifier = rawQualifier ? unquoteSqlIdentifier(rawQualifier) : undefined;
const name = unquoteSqlIdentifier(rawName);
if (!name) {
continue;
}
@ -1317,7 +1344,10 @@ function extractSqlIdentifierRefs(expr: string): Array<{ qualifier?: string; nam
if (!qualifier && after.startsWith('(')) {
continue;
}
if (SQL_KEYWORDS.has(nameLower) || (qualifierLower && SQL_KEYWORDS.has(qualifierLower))) {
if (!isQuotedSqlIdentifier(rawName) && SQL_KEYWORDS.has(nameLower)) {
continue;
}
if (rawQualifier && !isQuotedSqlIdentifier(rawQualifier) && qualifierLower && SQL_KEYWORDS.has(qualifierLower)) {
continue;
}
refs.set(`${qualifierLower ?? ''}.${nameLower}`, qualifier ? { qualifier, name } : { name });
@ -1325,6 +1355,27 @@ function extractSqlIdentifierRefs(expr: string): Array<{ qualifier?: string; nam
return [...refs.values()];
}
function isQuotedSqlIdentifier(identifier: string): boolean {
return (
(identifier.startsWith('"') && identifier.endsWith('"')) ||
(identifier.startsWith('`') && identifier.endsWith('`')) ||
(identifier.startsWith('[') && identifier.endsWith(']'))
);
}
function unquoteSqlIdentifier(identifier: string): string {
if (identifier.startsWith('"') && identifier.endsWith('"')) {
return identifier.slice(1, -1).replace(/""/g, '"');
}
if (identifier.startsWith('`') && identifier.endsWith('`')) {
return identifier.slice(1, -1).replace(/``/g, '`');
}
if (identifier.startsWith('[') && identifier.endsWith(']')) {
return identifier.slice(1, -1);
}
return identifier;
}
function refBelongsToSource(
ref: { qualifier?: string; name: string },
sourceName: string,

View file

@ -62,6 +62,39 @@ class TestSimpleSingleSource:
assert "GROUP BY" in sql.upper()
assert "public.orders" in sql
def test_computed_column_can_reference_quoted_physical_name(self):
source = SourceDefinition(
name="npi",
table="provider.main.npi",
grain=["provider_business_mailing_address_country_code_if_outside_u_s"],
columns=[
SourceColumn(
name="provider_business_mailing_address_country_code_if_outside_u_s",
type="string",
expr='npi."provider_business_mailing_address_country_code_(if_outside_u.s.)"',
)
],
)
sources = {"npi": source}
engine = SemanticEngine.from_sources(sources, dialect="duckdb")
sql = engine.query(
{
"measures": [],
"dimensions": [
"npi.provider_business_mailing_address_country_code_if_outside_u_s"
],
"limit": 5,
}
).sql
assert_valid_sql(sql)
assert (
'npi."provider_business_mailing_address_country_code_(if_outside_u.s.)"'
in sql
)
assert "AS provider_business_mailing_address_country_code_if_outside_u_s" in sql
class TestCrossSourceM2O:
"""Test 2: Cross-source, all m2o (the LATAM query)."""