fix(sl): tighten source validation guards

This commit is contained in:
Luca Martial 2026-05-11 23:31:04 -07:00
parent aaa928e768
commit 7a86aa9ddc
4 changed files with 105 additions and 3 deletions

View file

@ -891,6 +891,60 @@ describe('validateWithProposedSource', () => {
expect(result.errors).toEqual([]);
});
it('allows SQL syntax tokens and cast types in physical expression validation', async () => {
const schemaPath = 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml';
configService.listFiles.mockImplementation((dir: string) => {
if (dir === 'semantic-layer/dbt-main') {
return Promise.resolve({ files: [] });
}
if (dir === 'semantic-layer') {
return Promise.resolve({ files: [schemaPath] });
}
if (dir === 'semantic-layer/dbt-main/_schema' || dir === 'semantic-layer/postgres-warehouse/_schema') {
return Promise.resolve({ files: dir.endsWith('postgres-warehouse/_schema') ? [schemaPath] : [] });
}
return Promise.resolve({ files: [] });
});
configService.readFile.mockResolvedValue({
content: [
'tables:',
' mart_revenue_daily:',
' table: orbit_analytics.mart_revenue_daily',
' columns:',
' - { name: order_id, type: string }',
' - { name: revenue_date, type: time }',
' - { name: amount, type: number }',
' - { name: status, type: string }',
' - { name: created_at, type: time }',
].join('\n'),
});
pythonPort.validateSources.mockResolvedValue({
data: { errors: [], warnings: [] },
});
const result = await service.validateWithProposedSource('dbt-main', {
name: 'mart_revenue_daily',
table: 'orbit_analytics.mart_revenue_daily',
grain: ['order_id'],
columns: [
{ name: 'order_id', type: 'string' },
{ name: 'revenue_date', type: 'time' },
{ name: 'amount', type: 'number' },
{ name: 'status', type: 'string' },
{ name: 'created_at', type: 'time' },
{ name: 'status_text', type: 'string', expr: 'status::text' },
],
segments: [{ name: 'current_or_paid', expr: "created_at <= current_date OR status = 'paid'" }],
joins: [],
measures: [
{ name: 'paid_amount', expr: "sum(amount) FILTER (WHERE status = 'paid')" },
{ name: 'cast_amount_count', expr: 'count(cast(amount as text))' },
],
});
expect(result.errors).toEqual([]);
});
it('rejects join keys that are absent from matched physical sources', async () => {
const schemaPath = 'semantic-layer/postgres-warehouse/_schema/orbit_analytics.yaml';
configService.listFiles.mockImplementation((dir: string) => {

View file

@ -1160,6 +1160,8 @@ const SQL_KEYWORDS = new Set([
'in',
'between',
'like',
'where',
'filter',
'cast',
'coalesce',
'nullif',
@ -1183,15 +1185,31 @@ const SQL_KEYWORDS = new Set([
'rows',
'range',
'current',
'current_date',
'current_time',
'current_timestamp',
'localtime',
'localtimestamp',
'row',
'numeric',
'decimal',
'int',
'integer',
'bigint',
'smallint',
'float',
'double',
'real',
'string',
'text',
'char',
'character',
'varchar',
'timestamp',
'time',
'uuid',
'json',
'jsonb',
'bool',
'boolean',
]);
@ -1218,7 +1236,8 @@ function normalizeSqlExpressionForIdentifierScan(expr: string): string {
.replace(/'([^']|'')*'/g, ' ')
.replace(/"([^"]+)"/g, '$1')
.replace(/`([^`]+)`/g, '$1')
.replace(/\[([^\]]+)\]/g, '$1');
.replace(/\[([^\]]+)\]/g, '$1')
.replace(/::\s*[A-Za-z_][\w$]*(?:\s*\([^)]*\))?/g, ' ');
}
function extractSqlIdentifierRefs(expr: string): Array<{ qualifier?: string; name: string }> {

View file

@ -210,8 +210,6 @@ class SemanticEngine:
grain = grain_col.lower()
if grain in source_columns:
return True
if any(col.endswith(f"_{grain}") for col in source_columns):
return True
if grain == "id":
candidates = {
f"{target_name}_id",
@ -219,6 +217,9 @@ class SemanticEngine:
}
if source_columns.intersection(candidates):
return True
continue
if any(col.endswith(f"_{grain}") for col in source_columns):
return True
return False
def _check_sql_join_coverage(

View file

@ -323,6 +323,34 @@ class TestJoinValidation:
assert report.errors == []
def test_sql_join_coverage_does_not_treat_unrelated_id_suffix_as_id_key(self):
requesters = SourceDefinition(
name="large_contract_requesters",
sql="""
select accounts.account_name, requests.user_id
from orbit_raw.requests requests
join public.accounts accounts
on requests.account_id = accounts.id
""",
grain=["user_id"],
columns=[
SourceColumn(name="account_name", type="string"),
SourceColumn(name="user_id", type="string"),
],
joins=[],
)
accounts = _src("accounts", columns=["id", "account_name"], grain=["id"])
engine = SemanticEngine.from_sources(
{
"large_contract_requesters": requesters,
"accounts": accounts,
}
)
report = engine.validate(recently_touched={"large_contract_requesters"})
assert report.errors == []
def test_sql_join_coverage_requires_join_when_projected_key_exists(self):
requesters = SourceDefinition(
name="large_contract_requesters",