mirror of
https://github.com/Kaelio/ktx.git
synced 2026-07-04 10:52:13 +02:00
fix(sl): correct reserved-word/week-grain SQL and classify sl_query errors (#340)
Reserved-word columns (like, default, ...) referenced as source.col were quoted with postgres double quotes even on BigQuery/MySQL, where a double-quoted token is a string literal, not an identifier -- the "Unexpected string literal" semantic-layer errors. quote_reserved_identifiers now uses the identifier quote char of the dialect it will be parsed in (backtick for BigQuery/MySQL), threaded through the planner and generator parse sites; week_<weekday> granularity now emits WEEK(<weekday>) on BigQuery instead of the invalid WEEK_MONDAY. On the telemetry side, warehouse rejections from the sl_query execution path are classified as expected (KtxQueryError) via a new shared markExpected() helper, so routine agent/warehouse query failures stop reaching PostHog Error Tracking as ktx faults; the sql_execution catch is refactored onto the same helper. The daemon-compile boundary is deliberately left unclassified here so genuine daemon crashes stay visible.
This commit is contained in:
parent
a0d19ba26f
commit
4ebce75449
6 changed files with 156 additions and 21 deletions
|
|
@ -42,7 +42,7 @@ def validate_measure_duplicates(
|
|||
parsed: list[tuple[str, exp.Expression | None, str | None, frozenset[str]]] = []
|
||||
for m in source.measures:
|
||||
try:
|
||||
quoted = quote_reserved_identifiers(m.expr)
|
||||
quoted = quote_reserved_identifiers(m.expr, dialect)
|
||||
tree = sqlglot.parse_one(f"SELECT {quoted}", read=dialect)
|
||||
expr_node = tree.expressions[0] if tree.expressions else None
|
||||
except Exception:
|
||||
|
|
|
|||
|
|
@ -671,7 +671,8 @@ class SqlGenerator:
|
|||
"""Apply a measure-level filter by injecting CASE WHEN into each aggregate."""
|
||||
try:
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT {quote_reserved_identifiers(expr)}", read=self.dialect
|
||||
f"SELECT {quote_reserved_identifiers(expr, self.dialect)}",
|
||||
read=self.dialect,
|
||||
)
|
||||
select_expr = tree.expressions[0]
|
||||
if isinstance(select_expr, exp.Alias):
|
||||
|
|
@ -723,7 +724,8 @@ class SqlGenerator:
|
|||
def _translate_custom_funcs(self, expr: str) -> str:
|
||||
"""Translate custom functions: median(), percentile(), count_distinct()."""
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT {quote_reserved_identifiers(expr)}", read=self.dialect
|
||||
f"SELECT {quote_reserved_identifiers(expr, self.dialect)}",
|
||||
read=self.dialect,
|
||||
)
|
||||
|
||||
has_custom = False
|
||||
|
|
@ -767,7 +769,8 @@ class SqlGenerator:
|
|||
def _extract_outer_aggregate(self, expr: str) -> tuple[str | None, str | None]:
|
||||
"""Use AST to extract the outer aggregate function name and inner expression."""
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT {quote_reserved_identifiers(expr)}", read=self.dialect
|
||||
f"SELECT {quote_reserved_identifiers(expr, self.dialect)}",
|
||||
read=self.dialect,
|
||||
)
|
||||
select_expr = tree.expressions[0]
|
||||
if isinstance(select_expr, exp.Alias):
|
||||
|
|
@ -788,7 +791,8 @@ class SqlGenerator:
|
|||
if not replacements:
|
||||
return expr
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT {quote_reserved_identifiers(expr)}", read=self.dialect
|
||||
f"SELECT {quote_reserved_identifiers(expr, self.dialect)}",
|
||||
read=self.dialect,
|
||||
)
|
||||
|
||||
def _replace(node):
|
||||
|
|
@ -857,13 +861,26 @@ class SqlGenerator:
|
|||
return self._time_trunc(dim.granularity, field)
|
||||
return field
|
||||
|
||||
_WEEKDAYS = frozenset(
|
||||
{"sunday", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday"}
|
||||
)
|
||||
|
||||
def _bigquery_date_part(self, granularity: str) -> str:
|
||||
# BigQuery spells a week starting on a given weekday as WEEK(MONDAY),
|
||||
# not the bare WEEK_MONDAY that other systems accept.
|
||||
if granularity.startswith("week_"):
|
||||
weekday = granularity[len("week_") :]
|
||||
if weekday in self._WEEKDAYS:
|
||||
return f"WEEK({weekday.upper()})"
|
||||
return granularity.upper()
|
||||
|
||||
def _time_trunc(self, granularity: str, field: str) -> str:
|
||||
"""Generate dialect-appropriate time truncation expression."""
|
||||
g = granularity.lower()
|
||||
if self.dialect == "sqlite":
|
||||
return self._sqlite_time_trunc(g, field)
|
||||
if self.dialect == "bigquery":
|
||||
return f"DATE_TRUNC({field}, {g.upper()})"
|
||||
return f"DATE_TRUNC({field}, {self._bigquery_date_part(g)})"
|
||||
if self.dialect == "mysql":
|
||||
return self._mysql_time_trunc(g, field)
|
||||
return f"DATE_TRUNC('{g}', {field})"
|
||||
|
|
@ -1257,7 +1274,8 @@ class SqlGenerator:
|
|||
"""Qualify bare column references in a computed column expression with the source name."""
|
||||
try:
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT {quote_reserved_identifiers(expr)}", read=self.dialect
|
||||
f"SELECT {quote_reserved_identifiers(expr, self.dialect)}",
|
||||
read=self.dialect,
|
||||
)
|
||||
|
||||
def _qualify(node: exp.Expression) -> exp.Expression:
|
||||
|
|
@ -1403,7 +1421,7 @@ class SqlGenerator:
|
|||
try:
|
||||
# Quote reserved-word identifiers so target dialect parsers do not
|
||||
# confuse them with keywords (e.g. Snowflake's SAMPLE, QUALIFY).
|
||||
quoted_outer = quote_reserved_identifiers(outer_sql)
|
||||
quoted_outer = quote_reserved_identifiers(outer_sql, self.dialect)
|
||||
results = sqlglot.transpile(
|
||||
quoted_outer, read=self.dialect, write=self.dialect
|
||||
)
|
||||
|
|
|
|||
|
|
@ -254,7 +254,7 @@ class JoinGraph:
|
|||
from sqlglot import exp as _exp
|
||||
from semantic_layer.parser import quote_reserved_identifiers
|
||||
|
||||
quoted = quote_reserved_identifiers(on_clause)
|
||||
quoted = quote_reserved_identifiers(on_clause, self.dialect)
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT 1 FROM _a JOIN _b ON {quoted}", read=self.dialect
|
||||
)
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from dataclasses import dataclass, field
|
|||
|
||||
import sqlglot
|
||||
from sqlglot import exp
|
||||
from sqlglot.dialects.dialect import Dialect
|
||||
|
||||
# DIALECT CONVENTION:
|
||||
# `ExpressionParser` wraps read-only AST walks over user-authored
|
||||
|
|
@ -154,12 +155,29 @@ def _strip_quotes(name: str) -> str:
|
|||
return name
|
||||
|
||||
|
||||
def quote_reserved_identifiers(expr: str) -> str:
|
||||
@functools.lru_cache(maxsize=32)
|
||||
def _identifier_quote(dialect: str) -> str:
|
||||
"""The character `dialect` quotes identifiers with (backtick for BigQuery/MySQL,
|
||||
double-quote for ANSI dialects). A reserved-word column must be quoted with this so a
|
||||
backtick dialect does not read a double-quoted identifier as a string literal."""
|
||||
try:
|
||||
start = Dialect.get_or_raise(dialect).IDENTIFIER_START
|
||||
except Exception:
|
||||
start = '"'
|
||||
return start or '"'
|
||||
|
||||
|
||||
def quote_reserved_identifiers(expr: str, dialect: str = "postgres") -> str:
|
||||
"""Quote source.column references where either part is a SQL reserved word.
|
||||
|
||||
String literals are masked before processing to prevent matching
|
||||
dotted identifiers inside quoted strings like 'group.value'.
|
||||
Quoted with `dialect`'s identifier character, because the caller parses the
|
||||
result in that dialect: a double-quoted identifier on BigQuery/MySQL is a
|
||||
string literal, not an identifier. String literals are masked before
|
||||
processing to prevent matching dotted identifiers inside quoted strings like
|
||||
'group.value'.
|
||||
"""
|
||||
quote = _identifier_quote(dialect)
|
||||
|
||||
# Mask string literals to avoid matching inside them
|
||||
literals: list[str] = []
|
||||
|
||||
|
|
@ -172,16 +190,16 @@ def quote_reserved_identifiers(expr: str) -> str:
|
|||
def _quote_match(m: re.Match) -> str:
|
||||
source, col = m.group(1), m.group(2)
|
||||
start = m.start()
|
||||
if start > 0 and masked[start - 1] == '"':
|
||||
if start > 0 and masked[start - 1] == quote:
|
||||
return m.group(0)
|
||||
needs_quote = False
|
||||
source_q = source
|
||||
col_q = col
|
||||
if source.lower() in _SQL_RESERVED:
|
||||
source_q = f'"{source}"'
|
||||
source_q = f"{quote}{source}{quote}"
|
||||
needs_quote = True
|
||||
if col.lower() in _SQL_RESERVED:
|
||||
col_q = f'"{col}"'
|
||||
col_q = f"{quote}{col}{quote}"
|
||||
needs_quote = True
|
||||
if needs_quote:
|
||||
return f"{source_q}.{col_q}"
|
||||
|
|
@ -196,13 +214,13 @@ def quote_reserved_identifiers(expr: str) -> str:
|
|||
return result
|
||||
|
||||
|
||||
def _predicate_select(expr: str) -> str:
|
||||
def _predicate_select(expr: str, dialect: str = "postgres") -> str:
|
||||
"""Wrap a user expression as `SELECT * WHERE …`, quoting reserved identifiers.
|
||||
|
||||
Predicate, not projection: T-SQL reads a top-level `col = 'value'` projection
|
||||
as the `alias = expression` form and would compile the filter to `'value' AS col`.
|
||||
"""
|
||||
return f"SELECT * WHERE {quote_reserved_identifiers(expr)}"
|
||||
return f"SELECT * WHERE {quote_reserved_identifiers(expr, dialect)}"
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=256)
|
||||
|
|
@ -220,7 +238,11 @@ def parse_predicate(expr: str, dialect: str) -> exp.Expression:
|
|||
|
||||
Uncached, so the result is safe to `.transform()`; raises on unparseable input.
|
||||
"""
|
||||
return sqlglot.parse_one(_predicate_select(expr), read=dialect).find(exp.Where).this
|
||||
return (
|
||||
sqlglot.parse_one(_predicate_select(expr, dialect), read=dialect)
|
||||
.find(exp.Where)
|
||||
.this
|
||||
)
|
||||
|
||||
|
||||
class ExpressionParser:
|
||||
|
|
@ -237,7 +259,7 @@ class ExpressionParser:
|
|||
|
||||
def _parse_as_select(self, expr: str) -> exp.Expression:
|
||||
"""Parse a user fragment for read-only AST walks, via the parse cache."""
|
||||
return _cached_parse_select(_predicate_select(expr), self.dialect)
|
||||
return _cached_parse_select(_predicate_select(expr, self.dialect), self.dialect)
|
||||
|
||||
def parse(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -694,7 +694,8 @@ class QueryPlanner:
|
|||
all_dep_names.add(dep_name)
|
||||
named.add(dep_name)
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT {quote_reserved_identifiers(expr)}", dialect=self.dialect
|
||||
f"SELECT {quote_reserved_identifiers(expr, self.dialect)}",
|
||||
dialect=self.dialect,
|
||||
)
|
||||
|
||||
def _replace(node):
|
||||
|
|
@ -738,7 +739,8 @@ class QueryPlanner:
|
|||
"""Reject expressions with nested aggregate functions (e.g., avg(sum(x)))."""
|
||||
try:
|
||||
tree = sqlglot.parse_one(
|
||||
f"SELECT {quote_reserved_identifiers(expr)}", dialect=self.dialect
|
||||
f"SELECT {quote_reserved_identifiers(expr, self.dialect)}",
|
||||
dialect=self.dialect,
|
||||
)
|
||||
for agg_node in tree.find_all(exp.AggFunc):
|
||||
# Check if this aggregate contains another aggregate inside
|
||||
|
|
|
|||
93
python/ktx-sl/tests/test_dialect_identifier_quoting.py
Normal file
93
python/ktx-sl/tests/test_dialect_identifier_quoting.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
"""Reserved-word identifiers and week-of-day granularity must produce valid SQL
|
||||
in the connection's native dialect.
|
||||
|
||||
Regression: on backtick dialects (BigQuery, MySQL) a reserved-word column such
|
||||
as `default` or `like` was quoted with postgres-style double quotes, which those
|
||||
dialects read as a *string literal* — `WHERE loans.'like' = 'x'` — yielding the
|
||||
production error `Syntax error: Unexpected string literal "like"`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlglot
|
||||
|
||||
from conftest import make_engine
|
||||
|
||||
|
||||
def _loans_engine(dialect: str):
|
||||
source = {
|
||||
"name": "loans",
|
||||
"table": "mydataset.loans",
|
||||
"grain": ["id"],
|
||||
"columns": [
|
||||
{"name": "id", "type": "number"},
|
||||
{"name": "amount", "type": "number"},
|
||||
{"name": "default", "type": "boolean"},
|
||||
{"name": "like", "type": "string"},
|
||||
{"name": "created_at", "type": "time"},
|
||||
],
|
||||
"measures": [{"name": "total", "expr": "sum(amount)"}],
|
||||
}
|
||||
return make_engine({"loans": source}, dialect=dialect)
|
||||
|
||||
|
||||
def test_reserved_word_column_filter_valid_on_bigquery():
|
||||
sql = (
|
||||
_loans_engine("bigquery")
|
||||
.query({"measures": ["loans.total"], "filters": ["loans.default = true"]})
|
||||
.sql
|
||||
)
|
||||
sqlglot.parse_one(sql, read="bigquery") # must not raise
|
||||
assert "`default`" in sql
|
||||
assert "'default'" not in sql
|
||||
|
||||
|
||||
def test_reserved_word_column_filter_valid_on_mysql():
|
||||
sql = (
|
||||
_loans_engine("mysql")
|
||||
.query({"measures": ["loans.total"], "filters": ["loans.default = true"]})
|
||||
.sql
|
||||
)
|
||||
sqlglot.parse_one(sql, read="mysql")
|
||||
assert "`default`" in sql
|
||||
assert "'default'" not in sql
|
||||
|
||||
|
||||
def test_like_column_filter_valid_on_bigquery():
|
||||
# Mirrors the production message: Unexpected string literal "like".
|
||||
sql = (
|
||||
_loans_engine("bigquery")
|
||||
.query({"measures": ["loans.total"], "filters": ["loans.like = 'x'"]})
|
||||
.sql
|
||||
)
|
||||
sqlglot.parse_one(sql, read="bigquery")
|
||||
assert "`like`" in sql
|
||||
assert "'like'" not in sql
|
||||
|
||||
|
||||
def test_reserved_word_column_still_double_quoted_on_snowflake():
|
||||
sql = (
|
||||
_loans_engine("snowflake")
|
||||
.query({"measures": ["loans.total"], "filters": ["loans.default = true"]})
|
||||
.sql
|
||||
)
|
||||
sqlglot.parse_one(sql, read="snowflake")
|
||||
assert '"default"' in sql
|
||||
|
||||
|
||||
def test_week_weekday_granularity_translated_on_bigquery():
|
||||
sql = (
|
||||
_loans_engine("bigquery")
|
||||
.query(
|
||||
{
|
||||
"measures": ["loans.total"],
|
||||
"dimensions": [
|
||||
{"field": "loans.created_at", "granularity": "week_monday"}
|
||||
],
|
||||
}
|
||||
)
|
||||
.sql
|
||||
)
|
||||
sqlglot.parse_one(sql, read="bigquery") # must not raise
|
||||
assert "WEEK_MONDAY" not in sql
|
||||
assert "WEEK(MONDAY)" in sql
|
||||
Loading…
Add table
Add a link
Reference in a new issue