ktx/python/ktx-sl/tests/test_duplicate_check.py

221 lines
7.4 KiB
Python
Raw Permalink Normal View History

2026-05-10 23:12:26 +02:00
"""Tests for semantic_layer.duplicate_check.validate_measure_duplicates."""
from __future__ import annotations
from semantic_layer.duplicate_check import validate_measure_duplicates
from semantic_layer.models import (
MeasureDefinition,
SourceColumn,
SourceDefinition,
)
def _make_source(name: str, measures: list[MeasureDefinition]) -> SourceDefinition:
return SourceDefinition(
name=name,
table=f"public.{name}",
grain=["id"],
columns=[SourceColumn(name="id", type="number")],
measures=measures,
)
def test_same_expr_different_filter_is_flagged() -> None:
"""The replay-trimmed case: count(*) twice, one with is_active filter."""
source = _make_source(
"fct_subscriptions",
[
MeasureDefinition(
name="active_subscription_count",
expr="count(*)",
filter="is_active = true",
),
MeasureDefinition(
name="new_subscription_count",
expr="count(*)",
),
],
)
errors = validate_measure_duplicates({"fct_subscriptions": source})
assert len(errors) == 1
assert "new_subscription_count" in errors[0]
assert "active_subscription_count" in errors[0]
assert "differs only by `filter`" in errors[0]
def test_same_expr_same_filter_is_flagged() -> None:
"""Two measures with identical expr and filter — flagged as duplicate pair."""
source = _make_source(
"fct_orders",
[
MeasureDefinition(
name="order_count_a", expr="count(*)", filter="is_paid = true"
),
MeasureDefinition(
name="order_count_b", expr="count(*)", filter="is_paid = true"
),
],
)
errors = validate_measure_duplicates({"fct_orders": source})
assert len(errors) == 1
assert "same expression and filter" in errors[0]
def test_different_expr_is_not_flagged() -> None:
"""count(*) vs sum(amount) on same source — legitimately distinct measures."""
source = _make_source(
"fct_orders",
[
MeasureDefinition(name="order_count", expr="count(*)"),
MeasureDefinition(name="total_revenue", expr="sum(amount)"),
MeasureDefinition(name="avg_revenue", expr="avg(amount)"),
],
)
errors = validate_measure_duplicates({"fct_orders": source})
assert errors == []
def test_measures_on_different_sources_not_compared() -> None:
"""Same expr on two different sources is not a duplicate."""
a = _make_source("fct_a", [MeasureDefinition(name="total", expr="count(*)")])
b = _make_source("fct_b", [MeasureDefinition(name="total", expr="count(*)")])
errors = validate_measure_duplicates({"fct_a": a, "fct_b": b})
assert errors == []
def test_whitespace_and_case_are_normalized() -> None:
"""COUNT(*) and count(*) and count( * ) all compare equal."""
source = _make_source(
"fct_orders",
[
MeasureDefinition(name="a", expr="count(*)"),
MeasureDefinition(name="b", expr="COUNT(*)"),
MeasureDefinition(name="c", expr=" count( * ) "),
],
)
errors = validate_measure_duplicates({"fct_orders": source})
# Three measures pairwise — should yield 3 errors (a vs b, a vs c, b vs c)
assert len(errors) == 3
def test_unparseable_expr_is_skipped_not_errored() -> None:
"""A measure whose expr can't be parsed is ignored — don't block commit."""
source = _make_source(
"fct_orders",
[
MeasureDefinition(name="bad", expr="!!! not SQL !!!"),
MeasureDefinition(name="good", expr="count(*)"),
],
)
# Should not raise, should not flag — the parser validator will catch the bad one elsewhere
errors = validate_measure_duplicates({"fct_orders": source})
assert errors == []
def test_non_commutative_args_not_treated_as_equivalent() -> None:
"""safe_divide(a, b) is NOT equivalent to safe_divide(b, a)."""
source = _make_source(
"fct_orders",
[
MeasureDefinition(
name="ratio_ab", expr="safe_divide(count(*), sum(amount))"
),
MeasureDefinition(
name="ratio_ba", expr="safe_divide(sum(amount), count(*))"
),
],
)
errors = validate_measure_duplicates({"fct_orders": source})
assert errors == []
def test_single_measure_source_no_comparison() -> None:
source = _make_source(
"fct_orders", [MeasureDefinition(name="total", expr="count(*)")]
)
errors = validate_measure_duplicates({"fct_orders": source})
assert errors == []
def test_same_expr_different_segments_is_not_flagged() -> None:
"""Two measures with same expr but different named segments are by-design distinct."""
source = _make_source(
"fct_subscriptions",
[
MeasureDefinition(
name="active_count", expr="count(*)", segments=["active"]
),
MeasureDefinition(
name="inactive_count", expr="count(*)", segments=["inactive"]
),
],
)
errors = validate_measure_duplicates({"fct_subscriptions": source})
assert errors == []
def test_same_expr_same_segments_is_flagged() -> None:
"""Same expr + same segment set = a true duplicate."""
source = _make_source(
"fct_subscriptions",
[
MeasureDefinition(name="a_count", expr="count(*)", segments=["active"]),
MeasureDefinition(name="b_count", expr="count(*)", segments=["active"]),
],
)
errors = validate_measure_duplicates({"fct_subscriptions": source})
assert len(errors) == 1
assert "same expression and filter" in errors[0]
def test_segment_difference_with_filter_difference_not_flagged() -> None:
"""Segments differ → distinct measures even if filter also differs."""
source = _make_source(
"fct_subscriptions",
[
MeasureDefinition(
name="m1",
expr="count(*)",
segments=["active"],
filter="protocol = 'TRT'",
),
MeasureDefinition(name="m2", expr="count(*)", segments=["inactive"]),
],
)
errors = validate_measure_duplicates({"fct_subscriptions": source})
assert errors == []
def test_bigquery_native_exprs_compared_correctly():
"""Two measures with identical BigQuery-native exprs must be flagged as duplicates."""
from semantic_layer.duplicate_check import validate_measure_duplicates
from semantic_layer.models import (
MeasureDefinition,
SourceColumn,
SourceDefinition,
)
source = SourceDefinition(
name="fct_orders",
table="fct_orders",
grain=["id"],
columns=[
SourceColumn(name="id", type="number"),
SourceColumn(name="amount", type="number"),
],
measures=[
MeasureDefinition(
name="safe_ratio_a",
expr="SAFE_DIVIDE(sum(amount), count(*))",
),
MeasureDefinition(
name="safe_ratio_b",
expr="SAFE_DIVIDE(sum(amount), count(*))",
),
],
)
errors = validate_measure_duplicates({"fct_orders": source}, dialect="bigquery")
assert any("safe_ratio_a" in e and "safe_ratio_b" in e for e in errors), (
f"Duplicate detection missed identical BigQuery-native exprs: {errors}"
)