mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-16 08:25:14 +02:00
Initial open-source release
This commit is contained in:
commit
1a42152e6f
1199 changed files with 257054 additions and 0 deletions
220
python/klo-sl/tests/test_duplicate_check.py
Normal file
220
python/klo-sl/tests/test_duplicate_check.py
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
"""Tests for semantic_layer.duplicate_check.validate_measure_duplicates."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from semantic_layer.duplicate_check import validate_measure_duplicates
|
||||
from semantic_layer.models import (
|
||||
MeasureDefinition,
|
||||
SourceColumn,
|
||||
SourceDefinition,
|
||||
)
|
||||
|
||||
|
||||
def _make_source(name: str, measures: list[MeasureDefinition]) -> SourceDefinition:
|
||||
return SourceDefinition(
|
||||
name=name,
|
||||
table=f"public.{name}",
|
||||
grain=["id"],
|
||||
columns=[SourceColumn(name="id", type="number")],
|
||||
measures=measures,
|
||||
)
|
||||
|
||||
|
||||
def test_same_expr_different_filter_is_flagged() -> None:
|
||||
"""The replay-trimmed case: count(*) twice, one with is_active filter."""
|
||||
source = _make_source(
|
||||
"fct_subscriptions",
|
||||
[
|
||||
MeasureDefinition(
|
||||
name="active_subscription_count",
|
||||
expr="count(*)",
|
||||
filter="is_active = true",
|
||||
),
|
||||
MeasureDefinition(
|
||||
name="new_subscription_count",
|
||||
expr="count(*)",
|
||||
),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_subscriptions": source})
|
||||
assert len(errors) == 1
|
||||
assert "new_subscription_count" in errors[0]
|
||||
assert "active_subscription_count" in errors[0]
|
||||
assert "differs only by `filter`" in errors[0]
|
||||
|
||||
|
||||
def test_same_expr_same_filter_is_flagged() -> None:
|
||||
"""Two measures with identical expr and filter — flagged as duplicate pair."""
|
||||
source = _make_source(
|
||||
"fct_orders",
|
||||
[
|
||||
MeasureDefinition(
|
||||
name="order_count_a", expr="count(*)", filter="is_paid = true"
|
||||
),
|
||||
MeasureDefinition(
|
||||
name="order_count_b", expr="count(*)", filter="is_paid = true"
|
||||
),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_orders": source})
|
||||
assert len(errors) == 1
|
||||
assert "same expression and filter" in errors[0]
|
||||
|
||||
|
||||
def test_different_expr_is_not_flagged() -> None:
|
||||
"""count(*) vs sum(amount) on same source — legitimately distinct measures."""
|
||||
source = _make_source(
|
||||
"fct_orders",
|
||||
[
|
||||
MeasureDefinition(name="order_count", expr="count(*)"),
|
||||
MeasureDefinition(name="total_revenue", expr="sum(amount)"),
|
||||
MeasureDefinition(name="avg_revenue", expr="avg(amount)"),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_orders": source})
|
||||
assert errors == []
|
||||
|
||||
|
||||
def test_measures_on_different_sources_not_compared() -> None:
|
||||
"""Same expr on two different sources is not a duplicate."""
|
||||
a = _make_source("fct_a", [MeasureDefinition(name="total", expr="count(*)")])
|
||||
b = _make_source("fct_b", [MeasureDefinition(name="total", expr="count(*)")])
|
||||
errors = validate_measure_duplicates({"fct_a": a, "fct_b": b})
|
||||
assert errors == []
|
||||
|
||||
|
||||
def test_whitespace_and_case_are_normalized() -> None:
|
||||
"""COUNT(*) and count(*) and count( * ) all compare equal."""
|
||||
source = _make_source(
|
||||
"fct_orders",
|
||||
[
|
||||
MeasureDefinition(name="a", expr="count(*)"),
|
||||
MeasureDefinition(name="b", expr="COUNT(*)"),
|
||||
MeasureDefinition(name="c", expr=" count( * ) "),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_orders": source})
|
||||
# Three measures pairwise — should yield 3 errors (a vs b, a vs c, b vs c)
|
||||
assert len(errors) == 3
|
||||
|
||||
|
||||
def test_unparseable_expr_is_skipped_not_errored() -> None:
|
||||
"""A measure whose expr can't be parsed is ignored — don't block commit."""
|
||||
source = _make_source(
|
||||
"fct_orders",
|
||||
[
|
||||
MeasureDefinition(name="bad", expr="!!! not SQL !!!"),
|
||||
MeasureDefinition(name="good", expr="count(*)"),
|
||||
],
|
||||
)
|
||||
# Should not raise, should not flag — the parser validator will catch the bad one elsewhere
|
||||
errors = validate_measure_duplicates({"fct_orders": source})
|
||||
assert errors == []
|
||||
|
||||
|
||||
def test_non_commutative_args_not_treated_as_equivalent() -> None:
|
||||
"""safe_divide(a, b) is NOT equivalent to safe_divide(b, a)."""
|
||||
source = _make_source(
|
||||
"fct_orders",
|
||||
[
|
||||
MeasureDefinition(
|
||||
name="ratio_ab", expr="safe_divide(count(*), sum(amount))"
|
||||
),
|
||||
MeasureDefinition(
|
||||
name="ratio_ba", expr="safe_divide(sum(amount), count(*))"
|
||||
),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_orders": source})
|
||||
assert errors == []
|
||||
|
||||
|
||||
def test_single_measure_source_no_comparison() -> None:
|
||||
source = _make_source(
|
||||
"fct_orders", [MeasureDefinition(name="total", expr="count(*)")]
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_orders": source})
|
||||
assert errors == []
|
||||
|
||||
|
||||
def test_same_expr_different_segments_is_not_flagged() -> None:
|
||||
"""Two measures with same expr but different named segments are by-design distinct."""
|
||||
source = _make_source(
|
||||
"fct_subscriptions",
|
||||
[
|
||||
MeasureDefinition(
|
||||
name="active_count", expr="count(*)", segments=["active"]
|
||||
),
|
||||
MeasureDefinition(
|
||||
name="inactive_count", expr="count(*)", segments=["inactive"]
|
||||
),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_subscriptions": source})
|
||||
assert errors == []
|
||||
|
||||
|
||||
def test_same_expr_same_segments_is_flagged() -> None:
|
||||
"""Same expr + same segment set = a true duplicate."""
|
||||
source = _make_source(
|
||||
"fct_subscriptions",
|
||||
[
|
||||
MeasureDefinition(name="a_count", expr="count(*)", segments=["active"]),
|
||||
MeasureDefinition(name="b_count", expr="count(*)", segments=["active"]),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_subscriptions": source})
|
||||
assert len(errors) == 1
|
||||
assert "same expression and filter" in errors[0]
|
||||
|
||||
|
||||
def test_segment_difference_with_filter_difference_not_flagged() -> None:
|
||||
"""Segments differ → distinct measures even if filter also differs."""
|
||||
source = _make_source(
|
||||
"fct_subscriptions",
|
||||
[
|
||||
MeasureDefinition(
|
||||
name="m1",
|
||||
expr="count(*)",
|
||||
segments=["active"],
|
||||
filter="protocol = 'TRT'",
|
||||
),
|
||||
MeasureDefinition(name="m2", expr="count(*)", segments=["inactive"]),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_subscriptions": source})
|
||||
assert errors == []
|
||||
|
||||
|
||||
def test_bigquery_native_exprs_compared_correctly():
|
||||
"""Two measures with identical BigQuery-native exprs must be flagged as duplicates."""
|
||||
from semantic_layer.duplicate_check import validate_measure_duplicates
|
||||
from semantic_layer.models import (
|
||||
MeasureDefinition,
|
||||
SourceColumn,
|
||||
SourceDefinition,
|
||||
)
|
||||
|
||||
source = SourceDefinition(
|
||||
name="fct_orders",
|
||||
table="fct_orders",
|
||||
grain=["id"],
|
||||
columns=[
|
||||
SourceColumn(name="id", type="number"),
|
||||
SourceColumn(name="amount", type="number"),
|
||||
],
|
||||
measures=[
|
||||
MeasureDefinition(
|
||||
name="safe_ratio_a",
|
||||
expr="SAFE_DIVIDE(sum(amount), count(*))",
|
||||
),
|
||||
MeasureDefinition(
|
||||
name="safe_ratio_b",
|
||||
expr="SAFE_DIVIDE(sum(amount), count(*))",
|
||||
),
|
||||
],
|
||||
)
|
||||
errors = validate_measure_duplicates({"fct_orders": source}, dialect="bigquery")
|
||||
assert any("safe_ratio_a" in e and "safe_ratio_b" in e for e in errors), (
|
||||
f"Duplicate detection missed identical BigQuery-native exprs: {errors}"
|
||||
)
|
||||
Loading…
Add table
Add a link
Reference in a new issue