ktx/python/ktx-sl/tests/test_segments.py

"""Tests for named segments — reusable boolean predicates on a source.

Segments are AND-ed into the measure's effective filter via the same CASE WHEN
pathway used by `measure.filter`. They never become a global WHERE clause.
"""

from __future__ import annotations

import pytest

from .conftest import assert_valid_sql, make_engine


def _orders_source(**overrides):
    base = {
        "name": "orders",
        "table": "public.orders",
        "grain": ["id"],
        "columns": [
            {"name": "id", "type": "number"},
            {"name": "amount", "type": "number"},
            {"name": "is_paid", "type": "boolean"},
            {"name": "is_refunded", "type": "string"},
            {"name": "customer_id", "type": "number"},
        ],
        "segments": [
            {
                "name": "paid_non_refunded",
                "expr": "is_paid = true and is_refunded = '0'",
                "description": "Settled, not reversed.",
            },
        ],
        "measures": [
            {
                "name": "total_revenue",
                "expr": "sum(amount)",
                "segments": ["paid_non_refunded"],
            },
        ],
    }
    base.update(overrides)
    return base


def _customers_source(**overrides):
    base = {
        "name": "customers",
        "table": "public.customers",
        "grain": ["id"],
        "columns": [
            {"name": "id", "type": "number"},
            {"name": "is_vip", "type": "boolean"},
        ],
        "measures": [
            {"name": "customer_count", "expr": "count(distinct id)"},
        ],
    }
    base.update(overrides)
    return base


# ── Composition + golden SQL shape ───────────────────────────────────


class TestSegmentComposition:
    def test_measure_segment_lands_in_case_when_wrap(self):
        engine = make_engine({"orders": _orders_source()})
        result = engine.query({"measures": ["orders.total_revenue"]})
        assert_valid_sql(result.sql)
        sql_upper = result.sql.upper()
        # Filter must be inside CASE WHEN (the measure-filter pathway)
        assert "CASE WHEN" in sql_upper
        assert "is_paid" in result.sql.lower()
        assert "is_refunded" in result.sql.lower()
        # Should NOT show up as a global WHERE
        # (a WHERE clause may exist for other reasons — assert no segment expr in it)
        # Easiest: assert WHERE doesn't contain the segment's exact predicate.
        # Split before/after first WHERE keyword if any.
        assert "WHERE IS_PAID" not in sql_upper.replace(" = ", " = ")

    def test_measure_filter_and_segment_both_applied(self):
        src = _orders_source()
        src["measures"][0]["filter"] = "amount > 0"
        engine = make_engine({"orders": src})
        result = engine.query({"measures": ["orders.total_revenue"]})
        assert_valid_sql(result.sql)
        sql_lower = result.sql.lower()
        # Both predicates appear inside the measure's CASE WHEN wrap
        assert "amount > 0" in sql_lower
        assert "is_paid" in sql_lower
        assert "is_refunded" in sql_lower
        # AND composition: ensure both halves are joined
        assert " and " in sql_lower

    def test_query_time_segment_applies_to_measure(self):
        # Measure has no measure-bound segment; segment is applied at query time.
        src = _orders_source()
        src["measures"] = [{"name": "raw_revenue", "expr": "sum(amount)"}]
        engine = make_engine({"orders": src})
        result = engine.query(
            {
                "measures": ["orders.raw_revenue"],
                "segments": ["orders.paid_non_refunded"],
            }
        )
        assert_valid_sql(result.sql)
        sql_lower = result.sql.lower()
        assert "case when" in sql_lower
        assert "is_paid" in sql_lower
        assert "is_refunded" in sql_lower

    def test_measure_and_query_segments_compose(self):
        # Measure has paid_non_refunded; query adds 'high_value'.
        src = _orders_source()
        src["segments"].append(
            {"name": "high_value", "expr": "amount >= 100"},
        )
        engine = make_engine({"orders": src})
        result = engine.query(
            {
                "measures": ["orders.total_revenue"],
                "segments": ["orders.high_value"],
            }
        )
        assert_valid_sql(result.sql)
        sql_lower = result.sql.lower()
        # All three predicates present
        assert "is_paid" in sql_lower
        assert "is_refunded" in sql_lower
        assert "amount >= 100" in sql_lower


# ── Multi-source query: scope is per-measure, not global ─────────────


class TestSegmentMultiSourceScope:
    def test_segment_does_not_apply_to_other_source_measures(self):
        # Query touches both orders and customers; segment is on orders only.
        # Assert that the segment predicate does NOT show up in the
        # customers CTE / WHERE on customers.
        engine = make_engine(
            {
                "orders": _orders_source(
                    joins=[
                        {
                            "to": "customers",
                            "on": "customer_id = customers.id",
                            "relationship": "many_to_one",
                        }
                    ],
                    measures=[
                        {"name": "raw_revenue", "expr": "sum(amount)"},
                    ],
                ),
                "customers": _customers_source(),
            }
        )
        result = engine.query(
            {
                "measures": [
                    "orders.raw_revenue",
                    "customers.customer_count",
                ],
                "segments": ["orders.paid_non_refunded"],
            }
        )
        assert_valid_sql(result.sql)
        sql_lower = result.sql.lower()
        # Segment predicate appears (it landed on orders)
        assert "is_paid" in sql_lower
        # The customers measure's pre-aggregation CTE / clause must not be filtered by the segment.
        # Heuristic: find each line that references count(distinct ... id) and assert no
        # "is_paid" or "is_refunded" in the same CASE WHEN block. The simpler assertion
        # is that there's no global WHERE applying the segment.
        # We assert the segment doesn't appear inside an aggregate against the customers source.
        # Concretely: count(...customers...) should not contain is_paid/is_refunded.
        # Walk the SQL and find COUNT(DISTINCT ... ID) — that aggregate must be unfiltered.
        import re

        count_aggs = re.findall(
            r"COUNT\s*\(\s*DISTINCT[^()]*\)", result.sql, flags=re.IGNORECASE
        )
        assert count_aggs, "expected at least one COUNT(DISTINCT ...) aggregate"
        for agg in count_aggs:
            assert "is_paid" not in agg.lower(), (
                f"customer_count aggregate must not be filtered by segment: {agg}"
            )


# ── Error cases ──────────────────────────────────────────────────────


class TestSegmentErrors:
    def test_unknown_bare_name_in_measure_segments(self):
        src = _orders_source()
        src["measures"][0]["segments"] = ["does_not_exist"]
        engine = make_engine({"orders": src})
        with pytest.raises(ValueError, match="unknown segment 'does_not_exist'"):
            engine.query({"measures": ["orders.total_revenue"]})

    def test_unknown_query_time_segment_name(self):
        engine = make_engine({"orders": _orders_source()})
        with pytest.raises(ValueError, match="unknown segment 'does_not_exist'"):
            engine.query(
                {
                    "measures": ["orders.total_revenue"],
                    "segments": ["orders.does_not_exist"],
                }
            )

    def test_unknown_query_time_segment_source(self):
        engine = make_engine({"orders": _orders_source()})
        with pytest.raises(ValueError, match="unknown source 'no_such_source'"):
            engine.query(
                {
                    "measures": ["orders.total_revenue"],
                    "segments": ["no_such_source.foo"],
                }
            )

    def test_query_time_segment_must_be_dotted(self):
        engine = make_engine({"orders": _orders_source()})
        with pytest.raises(ValueError, match="dotted"):
            engine.query(
                {
                    "measures": ["orders.total_revenue"],
                    "segments": ["paid_non_refunded"],  # missing source prefix
                }
            )

    def test_no_op_query_time_segment_errors(self):
        # Segment on customers, but no customers measure in the query.
        engine = make_engine(
            {
                "orders": _orders_source(
                    joins=[
                        {
                            "to": "customers",
                            "on": "customer_id = customers.id",
                            "relationship": "many_to_one",
                        }
                    ],
                    measures=[{"name": "raw_revenue", "expr": "sum(amount)"}],
                ),
                "customers": _customers_source(
                    segments=[{"name": "vips", "expr": "is_vip = true"}]
                ),
            }
        )
        with pytest.raises(ValueError, match="no matching"):
            engine.query(
                {
                    "measures": ["orders.raw_revenue"],
                    "segments": ["customers.vips"],
                }
            )


def test_bigquery_native_segment_referenced_by_measure(make_engine_factory):
    """Segment authored in BigQuery dialect, referenced by a measure,
    must not degrade the segment's native syntax when composed."""
    source = {
        "name": "fct_orders",
        "table": "fct_orders",
        "grain": ["id"],
        "columns": [
            {"name": "id", "type": "number"},
            {"name": "status", "type": "string"},
            {"name": "ts", "type": "time"},
        ],
        "segments": [
            {"name": "non_cancelled", "expr": "status != 'cancelled'"},
            {
                "name": "last_30",
                "expr": "ts >= timestamp(date_sub(current_date(), interval 30 day))",
            },
        ],
        "measures": [
            {
                "name": "dau",
                "expr": "count(distinct id)",
                "segments": ["non_cancelled", "last_30"],
            }
        ],
    }
    engine = make_engine_factory({"fct_orders": source}, dialect="bigquery")
    result = engine.query(
        {"measures": ["fct_orders.dau"], "dimensions": [], "filters": []}
    )
    sql = result.sql
    assert "INTERVAL '30'" not in sql or "DAY" in sql.upper(), (
        f"INTERVAL unit lost in segment reference:\n{sql}"
    )
Initial open-source release 2026-05-10 23:12:26 +02:00			`"""Tests for named segments — reusable boolean predicates on a source.`

			`Segments are AND-ed into the measure's effective filter via the same CASE WHEN`
			pathway used by `measure.filter`. They never become a global WHERE clause.
			`"""`

			`from __future__ import annotations`

			`import pytest`

			`from .conftest import assert_valid_sql, make_engine`


			`def _orders_source(**overrides):`
			`base = {`
			`"name": "orders",`
			`"table": "public.orders",`
			`"grain": ["id"],`
			`"columns": [`
			`{"name": "id", "type": "number"},`
			`{"name": "amount", "type": "number"},`
			`{"name": "is_paid", "type": "boolean"},`
			`{"name": "is_refunded", "type": "string"},`
			`{"name": "customer_id", "type": "number"},`
			`],`
			`"segments": [`
			`{`
			`"name": "paid_non_refunded",`
			`"expr": "is_paid = true and is_refunded = '0'",`
			`"description": "Settled, not reversed.",`
			`},`
			`],`
			`"measures": [`
			`{`
			`"name": "total_revenue",`
			`"expr": "sum(amount)",`
			`"segments": ["paid_non_refunded"],`
			`},`
			`],`
			`}`
			`base.update(overrides)`
			`return base`


			`def _customers_source(**overrides):`
			`base = {`
			`"name": "customers",`
			`"table": "public.customers",`
			`"grain": ["id"],`
			`"columns": [`
			`{"name": "id", "type": "number"},`
			`{"name": "is_vip", "type": "boolean"},`
			`],`
			`"measures": [`
			`{"name": "customer_count", "expr": "count(distinct id)"},`
			`],`
			`}`
			`base.update(overrides)`
			`return base`


			`# ── Composition + golden SQL shape ───────────────────────────────────`


			`class TestSegmentComposition:`
			`def test_measure_segment_lands_in_case_when_wrap(self):`
			`engine = make_engine({"orders": _orders_source()})`
			`result = engine.query({"measures": ["orders.total_revenue"]})`
			`assert_valid_sql(result.sql)`
			`sql_upper = result.sql.upper()`
			`# Filter must be inside CASE WHEN (the measure-filter pathway)`
			`assert "CASE WHEN" in sql_upper`
			`assert "is_paid" in result.sql.lower()`
			`assert "is_refunded" in result.sql.lower()`
			`# Should NOT show up as a global WHERE`
			`# (a WHERE clause may exist for other reasons — assert no segment expr in it)`
			`# Easiest: assert WHERE doesn't contain the segment's exact predicate.`
			`# Split before/after first WHERE keyword if any.`
			`assert "WHERE IS_PAID" not in sql_upper.replace(" = ", " = ")`

			`def test_measure_filter_and_segment_both_applied(self):`
			`src = _orders_source()`
			`src["measures"][0]["filter"] = "amount > 0"`
			`engine = make_engine({"orders": src})`
			`result = engine.query({"measures": ["orders.total_revenue"]})`
			`assert_valid_sql(result.sql)`
			`sql_lower = result.sql.lower()`
			`# Both predicates appear inside the measure's CASE WHEN wrap`
			`assert "amount > 0" in sql_lower`
			`assert "is_paid" in sql_lower`
			`assert "is_refunded" in sql_lower`
			`# AND composition: ensure both halves are joined`
			`assert " and " in sql_lower`

			`def test_query_time_segment_applies_to_measure(self):`
			`# Measure has no measure-bound segment; segment is applied at query time.`
			`src = _orders_source()`
			`src["measures"] = [{"name": "raw_revenue", "expr": "sum(amount)"}]`
			`engine = make_engine({"orders": src})`
			`result = engine.query(`
			`{`
			`"measures": ["orders.raw_revenue"],`
			`"segments": ["orders.paid_non_refunded"],`
			`}`
			`)`
			`assert_valid_sql(result.sql)`
			`sql_lower = result.sql.lower()`
			`assert "case when" in sql_lower`
			`assert "is_paid" in sql_lower`
			`assert "is_refunded" in sql_lower`

			`def test_measure_and_query_segments_compose(self):`
			`# Measure has paid_non_refunded; query adds 'high_value'.`
			`src = _orders_source()`
			`src["segments"].append(`
			`{"name": "high_value", "expr": "amount >= 100"},`
			`)`
			`engine = make_engine({"orders": src})`
			`result = engine.query(`
			`{`
			`"measures": ["orders.total_revenue"],`
			`"segments": ["orders.high_value"],`
			`}`
			`)`
			`assert_valid_sql(result.sql)`
			`sql_lower = result.sql.lower()`
			`# All three predicates present`
			`assert "is_paid" in sql_lower`
			`assert "is_refunded" in sql_lower`
			`assert "amount >= 100" in sql_lower`


			`# ── Multi-source query: scope is per-measure, not global ─────────────`


			`class TestSegmentMultiSourceScope:`
			`def test_segment_does_not_apply_to_other_source_measures(self):`
			`# Query touches both orders and customers; segment is on orders only.`
			`# Assert that the segment predicate does NOT show up in the`
			`# customers CTE / WHERE on customers.`
			`engine = make_engine(`
			`{`
			`"orders": _orders_source(`
			`joins=[`
			`{`
			`"to": "customers",`
			`"on": "customer_id = customers.id",`
			`"relationship": "many_to_one",`
			`}`
			`],`
			`measures=[`
			`{"name": "raw_revenue", "expr": "sum(amount)"},`
			`],`
			`),`
			`"customers": _customers_source(),`
			`}`
			`)`
			`result = engine.query(`
			`{`
			`"measures": [`
			`"orders.raw_revenue",`
			`"customers.customer_count",`
			`],`
			`"segments": ["orders.paid_non_refunded"],`
			`}`
			`)`
			`assert_valid_sql(result.sql)`
			`sql_lower = result.sql.lower()`
			`# Segment predicate appears (it landed on orders)`
			`assert "is_paid" in sql_lower`
			`# The customers measure's pre-aggregation CTE / clause must not be filtered by the segment.`
			`# Heuristic: find each line that references count(distinct ... id) and assert no`
			`# "is_paid" or "is_refunded" in the same CASE WHEN block. The simpler assertion`
			`# is that there's no global WHERE applying the segment.`
			`# We assert the segment doesn't appear inside an aggregate against the customers source.`
			`# Concretely: count(...customers...) should not contain is_paid/is_refunded.`
			`# Walk the SQL and find COUNT(DISTINCT ... ID) — that aggregate must be unfiltered.`
			`import re`

			`count_aggs = re.findall(`
			`r"COUNT\s\(\sDISTINCT[^()]*\)", result.sql, flags=re.IGNORECASE`
			`)`
			`assert count_aggs, "expected at least one COUNT(DISTINCT ...) aggregate"`
			`for agg in count_aggs:`
			`assert "is_paid" not in agg.lower(), (`
			`f"customer_count aggregate must not be filtered by segment: {agg}"`
			`)`


			`# ── Error cases ──────────────────────────────────────────────────────`


			`class TestSegmentErrors:`
			`def test_unknown_bare_name_in_measure_segments(self):`
			`src = _orders_source()`
			`src["measures"][0]["segments"] = ["does_not_exist"]`
			`engine = make_engine({"orders": src})`
			`with pytest.raises(ValueError, match="unknown segment 'does_not_exist'"):`
			`engine.query({"measures": ["orders.total_revenue"]})`

			`def test_unknown_query_time_segment_name(self):`
			`engine = make_engine({"orders": _orders_source()})`
			`with pytest.raises(ValueError, match="unknown segment 'does_not_exist'"):`
			`engine.query(`
			`{`
			`"measures": ["orders.total_revenue"],`
			`"segments": ["orders.does_not_exist"],`
			`}`
			`)`

			`def test_unknown_query_time_segment_source(self):`
			`engine = make_engine({"orders": _orders_source()})`
			`with pytest.raises(ValueError, match="unknown source 'no_such_source'"):`
			`engine.query(`
			`{`
			`"measures": ["orders.total_revenue"],`
			`"segments": ["no_such_source.foo"],`
			`}`
			`)`

			`def test_query_time_segment_must_be_dotted(self):`
			`engine = make_engine({"orders": _orders_source()})`
			`with pytest.raises(ValueError, match="dotted"):`
			`engine.query(`
			`{`
			`"measures": ["orders.total_revenue"],`
			`"segments": ["paid_non_refunded"], # missing source prefix`
			`}`
			`)`

			`def test_no_op_query_time_segment_errors(self):`
			`# Segment on customers, but no customers measure in the query.`
			`engine = make_engine(`
			`{`
			`"orders": _orders_source(`
			`joins=[`
			`{`
			`"to": "customers",`
			`"on": "customer_id = customers.id",`
			`"relationship": "many_to_one",`
			`}`
			`],`
			`measures=[{"name": "raw_revenue", "expr": "sum(amount)"}],`
			`),`
			`"customers": _customers_source(`
			`segments=[{"name": "vips", "expr": "is_vip = true"}]`
			`),`
			`}`
			`)`
			`with pytest.raises(ValueError, match="no matching"):`
			`engine.query(`
			`{`
			`"measures": ["orders.raw_revenue"],`
			`"segments": ["customers.vips"],`
			`}`
			`)`


			`def test_bigquery_native_segment_referenced_by_measure(make_engine_factory):`
			`"""Segment authored in BigQuery dialect, referenced by a measure,`
			`must not degrade the segment's native syntax when composed."""`
			`source = {`
			`"name": "fct_orders",`
			`"table": "fct_orders",`
			`"grain": ["id"],`
			`"columns": [`
			`{"name": "id", "type": "number"},`
			`{"name": "status", "type": "string"},`
			`{"name": "ts", "type": "time"},`
			`],`
			`"segments": [`
			`{"name": "non_cancelled", "expr": "status != 'cancelled'"},`
			`{`
			`"name": "last_30",`
			`"expr": "ts >= timestamp(date_sub(current_date(), interval 30 day))",`
			`},`
			`],`
			`"measures": [`
			`{`
			`"name": "dau",`
			`"expr": "count(distinct id)",`
			`"segments": ["non_cancelled", "last_30"],`
			`}`
			`],`
			`}`
			`engine = make_engine_factory({"fct_orders": source}, dialect="bigquery")`
			`result = engine.query(`
			`{"measures": ["fct_orders.dau"], "dimensions": [], "filters": []}`
			`)`
			`sql = result.sql`
			`assert "INTERVAL '30'" not in sql or "DAY" in sql.upper(), (`
			`f"INTERVAL unit lost in segment reference:\n{sql}"`
			`)`