ktx/python/ktx-daemon/tests/test_sql_analysis.py
Andrey Avtomonov e70ae1e63b
feat(query-history): scope mining to modeled schemas by default (#258)
* feat(query-history): structure SQL analysis table refs

* feat(query-history): qualify SQL analysis table refs

* feat(query-history): wire modeled scope floor through ingest

* chore(query-history): verify scope floor

* test(query-history): align daemon SQL batch endpoint contract

* feat(query-history): build scope from same-run scan catalog

* feat(query-history): fail open on scope-floor catalog failures

* chore(query-history): verify scope-floor v1 closure

* refactor(query-history): share scope membership

* feat(setup): apply derived query history filters

* docs: document derived query history filters

* fix(query-history): redact filter picker LLM prompt SQL

* fix(setup): run filter picker SQL analysis through managed daemon

* chore(query-history): verify filter picker v1 closure

* fix(query-history): fail open on partial service-account attribution

* fix(query-history): aggregate BigQuery users by execution count

* fix(query-history): aggregate Snowflake users by execution count

* fix(query-history): use BigQuery query info hash
2026-06-03 17:19:42 +02:00

242 lines
7.8 KiB
Python

from __future__ import annotations
from ktx_daemon.sql_analysis import (
AnalyzeSqlBatchItem,
AnalyzeSqlBatchRequest,
ValidateReadOnlySqlRequest,
_columns_from_nodes,
analyze_sql_batch_response,
validate_read_only_sql_response,
)
def test_analyze_sql_batch_extracts_tables_and_clause_columns() -> None:
    """Check table refs and per-clause columns for a joined, grouped query.

    A single postgres item joining ``public.orders`` to ``public.customers``
    must come back without an error, list both schema-qualified tables, and
    report the columns referenced by the select, where, join and group-by
    clauses under the ``select``/``where``/``join``/``groupBy`` keys.
    """
    query = (
        "select o.status, count(*) "
        "from public.orders o "
        "join public.customers c on o.customer_id = c.id "
        "where o.created_at >= current_date - interval '30 day' "
        "group by o.status"
    )
    batch_item = AnalyzeSqlBatchItem(id="orders_by_customer", sql=query)
    request = AnalyzeSqlBatchRequest(
        dialect="postgres",
        items=[batch_item],
        max_workers=1,
    )

    result = analyze_sql_batch_response(request).results["orders_by_customer"]

    assert result.error is None

    touched = [ref.model_dump() for ref in result.tables_touched]
    assert touched == [
        {"catalog": None, "db": "public", "name": "orders"},
        {"catalog": None, "db": "public", "name": "customers"},
    ]

    expected_columns = {
        "select": ["status"],
        "where": ["created_at"],
        "join": ["customer_id", "id"],
        "groupBy": ["status"],
    }
    assert result.columns_by_clause == expected_columns
def test_analyze_sql_batch_returns_per_item_parse_errors() -> None:
    """Check that unparseable SQL yields an error on its own result entry.

    The malformed item must produce no tables, no clause columns, and a
    non-``None`` error instead of failing the whole batch call.
    """
    broken_item = AnalyzeSqlBatchItem(id="broken", sql="select * from where")
    request = AnalyzeSqlBatchRequest(
        dialect="postgres",
        items=[broken_item],
        max_workers=1,
    )

    result = analyze_sql_batch_response(request).results["broken"]

    assert result.tables_touched == []
    assert result.columns_by_clause == {}
    assert result.error is not None
def test_analyze_sql_batch_qualifies_bare_table_from_catalog() -> None:
    """Check that a bare table name picks up its schema from the catalog.

    The catalog contains exactly one table named ``accounts`` (in
    ``orbit_raw``), so ``select id from accounts`` must be reported with
    that ``db`` filled in.
    """
    accounts_table = {
        "catalog": None,
        "db": "orbit_raw",
        "name": "accounts",
        "columns": ["id"],
    }
    orders_table = {
        "catalog": None,
        "db": "orbit_analytics",
        "name": "orders",
        "columns": ["id"],
    }
    request = AnalyzeSqlBatchRequest(
        dialect="postgres",
        catalog={"tables": [accounts_table, orders_table]},
        items=[AnalyzeSqlBatchItem(id="bare", sql="select id from accounts")],
        max_workers=1,
    )

    response = analyze_sql_batch_response(request)

    touched = [ref.model_dump() for ref in response.results["bare"].tables_touched]
    assert touched == [
        {"catalog": None, "db": "orbit_raw", "name": "accounts"}
    ]
def test_analyze_sql_batch_returns_all_ambiguous_modeled_matches() -> None:
    """Check that an ambiguous bare name returns every catalog match.

    Two catalog tables are both named ``events`` (``orbit_raw`` and
    ``orbit_analytics``); a bare ``events`` reference must be reported as
    both qualified refs, in catalog order.
    """
    raw_events = {
        "catalog": None,
        "db": "orbit_raw",
        "name": "events",
        "columns": ["id"],
    }
    analytics_events = {
        "catalog": None,
        "db": "orbit_analytics",
        "name": "events",
        "columns": ["id"],
    }
    request = AnalyzeSqlBatchRequest(
        dialect="postgres",
        catalog={"tables": [raw_events, analytics_events]},
        items=[AnalyzeSqlBatchItem(id="ambiguous", sql="select id from events")],
        max_workers=1,
    )

    response = analyze_sql_batch_response(request)

    touched = [
        ref.model_dump() for ref in response.results["ambiguous"].tables_touched
    ]
    assert touched == [
        {"catalog": None, "db": "orbit_raw", "name": "events"},
        {"catalog": None, "db": "orbit_analytics", "name": "events"},
    ]
def test_analyze_sql_batch_leaves_unresolved_bare_refs_unqualified() -> None:
    """Check that a bare name absent from the catalog stays unqualified.

    The catalog lists only ``orbit_raw.accounts``; querying ``invoices``
    must yield a ref whose ``catalog`` and ``db`` are both ``None``.
    """
    known_tables = [{"catalog": None, "db": "orbit_raw", "name": "accounts"}]
    request = AnalyzeSqlBatchRequest(
        dialect="postgres",
        catalog={"tables": known_tables},
        items=[AnalyzeSqlBatchItem(id="missing", sql="select * from invoices")],
        max_workers=1,
    )

    response = analyze_sql_batch_response(request)

    touched = [
        ref.model_dump() for ref in response.results["missing"].tables_touched
    ]
    assert touched == [{"catalog": None, "db": None, "name": "invoices"}]
def test_analyze_sql_batch_returns_bigquery_project_dataset_table_refs() -> None:
    """Check three-part BigQuery names map to catalog/db/name.

    A backtick-quoted ``project.dataset.table`` reference must be reported
    with the project as ``catalog``, the dataset as ``db`` and the table as
    ``name``.
    """
    orders_table = {
        "catalog": "demo-project",
        "db": "orbit_analytics",
        "name": "orders",
    }
    bq_item = AnalyzeSqlBatchItem(
        id="bq",
        sql="select * from `demo-project.orbit_analytics.orders`",
    )
    request = AnalyzeSqlBatchRequest(
        dialect="bigquery",
        catalog={"tables": [orders_table]},
        items=[bq_item],
        max_workers=1,
    )

    response = analyze_sql_batch_response(request)

    touched = [ref.model_dump() for ref in response.results["bq"].tables_touched]
    assert touched == [
        {"catalog": "demo-project", "db": "orbit_analytics", "name": "orders"}
    ]
def test_columns_from_nodes_ignores_non_expression_clause_values() -> None:
    """Check that booleans and ``None`` contribute no columns."""
    non_expression_values = [True, False, None]

    extracted = _columns_from_nodes(non_expression_values)

    assert extracted == []
def test_validate_read_only_sql_accepts_select_and_with_queries() -> None:
    """Check that a plain SELECT and a read-only CTE query both validate.

    Each response must have ``ok`` set to ``True`` and no error message.
    """
    select_request = ValidateReadOnlySqlRequest(
        dialect="postgres",
        sql="select id, status from public.orders where status = 'paid'",
    )
    cte_sql = (
        "with paid as (select * from public.orders where status = 'paid') "
        "select count(*) from paid"
    )
    with_request = ValidateReadOnlySqlRequest(dialect="postgres", sql=cte_sql)

    select_response = validate_read_only_sql_response(select_request)
    with_response = validate_read_only_sql_response(with_request)

    assert select_response.ok is True
    assert select_response.error is None
    assert with_response.ok is True
    assert with_response.error is None
def test_validate_read_only_sql_rejects_cte_dml() -> None:
    """Check that an INSERT hidden inside a CTE is rejected.

    The response must be not-ok and name ``Insert`` as the offending
    operation.
    """
    request = ValidateReadOnlySqlRequest(
        dialect="postgres",
        sql="with x as (insert into audit.events values (1) returning *) select * from x",
    )

    outcome = validate_read_only_sql_response(request)

    assert outcome.ok is False
    assert outcome.error == "SQL contains read/write operation: Insert"
def test_validate_read_only_sql_rejects_multi_statement_payloads() -> None:
    """Check that a payload with two ``;``-separated statements is rejected.

    The response must be not-ok with the single-statement error message.
    """
    request = ValidateReadOnlySqlRequest(
        dialect="postgres",
        sql="select * from public.orders; delete from public.orders",
    )

    outcome = validate_read_only_sql_response(request)

    assert outcome.ok is False
    assert outcome.error == "Only one SQL statement can be executed."
def test_validate_read_only_sql_rejects_commands_and_pragmas() -> None:
    """Check that ``call`` and ``pragma`` statements are rejected.

    The postgres ``call`` must be reported as a ``Command`` and the sqlite
    ``pragma`` as a ``Pragma``; both responses must be not-ok.
    """
    call_request = ValidateReadOnlySqlRequest(
        dialect="postgres",
        sql="call refresh_stats()",
    )
    pragma_request = ValidateReadOnlySqlRequest(
        dialect="sqlite",
        sql="pragma table_info(users)",
    )

    command_response = validate_read_only_sql_response(call_request)
    pragma_response = validate_read_only_sql_response(pragma_request)

    assert command_response.ok is False
    assert command_response.error == "SQL contains read/write operation: Command"
    assert pragma_response.ok is False
    assert pragma_response.error == "SQL contains read/write operation: Pragma"
def test_validate_read_only_sql_reports_parse_errors() -> None:
    """Check that unparseable SQL is rejected with a parse error message.

    The response must be not-ok and its error text must contain
    ``Invalid expression``.
    """
    request = ValidateReadOnlySqlRequest(
        dialect="postgres",
        sql="select * from where",
    )

    outcome = validate_read_only_sql_response(request)

    assert outcome.ok is False
    assert outcome.error is not None
    assert "Invalid expression" in outcome.error