Tighten ingest reconciliation guidance

This commit is contained in:
Luca Martial 2026-05-11 17:20:02 -07:00
parent 9f91c26752
commit 6d00cbbc2e
16 changed files with 382 additions and 20 deletions

View file

@ -62,6 +62,7 @@ class SemanticEngine:
report = ValidationReport()
self._check_orphan_join_targets(report)
self._check_invalid_grain(report)
self._check_join_columns(report)
self._check_sql_join_coverage(report, recently_touched=recently_touched)
self._check_disconnected_components(report, recently_touched=recently_touched)
return report
@ -91,6 +92,99 @@ class SemanticEngine:
f"that is not in its columns list"
)
def _check_join_columns(self, report: ValidationReport) -> None:
for source in self.sources.values():
source_columns = {c.name for c in source.columns}
for join in source.joins:
target = self.sources.get(join.to)
if target is None:
continue
target_columns = {c.name for c in target.columns}
try:
local_raw, target_raw = self.graph._parse_on(join.on, join.to)
except ValueError as exc:
report.errors.append(
f"Source '{source.name}' has invalid join to '{join.to}': {exc}"
)
continue
local_cols = [col.strip() for col in local_raw.split(",") if col.strip()]
target_cols = [
col.strip() for col in target_raw.split(",") if col.strip()
]
for local_col in local_cols:
if local_col not in source_columns:
report.errors.append(
f"Source '{source.name}' joins to '{join.to}' on "
f"local column '{local_col}', but '{local_col}' is not "
f"in '{source.name}' columns list"
)
for target_col in target_cols:
if target_col not in target_columns:
report.errors.append(
f"Source '{source.name}' joins to '{join.to}' on "
f"target column '{target_col}', but '{target_col}' is not "
f"in '{join.to}' columns list"
)
if join.relationship not in {"many_to_one", "one_to_one"}:
continue
for local_col, target_col in zip(local_cols, target_cols, strict=False):
if (
local_col in source_columns
and target_col in target_columns
and target_col in target.grain
and self._looks_like_display_value_to_identifier(
local_col, target_col
)
):
report.errors.append(
f"Source '{source.name}' joins '{local_col}' to "
f"'{join.to}.{target_col}', but '{local_col}' looks like "
"a display value and the target column is an identifier "
"grain. Project the matching key column or omit this join."
)
@staticmethod
def _looks_like_display_value_to_identifier(
local_col: str, target_col: str
) -> bool:
if target_col != "id" and not target_col.endswith("_id"):
return False
display_names = {"name", "email", "label", "title", "description"}
display_suffixes = (
"_name",
"_email",
"_label",
"_title",
"_description",
)
return local_col in display_names or local_col.endswith(display_suffixes)
@staticmethod
def _source_exposes_join_key(
source: SourceDefinition, target: SourceDefinition
) -> bool:
source_columns = {c.name.lower() for c in source.columns}
target_name = target.name.lower()
target_name_singular = (
target_name[:-1] if target_name.endswith("s") else target_name
)
for grain_col in target.grain:
grain = grain_col.lower()
if grain in source_columns:
return True
if any(col.endswith(f"_{grain}") for col in source_columns):
return True
if grain == "id":
candidates = {
f"{target_name}_id",
f"{target_name_singular}_id",
}
if source_columns.intersection(candidates):
return True
return False
def _check_sql_join_coverage(
self,
report: ValidationReport,
@ -135,6 +229,8 @@ class SemanticEngine:
continue
if hit_name.lower() in declared:
continue
if not self._source_exposes_join_key(source, self.sources[hit_name]):
continue
if hit_name not in missing:
missing.append(hit_name)
@ -148,11 +244,12 @@ class SemanticEngine:
)
msg = (
f"Source '{source.name}' SQL joins manifest table(s) [{ref_list}] "
f"that are not declared in joins[]. Add a join entry for each, "
f"that have projected key columns but are not declared in joins[]. "
f"Add a join entry for each, "
f"e.g. {{to: {example}, on: '{source.name}.<your_fk> = "
f"{example}.{grain_col}', relationship: many_to_one}}. If a "
f"reference is intentionally absent, document it with a "
f"`unmapped-table-*` wiki note and remove the SQL reference."
f"{example}.{grain_col}', relationship: many_to_one}}. If the "
"SQL intentionally keeps a referenced table internal, omit "
"that table's key column from the SQL source output."
)
report.errors.append(msg)

View file

@ -120,6 +120,135 @@ class TestInvalidGrain:
assert any("bad" in e and "nonexistent_col" in e for e in report.errors)
class TestJoinValidation:
    """Validation of declared join columns and of SQL join coverage."""

    @staticmethod
    def _requesters_projecting(account_column):
        """Build the SQL-backed requesters source projecting *account_column*.

        The SQL joins ``mart_account_segments`` and ``users`` but declares no
        joins; only the first projected column varies between tests.
        """
        # SQL text is kept flush-left so the literal matches what the
        # tests have always passed in.
        return SourceDefinition(
            name="large_contract_requesters",
            sql=f"""
select accounts.{account_column}, users.email as requester_email
from orbit_raw.requests requests
join public.mart_account_segments accounts
on requests.account_id = accounts.account_id
join orbit_raw.users users
on requests.user_id = users.user_id
""",
            grain=["requester_email"],
            columns=[
                SourceColumn(name=account_column, type="string"),
                SourceColumn(name="requester_email", type="string"),
            ],
            joins=[],
        )

    @staticmethod
    def _engine_with_account_segments(requesters):
        """Return an engine holding *requesters* plus the account-segments source."""
        account_segments = _src(
            "mart_account_segments",
            columns=["account_id", "account_name"],
            grain=["account_id"],
        )
        return SemanticEngine.from_sources(
            {
                "large_contract_requesters": requesters,
                "mart_account_segments": account_segments,
            }
        )

    def test_join_local_column_must_exist(self):
        """A join on a local column missing from ``columns`` is an error."""
        join_to_customers = JoinDeclaration(
            to="customers",
            on="customer_id = customers.id",
            relationship="many_to_one",
        )
        sources = {
            "orders": _src("orders", columns=["id"], joins=[join_to_customers]),
            "customers": _src("customers"),
        }

        report = SemanticEngine.from_sources(sources).validate()

        assert not report.valid
        fragments = ("orders", "customer_id", "columns list")
        assert any(
            all(fragment in error for fragment in fragments)
            for error in report.errors
        )

    def test_many_to_one_join_rejects_display_name_to_id_grain(self):
        """Joining a ``*_name`` column to an ``*_id`` grain column is an error."""
        name_to_id_join = JoinDeclaration(
            to="mart_account_segments",
            on="account_name = mart_account_segments.account_id",
            relationship="many_to_one",
        )
        requesters = _src(
            "large_contract_requesters",
            columns=["account_name", "requester_email"],
            grain=["requester_email"],
            joins=[name_to_id_join],
        )

        report = self._engine_with_account_segments(requesters).validate()

        assert not report.valid
        fragments = (
            "large_contract_requesters",
            "account_name",
            "mart_account_segments.account_id",
        )
        assert any(
            all(fragment in error for fragment in fragments)
            for error in report.errors
        )

    def test_sql_join_coverage_does_not_require_join_without_projected_key(self):
        """No join is demanded when the SQL does not project the target's key."""
        requesters = self._requesters_projecting("account_name")
        engine = self._engine_with_account_segments(requesters)

        report = engine.validate(recently_touched={"large_contract_requesters"})

        assert report.errors == []

    def test_sql_join_coverage_requires_join_when_projected_key_exists(self):
        """A projected key column without a declared join is an error."""
        requesters = self._requesters_projecting("account_id")
        engine = self._engine_with_account_segments(requesters)

        report = engine.validate(recently_touched={"large_contract_requesters"})

        assert not report.valid
        fragments = ("mart_account_segments", "joins[]")
        assert any(
            all(fragment in error for fragment in fragments)
            for error in report.errors
        )
class TestDisconnectedComponents:
def test_two_components_produce_warning_not_error(self):
a = _src("a")