Improve schema setup and Notion ingest UX (#14)

* Improve schema setup and Notion ingest UX

* Handle Postgres network scan failures

* WIP: save local changes before main merge

* Refine setup prompt choices

* Tighten ingest reconciliation guidance

* Commit setup config updates

* Canonicalize unmapped fallback details

* Count reconciliation actions in reports

* Harden semantic layer source validation

* Return wiki content after edits

* Validate SL sources against manifests

* Validate wiki refs before writes

* Simplify CLI next steps

* Clarify agent setup summary

* Surface dbt target SL sources

* Recover SL write fallbacks

* Preserve failed context build metadata

* Track raw paths for ingest actions

* test(cli): update seeded demo expectations

* fix(ingest): scope fallback recovery checks

* fix(sl): tighten source validation guards

* fix(wiki): ignore empty embedding vectors

* Improve Notion ingest UX

* Enforce flat wiki keys

* test(context): update wiki key assertion

---------

Co-authored-by: Andrey Avtomonov <andreybavt@gmail.com>
This commit is contained in:
Luca Martial 2026-05-12 16:56:58 -04:00 committed by GitHub
parent 866d33e71a
commit 60457e9407
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
116 changed files with 4177 additions and 610 deletions

View file

@@ -119,6 +119,275 @@ class TestInvalidGrain:
assert not report.valid
assert any("bad" in e and "nonexistent_col" in e for e in report.errors)
def test_qualified_grain_name_is_rejected(self):
    """A grain entry written as ``source.column`` must fail validation."""
    source = _src(
        "activity",
        columns=["account_id"],
        grain=["activity.account_id"],
    )
    report = SemanticEngine.from_sources({"activity": source}).validate()

    assert not report.valid
    # One error must name the source, echo the offending grain entry and
    # contain the word "qualified".
    expected_fragments = ("activity", "activity.account_id", "qualified")
    assert any(
        all(fragment in message for fragment in expected_fragments)
        for message in report.errors
    )
def test_qualified_column_name_is_rejected(self):
    """A column declared with a dotted name must fail validation."""
    declared_columns = [
        SourceColumn(name="account_id", type="number"),
        SourceColumn(name="activity.user_id", type="number"),
    ]
    source = SourceDefinition(
        name="activity",
        table="public.activity",
        grain=["account_id"],
        columns=declared_columns,
    )
    report = SemanticEngine.from_sources({"activity": source}).validate()

    assert not report.valid
    # One error must name the source, echo the dotted column name and
    # contain the word "unqualified".
    expected_fragments = ("activity", "activity.user_id", "unqualified")
    assert any(
        all(fragment in message for fragment in expected_fragments)
        for message in report.errors
    )
def test_sql_source_grain_missing_from_projection(self):
    """Grain columns absent from a SQL source's SELECT list are reported."""
    # The query projects only account_name and requester_email; neither
    # grain column (account_id, user_id) appears in the SELECT list.
    query = (
        "select account.account_name, requester.email as requester_email "
        "from orbit_raw.actions activity "
        "join orbit_raw.accounts account "
        "  on account.account_id = activity.account_id "
        "join orbit_raw.users requester "
        "  on requester.user_id = activity.user_id"
    )
    declared_columns = [
        SourceColumn(name="account_id", type="number"),
        SourceColumn(name="user_id", type="number"),
        SourceColumn(name="account_name", type="string"),
        SourceColumn(name="requester_email", type="string"),
    ]
    source = SourceDefinition(
        name="large_contract_requesters",
        sql=query,
        grain=["account_id", "user_id"],
        columns=declared_columns,
    )
    report = SemanticEngine.from_sources(
        {"large_contract_requesters": source}
    ).validate()

    assert not report.valid
    expected_fragments = (
        "large_contract_requesters",
        "account_id",
        "SELECT projection",
    )
    assert any(
        all(fragment in message for fragment in expected_fragments)
        for message in report.errors
    )
def test_sql_source_grain_in_projection_passes(self):
    """A SQL source that projects every grain column raises no grain error."""
    # Both grain columns (account_id, user_id) are in the SELECT list.
    query = (
        "select activity.account_id, activity.user_id, "
        "account.account_name, requester.email as requester_email "
        "from orbit_raw.actions activity "
        "join orbit_raw.accounts account "
        "  on account.account_id = activity.account_id "
        "join orbit_raw.users requester "
        "  on requester.user_id = activity.user_id"
    )
    declared_columns = [
        SourceColumn(name="account_id", type="number"),
        SourceColumn(name="user_id", type="number"),
        SourceColumn(name="account_name", type="string"),
        SourceColumn(name="requester_email", type="string"),
    ]
    source = SourceDefinition(
        name="contract_requesters",
        sql=query,
        grain=["account_id", "user_id"],
        columns=declared_columns,
    )
    report = SemanticEngine.from_sources(
        {"contract_requesters": source}
    ).validate()

    # Only the grain check is under test here: errors from other
    # validators are tolerated as long as none of them mentions the grain
    # or the SELECT projection.
    assert all(
        "grain" not in message and "SELECT projection" not in message
        for message in report.errors
    )
def test_sql_source_with_select_star_skips_projection_check(self):
    """``SELECT *`` sources must not trigger a projection error."""
    # With a star projection the selected columns cannot be determined
    # statically, so the projection check is expected to be skipped
    # instead of reporting a false failure.
    source = SourceDefinition(
        name="opaque",
        sql="select * from public.events",
        grain=["event_id"],
        columns=[SourceColumn(name="event_id", type="number")],
    )
    report = SemanticEngine.from_sources({"opaque": source}).validate()

    assert all("SELECT projection" not in message for message in report.errors)
class TestJoinValidation:
    """Validation tests for declared joins and SQL join coverage."""

    def test_join_local_column_must_exist(self):
        """The local side of a join's ``on`` clause must be a declared column."""
        to_customers = JoinDeclaration(
            to="customers",
            on="customer_id = customers.id",
            relationship="many_to_one",
        )
        # "orders" declares only "id", so "customer_id" is not in its columns.
        sources = {
            "orders": _src("orders", columns=["id"], joins=[to_customers]),
            "customers": _src("customers"),
        }
        report = SemanticEngine.from_sources(sources).validate()

        assert not report.valid
        expected_fragments = ("orders", "customer_id", "columns list")
        assert any(
            all(fragment in message for fragment in expected_fragments)
            for message in report.errors
        )

    def test_many_to_one_join_rejects_display_name_to_id_grain(self):
        """A many_to_one join from a name column onto an id grain is rejected."""
        name_to_id_join = JoinDeclaration(
            to="mart_account_segments",
            on="account_name = mart_account_segments.account_id",
            relationship="many_to_one",
        )
        sources = {
            "large_contract_requesters": _src(
                "large_contract_requesters",
                columns=["account_name", "requester_email"],
                grain=["requester_email"],
                joins=[name_to_id_join],
            ),
            "mart_account_segments": _src(
                "mart_account_segments",
                columns=["account_id", "account_name"],
                grain=["account_id"],
            ),
        }
        report = SemanticEngine.from_sources(sources).validate()

        assert not report.valid
        expected_fragments = (
            "large_contract_requesters",
            "account_name",
            "mart_account_segments.account_id",
        )
        assert any(
            all(fragment in message for fragment in expected_fragments)
            for message in report.errors
        )

    def test_sql_join_coverage_does_not_require_join_without_projected_key(self):
        """No join is demanded when the SQL does not project the joined key."""
        # The SQL literal is kept flush-left so indentation does not alter
        # its text. It joins mart_account_segments but projects only
        # account_name, not account_id.
        projected_columns = [
            SourceColumn(name="account_name", type="string"),
            SourceColumn(name="requester_email", type="string"),
        ]
        requesters = SourceDefinition(
            name="large_contract_requesters",
            sql="""
select accounts.account_name, users.email as requester_email
from orbit_raw.requests requests
join public.mart_account_segments accounts
on requests.account_id = accounts.account_id
join orbit_raw.users users
on requests.user_id = users.user_id
""",
            grain=["requester_email"],
            columns=projected_columns,
            joins=[],
        )
        sources = {
            "large_contract_requesters": requesters,
            "mart_account_segments": _src(
                "mart_account_segments",
                columns=["account_id", "account_name"],
                grain=["account_id"],
            ),
        }
        engine = SemanticEngine.from_sources(sources)
        report = engine.validate(recently_touched={"large_contract_requesters"})

        assert report.errors == []

    def test_sql_join_coverage_does_not_treat_unrelated_id_suffix_as_id_key(self):
        """A projected ``user_id`` is not taken for the joined table's ``id``."""
        # The SQL literal is kept flush-left so indentation does not alter
        # its text. The joined table "accounts" has grain "id"; the query
        # projects requests.user_id, which merely ends in "_id".
        projected_columns = [
            SourceColumn(name="account_name", type="string"),
            SourceColumn(name="user_id", type="string"),
        ]
        requesters = SourceDefinition(
            name="large_contract_requesters",
            sql="""
select accounts.account_name, requests.user_id
from orbit_raw.requests requests
join public.accounts accounts
on requests.account_id = accounts.id
""",
            grain=["user_id"],
            columns=projected_columns,
            joins=[],
        )
        sources = {
            "large_contract_requesters": requesters,
            "accounts": _src(
                "accounts", columns=["id", "account_name"], grain=["id"]
            ),
        }
        engine = SemanticEngine.from_sources(sources)
        report = engine.validate(recently_touched={"large_contract_requesters"})

        assert report.errors == []

    def test_sql_join_coverage_requires_join_when_projected_key_exists(self):
        """Projecting the joined key without declaring the join is an error."""
        # The SQL literal is kept flush-left so indentation does not alter
        # its text. The query projects accounts.account_id while the source
        # declares an empty joins list.
        projected_columns = [
            SourceColumn(name="account_id", type="string"),
            SourceColumn(name="requester_email", type="string"),
        ]
        requesters = SourceDefinition(
            name="large_contract_requesters",
            sql="""
select accounts.account_id, users.email as requester_email
from orbit_raw.requests requests
join public.mart_account_segments accounts
on requests.account_id = accounts.account_id
join orbit_raw.users users
on requests.user_id = users.user_id
""",
            grain=["requester_email"],
            columns=projected_columns,
            joins=[],
        )
        sources = {
            "large_contract_requesters": requesters,
            "mart_account_segments": _src(
                "mart_account_segments",
                columns=["account_id", "account_name"],
                grain=["account_id"],
            ),
        }
        engine = SemanticEngine.from_sources(sources)
        report = engine.validate(recently_touched={"large_contract_requesters"})

        assert not report.valid
        expected_fragments = ("mart_account_segments", "joins[]")
        assert any(
            all(fragment in message for fragment in expected_fragments)
            for message in report.errors
        )
class TestDisconnectedComponents:
def test_two_components_produce_warning_not_error(self):