feat(connector): add Amazon Athena connector via Glue Data Catalog (#309)

* feat(connector): add Amazon Athena connector via Glue Data Catalog

* fix(athena): address reviewer feedback

* fix(athena): wire scope discovery, fix normalizeDriver, tighten types and tests

* fix(athena): honor databases scope, wire sql-analysis dialect, harden config resolution

- introspect() limits to the configured `databases` scope instead of scanning
  every Glue database in the account (docs promised this; connector ignored it)
- add athena -> athena to sql-analysis SQLGLOT_DIALECTS so `ktx sql` and MCP
  read-only validation parse Athena SQL under the Trino grammar, not postgres
- stringConfigValue coerces a resolved-empty `env:` reference to undefined so
  optional fields fall back to their defaults (workgroup 'primary', catalog
  'AwsDataCatalog') instead of ''
- drop trailing whitespace in dialect.test.ts

* fix(athena): integrate with main's SQL/non-SQL dialect split and add dialect notes

Rebase onto main, which introduced the KtxDialect (core) vs KtxSqlDialect
(SQL-only) split for MongoDB:
- KtxAthenaDialect implements KtxSqlDialect; the connector resolves it via
  getSqlDialectForDriver so SQL-generation methods stay in scope
- add authored athena.md SQL notes for the sql_dialect_notes MCP tool, required
  now that athena resolves to the athena sqlglot dialect (dialect-notes coverage
  is derived from the warehouse-driver registry)

---------

Co-authored-by: Andrey Avtomonov <andreybavt@gmail.com>
This commit is contained in:
Patel Dhrit 2026-07-02 06:00:26 -07:00 committed by GitHub
parent 6d01030745
commit fe7e6bd1fa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 2047 additions and 6 deletions

View file

@ -0,0 +1,630 @@
import { describe, expect, it, vi } from 'vitest';
import {
athenaConnectionConfigFromConfig,
isKtxAthenaConnectionConfig,
KtxAthenaScanConnector,
type KtxAthenaClientFactory,
type KtxAthenaClient,
type KtxGlueClient,
} from '../../../src/connectors/athena/connector.js';
import { createAthenaLiveDatabaseIntrospection } from '../../../src/connectors/athena/live-database-introspection.js';
import { tableRefSet } from '../../../src/context/scan/table-ref.js';
function fakeClientFactory(options: { queryState?: string; queryError?: string } = {}): KtxAthenaClientFactory {
const state = options.queryState ?? 'SUCCEEDED';
const queries = new Map<string, string>();
let execCounter = 0;
const fakeAthenaClient: KtxAthenaClient = {
startQueryExecution: vi.fn(async (input) => {
const id = `exec-${++execCounter}`;
queries.set(id, input.QueryString);
return { QueryExecutionId: id };
}),
getQueryExecution: vi.fn(async () => ({
QueryExecution: {
Status: {
State: state,
StateChangeReason: options.queryError,
},
},
})),
getQueryResults: vi.fn(async (input) => {
const sql = queries.get(input.QueryExecutionId) ?? '';
// Column sample query: single-column result for the queried column only.
if (sql.includes('IS NOT NULL')) {
return {
ResultSet: {
ResultSetMetadata: { ColumnInfo: [{ Name: 'status', Type: 'string' }] },
Rows: [
{ Data: [{ VarCharValue: 'status' }] }, // header row
{ Data: [{ VarCharValue: 'paid' }] },
],
},
NextToken: undefined,
};
}
return {
ResultSet: {
ResultSetMetadata: {
ColumnInfo: [
{ Name: 'id', Type: 'bigint' },
{ Name: 'status', Type: 'string' },
],
},
Rows: [
// Header row (Athena always includes it on first page)
{ Data: [{ VarCharValue: 'id' }, { VarCharValue: 'status' }] },
// Data row
{ Data: [{ VarCharValue: '1' }, { VarCharValue: 'paid' }] },
],
},
NextToken: undefined,
};
}),
};
const fakeGlueClient: KtxGlueClient = {
getDatabases: vi.fn(async () => ({
DatabaseList: [{ Name: 'analytics' }],
NextToken: undefined,
})),
getTables: vi.fn(async () => ({
TableList: [
{
Name: 'orders',
TableType: 'EXTERNAL_TABLE',
Description: 'Orders table',
StorageDescriptor: {
Columns: [
{ Name: 'id', Type: 'bigint', Comment: 'Order id' },
{ Name: 'status', Type: 'string' },
],
},
PartitionKeys: [{ Name: 'dt', Type: 'date', Comment: 'Partition date' }],
},
],
NextToken: undefined,
})),
};
return {
createAthenaClient: vi.fn(() => fakeAthenaClient),
createGlueClient: vi.fn(() => fakeGlueClient),
};
}
const connection = {
driver: 'athena',
region: 'us-east-1',
s3_staging_dir: 's3://my-bucket/athena-results/',
workgroup: 'analytics',
catalog: 'AwsDataCatalog',
database: 'analytics',
} as const;
describe('KtxAthenaScanConnector', () => {
it('identifies athena connection configs correctly', () => {
expect(isKtxAthenaConnectionConfig(connection)).toBe(true);
expect(isKtxAthenaConnectionConfig({ driver: 'bigquery' })).toBe(false);
expect(isKtxAthenaConnectionConfig(null)).toBe(false);
expect(isKtxAthenaConnectionConfig(undefined)).toBe(false);
});
it('resolves configuration and throws on missing required fields', () => {
expect(athenaConnectionConfigFromConfig({ connectionId: 'dw', connection })).toMatchObject({
region: 'us-east-1',
s3StagingDir: 's3://my-bucket/athena-results/',
workgroup: 'analytics',
catalog: 'AwsDataCatalog',
database: 'analytics',
});
expect(() =>
athenaConnectionConfigFromConfig({ connectionId: 'dw', connection: { driver: 'athena' } }),
).toThrow('connections.dw.region');
expect(() =>
athenaConnectionConfigFromConfig({
connectionId: 'dw',
connection: { driver: 'athena', region: 'us-east-1' },
}),
).toThrow('connections.dw.s3_staging_dir');
});
it('applies defaults for optional config fields', () => {
const resolved = athenaConnectionConfigFromConfig({
connectionId: 'dw',
connection: { driver: 'athena', region: 'us-east-1', s3_staging_dir: 's3://bucket/' },
});
expect(resolved.workgroup).toBe('primary');
expect(resolved.catalog).toBe('AwsDataCatalog');
expect(resolved.database).toBeUndefined();
});
it('introspects databases, tables, and columns from Glue', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
now: () => new Date('2026-06-21T10:00:00.000Z'),
});
const snapshot = await connector.introspect(
{ connectionId: 'dw', driver: 'athena' },
{ runId: 'scan-1' },
);
expect(snapshot).toMatchObject({
connectionId: 'dw',
driver: 'athena',
extractedAt: '2026-06-21T10:00:00.000Z',
scope: { catalogs: ['AwsDataCatalog'], datasets: ['analytics'] },
metadata: {
catalog: 'AwsDataCatalog',
databases: ['analytics'],
table_count: 1,
total_columns: 3,
},
});
expect(snapshot.tables[0]).toMatchObject({
catalog: 'AwsDataCatalog',
db: 'analytics',
name: 'orders',
kind: 'table',
comment: 'Orders table',
estimatedRows: null,
foreignKeys: [],
});
expect(snapshot.tables[0]?.columns).toEqual([
{
name: 'id',
nativeType: 'bigint',
normalizedType: 'BIGINT',
dimensionType: 'number',
nullable: true,
primaryKey: false,
comment: 'Order id',
},
{
name: 'status',
nativeType: 'string',
normalizedType: 'VARCHAR',
dimensionType: 'string',
nullable: true,
primaryKey: false,
comment: null,
},
{
name: 'dt',
nativeType: 'date',
normalizedType: 'DATE',
dimensionType: 'time',
nullable: true,
primaryKey: false,
comment: 'Partition date',
},
]);
});
it('respects tableScope and excludes tables not in scope', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
now: () => new Date('2026-06-21T10:00:00.000Z'),
});
const scopedSnapshot = await connector.introspect(
{
connectionId: 'dw',
driver: 'athena',
tableScope: tableRefSet([{ catalog: 'AwsDataCatalog', db: 'analytics', name: 'nonexistent' }]),
},
{ runId: 'scan-1' },
);
expect(scopedSnapshot.tables).toHaveLength(0);
const matchingSnapshot = await connector.introspect(
{
connectionId: 'dw',
driver: 'athena',
tableScope: tableRefSet([{ catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' }]),
},
{ runId: 'scan-1' },
);
expect(matchingSnapshot.tables).toHaveLength(1);
expect(matchingSnapshot.tables[0]?.name).toBe('orders');
});
it('limits introspection to the configured databases scope', async () => {
const requestedDatabases: string[] = [];
const getDatabases = vi.fn(async () => ({
DatabaseList: [{ Name: 'analytics' }, { Name: 'raw' }, { Name: 'staging' }],
NextToken: undefined,
}));
const glueClient: KtxGlueClient = {
getDatabases,
getTables: vi.fn(async (input) => {
requestedDatabases.push(input.DatabaseName);
return {
TableList: [
{
Name: `${input.DatabaseName}_orders`,
TableType: 'EXTERNAL_TABLE',
StorageDescriptor: { Columns: [{ Name: 'id', Type: 'bigint' }] },
},
],
NextToken: undefined,
};
}),
};
const clientFactory: KtxAthenaClientFactory = {
createAthenaClient: vi.fn(() => fakeClientFactory().createAthenaClient('us-east-1')),
createGlueClient: vi.fn(() => glueClient),
};
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection: { ...connection, databases: ['analytics', 'raw'] },
clientFactory,
now: () => new Date('2026-06-21T10:00:00.000Z'),
});
const snapshot = await connector.introspect({ connectionId: 'dw', driver: 'athena' }, { runId: 'scan-1' });
// Scope is taken from config, so the account-wide database list is never enumerated.
expect(getDatabases).not.toHaveBeenCalled();
expect(requestedDatabases).toEqual(['analytics', 'raw']);
expect(snapshot.scope).toMatchObject({ datasets: ['analytics', 'raw'] });
expect(snapshot.tables.map((t) => t.db)).toEqual(['analytics', 'raw']);
});
it('resolves optional env-referenced config to defaults when the variable is unset', () => {
const resolved = athenaConnectionConfigFromConfig({
connectionId: 'dw',
connection: {
driver: 'athena',
region: 'us-east-1',
s3_staging_dir: 's3://bucket/',
workgroup: 'env:ATHENA_WORKGROUP_UNSET',
catalog: 'env:GLUE_CATALOG_UNSET',
},
env: {},
});
expect(resolved.workgroup).toBe('primary');
expect(resolved.catalog).toBe('AwsDataCatalog');
});
it('samples a table via Athena query execution', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
const result = await connector.sampleTable(
{
connectionId: 'dw',
table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' },
columns: ['id', 'status'],
limit: 10,
},
{ runId: 'scan-1' },
);
expect(result).toMatchObject({
headers: ['id', 'status'],
rows: [['1', 'paid']],
totalRows: 1,
});
});
it('samples a column via Athena query execution', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
const result = await connector.sampleColumn(
{
connectionId: 'dw',
table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' },
column: 'status',
limit: 10,
},
{ runId: 'scan-1' },
);
expect(result).toMatchObject({
values: ['paid'],
nullCount: null,
distinctCount: null,
});
});
it('executes read-only SQL and rejects write statements', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
await expect(
connector.executeReadOnly(
{ connectionId: 'dw', sql: 'SELECT id, status FROM "analytics"."orders"', maxRows: 100 },
{ runId: 'scan-1' },
),
).resolves.toMatchObject({
headers: ['id', 'status'],
rows: [['1', 'paid']],
rowCount: 1,
});
await expect(
connector.executeReadOnly({ connectionId: 'dw', sql: 'DELETE FROM orders' }, { runId: 'scan-1' }),
).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally');
});
it('lists schemas (databases) from Glue', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
await expect(connector.listSchemas()).resolves.toEqual(['analytics']);
});
it('lists tables from Glue', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
await expect(connector.listTables(['analytics'])).resolves.toEqual([
{
catalog: 'AwsDataCatalog',
schema: 'analytics',
name: 'orders',
kind: 'table',
},
]);
});
it('returns null for columnStats', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
await expect(
connector.columnStats(
{ connectionId: 'dw', table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' }, column: 'status' },
{ runId: 'scan-1' },
),
).resolves.toBeNull();
});
it('tests connection successfully', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
await expect(connector.testConnection()).resolves.toMatchObject({ success: true });
});
it('returns failure result when testConnection throws', async () => {
const factory = fakeClientFactory();
const glueClient = factory.createGlueClient('us-east-1');
vi.mocked(glueClient.getDatabases).mockRejectedValue(new Error('Access denied'));
const brokenFactory: KtxAthenaClientFactory = {
createAthenaClient: factory.createAthenaClient,
createGlueClient: vi.fn(() => glueClient),
};
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: brokenFactory,
});
await expect(connector.testConnection()).resolves.toMatchObject({
success: false,
error: 'Access denied',
});
});
it('cleans up without throwing', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory(),
});
await connector.listSchemas();
await expect(connector.cleanup()).resolves.toBeUndefined();
});
it('throws when query execution fails', async () => {
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory({ queryState: 'FAILED', queryError: 'Syntax error in SQL' }),
});
await expect(
connector.executeReadOnly({ connectionId: 'dw', sql: 'SELECT 1' }, { runId: 'scan-1' }),
).rejects.toThrow('Athena query FAILED: Syntax error in SQL');
});
it('throws when query execution times out', async () => {
let callCount = 0;
// First now() call sets the deadline; second call simulates time past it.
const now = () => (++callCount === 1 ? new Date(0) : new Date(5 * 60 * 1000 + 1));
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: fakeClientFactory({ queryState: 'RUNNING' }),
now,
});
await expect(
connector.executeReadOnly({ connectionId: 'dw', sql: 'SELECT 1' }, { runId: 'scan-1' }),
).rejects.toThrow('timed out after 300s');
});
it('passes the exact column list to Athena when sampling specific columns', async () => {
const factory = fakeClientFactory();
const athenaClient = factory.createAthenaClient('us-east-1');
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: { createAthenaClient: vi.fn(() => athenaClient), createGlueClient: factory.createGlueClient },
});
await connector.sampleTable(
{
connectionId: 'dw',
table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' },
columns: ['id', 'status'],
limit: 5,
},
{ runId: 'scan-1' },
);
expect(vi.mocked(athenaClient.startQueryExecution).mock.calls[0]?.[0].QueryString).toBe(
'SELECT "id", "status" FROM "AwsDataCatalog"."analytics"."orders" LIMIT 5',
);
});
it('paginates Glue databases and tables across multiple pages', async () => {
const glueClient: KtxGlueClient = {
getDatabases: vi.fn()
.mockResolvedValueOnce({ DatabaseList: [{ Name: 'db1' }], NextToken: 'page2' })
.mockResolvedValueOnce({ DatabaseList: [{ Name: 'db2' }], NextToken: undefined }),
getTables: vi.fn().mockImplementation(async ({ DatabaseName }: { DatabaseName: string }) => {
if (DatabaseName === 'db1') {
return {
TableList: [
{
Name: 'table_a',
TableType: 'EXTERNAL_TABLE',
StorageDescriptor: { Columns: [{ Name: 'id', Type: 'bigint' }] },
},
],
NextToken: undefined,
};
}
return {
TableList: [
{
Name: 'table_b',
TableType: 'EXTERNAL_TABLE',
StorageDescriptor: { Columns: [{ Name: 'id', Type: 'bigint' }] },
},
],
NextToken: undefined,
};
}),
};
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: {
createAthenaClient: vi.fn(() => fakeClientFactory().createAthenaClient('us-east-1')),
createGlueClient: vi.fn(() => glueClient),
},
now: () => new Date('2026-06-21T10:00:00.000Z'),
});
const snapshot = await connector.introspect({ connectionId: 'dw', driver: 'athena' }, { runId: 'scan-1' });
expect(vi.mocked(glueClient.getDatabases)).toHaveBeenCalledTimes(2);
expect(snapshot.metadata).toMatchObject({ databases: ['db1', 'db2'], table_count: 2 });
expect(snapshot.tables.map((t) => t.name)).toEqual(['table_a', 'table_b']);
});
it('paginates Athena query results across multiple pages', async () => {
const factory = fakeClientFactory();
const athenaClient = factory.createAthenaClient('us-east-1');
vi.mocked(athenaClient.getQueryResults)
.mockResolvedValueOnce({
ResultSet: {
ResultSetMetadata: {
ColumnInfo: [
{ Name: 'id', Type: 'bigint' },
{ Name: 'status', Type: 'string' },
],
},
Rows: [
// Header row — only present on the first page
{ Data: [{ VarCharValue: 'id' }, { VarCharValue: 'status' }] },
{ Data: [{ VarCharValue: '1' }, { VarCharValue: 'paid' }] },
{ Data: [{ VarCharValue: '2' }, { VarCharValue: 'shipped' }] },
],
},
NextToken: 'page-2',
})
.mockResolvedValueOnce({
ResultSet: {
ResultSetMetadata: { ColumnInfo: [] },
// No header row on subsequent pages
Rows: [{ Data: [{ VarCharValue: '3' }, { VarCharValue: 'pending' }] }],
},
NextToken: undefined,
});
const connector = new KtxAthenaScanConnector({
connectionId: 'dw',
connection,
clientFactory: { createAthenaClient: vi.fn(() => athenaClient), createGlueClient: factory.createGlueClient },
});
const result = await connector.executeReadOnly(
{ connectionId: 'dw', sql: 'SELECT id, status FROM "analytics"."orders"', maxRows: 100 },
{ runId: 'scan-1' },
);
expect(result.headers).toEqual(['id', 'status']);
expect(result.rows).toEqual([
['1', 'paid'],
['2', 'shipped'],
['3', 'pending'],
]);
expect(result.rowCount).toBe(3);
expect(vi.mocked(athenaClient.getQueryResults)).toHaveBeenCalledTimes(2);
expect(vi.mocked(athenaClient.getQueryResults).mock.calls[1]?.[0].NextToken).toBe('page-2');
});
it('adapts to the live-database introspection port via factory', async () => {
const introspection = createAthenaLiveDatabaseIntrospection({
connections: { dw: connection },
clientFactory: fakeClientFactory(),
now: () => new Date('2026-06-21T10:00:00.000Z'),
});
await expect(introspection.extractSchema('dw')).resolves.toMatchObject({
connectionId: 'dw',
driver: 'athena',
metadata: { catalog: 'AwsDataCatalog' },
tables: expect.arrayContaining([
expect.objectContaining({
db: 'analytics',
name: 'orders',
columns: expect.arrayContaining([
expect.objectContaining({ name: 'id', dimensionType: 'number' }),
]),
}),
]),
});
});
});

View file

@ -0,0 +1,72 @@
import { describe, expect, it } from 'vitest';
import { KtxAthenaDialect } from '../../../src/connectors/athena/dialect.js';
describe('KtxAthenaDialect', () => {
const dialect = new KtxAthenaDialect();
it('quotes identifiers and formats catalog.database.table names', () => {
expect(dialect.quoteIdentifier('my"col')).toBe('"my""col"');
expect(dialect.formatTableName({ catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' })).toBe(
'"AwsDataCatalog"."analytics"."orders"',
);
expect(dialect.formatTableName({ db: 'analytics', name: 'orders' })).toBe('"analytics"."orders"');
expect(dialect.formatTableName({ name: 'orders' })).toBe('"orders"');
});
it('maps native Athena/Glue types to normalized types and dimension types', () => {
expect(dialect.mapDataType('bigint')).toBe('BIGINT');
expect(dialect.mapDataType('string')).toBe('VARCHAR');
expect(dialect.mapDataType('array<string>')).toBe('ARRAY');
expect(dialect.mapDataType('map<string,bigint>')).toBe('MAP');
expect(dialect.mapDataType('struct<id:bigint>')).toBe('STRUCT');
expect(dialect.mapDataType('decimal(18,2)')).toBe('DECIMAL');
expect(dialect.mapDataType('UNKNOWN_TYPE')).toBe('UNKNOWN_TYPE');
expect(dialect.mapToDimensionType('timestamp')).toBe('time');
expect(dialect.mapToDimensionType('date')).toBe('time');
expect(dialect.mapToDimensionType('bigint')).toBe('number');
expect(dialect.mapToDimensionType('double')).toBe('number');
expect(dialect.mapToDimensionType('decimal(10,2)')).toBe('number');
expect(dialect.mapToDimensionType('boolean')).toBe('boolean');
expect(dialect.mapToDimensionType('string')).toBe('string');
expect(dialect.mapToDimensionType('varchar')).toBe('string');
});
it('generates correct sample and column-sample SQL', () => {
expect(dialect.generateSampleQuery('"analytics"."orders"', 10, ['id', 'status'])).toBe(
'SELECT "id", "status" FROM "analytics"."orders" LIMIT 10',
);
expect(dialect.generateSampleQuery('"analytics"."orders"', 5)).toBe(
'SELECT * FROM "analytics"."orders" LIMIT 5',
);
expect(dialect.generateColumnSampleQuery('"analytics"."orders"', 'status', 20)).toBe(
'SELECT "status" FROM "analytics"."orders" WHERE "status" IS NOT NULL LIMIT 20',
);
});
it('generates Presto-style cardinality and distinct-values SQL', () => {
expect(dialect.generateCardinalitySampleQuery('"t"', '"col"', 1000)).toContain('approx_distinct');
expect(dialect.generateRandomizedCardinalitySampleQuery('"t"', '"col"', 500)).toContain('rand()');
expect(dialect.generateDistinctValuesQuery('"t"', '"col"', 50)).toContain(
'SELECT DISTINCT CAST("col" AS VARCHAR) AS val',
);
});
it('returns null for column statistics (unsupported)', () => {
expect(dialect.generateColumnStatisticsQuery('analytics', 'orders')).toBeNull();
});
it('produces Trino-correct OFFSET-before-LIMIT ordering', () => {
expect(dialect.getLimitOffsetClause(10)).toBe('LIMIT 10');
expect(dialect.getLimitOffsetClause(10, 0)).toBe('LIMIT 10');
expect(dialect.getLimitOffsetClause(10, 20)).toBe('OFFSET 20 LIMIT 10');
});
it('uses unit-separator (U+001F) as the array_join delimiter', () => {
const sql = dialect.getSampleValueAggregation('SELECT value FROM t');
const separatorIndex =
sql.indexOf("array_join(array_agg(CAST(value AS VARCHAR)), '") +
"array_join(array_agg(CAST(value AS VARCHAR)), '".length;
expect(sql.charCodeAt(separatorIndex)).toBe(0x1f);
});
});

View file

@ -305,7 +305,7 @@ describe('getDialectForDriver', () => {
it('throws with a supported-driver list for unknown drivers', () => {
expect(() => getDialectForDriver('oracle')).toThrow(
'Unsupported driver "oracle". Supported drivers: bigquery, clickhouse, duckdb, mongodb, mysql, postgres, snowflake, sqlite, sqlserver',
'Unsupported driver "oracle". Supported drivers: athena, bigquery, clickhouse, duckdb, mongodb, mysql, postgres, snowflake, sqlite, sqlserver',
);
});

View file

@ -70,6 +70,11 @@ const connectionFixtures: Record<KtxConnectionDriver, FixtureFactory> = {
database: 'ANALYTICS',
schema: 'PUBLIC',
}),
athena: () => ({
driver: 'athena',
region: 'us-east-1',
s3_staging_dir: 's3://my-bucket/athena-results/',
}),
};
const allowedScopeKeys = new Set(['dataset_ids', 'databases', 'schemas', 'schema_names']);
@ -100,6 +105,7 @@ describe('driverRegistrations', () => {
const registryDrivers = Object.keys(driverRegistrations).sort();
expect(listSupportedDrivers()).toEqual(registryDrivers);
expect(listSupportedDrivers()).toEqual([
'athena',
'bigquery',
'clickhouse',
'duckdb',

View file

@ -2175,6 +2175,40 @@ describe('local scan', () => {
};
expect(manifest.tables.orders?.joins?.some((join) => join.to === 'accounts')).toBe(true);
});
it('accepts athena as a native standalone scan driver when the host supplies a live-database adapter', async () => {
await writeFile(
join(project.projectDir, 'ktx.yaml'),
[
'connections:',
' warehouse:',
' driver: athena',
' region: us-east-1',
' s3_staging_dir: s3://my-bucket/athena-results/',
' databases:',
' - analytics',
'ingest:',
' adapters:',
' - live-database',
'',
].join('\n'),
'utf-8',
);
project = await loadKtxProject({ projectDir: project.projectDir });
const result = await runLocalScan({
project,
adapters: [fetchOnlyAdapter()],
connectionId: 'warehouse',
jobId: 'scan-run-athena',
now: () => new Date('2026-04-29T17:00:00.000Z'),
});
expect(result.report.driver).toBe('athena');
expect(result.report.artifactPaths.reportPath).toBe(
'raw-sources/warehouse/live-database/2026-04-29-170000-scan-run-athena/scan-report.json',
);
});
});
describe('resolveEnabledTables', () => {

View file

@ -12,6 +12,7 @@ describe('sqlAnalysisDialectForDriver', () => {
expect(sqlAnalysisDialectForDriver('duckdb')).toBe('duckdb');
expect(sqlAnalysisDialectForDriver('clickhouse')).toBe('clickhouse');
expect(sqlAnalysisDialectForDriver('databricks')).toBe('databricks');
expect(sqlAnalysisDialectForDriver('athena')).toBe('athena');
});
it('maps local connection-type spellings to sqlglot dialects', () => {

View file

@ -243,6 +243,7 @@ describe('setup databases step', () => {
{ value: 'mysql', label: 'MySQL' },
{ value: 'clickhouse', label: 'ClickHouse' },
{ value: 'sqlserver', label: 'SQL Server' },
{ value: 'athena', label: 'Amazon Athena' },
{ value: 'mongodb', label: 'MongoDB' },
{ value: 'sqlite', label: 'SQLite' },
{ value: 'duckdb', label: 'DuckDB' },
@ -618,6 +619,29 @@ describe('setup databases step', () => {
},
],
},
{
driver: 'athena',
textValues: ['', 'us-east-1', 's3://my-bucket/athena-results/', '', ''],
expectedTextPrompts: [
{
message: connectionNamePrompt('Amazon Athena'),
placeholder: 'athena-warehouse',
initialValue: 'athena-warehouse',
},
{
message: 'AWS region\nFor example us-east-1.',
},
{
message: 'S3 staging directory\nAthena writes query results here. For example s3://my-bucket/athena-results/.',
},
{
message: 'Athena workgroup (optional)\nPress Enter to use the default workgroup "primary".',
},
{
message: 'Glue Data Catalog name (optional)\nPress Enter to use the default "AwsDataCatalog".',
},
],
},
];
for (const testCase of cases) {
@ -1967,6 +1991,40 @@ describe('setup databases step', () => {
expect(project.config.connections['clickhouse-warehouse']).not.toHaveProperty('schemas');
});
it('maps Athena scripted database schema input to databases field', async () => {
await writeFile(
join(tempDir, 'ktx.yaml'),
[
'connections:',
' athena-warehouse:',
' driver: athena',
' region: us-east-1',
' s3_staging_dir: s3://my-bucket/athena-results/',
'',
].join('\n'),
'utf-8',
);
await runKtxSetupDatabasesStep(
{
projectDir: tempDir,
inputMode: 'disabled',
skipDatabases: false,
databaseConnectionIds: ['athena-warehouse'],
databaseSchemas: ['analytics', 'raw'],
},
makeIo().io,
{ testConnection: vi.fn(async () => 0), scanConnection: vi.fn(async () => 0) },
);
const project = await loadKtxProject({ projectDir: tempDir });
expect(project.config.connections['athena-warehouse']).toMatchObject({
driver: 'athena',
databases: ['analytics', 'raw'],
});
expect(project.config.connections['athena-warehouse']).not.toHaveProperty('schemas');
});
it('does not prompt for a bootstrap BigQuery dataset before scope discovery', async () => {
const prompts = makePromptAdapter({
multiselectValues: [['bigquery']],