mirror of
https://github.com/Kaelio/ktx.git
synced 2026-07-04 10:52:13 +02:00
feat(connector): add Amazon Athena connector via Glue Data Catalog (#309)
* feat(connector): add Amazon Athena connector via Glue Data Catalog
* fix(athena): address reviewer feedback
* fix(athena): wire scope discovery, fix normalizeDriver, tighten types and tests
* fix(athena): honor databases scope, wire sql-analysis dialect, harden config resolution
  - introspect() limits to the configured `databases` scope instead of scanning every Glue database in the account (docs promised this; connector ignored it)
  - add athena -> athena to sql-analysis SQLGLOT_DIALECTS so `ktx sql` and MCP read-only validation parse Athena SQL under the Trino grammar, not postgres
  - stringConfigValue coerces a resolved-empty `env:` reference to undefined so optional fields fall back to their defaults (workgroup 'primary', catalog 'AwsDataCatalog') instead of ''
  - drop trailing whitespace in dialect.test.ts
* fix(athena): integrate with main's SQL/non-SQL dialect split and add dialect notes
  Rebase onto main, which introduced the KtxDialect (core) vs KtxSqlDialect (SQL-only) split for MongoDB:
  - KtxAthenaDialect implements KtxSqlDialect; the connector resolves it via getSqlDialectForDriver so SQL-generation methods stay in scope
  - add authored athena.md SQL notes for the sql_dialect_notes MCP tool, required now that athena resolves to the athena sqlglot dialect (dialect-notes coverage is derived from the warehouse-driver registry)

---------

Co-authored-by: Andrey Avtomonov <andreybavt@gmail.com>
This commit is contained in:
parent
6d01030745
commit
fe7e6bd1fa
24 changed files with 2047 additions and 6 deletions
630
packages/cli/test/connectors/athena/connector.test.ts
Normal file
630
packages/cli/test/connectors/athena/connector.test.ts
Normal file
|
|
@ -0,0 +1,630 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
athenaConnectionConfigFromConfig,
|
||||
isKtxAthenaConnectionConfig,
|
||||
KtxAthenaScanConnector,
|
||||
type KtxAthenaClientFactory,
|
||||
type KtxAthenaClient,
|
||||
type KtxGlueClient,
|
||||
} from '../../../src/connectors/athena/connector.js';
|
||||
import { createAthenaLiveDatabaseIntrospection } from '../../../src/connectors/athena/live-database-introspection.js';
|
||||
import { tableRefSet } from '../../../src/context/scan/table-ref.js';
|
||||
|
||||
/**
 * Builds a `KtxAthenaClientFactory` backed entirely by vitest mocks.
 *
 * The fake Athena client hands out sequential `exec-N` execution ids, remembers the SQL
 * submitted under each id, always reports `options.queryState` (default `'SUCCEEDED'`) with
 * `options.queryError` as the state-change reason, and serves one of two canned result pages
 * chosen from the remembered SQL. The fake Glue client serves one `analytics` database that
 * contains one `orders` table.
 *
 * @param options.queryState State reported by every `getQueryExecution` call.
 * @param options.queryError `StateChangeReason` reported alongside that state.
 * @returns A factory whose two `create*Client` mocks always return the same fake clients.
 */
function fakeClientFactory(options: { queryState?: string; queryError?: string } = {}): KtxAthenaClientFactory {
  const reportedState = options.queryState ?? 'SUCCEEDED';
  const sqlByExecutionId = new Map<string, string>();
  let startedQueries = 0;

  const athena: KtxAthenaClient = {
    startQueryExecution: vi.fn(async (input) => {
      startedQueries += 1;
      const executionId = `exec-${startedQueries}`;
      sqlByExecutionId.set(executionId, input.QueryString);
      return { QueryExecutionId: executionId };
    }),
    getQueryExecution: vi.fn(async () => {
      return {
        QueryExecution: {
          Status: { State: reportedState, StateChangeReason: options.queryError },
        },
      };
    }),
    getQueryResults: vi.fn(async (input) => {
      const submittedSql = sqlByExecutionId.get(input.QueryExecutionId) ?? '';
      const isColumnSample = submittedSql.includes('IS NOT NULL');

      if (!isColumnSample) {
        return {
          ResultSet: {
            ResultSetMetadata: {
              ColumnInfo: [
                { Name: 'id', Type: 'bigint' },
                { Name: 'status', Type: 'string' },
              ],
            },
            Rows: [
              // Header row (Athena always includes it on first page)
              { Data: [{ VarCharValue: 'id' }, { VarCharValue: 'status' }] },
              // Data row
              { Data: [{ VarCharValue: '1' }, { VarCharValue: 'paid' }] },
            ],
          },
          NextToken: undefined,
        };
      }

      // Column sample query: single-column result for the queried column only.
      return {
        ResultSet: {
          ResultSetMetadata: { ColumnInfo: [{ Name: 'status', Type: 'string' }] },
          Rows: [
            { Data: [{ VarCharValue: 'status' }] }, // header row
            { Data: [{ VarCharValue: 'paid' }] },
          ],
        },
        NextToken: undefined,
      };
    }),
  };

  const glue: KtxGlueClient = {
    getDatabases: vi.fn(async () => {
      return { DatabaseList: [{ Name: 'analytics' }], NextToken: undefined };
    }),
    getTables: vi.fn(async () => {
      return {
        TableList: [
          {
            Name: 'orders',
            TableType: 'EXTERNAL_TABLE',
            Description: 'Orders table',
            StorageDescriptor: {
              Columns: [
                { Name: 'id', Type: 'bigint', Comment: 'Order id' },
                { Name: 'status', Type: 'string' },
              ],
            },
            PartitionKeys: [{ Name: 'dt', Type: 'date', Comment: 'Partition date' }],
          },
        ],
        NextToken: undefined,
      };
    }),
  };

  return {
    createAthenaClient: vi.fn(() => athena),
    createGlueClient: vi.fn(() => glue),
  };
}
|
||||
|
||||
// Fully specified Athena connection fixture shared by the tests in this file.
// `as const` keeps every field at its literal type.
const connection = {
  driver: 'athena',
  region: 'us-east-1',
  s3_staging_dir: 's3://my-bucket/athena-results/',
  workgroup: 'analytics',
  catalog: 'AwsDataCatalog',
  database: 'analytics',
} as const;
|
||||
|
||||
describe('KtxAthenaScanConnector', () => {
|
||||
// The guard accepts the athena fixture and rejects a bigquery config, null, and undefined.
it('identifies athena connection configs correctly', () => {
  const rejected = [{ driver: 'bigquery' }, null, undefined] as const;

  expect(isKtxAthenaConnectionConfig(connection)).toBe(true);
  for (const candidate of rejected) {
    expect(isKtxAthenaConnectionConfig(candidate)).toBe(false);
  }
});
|
||||
|
||||
// A complete config resolves to camelCased fields; an incomplete one throws an error that
// names the missing key path (`region` first, then `s3_staging_dir`).
it('resolves configuration and throws on missing required fields', () => {
  const resolved = athenaConnectionConfigFromConfig({ connectionId: 'dw', connection });
  expect(resolved).toMatchObject({
    region: 'us-east-1',
    s3StagingDir: 's3://my-bucket/athena-results/',
    workgroup: 'analytics',
    catalog: 'AwsDataCatalog',
    database: 'analytics',
  });

  const resolveWithoutRegion = () =>
    athenaConnectionConfigFromConfig({ connectionId: 'dw', connection: { driver: 'athena' } });
  expect(resolveWithoutRegion).toThrow('connections.dw.region');

  const resolveWithoutStagingDir = () =>
    athenaConnectionConfigFromConfig({
      connectionId: 'dw',
      connection: { driver: 'athena', region: 'us-east-1' },
    });
  expect(resolveWithoutStagingDir).toThrow('connections.dw.s3_staging_dir');
});
|
||||
|
||||
// With only the required fields present, workgroup and catalog fall back to their defaults
// and database stays undefined.
it('applies defaults for optional config fields', () => {
  const { workgroup, catalog, database } = athenaConnectionConfigFromConfig({
    connectionId: 'dw',
    connection: { driver: 'athena', region: 'us-east-1', s3_staging_dir: 's3://bucket/' },
  });

  expect(workgroup).toBe('primary');
  expect(catalog).toBe('AwsDataCatalog');
  expect(database).toBeUndefined();
});
|
||||
|
||||
// Full introspection against the fake Glue catalog: checks snapshot-level metadata, the
// single `orders` table, and its columns (storage columns first, then the partition key).
it('introspects databases, tables, and columns from Glue', async () => {
  const expectedColumns = [
    {
      name: 'id',
      nativeType: 'bigint',
      normalizedType: 'BIGINT',
      dimensionType: 'number',
      nullable: true,
      primaryKey: false,
      comment: 'Order id',
    },
    {
      name: 'status',
      nativeType: 'string',
      normalizedType: 'VARCHAR',
      dimensionType: 'string',
      nullable: true,
      primaryKey: false,
      comment: null,
    },
    {
      name: 'dt',
      nativeType: 'date',
      normalizedType: 'DATE',
      dimensionType: 'time',
      nullable: true,
      primaryKey: false,
      comment: 'Partition date',
    },
  ];

  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
    now: () => new Date('2026-06-21T10:00:00.000Z'),
  });

  const snapshot = await connector.introspect({ connectionId: 'dw', driver: 'athena' }, { runId: 'scan-1' });
  const [firstTable] = snapshot.tables;

  expect(snapshot).toMatchObject({
    connectionId: 'dw',
    driver: 'athena',
    extractedAt: '2026-06-21T10:00:00.000Z',
    scope: { catalogs: ['AwsDataCatalog'], datasets: ['analytics'] },
    metadata: {
      catalog: 'AwsDataCatalog',
      databases: ['analytics'],
      table_count: 1,
      total_columns: 3,
    },
  });
  expect(firstTable).toMatchObject({
    catalog: 'AwsDataCatalog',
    db: 'analytics',
    name: 'orders',
    kind: 'table',
    comment: 'Orders table',
    estimatedRows: null,
    foreignKeys: [],
  });
  expect(firstTable?.columns).toEqual(expectedColumns);
});
|
||||
|
||||
// A tableScope naming a table that Glue does not return yields no tables; a scope naming
// `orders` yields exactly that table.
it('respects tableScope and excludes tables not in scope', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
    now: () => new Date('2026-06-21T10:00:00.000Z'),
  });

  // Runs one introspection restricted to a single table name in AwsDataCatalog.analytics.
  const introspectScopedTo = (name: string) =>
    connector.introspect(
      {
        connectionId: 'dw',
        driver: 'athena',
        tableScope: tableRefSet([{ catalog: 'AwsDataCatalog', db: 'analytics', name }]),
      },
      { runId: 'scan-1' },
    );

  const outOfScope = await introspectScopedTo('nonexistent');
  expect(outOfScope.tables).toHaveLength(0);

  const inScope = await introspectScopedTo('orders');
  expect(inScope.tables).toHaveLength(1);
  expect(inScope.tables[0]?.name).toBe('orders');
});
|
||||
|
||||
// Glue advertises three databases, but the connection config lists only two. The connector
// must fetch tables for exactly those two and never call getDatabases.
it('limits introspection to the configured databases scope', async () => {
  const tableRequests: string[] = [];
  const getDatabases = vi.fn(async () => {
    return {
      DatabaseList: [{ Name: 'analytics' }, { Name: 'raw' }, { Name: 'staging' }],
      NextToken: undefined,
    };
  });
  const scopedGlue: KtxGlueClient = {
    getDatabases,
    getTables: vi.fn(async ({ DatabaseName }) => {
      // Record which database each getTables call targeted.
      tableRequests.push(DatabaseName);
      return {
        TableList: [
          {
            Name: `${DatabaseName}_orders`,
            TableType: 'EXTERNAL_TABLE',
            StorageDescriptor: { Columns: [{ Name: 'id', Type: 'bigint' }] },
          },
        ],
        NextToken: undefined,
      };
    }),
  };

  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection: { ...connection, databases: ['analytics', 'raw'] },
    clientFactory: {
      createAthenaClient: vi.fn(() => fakeClientFactory().createAthenaClient('us-east-1')),
      createGlueClient: vi.fn(() => scopedGlue),
    },
    now: () => new Date('2026-06-21T10:00:00.000Z'),
  });

  const snapshot = await connector.introspect({ connectionId: 'dw', driver: 'athena' }, { runId: 'scan-1' });
  const scannedDatabases = snapshot.tables.map((table) => table.db);

  // Scope is taken from config, so the account-wide database list is never enumerated.
  expect(getDatabases).not.toHaveBeenCalled();
  expect(tableRequests).toEqual(['analytics', 'raw']);
  expect(snapshot.scope).toMatchObject({ datasets: ['analytics', 'raw'] });
  expect(scannedDatabases).toEqual(['analytics', 'raw']);
});
|
||||
|
||||
// `env:` references that point at variables missing from the supplied env must behave as if
// the optional field was omitted, so the defaults apply.
it('resolves optional env-referenced config to defaults when the variable is unset', () => {
  const { workgroup, catalog } = athenaConnectionConfigFromConfig({
    connectionId: 'dw',
    connection: {
      driver: 'athena',
      region: 'us-east-1',
      s3_staging_dir: 's3://bucket/',
      workgroup: 'env:ATHENA_WORKGROUP_UNSET',
      catalog: 'env:GLUE_CATALOG_UNSET',
    },
    env: {},
  });

  expect(workgroup).toBe('primary');
  expect(catalog).toBe('AwsDataCatalog');
});
|
||||
|
||||
// Table sampling goes through the fake Athena client; the header row of the canned result
// page must not appear among the returned rows.
it('samples a table via Athena query execution', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  const sample = await connector.sampleTable(
    {
      connectionId: 'dw',
      table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' },
      columns: ['id', 'status'],
      limit: 10,
    },
    { runId: 'scan-1' },
  );

  expect(sample).toMatchObject({
    headers: ['id', 'status'],
    rows: [['1', 'paid']],
    totalRows: 1,
  });
});
|
||||
|
||||
// Column sampling returns the sampled values only; null and distinct counts are reported as null.
it('samples a column via Athena query execution', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  const sample = await connector.sampleColumn(
    {
      connectionId: 'dw',
      table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' },
      column: 'status',
      limit: 10,
    },
    { runId: 'scan-1' },
  );

  expect(sample).toMatchObject({
    values: ['paid'],
    nullCount: null,
    distinctCount: null,
  });
});
|
||||
|
||||
// A SELECT is executed and its rows returned; a DELETE is rejected with the read-only error.
it('executes read-only SQL and rejects write statements', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  const selectResult = await connector.executeReadOnly(
    { connectionId: 'dw', sql: 'SELECT id, status FROM "analytics"."orders"', maxRows: 100 },
    { runId: 'scan-1' },
  );
  expect(selectResult).toMatchObject({
    headers: ['id', 'status'],
    rows: [['1', 'paid']],
    rowCount: 1,
  });

  const deleteAttempt = connector.executeReadOnly(
    { connectionId: 'dw', sql: 'DELETE FROM orders' },
    { runId: 'scan-1' },
  );
  await expect(deleteAttempt).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally');
});
|
||||
|
||||
// listSchemas surfaces the Glue database names.
it('lists schemas (databases) from Glue', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  const schemas = await connector.listSchemas();

  expect(schemas).toEqual(['analytics']);
});
|
||||
|
||||
// listTables returns one entry per Glue table, tagged with catalog, schema, and kind.
it('lists tables from Glue', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  const tables = await connector.listTables(['analytics']);

  expect(tables).toEqual([
    {
      catalog: 'AwsDataCatalog',
      schema: 'analytics',
      name: 'orders',
      kind: 'table',
    },
  ]);
});
|
||||
|
||||
// columnStats resolves to null for the Athena connector.
it('returns null for columnStats', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  const stats = await connector.columnStats(
    { connectionId: 'dw', table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' }, column: 'status' },
    { runId: 'scan-1' },
  );

  expect(stats).toBeNull();
});
|
||||
|
||||
// With healthy fake clients, testConnection reports success.
it('tests connection successfully', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  const outcome = await connector.testConnection();

  expect(outcome).toMatchObject({ success: true });
});
|
||||
|
||||
// When Glue's getDatabases rejects, testConnection resolves to a failure result carrying
// the error message instead of throwing.
it('returns failure result when testConnection throws', async () => {
  const baseFactory = fakeClientFactory();
  const failingGlue = baseFactory.createGlueClient('us-east-1');
  vi.mocked(failingGlue.getDatabases).mockRejectedValue(new Error('Access denied'));

  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: {
      createAthenaClient: baseFactory.createAthenaClient,
      createGlueClient: vi.fn(() => failingGlue),
    },
  });

  const outcome = await connector.testConnection();

  expect(outcome).toMatchObject({
    success: false,
    error: 'Access denied',
  });
});
|
||||
|
||||
// cleanup resolves to undefined after the connector has already been used once.
it('cleans up without throwing', async () => {
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory(),
  });

  await connector.listSchemas();
  const cleanupResult = await connector.cleanup();

  expect(cleanupResult).toBeUndefined();
});
|
||||
|
||||
// A FAILED execution state is surfaced as an error that includes the state-change reason.
it('throws when query execution fails', async () => {
  const failingFactory = fakeClientFactory({ queryState: 'FAILED', queryError: 'Syntax error in SQL' });
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: failingFactory,
  });

  const execution = connector.executeReadOnly({ connectionId: 'dw', sql: 'SELECT 1' }, { runId: 'scan-1' });

  await expect(execution).rejects.toThrow('Athena query FAILED: Syntax error in SQL');
});
|
||||
|
||||
// A query that stays RUNNING is abandoned once the injected clock moves past the deadline.
it('throws when query execution times out', async () => {
  let clockReads = 0;
  // First now() call sets the deadline; second call simulates time past it.
  const now = () => {
    clockReads += 1;
    return clockReads === 1 ? new Date(0) : new Date(5 * 60 * 1000 + 1);
  };

  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: fakeClientFactory({ queryState: 'RUNNING' }),
    now,
  });

  const execution = connector.executeReadOnly({ connectionId: 'dw', sql: 'SELECT 1' }, { runId: 'scan-1' });

  await expect(execution).rejects.toThrow('timed out after 300s');
});
|
||||
|
||||
// Inspects the SQL handed to startQueryExecution: requested columns are quoted and listed
// in order, the table is fully qualified, and the limit is applied.
it('passes the exact column list to Athena when sampling specific columns', async () => {
  const baseFactory = fakeClientFactory();
  const spiedAthena = baseFactory.createAthenaClient('us-east-1');
  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: {
      createAthenaClient: vi.fn(() => spiedAthena),
      createGlueClient: baseFactory.createGlueClient,
    },
  });

  await connector.sampleTable(
    {
      connectionId: 'dw',
      table: { catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' },
      columns: ['id', 'status'],
      limit: 5,
    },
    { runId: 'scan-1' },
  );

  const firstStartCall = vi.mocked(spiedAthena.startQueryExecution).mock.calls[0];
  expect(firstStartCall?.[0].QueryString).toBe(
    'SELECT "id", "status" FROM "AwsDataCatalog"."analytics"."orders" LIMIT 5',
  );
});
|
||||
|
||||
// getDatabases answers in two pages (db1, then db2); introspection must follow NextToken and
// collect the tables of both databases.
// NOTE(review): getTables always answers in a single page here, so table pagination itself
// is not exercised despite the test title. Confirm whether that is intended.
it('paginates Glue databases and tables across multiple pages', async () => {
  const pagedGlue: KtxGlueClient = {
    getDatabases: vi
      .fn()
      .mockResolvedValueOnce({ DatabaseList: [{ Name: 'db1' }], NextToken: 'page2' })
      .mockResolvedValueOnce({ DatabaseList: [{ Name: 'db2' }], NextToken: undefined }),
    getTables: vi.fn().mockImplementation(async ({ DatabaseName }: { DatabaseName: string }) => {
      // db1 owns table_a; every other database answers with table_b.
      const tableName = DatabaseName === 'db1' ? 'table_a' : 'table_b';
      return {
        TableList: [
          {
            Name: tableName,
            TableType: 'EXTERNAL_TABLE',
            StorageDescriptor: { Columns: [{ Name: 'id', Type: 'bigint' }] },
          },
        ],
        NextToken: undefined,
      };
    }),
  };

  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: {
      createAthenaClient: vi.fn(() => fakeClientFactory().createAthenaClient('us-east-1')),
      createGlueClient: vi.fn(() => pagedGlue),
    },
    now: () => new Date('2026-06-21T10:00:00.000Z'),
  });

  const snapshot = await connector.introspect({ connectionId: 'dw', driver: 'athena' }, { runId: 'scan-1' });
  const tableNames = snapshot.tables.map((table) => table.name);

  expect(vi.mocked(pagedGlue.getDatabases)).toHaveBeenCalledTimes(2);
  expect(snapshot.metadata).toMatchObject({ databases: ['db1', 'db2'], table_count: 2 });
  expect(tableNames).toEqual(['table_a', 'table_b']);
});
|
||||
|
||||
// getQueryResults answers in two pages. Only the first page carries the header row, so rows
// from the second page must all be kept, and the second request must forward the NextToken.
it('paginates Athena query results across multiple pages', async () => {
  const baseFactory = fakeClientFactory();
  const pagedAthena = baseFactory.createAthenaClient('us-east-1');
  const getQueryResults = vi.mocked(pagedAthena.getQueryResults);

  getQueryResults
    .mockResolvedValueOnce({
      ResultSet: {
        ResultSetMetadata: {
          ColumnInfo: [
            { Name: 'id', Type: 'bigint' },
            { Name: 'status', Type: 'string' },
          ],
        },
        Rows: [
          // Header row — only present on the first page
          { Data: [{ VarCharValue: 'id' }, { VarCharValue: 'status' }] },
          { Data: [{ VarCharValue: '1' }, { VarCharValue: 'paid' }] },
          { Data: [{ VarCharValue: '2' }, { VarCharValue: 'shipped' }] },
        ],
      },
      NextToken: 'page-2',
    })
    .mockResolvedValueOnce({
      ResultSet: {
        ResultSetMetadata: { ColumnInfo: [] },
        // No header row on subsequent pages
        Rows: [{ Data: [{ VarCharValue: '3' }, { VarCharValue: 'pending' }] }],
      },
      NextToken: undefined,
    });

  const connector = new KtxAthenaScanConnector({
    connectionId: 'dw',
    connection,
    clientFactory: {
      createAthenaClient: vi.fn(() => pagedAthena),
      createGlueClient: baseFactory.createGlueClient,
    },
  });

  const { headers, rows, rowCount } = await connector.executeReadOnly(
    { connectionId: 'dw', sql: 'SELECT id, status FROM "analytics"."orders"', maxRows: 100 },
    { runId: 'scan-1' },
  );

  expect(headers).toEqual(['id', 'status']);
  expect(rows).toEqual([
    ['1', 'paid'],
    ['2', 'shipped'],
    ['3', 'pending'],
  ]);
  expect(rowCount).toBe(3);
  expect(getQueryResults).toHaveBeenCalledTimes(2);
  expect(getQueryResults.mock.calls[1]?.[0].NextToken).toBe('page-2');
});
|
||||
|
||||
// The live-database introspection factory wraps the connector: extractSchema('dw') must
// surface the same Glue-derived table and column data.
it('adapts to the live-database introspection port via factory', async () => {
  const introspection = createAthenaLiveDatabaseIntrospection({
    connections: { dw: connection },
    clientFactory: fakeClientFactory(),
    now: () => new Date('2026-06-21T10:00:00.000Z'),
  });

  const schema = await introspection.extractSchema('dw');

  expect(schema).toMatchObject({
    connectionId: 'dw',
    driver: 'athena',
    metadata: { catalog: 'AwsDataCatalog' },
    tables: expect.arrayContaining([
      expect.objectContaining({
        db: 'analytics',
        name: 'orders',
        columns: expect.arrayContaining([expect.objectContaining({ name: 'id', dimensionType: 'number' })]),
      }),
    ]),
  });
});
|
||||
});
|
||||
72
packages/cli/test/connectors/athena/dialect.test.ts
Normal file
72
packages/cli/test/connectors/athena/dialect.test.ts
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { KtxAthenaDialect } from '../../../src/connectors/athena/dialect.js';
|
||||
|
||||
// Unit tests for the Athena SQL dialect: identifier quoting, type mapping, and the SQL text
// produced by its query generators. One dialect instance is shared by all tests.
describe('KtxAthenaDialect', () => {
  const dialect = new KtxAthenaDialect();

  // Embedded double quotes are doubled; table names use as many qualifiers as are provided.
  it('quotes identifiers and formats catalog.database.table names', () => {
    expect(dialect.quoteIdentifier('my"col')).toBe('"my""col"');
    expect(dialect.formatTableName({ catalog: 'AwsDataCatalog', db: 'analytics', name: 'orders' })).toBe(
      '"AwsDataCatalog"."analytics"."orders"',
    );
    expect(dialect.formatTableName({ db: 'analytics', name: 'orders' })).toBe('"analytics"."orders"');
    expect(dialect.formatTableName({ name: 'orders' })).toBe('"orders"');
  });

  // Parameterized types collapse to their base name; unrecognized types pass through unchanged.
  it('maps native Athena/Glue types to normalized types and dimension types', () => {
    expect(dialect.mapDataType('bigint')).toBe('BIGINT');
    expect(dialect.mapDataType('string')).toBe('VARCHAR');
    expect(dialect.mapDataType('array<string>')).toBe('ARRAY');
    expect(dialect.mapDataType('map<string,bigint>')).toBe('MAP');
    expect(dialect.mapDataType('struct<id:bigint>')).toBe('STRUCT');
    expect(dialect.mapDataType('decimal(18,2)')).toBe('DECIMAL');
    expect(dialect.mapDataType('UNKNOWN_TYPE')).toBe('UNKNOWN_TYPE');

    expect(dialect.mapToDimensionType('timestamp')).toBe('time');
    expect(dialect.mapToDimensionType('date')).toBe('time');
    expect(dialect.mapToDimensionType('bigint')).toBe('number');
    expect(dialect.mapToDimensionType('double')).toBe('number');
    expect(dialect.mapToDimensionType('decimal(10,2)')).toBe('number');
    expect(dialect.mapToDimensionType('boolean')).toBe('boolean');
    expect(dialect.mapToDimensionType('string')).toBe('string');
    expect(dialect.mapToDimensionType('varchar')).toBe('string');
  });

  // Without a column list the sample query selects `*`; the column sample filters out NULLs.
  it('generates correct sample and column-sample SQL', () => {
    expect(dialect.generateSampleQuery('"analytics"."orders"', 10, ['id', 'status'])).toBe(
      'SELECT "id", "status" FROM "analytics"."orders" LIMIT 10',
    );
    expect(dialect.generateSampleQuery('"analytics"."orders"', 5)).toBe(
      'SELECT * FROM "analytics"."orders" LIMIT 5',
    );
    expect(dialect.generateColumnSampleQuery('"analytics"."orders"', 'status', 20)).toBe(
      'SELECT "status" FROM "analytics"."orders" WHERE "status" IS NOT NULL LIMIT 20',
    );
  });

  // Only the distinguishing fragments are checked, not the full statements.
  it('generates Presto-style cardinality and distinct-values SQL', () => {
    expect(dialect.generateCardinalitySampleQuery('"t"', '"col"', 1000)).toContain('approx_distinct');
    expect(dialect.generateRandomizedCardinalitySampleQuery('"t"', '"col"', 500)).toContain('rand()');
    expect(dialect.generateDistinctValuesQuery('"t"', '"col"', 50)).toContain(
      'SELECT DISTINCT CAST("col" AS VARCHAR) AS val',
    );
  });

  it('returns null for column statistics (unsupported)', () => {
    expect(dialect.generateColumnStatisticsQuery('analytics', 'orders')).toBeNull();
  });

  // A missing or zero offset emits LIMIT alone; otherwise OFFSET precedes LIMIT.
  it('produces Trino-correct OFFSET-before-LIMIT ordering', () => {
    expect(dialect.getLimitOffsetClause(10)).toBe('LIMIT 10');
    expect(dialect.getLimitOffsetClause(10, 0)).toBe('LIMIT 10');
    expect(dialect.getLimitOffsetClause(10, 20)).toBe('OFFSET 20 LIMIT 10');
  });

  it('uses unit-separator (U+001F) as the array_join delimiter', () => {
    const sql = dialect.getSampleValueAggregation('SELECT value FROM t');
    // Text that immediately precedes the delimiter character inside the generated SQL.
    const delimiterPrefix = "array_join(array_agg(CAST(value AS VARCHAR)), '";
    const prefixIndex = sql.indexOf(delimiterPrefix);

    // Guard against indexOf returning -1: without it, the character check below would
    // silently inspect an unrelated position of the SQL string.
    expect(prefixIndex).toBeGreaterThanOrEqual(0);
    expect(sql.charCodeAt(prefixIndex + delimiterPrefix.length)).toBe(0x1f);
  });
});
|
||||
|
|
@ -305,7 +305,7 @@ describe('getDialectForDriver', () => {
|
|||
|
||||
// Unknown drivers are rejected with an error that lists the supported drivers, including
// athena. The pre-change message (without athena) was diff residue and is dropped, so
// `toThrow` receives exactly one expected message.
it('throws with a supported-driver list for unknown drivers', () => {
  expect(() => getDialectForDriver('oracle')).toThrow(
    'Unsupported driver "oracle". Supported drivers: athena, bigquery, clickhouse, duckdb, mongodb, mysql, postgres, snowflake, sqlite, sqlserver',
  );
});
|
||||
|
||||
|
|
|
|||
|
|
@ -70,6 +70,11 @@ const connectionFixtures: Record<KtxConnectionDriver, FixtureFactory> = {
|
|||
database: 'ANALYTICS',
|
||||
schema: 'PUBLIC',
|
||||
}),
|
||||
athena: () => ({
|
||||
driver: 'athena',
|
||||
region: 'us-east-1',
|
||||
s3_staging_dir: 's3://my-bucket/athena-results/',
|
||||
}),
|
||||
};
|
||||
|
||||
const allowedScopeKeys = new Set(['dataset_ids', 'databases', 'schemas', 'schema_names']);
|
||||
|
|
@ -100,6 +105,7 @@ describe('driverRegistrations', () => {
|
|||
const registryDrivers = Object.keys(driverRegistrations).sort();
|
||||
expect(listSupportedDrivers()).toEqual(registryDrivers);
|
||||
expect(listSupportedDrivers()).toEqual([
|
||||
'athena',
|
||||
'bigquery',
|
||||
'clickhouse',
|
||||
'duckdb',
|
||||
|
|
|
|||
|
|
@ -2175,6 +2175,40 @@ describe('local scan', () => {
|
|||
};
|
||||
expect(manifest.tables.orders?.joins?.some((join) => join.to === 'accounts')).toBe(true);
|
||||
});
|
||||
|
||||
// Rewrites ktx.yaml with an athena connection scoped to `analytics`, reloads the project, and
// runs a local scan with a fetch-only adapter. The report must name the athena driver and
// use the timestamped live-database artifact path.
it('accepts athena as a native standalone scan driver when the host supplies a live-database adapter', async () => {
  const configYaml = [
    'connections:',
    '  warehouse:',
    '    driver: athena',
    '    region: us-east-1',
    '    s3_staging_dir: s3://my-bucket/athena-results/',
    '    databases:',
    '      - analytics',
    'ingest:',
    '  adapters:',
    '    - live-database',
    '',
  ].join('\n');
  await writeFile(join(project.projectDir, 'ktx.yaml'), configYaml, 'utf-8');
  project = await loadKtxProject({ projectDir: project.projectDir });

  const { report } = await runLocalScan({
    project,
    adapters: [fetchOnlyAdapter()],
    connectionId: 'warehouse',
    jobId: 'scan-run-athena',
    now: () => new Date('2026-04-29T17:00:00.000Z'),
  });

  expect(report.driver).toBe('athena');
  expect(report.artifactPaths.reportPath).toBe(
    'raw-sources/warehouse/live-database/2026-04-29-170000-scan-run-athena/scan-report.json',
  );
});
|
||||
});
|
||||
|
||||
describe('resolveEnabledTables', () => {
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ describe('sqlAnalysisDialectForDriver', () => {
|
|||
expect(sqlAnalysisDialectForDriver('duckdb')).toBe('duckdb');
|
||||
expect(sqlAnalysisDialectForDriver('clickhouse')).toBe('clickhouse');
|
||||
expect(sqlAnalysisDialectForDriver('databricks')).toBe('databricks');
|
||||
expect(sqlAnalysisDialectForDriver('athena')).toBe('athena');
|
||||
});
|
||||
|
||||
it('maps local connection-type spellings to sqlglot dialects', () => {
|
||||
|
|
|
|||
|
|
@ -243,6 +243,7 @@ describe('setup databases step', () => {
|
|||
{ value: 'mysql', label: 'MySQL' },
|
||||
{ value: 'clickhouse', label: 'ClickHouse' },
|
||||
{ value: 'sqlserver', label: 'SQL Server' },
|
||||
{ value: 'athena', label: 'Amazon Athena' },
|
||||
{ value: 'mongodb', label: 'MongoDB' },
|
||||
{ value: 'sqlite', label: 'SQLite' },
|
||||
{ value: 'duckdb', label: 'DuckDB' },
|
||||
|
|
@ -618,6 +619,29 @@ describe('setup databases step', () => {
|
|||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
driver: 'athena',
|
||||
textValues: ['', 'us-east-1', 's3://my-bucket/athena-results/', '', ''],
|
||||
expectedTextPrompts: [
|
||||
{
|
||||
message: connectionNamePrompt('Amazon Athena'),
|
||||
placeholder: 'athena-warehouse',
|
||||
initialValue: 'athena-warehouse',
|
||||
},
|
||||
{
|
||||
message: 'AWS region\nFor example us-east-1.',
|
||||
},
|
||||
{
|
||||
message: 'S3 staging directory\nAthena writes query results here. For example s3://my-bucket/athena-results/.',
|
||||
},
|
||||
{
|
||||
message: 'Athena workgroup (optional)\nPress Enter to use the default workgroup "primary".',
|
||||
},
|
||||
{
|
||||
message: 'Glue Data Catalog name (optional)\nPress Enter to use the default "AwsDataCatalog".',
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
for (const testCase of cases) {
|
||||
|
|
@ -1967,6 +1991,40 @@ describe('setup databases step', () => {
|
|||
expect(project.config.connections['clickhouse-warehouse']).not.toHaveProperty('schemas');
|
||||
});
|
||||
|
||||
// In non-interactive mode the scripted `databaseSchemas` input for an athena connection must
// be persisted under `databases`, not `schemas`.
it('maps Athena scripted database schema input to databases field', async () => {
  const initialYaml = [
    'connections:',
    '  athena-warehouse:',
    '    driver: athena',
    '    region: us-east-1',
    '    s3_staging_dir: s3://my-bucket/athena-results/',
    '',
  ].join('\n');
  await writeFile(join(tempDir, 'ktx.yaml'), initialYaml, 'utf-8');

  // Both hooks report exit code 0 so the step proceeds to write the scope.
  const hooks = { testConnection: vi.fn(async () => 0), scanConnection: vi.fn(async () => 0) };
  await runKtxSetupDatabasesStep(
    {
      projectDir: tempDir,
      inputMode: 'disabled',
      skipDatabases: false,
      databaseConnectionIds: ['athena-warehouse'],
      databaseSchemas: ['analytics', 'raw'],
    },
    makeIo().io,
    hooks,
  );

  const project = await loadKtxProject({ projectDir: tempDir });
  const savedConnection = project.config.connections['athena-warehouse'];

  expect(savedConnection).toMatchObject({
    driver: 'athena',
    databases: ['analytics', 'raw'],
  });
  expect(savedConnection).not.toHaveProperty('schemas');
});
|
||||
|
||||
it('does not prompt for a bootstrap BigQuery dataset before scope discovery', async () => {
|
||||
const prompts = makePromptAdapter({
|
||||
multiselectValues: [['bigquery']],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue