fix(gdrive): validate folder access, run config test, harden Drive API (#321)

* fix(gdrive): validate folder access, run config test, harden Drive API

Connection test and setup validation now verify folder_id resolves to an accessible Drive folder before counting Docs, via a shared verifyGdriveFolderAndCountDocs helper, so a wrong or unshared folder fails instead of passing with 0 docs.

Move gdrive-config.test.ts under test/ so Vitest's test/** glob actually runs it; escape folder_id in the Drive query; add retry/backoff on transient Google API responses; and record skipped non-Google-Doc files in the staged manifest.

* chore: sync uv.lock to ktx-daemon/ktx-sl 0.13.1
This commit is contained in:
Andrey Avtomonov 2026-06-28 01:02:37 +02:00 committed by GitHub
parent 5645dc4d28
commit ca231df5fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 346 additions and 65 deletions

View file

@ -6,8 +6,8 @@ import { type NotionBotInfo, NotionClient } from './context/ingest/adapters/noti
import { parseGdriveConnectionConfig, resolveGdriveServiceAccountKey } from './context/connections/gdrive-config.js';
import { createLocalLookerCredentialResolver } from './context/ingest/adapters/looker/local-looker.adapter.js';
import { metabaseRuntimeConfigFromLocalConnection } from './context/ingest/adapters/metabase/local-metabase.adapter.js';
import { createGoogleDocsClients } from './context/ingest/adapters/gdrive/gdrive-client.js';
import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js';
import { createGoogleDocsClients, verifyGdriveFolderAndCountDocs } from './context/ingest/adapters/gdrive/gdrive-client.js';
import { gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js';
import { testRepoConnection } from './context/ingest/repo-fetch.js';
import { federatedConnectionListing } from './context/connections/federation.js';
import { getDriverRegistration } from './context/connections/drivers.js';
@ -36,7 +36,7 @@ type LookerTestPort = Pick<LookerClient, 'testConnection'>;
type NotionTestPort = Pick<NotionClient, 'retrieveBotUser'>;
// Subset of the Drive client (taken from the `drive` member returned by createGoogleDocsClients)
// that the gdrive connection test depends on.
// NOTE(review): this span is rendered from a diff — the bare 'listFiles' line is the pre-change
// member list and 'listFiles' | 'getFile' is its replacement.
type GdriveTestPort = Pick<
ReturnType<typeof createGoogleDocsClients>['drive'],
'listFiles'
'listFiles' | 'getFile'
>;
type TestRepoConnection = typeof testRepoConnection;
@ -217,12 +217,7 @@ async function testGdriveConnection(
}
const parsed = parseGdriveConnectionConfig(connection);
const client = await createClient(project, connectionId);
const result = await client.listFiles({
q: `'${parsed.folder_id}' in parents and trashed = false`,
});
return {
docs: result.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length,
};
return { docs: await verifyGdriveFolderAndCountDocs(client, parsed.folder_id) };
}
interface GitConnectionFields {

View file

@ -1,71 +0,0 @@
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
gdriveConnectionToPullConfig,
parseGdriveConnectionConfig,
resolveGdriveServiceAccountKey,
} from './gdrive-config.js';
/**
 * Covers the standalone gdrive connection config helpers: parsing with defaults, rejection of
 * non-file key references, key-file resolution, and conversion into the adapter pull config.
 */
describe('standalone gdrive connection config', () => {
  let tempDir: string;

  /** Writes the fixture service-account key into this test's temp dir and returns its path. */
  async function writeKeyFixture(): Promise<string> {
    const keyPath = join(tempDir, 'google-key.json');
    await writeFile(keyPath, '{"client_email":"bot@example.com","private_key":"line-1"}\n', 'utf-8'); // pragma: allowlist secret
    return keyPath;
  }

  // Every test gets a fresh scratch directory that is removed afterwards.
  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-config-'));
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('parses config with safe defaults', () => {
    const actual = parseGdriveConnectionConfig({
      driver: 'gdrive',
      service_account_key_ref: 'file:/tmp/google-key.json', // pragma: allowlist secret
      folder_id: 'folder-123',
    });

    // `recursive` is not supplied above, so it must come back as the default.
    expect(actual).toEqual({
      driver: 'gdrive',
      service_account_key_ref: 'file:/tmp/google-key.json', // pragma: allowlist secret
      folder_id: 'folder-123',
      recursive: false,
    });
  });

  it('requires file-based service account keys', () => {
    const parseEnvReference = () =>
      parseGdriveConnectionConfig({
        driver: 'gdrive',
        service_account_key_ref: 'env:GOOGLE_KEY', // pragma: allowlist secret
        folder_id: 'folder-123',
      });

    expect(parseEnvReference).toThrow('gdrive service_account_key_ref must use file:/path/to/key.json');
  });

  it('resolves service account key files', async () => {
    const keyPath = await writeKeyFixture();
    const resolved = resolveGdriveServiceAccountKey(`file:${keyPath}`);

    await expect(resolved).resolves.toContain('"client_email":"bot@example.com"');
  });

  it('converts config into adapter pull config', async () => {
    const keyPath = await writeKeyFixture();
    const connection = parseGdriveConnectionConfig({
      driver: 'gdrive',
      service_account_key_ref: `file:${keyPath}`, // pragma: allowlist secret
      folder_id: 'folder-123',
      recursive: true,
    });

    const pullConfig = await gdriveConnectionToPullConfig(connection);

    expect(pullConfig).toEqual({
      serviceAccountKey: '{"client_email":"bot@example.com","private_key":"line-1"}', // pragma: allowlist secret
      folderId: 'folder-123',
      recursive: true,
    });
  });
});

View file

@ -1,10 +1,10 @@
import { createHash } from 'node:crypto';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import { createGoogleDocsClients } from './gdrive-client.js';
import { createGoogleDocsClients, driveFolderChildrenQuery } from './gdrive-client.js';
import { normalizeGoogleDocToMarkdown } from './normalize.js';
import type { GdriveFileRecord, GdriveManifest, GdrivePullConfig } from './types.js';
import { GDRIVE_DOC_MIME_TYPE, GDRIVE_SOURCE_KEY } from './types.js';
import { GDRIVE_DOC_MIME_TYPE, GDRIVE_FOLDER_MIME_TYPE, GDRIVE_SOURCE_KEY } from './types.js';
async function writeJson(path: string, value: unknown): Promise<void> {
await mkdir(dirname(path), { recursive: true });
@ -39,32 +39,52 @@ function gdriveDocDirName(title: string, fileId: string): string {
return `${compactSegment(title)}-${shortHash(fileId)}`;
}
/** A Google Doc found while listing a Drive folder, together with where it was found. */
interface GdriveDocRecord {
  /** Drive metadata for the document itself. */
  file: GdriveFileRecord;
  /** Id of the folder whose listing produced this file. */
  folderId: string;
  /** Names of the sub-folders descended through to reach the file; empty for the starting folder. */
  drivePath: string[];
}
/** A listed Drive file that was not ingested, with the reason it was passed over. */
interface GdriveSkippedFile {
  /** Human-readable explanation of why the file was skipped. */
  reason: string;
  /** Drive id of the skipped file. */
  externalId: string;
}
/** Outcome of listing a folder: the Google Docs to ingest plus the files that were passed over. */
interface ListFolderResult {
  /** Files that were listed but not ingested. */
  skipped: GdriveSkippedFile[];
  /** Google Docs found in the listing. */
  docs: GdriveDocRecord[];
}
/**
 * Lists the non-trashed children of `folderId` page by page and sorts them into Google Docs
 * (`docs`) and unsupported files (`skipped`).
 *
 * Sub-folders are never reported themselves; when `recursive` is true they are descended into,
 * and `parents` carries the folder names walked so far, which become each doc's `drivePath`.
 *
 * NOTE(review): this span is rendered from a diff, so superseded pre-change lines (the inline
 * return type, the hand-built `q`, the `records` array, and the literal folder MIME string) sit
 * directly next to their replacements.
 */
async function listFolderFiles(
drive: ReturnType<typeof createGoogleDocsClients>['drive'],
folderId: string,
recursive: boolean,
parents: string[] = [],
): Promise<Array<{ file: GdriveFileRecord; drivePath: string[]; folderId: string }>> {
const q = `'${folderId}' in parents and trashed = false`;
const records: Array<{ file: GdriveFileRecord; drivePath: string[]; folderId: string }> = [];
): Promise<ListFolderResult> {
// The query is built by the shared helper, which escapes the folder id.
const q = driveFolderChildrenQuery(folderId);
const docs: GdriveDocRecord[] = [];
const skipped: GdriveSkippedFile[] = [];
let pageToken: string | undefined;
// Keep requesting pages until the API stops returning a continuation token.
do {
const page = await drive.listFiles({ q, pageToken });
for (const file of page.files) {
if (file.mimeType === 'application/vnd.google-apps.folder') {
if (file.mimeType === GDRIVE_FOLDER_MIME_TYPE) {
if (recursive) {
records.push(...(await listFolderFiles(drive, file.id, true, [...parents, file.name])));
// Merge the nested folder's docs and skipped files into this level's result.
const nested = await listFolderFiles(drive, file.id, true, [...parents, file.name]);
docs.push(...nested.docs);
skipped.push(...nested.skipped);
}
continue;
}
// Anything that is neither a folder nor a Google Doc is recorded as skipped, not dropped.
if (file.mimeType !== GDRIVE_DOC_MIME_TYPE) {
skipped.push({ externalId: file.id, reason: `unsupported mime type: ${file.mimeType}` });
continue;
}
records.push({ file, drivePath: parents, folderId });
docs.push({ file, drivePath: parents, folderId });
}
pageToken = page.nextPageToken ?? undefined;
} while (pageToken);
return records;
return { docs, skipped };
}
export async function fetchGdriveSnapshot(params: {
@ -74,7 +94,7 @@ export async function fetchGdriveSnapshot(params: {
}): Promise<GdriveManifest> {
await mkdir(params.stagedDir, { recursive: true });
const clients = createGoogleDocsClients(params.key);
const docs = await listFolderFiles(clients.drive, params.config.folderId, params.config.recursive);
const { docs, skipped } = await listFolderFiles(clients.drive, params.config.folderId, params.config.recursive);
for (const { file, drivePath, folderId } of docs) {
const document = await clients.docs.getDocument(file.id);
@ -101,8 +121,11 @@ export async function fetchGdriveSnapshot(params: {
recursive: params.config.recursive,
fetchedAt: new Date().toISOString(),
fileCount: docs.length,
skipped: [],
warnings: [],
skipped,
warnings:
skipped.length > 0
? [`Skipped ${skipped.length} non-Google-Doc file(s); only Google Docs are ingested in v1.`]
: [],
};
await writeJson(join(params.stagedDir, 'manifest.json'), manifest);
return manifest;

View file

@ -1,21 +1,13 @@
import { JWT } from 'google-auth-library';
import type { GdriveFileRecord, GdriveServiceAccountKey, GoogleDocsDocument } from './types.js';
import { GDRIVE_SCOPES, gdriveServiceAccountKeySchema } from './types.js';
import { GDRIVE_DOC_MIME_TYPE, GDRIVE_FOLDER_MIME_TYPE, GDRIVE_SCOPES, gdriveServiceAccountKeySchema } from './types.js';
// Base URL of the Drive v3 REST API, used for file listing and single-file metadata requests.
const GOOGLE_DRIVE_BASE_URL = 'https://www.googleapis.com/drive/v3';
// Base URL of the Docs v1 REST API.
const GOOGLE_DOCS_BASE_URL = 'https://docs.googleapis.com/v1';
// Metadata fields requested for each Drive file; shared by the list and single-file requests.
const GOOGLE_FILE_FIELDS = 'id,name,mimeType,parents,webViewLink,modifiedTime';
/** Raw shape of a Drive file-list response; every field may be absent in the payload. */
interface GoogleApiListResponse {
  /** Continuation token, present when more pages exist. */
  nextPageToken?: string;
  /** File entries on this page, as returned by the API before validation. */
  files?: Array<{
    id?: string;
    name?: string;
    mimeType?: string;
    parents?: string[];
    webViewLink?: string;
    modifiedTime?: string;
  }>;
}
// HTTP statuses treated as transient and therefore retried: request timeout (408),
// rate limiting (429), and server/gateway errors (500, 502, 503, 504).
const RETRYABLE_STATUSES = new Set([408, 429, 500, 502, 503, 504]);
// Default cap on total attempts per request (the first try plus retries); callers may override it.
const MAX_REQUEST_ATTEMPTS = 4;
interface GoogleApiFile {
id?: string;
@ -26,6 +18,50 @@ interface GoogleApiFile {
modifiedTime?: string;
}
/** Raw shape of a Drive file-list response; both fields may be absent in the payload. */
interface GoogleApiListResponse {
  /** Continuation token, present when more pages exist. */
  nextPageToken?: string;
  /** File entries on this page, as returned by the API before validation. */
  files?: Array<GoogleApiFile>;
}
/** Read-only Drive operations the gdrive adapter relies on. */
export interface GoogleDriveClient {
  /** Looks up metadata for a single file id; resolves to `null` when no usable record is available. */
  getFile(fileId: string): Promise<GdriveFileRecord | null>;

  /**
   * Runs the Drive query `q` and resolves to one page of results. `nextPageToken` is `null`
   * on the last page; otherwise pass it back as `pageToken` to fetch the following page.
   */
  listFiles(args: {
    q: string;
    pageToken?: string;
  }): Promise<{
    files: GdriveFileRecord[];
    nextPageToken: string | null;
  }>;
}
/** The pair of Google API clients handed out together: Drive for discovery, Docs for content. */
export interface GoogleDocsClients {
  /** Docs operations. */
  docs: {
    /** Fetches the full document identified by `documentId`. */
    getDocument(documentId: string): Promise<GoogleDocsDocument>;
  };
  /** Drive listing and metadata operations. */
  drive: GoogleDriveClient;
}
/** Resolves after roughly `ms` milliseconds; the default wait used between retry attempts. */
function defaultSleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Computes how long to wait before the next retry.
 *
 * A usable `Retry-After` header takes precedence and is capped at 30 seconds. Both forms the
 * header may take are understood: delta-seconds (`"5"`) and an HTTP-date
 * (`"Wed, 21 Oct 2026 07:28:00 GMT"`), the latter measured against the local clock. When the
 * header is absent, negative, unparseable, or names a moment that has already passed, the delay
 * falls back to exponential backoff: 500 ms * 2^attempt, capped at 8 seconds.
 *
 * @param attempt Zero-based index of the retry about to be made.
 * @param retryAfterHeader Raw `Retry-After` header value, or `null` when the header is absent.
 * @returns The delay in milliseconds.
 */
function retryDelayMs(attempt: number, retryAfterHeader: string | null): number {
  const maxRetryAfterMs = 30_000;
  if (retryAfterHeader) {
    const retryAfterSeconds = Number.parseInt(retryAfterHeader, 10);
    if (Number.isFinite(retryAfterSeconds)) {
      if (retryAfterSeconds >= 0) {
        return Math.min(retryAfterSeconds * 1000, maxRetryAfterMs);
      }
    } else {
      // Not numeric: the header may be an HTTP-date naming the moment to retry at.
      const msUntilRetry = Date.parse(retryAfterHeader) - Date.now();
      if (Number.isFinite(msUntilRetry) && msUntilRetry > 0) {
        return Math.min(msUntilRetry, maxRetryAfterMs);
      }
    }
  }
  return Math.min(500 * 2 ** attempt, 8_000);
}
/**
 * @internal Retries transient Google API responses (408/429/5xx), honoring Retry-After.
 *
 * Calls `doFetch` once and, while the response is a retryable failure and attempts remain,
 * waits and calls it again. The last response is returned whether or not it succeeded, so
 * callers still inspect `ok`/`status` themselves. Exceptions thrown by `doFetch` are not
 * retried and propagate to the caller.
 *
 * @param doFetch Performs one request attempt; invoked afresh for every attempt.
 * @param options.maxAttempts Total attempts allowed, including the first (defaults to the module limit).
 * @param options.sleep Wait function, injectable so tests need not really sleep.
 * @returns The first non-retryable response, or the final response once attempts run out.
 */
export async function fetchWithGoogleRetry(
  doFetch: () => Promise<Response>,
  options: { maxAttempts?: number; sleep?: (ms: number) => Promise<void> } = {},
): Promise<Response> {
  const maxAttempts = options.maxAttempts ?? MAX_REQUEST_ATTEMPTS;
  const sleep = options.sleep ?? defaultSleep;
  let response = await doFetch();
  for (let attempt = 1; attempt < maxAttempts && !response.ok && RETRYABLE_STATUSES.has(response.status); attempt += 1) {
    // Read the header before discarding the response.
    const delayMs = retryDelayMs(attempt - 1, response.headers.get('retry-after'));
    // This response is being thrown away; cancel its unread body so the underlying
    // connection and buffers are released now rather than whenever GC gets to it.
    try {
      await response.body?.cancel();
    } catch {
      // Best effort: a body that cannot be cancelled must not prevent the retry.
    }
    await sleep(delayMs);
    response = await doFetch();
  }
  return response;
}
async function parseGoogleResponse<T>(response: Response): Promise<T> {
if (!response.ok) {
const body = await response.text();
@ -35,8 +71,10 @@ async function parseGoogleResponse<T>(response: Response): Promise<T> {
}
// Performs a GET on `url` with request headers obtained from the JWT client.
// NOTE(review): this span is rendered from a diff — the first `const headers`/`return fetch` pair is
// the pre-change body; the replacement wraps the same two steps in fetchWithGoogleRetry so that
// headers are requested again on every attempt.
async function authorizedFetch(client: JWT, url: string): Promise<Response> {
const headers = await client.getRequestHeaders(url);
return fetch(url, { headers });
return fetchWithGoogleRetry(async () => {
const headers = await client.getRequestHeaders(url);
return fetch(url, { headers });
});
}
function isGoogleApiFileRecord(file: GoogleApiFile): file is GoogleApiFile & {
@ -47,14 +85,55 @@ function isGoogleApiFileRecord(file: GoogleApiFile): file is GoogleApiFile & {
return typeof file.id === 'string' && typeof file.name === 'string' && typeof file.mimeType === 'string';
}
// NOTE(review): this span is rendered from a diff. The `export function createGoogleDocsClients`
// line, the `drive: {` / `listFiles(...)` lines below it, and the `docs: { ... }; } {` lines further
// down are the removed inline return type of createGoogleDocsClients; the added toFileRecord
// helper is interleaved with them.
export function createGoogleDocsClients(rawKey: unknown): {
drive: {
listFiles(args: { q: string; pageToken?: string }): Promise<{ files: GdriveFileRecord[]; nextPageToken: string | null }>;
// Converts an API file entry whose id, name, and mimeType are already known to be strings into a
// GdriveFileRecord, normalizing the optional fields: `parents` keeps only string entries (empty
// array when missing), and non-string `webViewLink` / `modifiedTime` become null.
function toFileRecord(file: GoogleApiFile & { id: string; name: string; mimeType: string }): GdriveFileRecord {
return {
id: file.id,
name: file.name,
mimeType: file.mimeType,
parents: Array.isArray(file.parents) ? file.parents.filter((parent): parent is string => typeof parent === 'string') : [],
webViewLink: typeof file.webViewLink === 'string' ? file.webViewLink : null,
modifiedTime: typeof file.modifiedTime === 'string' ? file.modifiedTime : null,
};
docs: {
getDocument(documentId: string): Promise<GoogleDocsDocument>;
};
} {
}
/** Backslash-escapes every `\` and `'` so `value` can sit inside a single-quoted Drive query literal. */
function escapeDriveQueryValue(value: string): string {
  // One pass over both special characters; a function replacer keeps `$` sequences literal.
  return value.replace(/[\\']/g, (special) => `\\${special}`);
}
/**
 * Returns the Drive search query selecting the direct, non-trashed children of `folderId`.
 * The id is escaped first, so quotes or backslashes in it cannot break out of the literal.
 */
export function driveFolderChildrenQuery(folderId: string): string {
  const escapedId = escapeDriveQueryValue(folderId);
  return `'${escapedId}' in parents and trashed = false`;
}
/**
 * Checks that `folderId` points at a Drive folder visible to the client and returns how many
 * Google Docs are direct children of it, walking every result page.
 *
 * @param drive Drive client used for the metadata lookup and the listing.
 * @param folderId Id expected to identify a folder.
 * @returns The number of Google Docs directly inside the folder.
 * @throws Error when the lookup yields no file, or the file is not a folder.
 */
export async function verifyGdriveFolderAndCountDocs(
  drive: GoogleDriveClient,
  folderId: string,
): Promise<number> {
  const target = await drive.getFile(folderId);
  if (!target) {
    throw new Error(
      `Google Drive folder "${folderId}" is not accessible. Share it with the service account email and verify folder_id.`,
    );
  }

  const { mimeType } = target;
  if (mimeType !== GDRIVE_FOLDER_MIME_TYPE) {
    throw new Error(`Google Drive id "${folderId}" is not a folder (mimeType: ${mimeType}).`);
  }

  const childrenQuery = driveFolderChildrenQuery(folderId);
  let docCount = 0;
  let cursor: string | undefined;
  for (;;) {
    const { files, nextPageToken } = await drive.listFiles({ q: childrenQuery, pageToken: cursor });
    for (const entry of files) {
      if (entry.mimeType === GDRIVE_DOC_MIME_TYPE) {
        docCount += 1;
      }
    }
    // A missing or empty token marks the last page.
    if (!nextPageToken) {
      break;
    }
    cursor = nextPageToken;
  }
  return docCount;
}
export function createGoogleDocsClients(rawKey: unknown): GoogleDocsClients {
const key = gdriveServiceAccountKeySchema.parse(rawKey) satisfies GdriveServiceAccountKey;
const client = new JWT({
email: key.client_email,
@ -70,7 +149,7 @@ export function createGoogleDocsClients(rawKey: unknown): {
supportsAllDrives: 'true',
includeItemsFromAllDrives: 'true',
pageSize: '1000',
fields: 'nextPageToken,files(id,name,mimeType,parents,webViewLink,modifiedTime)',
fields: `nextPageToken,files(${GOOGLE_FILE_FIELDS})`,
});
if (args.pageToken) {
params.set('pageToken', args.pageToken);
@ -78,19 +157,22 @@ export function createGoogleDocsClients(rawKey: unknown): {
const response = await authorizedFetch(client, `${GOOGLE_DRIVE_BASE_URL}/files?${params.toString()}`);
const parsed = await parseGoogleResponse<GoogleApiListResponse>(response);
return {
files: (parsed.files ?? [])
.filter(isGoogleApiFileRecord)
.map((file) => ({
id: file.id,
name: file.name,
mimeType: file.mimeType,
parents: Array.isArray(file.parents) ? file.parents.filter((parent): parent is string => typeof parent === 'string') : [],
webViewLink: typeof file.webViewLink === 'string' ? file.webViewLink : null,
modifiedTime: typeof file.modifiedTime === 'string' ? file.modifiedTime : null,
})),
files: (parsed.files ?? []).filter(isGoogleApiFileRecord).map(toFileRecord),
nextPageToken: typeof parsed.nextPageToken === 'string' ? parsed.nextPageToken : null,
};
},
async getFile(fileId: string) {
const params = new URLSearchParams({ supportsAllDrives: 'true', fields: GOOGLE_FILE_FIELDS });
const response = await authorizedFetch(
client,
`${GOOGLE_DRIVE_BASE_URL}/files/${encodeURIComponent(fileId)}?${params.toString()}`,
);
if (response.status === 404) {
return null;
}
const file = await parseGoogleResponse<GoogleApiFile>(response);
return isGoogleApiFileRecord(file) ? toFileRecord(file) : null;
},
},
docs: {
async getDocument(documentId: string) {

View file

@ -5,6 +5,7 @@ const GDRIVE_DRIVE_SCOPE = 'https://www.googleapis.com/auth/drive.readonly';
// OAuth scopes used by the gdrive adapter: the Drive scope plus the Docs scope.
export const GDRIVE_SCOPES = [GDRIVE_DRIVE_SCOPE, GDRIVE_DOCS_SCOPE] as const;
// Source key identifying the gdrive adapter.
export const GDRIVE_SOURCE_KEY = 'gdrive';
// MIME type Drive reports for native Google Docs documents.
export const GDRIVE_DOC_MIME_TYPE = 'application/vnd.google-apps.document';
// MIME type Drive reports for folders.
export const GDRIVE_FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder';
export const gdrivePullConfigSchema = z.object({
serviceAccountKey: z.string().min(1),

View file

@ -11,8 +11,9 @@ import { resolveNotionConnectionAuthToken } from './context/connections/notion-c
import { resolveKtxConfigReference } from './context/core/config-reference.js';
import {
createGoogleDocsClients,
verifyGdriveFolderAndCountDocs,
} from './context/ingest/adapters/gdrive/gdrive-client.js';
import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js';
import { gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js';
import { cloneOrPull, testRepoConnection } from './context/ingest/repo-fetch.js';
import { DEFAULT_METABASE_CLIENT_CONFIG, MetabaseClient } from './context/ingest/adapters/metabase/client.js';
import { discoverMetabaseDatabases, type DiscoveredMetabaseDatabase } from './context/ingest/adapters/metabase/mapping.js';
@ -716,10 +717,7 @@ async function defaultValidateGdrive(connection: KtxProjectConnectionConfig): Pr
const config = parseGdriveConnectionConfig(connection);
const keyText = await resolveGdriveServiceAccountKey(config.service_account_key_ref);
const clients = createGoogleDocsClients(gdriveServiceAccountKeySchema.parse(JSON.parse(keyText)));
const result = await clients.drive.listFiles({
q: `'${config.folder_id}' in parents and trashed = false`,
});
const docs = result.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length;
const docs = await verifyGdriveFolderAndCountDocs(clients.drive, config.folder_id);
return { ok: true, detail: `docs=${docs}` };
}