diff --git a/docs-site/content/docs/integrations/context-sources.mdx b/docs-site/content/docs/integrations/context-sources.mdx index 918fd07a..ed789266 100644 --- a/docs-site/content/docs/integrations/context-sources.mdx +++ b/docs-site/content/docs/integrations/context-sources.mdx @@ -438,7 +438,8 @@ connections: - `gdrive` is knowledge-only in v1; it does not produce semantic layer sources - `ktx setup` supports Google Drive configuration, including the service-account key ref, folder id, and recursive crawl flag -- `ktx connection test ` supports `gdrive` and reports the number of Google Docs visible in the configured folder +- `ktx connection test ` supports `gdrive`: it verifies that `folder_id` resolves to a folder the service account can read, then reports the number of Google Docs visible in it. A wrong or unshared `folder_id` fails the test instead of reporting zero docs +- Only Google Docs are ingested in v1; other file types (Sheets, Slides, PDFs) in the folder are skipped and recorded in the staged manifest - The service account must be granted access to the target folder explicitly ## Common errors diff --git a/packages/cli/src/connection.ts b/packages/cli/src/connection.ts index 0fa0ee5a..f134a4a4 100644 --- a/packages/cli/src/connection.ts +++ b/packages/cli/src/connection.ts @@ -6,8 +6,8 @@ import { type NotionBotInfo, NotionClient } from './context/ingest/adapters/noti import { parseGdriveConnectionConfig, resolveGdriveServiceAccountKey } from './context/connections/gdrive-config.js'; import { createLocalLookerCredentialResolver } from './context/ingest/adapters/looker/local-looker.adapter.js'; import { metabaseRuntimeConfigFromLocalConnection } from './context/ingest/adapters/metabase/local-metabase.adapter.js'; -import { createGoogleDocsClients } from './context/ingest/adapters/gdrive/gdrive-client.js'; -import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js'; +import { createGoogleDocsClients, verifyGdriveFolderAndCountDocs } from './context/ingest/adapters/gdrive/gdrive-client.js'; +import { gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js'; import { testRepoConnection } from './context/ingest/repo-fetch.js'; import { federatedConnectionListing } from './context/connections/federation.js'; import { getDriverRegistration } from './context/connections/drivers.js'; @@ -36,7 +36,7 @@ type LookerTestPort = Pick; type NotionTestPort = Pick; type GdriveTestPort = Pick< ReturnType['drive'], - 'listFiles' + 'listFiles' | 'getFile' >; type TestRepoConnection = typeof testRepoConnection; @@ -217,12 +217,7 @@ async function testGdriveConnection( } const parsed = parseGdriveConnectionConfig(connection); const client = await createClient(project, connectionId); - const result = await client.listFiles({ - q: `'${parsed.folder_id}' in parents and trashed = false`, - }); - return { - docs: result.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length, - }; + return { docs: await verifyGdriveFolderAndCountDocs(client, parsed.folder_id) }; } interface GitConnectionFields { diff --git a/packages/cli/src/context/ingest/adapters/gdrive/fetch.ts b/packages/cli/src/context/ingest/adapters/gdrive/fetch.ts index 288bc664..3b5e910d 100644 --- a/packages/cli/src/context/ingest/adapters/gdrive/fetch.ts +++ b/packages/cli/src/context/ingest/adapters/gdrive/fetch.ts @@ -1,10 +1,10 @@ import { createHash } from 'node:crypto'; import { mkdir, writeFile } from 'node:fs/promises'; import { dirname, join } from 'node:path'; -import { createGoogleDocsClients } from './gdrive-client.js'; +import { createGoogleDocsClients, driveFolderChildrenQuery } from './gdrive-client.js'; import { normalizeGoogleDocToMarkdown } from './normalize.js'; import type { GdriveFileRecord, GdriveManifest, GdrivePullConfig } from './types.js'; -import { GDRIVE_DOC_MIME_TYPE, GDRIVE_SOURCE_KEY } from './types.js'; +import { GDRIVE_DOC_MIME_TYPE, GDRIVE_FOLDER_MIME_TYPE, GDRIVE_SOURCE_KEY } from './types.js'; async function writeJson(path: string, value: unknown): Promise { await mkdir(dirname(path), { recursive: true }); @@ -39,32 +39,52 @@ function gdriveDocDirName(title: string, fileId: string): string { return `${compactSegment(title)}-${shortHash(fileId)}`; } +interface GdriveDocRecord { + file: GdriveFileRecord; + drivePath: string[]; + folderId: string; +} + +interface GdriveSkippedFile { + externalId: string; + reason: string; +} + +interface ListFolderResult { + docs: GdriveDocRecord[]; + skipped: GdriveSkippedFile[]; +} + async function listFolderFiles( drive: ReturnType['drive'], folderId: string, recursive: boolean, parents: string[] = [], -): Promise> { - const q = `'${folderId}' in parents and trashed = false`; - const records: Array<{ file: GdriveFileRecord; drivePath: string[]; folderId: string }> = []; +): Promise { + const q = driveFolderChildrenQuery(folderId); + const docs: GdriveDocRecord[] = []; + const skipped: GdriveSkippedFile[] = []; let pageToken: string | undefined; do { const page = await drive.listFiles({ q, pageToken }); for (const file of page.files) { - if (file.mimeType === 'application/vnd.google-apps.folder') { + if (file.mimeType === GDRIVE_FOLDER_MIME_TYPE) { if (recursive) { - records.push(...(await listFolderFiles(drive, file.id, true, [...parents, file.name]))); + const nested = await listFolderFiles(drive, file.id, true, [...parents, file.name]); + docs.push(...nested.docs); + skipped.push(...nested.skipped); } continue; } if (file.mimeType !== GDRIVE_DOC_MIME_TYPE) { + skipped.push({ externalId: file.id, reason: `unsupported mime type: ${file.mimeType}` }); continue; } - records.push({ file, drivePath: parents, folderId }); + docs.push({ file, drivePath: parents, folderId }); } pageToken = page.nextPageToken ?? undefined; } while (pageToken); - return records; + return { docs, skipped }; } export async function fetchGdriveSnapshot(params: { @@ -74,7 +94,7 @@ export async function fetchGdriveSnapshot(params: { }): Promise { await mkdir(params.stagedDir, { recursive: true }); const clients = createGoogleDocsClients(params.key); - const docs = await listFolderFiles(clients.drive, params.config.folderId, params.config.recursive); + const { docs, skipped } = await listFolderFiles(clients.drive, params.config.folderId, params.config.recursive); for (const { file, drivePath, folderId } of docs) { const document = await clients.docs.getDocument(file.id); @@ -101,8 +121,11 @@ export async function fetchGdriveSnapshot(params: { recursive: params.config.recursive, fetchedAt: new Date().toISOString(), fileCount: docs.length, - skipped: [], - warnings: [], + skipped, + warnings: + skipped.length > 0 + ? [`Skipped ${skipped.length} non-Google-Doc file(s); only Google Docs are ingested in v1.`] + : [], }; await writeJson(join(params.stagedDir, 'manifest.json'), manifest); return manifest; diff --git a/packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts b/packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts index 4b66bc7a..d595454b 100644 --- a/packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts +++ b/packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts @@ -1,21 +1,13 @@ import { JWT } from 'google-auth-library'; import type { GdriveFileRecord, GdriveServiceAccountKey, GoogleDocsDocument } from './types.js'; -import { GDRIVE_SCOPES, gdriveServiceAccountKeySchema } from './types.js'; +import { GDRIVE_DOC_MIME_TYPE, GDRIVE_FOLDER_MIME_TYPE, GDRIVE_SCOPES, gdriveServiceAccountKeySchema } from './types.js'; const GOOGLE_DRIVE_BASE_URL = 'https://www.googleapis.com/drive/v3'; const GOOGLE_DOCS_BASE_URL = 'https://docs.googleapis.com/v1'; +const GOOGLE_FILE_FIELDS = 'id,name,mimeType,parents,webViewLink,modifiedTime'; -interface GoogleApiListResponse { - files?: Array<{ - id?: string; - name?: string; - mimeType?: string; - parents?: string[]; - webViewLink?: string; - modifiedTime?: string; - }>; - nextPageToken?: string; -} +const RETRYABLE_STATUSES = new Set([408, 429, 500, 502, 503, 504]); +const MAX_REQUEST_ATTEMPTS = 4; interface GoogleApiFile { id?: string; @@ -26,6 +18,50 @@ interface GoogleApiFile { modifiedTime?: string; } +interface GoogleApiListResponse { + files?: GoogleApiFile[]; + nextPageToken?: string; +} + +export interface GoogleDriveClient { + listFiles(args: { q: string; pageToken?: string }): Promise<{ files: GdriveFileRecord[]; nextPageToken: string | null }>; + getFile(fileId: string): Promise; +} + +export interface GoogleDocsClients { + drive: GoogleDriveClient; + docs: { + getDocument(documentId: string): Promise; + }; +} + +function defaultSleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function retryDelayMs(attempt: number, retryAfterHeader: string | null): number { + const retryAfterSeconds = retryAfterHeader ? Number.parseInt(retryAfterHeader, 10) : Number.NaN; + if (Number.isFinite(retryAfterSeconds) && retryAfterSeconds >= 0) { + return Math.min(retryAfterSeconds * 1000, 30_000); + } + return Math.min(500 * 2 ** attempt, 8_000); +} + +/** @internal Retries transient Google API responses (429/5xx) honoring Retry-After. */ +export async function fetchWithGoogleRetry( + doFetch: () => Promise, + options: { maxAttempts?: number; sleep?: (ms: number) => Promise } = {}, +): Promise { + const maxAttempts = options.maxAttempts ?? MAX_REQUEST_ATTEMPTS; + const sleep = options.sleep ?? defaultSleep; + let response = await doFetch(); + for (let attempt = 1; attempt < maxAttempts && !response.ok && RETRYABLE_STATUSES.has(response.status); attempt += 1) { + await sleep(retryDelayMs(attempt - 1, response.headers.get('retry-after'))); + response = await doFetch(); + } + return response; +} + async function parseGoogleResponse(response: Response): Promise { if (!response.ok) { const body = await response.text(); @@ -35,8 +71,10 @@ async function parseGoogleResponse(response: Response): Promise { } async function authorizedFetch(client: JWT, url: string): Promise { - const headers = await client.getRequestHeaders(url); - return fetch(url, { headers }); + return fetchWithGoogleRetry(async () => { + const headers = await client.getRequestHeaders(url); + return fetch(url, { headers }); + }); } function isGoogleApiFileRecord(file: GoogleApiFile): file is GoogleApiFile & { @@ -47,14 +85,55 @@ function isGoogleApiFileRecord(file: GoogleApiFile): file is GoogleApiFile & { return typeof file.id === 'string' && typeof file.name === 'string' && typeof file.mimeType === 'string'; } -export function createGoogleDocsClients(rawKey: unknown): { - drive: { - listFiles(args: { q: string; pageToken?: string }): Promise<{ files: GdriveFileRecord[]; nextPageToken: string | null }>; +function toFileRecord(file: GoogleApiFile & { id: string; name: string; mimeType: string }): GdriveFileRecord { + return { + id: file.id, + name: file.name, + mimeType: file.mimeType, + parents: Array.isArray(file.parents) ? file.parents.filter((parent): parent is string => typeof parent === 'string') : [], + webViewLink: typeof file.webViewLink === 'string' ? file.webViewLink : null, + modifiedTime: typeof file.modifiedTime === 'string' ? file.modifiedTime : null, }; - docs: { - getDocument(documentId: string): Promise; - }; -} { +} + +function escapeDriveQueryValue(value: string): string { + return value.replace(/\\/g, '\\\\').replace(/'/g, "\\'"); +} + +/** Builds the Drive query for the non-trashed direct children of a folder, escaping the folder id. */ +export function driveFolderChildrenQuery(folderId: string): string { + return `'${escapeDriveQueryValue(folderId)}' in parents and trashed = false`; +} + +/** + * Confirms `folderId` resolves to a folder the service account can read, then counts the + * Google Docs directly inside it. Throws a caller-facing error when the id is missing or not a folder. + */ +export async function verifyGdriveFolderAndCountDocs( + drive: GoogleDriveClient, + folderId: string, +): Promise { + const folder = await drive.getFile(folderId); + if (!folder) { + throw new Error( + `Google Drive folder "${folderId}" is not accessible. Share it with the service account email and verify folder_id.`, + ); + } + if (folder.mimeType !== GDRIVE_FOLDER_MIME_TYPE) { + throw new Error(`Google Drive id "${folderId}" is not a folder (mimeType: ${folder.mimeType}).`); + } + const q = driveFolderChildrenQuery(folderId); + let docs = 0; + let pageToken: string | undefined; + do { + const page = await drive.listFiles({ q, pageToken }); + docs += page.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length; + pageToken = page.nextPageToken ?? undefined; + } while (pageToken); + return docs; +} + +export function createGoogleDocsClients(rawKey: unknown): GoogleDocsClients { const key = gdriveServiceAccountKeySchema.parse(rawKey) satisfies GdriveServiceAccountKey; const client = new JWT({ email: key.client_email, @@ -70,7 +149,7 @@ export function createGoogleDocsClients(rawKey: unknown): { supportsAllDrives: 'true', includeItemsFromAllDrives: 'true', pageSize: '1000', - fields: 'nextPageToken,files(id,name,mimeType,parents,webViewLink,modifiedTime)', + fields: `nextPageToken,files(${GOOGLE_FILE_FIELDS})`, }); if (args.pageToken) { params.set('pageToken', args.pageToken); @@ -78,19 +157,22 @@ export function createGoogleDocsClients(rawKey: unknown): { const response = await authorizedFetch(client, `${GOOGLE_DRIVE_BASE_URL}/files?${params.toString()}`); const parsed = await parseGoogleResponse(response); return { - files: (parsed.files ?? []) - .filter(isGoogleApiFileRecord) - .map((file) => ({ - id: file.id, - name: file.name, - mimeType: file.mimeType, - parents: Array.isArray(file.parents) ? file.parents.filter((parent): parent is string => typeof parent === 'string') : [], - webViewLink: typeof file.webViewLink === 'string' ? file.webViewLink : null, - modifiedTime: typeof file.modifiedTime === 'string' ? file.modifiedTime : null, - })), + files: (parsed.files ?? []).filter(isGoogleApiFileRecord).map(toFileRecord), nextPageToken: typeof parsed.nextPageToken === 'string' ? parsed.nextPageToken : null, }; }, + async getFile(fileId: string) { + const params = new URLSearchParams({ supportsAllDrives: 'true', fields: GOOGLE_FILE_FIELDS }); + const response = await authorizedFetch( + client, + `${GOOGLE_DRIVE_BASE_URL}/files/${encodeURIComponent(fileId)}?${params.toString()}`, + ); + if (response.status === 404) { + return null; + } + const file = await parseGoogleResponse(response); + return isGoogleApiFileRecord(file) ? toFileRecord(file) : null; + }, }, docs: { async getDocument(documentId: string) { diff --git a/packages/cli/src/context/ingest/adapters/gdrive/types.ts b/packages/cli/src/context/ingest/adapters/gdrive/types.ts index abfd7047..69104194 100644 --- a/packages/cli/src/context/ingest/adapters/gdrive/types.ts +++ b/packages/cli/src/context/ingest/adapters/gdrive/types.ts @@ -5,6 +5,7 @@ const GDRIVE_DRIVE_SCOPE = 'https://www.googleapis.com/auth/drive.readonly'; export const GDRIVE_SCOPES = [GDRIVE_DRIVE_SCOPE, GDRIVE_DOCS_SCOPE] as const; export const GDRIVE_SOURCE_KEY = 'gdrive'; export const GDRIVE_DOC_MIME_TYPE = 'application/vnd.google-apps.document'; +export const GDRIVE_FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder'; export const gdrivePullConfigSchema = z.object({ serviceAccountKey: z.string().min(1), diff --git a/packages/cli/src/setup-sources.ts b/packages/cli/src/setup-sources.ts index 5c071a81..4c6feb26 100644 --- a/packages/cli/src/setup-sources.ts +++ b/packages/cli/src/setup-sources.ts @@ -11,8 +11,9 @@ import { resolveNotionConnectionAuthToken } from './context/connections/notion-c import { resolveKtxConfigReference } from './context/core/config-reference.js'; import { createGoogleDocsClients, + verifyGdriveFolderAndCountDocs, } from './context/ingest/adapters/gdrive/gdrive-client.js'; -import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js'; +import { gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js'; import { cloneOrPull, testRepoConnection } from './context/ingest/repo-fetch.js'; import { DEFAULT_METABASE_CLIENT_CONFIG, MetabaseClient } from './context/ingest/adapters/metabase/client.js'; import { discoverMetabaseDatabases, type DiscoveredMetabaseDatabase } from './context/ingest/adapters/metabase/mapping.js'; @@ -716,10 +717,7 @@ async function defaultValidateGdrive(connection: KtxProjectConnectionConfig): Pr const config = parseGdriveConnectionConfig(connection); const keyText = await resolveGdriveServiceAccountKey(config.service_account_key_ref); const clients = createGoogleDocsClients(gdriveServiceAccountKeySchema.parse(JSON.parse(keyText))); - const result = await clients.drive.listFiles({ - q: `'${config.folder_id}' in parents and trashed = false`, - }); - const docs = result.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length; + const docs = await verifyGdriveFolderAndCountDocs(clients.drive, config.folder_id); return { ok: true, detail: `docs=${docs}` }; } diff --git a/packages/cli/test/connection.test.ts b/packages/cli/test/connection.test.ts index aab39554..beeaad65 100644 --- a/packages/cli/test/connection.test.ts +++ b/packages/cli/test/connection.test.ts @@ -434,7 +434,15 @@ describe('runKtxConnection', () => { ], nextPageToken: null, })); - const createGdriveClient = vi.fn(async () => ({ listFiles })); + const getFile = vi.fn(async () => ({ + id: 'folder-123', + name: 'Docs', + mimeType: 'application/vnd.google-apps.folder', + parents: [], + webViewLink: null, + modifiedTime: null, + })); + const createGdriveClient = vi.fn(async () => ({ listFiles, getFile })); const io = makeIo(); await expect( @@ -442,12 +450,38 @@ describe('runKtxConnection', () => { ).resolves.toBe(0); expect(createGdriveClient).toHaveBeenCalledWith(expect.objectContaining({ projectDir }), 'docs_drive'); - expect(listFiles).toHaveBeenCalledWith({ q: "'folder-123' in parents and trashed = false" }); + expect(getFile).toHaveBeenCalledWith('folder-123'); + expect(listFiles).toHaveBeenCalledWith({ q: "'folder-123' in parents and trashed = false", pageToken: undefined }); expect(io.stdout()).toContain('Connection test passed: docs_drive'); expect(io.stdout()).toContain('Driver: gdrive'); expect(io.stdout()).toContain('Docs: 1'); }); + it('fails a Google Drive connection test when the folder is not accessible', async () => { + const projectDir = join(tempDir, 'project'); + await initKtxProject({ projectDir }); + await writeConnections(projectDir, { + docs_drive: { + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/gdrive-key.json', // pragma: allowlist secret + folder_id: 'missing-folder', + recursive: false, + }, + }); + const listFiles = vi.fn(); + const getFile = vi.fn(async () => null); + const createGdriveClient = vi.fn(async () => ({ listFiles, getFile })); + const io = makeIo(); + + await expect( + runKtxConnection({ command: 'test', projectDir, connectionId: 'docs_drive' }, io.io, { createGdriveClient }), + ).resolves.toBe(1); + + expect(getFile).toHaveBeenCalledWith('missing-folder'); + expect(listFiles).not.toHaveBeenCalled(); + expect(io.stderr()).toContain('is not accessible'); + }); + it('tests a dbt connection via testRepoConnection (success)', async () => { const projectDir = join(tempDir, 'project'); await initKtxProject({ projectDir }); @@ -593,6 +627,14 @@ describe('runKtxConnection', () => { ], nextPageToken: null, })), + getFile: vi.fn(async () => ({ + id: 'folder-123', + name: 'Docs', + mimeType: 'application/vnd.google-apps.folder', + parents: [], + webViewLink: null, + modifiedTime: null, + })), })); const io = makeIo(); diff --git a/packages/cli/src/context/connections/gdrive-config.test.ts b/packages/cli/test/context/connections/gdrive-config.test.ts similarity index 97% rename from packages/cli/src/context/connections/gdrive-config.test.ts rename to packages/cli/test/context/connections/gdrive-config.test.ts index 0ed30654..e79962c4 100644 --- a/packages/cli/src/context/connections/gdrive-config.test.ts +++ b/packages/cli/test/context/connections/gdrive-config.test.ts @@ -6,7 +6,7 @@ import { gdriveConnectionToPullConfig, parseGdriveConnectionConfig, resolveGdriveServiceAccountKey, -} from './gdrive-config.js'; +} from '../../../src/context/connections/gdrive-config.js'; describe('standalone gdrive connection config', () => { let tempDir: string; diff --git a/packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts b/packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts index 23bc58a0..41584f33 100644 --- a/packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts +++ b/packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts @@ -22,7 +22,8 @@ const listFiles = vi.fn(async () => ({ nextPageToken: null, })); -vi.mock('../../../../../src/context/ingest/adapters/gdrive/gdrive-client.js', () => ({ +vi.mock('../../../../../src/context/ingest/adapters/gdrive/gdrive-client.js', async (importOriginal) => ({ + ...(await importOriginal()), createGoogleDocsClients: vi.fn(() => ({ drive: { listFiles }, docs: { getDocument }, @@ -81,4 +82,42 @@ describe('fetchGdriveSnapshot', () => { readFile(join(stagedDir, 'docs', 'herness-and-enterprise-a-7913523027', 'page.md'), 'utf-8'), ).resolves.toContain('# Herness and Enterprise Agent Operating Framework for Connected Systems'); }); + + it('records skipped non-Google-Doc files in the manifest with a summary warning', async () => { + stagedDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-fetch-')); + listFiles.mockResolvedValueOnce({ + files: [ + { + id: 'doc-1', + name: 'Doc', + mimeType: 'application/vnd.google-apps.document', + parents: ['folder-123'], + webViewLink: 'https://docs.google.com/document/d/doc-1', + modifiedTime: '2026-05-24T01:53:28.347Z', + }, + { + id: 'sheet-1', + name: 'Sheet', + mimeType: 'application/vnd.google-apps.spreadsheet', + parents: ['folder-123'], + webViewLink: 'https://docs.google.com/spreadsheets/d/sheet-1', + modifiedTime: '2026-05-24T01:53:28.347Z', + }, + ], + nextPageToken: null, + }); + + const manifest = await fetchGdriveSnapshot({ + key: { client_email: 'bot@example.com', private_key: 'secret' }, // pragma: allowlist secret + config: { serviceAccountKey: 'unused', folderId: 'folder-123', recursive: false }, // pragma: allowlist secret + stagedDir, + }); + + expect(manifest.fileCount).toBe(1); + expect(manifest.skipped).toEqual([ + { externalId: 'sheet-1', reason: 'unsupported mime type: application/vnd.google-apps.spreadsheet' }, + ]); + expect(manifest.warnings).toHaveLength(1); + expect(manifest.warnings[0]).toContain('Skipped 1'); + }); }); diff --git a/packages/cli/test/context/ingest/adapters/gdrive/gdrive-client.test.ts b/packages/cli/test/context/ingest/adapters/gdrive/gdrive-client.test.ts new file mode 100644 index 00000000..1aa8e610 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/gdrive/gdrive-client.test.ts @@ -0,0 +1,100 @@ +import { describe, expect, it, vi } from 'vitest'; +import { + driveFolderChildrenQuery, + fetchWithGoogleRetry, + verifyGdriveFolderAndCountDocs, + type GoogleDriveClient, +} from '../../../../../src/context/ingest/adapters/gdrive/gdrive-client.js'; +import { + GDRIVE_DOC_MIME_TYPE, + GDRIVE_FOLDER_MIME_TYPE, + type GdriveFileRecord, +} from '../../../../../src/context/ingest/adapters/gdrive/types.js'; + +function fileRecord(partial: Partial & { id: string; mimeType: string }): GdriveFileRecord { + return { + name: partial.name ?? partial.id, + parents: [], + webViewLink: null, + modifiedTime: null, + ...partial, + }; +} + +describe('driveFolderChildrenQuery', () => { + it('escapes single quotes and backslashes in the folder id', () => { + expect(driveFolderChildrenQuery('abc')).toBe("'abc' in parents and trashed = false"); + expect(driveFolderChildrenQuery("a'b")).toBe("'a\\'b' in parents and trashed = false"); + expect(driveFolderChildrenQuery('a\\b')).toBe("'a\\\\b' in parents and trashed = false"); + }); +}); + +describe('verifyGdriveFolderAndCountDocs', () => { + it('throws a caller-facing error when the folder is not accessible', async () => { + const drive: GoogleDriveClient = { + getFile: vi.fn(async () => null), + listFiles: vi.fn(), + }; + await expect(verifyGdriveFolderAndCountDocs(drive, 'missing')).rejects.toThrow('is not accessible'); + expect(drive.listFiles).not.toHaveBeenCalled(); + }); + + it('throws when the id resolves to a non-folder', async () => { + const drive: GoogleDriveClient = { + getFile: vi.fn(async () => fileRecord({ id: 'doc-1', mimeType: GDRIVE_DOC_MIME_TYPE })), + listFiles: vi.fn(), + }; + await expect(verifyGdriveFolderAndCountDocs(drive, 'doc-1')).rejects.toThrow('is not a folder'); + expect(drive.listFiles).not.toHaveBeenCalled(); + }); + + it('counts Google Docs across pages and ignores non-Docs', async () => { + const listFiles = vi + .fn() + .mockResolvedValueOnce({ + files: [ + fileRecord({ id: '1', mimeType: GDRIVE_DOC_MIME_TYPE }), + fileRecord({ id: '2', mimeType: 'application/vnd.google-apps.spreadsheet' }), + ], + nextPageToken: 'page-2', + }) + .mockResolvedValueOnce({ + files: [fileRecord({ id: '3', mimeType: GDRIVE_DOC_MIME_TYPE })], + nextPageToken: null, + }); + const drive: GoogleDriveClient = { + getFile: vi.fn(async () => fileRecord({ id: 'folder', mimeType: GDRIVE_FOLDER_MIME_TYPE })), + listFiles, + }; + await expect(verifyGdriveFolderAndCountDocs(drive, 'folder')).resolves.toBe(2); + expect(listFiles).toHaveBeenCalledTimes(2); + }); +}); + +describe('fetchWithGoogleRetry', () => { + const noopSleep = async () => {}; + + it('retries transient 5xx responses then returns success', async () => { + const doFetch = vi + .fn() + .mockResolvedValueOnce(new Response('busy', { status: 503 })) + .mockResolvedValueOnce(new Response('{}', { status: 200 })); + const response = await fetchWithGoogleRetry(doFetch, { sleep: noopSleep }); + expect(response.status).toBe(200); + expect(doFetch).toHaveBeenCalledTimes(2); + }); + + it('does not retry non-retryable responses', async () => { + const doFetch = vi.fn().mockResolvedValue(new Response('nope', { status: 404 })); + const response = await fetchWithGoogleRetry(doFetch, { sleep: noopSleep }); + expect(response.status).toBe(404); + expect(doFetch).toHaveBeenCalledTimes(1); + }); + + it('stops after maxAttempts when responses stay transient', async () => { + const doFetch = vi.fn().mockResolvedValue(new Response('rate', { status: 429 })); + const response = await fetchWithGoogleRetry(doFetch, { sleep: noopSleep, maxAttempts: 3 }); + expect(response.status).toBe(429); + expect(doFetch).toHaveBeenCalledTimes(3); + }); +}); diff --git a/uv.lock b/uv.lock index e831ec69..81405d31 100644 --- a/uv.lock +++ b/uv.lock @@ -466,7 +466,7 @@ wheels = [ [[package]] name = "ktx-daemon" -version = "0.13.0" +version = "0.13.1" source = { editable = "python/ktx-daemon" } dependencies = [ { name = "fastapi" }, @@ -523,7 +523,7 @@ dev = [ [[package]] name = "ktx-sl" -version = "0.13.0" +version = "0.13.1" source = { editable = "python/ktx-sl" } dependencies = [ { name = "pydantic" },