From 5645dc4d28775f0c9789388b7284c6f740c777d1 Mon Sep 17 00:00:00 2001 From: ARYAN <57013028+Aryan1718@users.noreply.github.com> Date: Sat, 27 Jun 2026 14:41:32 -0700 Subject: [PATCH] Add gdrive context source adapter (#209) * Add gdrive context source adapter * feat(gdrive): normalize internal doc links, tabs, and header/footer structure * fix(gdrive): reject generic source credential flags * test(gdrive): include local adapter in expected list * fix(gdrive): remove dead exports and silence false positive secret checks * fix(setup): restore notion source auth flow --- .../docs/integrations/context-sources.mdx | 69 +- packages/cli/package.json | 1 + packages/cli/src/commands/setup-commands.ts | 20 +- packages/cli/src/connection.ts | 51 ++ .../context/connections/gdrive-config.test.ts | 71 ++ .../src/context/connections/gdrive-config.ts | 87 +++ .../context/ingest/adapters/gdrive/chunk.ts | 85 +++ .../context/ingest/adapters/gdrive/detect.ts | 20 + .../context/ingest/adapters/gdrive/fetch.ts | 109 +++ .../ingest/adapters/gdrive/gdrive-client.ts | 106 +++ .../ingest/adapters/gdrive/gdrive.adapter.ts | 33 + .../ingest/adapters/gdrive/normalize.ts | 323 +++++++++ .../context/ingest/adapters/gdrive/types.ts | 167 +++++ .../cli/src/context/ingest/artifact-gates.ts | 9 +- .../cli/src/context/ingest/local-adapters.ts | 6 + .../ingest/tools/read-raw-file.tool.ts | 16 +- .../ingest/tools/read-raw-span.tool.ts | 16 +- .../cli/src/context/project/driver-schemas.ts | 13 + packages/cli/src/public-ingest.ts | 1 + packages/cli/src/setup-sources.ts | 202 ++++-- packages/cli/src/setup.ts | 10 +- .../cli/src/skills/gdrive_synthesize/SKILL.md | 97 +++ packages/cli/test/connection.test.ts | 60 ++ .../ingest/adapters/gdrive/chunk.test.ts | 110 +++ .../ingest/adapters/gdrive/detect.test.ts | 22 + .../ingest/adapters/gdrive/fetch.test.ts | 84 +++ .../adapters/gdrive/gdrive.adapter.test.ts | 60 ++ .../ingest/adapters/gdrive/normalize.test.ts | 628 ++++++++++++++++++ 
.../context/ingest/artifact-gates.test.ts | 13 + .../ingest/ingest-runtime-assets.test.ts | 1 + .../context/ingest/local-adapters.test.ts | 1 + .../ingest/tools/read-raw-file.tool.test.ts | 9 + .../ingest/tools/read-raw-span.tool.test.ts | 9 + .../memory/memory-runtime-assets.test.ts | 10 + .../context/project/driver-schemas.test.ts | 17 +- packages/cli/test/public-ingest.test.ts | 9 + packages/cli/test/setup-sources.test.ts | 63 ++ packages/cli/test/setup.test.ts | 9 +- pnpm-lock.yaml | 3 + 39 files changed, 2546 insertions(+), 74 deletions(-) create mode 100644 packages/cli/src/context/connections/gdrive-config.test.ts create mode 100644 packages/cli/src/context/connections/gdrive-config.ts create mode 100644 packages/cli/src/context/ingest/adapters/gdrive/chunk.ts create mode 100644 packages/cli/src/context/ingest/adapters/gdrive/detect.ts create mode 100644 packages/cli/src/context/ingest/adapters/gdrive/fetch.ts create mode 100644 packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts create mode 100644 packages/cli/src/context/ingest/adapters/gdrive/gdrive.adapter.ts create mode 100644 packages/cli/src/context/ingest/adapters/gdrive/normalize.ts create mode 100644 packages/cli/src/context/ingest/adapters/gdrive/types.ts create mode 100644 packages/cli/src/skills/gdrive_synthesize/SKILL.md create mode 100644 packages/cli/test/context/ingest/adapters/gdrive/chunk.test.ts create mode 100644 packages/cli/test/context/ingest/adapters/gdrive/detect.test.ts create mode 100644 packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts create mode 100644 packages/cli/test/context/ingest/adapters/gdrive/gdrive.adapter.test.ts create mode 100644 packages/cli/test/context/ingest/adapters/gdrive/normalize.test.ts diff --git a/docs-site/content/docs/integrations/context-sources.mdx b/docs-site/content/docs/integrations/context-sources.mdx index f7a52685..918fd07a 100644 --- a/docs-site/content/docs/integrations/context-sources.mdx +++ 
b/docs-site/content/docs/integrations/context-sources.mdx @@ -1,6 +1,6 @@ --- title: Context Sources -description: Ingest semantic context from dbt, MetricFlow, LookML, Metabase, Looker, and Notion. +description: Ingest semantic context from dbt, MetricFlow, LookML, Metabase, Looker, Notion, and Google Drive. --- Context sources feed your existing analytics tooling into **ktx**. During ingestion, **ktx** extracts metadata from each source and uses a reconciliation agent to reconcile it with your existing semantic layer and knowledge base - preserving accepted edits rather than overwriting. @@ -27,7 +27,7 @@ LookML uses top-level `repoUrl`, and MetricFlow uses nested | Field | Required | Description | |-------|----------|-------------| -| `driver` | Yes | Source connector: `dbt`, `metricflow`, `lookml`, `metabase`, `looker`, or `notion` | +| `driver` | Yes | Source connector: `dbt`, `metricflow`, `lookml`, `metabase`, `looker`, `notion`, or `gdrive` | | `source_dir` | For local file sources | Absolute or project-relative source directory | | `repo_url` | For Git-hosted dbt sources | Git repository URL | | `repoUrl` | For Git-hosted LookML sources | Git repository URL | @@ -376,6 +376,71 @@ Create an integration at [notion.so/my-integrations](https://www.notion.so/my-in - Incremental sync cursors are stored in `.ktx/db.sqlite`; don't add `last_successful_cursor` to `ktx.yaml` +--- + +## Google Drive + +Ingests Google Docs from a shared Google Drive folder as wiki-ready knowledge content. This v1 implementation is knowledge-only and ingests Google Docs MIME types only. 
+ +### What it provides + +- Wiki pages synthesized from Google Docs content +- Folder-scoped knowledge ingestion from a specific Drive folder +- Markdown normalization for headings, lists, paragraphs, links, common inline formatting, and Google Docs tables + +### Connection config + +```yaml title="ktx.yaml" +connections: + company-docs: + driver: gdrive + service_account_key_ref: file:/absolute/path/to/google-service-account.json + folder_id: your-google-drive-folder-id + recursive: false +``` + +### Authentication + +| Method | Config | +|--------|--------| +| Service account JSON key file | `service_account_key_ref: file:/absolute/path/to/key.json` | + +### Google Cloud setup + +1. Create a Google Cloud project. +2. Enable the Google Drive API. +3. Enable the Google Docs API. +4. Create a service account. +5. Download the service account JSON key. +6. Share the target Drive folder with the service account email. +7. Reference the key in `ktx.yaml` with `service_account_key_ref`. + +### Required scopes + +- `https://www.googleapis.com/auth/drive.readonly` +- `https://www.googleapis.com/auth/documents.readonly` + +### Configuration options + +| Field | Description | Default | +|-------|-------------|---------| +| `service_account_key_ref` | File reference to the service account JSON key | - | +| `folder_id` | Google Drive folder ID to ingest | - | +| `recursive` | Traverse subfolders under `folder_id` | `false` | + +### What gets ingested + +- Google Docs documents only +- Wiki-oriented knowledge content +- One work unit per staged Google Doc + +### Notes + +- `gdrive` is knowledge-only in v1; it does not produce semantic layer sources +- `ktx setup` supports Google Drive configuration, including the service-account key ref, folder id, and recursive crawl flag +- `ktx connection test <connection-id>` supports `gdrive` and reports the number of Google Docs visible in the configured folder +- The service account must be granted access to the target folder explicitly + ## Common
errors | Error or symptom | Likely cause | Recovery | diff --git a/packages/cli/package.json b/packages/cli/package.json index d6975429..a5f9e581 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -57,6 +57,7 @@ "@commander-js/extra-typings": "14.0.0", "@duckdb/node-api": "1.5.3-r.3", "@google-cloud/bigquery": "^8.3.1", + "google-auth-library": "10.6.2", "@looker/sdk": "^26.8.0", "@looker/sdk-node": "^26.8.0", "@looker/sdk-rtl": "^21.6.5", diff --git a/packages/cli/src/commands/setup-commands.ts b/packages/cli/src/commands/setup-commands.ts index a37b7eb6..5f86b9c9 100644 --- a/packages/cli/src/commands/setup-commands.ts +++ b/packages/cli/src/commands/setup-commands.ts @@ -57,7 +57,8 @@ function sourceType(value: string): KtxSetupSourceType { value === 'metabase' || value === 'looker' || value === 'lookml' || - value === 'notion' + value === 'notion' || + value === 'gdrive' ) { return value; } @@ -132,6 +133,9 @@ function shouldShowSetupEntryMenu( metabaseDatabaseId?: number; notionCrawlMode?: string; notionRootPageId?: string[]; + gdriveServiceAccountKeyRef?: string; + gdriveFolderId?: string; + gdriveRecursive?: boolean; skipSources?: boolean; }, command: Command, @@ -197,6 +201,9 @@ function shouldShowSetupEntryMenu( 'sourceTarget', 'metabaseDatabaseId', 'notionCrawlMode', + 'gdriveServiceAccountKeyRef', + 'gdriveFolderId', + 'gdriveRecursive', 'skipSources', ].some((optionName) => optionWasSpecified(command, optionName)); } @@ -337,6 +344,12 @@ export function registerSetupCommands(program: Command, context: KtxCliCommandCo .default([] as string[]) .hideHelp(), ) + .addOption( /* the <ref>/<id> placeholders make these value-taking options; a bare flag would parse as boolean */ + new Option('--gdrive-service-account-key-ref <ref>', 'file: reference to a Google service account JSON key') + .hideHelp(), + ) + .addOption(new Option('--gdrive-folder-id <id>', 'Google Drive folder id to ingest').hideHelp()) + .addOption(new Option('--gdrive-recursive', 'Recursively traverse Google Drive subfolders').hideHelp().default(false)) .addOption(new
Option('--skip-sources', 'Mark optional source setup complete with no sources').hideHelp().default(false)) .showHelpAfterError(); @@ -486,6 +499,11 @@ export function registerSetupCommands(program: Command, context: KtxCliCommandCo ...(options.metabaseDatabaseId !== undefined ? { metabaseDatabaseId: options.metabaseDatabaseId } : {}), ...(options.notionCrawlMode ? { notionCrawlMode: options.notionCrawlMode } : {}), ...(options.notionRootPageId.length > 0 ? { notionRootPageIds: options.notionRootPageId } : {}), + ...(options.gdriveServiceAccountKeyRef + ? { gdriveServiceAccountKeyRef: options.gdriveServiceAccountKeyRef } + : {}), + ...(options.gdriveFolderId ? { gdriveFolderId: options.gdriveFolderId } : {}), + ...(options.gdriveRecursive ? { gdriveRecursive: true } : {}), runInitialSourceIngest: false, skipSources: options.skipSources === true, showEntryMenu: shouldShowSetupEntryMenu(options, command), diff --git a/packages/cli/src/connection.ts b/packages/cli/src/connection.ts index d12dccb7..0fa0ee5a 100644 --- a/packages/cli/src/connection.ts +++ b/packages/cli/src/connection.ts @@ -3,8 +3,11 @@ import { DefaultLookerConnectionClientFactory } from './context/ingest/adapters/ import type { LookerClient } from './context/ingest/adapters/looker/client.js'; import type { MetabaseRuntimeClient } from './context/ingest/adapters/metabase/client-port.js'; import { type NotionBotInfo, NotionClient } from './context/ingest/adapters/notion/notion-client.js'; +import { parseGdriveConnectionConfig, resolveGdriveServiceAccountKey } from './context/connections/gdrive-config.js'; import { createLocalLookerCredentialResolver } from './context/ingest/adapters/looker/local-looker.adapter.js'; import { metabaseRuntimeConfigFromLocalConnection } from './context/ingest/adapters/metabase/local-metabase.adapter.js'; +import { createGoogleDocsClients } from './context/ingest/adapters/gdrive/gdrive-client.js'; +import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from 
'./context/ingest/adapters/gdrive/types.js'; import { testRepoConnection } from './context/ingest/repo-fetch.js'; import { federatedConnectionListing } from './context/connections/federation.js'; import { getDriverRegistration } from './context/connections/drivers.js'; @@ -31,6 +34,10 @@ export type KtxConnectionArgs = type MetabaseTestPort = Pick; type LookerTestPort = Pick; type NotionTestPort = Pick; +type GdriveTestPort = Pick< + ReturnType<typeof createGoogleDocsClients>['drive'], + 'listFiles' +>; type TestRepoConnection = typeof testRepoConnection; export interface KtxConnectionDeps { @@ -38,6 +45,7 @@ export interface KtxConnectionDeps { createMetabaseClient?: (project: KtxLocalProject, connectionId: string) => Promise; createLookerClient?: (project: KtxLocalProject, connectionId: string) => Promise; createNotionClient?: (project: KtxLocalProject, connectionId: string) => Promise; + createGdriveClient?: (project: KtxLocalProject, connectionId: string) => Promise<GdriveTestPort>; testRepoConnection?: TestRepoConnection; } @@ -52,6 +60,7 @@ const SUPPORTED_TEST_DRIVERS = [ 'metabase', 'looker', 'notion', + 'gdrive', 'dbt', 'metricflow', 'lookml', @@ -183,6 +192,45 @@ async function testNotionConnection( return { bot: describeNotionBot(bot) }; } +/** Default Drive client factory for the gdrive connection test: parses the connection config, reads and validates its service-account key, and returns the Drive client. Throws when the connection id is not configured. */ +async function createDefaultGdriveClient( + project: KtxLocalProject, + connectionId: string, +): Promise<GdriveTestPort> { + const connection = project.config.connections[connectionId]; + if (!connection) { + throw new Error(`Connection "${connectionId}" is not configured in ktx.yaml`); + } + const parsed = parseGdriveConnectionConfig(connection); + const keyText = await resolveGdriveServiceAccountKey(parsed.service_account_key_ref); + const key = gdriveServiceAccountKeySchema.parse(JSON.parse(keyText)); + return createGoogleDocsClients(key).drive; +} + +/** Counts the Google Docs that are direct children of the configured folder, across every result page (subfolders are not traversed). Throws when the connection id is not configured. */ +async function testGdriveConnection( + project: KtxLocalProject, + connectionId: string, + createClient: (project: KtxLocalProject, connectionId: string) => Promise<GdriveTestPort>, +): Promise<{ docs: number }> { + const connection =
project.config.connections[connectionId]; + if (!connection) { + throw new Error(`Connection "${connectionId}" is not configured in ktx.yaml`); + } + const parsed = parseGdriveConnectionConfig(connection); + const client = await createClient(project, connectionId); + const q = `'${parsed.folder_id}' in parents and trashed = false`; + let docs = 0; + let pageToken: string | null = null; + /* Follow nextPageToken: a single listFiles call only returns the first page of the folder. */ + do { + const page = await client.listFiles(pageToken ? { q, pageToken } : { q }); + docs += page.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length; + pageToken = page.nextPageToken; + } while (pageToken); + return { docs }; +} + interface GitConnectionFields { repoUrl: string; authToken: string | null; @@ -271,6 +319,15 @@ async function testConnectionByDriver( return { driver, detailKey: 'Bot', detailValue: result.bot }; } + if (driver === 'gdrive') { + const result = await testGdriveConnection( + project, + connectionId, + deps.createGdriveClient ?? createDefaultGdriveClient, + ); + return { driver, detailKey: 'Docs', detailValue: String(result.docs) }; + } + if (driver === 'dbt' || driver === 'metricflow' || driver === 'lookml') { const result = await testGitRepoConnection( project, diff --git a/packages/cli/src/context/connections/gdrive-config.test.ts b/packages/cli/src/context/connections/gdrive-config.test.ts new file mode 100644 index 00000000..0ed30654 --- /dev/null +++ b/packages/cli/src/context/connections/gdrive-config.test.ts @@ -0,0 +1,71 @@ +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { + gdriveConnectionToPullConfig, + parseGdriveConnectionConfig, + resolveGdriveServiceAccountKey, +} from './gdrive-config.js'; + +describe('standalone gdrive connection config', () => { + let tempDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-config-')); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('parses config with safe defaults', () => { + const parsed
= parseGdriveConnectionConfig({ + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/google-key.json', // pragma: allowlist secret + folder_id: 'folder-123', + }); + + expect(parsed).toEqual({ + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/google-key.json', // pragma: allowlist secret + folder_id: 'folder-123', + recursive: false, + }); + }); + + it('requires file-based service account keys', () => { + expect(() => + parseGdriveConnectionConfig({ + driver: 'gdrive', + service_account_key_ref: 'env:GOOGLE_KEY', // pragma: allowlist secret + folder_id: 'folder-123', + }), + ).toThrow('gdrive service_account_key_ref must use file:/path/to/key.json'); + }); + + it('resolves service account key files', async () => { + const keyPath = join(tempDir, 'google-key.json'); + await writeFile(keyPath, '{"client_email":"bot@example.com","private_key":"line-1"}\n', 'utf-8'); // pragma: allowlist secret + await expect(resolveGdriveServiceAccountKey(`file:${keyPath}`)).resolves.toContain('"client_email":"bot@example.com"'); + }); + + it('converts config into adapter pull config', async () => { + const keyPath = join(tempDir, 'google-key.json'); + await writeFile(keyPath, '{"client_email":"bot@example.com","private_key":"line-1"}\n', 'utf-8'); // pragma: allowlist secret + const pullConfig = await gdriveConnectionToPullConfig( + parseGdriveConnectionConfig({ + driver: 'gdrive', + service_account_key_ref: `file:${keyPath}`, // pragma: allowlist secret + folder_id: 'folder-123', + recursive: true, + }), + ); + + expect(pullConfig).toEqual({ + serviceAccountKey: '{"client_email":"bot@example.com","private_key":"line-1"}', // pragma: allowlist secret + folderId: 'folder-123', + recursive: true, + }); + }); +}); diff --git a/packages/cli/src/context/connections/gdrive-config.ts b/packages/cli/src/context/connections/gdrive-config.ts new file mode 100644 index 00000000..d5630a1e --- /dev/null +++ b/packages/cli/src/context/connections/gdrive-config.ts @@ -0,0 +1,87 @@ 
+import { readFile } from 'node:fs/promises'; +import { homedir } from 'node:os'; +import { resolve } from 'node:path'; +import type { KtxProjectConnectionConfig } from '../project/config.js'; +import type { GdrivePullConfig } from '../ingest/adapters/gdrive/types.js'; +import { gdrivePullConfigSchema } from '../ingest/adapters/gdrive/types.js'; + +type RawKtxGdriveConnectionConfig = Extract; + +export type KtxGdriveConnectionConfig = Omit< + RawKtxGdriveConnectionConfig, + 'service_account_key_ref' | 'folder_id' | 'recursive' +> & { + driver: 'gdrive'; + service_account_key_ref: string; + folder_id: string; + recursive: boolean; +}; + +interface ResolveKeyOptions { + readTextFile?: (path: string) => Promise; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function expandHome(path: string): string { + return path === '~' || path.startsWith('~/') ? resolve(homedir(), path.slice(2)) : path; +} + +export function parseGdriveConnectionConfig(raw: unknown): KtxGdriveConnectionConfig { + if (!isRecord(raw)) { + throw new Error('gdrive connection config must be an object'); + } + if (raw.driver !== 'gdrive') { + throw new Error('gdrive connection config requires driver: gdrive'); + } + const keyRef = + typeof raw.service_account_key_ref === 'string' && raw.service_account_key_ref.trim().length > 0 // pragma: allowlist secret + ? raw.service_account_key_ref.trim() + : null; + if (!keyRef) { + throw new Error('gdrive connection config requires service_account_key_ref'); + } + if (!keyRef.startsWith('file:')) { + throw new Error('gdrive service_account_key_ref must use file:/path/to/key.json'); + } + const folderId = typeof raw.folder_id === 'string' && raw.folder_id.trim().length > 0 ? 
raw.folder_id.trim() : null; + if (!folderId) { + throw new Error('gdrive connection config requires folder_id'); + } + return { + driver: 'gdrive', + service_account_key_ref: keyRef, + folder_id: folderId, + recursive: raw.recursive === true, + }; +} + +/** @internal */ +export async function resolveGdriveServiceAccountKey( + serviceAccountKeyRef: string, + options: ResolveKeyOptions = {}, +): Promise { + if (!serviceAccountKeyRef.startsWith('file:')) { + throw new Error('gdrive service_account_key_ref must use file:/path/to/key.json'); + } + const path = expandHome(serviceAccountKeyRef.slice('file:'.length)); + const readTextFile = options.readTextFile ?? ((filePath: string) => readFile(filePath, 'utf-8')); + const value = (await readTextFile(path)).trim(); + if (!value) { + throw new Error(`gdrive service account key file is empty: ${path}`); + } + return value; +} + +export async function gdriveConnectionToPullConfig( + config: KtxGdriveConnectionConfig, + options: ResolveKeyOptions = {}, +): Promise { + return gdrivePullConfigSchema.parse({ + serviceAccountKey: await resolveGdriveServiceAccountKey(config.service_account_key_ref, options), + folderId: config.folder_id, + recursive: config.recursive, + }); +} diff --git a/packages/cli/src/context/ingest/adapters/gdrive/chunk.ts b/packages/cli/src/context/ingest/adapters/gdrive/chunk.ts new file mode 100644 index 00000000..06d9f546 --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/gdrive/chunk.ts @@ -0,0 +1,85 @@ +import { createHash } from 'node:crypto'; +import { readdir, readFile } from 'node:fs/promises'; +import { join, relative } from 'node:path'; +import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js'; +import { gdriveManifestSchema, gdriveMetadataSchema } from './types.js'; + +const GDRIVE_RECONCILE_GUIDANCE = + 'Synthesize durable wiki knowledge from this Google Doc. Preserve product definitions, process documentation, and operating rules as wiki pages. 
Do not create semantic-layer sources from gdrive content in v1.'; + +function normalizeRawPath(path: string): string { + return path.replace(/\\/g, '/'); +} + +async function walk(root: string): Promise { + const entries = await readdir(root, { withFileTypes: true, recursive: true }); + return entries + .filter((entry) => entry.isFile()) + .map((entry) => normalizeRawPath(relative(root, join(entry.parentPath, entry.name)))) + .sort(); +} + +function safeUnitKey(path: string): string { + return `gdrive-${path.replace(/^docs\//, '').replace(/\/page\.md$/, '').replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '')}`; +} + +async function readManifest(stagedDir: string) { + try { + return gdriveManifestSchema.parse(JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8'))); + } catch (error) { + throw new Error(`Invalid gdrive manifest: ${error instanceof Error ? error.message : String(error)}`); + } +} + +export async function chunkGdriveStagedDir(stagedDir: string, diffSet?: DiffSet): Promise { + const files = await walk(stagedDir); + const manifest = await readManifest(stagedDir); + const touched = diffSet + ? new Set([...diffSet.added, ...diffSet.modified].map((path) => normalizeRawPath(path))) + : null; + const workUnits: WorkUnit[] = []; + + for (const pagePath of files.filter((path) => path.endsWith('/page.md'))) { + const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json'); + const primary = [metadataPath, pagePath].filter((path) => files.includes(path)); + if (touched && !primary.some((path) => touched.has(path))) { + continue; + } + const metadata = gdriveMetadataSchema.parse(JSON.parse(await readFile(join(stagedDir, metadataPath), 'utf-8'))); + const rawFiles = touched ? 
primary.filter((path) => touched.has(path)).sort() : primary.sort(); + const dependencyPaths = ['manifest.json'].filter((path) => !rawFiles.includes(path)); + const excluded = new Set([...rawFiles, ...dependencyPaths]); + const peerFileIndex = files.filter((path) => !excluded.has(path)).sort(); + workUnits.push({ + unitKey: safeUnitKey(pagePath), + displayLabel: metadata.path, + rawFiles, + dependencyPaths, + peerFileIndex, + notes: GDRIVE_RECONCILE_GUIDANCE, + }); + } + + return { + workUnits, + eviction: + diffSet && diffSet.deleted.length > 0 + ? { deletedRawPaths: diffSet.deleted.map((path) => normalizeRawPath(path)).sort() } + : undefined, + reconcileNotes: ['Google Drive docs are knowledge-only in v1; keep output in wiki pages unless later follow-up work expands scope.'], + contextReport: { capped: false, warnings: manifest.warnings }, + }; +} + +export async function describeGdriveScope(stagedDir: string): Promise { + const manifest = await readManifest(stagedDir); + const scopeKey = JSON.stringify({ + folderId: manifest.folderId, + recursive: manifest.recursive, + }); + const fingerprint = createHash('sha256').update(scopeKey).digest('hex'); + return { + fingerprint, + isPathInScope: (rawPath) => rawPath === 'manifest.json' || rawPath.startsWith('docs/'), + }; +} diff --git a/packages/cli/src/context/ingest/adapters/gdrive/detect.ts b/packages/cli/src/context/ingest/adapters/gdrive/detect.ts new file mode 100644 index 00000000..24c691e6 --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/gdrive/detect.ts @@ -0,0 +1,20 @@ +import { readFile, readdir } from 'node:fs/promises'; +import { join } from 'node:path'; + +export async function detectGdriveStagedDir(stagedDir: string): Promise { + try { + const manifest = JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')) as { source?: unknown }; + if (manifest.source === 'gdrive') { + return true; + } + } catch { + // Fall through to structural detection. 
+ } + + try { + const entries = await readdir(stagedDir, { withFileTypes: true, recursive: true }); + return entries.some((entry) => entry.isFile() && entry.name === 'page.md'); + } catch { + return false; + } +} diff --git a/packages/cli/src/context/ingest/adapters/gdrive/fetch.ts b/packages/cli/src/context/ingest/adapters/gdrive/fetch.ts new file mode 100644 index 00000000..288bc664 --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/gdrive/fetch.ts @@ -0,0 +1,109 @@ +import { createHash } from 'node:crypto'; +import { mkdir, writeFile } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import { createGoogleDocsClients } from './gdrive-client.js'; +import { normalizeGoogleDocToMarkdown } from './normalize.js'; +import type { GdriveFileRecord, GdriveManifest, GdrivePullConfig } from './types.js'; +import { GDRIVE_DOC_MIME_TYPE, GDRIVE_SOURCE_KEY } from './types.js'; + +async function writeJson(path: string, value: unknown): Promise { + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, `${JSON.stringify(value, null, 2)}\n`, 'utf-8'); +} + +async function writeText(path: string, value: string): Promise { + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, value.endsWith('\n') ? value : `${value}\n`, 'utf-8'); +} + +function slugifySegment(value: string): string { + const normalized = value + .normalize('NFKD') + .replace(/[^\x00-\x7F]/g, '') + .replace(/[^a-zA-Z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .toLowerCase(); + return normalized || 'untitled'; +} + +function compactSegment(value: string, maxLength = 24): string { + const slug = slugifySegment(value); + return slug.length > maxLength ? 
slug.slice(0, maxLength).replace(/-+$/g, '') || 'untitled' : slug; +} + +function shortHash(value: string, length = 10): string { + return createHash('sha1').update(value).digest('hex').slice(0, length); +} + +function gdriveDocDirName(title: string, fileId: string): string { + return `${compactSegment(title)}-${shortHash(fileId)}`; +} + +async function listFolderFiles( + drive: ReturnType['drive'], + folderId: string, + recursive: boolean, + parents: string[] = [], +): Promise> { + const q = `'${folderId}' in parents and trashed = false`; + const records: Array<{ file: GdriveFileRecord; drivePath: string[]; folderId: string }> = []; + let pageToken: string | undefined; + do { + const page = await drive.listFiles({ q, pageToken }); + for (const file of page.files) { + if (file.mimeType === 'application/vnd.google-apps.folder') { + if (recursive) { + records.push(...(await listFolderFiles(drive, file.id, true, [...parents, file.name]))); + } + continue; + } + if (file.mimeType !== GDRIVE_DOC_MIME_TYPE) { + continue; + } + records.push({ file, drivePath: parents, folderId }); + } + pageToken = page.nextPageToken ?? 
undefined; + } while (pageToken); + return records; +} + +export async function fetchGdriveSnapshot(params: { + key: unknown; + config: GdrivePullConfig; + stagedDir: string; +}): Promise { + await mkdir(params.stagedDir, { recursive: true }); + const clients = createGoogleDocsClients(params.key); + const docs = await listFolderFiles(clients.drive, params.config.folderId, params.config.recursive); + + for (const { file, drivePath, folderId } of docs) { + const document = await clients.docs.getDocument(file.id); + const title = (document.title?.trim() || file.name).trim(); + const relDir = join('docs', ...drivePath.map((segment) => compactSegment(segment)), gdriveDocDirName(title, file.id)); + const markdownBody = normalizeGoogleDocToMarkdown(document); + const pageMarkdown = [`# ${title}`, markdownBody].filter(Boolean).join('\n\n'); + await writeJson(join(params.stagedDir, relDir, 'metadata.json'), { + id: file.id, + title, + path: [...drivePath, title].join(' / ') || title, + url: file.webViewLink, + mimeType: file.mimeType, + folderId, + drivePath, + modifiedTime: file.modifiedTime, + }); + await writeText(join(params.stagedDir, relDir, 'page.md'), pageMarkdown); + } + + const manifest: GdriveManifest = { + source: GDRIVE_SOURCE_KEY, + folderId: params.config.folderId, + recursive: params.config.recursive, + fetchedAt: new Date().toISOString(), + fileCount: docs.length, + skipped: [], + warnings: [], + }; + await writeJson(join(params.stagedDir, 'manifest.json'), manifest); + return manifest; +} diff --git a/packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts b/packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts new file mode 100644 index 00000000..4b66bc7a --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/gdrive/gdrive-client.ts @@ -0,0 +1,106 @@ +import { JWT } from 'google-auth-library'; +import type { GdriveFileRecord, GdriveServiceAccountKey, GoogleDocsDocument } from './types.js'; +import { GDRIVE_SCOPES, 
gdriveServiceAccountKeySchema } from './types.js'; + +const GOOGLE_DRIVE_BASE_URL = 'https://www.googleapis.com/drive/v3'; +const GOOGLE_DOCS_BASE_URL = 'https://docs.googleapis.com/v1'; + +interface GoogleApiListResponse { + files?: Array<{ + id?: string; + name?: string; + mimeType?: string; + parents?: string[]; + webViewLink?: string; + modifiedTime?: string; + }>; + nextPageToken?: string; +} + +interface GoogleApiFile { + id?: string; + name?: string; + mimeType?: string; + parents?: string[]; + webViewLink?: string; + modifiedTime?: string; +} + +async function parseGoogleResponse(response: Response): Promise { + if (!response.ok) { + const body = await response.text(); + throw new Error(`Google API request failed (${response.status}): ${body || response.statusText}`); + } + return (await response.json()) as T; +} + +async function authorizedFetch(client: JWT, url: string): Promise { + const headers = await client.getRequestHeaders(url); + return fetch(url, { headers }); +} + +function isGoogleApiFileRecord(file: GoogleApiFile): file is GoogleApiFile & { + id: string; + name: string; + mimeType: string; +} { + return typeof file.id === 'string' && typeof file.name === 'string' && typeof file.mimeType === 'string'; +} + +export function createGoogleDocsClients(rawKey: unknown): { + drive: { + listFiles(args: { q: string; pageToken?: string }): Promise<{ files: GdriveFileRecord[]; nextPageToken: string | null }>; + }; + docs: { + getDocument(documentId: string): Promise; + }; +} { + const key = gdriveServiceAccountKeySchema.parse(rawKey) satisfies GdriveServiceAccountKey; + const client = new JWT({ + email: key.client_email, + key: key.private_key, + scopes: [...GDRIVE_SCOPES], + }); + + return { + drive: { + async listFiles(args) { + const params = new URLSearchParams({ + q: args.q, + supportsAllDrives: 'true', + includeItemsFromAllDrives: 'true', + pageSize: '1000', + fields: 'nextPageToken,files(id,name,mimeType,parents,webViewLink,modifiedTime)', + }); + 
if (args.pageToken) { + params.set('pageToken', args.pageToken); + } + const response = await authorizedFetch(client, `${GOOGLE_DRIVE_BASE_URL}/files?${params.toString()}`); + const parsed = await parseGoogleResponse(response); + return { + files: (parsed.files ?? []) + .filter(isGoogleApiFileRecord) + .map((file) => ({ + id: file.id, + name: file.name, + mimeType: file.mimeType, + parents: Array.isArray(file.parents) ? file.parents.filter((parent): parent is string => typeof parent === 'string') : [], + webViewLink: typeof file.webViewLink === 'string' ? file.webViewLink : null, + modifiedTime: typeof file.modifiedTime === 'string' ? file.modifiedTime : null, + })), + nextPageToken: typeof parsed.nextPageToken === 'string' ? parsed.nextPageToken : null, + }; + }, + }, + docs: { + async getDocument(documentId: string) { + const params = new URLSearchParams({ + includeTabsContent: 'true', + suggestionsViewMode: 'PREVIEW_WITHOUT_SUGGESTIONS', + }); + const response = await authorizedFetch(client, `${GOOGLE_DOCS_BASE_URL}/documents/${documentId}?${params.toString()}`); + return await parseGoogleResponse(response); + }, + }, + }; +} diff --git a/packages/cli/src/context/ingest/adapters/gdrive/gdrive.adapter.ts b/packages/cli/src/context/ingest/adapters/gdrive/gdrive.adapter.ts new file mode 100644 index 00000000..9234498f --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/gdrive/gdrive.adapter.ts @@ -0,0 +1,33 @@ +import type { ChunkResult, DiffSet, FetchContext, ScopeDescriptor, SourceAdapter } from '../../types.js'; +import { chunkGdriveStagedDir, describeGdriveScope } from './chunk.js'; +import { detectGdriveStagedDir } from './detect.js'; +import { fetchGdriveSnapshot } from './fetch.js'; +import { gdrivePullConfigSchema } from './types.js'; + +export class GdriveSourceAdapter implements SourceAdapter { + readonly source = 'gdrive'; + readonly skillNames = ['gdrive_synthesize']; + readonly reconcileSkillNames: string[] = []; + readonly evidenceIndexing = 
'documents' as const; + + detect(stagedDir: string): Promise { + return detectGdriveStagedDir(stagedDir); + } + + async fetch(pullConfig: unknown, stagedDir: string, _ctx: FetchContext): Promise { + const config = gdrivePullConfigSchema.parse(pullConfig); + await fetchGdriveSnapshot({ + key: JSON.parse(config.serviceAccountKey), + config, + stagedDir, + }); + } + + chunk(stagedDir: string, diffSet?: DiffSet): Promise { + return chunkGdriveStagedDir(stagedDir, diffSet); + } + + describeScope(stagedDir: string): Promise { + return describeGdriveScope(stagedDir); + } +} diff --git a/packages/cli/src/context/ingest/adapters/gdrive/normalize.ts b/packages/cli/src/context/ingest/adapters/gdrive/normalize.ts new file mode 100644 index 00000000..7dd1da07 --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/gdrive/normalize.ts @@ -0,0 +1,323 @@ +import type { + GoogleDocsDocument, + GoogleDocsDocumentStyle, + GoogleDocsHeaderFooter, + GoogleDocsLinkTarget, + GoogleDocsList, + GoogleDocsParagraph, + GoogleDocsParagraphElement, + GoogleDocsStructuralElement, + GoogleDocsTab, + GoogleDocsTable, + GoogleDocsTableCell, +} from './types.js'; + +function escapeMarkdownText(value: string): string { + return value.replace(/([*_~`])/g, '\\$1'); +} + +function normalizeInternalLinkTarget(prefix: 'heading' | 'bookmark', target: GoogleDocsLinkTarget | string | undefined): string | null { + const id = typeof target === 'string' ? target : target?.id; + if (!id?.trim()) { + return null; + } + return `#${prefix}-${id.trim()}`; +} + +function resolveLinkHref(element: GoogleDocsParagraphElement): string | null { + const link = element.textRun?.textStyle?.link; + const href = link?.url?.trim(); + if (href) { + return href; + } + return ( + normalizeInternalLinkTarget('heading', link?.heading) ?? + normalizeInternalLinkTarget('heading', link?.headingId) ?? + normalizeInternalLinkTarget('bookmark', link?.bookmark) ?? + normalizeInternalLinkTarget('bookmark', link?.bookmarkId) ?? 
+ null + ); +} + +function normalizeTextRun(element: GoogleDocsParagraphElement): string { + const content = element.textRun?.content ?? ''; + const style = element.textRun?.textStyle; + let text = escapeMarkdownText(content.replace(/\r/g, '')); + if (!text && element.inlineObjectElement) { + return '[Embedded object]'; + } + if (!text && element.pageBreak) { + return '\n---\n'; + } + if (!text) { + return ''; + } + const href = resolveLinkHref(element); + const isCode = style?.weightedFontFamily?.fontFamily === 'Courier New'; + if (isCode) { + text = `\`${text.replace(/`/g, '\\`')}\``; + } + if (style?.bold) { + text = `**${text}**`; + } + if (style?.italic) { + text = `*${text}*`; + } + if (style?.underline) { + text = `${text}`; + } + if (style?.strikethrough) { + text = `~~${text}~~`; + } + if (href) { + text = `[${text}](${href.replace(/\)/g, '\\)')})`; + } + if (style?.baselineOffset === 'SUPERSCRIPT') { + text = `${text}`; + } else if (style?.baselineOffset === 'SUBSCRIPT') { + text = `${text}`; + } + return text; +} + +function paragraphText(paragraph: GoogleDocsParagraph | undefined): string { + return (paragraph?.elements ?? 
[]) + .map((element) => normalizeTextRun(element)) + .join('') + .replace(/\n/g, '') + .trim(); +} + +function headingPrefix(namedStyleType: string | undefined): string | null { + if (namedStyleType === 'TITLE') { + return '#'; + } + if (namedStyleType === 'SUBTITLE') { + return '##'; + } + if (!namedStyleType?.startsWith('HEADING_')) { + return null; + } + const level = Number.parseInt(namedStyleType.slice('HEADING_'.length), 10); + if (Number.isNaN(level) || level < 1) { + return null; + } + return '#'.repeat(Math.min(level, 6)); +} + +function isOrderedListLevel(level: { glyphType?: string; glyphSymbol?: string } | undefined): boolean { + const glyphType = level?.glyphType?.toUpperCase(); + if (glyphType) { + return ( + glyphType.includes('NUMBER') || + glyphType.includes('DECIMAL') || + glyphType.includes('ALPHA') || + glyphType.includes('ROMAN') || + glyphType.includes('LATIN') + ); + } + const glyphSymbol = level?.glyphSymbol?.trim(); + return glyphSymbol === '%0.' || glyphSymbol === '%0)' || glyphSymbol === '1.' || glyphSymbol === '1)'; +} + +function listPrefix(paragraph: GoogleDocsParagraph, lists: Record | undefined): string | null { + if (!paragraph.bullet) { + return null; + } + const level = Math.max(paragraph.bullet.nestingLevel ?? 0, 0); + const indent = ' '.repeat(level); + const listDefinition = paragraph.bullet.listId ? lists?.[paragraph.bullet.listId] : undefined; + const listLevel = listDefinition?.listProperties?.nestingLevels?.[level]; + return `${indent}${isOrderedListLevel(listLevel) ? '1. ' : '- '}`; +} + +function paragraphToMarkdown( + paragraph: GoogleDocsParagraph | undefined, + lists: Record | undefined, +): string | null { + const text = paragraphText(paragraph); + if (!text) { + return null; + } + const prefix = paragraph ? 
listPrefix(paragraph, lists) : null; + if (prefix) { + return `${prefix}${text}`; + } + const heading = headingPrefix(paragraph?.paragraphStyle?.namedStyleType); + if (heading) { + const headingLine = `${heading} ${text}`; + const headingId = paragraph?.paragraphStyle?.headingId?.trim(); + return headingId ? `\n${headingLine}` : headingLine; + } + return text; +} + +function normalizeTableCell( + cell: GoogleDocsTableCell | undefined, + lists: Record | undefined, +): string { + const blocks = normalizeStructuralElements(cell?.content ?? [], lists); + return blocks + .map((block) => block.replace(/\n/g, '
')) + .join(' / ') + .replace(/\|/g, '\\|') + .trim(); +} + +function markdownTableDivider(columnCount: number): string { + return `| ${Array.from({ length: columnCount }, () => '---').join(' | ')} |`; +} + +function normalizeTable(table: GoogleDocsTable | undefined, lists: Record | undefined): string[] { + const rows = table?.tableRows ?? []; + const normalizedRows = rows + .map((row) => (row.tableCells ?? []).map((cell) => normalizeTableCell(cell, lists))) + .filter((cells) => cells.length > 0); + if (normalizedRows.length === 0) { + return []; + } + const columnCount = Math.max(...normalizedRows.map((cells) => cells.length)); + const paddedRows = normalizedRows.map((cells) => + Array.from({ length: columnCount }, (_, index) => cells[index] ?? ''), + ); + const [header, ...body] = paddedRows; + const blocks = [`| ${header.join(' | ')} |`, markdownTableDivider(columnCount)]; + for (const row of body) { + blocks.push(`| ${row.join(' | ')} |`); + } + return [blocks.join('\n')]; +} + +function normalizeStructuralElements( + elements: GoogleDocsStructuralElement[], + lists: Record | undefined, +): string[] { + const blocks: string[] = []; + for (const element of elements) { + const line = paragraphToMarkdown(element.paragraph, lists); + if (line) { + blocks.push(line); + continue; + } + if (element.table) { + blocks.push(...normalizeTable(element.table, lists)); + } + } + return blocks; +} + +function headerFooterRoleMap( + label: 'Headers' | 'Footers', + documentStyle: GoogleDocsDocumentStyle | undefined, +): Map { + const roleMap = new Map(); + const roleEntries = + label === 'Headers' + ? 
[ + [documentStyle?.defaultHeaderId, 'Default Header'], + [documentStyle?.firstPageHeaderId, 'First Page Header'], + [documentStyle?.evenPageHeaderId, 'Even Page Header'], + ] + : [ + [documentStyle?.defaultFooterId, 'Default Footer'], + [documentStyle?.firstPageFooterId, 'First Page Footer'], + [documentStyle?.evenPageFooterId, 'Even Page Footer'], + ]; + for (const [id, role] of roleEntries) { + const normalizedId = id?.trim(); + if (!normalizedId || roleMap.has(normalizedId)) { + continue; + } + roleMap.set(normalizedId, role ?? normalizedId); + } + return roleMap; +} + +function normalizeHeaderFooterMap( + label: 'Headers' | 'Footers', + entries: Record | undefined, + lists: Record | undefined, + documentStyle: GoogleDocsDocumentStyle | undefined, +): string | null { + if (!entries) { + return null; + } + const ids = Object.keys(entries).sort(); + const roles = headerFooterRoleMap(label, documentStyle); + const sections: string[] = []; + for (const id of ids) { + const blocks = normalizeStructuralElements(entries[id]?.content ?? [], lists); + if (blocks.length === 0) { + continue; + } + const title = roles.get(id) ?? 
`${label.slice(0, -1)} ${escapeMarkdownText(id)}`; + sections.push(`### ${title}\n\n${blocks.join('\n\n').trim()}`); + } + if (sections.length === 0) { + return null; + } + return `## ${label}\n\n${sections.join('\n\n').trim()}`; +} + +function joinNonEmptySections(sections: Array): string | null { + const nonEmpty = sections.filter((section): section is string => Boolean(section?.trim())); + if (nonEmpty.length === 0) { + return null; + } + return nonEmpty.join('\n\n').trim(); +} + +function flattenGoogleDocsTabs(tabs: GoogleDocsTab[] | undefined): GoogleDocsTab[] { + if (!tabs?.length) { + return []; + } + const flattened: GoogleDocsTab[] = []; + for (const tab of tabs) { + flattened.push(tab); + flattened.push(...flattenGoogleDocsTabs(tab.childTabs)); + } + return flattened; +} + +function normalizeTab(tab: GoogleDocsTab, fallbackLists: Record | undefined): string | null { + const lists = tab.documentTab?.lists ?? fallbackLists; + const headerSection = normalizeHeaderFooterMap( + 'Headers', + tab.documentTab?.headers, + lists, + tab.documentTab?.documentStyle, + ); + const bodySection = normalizeStructuralElements(tab.documentTab?.body?.content ?? 
[], lists).join('\n\n').trim(); + const footerSection = normalizeHeaderFooterMap( + 'Footers', + tab.documentTab?.footers, + lists, + tab.documentTab?.documentStyle, + ); + const content = joinNonEmptySections([headerSection, bodySection, footerSection]); + if (!content) { + return null; + } + const title = tab.tabProperties?.title?.trim(); + if (!title) { + return content; + } + return [`# ${escapeMarkdownText(title)}`, content].join('\n\n').trim(); +} + +export function normalizeGoogleDocToMarkdown(document: GoogleDocsDocument): string { + const normalizedTabs = flattenGoogleDocsTabs(document.tabs) + .map((tab) => normalizeTab(tab, document.lists)) + .filter((tab): tab is string => Boolean(tab)); + if (normalizedTabs.length > 0) { + return normalizedTabs.join('\n\n').trim(); + } + const bodySection = normalizeStructuralElements(document.body?.content ?? [], document.lists).join('\n\n').trim(); + return ( + joinNonEmptySections([ + normalizeHeaderFooterMap('Headers', document.headers, document.lists, document.documentStyle), + bodySection, + normalizeHeaderFooterMap('Footers', document.footers, document.lists, document.documentStyle), + ]) ?? 
'' + ); +} diff --git a/packages/cli/src/context/ingest/adapters/gdrive/types.ts b/packages/cli/src/context/ingest/adapters/gdrive/types.ts new file mode 100644 index 00000000..abfd7047 --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/gdrive/types.ts @@ -0,0 +1,167 @@ +import { z } from 'zod'; + +const GDRIVE_DOCS_SCOPE = 'https://www.googleapis.com/auth/documents.readonly'; +const GDRIVE_DRIVE_SCOPE = 'https://www.googleapis.com/auth/drive.readonly'; +export const GDRIVE_SCOPES = [GDRIVE_DRIVE_SCOPE, GDRIVE_DOCS_SCOPE] as const; +export const GDRIVE_SOURCE_KEY = 'gdrive'; +export const GDRIVE_DOC_MIME_TYPE = 'application/vnd.google-apps.document'; + +export const gdrivePullConfigSchema = z.object({ + serviceAccountKey: z.string().min(1), + folderId: z.string().min(1), + recursive: z.boolean().default(false), +}); +export type GdrivePullConfig = z.infer; + +export const gdriveManifestSchema = z.object({ + source: z.literal(GDRIVE_SOURCE_KEY), + folderId: z.string().min(1), + recursive: z.boolean(), + fetchedAt: z.string().datetime(), + fileCount: z.number().int().nonnegative(), + skipped: z.array(z.object({ externalId: z.string(), reason: z.string() })).default([]), + warnings: z.array(z.string()).default([]), +}); +export type GdriveManifest = z.infer; + +export const gdriveMetadataSchema = z.object({ + id: z.string(), + title: z.string(), + path: z.string(), + url: z.string().nullable().default(null), + mimeType: z.literal(GDRIVE_DOC_MIME_TYPE), + folderId: z.string(), + drivePath: z.array(z.string()).default([]), + modifiedTime: z.string().datetime().nullable().default(null), +}); + +export const gdriveServiceAccountKeySchema = z.object({ + client_email: z.string().email(), + private_key: z.string().min(1), + project_id: z.string().min(1).optional(), +}); +export type GdriveServiceAccountKey = z.infer; + +export interface GdriveFileRecord { + id: string; + name: string; + mimeType: string; + parents: string[]; + webViewLink: string | null; + 
modifiedTime: string | null; +} + +export interface GoogleDocsDocument { + documentId?: string; + title?: string; + body?: { + content?: GoogleDocsStructuralElement[]; + }; + documentStyle?: GoogleDocsDocumentStyle; + lists?: Record; + headers?: Record; + footers?: Record; + tabs?: GoogleDocsTab[]; +} + +export interface GoogleDocsList { + listProperties?: { + nestingLevels?: GoogleDocsListNestingLevel[]; + }; +} + +interface GoogleDocsListNestingLevel { + glyphType?: string; + glyphSymbol?: string; +} + +export interface GoogleDocsTab { + tabProperties?: { + tabId?: string; + title?: string; + }; + childTabs?: GoogleDocsTab[]; + documentTab?: { + body?: { + content?: GoogleDocsStructuralElement[]; + }; + documentStyle?: GoogleDocsDocumentStyle; + lists?: Record; + headers?: Record; + footers?: Record; + }; +} + +export interface GoogleDocsDocumentStyle { + defaultHeaderId?: string; + defaultFooterId?: string; + firstPageHeaderId?: string; + firstPageFooterId?: string; + evenPageHeaderId?: string; + evenPageFooterId?: string; +} + +export interface GoogleDocsHeaderFooter { + headerId?: string; + footerId?: string; + content?: GoogleDocsStructuralElement[]; +} + +export interface GoogleDocsStructuralElement { + paragraph?: GoogleDocsParagraph; + table?: GoogleDocsTable; + sectionBreak?: unknown; +} + +export interface GoogleDocsTable { + tableRows?: GoogleDocsTableRow[]; +} + +interface GoogleDocsTableRow { + tableCells?: GoogleDocsTableCell[]; +} + +export interface GoogleDocsTableCell { + content?: GoogleDocsStructuralElement[]; +} + +export interface GoogleDocsParagraph { + elements?: GoogleDocsParagraphElement[]; + bullet?: { + listId?: string; + nestingLevel?: number; + }; + paragraphStyle?: { + namedStyleType?: string; + headingId?: string; + }; +} + +export interface GoogleDocsLinkTarget { + id?: string; + tabId?: string; +} + +export interface GoogleDocsParagraphElement { + textRun?: { + content?: string; + textStyle?: { + bold?: boolean; + italic?: boolean; 
+ underline?: boolean; + strikethrough?: boolean; + link?: { + url?: string; + tabId?: string; + headingId?: string; + bookmarkId?: string; + heading?: GoogleDocsLinkTarget; + bookmark?: GoogleDocsLinkTarget; + }; + weightedFontFamily?: { fontFamily?: string }; + baselineOffset?: 'SUPERSCRIPT' | 'SUBSCRIPT' | string; + }; + }; + inlineObjectElement?: unknown; + pageBreak?: unknown; +} diff --git a/packages/cli/src/context/ingest/artifact-gates.ts b/packages/cli/src/context/ingest/artifact-gates.ts index a67f8455..52b2df85 100644 --- a/packages/cli/src/context/ingest/artifact-gates.ts +++ b/packages/cli/src/context/ingest/artifact-gates.ts @@ -21,6 +21,10 @@ export interface ProvenanceRawPathValidationInput { deletedRawPaths: Set; } +function normalizeRawPath(path: string): string { + return path.replace(/\\/g, '/').replace(/^\/+/, ''); +} + function parseSlRef(ref: string): { connectionId: string | null; sourceName: string; entityName: string | null } { const withoutConnection = ref.includes('/') ? ref.slice(ref.indexOf('/') + 1) : ref; const connectionId = ref.includes('/') ? 
ref.slice(0, ref.indexOf('/')) : null; @@ -132,8 +136,11 @@ export async function validateFinalIngestArtifacts(input: FinalArtifactGateInput } export function validateProvenanceRawPaths(input: ProvenanceRawPathValidationInput): void { + const currentRawPaths = new Set([...input.currentRawPaths].map(normalizeRawPath)); + const deletedRawPaths = new Set([...input.deletedRawPaths].map(normalizeRawPath)); for (const row of input.rows) { - if (!input.currentRawPaths.has(row.rawPath) && !input.deletedRawPaths.has(row.rawPath)) { + const rawPath = normalizeRawPath(row.rawPath); + if (!currentRawPaths.has(rawPath) && !deletedRawPaths.has(rawPath)) { throw new Error(`provenance row references raw path outside this snapshot: ${row.rawPath}`); } } diff --git a/packages/cli/src/context/ingest/local-adapters.ts b/packages/cli/src/context/ingest/local-adapters.ts index 3cd8a998..0dded4cb 100644 --- a/packages/cli/src/context/ingest/local-adapters.ts +++ b/packages/cli/src/context/ingest/local-adapters.ts @@ -1,4 +1,5 @@ import { join } from 'node:path'; +import { gdriveConnectionToPullConfig, parseGdriveConnectionConfig } from '../../context/connections/gdrive-config.js'; import { localConnectionToWarehouseDescriptor } from '../../context/connections/local-warehouse-descriptor.js'; import { notionConnectionToPullConfig, parseNotionConnectionConfig } from '../../context/connections/notion-config.js'; import { resolveKtxConfigReference } from '../core/config-reference.js'; @@ -7,6 +8,7 @@ import type { KtxLocalProject } from '../../context/project/project.js'; import type { SqlAnalysisPort } from '../../context/sql-analysis/ports.js'; import { DbtSourceAdapter } from './adapters/dbt/dbt.adapter.js'; import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js'; +import { GdriveSourceAdapter } from './adapters/gdrive/gdrive.adapter.js'; import { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js'; import { PostgresPgssReader } from 
'./adapters/historic-sql/postgres-pgss-reader.js'; import { resolveQueryHistoryScopeFloor } from './adapters/historic-sql/scope-floor.js'; @@ -103,6 +105,7 @@ export function createDefaultLocalIngestAdapters( createLocalMetabaseSourceAdapter(project, { ...(options.logger ? { logger: options.logger } : {}), }), + new GdriveSourceAdapter(), new LookerSourceAdapter({ clientFactory: { async createClient(config, ctx) { @@ -330,6 +333,9 @@ export async function localPullConfigForAdapter( lastSuccessfulCursor: await localNotionRuntimeStore(project).readCursor(connectionId), }; } + if (adapter.source === 'gdrive') { + return await gdriveConnectionToPullConfig(parseGdriveConnectionConfig(connection)); + } if (adapter.source === 'metricflow') { const metricflow = connection.metricflow; const metricflowConfig = diff --git a/packages/cli/src/context/ingest/tools/read-raw-file.tool.ts b/packages/cli/src/context/ingest/tools/read-raw-file.tool.ts index 4f84f28d..49304001 100644 --- a/packages/cli/src/context/ingest/tools/read-raw-file.tool.ts +++ b/packages/cli/src/context/ingest/tools/read-raw-file.tool.ts @@ -1,5 +1,5 @@ import { readFile, stat } from 'node:fs/promises'; -import { join, normalize, resolve } from 'node:path'; +import { isAbsolute, join, normalize, relative, resolve } from 'node:path'; import { tool } from 'ai'; import { z } from 'zod'; @@ -10,8 +10,13 @@ interface ReadRawFileDeps { const MAX_READ_RAW_FILE_BYTES = 120_000; +function normalizeRawPath(path: string): string { + return normalize(path).replace(/^[/\\]+/, '').replace(/\\/g, '/'); +} + export function createReadRawFileTool(deps: ReadRawFileDeps) { const stagedRoot = resolve(deps.stagedDir); + const allowedPaths = new Set([...deps.allowedPaths].map(normalizeRawPath)); return tool({ description: "Read the full text content of a raw source file inside this WorkUnit. 
`path` must be relative to the staged bundle root (no leading slash, no `..`) and must appear in the WorkUnit's rawFiles or dependencyPaths list.", @@ -19,12 +24,13 @@ export function createReadRawFileTool(deps: ReadRawFileDeps) { path: z.string().describe('Path relative to the staged bundle root. Example: "views/customers/customer.lkml".'), }), execute: async ({ path }) => { - const normalized = normalize(path).replace(/^[/\\]+/, ''); - if (normalized.startsWith('..') || !deps.allowedPaths.has(normalized)) { - return `Error: path "${path}" is not accessible from this WorkUnit. Allowed paths: ${[...deps.allowedPaths].sort().join(', ')}`; + const normalized = normalizeRawPath(path); + if (normalized.startsWith('..') || !allowedPaths.has(normalized)) { + return `Error: path "${path}" is not accessible from this WorkUnit. Allowed paths: ${[...allowedPaths].sort().join(', ')}`; } const absolute = resolve(join(stagedRoot, normalized)); - if (!absolute.startsWith(`${stagedRoot}/`) && absolute !== stagedRoot) { + const stagedRelative = relative(stagedRoot, absolute); + if (stagedRelative.startsWith('..') || isAbsolute(stagedRelative)) { return `Error: path "${path}" is not accessible from this WorkUnit.`; } try { diff --git a/packages/cli/src/context/ingest/tools/read-raw-span.tool.ts b/packages/cli/src/context/ingest/tools/read-raw-span.tool.ts index 21da54d1..04a22c33 100644 --- a/packages/cli/src/context/ingest/tools/read-raw-span.tool.ts +++ b/packages/cli/src/context/ingest/tools/read-raw-span.tool.ts @@ -1,5 +1,5 @@ import { readFile } from 'node:fs/promises'; -import { join, normalize, resolve } from 'node:path'; +import { isAbsolute, join, normalize, relative, resolve } from 'node:path'; import { tool } from 'ai'; import { z } from 'zod'; @@ -8,8 +8,13 @@ interface ReadRawSpanDeps { allowedPaths: Set; } +function normalizeRawPath(path: string): string { + return normalize(path).replace(/^[/\\]+/, '').replace(/\\/g, '/'); +} + export function 
createReadRawSpanTool(deps: ReadRawSpanDeps) { const stagedRoot = resolve(deps.stagedDir); + const allowedPaths = new Set([...deps.allowedPaths].map(normalizeRawPath)); return tool({ description: 'Read a 1-based inclusive line range from a raw source file. Use this to resolve a provenance pointer like `file.lkml#L15-28` without loading the whole file into context.', @@ -22,12 +27,13 @@ export function createReadRawSpanTool(deps: ReadRawSpanDeps) { if (startLine > endLine) { return `Error: startLine must be <= endLine (got startLine=${startLine}, endLine=${endLine})`; } - const normalized = normalize(path).replace(/^[/\\]+/, ''); - if (normalized.startsWith('..') || !deps.allowedPaths.has(normalized)) { - return `Error: path "${path}" is not accessible from this context. Allowed paths: ${[...deps.allowedPaths].sort().join(', ')}`; + const normalized = normalizeRawPath(path); + if (normalized.startsWith('..') || !allowedPaths.has(normalized)) { + return `Error: path "${path}" is not accessible from this context. Allowed paths: ${[...allowedPaths].sort().join(', ')}`; } const absolute = resolve(join(stagedRoot, normalized)); - if (!absolute.startsWith(`${stagedRoot}/`) && absolute !== stagedRoot) { + const stagedRelative = relative(stagedRoot, absolute); + if (stagedRelative.startsWith('..') || isAbsolute(stagedRelative)) { return `Error: path "${path}" is not accessible from this context.`; } try { diff --git a/packages/cli/src/context/project/driver-schemas.ts b/packages/cli/src/context/project/driver-schemas.ts index 72b4344e..1b2434c6 100644 --- a/packages/cli/src/context/project/driver-schemas.ts +++ b/packages/cli/src/context/project/driver-schemas.ts @@ -168,6 +168,18 @@ const notionConnectionSchema = z }) .describe('Notion context-source connection.'); +const gdriveConnectionSchema = z + .looseObject({ + driver: z.literal('gdrive'), + service_account_key_ref: z + .string() + .min(1) + .describe('Reference to a Google service-account JSON key file. 
Must use file:/absolute/path/to/key.json.'), + folder_id: z.string().min(1).describe('Google Drive folder ID to ingest.'), + recursive: z.boolean().optional().describe('When true, recursively traverse subfolders beneath folder_id.'), + }) + .describe('Google Drive Google Docs context-source connection.'); + const dbtConnectionSchema = z .looseObject({ driver: z.literal('dbt'), @@ -202,6 +214,7 @@ export const connectionConfigSchema = z.discriminatedUnion('driver', [ lookerConnectionSchema, lookmlConnectionSchema, notionConnectionSchema, + gdriveConnectionSchema, dbtConnectionSchema, metricflowConnectionSchema, ]); diff --git a/packages/cli/src/public-ingest.ts b/packages/cli/src/public-ingest.ts index 997cc0cc..b38107c3 100644 --- a/packages/cli/src/public-ingest.ts +++ b/packages/cli/src/public-ingest.ts @@ -133,6 +133,7 @@ const sourceAdapterByDriver = new Map([ ['local_metabase', 'metabase'], ['looker', 'looker'], ['notion', 'notion'], + ['gdrive', 'gdrive'], ['metricflow', 'metricflow'], ['dbt', 'dbt'], ['lookml', 'lookml'], diff --git a/packages/cli/src/setup-sources.ts b/packages/cli/src/setup-sources.ts index 70f42a67..5c071a81 100644 --- a/packages/cli/src/setup-sources.ts +++ b/packages/cli/src/setup-sources.ts @@ -3,8 +3,16 @@ import { tmpdir } from 'node:os'; import { join, relative, resolve } from 'node:path'; import { fileURLToPath, pathToFileURL } from 'node:url'; import { localConnectionTypeForConfig } from './context/connections/local-warehouse-descriptor.js'; +import { + parseGdriveConnectionConfig, + resolveGdriveServiceAccountKey, +} from './context/connections/gdrive-config.js'; import { resolveNotionConnectionAuthToken } from './context/connections/notion-config.js'; import { resolveKtxConfigReference } from './context/core/config-reference.js'; +import { + createGoogleDocsClients, +} from './context/ingest/adapters/gdrive/gdrive-client.js'; +import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from 
'./context/ingest/adapters/gdrive/types.js'; import { cloneOrPull, testRepoConnection } from './context/ingest/repo-fetch.js'; import { DEFAULT_METABASE_CLIENT_CONFIG, MetabaseClient } from './context/ingest/adapters/metabase/client.js'; import { discoverMetabaseDatabases, type DiscoveredMetabaseDatabase } from './context/ingest/adapters/metabase/mapping.js'; @@ -37,7 +45,7 @@ import { type KtxSetupPromptOption, } from './setup-prompts.js'; -export type KtxSetupSourceType = 'dbt' | 'metricflow' | 'metabase' | 'looker' | 'lookml' | 'notion'; +export type KtxSetupSourceType = 'dbt' | 'metricflow' | 'metabase' | 'looker' | 'lookml' | 'notion' | 'gdrive'; const DEFAULT_NOTION_MAX_KNOWLEDGE_CREATES_PER_RUN = 25; @@ -62,6 +70,9 @@ export interface KtxSetupSourcesArgs { metabaseDatabaseId?: number; notionCrawlMode?: 'all_accessible' | 'selected_roots'; notionRootPageIds?: string[]; + gdriveServiceAccountKeyRef?: string; + gdriveFolderId?: string; + gdriveRecursive?: boolean; runInitialSourceIngest: boolean; skipSources: boolean; } @@ -103,6 +114,7 @@ export interface KtxSetupSourcesDeps { validateLooker?: (projectDir: string, connectionId: string) => Promise; validateLookml?: (connection: KtxProjectConnectionConfig) => Promise; validateNotion?: (connection: KtxProjectConnectionConfig) => Promise; + validateGdrive?: (connection: KtxProjectConnectionConfig) => Promise; pickNotionRootPages?: typeof pickNotionRootPages; discoverMetabaseDatabases?: (args: { sourceUrl: string; @@ -125,6 +137,7 @@ const SOURCE_OPTIONS: Array<{ value: KtxSetupSourceType; label: string }> = [ { value: 'metricflow', label: 'MetricFlow' }, { value: 'looker', label: 'Looker' }, { value: 'lookml', label: 'LookML' }, + { value: 'gdrive', label: 'Google Drive' }, ]; const SOURCE_LABELS = Object.fromEntries(SOURCE_OPTIONS.map((option) => [option.value, option.label])) as Record< @@ -218,8 +231,10 @@ function credentialRef(value: string | undefined, label: string): string { return ref; } +type 
SharedSourceCredentialField = 'sourceAuthTokenRef' | 'sourceApiKeyRef' | 'sourceClientSecretRef'; + type SourceCredentialFlag = { - field: 'sourceAuthTokenRef' | 'sourceApiKeyRef' | 'sourceClientSecretRef'; + field: SharedSourceCredentialField | null; flag: string; }; @@ -232,9 +247,10 @@ const SOURCE_CREDENTIAL_FLAG: Record = notion: { field: 'sourceAuthTokenRef', flag: '--source-auth-token-ref' }, metabase: { field: 'sourceApiKeyRef', flag: '--source-api-key-ref' }, looker: { field: 'sourceClientSecretRef', flag: '--source-client-secret-ref' }, + gdrive: { field: null, flag: '--gdrive-service-account-key-ref' }, }; -const ALL_SOURCE_CREDENTIAL_FLAGS: SourceCredentialFlag[] = [ +const ALL_SOURCE_CREDENTIAL_FLAGS: Array<{ field: SharedSourceCredentialField; flag: string }> = [ { field: 'sourceAuthTokenRef', flag: '--source-auth-token-ref' }, { field: 'sourceApiKeyRef', flag: '--source-api-key-ref' }, { field: 'sourceClientSecretRef', flag: '--source-client-secret-ref' }, @@ -560,6 +576,22 @@ function buildNotionConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionC }; } +function buildGdriveConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionConfig { + const folderId = args.gdriveFolderId?.trim(); + if (!folderId) { + throw new Error('Google Drive setup requires --gdrive-folder-id.'); + } + return { + driver: 'gdrive', + service_account_key_ref: credentialRef( + args.gdriveServiceAccountKeyRef, + 'Google Drive service account key ref', + ), + folder_id: folderId, + recursive: args.gdriveRecursive === true, + }; +} + function sourcePathFromFileRepoUrl(repoUrl: string, subpath?: string): string { const root = fileURLToPath(repoUrl); return subpath ? 
join(root, subpath) : root; @@ -680,6 +712,17 @@ async function defaultValidateNotion(connection: KtxProjectConnectionConfig): Pr return { ok: true, detail: `roots=${roots.length}` }; } +async function defaultValidateGdrive(connection: KtxProjectConnectionConfig): Promise { + const config = parseGdriveConnectionConfig(connection); + const keyText = await resolveGdriveServiceAccountKey(config.service_account_key_ref); + const clients = createGoogleDocsClients(gdriveServiceAccountKeySchema.parse(JSON.parse(keyText))); + const result = await clients.drive.listFiles({ + q: `'${config.folder_id}' in parents and trashed = false`, + }); + const docs = result.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length; + return { ok: true, detail: `docs=${docs}` }; +} + interface MappingJsonOutput { connectionId: string; refresh: { ok: boolean; output: string[] }; @@ -1329,67 +1372,105 @@ async function promptForInteractiveSource( ]); } - return await runSourcePromptSteps(initialState, (state) => [ + if (source === 'notion') { + return await runSourcePromptSteps(initialState, (state) => [ + ...connectionSteps, + async (currentState) => { + const ref = await chooseSourceCredentialRef({ + prompts, + projectDir: args.projectDir, + label: 'Notion integration token', + envName: 'NOTION_TOKEN', + secretFileName: `${currentState.sourceConnectionId ?? 
'notion-main'}-token`, + existingRef: currentState.sourceAuthTokenRef, + }); + if (ref === 'back') return 'back'; + currentState.sourceAuthTokenRef = ref; + return 'next'; + }, + async (currentState) => { + const crawlMode = await prompts.select({ + message: 'Which Notion pages should ktx ingest?', + options: [ + { value: 'all_accessible', label: 'All pages the integration can access' }, + { value: 'selected_roots', label: 'Specific pages and their subpages (choose them in a picker)' }, + { value: 'back', label: 'Back' }, + ], + }); + if (crawlMode === 'back') return 'back'; + currentState.notionCrawlMode = crawlMode === 'all_accessible' ? 'all_accessible' : 'selected_roots'; + if (currentState.notionCrawlMode === 'all_accessible') { + delete currentState.notionRootPageIds; + } + return 'next'; + }, + ...(state.notionCrawlMode === 'selected_roots' + ? [ + async (currentState: SourcePromptState) => { + const connectionId = currentState.sourceConnectionId ?? 'notion-main'; + const result = await (deps.pickNotionRootPages ?? pickNotionRootPages)( + { + connectionId, + connection: { + driver: 'notion', + auth_token_ref: credentialRef(currentState.sourceAuthTokenRef, 'Notion token ref'), + crawl_mode: 'selected_roots', + root_page_ids: currentState.notionRootPageIds ?? [], + root_database_ids: [], + root_data_source_ids: [], + }, + }, + io, + ); + if (result.kind === 'back') { + return 'back'; + } + if (result.kind === 'unavailable') { + io.stderr.write(`${result.message}\n`); + return 'back'; + } + currentState.notionRootPageIds = result.rootPageIds; + return 'next'; + }, + ] + : []), + ]); + } + + return await runSourcePromptSteps(initialState, () => [ ...connectionSteps, async (currentState) => { - const ref = await chooseSourceCredentialRef({ - prompts, - projectDir: args.projectDir, - label: 'Notion integration token', - envName: 'NOTION_TOKEN', - secretFileName: `${currentState.sourceConnectionId ?? 
'notion-main'}-token`, - existingRef: currentState.sourceAuthTokenRef, + const keyRef = await promptText(prompts, { + message: 'Google Drive service account key file reference', + placeholder: 'file:/absolute/path/to/key.json', + ...(currentState.gdriveServiceAccountKeyRef ? { initialValue: currentState.gdriveServiceAccountKeyRef } : {}), }); - if (ref === 'back') return 'back'; - currentState.sourceAuthTokenRef = ref; + if (keyRef === undefined) return 'back'; + currentState.gdriveServiceAccountKeyRef = keyRef.trim(); return 'next'; }, async (currentState) => { - const crawlMode = await prompts.select({ - message: 'Which Notion pages should ktx ingest?', + const folderId = await promptText(prompts, { + message: 'Google Drive folder id', + ...(currentState.gdriveFolderId ? { initialValue: currentState.gdriveFolderId } : {}), + }); + if (folderId === undefined) return 'back'; + currentState.gdriveFolderId = folderId.trim(); + return 'next'; + }, + async (currentState) => { + const recursive = await prompts.select({ + message: 'Include Google Docs from subfolders?', options: [ - { value: 'all_accessible', label: 'All pages the integration can access' }, - { value: 'selected_roots', label: 'Specific pages and their subpages (choose them in a picker)' }, + { value: 'false', label: 'No' }, + { value: 'true', label: 'Yes' }, { value: 'back', label: 'Back' }, ], }); - if (crawlMode === 'back') return 'back'; - currentState.notionCrawlMode = crawlMode === 'all_accessible' ? 'all_accessible' : 'selected_roots'; - if (currentState.notionCrawlMode === 'all_accessible') { - delete currentState.notionRootPageIds; - } + if (recursive === 'back') return 'back'; + currentState.gdriveRecursive = recursive === 'true'; return 'next'; }, - ...(state.notionCrawlMode === 'selected_roots' - ? [ - async (currentState: SourcePromptState) => { - const connectionId = currentState.sourceConnectionId ?? 'notion-main'; - const result = await (deps.pickNotionRootPages ?? 
pickNotionRootPages)( - { - connectionId, - connection: { - driver: 'notion', - auth_token_ref: credentialRef(currentState.sourceAuthTokenRef, 'Notion token ref'), - crawl_mode: 'selected_roots', - root_page_ids: currentState.notionRootPageIds ?? [], - root_database_ids: [], - root_data_source_ids: [], - }, - }, - io, - ); - if (result.kind === 'back') { - return 'back'; - } - if (result.kind === 'unavailable') { - io.stderr.write(`${result.message}\n`); - return 'back'; - } - currentState.notionRootPageIds = result.rootPageIds; - return 'next'; - }, - ] - : []), ]); } @@ -1559,6 +1640,13 @@ function sourceArgsFromExistingConnection(input: { return sourceArgs; } + if (input.source === 'gdrive') { + sourceArgs.gdriveServiceAccountKeyRef = stringField(input.connection.service_account_key_ref); + sourceArgs.gdriveFolderId = stringField(input.connection.folder_id); + sourceArgs.gdriveRecursive = input.connection.recursive === true; + return sourceArgs; + } + sourceArgs.sourceAuthTokenRef = stringField(input.connection.auth_token_ref); sourceArgs.notionCrawlMode = input.connection.crawl_mode === 'all_accessible' ? 'all_accessible' : 'selected_roots'; @@ -1740,7 +1828,10 @@ function buildConnection(source: KtxSetupSourceType, args: KtxSetupSourcesArgs): if (source === 'lookml') { return buildLookmlConnection(args); } - return buildNotionConnection(args); + if (source === 'notion') { + return buildNotionConnection(args); + } + return buildGdriveConnection(args); } async function validateSource( @@ -1765,7 +1856,10 @@ async function validateSource( if (source === 'lookml') { return await (deps.validateLookml ?? defaultValidateLookml)(args.connection); } - return await (deps.validateNotion ?? defaultValidateNotion)(args.connection); + if (source === 'notion') { + return await (deps.validateNotion ?? defaultValidateNotion)(args.connection); + } + return await (deps.validateGdrive ?? 
defaultValidateGdrive)(args.connection); } async function createSourceSetupRollback(projectDir: string): Promise<() => Promise> { diff --git a/packages/cli/src/setup.ts b/packages/cli/src/setup.ts index dd893ce7..ff1d6846 100644 --- a/packages/cli/src/setup.ts +++ b/packages/cli/src/setup.ts @@ -126,6 +126,9 @@ export type KtxSetupArgs = metabaseDatabaseId?: number; notionCrawlMode?: 'all_accessible' | 'selected_roots'; notionRootPageIds?: string[]; + gdriveServiceAccountKeyRef?: string; + gdriveFolderId?: string; + gdriveRecursive?: boolean; runInitialSourceIngest?: boolean; skipSources?: boolean; showEntryMenu?: boolean; @@ -167,7 +170,7 @@ export interface KtxSetupDeps { setupUi?: KtxSetupUiAdapter; } -const SOURCE_DRIVERS = new Set(['dbt', 'metricflow', 'metabase', 'looker', 'lookml', 'notion']); +const SOURCE_DRIVERS = new Set(['dbt', 'metricflow', 'metabase', 'looker', 'lookml', 'notion', 'gdrive']); const KTX_DOCS_URL = 'https://docs.kaelio.com/ktx'; type KtxSetupEntryAction = 'setup' | 'new-project' | 'agents' | 'status' | 'demo' | 'exit'; @@ -873,6 +876,11 @@ async function runKtxSetupInner(args: KtxSetupArgs, io: KtxCliIo, deps: KtxSetup ...(args.metabaseDatabaseId !== undefined ? { metabaseDatabaseId: args.metabaseDatabaseId } : {}), ...(args.notionCrawlMode ? { notionCrawlMode: args.notionCrawlMode } : {}), ...(args.notionRootPageIds ? { notionRootPageIds: args.notionRootPageIds } : {}), + ...(args.gdriveServiceAccountKeyRef + ? { gdriveServiceAccountKeyRef: args.gdriveServiceAccountKeyRef } + : {}), + ...(args.gdriveFolderId ? { gdriveFolderId: args.gdriveFolderId } : {}), + ...(args.gdriveRecursive !== undefined ? { gdriveRecursive: args.gdriveRecursive } : {}), runInitialSourceIngest: args.runInitialSourceIngest ?? 
false, skipSources: args.skipSources === true || !shouldRunSources || skipSourcesFromDatabaseMenu, }, diff --git a/packages/cli/src/skills/gdrive_synthesize/SKILL.md b/packages/cli/src/skills/gdrive_synthesize/SKILL.md new file mode 100644 index 00000000..e9557b28 --- /dev/null +++ b/packages/cli/src/skills/gdrive_synthesize/SKILL.md @@ -0,0 +1,97 @@ +--- +name: gdrive_synthesize +description: Synthesize durable KTX wiki pages from staged Google Drive document pulls. Load when a WorkUnit contains Google Doc raw files from `docs/**`. +callers: [memory_agent] +--- + +# Google Drive Doc Synthesis + +Use this skill when a WorkUnit contains staged Google Drive content from `docs/**`. + +## Role + +Each WorkUnit is one Google Doc plus its metadata. Read the assigned raw files, then write a small set of durable wiki entries that capture reusable organizational knowledge. Write final memory directly; do not write candidates. + +## Required Workflow + +1. Read the WorkUnit notes and `rawFiles` list. Document content lives in `page.md`; `metadata.json` holds title, path, url, modified time, and Drive folder context. +2. For each assigned doc, call `read_raw_file`, or `read_raw_span` for oversized docs when the notes specify a span. +3. Search `wiki_search` for existing pages that overlap the WorkUnit topics. Prefer updating an existing page over creating a duplicate. +4. Use `context_evidence_search`, `context_evidence_read`, and `context_evidence_neighbors` when indexed document chunks would help reconcile related facts. Pass `chunkId` and `documentId` values verbatim as returned by the evidence tools. +5. Write durable business knowledge with `wiki_write`. Aim for a small number of high-quality pages per doc. Include `rawPaths` with the exact Google Drive raw files that support each page. +6. 
If a doc references warehouse, dbt, Looker, Metabase, or MetricFlow objects, you may verify them with `discover_data`, `entity_details`, `sql_execution`, `sl_discover`, or `sl_read_source`, but Google Drive docs are knowledge-only in v1. Do not create semantic-layer sources under the `gdrive` connection. +7. For every deleted raw path in the Eviction Set, call `eviction_list`, decide retention, then `emit_eviction_decision`. Do this even when no wiki write is needed. + +## What To Capture + +Capture durable, reusable company knowledge: + +- policies, workflows, process rules, ownership conventions, and operating procedures +- product definitions, business terminology, and organizational guidance +- source-of-truth statements, caveats, conflict notes, and supersession guidance +- cross-system aliases that connect doc terminology to warehouse, dbt, Looker, Metabase, or MetricFlow names + +Skip noisy or transient content: + +- brainstorming notes with no durable rule +- task lists, meeting scheduling details, and time-bounded status updates +- duplicate docs with no new fact +- shallow summaries that add no reusable policy or definition + +## Quality + +Prefer fewer, stronger entries. Every wiki entry must cite at least one Google Doc using its title or path and last modified date when available. When evidence conflicts, write a conflict note inside the wiki page rather than choosing silently. + +If one doc covers several related ideas, synthesize the shared durable rules instead of writing one thin page per paragraph. For oversized spans, read only the assigned span unless the WorkUnit explicitly asks for neighboring context. + +Search existing wiki pages for the same `tables:` or `sl_refs:` frontmatter and for source-of-truth aliases before creating a new page. If an existing page already documents the same warehouse object or business concept, update it instead of creating a differently named duplicate. 
+ +## Citation Style + +```md +## Agentic Harness +- The harness provides the operational framework that turns an agent prototype into a production system. +- Source: Google Doc - Herness, last modified 2026-05-24. +- Conflict note: An older internal note uses a narrower definition focused only on tool wiring; treat the current Google Doc as the durable operating definition unless replaced explicitly. +``` + +## Semantic-Layer Rules + +- Google Drive docs are knowledge-only in v1; keep durable output in wiki pages. +- Do not create semantic-layer sources under the `gdrive` connection. +- If a doc references an existing warehouse or semantic-layer object and you can verify it, you may attach `sl_refs` in wiki output after confirmation. +- If a doc mentions a table or source that cannot be verified, keep the identifier in wiki text as unverified or use `emit_unmapped_fallback` only when the missing physical object itself is the important durable fact. + +## Identifier Verification Protocol + +Before writing a wiki page on any topic: + +1. `discover_data({query: "<topic>"})` - see what wikis, SL sources, and raw + tables already exist. Prefer updating existing pages over creating new ones. + +Before emitting any `schema.table` or `schema.table.column` into a wiki body, +`tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: + +2. `entity_details({connectionId, targets: [{display: "<schema.table>"}]})` - + confirm the identifier resolves; inspect native types, FK/PK, and + sampleValues. +3. For literal values from the doc, such as status codes or plan tiers, + check whether they appear in `entity_details` sampleValues for the relevant + column. If sampleValues is short or the sample may have missed real values, + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT <column> FROM <schema.table> LIMIT 50"})`. +4.
If the candidate identifier still does not resolve, do one of: + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM <schema.table> LIMIT 0"})`. + If it errors, the identifier is fictional. + - Wrap the identifier in `[unverified - from <raw path>]` in the wiki body, + citing the exact raw path that mentioned it. + - When recording `emit_unmapped_fallback` with `no_physical_table`, include + the failing probe error in `clarification`. +5. Never copy `<schema>.<table>` placeholder strings from these instructions + into output. + +## Tools + +Allowed: `read_raw_file`, `read_raw_span`, `wiki_search`, `wiki_read`, `wiki_write`, `discover_data`, `entity_details`, `sql_execution`, `sl_discover`, `sl_read_source`, `context_evidence_search`, `context_evidence_read`, `context_evidence_neighbors`, `emit_unmapped_fallback`, `eviction_list`, `emit_eviction_decision`. + +Not allowed: `context_candidate_write`, `context_candidate_mark`, `sl_write_source`, `sl_edit_source`, `sl_validate`. diff --git a/packages/cli/test/connection.test.ts b/packages/cli/test/connection.test.ts index 22c8bbe9..aab39554 100644 --- a/packages/cli/test/connection.test.ts +++ b/packages/cli/test/connection.test.ts @@ -416,6 +416,38 @@ describe('runKtxConnection', () => { expect(io.stdout()).toContain('Bot: bot-1'); }); + it('tests a Google Drive connection by listing visible Google Docs', async () => { + const projectDir = join(tempDir, 'project'); + await initKtxProject({ projectDir }); + await writeConnections(projectDir, { + docs_drive: { + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/gdrive-key.json', // pragma: allowlist secret + folder_id: 'folder-123', + recursive: false, + }, + }); + const listFiles = vi.fn(async () => ({ + files: [ + { id: '1', name: 'Spec', mimeType: 'application/vnd.google-apps.document', parents: [], webViewLink: null, modifiedTime: null }, + { id: '2', name: 'Sheet', mimeType: 'application/vnd.google-apps.spreadsheet', parents: [], webViewLink: null, modifiedTime: null }, + ], + nextPageToken:
null, + })); + const createGdriveClient = vi.fn(async () => ({ listFiles })); + const io = makeIo(); + + await expect( + runKtxConnection({ command: 'test', projectDir, connectionId: 'docs_drive' }, io.io, { createGdriveClient }), + ).resolves.toBe(0); + + expect(createGdriveClient).toHaveBeenCalledWith(expect.objectContaining({ projectDir }), 'docs_drive'); + expect(listFiles).toHaveBeenCalledWith({ q: "'folder-123' in parents and trashed = false" }); + expect(io.stdout()).toContain('Connection test passed: docs_drive'); + expect(io.stdout()).toContain('Driver: gdrive'); + expect(io.stdout()).toContain('Docs: 1'); + }); + it('tests a dbt connection via testRepoConnection (success)', async () => { const projectDir = join(tempDir, 'project'); await initKtxProject({ projectDir }); @@ -543,6 +575,34 @@ describe('runKtxConnection', () => { expect(io.stderr()).toBe(''); }); + it('--all: includes Google Drive rows in the summary table', async () => { + const projectDir = join(tempDir, 'project'); + await initKtxProject({ projectDir }); + await writeConnections(projectDir, { + docs_drive: { + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/gdrive-key.json', // pragma: allowlist secret + folder_id: 'folder-123', + recursive: false, + }, + }); + const createGdriveClient = vi.fn(async () => ({ + listFiles: vi.fn(async () => ({ + files: [ + { id: '1', name: 'Spec', mimeType: 'application/vnd.google-apps.document', parents: [], webViewLink: null, modifiedTime: null }, + ], + nextPageToken: null, + })), + })); + const io = makeIo(); + + await expect( + runKtxConnection({ command: 'test-all', projectDir }, io.io, { createGdriveClient }), + ).resolves.toBe(0); + + expect(stripAnsi(io.stdout())).toMatch(/docs_drive\s+gdrive\s+✓ ok\s+Docs: 1/); + }); + it('--all: marks failing connections, keeps passing ones, and returns non-zero', async () => { const projectDir = join(tempDir, 'project'); await initKtxProject({ projectDir }); diff --git 
a/packages/cli/test/context/ingest/adapters/gdrive/chunk.test.ts b/packages/cli/test/context/ingest/adapters/gdrive/chunk.test.ts new file mode 100644 index 00000000..d5237e00 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/gdrive/chunk.test.ts @@ -0,0 +1,110 @@ +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { chunkGdriveStagedDir } from '../../../../../src/context/ingest/adapters/gdrive/chunk.js'; + +describe('chunkGdriveStagedDir', () => { + let stagedDir: string; + + beforeEach(async () => { + stagedDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-chunk-')); + }); + + afterEach(async () => { + await rm(stagedDir, { recursive: true, force: true }); + }); + + it('chunks changed documents into work units', async () => { + await writeFile( + join(stagedDir, 'manifest.json'), + JSON.stringify({ + source: 'gdrive', + folderId: 'folder-123', + recursive: false, + fetchedAt: '2026-05-23T00:00:00.000Z', + fileCount: 1, + skipped: [], + warnings: [], + }), + 'utf-8', + ); + await mkdir(join(stagedDir, 'docs', 'ops-handbook-doc-1'), { recursive: true }); + await writeFile( + join(stagedDir, 'docs', 'ops-handbook-doc-1', 'metadata.json'), + JSON.stringify({ + id: 'doc-1', + title: 'Ops Handbook', + path: 'Ops / Ops Handbook', + url: 'https://docs.google.com/document/d/doc-1', + mimeType: 'application/vnd.google-apps.document', + folderId: 'folder-123', + drivePath: ['Ops'], + modifiedTime: '2026-05-23T00:00:00.000Z', + }), + 'utf-8', + ); + await writeFile(join(stagedDir, 'docs', 'ops-handbook-doc-1', 'page.md'), '# Ops Handbook\n', 'utf-8'); + + const result = await chunkGdriveStagedDir(stagedDir, { + added: ['docs/ops-handbook-doc-1/metadata.json', 'docs/ops-handbook-doc-1/page.md'], + modified: [], + deleted: [], + unchanged: ['manifest.json'], + }); + + 
expect(result.workUnits).toHaveLength(1); + expect(result.workUnits[0]).toMatchObject({ + displayLabel: 'Ops / Ops Handbook', + rawFiles: ['docs/ops-handbook-doc-1/metadata.json', 'docs/ops-handbook-doc-1/page.md'], + dependencyPaths: ['manifest.json'], + }); + expect(result.workUnits[0].notes).toContain('Do not create semantic-layer sources from gdrive content in v1.'); + }); + + it('normalizes Windows-style diff paths before matching touched files', async () => { + await writeFile( + join(stagedDir, 'manifest.json'), + JSON.stringify({ + source: 'gdrive', + folderId: 'folder-123', + recursive: false, + fetchedAt: '2026-05-23T00:00:00.000Z', + fileCount: 1, + skipped: [], + warnings: [], + }), + 'utf-8', + ); + await mkdir(join(stagedDir, 'docs', 'ops-handbook-doc-1'), { recursive: true }); + await writeFile( + join(stagedDir, 'docs', 'ops-handbook-doc-1', 'metadata.json'), + JSON.stringify({ + id: 'doc-1', + title: 'Ops Handbook', + path: 'Ops / Ops Handbook', + url: 'https://docs.google.com/document/d/doc-1', + mimeType: 'application/vnd.google-apps.document', + folderId: 'folder-123', + drivePath: ['Ops'], + modifiedTime: '2026-05-23T00:00:00.000Z', + }), + 'utf-8', + ); + await writeFile(join(stagedDir, 'docs', 'ops-handbook-doc-1', 'page.md'), '# Ops Handbook\n', 'utf-8'); + + const result = await chunkGdriveStagedDir(stagedDir, { + added: ['docs\\ops-handbook-doc-1\\metadata.json', 'docs\\ops-handbook-doc-1\\page.md'], + modified: [], + deleted: ['docs\\old-doc\\page.md'], + unchanged: ['manifest.json'], + }); + + expect(result.workUnits).toHaveLength(1); + expect(result.workUnits[0]?.rawFiles).toEqual([ + 'docs/ops-handbook-doc-1/metadata.json', + 'docs/ops-handbook-doc-1/page.md', + ]); + expect(result.eviction).toEqual({ deletedRawPaths: ['docs/old-doc/page.md'] }); + }); +}); diff --git a/packages/cli/test/context/ingest/adapters/gdrive/detect.test.ts b/packages/cli/test/context/ingest/adapters/gdrive/detect.test.ts new file mode 100644 index 
00000000..41dce516 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/gdrive/detect.test.ts @@ -0,0 +1,22 @@ +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { detectGdriveStagedDir } from '../../../../../src/context/ingest/adapters/gdrive/detect.js'; + +describe('detectGdriveStagedDir', () => { + let stagedDir: string; + + beforeEach(async () => { + stagedDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-detect-')); + }); + + afterEach(async () => { + await rm(stagedDir, { recursive: true, force: true }); + }); + + it('detects a manifest-backed gdrive staged dir', async () => { + await writeFile(join(stagedDir, 'manifest.json'), JSON.stringify({ source: 'gdrive' }), 'utf-8'); + await expect(detectGdriveStagedDir(stagedDir)).resolves.toBe(true); + }); +}); diff --git a/packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts b/packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts new file mode 100644 index 00000000..23bc58a0 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/gdrive/fetch.test.ts @@ -0,0 +1,84 @@ +import { mkdtemp, readdir, readFile, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join, relative } from 'node:path'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { fetchGdriveSnapshot } from '../../../../../src/context/ingest/adapters/gdrive/fetch.js'; + +const getDocument = vi.fn(async () => ({ + title: 'Herness and Enterprise Agent Operating Framework for Connected Systems', + body: { content: [] }, +})); +const listFiles = vi.fn(async () => ({ + files: [ + { + id: 'doc-1', + name: 'Herness and Enterprise Agent Operating Framework for Connected Systems', + mimeType: 'application/vnd.google-apps.document', + parents: ['folder-123'], + webViewLink: 'https://docs.google.com/document/d/doc-1', + modifiedTime: 
'2026-05-24T01:53:28.347Z', + }, + ], + nextPageToken: null, +})); + +vi.mock('../../../../../src/context/ingest/adapters/gdrive/gdrive-client.js', () => ({ + createGoogleDocsClients: vi.fn(() => ({ + drive: { listFiles }, + docs: { getDocument }, + })), +})); + +vi.mock('../../../../../src/context/ingest/adapters/gdrive/normalize.js', () => ({ + normalizeGoogleDocToMarkdown: vi.fn(() => 'Durable operating rules.'), +})); + +async function listRelativeFiles(root: string): Promise { + const entries = await readdir(root, { recursive: true, withFileTypes: true }); + return entries + .filter((entry) => entry.isFile()) + .map((entry) => relative(root, join(entry.parentPath, entry.name)).replace(/\\/g, '/')) + .sort(); +} + +describe('fetchGdriveSnapshot', () => { + let stagedDir: string; + + afterEach(async () => { + await rm(stagedDir, { recursive: true, force: true }); + vi.clearAllMocks(); + }); + + it('writes compact staged paths while preserving full metadata title and path', async () => { + stagedDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-fetch-')); + + const manifest = await fetchGdriveSnapshot({ + key: { client_email: 'bot@example.com', private_key: 'secret' }, // pragma: allowlist secret + config: { serviceAccountKey: 'unused', folderId: 'folder-123', recursive: false }, // pragma: allowlist secret + stagedDir, + }); + + expect(manifest.fileCount).toBe(1); + expect(listFiles).toHaveBeenCalledWith({ q: "'folder-123' in parents and trashed = false", pageToken: undefined }); + expect(getDocument).toHaveBeenCalledWith('doc-1'); + + const files = await listRelativeFiles(stagedDir); + expect(files).toEqual([ + 'docs/herness-and-enterprise-a-7913523027/metadata.json', + 'docs/herness-and-enterprise-a-7913523027/page.md', + 'manifest.json', + ]); + + const metadata = JSON.parse( + await readFile(join(stagedDir, 'docs', 'herness-and-enterprise-a-7913523027', 'metadata.json'), 'utf-8'), + ); + expect(metadata).toMatchObject({ + id: 'doc-1', + title: 'Herness and 
Enterprise Agent Operating Framework for Connected Systems', + path: 'Herness and Enterprise Agent Operating Framework for Connected Systems', + }); + await expect( + readFile(join(stagedDir, 'docs', 'herness-and-enterprise-a-7913523027', 'page.md'), 'utf-8'), + ).resolves.toContain('# Herness and Enterprise Agent Operating Framework for Connected Systems'); + }); +}); diff --git a/packages/cli/test/context/ingest/adapters/gdrive/gdrive.adapter.test.ts b/packages/cli/test/context/ingest/adapters/gdrive/gdrive.adapter.test.ts new file mode 100644 index 00000000..62fae739 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/gdrive/gdrive.adapter.test.ts @@ -0,0 +1,60 @@ +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { GdriveSourceAdapter } from '../../../../../src/context/ingest/adapters/gdrive/gdrive.adapter.js'; + +describe('GdriveSourceAdapter', () => { + let stagedDir: string; + + beforeEach(async () => { + stagedDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-adapter-')); + }); + + afterEach(async () => { + await rm(stagedDir, { recursive: true, force: true }); + }); + + it('declares gdrive source behavior', () => { + const adapter = new GdriveSourceAdapter(); + expect(adapter.source).toBe('gdrive'); + expect(adapter.skillNames).toEqual(['gdrive_synthesize']); + expect(adapter.reconcileSkillNames).toEqual([]); + expect(adapter.evidenceIndexing).toBe('documents'); + }); + + it('detects a gdrive staged dir from manifest source', async () => { + const adapter = new GdriveSourceAdapter(); + await writeFile(join(stagedDir, 'manifest.json'), JSON.stringify({ source: 'gdrive' }), 'utf-8'); + await expect(adapter.detect(stagedDir)).resolves.toBe(true); + }); + + it('reports malformed manifests with a gdrive-specific error', async () => { + const adapter = new GdriveSourceAdapter(); + await 
writeFile(join(stagedDir, 'manifest.json'), '{bad json', 'utf-8'); + await expect(adapter.chunk(stagedDir)).rejects.toThrow(/Invalid gdrive manifest/); + }); + + it('describes complete folder scope', async () => { + const adapter = new GdriveSourceAdapter(); + await writeFile( + join(stagedDir, 'manifest.json'), + JSON.stringify({ + source: 'gdrive', + folderId: 'folder-123', + recursive: false, + fetchedAt: '2026-05-23T00:00:00.000Z', + fileCount: 0, + skipped: [], + warnings: [], + }), + 'utf-8', + ); + await mkdir(join(stagedDir, 'docs'), { recursive: true }); + + const scope = await adapter.describeScope?.(stagedDir); + expect(scope?.isPathInScope('manifest.json')).toBe(true); + expect(scope?.isPathInScope('docs/example/page.md')).toBe(true); + expect(scope?.isPathInScope('pages/example/page.md')).toBe(false); + }); +}); diff --git a/packages/cli/test/context/ingest/adapters/gdrive/normalize.test.ts b/packages/cli/test/context/ingest/adapters/gdrive/normalize.test.ts new file mode 100644 index 00000000..455b291b --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/gdrive/normalize.test.ts @@ -0,0 +1,628 @@ +import { describe, expect, it } from 'vitest'; +import { normalizeGoogleDocToMarkdown } from '../../../../../src/context/ingest/adapters/gdrive/normalize.js'; + +describe('normalizeGoogleDocToMarkdown', () => { + it('maps title, subtitle, and heading named styles to markdown headings', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Executive Brief', + body: { + content: [ + { + paragraph: { + paragraphStyle: { namedStyleType: 'TITLE', headingId: 'title-anchor' }, + elements: [{ textRun: { content: 'Executive Brief' } }], + }, + }, + { + paragraph: { + paragraphStyle: { namedStyleType: 'SUBTITLE' }, + elements: [{ textRun: { content: 'Q3 Planning' } }], + }, + }, + { + paragraph: { + paragraphStyle: { namedStyleType: 'HEADING_1' }, + elements: [{ textRun: { content: 'Overview' } }], + }, + }, + { + paragraph: { + paragraphStyle: 
{ namedStyleType: 'HEADING_3' }, + elements: [{ textRun: { content: 'Risks' } }], + }, + }, + { + paragraph: { + elements: [{ textRun: { content: 'Plain paragraph text.' } }], + }, + }, + ], + }, + }); + + expect(markdown).toContain('\n# Executive Brief'); + expect(markdown).toContain('## Q3 Planning'); + expect(markdown).toContain('# Overview'); + expect(markdown).toContain('### Risks'); + expect(markdown).toContain('Plain paragraph text.'); + }); + + it('converts headings, lists, links, and inline formatting', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Ops Handbook', + body: { + content: [ + { + paragraph: { + paragraphStyle: { namedStyleType: 'HEADING_1' }, + elements: [{ textRun: { content: 'Policy' } }], + }, + }, + { + paragraph: { + elements: [ + { textRun: { content: 'Use ' } }, + { textRun: { content: 'documented', textStyle: { bold: true } } }, + { textRun: { content: ' rules.' } }, + ], + }, + }, + { + paragraph: { + bullet: { nestingLevel: 0 }, + elements: [{ textRun: { content: 'First item' } }], + }, + }, + { + paragraph: { + bullet: { nestingLevel: 1 }, + elements: [{ textRun: { content: 'Nested item' } }], + }, + }, + { + paragraph: { + elements: [ + { + textRun: { + content: 'Reference', + textStyle: { link: { url: 'https://example.com/docs)' }, italic: true }, + }, + }, + ], + }, + }, + ], + }, + }); + + expect(markdown).toContain('# Policy'); + expect(markdown).toContain('Use **documented** rules.'); + expect(markdown).toContain('- First item'); + expect(markdown).toContain(' - Nested item'); + expect(markdown).toContain('[*Reference*](https://example.com/docs\\))'); + }); + + it('resolves ordered and unordered lists from document list metadata', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Decision Log', + lists: { + unordered: { + listProperties: { + nestingLevels: [{ glyphType: 'BULLET' }, { glyphType: 'BULLET' }], + }, + }, + ordered: { + listProperties: { + nestingLevels: [{ glyphType: 
'DECIMAL' }, { glyphType: 'UPPER_ALPHA' }], + }, + }, + }, + body: { + content: [ + { + paragraph: { + bullet: { listId: 'unordered', nestingLevel: 0 }, + elements: [{ textRun: { content: 'Top-level bullet' } }], + }, + }, + { + paragraph: { + bullet: { listId: 'ordered', nestingLevel: 0 }, + elements: [{ textRun: { content: 'Top-level ordered item' } }], + }, + }, + { + paragraph: { + bullet: { listId: 'ordered', nestingLevel: 1 }, + elements: [{ textRun: { content: 'Nested ordered item' } }], + }, + }, + ], + }, + }); + + expect(markdown).toContain('- Top-level bullet'); + expect(markdown).toContain('1. Top-level ordered item'); + expect(markdown).toContain(' 1. Nested ordered item'); + }); + + it('falls back to unordered markers when list metadata is missing', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Fallback Lists', + body: { + content: [ + { + paragraph: { + bullet: { listId: 'missing-list', nestingLevel: 0 }, + elements: [{ textRun: { content: 'Still preserved' } }], + }, + }, + ], + }, + }); + + expect(markdown).toContain('- Still preserved'); + }); + + it('preserves table content instead of dropping it silently', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Ownership Matrix', + body: { + content: [ + { + paragraph: { + paragraphStyle: { namedStyleType: 'HEADING_2' }, + elements: [{ textRun: { content: 'Decisions' } }], + }, + }, + { + table: { + tableRows: [ + { + tableCells: [ + { + content: [{ paragraph: { elements: [{ textRun: { content: 'Decision' } }] } }], + }, + { + content: [{ paragraph: { elements: [{ textRun: { content: 'Owner' } }] } }], + }, + ], + }, + { + tableCells: [ + { + content: [{ paragraph: { elements: [{ textRun: { content: 'Escalation path' } }] } }], + }, + { + content: [{ paragraph: { elements: [{ textRun: { content: 'Platform Ops' } }] } }], + }, + ], + }, + ], + }, + }, + ], + }, + }); + + expect(markdown).toContain('## Decisions'); + expect(markdown).toContain('| Decision | Owner 
|'); + expect(markdown).toContain('| --- | --- |'); + expect(markdown).toContain('| Escalation path | Platform Ops |'); + }); + + it('flattens multi-block table cells into stable markdown text', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Checklist', + body: { + content: [ + { + table: { + tableRows: [ + { + tableCells: [ + { + content: [ + { paragraph: { elements: [{ textRun: { content: 'Action items' } }] } }, + { paragraph: { bullet: { nestingLevel: 0 }, elements: [{ textRun: { content: 'Review runbook' } }] } }, + ], + }, + { + content: [ + { + paragraph: { + elements: [ + { + textRun: { + content: 'https://example.com/ops', + textStyle: { link: { url: 'https://example.com/ops' } }, + }, + }, + ], + }, + }, + ], + }, + ], + }, + ], + }, + }, + ], + }, + }); + + expect(markdown).toContain( + '| Action items / - Review runbook | [https://example.com/ops](https://example.com/ops) |', + ); + }); + + it('preserves empty table cells and mixed inline elements', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Runbook Matrix', + body: { + content: [ + { + table: { + tableRows: [ + { + tableCells: [ + { + content: [{ paragraph: { elements: [{ textRun: { content: 'Step' } }] } }], + }, + { + content: [{ paragraph: { elements: [{ textRun: { content: 'Notes' } }] } }], + }, + ], + }, + { + tableCells: [ + { + content: [ + { + paragraph: { + elements: [ + { textRun: { content: 'Deploy', textStyle: { underline: true } } }, + { textRun: { content: ' artifact' } }, + { inlineObjectElement: {} }, + ], + }, + }, + ], + }, + { + content: [{ paragraph: { elements: [] } }], + }, + ], + }, + ], + }, + }, + { + paragraph: { + elements: [{ pageBreak: {} }], + }, + }, + { + paragraph: { + elements: [{ textRun: { content: 'Appendix' } }], + }, + }, + ], + }, + }); + + expect(markdown).toContain('| Step | Notes |'); + expect(markdown).toContain('| Deploy artifact[Embedded object] | |'); + expect(markdown).toContain('---'); + 
expect(markdown).toContain('Appendix'); + }); + + it('emits working heading anchors for legacy and tab-aware internal heading links', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Internal Links', + body: { + content: [ + { + paragraph: { + elements: [ + { + textRun: { + content: 'Jump to Overview', + textStyle: { link: { headingId: 'overview-heading' } }, + }, + }, + ], + }, + }, + { + paragraph: { + bullet: { nestingLevel: 0 }, + elements: [ + { + textRun: { + content: 'Linked list item', + textStyle: { link: { heading: { id: 'overview-heading', tabId: 'tab-1' } } }, + }, + }, + ], + }, + }, + { + paragraph: { + paragraphStyle: { namedStyleType: 'HEADING_2', headingId: 'overview-heading' }, + elements: [{ textRun: { content: 'Overview' } }], + }, + }, + ], + }, + }); + + expect(markdown).toContain('[Jump to Overview](#heading-overview-heading)'); + expect(markdown).toContain('- [Linked list item](#heading-overview-heading)'); + expect(markdown).toContain('\n## Overview'); + }); + + it('preserves bookmark links even when bookmark targets are unresolved', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Bookmarks', + body: { + content: [ + { + paragraph: { + elements: [ + { + textRun: { + content: 'Jump to bookmark', + textStyle: { link: { bookmarkId: 'bookmark-1' } }, + }, + }, + ], + }, + }, + { + table: { + tableRows: [ + { + tableCells: [ + { + content: [ + { + paragraph: { + elements: [ + { + textRun: { + content: 'Bookmark in table', + textStyle: { link: { bookmark: { id: 'bookmark-2', tabId: 'tab-1' } } }, + }, + }, + ], + }, + }, + ], + }, + ], + }, + ], + }, + }, + ], + }, + }); + + expect(markdown).toContain('[Jump to bookmark](#bookmark-bookmark-1)'); + expect(markdown).toContain('| [Bookmark in table](#bookmark-bookmark-2) |'); + }); + + it('falls back to legacy document.body content when tabs are absent', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Legacy Doc', + body: { + content: [ + { 
+ paragraph: { + elements: [{ textRun: { content: 'Legacy body text.' } }], + }, + }, + ], + }, + }); + + expect(markdown).toBe('Legacy body text.'); + }); + + it('normalizes multi-tab documents in display order with tab headings', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Tabbed Doc', + tabs: [ + { + tabProperties: { tabId: 'tab-1', title: 'Overview' }, + documentTab: { + body: { + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Overview text.' } }], + }, + }, + ], + }, + }, + }, + { + tabProperties: { tabId: 'tab-2', title: 'Appendix' }, + documentTab: { + body: { + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Appendix text.' } }], + }, + }, + ], + }, + }, + }, + ], + body: { + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Legacy content should not win.' } }], + }, + }, + ], + }, + }); + + expect(markdown).toBe('# Overview\n\nOverview text.\n\n# Appendix\n\nAppendix text.'); + }); + + it('walks nested child tabs and uses tab-local list metadata', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Nested Tabs', + tabs: [ + { + tabProperties: { tabId: 'parent', title: 'Parent Tab' }, + documentTab: { + body: { + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Parent content.' } }], + }, + }, + ], + }, + }, + childTabs: [ + { + tabProperties: { tabId: 'child', title: 'Child Tab' }, + documentTab: { + lists: { + childList: { + listProperties: { + nestingLevels: [{ glyphType: 'DECIMAL' }], + }, + }, + }, + body: { + content: [ + { + paragraph: { + bullet: { listId: 'childList', nestingLevel: 0 }, + elements: [{ textRun: { content: 'Nested ordered item' } }], + }, + }, + ], + }, + }, + }, + ], + }, + ], + }); + + expect(markdown).toBe('# Parent Tab\n\nParent content.\n\n# Child Tab\n\n1. 
Nested ordered item'); + }); + + it('includes legacy document headers and footers as labeled sections', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Header Footer Doc', + documentStyle: { + defaultHeaderId: 'headerA', + firstPageFooterId: 'footerA', + }, + headers: { + headerA: { + headerId: 'headerA', + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Company Confidential' } }], + }, + }, + ], + }, + }, + footers: { + footerA: { + footerId: 'footerA', + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Page 1' } }], + }, + }, + ], + }, + }, + body: { + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Body content.' } }], + }, + }, + ], + }, + }); + + expect(markdown).toBe( + '## Headers\n\n### Default Header\n\nCompany Confidential\n\nBody content.\n\n## Footers\n\n### First Page Footer\n\nPage 1', + ); + }); + + it('includes tab-specific headers and footers around tab body content with role-aware labels and id fallback', () => { + const markdown = normalizeGoogleDocToMarkdown({ + title: 'Tabbed Header Footer Doc', + tabs: [ + { + tabProperties: { tabId: 'tab-1', title: 'Overview' }, + documentTab: { + documentStyle: { + evenPageHeaderId: 'overviewHeader', + }, + headers: { + overviewHeader: { + headerId: 'overviewHeader', + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Overview Header' } }], + }, + }, + ], + }, + }, + body: { + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Overview body.' 
} }], + }, + }, + ], + }, + footers: { + overviewFooter: { + footerId: 'overviewFooter', + content: [ + { + paragraph: { + elements: [{ textRun: { content: 'Overview Footer' } }], + }, + }, + ], + }, + }, + }, + }, + ], + }); + + expect(markdown).toBe( + '# Overview\n\n## Headers\n\n### Even Page Header\n\nOverview Header\n\nOverview body.\n\n## Footers\n\n### Footer overviewFooter\n\nOverview Footer', + ); + }); +}); diff --git a/packages/cli/test/context/ingest/artifact-gates.test.ts b/packages/cli/test/context/ingest/artifact-gates.test.ts index 473595b2..491a7feb 100644 --- a/packages/cli/test/context/ingest/artifact-gates.test.ts +++ b/packages/cli/test/context/ingest/artifact-gates.test.ts @@ -72,6 +72,19 @@ describe('artifact gates', () => { ).toThrow(/provenance row references raw path outside this snapshot: cards\/missing\.json/); }); + it('accepts equivalent provenance raw paths across platform path separators', () => { + expect(() => + validateProvenanceRawPaths({ + rows: [ + { rawPath: 'docs/herness-a88aa1bf05/page.md' }, + { rawPath: 'docs/herness-a88aa1bf05/metadata.json' }, + ], + currentRawPaths: new Set(['docs\\herness-a88aa1bf05\\page.md', 'docs\\herness-a88aa1bf05\\metadata.json']), + deletedRawPaths: new Set(), + }), + ).not.toThrow(); + }); + it('fails measure-level wiki frontmatter sl_refs that point at missing entities', async () => { const wikiService = wikiServiceWithPages({ 'account-segments': { diff --git a/packages/cli/test/context/ingest/ingest-runtime-assets.test.ts b/packages/cli/test/context/ingest/ingest-runtime-assets.test.ts index 083934c6..345ef064 100644 --- a/packages/cli/test/context/ingest/ingest-runtime-assets.test.ts +++ b/packages/cli/test/context/ingest/ingest-runtime-assets.test.ts @@ -14,6 +14,7 @@ const adapterSkillNames = [ 'metabase_ingest', 'metricflow_ingest', 'notion_synthesize', + 'gdrive_synthesize', 'historic_sql_table_digest', 'historic_sql_patterns', 'ingest_triage', diff --git 
a/packages/cli/test/context/ingest/local-adapters.test.ts b/packages/cli/test/context/ingest/local-adapters.test.ts index a8799cee..0db65153 100644 --- a/packages/cli/test/context/ingest/local-adapters.test.ts +++ b/packages/cli/test/context/ingest/local-adapters.test.ts @@ -73,6 +73,7 @@ describe('local ingest adapters', () => { 'lookml', 'dbt', 'metabase', + 'gdrive', 'looker', 'metricflow', 'notion', diff --git a/packages/cli/test/context/ingest/tools/read-raw-file.tool.test.ts b/packages/cli/test/context/ingest/tools/read-raw-file.tool.test.ts index 041ea188..de10be39 100644 --- a/packages/cli/test/context/ingest/tools/read-raw-file.tool.test.ts +++ b/packages/cli/test/context/ingest/tools/read-raw-file.tool.test.ts @@ -26,6 +26,15 @@ describe('read_raw_file tool', () => { expect(result).toContain('line2'); }); + it('accepts forward-slash allow-list paths on Windows-style path normalization', async () => { + const tool = createReadRawFileTool({ stagedDir, allowedPaths: new Set(['views/a.yml']) }); + const result = await (tool.execute as (...args: unknown[]) => unknown)( + { path: 'views\\a.yml' }, + { toolCallId: 't1', messages: [] }, + ); + expect(result).toContain('line1'); + }); + it('refuses to return oversized files and directs callers to read spans', async () => { await writeFile(join(stagedDir, 'views', 'huge.yml'), `${'x'.repeat(160_000)}\n`, 'utf-8'); const tool = createReadRawFileTool({ stagedDir, allowedPaths: new Set(['views/huge.yml']) }); diff --git a/packages/cli/test/context/ingest/tools/read-raw-span.tool.test.ts b/packages/cli/test/context/ingest/tools/read-raw-span.tool.test.ts index cd4e8f2a..3b71ed0b 100644 --- a/packages/cli/test/context/ingest/tools/read-raw-span.tool.test.ts +++ b/packages/cli/test/context/ingest/tools/read-raw-span.tool.test.ts @@ -24,6 +24,15 @@ describe('read_raw_span tool', () => { expect(result).toBe('line2\nline3\nline4'); }); + it('accepts forward-slash allow-list paths on Windows-style path normalization', async 
() => { + const tool = createReadRawSpanTool({ stagedDir, allowedPaths: new Set(['v/a.yml']) }); + const result = await (tool.execute as (...args: unknown[]) => unknown)( + { path: 'v\\a.yml', startLine: 2, endLine: 3 }, + { toolCallId: 't1', messages: [] }, + ); + expect(result).toBe('line2\nline3'); + }); + it('clamps endLine to the end of the file', async () => { const tool = createReadRawSpanTool({ stagedDir, allowedPaths: new Set(['v/a.yml']) }); const result = await (tool.execute as (...args: unknown[]) => unknown)( diff --git a/packages/cli/test/context/memory/memory-runtime-assets.test.ts b/packages/cli/test/context/memory/memory-runtime-assets.test.ts index d07da604..db8c31c3 100644 --- a/packages/cli/test/context/memory/memory-runtime-assets.test.ts +++ b/packages/cli/test/context/memory/memory-runtime-assets.test.ts @@ -16,6 +16,7 @@ const expectedSkillHeadings: Record = { sl_capture: '# Semantic Layer', }; const expectedAdapterSkillHeadings: Record = { + gdrive_synthesize: '# Google Drive Doc Synthesis', historic_sql_patterns: '# Historic SQL Patterns', historic_sql_table_digest: '# Historic SQL Table Digest', live_database_ingest: '# Live Database Ingest', @@ -25,6 +26,7 @@ const expectedAdapterSkillHeadings: Record = { metricflow_ingest: '# MetricFlow to ktx Semantic Layer', }; const verificationWriterSkills = [ + 'gdrive_synthesize', 'notion_synthesize', 'dbt_ingest', 'lookml_ingest', @@ -141,6 +143,14 @@ describe('memory runtime assets', () => { expect(body).toContain('no_physical_table'); }); + it('ships Google Drive guidance for knowledge-only doc synthesis', async () => { + const body = await readFile(join(skillsDir, 'gdrive_synthesize', 'SKILL.md'), 'utf-8'); + + expect(body).toContain('Google Drive docs are knowledge-only in v1'); + expect(body).toContain('Do not create semantic-layer sources under the `gdrive` connection'); + expect(body).toContain('Source: Google Doc -'); + }); + it('packages LookML connection-mismatch SL gate guidance', 
async () => { const body = await readFile(join(skillsDir, 'lookml_ingest', 'SKILL.md'), 'utf-8'); diff --git a/packages/cli/test/context/project/driver-schemas.test.ts b/packages/cli/test/context/project/driver-schemas.test.ts index c83a27a1..df85a0db 100644 --- a/packages/cli/test/context/project/driver-schemas.test.ts +++ b/packages/cli/test/context/project/driver-schemas.test.ts @@ -91,7 +91,7 @@ describe('connectionConfigSchema - context source drivers with mappings', () => }); }); -describe('connectionConfigSchema - notion / dbt / metricflow', () => { +describe('connectionConfigSchema - notion / gdrive / dbt / metricflow', () => { it('parses a notion connection with selected_roots crawl', () => { const parsed = connectionConfigSchema.parse({ driver: 'notion', @@ -118,6 +118,21 @@ describe('connectionConfigSchema - notion / dbt / metricflow', () => { ).toThrow(); }); + it('parses a gdrive connection', () => { + const parsed = connectionConfigSchema.parse({ + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/google-service-account.json', // pragma: allowlist secret + folder_id: 'folder-123', + recursive: true, + }); + expect(parsed).toMatchObject({ + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/google-service-account.json', // pragma: allowlist secret + folder_id: 'folder-123', + recursive: true, + }); + }); + it('parses a dbt connection from a local source_dir', () => { const parsed = connectionConfigSchema.parse({ driver: 'dbt', diff --git a/packages/cli/test/public-ingest.test.ts b/packages/cli/test/public-ingest.test.ts index 5e7fffdf..bab59b4f 100644 --- a/packages/cli/test/public-ingest.test.ts +++ b/packages/cli/test/public-ingest.test.ts @@ -105,6 +105,7 @@ describe('buildPublicIngestPlan', () => { warehouse: { driver: 'postgres' }, prod_metabase: { driver: 'metabase', api_url: 'https://metabase.example.com' }, docs: { driver: 'notion' }, + docs_drive: { driver: 'gdrive', service_account_key_ref: 'file:/tmp/gdrive-key.json', folder_id: 
'folder-123' }, // pragma: allowlist secret }); expect(buildPublicIngestPlan(project, { projectDir: '/tmp/project', all: true })).toEqual({ @@ -127,6 +128,14 @@ describe('buildPublicIngestPlan', () => { debugCommand: 'ktx ingest docs --debug', steps: ['source-ingest', 'memory-update'], }, + { + connectionId: 'docs_drive', + driver: 'gdrive', + operation: 'source-ingest', + adapter: 'gdrive', + debugCommand: 'ktx ingest docs_drive --debug', + steps: ['source-ingest', 'memory-update'], + }, { connectionId: 'prod_metabase', driver: 'metabase', diff --git a/packages/cli/test/setup-sources.test.ts b/packages/cli/test/setup-sources.test.ts index 0d01c189..18e91bf4 100644 --- a/packages/cli/test/setup-sources.test.ts +++ b/packages/cli/test/setup-sources.test.ts @@ -281,6 +281,35 @@ describe('setup sources step', () => { expect((await readConfig()).connections['notion-main']?.last_successful_cursor).toBeUndefined(); }); + it('writes Google Drive config with a non-recursive default', async () => { + await addPrimarySource(); + const validateGdrive = vi.fn(async () => ({ ok: true as const, detail: 'docs=2' })); + + await expect( + runKtxSetupSourcesStep( + { + projectDir, + inputMode: 'disabled', + source: 'gdrive', + sourceConnectionId: 'gdrive-main', + gdriveServiceAccountKeyRef: 'file:/tmp/gdrive-key.json', // pragma: allowlist secret + gdriveFolderId: 'folder-123', + runInitialSourceIngest: false, + skipSources: false, + }, + makeIo().io, + { validateGdrive }, + ), + ).resolves.toEqual({ status: 'ready', projectDir, connectionIds: ['gdrive-main'] }); + + expect((await readConfig()).connections['gdrive-main']).toMatchObject({ + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/gdrive-key.json', // pragma: allowlist secret + folder_id: 'folder-123', + recursive: false, + }); + }); + it('rejects --source-api-key-ref for Notion and points at --source-auth-token-ref', async () => { await addPrimarySource(); const io = makeIo(); @@ -766,6 +795,7 @@ describe('setup 
sources step', () => { ); const options = vi.mocked(testPrompts.multiselect).mock.calls[0]?.[0].options ?? []; expect(options).toContainEqual({ value: 'notion', label: 'Notion' }); + expect(options).toContainEqual({ value: 'gdrive', label: 'Google Drive' }); }); it('shows already configured context sources in the interactive checklist', async () => { @@ -1432,6 +1462,39 @@ describe('setup sources step', () => { }); }); + it('edits an existing Google Drive source with the current key ref, folder id, and recursion flag', async () => { + await addPrimarySource(); + await addConnection('gdrive-main', { + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/old-key.json', // pragma: allowlist secret + folder_id: 'old-folder', + recursive: false, + }); + const testPrompts = prompts({ + multiselect: [['gdrive']], + select: ['edit:gdrive-main', 'true', 'done'], + text: ['file:/tmp/new-key.json', 'new-folder'], + }); + + await expect( + runKtxSetupSourcesStep( + { projectDir, inputMode: 'auto', runInitialSourceIngest: false, skipSources: false }, + makeIo().io, + { + prompts: testPrompts, + validateGdrive: vi.fn(async () => ({ ok: true as const, detail: 'docs=3' })), + }, + ), + ).resolves.toEqual({ status: 'ready', projectDir, connectionIds: ['gdrive-main'] }); + + expect((await readConfig()).connections['gdrive-main']).toMatchObject({ + driver: 'gdrive', + service_account_key_ref: 'file:/tmp/new-key.json', // pragma: allowlist secret + folder_id: 'new-folder', + recursive: true, + }); + }); + it('edits an existing Metabase source with the current URL and credential as defaults', async () => { await addPrimarySource(); await addConnection('metabase-main', { diff --git a/packages/cli/test/setup.test.ts b/packages/cli/test/setup.test.ts index ee158248..f8b20bc9 100644 --- a/packages/cli/test/setup.test.ts +++ b/packages/cli/test/setup.test.ts @@ -258,9 +258,10 @@ describe('setup status', () => { ' database_connection_ids: []', 'connections:', ' docs:', - ' driver: 
notion', - ' auth_token_ref: env:NOTION_TOKEN', - ' crawl_mode: all_accessible', + ' driver: gdrive', + ' service_account_key_ref: file:/tmp/gdrive-key.json', + ' folder_id: folder-123', + ' recursive: false', ' warehouse:', ' driver: postgres', ' url: env:DATABASE_URL', @@ -271,7 +272,7 @@ describe('setup status', () => { await writeKtxSetupState(tempDir, { completed_steps: ['project', 'sources'] }); await expect(readKtxSetupStatus(tempDir)).resolves.toMatchObject({ - sources: [{ connectionId: 'docs', type: 'notion', ready: true }], + sources: [{ connectionId: 'docs', type: 'gdrive', ready: true }], }); }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6c1eae07..2eea9996 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -179,6 +179,9 @@ importers: fflate: specifier: ^0.8.3 version: 0.8.3 + google-auth-library: + specifier: 10.6.2 + version: 10.6.2 handlebars: specifier: ^4.7.9 version: 4.7.9