Add gdrive context source adapter (#209)

* Add gdrive context source adapter

* feat(gdrive): normalize internal doc links, tabs, and header/footer structure

* fix(gdrive): reject generic source credential flags

* test(gdrive): include local adapter in expected list

* fix(gdrive): remove dead exports and silence false positive secret checks

* fix(setup): restore notion source auth flow
This commit is contained in:
ARYAN 2026-06-27 14:41:32 -07:00 committed by GitHub
parent 967a413a06
commit 5645dc4d28
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
39 changed files with 2546 additions and 74 deletions

View file

@ -57,7 +57,8 @@ function sourceType(value: string): KtxSetupSourceType {
value === 'metabase' ||
value === 'looker' ||
value === 'lookml' ||
value === 'notion'
value === 'notion' ||
value === 'gdrive'
) {
return value;
}
@ -132,6 +133,9 @@ function shouldShowSetupEntryMenu(
metabaseDatabaseId?: number;
notionCrawlMode?: string;
notionRootPageId?: string[];
gdriveServiceAccountKeyRef?: string;
gdriveFolderId?: string;
gdriveRecursive?: boolean;
skipSources?: boolean;
},
command: Command,
@ -197,6 +201,9 @@ function shouldShowSetupEntryMenu(
'sourceTarget',
'metabaseDatabaseId',
'notionCrawlMode',
'gdriveServiceAccountKeyRef',
'gdriveFolderId',
'gdriveRecursive',
'skipSources',
].some((optionName) => optionWasSpecified(command, optionName));
}
@ -337,6 +344,12 @@ export function registerSetupCommands(program: Command, context: KtxCliCommandCo
.default([] as string[])
.hideHelp(),
)
.addOption(
new Option('--gdrive-service-account-key-ref <ref>', 'file: reference to a Google service account JSON key')
.hideHelp(),
)
.addOption(new Option('--gdrive-folder-id <id>', 'Google Drive folder id to ingest').hideHelp())
.addOption(new Option('--gdrive-recursive', 'Recursively traverse Google Drive subfolders').hideHelp().default(false))
.addOption(new Option('--skip-sources', 'Mark optional source setup complete with no sources').hideHelp().default(false))
.showHelpAfterError();
@ -486,6 +499,11 @@ export function registerSetupCommands(program: Command, context: KtxCliCommandCo
...(options.metabaseDatabaseId !== undefined ? { metabaseDatabaseId: options.metabaseDatabaseId } : {}),
...(options.notionCrawlMode ? { notionCrawlMode: options.notionCrawlMode } : {}),
...(options.notionRootPageId.length > 0 ? { notionRootPageIds: options.notionRootPageId } : {}),
...(options.gdriveServiceAccountKeyRef
? { gdriveServiceAccountKeyRef: options.gdriveServiceAccountKeyRef }
: {}),
...(options.gdriveFolderId ? { gdriveFolderId: options.gdriveFolderId } : {}),
...(options.gdriveRecursive ? { gdriveRecursive: true } : {}),
runInitialSourceIngest: false,
skipSources: options.skipSources === true,
showEntryMenu: shouldShowSetupEntryMenu(options, command),

View file

@ -3,8 +3,11 @@ import { DefaultLookerConnectionClientFactory } from './context/ingest/adapters/
import type { LookerClient } from './context/ingest/adapters/looker/client.js';
import type { MetabaseRuntimeClient } from './context/ingest/adapters/metabase/client-port.js';
import { type NotionBotInfo, NotionClient } from './context/ingest/adapters/notion/notion-client.js';
import { parseGdriveConnectionConfig, resolveGdriveServiceAccountKey } from './context/connections/gdrive-config.js';
import { createLocalLookerCredentialResolver } from './context/ingest/adapters/looker/local-looker.adapter.js';
import { metabaseRuntimeConfigFromLocalConnection } from './context/ingest/adapters/metabase/local-metabase.adapter.js';
import { createGoogleDocsClients } from './context/ingest/adapters/gdrive/gdrive-client.js';
import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js';
import { testRepoConnection } from './context/ingest/repo-fetch.js';
import { federatedConnectionListing } from './context/connections/federation.js';
import { getDriverRegistration } from './context/connections/drivers.js';
@ -31,6 +34,10 @@ export type KtxConnectionArgs =
type MetabaseTestPort = Pick<MetabaseRuntimeClient, 'testConnection' | 'getDatabases' | 'cleanup'>;
type LookerTestPort = Pick<LookerClient, 'testConnection'>;
type NotionTestPort = Pick<NotionClient, 'retrieveBotUser'>;
/**
 * Narrow view of the Drive client used by the gdrive connection test: only the
 * `listFiles` method of the `drive` client returned by `createGoogleDocsClients`.
 */
type GdriveTestPort = Pick<
ReturnType<typeof createGoogleDocsClients>['drive'],
'listFiles'
>;
type TestRepoConnection = typeof testRepoConnection;
export interface KtxConnectionDeps {
@ -38,6 +45,7 @@ export interface KtxConnectionDeps {
createMetabaseClient?: (project: KtxLocalProject, connectionId: string) => Promise<MetabaseTestPort>;
createLookerClient?: (project: KtxLocalProject, connectionId: string) => Promise<LookerTestPort>;
createNotionClient?: (project: KtxLocalProject, connectionId: string) => Promise<NotionTestPort>;
createGdriveClient?: (project: KtxLocalProject, connectionId: string) => Promise<GdriveTestPort>;
testRepoConnection?: TestRepoConnection;
}
@ -52,6 +60,7 @@ const SUPPORTED_TEST_DRIVERS = [
'metabase',
'looker',
'notion',
'gdrive',
'dbt',
'metricflow',
'lookml',
@ -183,6 +192,39 @@ async function testNotionConnection(
return { bot: describeNotionBot(bot) };
}
/**
 * Builds the Drive client used when no `createGdriveClient` dependency is injected.
 *
 * Looks the connection up in the project config, validates it as a gdrive
 * connection, reads the service account key it references, validates the parsed
 * JSON against the key schema, and returns the `drive` client.
 *
 * @param project - Local project whose `config.connections` is consulted.
 * @param connectionId - Key of the connection inside `config.connections`.
 * @returns The Drive client exposing `listFiles`.
 * @throws Error when the connection id is not configured.
 */
async function createDefaultGdriveClient(
  project: KtxLocalProject,
  connectionId: string,
): Promise<GdriveTestPort> {
  const rawConnection = project.config.connections[connectionId];
  if (!rawConnection) {
    throw new Error(`Connection "${connectionId}" is not configured in ktx.yaml`);
  }

  const { service_account_key_ref: keyRef } = parseGdriveConnectionConfig(rawConnection);
  const serializedKey = await resolveGdriveServiceAccountKey(keyRef);
  const serviceAccountKey = gdriveServiceAccountKeySchema.parse(JSON.parse(serializedKey));
  const { drive } = createGoogleDocsClients(serviceAccountKey);
  return drive;
}
/**
 * Checks a gdrive connection by listing the configured folder once and counting
 * the Google Docs among the returned files.
 *
 * The config is validated before the client is created, so a malformed
 * connection fails without building a client. Only the single `listFiles`
 * response is inspected; no further pages are requested.
 *
 * @param project - Local project whose `config.connections` is consulted.
 * @param connectionId - Key of the connection inside `config.connections`.
 * @param createClient - Factory producing the Drive client to query.
 * @returns The number of returned files whose MIME type is the Google Docs type.
 * @throws Error when the connection id is not configured.
 */
async function testGdriveConnection(
  project: KtxLocalProject,
  connectionId: string,
  createClient: (project: KtxLocalProject, connectionId: string) => Promise<GdriveTestPort>,
): Promise<{ docs: number }> {
  const rawConnection = project.config.connections[connectionId];
  if (!rawConnection) {
    throw new Error(`Connection "${connectionId}" is not configured in ktx.yaml`);
  }

  const { folder_id: folderId } = parseGdriveConnectionConfig(rawConnection);
  const client = await createClient(project, connectionId);
  const listing = await client.listFiles({
    q: `'${folderId}' in parents and trashed = false`,
  });

  let docs = 0;
  for (const file of listing.files) {
    if (file.mimeType === GDRIVE_DOC_MIME_TYPE) {
      docs += 1;
    }
  }
  return { docs };
}
interface GitConnectionFields {
repoUrl: string;
authToken: string | null;
@ -271,6 +313,15 @@ async function testConnectionByDriver(
return { driver, detailKey: 'Bot', detailValue: result.bot };
}
if (driver === 'gdrive') {
const result = await testGdriveConnection(
project,
connectionId,
deps.createGdriveClient ?? createDefaultGdriveClient,
);
return { driver, detailKey: 'Docs', detailValue: String(result.docs) };
}
if (driver === 'dbt' || driver === 'metricflow' || driver === 'lookml') {
const result = await testGitRepoConnection(
project,

View file

@ -0,0 +1,71 @@
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
gdriveConnectionToPullConfig,
parseGdriveConnectionConfig,
resolveGdriveServiceAccountKey,
} from './gdrive-config.js';
describe('standalone gdrive connection config', () => {
  // Serialized key payload shared by the file-backed tests.
  const KEY_JSON = '{"client_email":"bot@example.com","private_key":"line-1"}'; // pragma: allowlist secret

  let tempDir: string;

  // Writes the key payload (with a trailing newline) into the per-test temp dir.
  async function writeKeyFile(): Promise<string> {
    const keyPath = join(tempDir, 'google-key.json');
    await writeFile(keyPath, `${KEY_JSON}\n`, 'utf-8');
    return keyPath;
  }

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'ktx-gdrive-config-'));
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('parses config with safe defaults', () => {
    const input = {
      driver: 'gdrive',
      service_account_key_ref: 'file:/tmp/google-key.json', // pragma: allowlist secret
      folder_id: 'folder-123',
    };

    // `recursive` is absent from the input and must default to false.
    expect(parseGdriveConnectionConfig(input)).toEqual({
      driver: 'gdrive',
      service_account_key_ref: 'file:/tmp/google-key.json', // pragma: allowlist secret
      folder_id: 'folder-123',
      recursive: false,
    });
  });

  it('requires file-based service account keys', () => {
    const parseWithEnvRef = (): unknown =>
      parseGdriveConnectionConfig({
        driver: 'gdrive',
        service_account_key_ref: 'env:GOOGLE_KEY', // pragma: allowlist secret
        folder_id: 'folder-123',
      });

    expect(parseWithEnvRef).toThrow('gdrive service_account_key_ref must use file:/path/to/key.json');
  });

  it('resolves service account key files', async () => {
    const keyPath = await writeKeyFile();

    const resolved = resolveGdriveServiceAccountKey(`file:${keyPath}`);

    await expect(resolved).resolves.toContain('"client_email":"bot@example.com"');
  });

  it('converts config into adapter pull config', async () => {
    const keyPath = await writeKeyFile();
    const connection = parseGdriveConnectionConfig({
      driver: 'gdrive',
      service_account_key_ref: `file:${keyPath}`, // pragma: allowlist secret
      folder_id: 'folder-123',
      recursive: true,
    });

    const pullConfig = await gdriveConnectionToPullConfig(connection);

    // The key text is expected without the trailing newline written to disk.
    expect(pullConfig).toEqual({
      serviceAccountKey: KEY_JSON,
      folderId: 'folder-123',
      recursive: true,
    });
  });
});

View file

@ -0,0 +1,87 @@
import { readFile } from 'node:fs/promises';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
import type { KtxProjectConnectionConfig } from '../project/config.js';
import type { GdrivePullConfig } from '../ingest/adapters/gdrive/types.js';
import { gdrivePullConfigSchema } from '../ingest/adapters/gdrive/types.js';
/** The `driver: 'gdrive'` member of the project connection config union, as declared in project config. */
type RawKtxGdriveConnectionConfig = Extract<KtxProjectConnectionConfig, { driver: 'gdrive' }>;
/**
 * Validated gdrive connection config: the raw shape with `service_account_key_ref`,
 * `folder_id`, and `recursive` replaced by required, concretely typed fields.
 */
export type KtxGdriveConnectionConfig = Omit<
RawKtxGdriveConnectionConfig,
'service_account_key_ref' | 'folder_id' | 'recursive'
> & {
driver: 'gdrive';
service_account_key_ref: string;
folder_id: string;
recursive: boolean;
};
/** Options for resolving a service account key reference. */
interface ResolveKeyOptions {
/** Optional replacement for the file reader; when omitted the key is read with `readFile(path, 'utf-8')`. */
readTextFile?: (path: string) => Promise<string>;
}
/**
 * Type guard for plain object-like values.
 *
 * @param value - Any value.
 * @returns `true` when `value` has `typeof` "object" and is neither `null` nor an array.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
/**
 * Expands a leading `~` or `~/` to the current user's home directory.
 *
 * @param path - Path that may start with `~` or `~/`.
 * @returns The resolved absolute path for home-relative input; any other path unchanged.
 */
function expandHome(path: string): string {
  const isHomeRelative = path === '~' || path.startsWith('~/');
  if (!isHomeRelative) {
    return path;
  }
  // Dropping two characters removes "~/"; for a bare "~" this leaves an empty remainder.
  const remainder = path.slice(2);
  return resolve(homedir(), remainder);
}
/**
 * Validates an untyped connection entry as a gdrive connection.
 *
 * `service_account_key_ref` and `folder_id` must be non-blank strings and are
 * returned trimmed; the key reference must start with `file:`. `recursive` is
 * `true` only when the input value is exactly `true`.
 *
 * @param raw - Untyped connection config.
 * @returns The validated config containing `driver`, `service_account_key_ref`,
 *   `folder_id`, and `recursive`.
 * @throws Error when `raw` is not an object, the driver is not `gdrive`, or a
 *   required field is missing or malformed.
 */
export function parseGdriveConnectionConfig(raw: unknown): KtxGdriveConnectionConfig {
  if (!isRecord(raw)) {
    throw new Error('gdrive connection config must be an object');
  }
  if (raw.driver !== 'gdrive') {
    throw new Error('gdrive connection config requires driver: gdrive');
  }

  // Returns the trimmed string, or null for non-strings and blank strings.
  const trimmedOrNull = (candidate: unknown): string | null => {
    if (typeof candidate !== 'string') {
      return null;
    }
    const trimmed = candidate.trim();
    return trimmed.length > 0 ? trimmed : null;
  };

  const serviceAccountKeyRef = trimmedOrNull(raw.service_account_key_ref); // pragma: allowlist secret
  if (serviceAccountKeyRef === null) {
    throw new Error('gdrive connection config requires service_account_key_ref');
  }
  if (!serviceAccountKeyRef.startsWith('file:')) {
    throw new Error('gdrive service_account_key_ref must use file:/path/to/key.json');
  }

  const folder = trimmedOrNull(raw.folder_id);
  if (folder === null) {
    throw new Error('gdrive connection config requires folder_id');
  }

  return {
    driver: 'gdrive',
    service_account_key_ref: serviceAccountKeyRef,
    folder_id: folder,
    recursive: raw.recursive === true,
  };
}
/**
 * Reads the service account key text that a `file:` reference points to.
 *
 * The path after `file:` is home-expanded, read as UTF-8 (or through
 * `options.readTextFile` when provided), and trimmed.
 *
 * @param serviceAccountKeyRef - Reference of the form `file:/path/to/key.json`.
 * @param options - Optional file-reader override.
 * @returns The trimmed file contents.
 * @throws Error when the reference does not start with `file:` or the file is
 *   empty after trimming.
 * @internal
 */
export async function resolveGdriveServiceAccountKey(
  serviceAccountKeyRef: string,
  options: ResolveKeyOptions = {},
): Promise<string> {
  const scheme = 'file:';
  if (!serviceAccountKeyRef.startsWith(scheme)) {
    throw new Error('gdrive service_account_key_ref must use file:/path/to/key.json');
  }

  const path = expandHome(serviceAccountKeyRef.slice(scheme.length));
  const defaultReader = (filePath: string): Promise<string> => readFile(filePath, 'utf-8');
  const read = options.readTextFile ?? defaultReader;

  const contents = await read(path);
  const trimmed = contents.trim();
  if (trimmed) {
    return trimmed;
  }
  throw new Error(`gdrive service account key file is empty: ${path}`);
}
/**
 * Converts a validated gdrive connection into the adapter's pull config.
 *
 * The referenced key file is read first; the key text, folder id, and recursive
 * flag are then validated through `gdrivePullConfigSchema`.
 *
 * @param config - Validated gdrive connection config.
 * @param options - Optional file-reader override forwarded to key resolution.
 * @returns The schema-validated pull config.
 */
export async function gdriveConnectionToPullConfig(
  config: KtxGdriveConnectionConfig,
  options: ResolveKeyOptions = {},
): Promise<GdrivePullConfig> {
  const { service_account_key_ref: keyRef, folder_id: folderId, recursive } = config;
  const serviceAccountKey = await resolveGdriveServiceAccountKey(keyRef, options);
  return gdrivePullConfigSchema.parse({ serviceAccountKey, folderId, recursive });
}

View file

@ -0,0 +1,85 @@
import { createHash } from 'node:crypto';
import { readdir, readFile } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
import { gdriveManifestSchema, gdriveMetadataSchema } from './types.js';
/** Guidance text attached as `notes` to every work unit produced by the gdrive chunker. */
const GDRIVE_RECONCILE_GUIDANCE =
'Synthesize durable wiki knowledge from this Google Doc. Preserve product definitions, process documentation, and operating rules as wiki pages. Do not create semantic-layer sources from gdrive content in v1.';
/**
 * Converts every backslash in a path to a forward slash.
 *
 * @param path - Path that may use backslash separators.
 * @returns The path with `/` in place of each `\`.
 */
function normalizeRawPath(path: string): string {
  return path.split('\\').join('/');
}
/**
 * Lists every regular file under `root`, recursively.
 *
 * @param root - Directory to scan.
 * @returns Sorted paths relative to `root`, with forward-slash separators.
 */
async function walk(root: string): Promise<string[]> {
  const dirents = await readdir(root, { withFileTypes: true, recursive: true });
  const relativePaths: string[] = [];
  for (const dirent of dirents) {
    if (!dirent.isFile()) {
      continue;
    }
    const absolute = join(dirent.parentPath, dirent.name);
    relativePaths.push(normalizeRawPath(relative(root, absolute)));
  }
  relativePaths.sort();
  return relativePaths;
}
/**
 * Derives a work-unit key from a staged page path.
 *
 * Strips a leading `docs/` and a trailing `/page.md`, collapses each run of
 * non-alphanumeric characters to a single `-`, trims dashes from both ends,
 * and prefixes the result with `gdrive-`.
 *
 * @param path - Staged path of a `page.md` file.
 * @returns The `gdrive-`-prefixed key.
 */
function safeUnitKey(path: string): string {
  const stem = path.replace(/^docs\//, '').replace(/\/page\.md$/, '');
  const dashed = stem.replace(/[^a-zA-Z0-9]+/g, '-');
  const slug = dashed.replace(/^-+|-+$/g, '');
  return `gdrive-${slug}`;
}
/**
 * Loads `manifest.json` from the staged directory and validates it against
 * `gdriveManifestSchema`.
 *
 * @param stagedDir - Staged snapshot directory.
 * @returns The validated manifest.
 * @throws Error prefixed with "Invalid gdrive manifest:" when reading, JSON
 *   parsing, or schema validation fails.
 */
async function readManifest(stagedDir: string) {
  const manifestPath = join(stagedDir, 'manifest.json');
  try {
    const serialized = await readFile(manifestPath, 'utf-8');
    const decoded: unknown = JSON.parse(serialized);
    return gdriveManifestSchema.parse(decoded);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid gdrive manifest: ${reason}`);
  }
}
/**
 * Splits a staged gdrive snapshot into one work unit per `page.md` file.
 *
 * Each unit's primary files are the page and its sibling `metadata.json` (when
 * staged). With a `diffSet`, only pages whose primary files were added or
 * modified produce a unit, and the unit's `rawFiles` are limited to those
 * touched files. `manifest.json` is listed as a dependency, and every other
 * staged file is listed in `peerFileIndex`.
 *
 * @param stagedDir - Staged snapshot directory.
 * @param diffSet - Optional change set restricting which pages are chunked.
 * @returns Work units, an eviction list when the diff has deletions, reconcile
 *   notes, and a context report carrying the manifest warnings.
 */
export async function chunkGdriveStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const files = await walk(stagedDir);
  const manifest = await readManifest(stagedDir);
  const stagedFiles = new Set(files);

  // null means "no diff supplied": every page is chunked.
  const touched = diffSet
    ? new Set([...diffSet.added, ...diffSet.modified].map((changed) => normalizeRawPath(changed)))
    : null;

  const pageSuffix = /\/page\.md$/;
  const workUnits: WorkUnit[] = [];

  for (const pagePath of files) {
    if (!pagePath.endsWith('/page.md')) {
      continue;
    }

    const metadataPath = pagePath.replace(pageSuffix, '/metadata.json');
    const primary = [metadataPath, pagePath].filter((candidate) => stagedFiles.has(candidate));
    const relevant = touched ? primary.filter((candidate) => touched.has(candidate)) : primary;
    if (touched && relevant.length === 0) {
      continue;
    }

    // Metadata is read even when only page.md changed; it supplies the display label.
    const metadataText = await readFile(join(stagedDir, metadataPath), 'utf-8');
    const metadata = gdriveMetadataSchema.parse(JSON.parse(metadataText));

    const rawFiles = [...relevant].sort();
    const dependencyPaths = ['manifest.json'].filter((dependency) => !rawFiles.includes(dependency));
    const excluded = new Set([...rawFiles, ...dependencyPaths]);
    const peerFileIndex = files.filter((candidate) => !excluded.has(candidate)).sort();

    workUnits.push({
      unitKey: safeUnitKey(pagePath),
      displayLabel: metadata.path,
      rawFiles,
      dependencyPaths,
      peerFileIndex,
      notes: GDRIVE_RECONCILE_GUIDANCE,
    });
  }

  const hasDeletions = Boolean(diffSet && diffSet.deleted.length > 0);
  const eviction =
    diffSet && hasDeletions
      ? { deletedRawPaths: diffSet.deleted.map((removed) => normalizeRawPath(removed)).sort() }
      : undefined;

  return {
    workUnits,
    eviction,
    reconcileNotes: ['Google Drive docs are knowledge-only in v1; keep output in wiki pages unless later follow-up work expands scope.'],
    contextReport: { capped: false, warnings: manifest.warnings },
  };
}
/**
 * Describes the scope of a staged gdrive snapshot.
 *
 * The fingerprint is the SHA-256 hex digest of the JSON-serialized
 * `{ folderId, recursive }` pair taken from the manifest. A path is in scope
 * when it is `manifest.json` or starts with `docs/`.
 *
 * @param stagedDir - Staged snapshot directory.
 * @returns The scope fingerprint and path predicate.
 */
export async function describeGdriveScope(stagedDir: string): Promise<ScopeDescriptor> {
  const { folderId, recursive } = await readManifest(stagedDir);
  const scopeKey = JSON.stringify({ folderId, recursive });

  const hasher = createHash('sha256');
  hasher.update(scopeKey);

  return {
    fingerprint: hasher.digest('hex'),
    isPathInScope: (rawPath) => {
      if (rawPath === 'manifest.json') {
        return true;
      }
      return rawPath.startsWith('docs/');
    },
  };
}

View file

@ -0,0 +1,20 @@
import { readFile, readdir } from 'node:fs/promises';
import { join } from 'node:path';
/**
 * Decides whether a staged directory holds a gdrive snapshot.
 *
 * Detection order:
 * 1. A readable `manifest.json` whose `source` is `'gdrive'` is a match.
 * 2. A readable `manifest.json` whose `source` is any other non-blank string is
 *    NOT a match: the directory has been claimed by a different source, so the
 *    structural check below must not override that explicit declaration.
 * 3. Otherwise (no manifest, unreadable/invalid JSON, or no usable `source`),
 *    fall back to structure: any file named `page.md` anywhere under the
 *    directory counts as a match.
 *
 * @param stagedDir - Staged snapshot directory.
 * @returns `true` when the directory is recognised as a gdrive snapshot; `false`
 *   otherwise, including when the directory cannot be read.
 */
export async function detectGdriveStagedDir(stagedDir: string): Promise<boolean> {
  try {
    const manifest: unknown = JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8'));
    if (typeof manifest === 'object' && manifest !== null && !Array.isArray(manifest)) {
      const source = (manifest as { source?: unknown }).source;
      if (source === 'gdrive') {
        return true;
      }
      // An explicit, different source wins over the loose page.md heuristic.
      if (typeof source === 'string' && source.trim().length > 0) {
        return false;
      }
    }
  } catch {
    // Missing or malformed manifest: fall through to structural detection.
  }

  try {
    const entries = await readdir(stagedDir, { withFileTypes: true, recursive: true });
    return entries.some((entry) => entry.isFile() && entry.name === 'page.md');
  } catch {
    return false;
  }
}

View file

@ -0,0 +1,109 @@
import { createHash } from 'node:crypto';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import { createGoogleDocsClients } from './gdrive-client.js';
import { normalizeGoogleDocToMarkdown } from './normalize.js';
import type { GdriveFileRecord, GdriveManifest, GdrivePullConfig } from './types.js';
import { GDRIVE_DOC_MIME_TYPE, GDRIVE_SOURCE_KEY } from './types.js';
/**
 * Writes a value as 2-space-indented JSON followed by a newline, creating the
 * parent directory first.
 *
 * @param path - Destination file path.
 * @param value - Value to serialize with `JSON.stringify`.
 */
async function writeJson(path: string, value: unknown): Promise<void> {
  const parentDir = dirname(path);
  await mkdir(parentDir, { recursive: true });

  const serialized = JSON.stringify(value, null, 2);
  await writeFile(path, `${serialized}\n`, 'utf-8');
}
/**
 * Writes text to a file, creating the parent directory first and appending a
 * newline when the text does not already end with one.
 *
 * @param path - Destination file path.
 * @param value - Text to write.
 */
async function writeText(path: string, value: string): Promise<void> {
  const parentDir = dirname(path);
  await mkdir(parentDir, { recursive: true });

  const contents = value.endsWith('\n') ? value : `${value}\n`;
  await writeFile(path, contents, 'utf-8');
}
/**
 * Turns arbitrary text into a lowercase ASCII slug.
 *
 * The text is NFKD-normalized, non-ASCII characters are dropped, each run of
 * non-alphanumeric characters becomes one `-`, and dashes are trimmed from both
 * ends.
 *
 * @param value - Text to slugify.
 * @returns The slug, or `'untitled'` when nothing remains.
 */
function slugifySegment(value: string): string {
  const asciiOnly = value.normalize('NFKD').replace(/[^\x00-\x7F]/g, '');
  const dashed = asciiOnly.replace(/[^a-zA-Z0-9]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  const slug = trimmed.toLowerCase();
  if (slug) {
    return slug;
  }
  return 'untitled';
}
/**
 * Slugifies text and caps the slug's length.
 *
 * @param value - Text to slugify.
 * @param maxLength - Maximum slug length; defaults to 24.
 * @returns The slug unchanged when it fits; otherwise its first `maxLength`
 *   characters with trailing dashes removed, or `'untitled'` if that leaves
 *   nothing.
 */
function compactSegment(value: string, maxLength = 24): string {
  const slug = slugifySegment(value);
  if (slug.length <= maxLength) {
    return slug;
  }
  // Truncation can end on a dash; strip it so the segment stays tidy.
  const truncated = slug.slice(0, maxLength).replace(/-+$/g, '');
  return truncated || 'untitled';
}
/**
 * Returns a prefix of the SHA-1 hex digest of a string.
 *
 * @param value - Text to hash.
 * @param length - Number of hex characters to keep; defaults to 10.
 * @returns The first `length` characters of the hex digest.
 */
function shortHash(value: string, length = 10): string {
  const hasher = createHash('sha1');
  hasher.update(value);
  const hexDigest = hasher.digest('hex');
  return hexDigest.slice(0, length);
}
/**
 * Builds the staged directory name for a document: the compacted title slug and
 * a short hash of the file id, joined by `-`.
 *
 * @param title - Document title.
 * @param fileId - Drive file id, hashed into the suffix.
 * @returns The directory name.
 */
function gdriveDocDirName(title: string, fileId: string): string {
  const titleSlug = compactSegment(title);
  const idSuffix = shortHash(fileId);
  return [titleSlug, idSuffix].join('-');
}
/**
 * Lists the Google Docs directly inside a Drive folder, following result pages,
 * and optionally descends into subfolders.
 *
 * Files that are neither folders nor Google Docs are ignored. Subfolders are
 * only entered when `recursive` is true; their names are appended to the drive
 * path carried by each nested record.
 *
 * @param drive - Drive client exposing `listFiles`.
 * @param folderId - Id of the folder to list.
 * @param recursive - Whether to descend into subfolders.
 * @param parents - Folder names leading to `folderId`, relative to the root folder.
 * @returns One record per Google Doc with its file, drive path, and containing folder id.
 */
async function listFolderFiles(
  drive: ReturnType<typeof createGoogleDocsClients>['drive'],
  folderId: string,
  recursive: boolean,
  parents: string[] = [],
): Promise<Array<{ file: GdriveFileRecord; drivePath: string[]; folderId: string }>> {
  const query = `'${folderId}' in parents and trashed = false`;
  const collected: Array<{ file: GdriveFileRecord; drivePath: string[]; folderId: string }> = [];

  let nextToken: string | undefined;
  for (;;) {
    const page = await drive.listFiles({ q: query, pageToken: nextToken });

    for (const file of page.files) {
      if (file.mimeType === 'application/vnd.google-apps.folder') {
        if (recursive) {
          const nested = await listFolderFiles(drive, file.id, true, [...parents, file.name]);
          collected.push(...nested);
        }
      } else if (file.mimeType === GDRIVE_DOC_MIME_TYPE) {
        collected.push({ file, drivePath: parents, folderId });
      }
    }

    nextToken = page.nextPageToken ?? undefined;
    if (!nextToken) {
      break;
    }
  }

  return collected;
}
/**
 * Fetches every Google Doc in the configured folder and stages it on disk.
 *
 * For each document a directory under `docs/` is created (compacted drive-path
 * segments, then a title slug plus id hash) containing `metadata.json` and
 * `page.md`. After all documents are written, `manifest.json` is written at the
 * root of the staged directory.
 *
 * @param params.key - Service account key, validated by the client factory.
 * @param params.config - Pull config providing `folderId` and `recursive`.
 * @param params.stagedDir - Directory to write the snapshot into; created if missing.
 * @returns The manifest that was written.
 */
export async function fetchGdriveSnapshot(params: {
  key: unknown;
  config: GdrivePullConfig;
  stagedDir: string;
}): Promise<GdriveManifest> {
  const { key, config, stagedDir } = params;
  await mkdir(stagedDir, { recursive: true });

  const clients = createGoogleDocsClients(key);
  const entries = await listFolderFiles(clients.drive, config.folderId, config.recursive);

  for (const entry of entries) {
    const { file, drivePath, folderId } = entry;
    const document = await clients.docs.getDocument(file.id);

    // Prefer the document's own title; fall back to the Drive file name.
    const title = (document.title?.trim() || file.name).trim();
    const pathSegments = drivePath.map((segment) => compactSegment(segment));
    const relDir = join('docs', ...pathSegments, gdriveDocDirName(title, file.id));

    const markdownBody = normalizeGoogleDocToMarkdown(document);
    const pageParts = [`# ${title}`, markdownBody].filter(Boolean);
    const pageMarkdown = pageParts.join('\n\n');

    const metadata = {
      id: file.id,
      title,
      path: [...drivePath, title].join(' / ') || title,
      url: file.webViewLink,
      mimeType: file.mimeType,
      folderId,
      drivePath,
      modifiedTime: file.modifiedTime,
    };
    await writeJson(join(stagedDir, relDir, 'metadata.json'), metadata);
    await writeText(join(stagedDir, relDir, 'page.md'), pageMarkdown);
  }

  const manifest: GdriveManifest = {
    source: GDRIVE_SOURCE_KEY,
    folderId: config.folderId,
    recursive: config.recursive,
    fetchedAt: new Date().toISOString(),
    fileCount: entries.length,
    skipped: [],
    warnings: [],
  };
  await writeJson(join(stagedDir, 'manifest.json'), manifest);
  return manifest;
}

View file

@ -0,0 +1,106 @@
import { JWT } from 'google-auth-library';
import type { GdriveFileRecord, GdriveServiceAccountKey, GoogleDocsDocument } from './types.js';
import { GDRIVE_SCOPES, gdriveServiceAccountKeySchema } from './types.js';
const GOOGLE_DRIVE_BASE_URL = 'https://www.googleapis.com/drive/v3';
const GOOGLE_DOCS_BASE_URL = 'https://docs.googleapis.com/v1';
/**
 * A file entry as returned by the Drive `files` listing, restricted to the
 * fields this client requests. Every field is optional because the response is
 * untrusted until `isGoogleApiFileRecord` has checked it.
 */
interface GoogleApiFile {
  id?: string;
  name?: string;
  mimeType?: string;
  parents?: string[];
  webViewLink?: string;
  modifiedTime?: string;
}

/**
 * Response body of the Drive `files` listing. Entries reuse `GoogleApiFile` so
 * the file shape is declared in exactly one place.
 */
interface GoogleApiListResponse {
  files?: GoogleApiFile[];
  /** Present when another page of results can be requested. */
  nextPageToken?: string;
}
/**
 * Decodes a Google API response as JSON, converting non-OK statuses into errors.
 *
 * @param response - Fetch response to decode.
 * @returns The JSON body, cast to `T` without validation.
 * @throws Error carrying the status code and the response body (or the status
 *   text when the body is empty) when `response.ok` is false.
 */
async function parseGoogleResponse<T>(response: Response): Promise<T> {
  if (response.ok) {
    return (await response.json()) as T;
  }
  const body = await response.text();
  const detail = body || response.statusText;
  throw new Error(`Google API request failed (${response.status}): ${detail}`);
}
/**
 * Issues a request to `url` using the request headers the JWT client returns
 * for that URL.
 *
 * @param client - JWT client that supplies the request headers.
 * @param url - Absolute URL to request.
 * @returns The raw fetch response; status handling is left to the caller.
 */
async function authorizedFetch(client: JWT, url: string): Promise<Response> {
  const authHeaders = await client.getRequestHeaders(url);
  const init = { headers: authHeaders };
  return fetch(url, init);
}
/**
 * Type guard confirming that an API file entry carries string `id`, `name`,
 * and `mimeType` fields.
 *
 * @param file - File entry from the listing response.
 * @returns `true` when all three fields are strings.
 */
function isGoogleApiFileRecord(file: GoogleApiFile): file is GoogleApiFile & {
  id: string;
  name: string;
  mimeType: string;
} {
  const requiredFields = ['id', 'name', 'mimeType'] as const;
  return requiredFields.every((field) => typeof file[field] === 'string');
}
/**
 * Creates Drive and Docs clients authenticated with a service account key.
 *
 * The key is validated with `gdriveServiceAccountKeySchema` and used to build a
 * JWT client with the `GDRIVE_SCOPES` scopes. Both clients share that JWT client.
 *
 * - `drive.listFiles` requests one page of up to 1000 files matching `q`
 *   (including items from shared drives) and returns only entries that have
 *   string `id`, `name`, and `mimeType`. Optional fields are normalized to
 *   `[]` / `null` when absent or of the wrong type.
 * - `docs.getDocument` fetches a document with tab content included and
 *   suggestions previewed as rejected.
 *
 * @param rawKey - Untyped service account key; validated before use.
 * @returns An object with `drive` and `docs` clients.
 * @throws When `rawKey` fails schema validation.
 */
export function createGoogleDocsClients(rawKey: unknown): {
  drive: {
    listFiles(args: { q: string; pageToken?: string }): Promise<{ files: GdriveFileRecord[]; nextPageToken: string | null }>;
  };
  docs: {
    getDocument(documentId: string): Promise<GoogleDocsDocument>;
  };
} {
  const key = gdriveServiceAccountKeySchema.parse(rawKey) satisfies GdriveServiceAccountKey;
  const client = new JWT({
    email: key.client_email,
    key: key.private_key,
    scopes: [...GDRIVE_SCOPES],
  });

  return {
    drive: {
      async listFiles(args) {
        const params = new URLSearchParams({
          q: args.q,
          supportsAllDrives: 'true',
          includeItemsFromAllDrives: 'true',
          pageSize: '1000',
          fields: 'nextPageToken,files(id,name,mimeType,parents,webViewLink,modifiedTime)',
        });
        if (args.pageToken) {
          params.set('pageToken', args.pageToken);
        }
        const response = await authorizedFetch(client, `${GOOGLE_DRIVE_BASE_URL}/files?${params.toString()}`);
        const parsed = await parseGoogleResponse<GoogleApiListResponse>(response);
        return {
          files: (parsed.files ?? [])
            // Entries missing id/name/mimeType cannot be processed downstream; drop them.
            .filter(isGoogleApiFileRecord)
            .map((file) => ({
              id: file.id,
              name: file.name,
              mimeType: file.mimeType,
              parents: Array.isArray(file.parents)
                ? file.parents.filter((parent): parent is string => typeof parent === 'string')
                : [],
              webViewLink: typeof file.webViewLink === 'string' ? file.webViewLink : null,
              modifiedTime: typeof file.modifiedTime === 'string' ? file.modifiedTime : null,
            })),
          nextPageToken: typeof parsed.nextPageToken === 'string' ? parsed.nextPageToken : null,
        };
      },
    },
    docs: {
      async getDocument(documentId: string) {
        const params = new URLSearchParams({
          includeTabsContent: 'true',
          suggestionsViewMode: 'PREVIEW_WITHOUT_SUGGESTIONS',
        });
        // Encode the id so characters such as "/", "?" or "#" cannot alter the
        // request path or query. Ids made of letters, digits, "-" and "_" are unchanged.
        const encodedId = encodeURIComponent(documentId);
        const response = await authorizedFetch(
          client,
          `${GOOGLE_DOCS_BASE_URL}/documents/${encodedId}?${params.toString()}`,
        );
        return await parseGoogleResponse<GoogleDocsDocument>(response);
      },
    },
  };
}

View file

@ -0,0 +1,33 @@
import type { ChunkResult, DiffSet, FetchContext, ScopeDescriptor, SourceAdapter } from '../../types.js';
import { chunkGdriveStagedDir, describeGdriveScope } from './chunk.js';
import { detectGdriveStagedDir } from './detect.js';
import { fetchGdriveSnapshot } from './fetch.js';
import { gdrivePullConfigSchema } from './types.js';
/**
 * Source adapter for Google Drive documents. Each method delegates to the
 * matching gdrive module function.
 */
export class GdriveSourceAdapter implements SourceAdapter {
  /** Source key identifying this adapter. */
  readonly source = 'gdrive';
  /** Skill names associated with this source. */
  readonly skillNames = ['gdrive_synthesize'];
  /** No reconcile skills are registered for this source. */
  readonly reconcileSkillNames: string[] = [];
  readonly evidenceIndexing = 'documents' as const;

  /** Reports whether `stagedDir` is recognised as a gdrive snapshot. */
  detect(stagedDir: string): Promise<boolean> {
    return detectGdriveStagedDir(stagedDir);
  }

  /**
   * Validates the pull config, parses the serialized service account key it
   * carries, and stages a snapshot into `stagedDir`.
   */
  async fetch(pullConfig: unknown, stagedDir: string, _ctx: FetchContext): Promise<void> {
    const config = gdrivePullConfigSchema.parse(pullConfig);
    const key: unknown = JSON.parse(config.serviceAccountKey);
    await fetchGdriveSnapshot({ key, config, stagedDir });
  }

  /** Splits the staged snapshot into work units, optionally limited by a diff. */
  chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    return chunkGdriveStagedDir(stagedDir, diffSet);
  }

  /** Describes the scope (fingerprint and path predicate) of the staged snapshot. */
  describeScope(stagedDir: string): Promise<ScopeDescriptor> {
    return describeGdriveScope(stagedDir);
  }
}

View file

@ -0,0 +1,323 @@
import type {
GoogleDocsDocument,
GoogleDocsDocumentStyle,
GoogleDocsHeaderFooter,
GoogleDocsLinkTarget,
GoogleDocsList,
GoogleDocsParagraph,
GoogleDocsParagraphElement,
GoogleDocsStructuralElement,
GoogleDocsTab,
GoogleDocsTable,
GoogleDocsTableCell,
} from './types.js';
/**
 * Backslash-escapes the Markdown characters `*`, `_`, `~`, and the backtick.
 *
 * @param value - Raw text.
 * @returns The text with each of those characters preceded by a backslash.
 */
function escapeMarkdownText(value: string): string {
  return value.replace(/[*_~`]/g, (special) => `\\${special}`);
}
/**
 * Builds an in-page anchor href from a heading or bookmark link target.
 *
 * @param prefix - Anchor kind, used as the href prefix.
 * @param target - Either the id itself or an object carrying an `id`.
 * @returns `#<prefix>-<id>` with the id trimmed, or `null` when no non-blank id is available.
 */
function normalizeInternalLinkTarget(prefix: 'heading' | 'bookmark', target: GoogleDocsLinkTarget | string | undefined): string | null {
  const rawId = typeof target === 'string' ? target : target?.id;
  const trimmedId = rawId?.trim();
  return trimmedId ? `#${prefix}-${trimmedId}` : null;
}
/**
 * Resolves the href for a text run's link.
 *
 * A non-blank external `url` wins. Otherwise the first available internal
 * target is used, checked in this order: `heading`, `headingId`, `bookmark`,
 * `bookmarkId`.
 *
 * @param element - Paragraph element whose text style may carry a link.
 * @returns The href, or `null` when the run has no usable link.
 */
function resolveLinkHref(element: GoogleDocsParagraphElement): string | null {
  const link = element.textRun?.textStyle?.link;

  const externalUrl = link?.url?.trim();
  if (externalUrl) {
    return externalUrl;
  }

  const headingAnchor =
    normalizeInternalLinkTarget('heading', link?.heading) ??
    normalizeInternalLinkTarget('heading', link?.headingId);
  if (headingAnchor !== null) {
    return headingAnchor;
  }

  const bookmarkAnchor =
    normalizeInternalLinkTarget('bookmark', link?.bookmark) ??
    normalizeInternalLinkTarget('bookmark', link?.bookmarkId);
  return bookmarkAnchor;
}
/**
 * Wraps raw text in a Markdown inline code span.
 *
 * Backslash escapes are not interpreted inside code spans, so the text is used
 * verbatim. When the text itself contains backticks, the fence is made one
 * backtick longer than the longest run and the content is padded with a space
 * on each side so the fence cannot merge with a leading/trailing backtick.
 */
function wrapInlineCode(raw: string): string {
  const backtickRuns = raw.match(/`+/g);
  if (!backtickRuns) {
    return `\`${raw}\``;
  }
  const longestRun = backtickRuns.reduce((longest, run) => Math.max(longest, run.length), 0);
  const fence = '`'.repeat(longestRun + 1);
  return `${fence} ${raw} ${fence}`;
}

/**
 * Renders one paragraph element as inline Markdown.
 *
 * Elements without text render as `[Embedded object]` for inline objects,
 * a horizontal-rule marker for page breaks, and an empty string otherwise.
 * Text runs have carriage returns removed and are then decorated in this
 * order: code span (font family `Courier New`), bold, italic, underline,
 * strikethrough, link, and finally superscript/subscript.
 *
 * Code runs are emitted from the raw text rather than the Markdown-escaped
 * text: escaping first and then escaping backticks again produced `\\` followed
 * by a backtick (which closes the span early) and left literal backslashes in
 * front of `*`, `_`, and `~` inside the rendered code.
 *
 * @param element - Paragraph element to render.
 * @returns The Markdown fragment for the element.
 */
function normalizeTextRun(element: GoogleDocsParagraphElement): string {
  const content = (element.textRun?.content ?? '').replace(/\r/g, '');
  const style = element.textRun?.textStyle;

  if (!content) {
    if (element.inlineObjectElement) {
      return '[Embedded object]';
    }
    if (element.pageBreak) {
      return '\n---\n';
    }
    return '';
  }

  const href = resolveLinkHref(element);
  const isCode = style?.weightedFontFamily?.fontFamily === 'Courier New';

  // Code spans take the text verbatim; everything else is Markdown-escaped.
  let text = isCode ? wrapInlineCode(content) : escapeMarkdownText(content);

  if (style?.bold) {
    text = `**${text}**`;
  }
  if (style?.italic) {
    text = `*${text}*`;
  }
  if (style?.underline) {
    text = `<u>${text}</u>`;
  }
  if (style?.strikethrough) {
    text = `~~${text}~~`;
  }
  if (href) {
    // A bare ")" would terminate the link destination early.
    text = `[${text}](${href.replace(/\)/g, '\\)')})`;
  }
  if (style?.baselineOffset === 'SUPERSCRIPT') {
    text = `<sup>${text}</sup>`;
  } else if (style?.baselineOffset === 'SUBSCRIPT') {
    text = `<sub>${text}</sub>`;
  }
  return text;
}
/**
 * Renders a paragraph's elements into a single line of inline Markdown.
 *
 * @param paragraph - Paragraph to render; `undefined` yields an empty string.
 * @returns The concatenated element fragments with all newlines removed and
 *   surrounding whitespace trimmed.
 */
function paragraphText(paragraph: GoogleDocsParagraph | undefined): string {
  const fragments: string[] = [];
  for (const element of paragraph?.elements ?? []) {
    fragments.push(normalizeTextRun(element));
  }
  const singleLine = fragments.join('').replace(/\n/g, '');
  return singleLine.trim();
}
/**
 * Maps a named paragraph style to a Markdown heading marker.
 *
 * `TITLE` maps to `#`, `SUBTITLE` to `##`, and `HEADING_<n>` to `n` hashes
 * capped at six.
 *
 * @param namedStyleType - Named style of the paragraph, if any.
 * @returns The heading marker, or `null` for non-heading styles and for heading
 *   levels that do not parse to an integer of at least 1.
 */
function headingPrefix(namedStyleType: string | undefined): string | null {
  switch (namedStyleType) {
    case 'TITLE':
      return '#';
    case 'SUBTITLE':
      return '##';
    default:
      break;
  }

  const headingTag = 'HEADING_';
  if (!namedStyleType?.startsWith(headingTag)) {
    return null;
  }

  const level = Number.parseInt(namedStyleType.slice(headingTag.length), 10);
  const isValidLevel = !Number.isNaN(level) && level >= 1;
  return isValidLevel ? '#'.repeat(Math.min(level, 6)) : null;
}
/**
 * Decides whether a list nesting level is numbered rather than bulleted.
 *
 * When a glyph type is present it alone decides: the level is ordered if the
 * upper-cased type contains NUMBER, DECIMAL, ALPHA, ROMAN, or LATIN. Only when
 * no glyph type is set is the trimmed glyph symbol compared against a small set
 * of numbered formats.
 *
 * @param level - Nesting level definition, if any.
 * @returns `true` for ordered levels.
 */
function isOrderedListLevel(level: { glyphType?: string; glyphSymbol?: string } | undefined): boolean {
  const glyphType = level?.glyphType?.toUpperCase();
  if (glyphType) {
    const orderedMarkers = ['NUMBER', 'DECIMAL', 'ALPHA', 'ROMAN', 'LATIN'];
    return orderedMarkers.some((marker) => glyphType.includes(marker));
  }

  const glyphSymbol = level?.glyphSymbol?.trim();
  switch (glyphSymbol) {
    case '%0.':
    case '%0)':
    case '1.':
    case '1)':
      return true;
    default:
      return false;
  }
}
/**
 * Computes the Markdown list marker for a bulleted paragraph.
 *
 * @param paragraph - Paragraph to inspect.
 * @param lists - List definitions keyed by list id, used to tell ordered from unordered levels.
 * @returns Indentation for the nesting level followed by `1. ` or `- `, or
 *   `null` when the paragraph is not a list item.
 */
function listPrefix(paragraph: GoogleDocsParagraph, lists: Record<string, GoogleDocsList> | undefined): string | null {
  const bullet = paragraph.bullet;
  if (!bullet) {
    return null;
  }

  // Negative nesting levels are clamped to the top level.
  const depth = Math.max(bullet.nestingLevel ?? 0, 0);
  const definition = bullet.listId ? lists?.[bullet.listId] : undefined;
  const levelDefinition = definition?.listProperties?.nestingLevels?.[depth];

  const marker = isOrderedListLevel(levelDefinition) ? '1. ' : '- ';
  return ' '.repeat(depth) + marker;
}
/**
 * Renders a paragraph as one Markdown block.
 *
 * List items take precedence over headings. Headings that carry a heading id
 * are preceded by an `<a id="heading-...">` anchor line.
 *
 * @param paragraph - Paragraph to render.
 * @param lists - List definitions keyed by list id.
 * @returns The Markdown block, or `null` when the paragraph has no text.
 */
function paragraphToMarkdown(
  paragraph: GoogleDocsParagraph | undefined,
  lists: Record<string, GoogleDocsList> | undefined,
): string | null {
  const text = paragraphText(paragraph);
  if (!text) {
    return null;
  }

  const bulletMarker = paragraph ? listPrefix(paragraph, lists) : null;
  if (bulletMarker) {
    return bulletMarker + text;
  }

  const paragraphStyle = paragraph?.paragraphStyle;
  const hashes = headingPrefix(paragraphStyle?.namedStyleType);
  if (!hashes) {
    return text;
  }

  const headingLine = `${hashes} ${text}`;
  const anchorId = paragraphStyle?.headingId?.trim();
  if (!anchorId) {
    return headingLine;
  }
  return `<a id="heading-${anchorId}"></a>\n${headingLine}`;
}
/**
 * Renders a table cell's content as a single-line Markdown table cell.
 *
 * Newlines inside a block become `<br>`, blocks are joined with ` / `, and pipe
 * characters are escaped so they do not split the cell.
 *
 * @param cell - Table cell to render.
 * @param lists - List definitions keyed by list id.
 * @returns The trimmed cell text.
 */
function normalizeTableCell(
  cell: GoogleDocsTableCell | undefined,
  lists: Record<string, GoogleDocsList> | undefined,
): string {
  const singleLineBlocks: string[] = [];
  for (const block of normalizeStructuralElements(cell?.content ?? [], lists)) {
    singleLineBlocks.push(block.replace(/\n/g, ' <br> '));
  }
  const joined = singleLineBlocks.join(' / ');
  return joined.replace(/\|/g, '\\|').trim();
}
/**
 * Builds the header/body divider row of a Markdown table.
 *
 * @param columnCount - Number of columns.
 * @returns A row of `---` cells, one per column.
 */
function markdownTableDivider(columnCount: number): string {
  const dashes = Array.from({ length: columnCount }).map(() => '---');
  const inner = dashes.join(' | ');
  return `| ${inner} |`;
}
/**
 * Renders a table as a single Markdown table block.
 *
 * Rows without cells are dropped, shorter rows are padded with empty cells to
 * the widest row, and the first remaining row is used as the header.
 *
 * @param table - Table to render.
 * @param lists - List definitions keyed by list id.
 * @returns A one-element array holding the table, or an empty array when no
 *   row has cells.
 */
function normalizeTable(table: GoogleDocsTable | undefined, lists: Record<string, GoogleDocsList> | undefined): string[] {
  const renderedRows: string[][] = [];
  for (const row of table?.tableRows ?? []) {
    const cells = (row.tableCells ?? []).map((cell) => normalizeTableCell(cell, lists));
    if (cells.length > 0) {
      renderedRows.push(cells);
    }
  }
  if (renderedRows.length === 0) {
    return [];
  }

  const columnCount = Math.max(...renderedRows.map((cells) => cells.length));
  const lines = renderedRows.map((cells) => {
    const padded = Array.from({ length: columnCount }, (_, index) => cells[index] ?? '');
    return `| ${padded.join(' | ')} |`;
  });
  // The divider goes directly beneath the header row.
  lines.splice(1, 0, markdownTableDivider(columnCount));

  return [lines.join('\n')];
}
/**
 * Renders structural elements into Markdown blocks, in document order.
 *
 * An element contributes its paragraph when that renders to text; otherwise its
 * table, when present. Elements with neither contribute nothing.
 *
 * @param elements - Structural elements to render.
 * @param lists - List definitions keyed by list id.
 * @returns The rendered blocks.
 */
function normalizeStructuralElements(
  elements: GoogleDocsStructuralElement[],
  lists: Record<string, GoogleDocsList> | undefined,
): string[] {
  return elements.flatMap((element) => {
    const paragraphBlock = paragraphToMarkdown(element.paragraph, lists);
    if (paragraphBlock) {
      return [paragraphBlock];
    }
    return element.table ? normalizeTable(element.table, lists) : [];
  });
}
/**
 * Maps header or footer ids to human-readable role names using the ids the
 * document style declares for the default, first-page, and even-page slots.
 *
 * Ids are trimmed; blank ids are skipped, and when one id fills several slots
 * the first slot (default, then first page, then even page) wins.
 *
 * @param label - Which family of ids to map.
 * @param documentStyle - Document style carrying the slot ids.
 * @returns Map from id to role name.
 */
function headerFooterRoleMap(
  label: 'Headers' | 'Footers',
  documentStyle: GoogleDocsDocumentStyle | undefined,
): Map<string, string> {
  const slots: Array<[string | null | undefined, string]> =
    label === 'Headers'
      ? [
          [documentStyle?.defaultHeaderId, 'Default Header'],
          [documentStyle?.firstPageHeaderId, 'First Page Header'],
          [documentStyle?.evenPageHeaderId, 'Even Page Header'],
        ]
      : [
          [documentStyle?.defaultFooterId, 'Default Footer'],
          [documentStyle?.firstPageFooterId, 'First Page Footer'],
          [documentStyle?.evenPageFooterId, 'Even Page Footer'],
        ];

  const rolesById = new Map<string, string>();
  for (const [slotId, roleName] of slots) {
    const key = slotId?.trim();
    if (key && !rolesById.has(key)) {
      rolesById.set(key, roleName);
    }
  }
  return rolesById;
}
/**
 * Renders all headers or footers as a `## Headers` / `## Footers` section.
 *
 * Entries are processed in sorted id order. Each non-empty entry becomes a
 * `###` subsection titled with its role name, or with `Header <id>` /
 * `Footer <id>` when the id has no known role.
 *
 * @param label - Section label; also selects which role names apply.
 * @param entries - Headers or footers keyed by id.
 * @param lists - List definitions keyed by list id.
 * @param documentStyle - Document style used to resolve role names.
 * @returns The section Markdown, or `null` when there is nothing to render.
 */
function normalizeHeaderFooterMap(
  label: 'Headers' | 'Footers',
  entries: Record<string, GoogleDocsHeaderFooter> | undefined,
  lists: Record<string, GoogleDocsList> | undefined,
  documentStyle: GoogleDocsDocumentStyle | undefined,
): string | null {
  if (!entries) {
    return null;
  }
  const byId = entries;
  const roles = headerFooterRoleMap(label, documentStyle);
  // "Headers" -> "Header", "Footers" -> "Footer".
  const singular = label.slice(0, -1);

  const sections = Object.keys(byId)
    .sort()
    .flatMap((id) => {
      const blocks = normalizeStructuralElements(byId[id]?.content ?? [], lists);
      if (blocks.length === 0) {
        return [];
      }
      const title = roles.get(id) ?? `${singular} ${escapeMarkdownText(id)}`;
      return [`### ${title}\n\n${blocks.join('\n\n').trim()}`];
    });

  return sections.length === 0 ? null : `## ${label}\n\n${sections.join('\n\n').trim()}`;
}
/**
 * Joins the non-blank sections with a blank line between them.
 *
 * @param sections - Candidate sections; `null` and whitespace-only entries are skipped.
 * @returns The joined, trimmed text, or `null` when every section is blank.
 */
function joinNonEmptySections(sections: Array<string | null>): string | null {
  const kept: string[] = [];
  for (const section of sections) {
    if (section?.trim()) {
      kept.push(section);
    }
  }
  return kept.length > 0 ? kept.join('\n\n').trim() : null;
}
/**
 * Flattens a tab tree into a single list in pre-order (each tab, then its descendants).
 *
 * @param tabs - Top-level tabs; undefined or empty yields an empty list.
 * @returns A new array containing every tab and nested child tab.
 */
function flattenGoogleDocsTabs(tabs: GoogleDocsTab[] | undefined): GoogleDocsTab[] {
  return (tabs ?? []).flatMap((tab) => [tab, ...flattenGoogleDocsTabs(tab.childTabs)]);
}
/**
 * Renders one document tab as Markdown: headers, then body, then footers.
 *
 * @param tab - Tab whose `documentTab` content is rendered.
 * @param fallbackLists - List definitions used when the tab declares none of its own.
 * @returns The tab's Markdown, prefixed with a `#` title when the tab has a non-blank
 *   title, or null when the tab produces no content.
 */
function normalizeTab(tab: GoogleDocsTab, fallbackLists: Record<string, GoogleDocsList> | undefined): string | null {
  const documentTab = tab.documentTab;
  const lists = documentTab?.lists ?? fallbackLists;
  const style = documentTab?.documentStyle;

  const headers = normalizeHeaderFooterMap('Headers', documentTab?.headers, lists, style);
  const body = normalizeStructuralElements(documentTab?.body?.content ?? [], lists).join('\n\n').trim();
  const footers = normalizeHeaderFooterMap('Footers', documentTab?.footers, lists, style);

  const content = joinNonEmptySections([headers, body, footers]);
  if (!content) {
    return null;
  }

  const title = tab.tabProperties?.title?.trim();
  // Untitled tabs contribute their content without a heading line.
  return title ? `# ${escapeMarkdownText(title)}\n\n${content}`.trim() : content;
}
/**
 * Converts a Google Docs document into Markdown.
 *
 * Tabs (including nested child tabs) are rendered first; if at least one tab yields
 * content, the result is those tab sections joined by blank lines. Otherwise the
 * document's top-level headers, body, and footers are rendered instead.
 *
 * @param document - Document payload to normalize.
 * @returns The Markdown text, or an empty string when nothing renders.
 */
export function normalizeGoogleDocToMarkdown(document: GoogleDocsDocument): string {
  const tabSections: string[] = [];
  for (const tab of flattenGoogleDocsTabs(document.tabs)) {
    const rendered = normalizeTab(tab, document.lists);
    if (rendered) {
      tabSections.push(rendered);
    }
  }
  if (tabSections.length > 0) {
    return tabSections.join('\n\n').trim();
  }

  // No tab produced content: fall back to the document-level fields.
  const body = normalizeStructuralElements(document.body?.content ?? [], document.lists).join('\n\n').trim();
  const headers = normalizeHeaderFooterMap('Headers', document.headers, document.lists, document.documentStyle);
  const footers = normalizeHeaderFooterMap('Footers', document.footers, document.lists, document.documentStyle);
  return joinNonEmptySections([headers, body, footers]) ?? '';
}

View file

@ -0,0 +1,167 @@
import { z } from 'zod';
// Read-only Google API scope URLs for the Docs API and the Drive API.
const GDRIVE_DOCS_SCOPE = 'https://www.googleapis.com/auth/documents.readonly';
const GDRIVE_DRIVE_SCOPE = 'https://www.googleapis.com/auth/drive.readonly';
/** Both read-only scopes, Drive first; `as const` keeps the literal string types. */
export const GDRIVE_SCOPES = [GDRIVE_DRIVE_SCOPE, GDRIVE_DOCS_SCOPE] as const;
/** Source identifier for this adapter; also the literal `source` value in the manifest schema. */
export const GDRIVE_SOURCE_KEY = 'gdrive';
/** MIME type string for native Google Docs documents. */
export const GDRIVE_DOC_MIME_TYPE = 'application/vnd.google-apps.document';
/**
 * Schema for the configuration of a Google Drive pull.
 *
 * - `serviceAccountKey`: non-empty string (presumably the service-account key
 *   material rather than a reference — TODO confirm against the caller).
 * - `folderId`: non-empty id of the Drive folder to read.
 * - `recursive`: defaults to `false` when omitted.
 */
export const gdrivePullConfigSchema = z.object({
serviceAccountKey: z.string().min(1),
folderId: z.string().min(1),
recursive: z.boolean().default(false),
});
/** Parsed output type of `gdrivePullConfigSchema`. */
export type GdrivePullConfig = z.infer<typeof gdrivePullConfigSchema>;
/**
 * Schema for the manifest describing one Google Drive pull.
 *
 * - `source`: always the literal `GDRIVE_SOURCE_KEY`.
 * - `folderId` / `recursive`: the folder and traversal mode recorded for the pull.
 * - `fetchedAt`: datetime-formatted string.
 * - `fileCount`: non-negative integer.
 * - `skipped`: `{ externalId, reason }` records; defaults to an empty list.
 * - `warnings`: free-form messages; defaults to an empty list.
 */
export const gdriveManifestSchema = z.object({
source: z.literal(GDRIVE_SOURCE_KEY),
folderId: z.string().min(1),
recursive: z.boolean(),
fetchedAt: z.string().datetime(),
fileCount: z.number().int().nonnegative(),
skipped: z.array(z.object({ externalId: z.string(), reason: z.string() })).default([]),
warnings: z.array(z.string()).default([]),
});
/** Parsed output type of `gdriveManifestSchema`. */
export type GdriveManifest = z.infer<typeof gdriveManifestSchema>;
/**
 * Schema for the metadata stored alongside one staged Google Doc.
 *
 * - `mimeType` must equal `GDRIVE_DOC_MIME_TYPE`.
 * - `url` and `modifiedTime` are nullable and default to `null`; `modifiedTime`,
 *   when present, must be a datetime-formatted string.
 * - `drivePath` defaults to an empty list (presumably the folder names leading to
 *   the file — TODO confirm against the adapter that writes it).
 */
export const gdriveMetadataSchema = z.object({
id: z.string(),
title: z.string(),
path: z.string(),
url: z.string().nullable().default(null),
mimeType: z.literal(GDRIVE_DOC_MIME_TYPE),
folderId: z.string(),
drivePath: z.array(z.string()).default([]),
modifiedTime: z.string().datetime().nullable().default(null),
});
/**
 * Schema for the fields read from a Google service-account JSON key.
 *
 * Requires a valid email in `client_email` and a non-empty `private_key`;
 * `project_id` is optional but must be non-empty when present.
 */
export const gdriveServiceAccountKeySchema = z.object({
client_email: z.string().email(),
private_key: z.string().min(1),
project_id: z.string().min(1).optional(),
});
/** Parsed output type of `gdriveServiceAccountKeySchema`. */
export type GdriveServiceAccountKey = z.infer<typeof gdriveServiceAccountKeySchema>;
/**
 * A Drive file entry as used by this adapter.
 * `webViewLink` and `modifiedTime` are null when no value is available.
 */
export interface GdriveFileRecord {
id: string;
name: string;
mimeType: string;
// Ids of the containing folders — presumably Drive parent folder ids; TODO confirm.
parents: string[];
webViewLink: string | null;
modifiedTime: string | null;
}
/**
 * Google Docs document payload accepted by the Markdown normalizer.
 * Every field is optional. Content may appear at the top level (`body`, `headers`,
 * `footers`) and/or per tab in `tabs`.
 */
export interface GoogleDocsDocument {
documentId?: string;
title?: string;
body?: {
content?: GoogleDocsStructuralElement[];
};
documentStyle?: GoogleDocsDocumentStyle;
// List definitions keyed by list id.
lists?: Record<string, GoogleDocsList>;
// Header and footer records keyed by their id.
headers?: Record<string, GoogleDocsHeaderFooter>;
footers?: Record<string, GoogleDocsHeaderFooter>;
tabs?: GoogleDocsTab[];
}
/** A list definition; its nesting levels describe the glyph used at each depth. */
export interface GoogleDocsList {
listProperties?: {
nestingLevels?: GoogleDocsListNestingLevel[];
};
}
/** Glyph settings for one nesting level of a list (file-private). */
interface GoogleDocsListNestingLevel {
glyphType?: string;
glyphSymbol?: string;
}
/**
 * One tab of a document. Tabs form a tree through `childTabs`; the tab's own
 * content lives under `documentTab` and mirrors the document-level content fields.
 */
export interface GoogleDocsTab {
tabProperties?: {
tabId?: string;
title?: string;
};
childTabs?: GoogleDocsTab[];
documentTab?: {
body?: {
content?: GoogleDocsStructuralElement[];
};
documentStyle?: GoogleDocsDocumentStyle;
lists?: Record<string, GoogleDocsList>;
headers?: Record<string, GoogleDocsHeaderFooter>;
footers?: Record<string, GoogleDocsHeaderFooter>;
};
}
/**
 * Ids of the headers and footers assigned to each page role
 * (default, first page, even pages). All fields are optional.
 */
export interface GoogleDocsDocumentStyle {
defaultHeaderId?: string;
defaultFooterId?: string;
firstPageHeaderId?: string;
firstPageFooterId?: string;
evenPageHeaderId?: string;
evenPageFooterId?: string;
}
/**
 * A single header or footer record with its structural content.
 * Presumably only one of `headerId` / `footerId` is set per record — TODO confirm.
 */
export interface GoogleDocsHeaderFooter {
headerId?: string;
footerId?: string;
content?: GoogleDocsStructuralElement[];
}
/**
 * One top-level content element: a paragraph, a table, or a section break.
 * `sectionBreak` is typed `unknown`, so its shape is not modelled here.
 */
export interface GoogleDocsStructuralElement {
paragraph?: GoogleDocsParagraph;
table?: GoogleDocsTable;
sectionBreak?: unknown;
}
/** A table made of rows. */
export interface GoogleDocsTable {
tableRows?: GoogleDocsTableRow[];
}
/** One table row made of cells (file-private). */
interface GoogleDocsTableRow {
tableCells?: GoogleDocsTableCell[];
}
/** One table cell; its content is itself a list of structural elements, so tables can nest. */
export interface GoogleDocsTableCell {
content?: GoogleDocsStructuralElement[];
}
/**
 * A paragraph: its inline elements plus optional list membership and style.
 */
export interface GoogleDocsParagraph {
elements?: GoogleDocsParagraphElement[];
// Present when the paragraph is a list item; `listId` refers to a list definition.
bullet?: {
listId?: string;
nestingLevel?: number;
};
paragraphStyle?: {
namedStyleType?: string;
headingId?: string;
};
}
/** Target of an internal link: an element id, optionally with the id of the tab containing it. */
export interface GoogleDocsLinkTarget {
id?: string;
tabId?: string;
}
/**
 * One inline element of a paragraph: a styled text run, an inline object, or a
 * page break. `inlineObjectElement` and `pageBreak` are typed `unknown`, so their
 * shapes are not modelled here.
 */
export interface GoogleDocsParagraphElement {
textRun?: {
content?: string;
textStyle?: {
bold?: boolean;
italic?: boolean;
underline?: boolean;
strikethrough?: boolean;
// A link is either external (`url`) or internal. Internal targets are declared in
// both a flat form (`tabId`, `headingId`, `bookmarkId`) and a nested form
// (`heading`, `bookmark`).
link?: {
url?: string;
tabId?: string;
headingId?: string;
bookmarkId?: string;
heading?: GoogleDocsLinkTarget;
bookmark?: GoogleDocsLinkTarget;
};
weightedFontFamily?: { fontFamily?: string };
// The named literals document expected values; the `| string` member means any string type-checks.
baselineOffset?: 'SUPERSCRIPT' | 'SUBSCRIPT' | string;
};
};
inlineObjectElement?: unknown;
pageBreak?: unknown;
}

View file

@ -21,6 +21,10 @@ export interface ProvenanceRawPathValidationInput {
deletedRawPaths: Set<string>;
}
/**
 * Canonicalizes a raw path for comparison: backslashes become forward slashes and
 * any leading slashes are removed. Dot segments are left untouched.
 */
function normalizeRawPath(path: string): string {
  const forwardSlashed = path.replace(/\\/g, '/');
  return forwardSlashed.replace(/^\/+/, '');
}
function parseSlRef(ref: string): { connectionId: string | null; sourceName: string; entityName: string | null } {
const withoutConnection = ref.includes('/') ? ref.slice(ref.indexOf('/') + 1) : ref;
const connectionId = ref.includes('/') ? ref.slice(0, ref.indexOf('/')) : null;
@ -132,8 +136,11 @@ export async function validateFinalIngestArtifacts(input: FinalArtifactGateInput
}
export function validateProvenanceRawPaths(input: ProvenanceRawPathValidationInput): void {
const currentRawPaths = new Set([...input.currentRawPaths].map(normalizeRawPath));
const deletedRawPaths = new Set([...input.deletedRawPaths].map(normalizeRawPath));
for (const row of input.rows) {
if (!input.currentRawPaths.has(row.rawPath) && !input.deletedRawPaths.has(row.rawPath)) {
const rawPath = normalizeRawPath(row.rawPath);
if (!currentRawPaths.has(rawPath) && !deletedRawPaths.has(rawPath)) {
throw new Error(`provenance row references raw path outside this snapshot: ${row.rawPath}`);
}
}

View file

@ -1,4 +1,5 @@
import { join } from 'node:path';
import { gdriveConnectionToPullConfig, parseGdriveConnectionConfig } from '../../context/connections/gdrive-config.js';
import { localConnectionToWarehouseDescriptor } from '../../context/connections/local-warehouse-descriptor.js';
import { notionConnectionToPullConfig, parseNotionConnectionConfig } from '../../context/connections/notion-config.js';
import { resolveKtxConfigReference } from '../core/config-reference.js';
@ -7,6 +8,7 @@ import type { KtxLocalProject } from '../../context/project/project.js';
import type { SqlAnalysisPort } from '../../context/sql-analysis/ports.js';
import { DbtSourceAdapter } from './adapters/dbt/dbt.adapter.js';
import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js';
import { GdriveSourceAdapter } from './adapters/gdrive/gdrive.adapter.js';
import { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js';
import { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js';
import { resolveQueryHistoryScopeFloor } from './adapters/historic-sql/scope-floor.js';
@ -103,6 +105,7 @@ export function createDefaultLocalIngestAdapters(
createLocalMetabaseSourceAdapter(project, {
...(options.logger ? { logger: options.logger } : {}),
}),
new GdriveSourceAdapter(),
new LookerSourceAdapter({
clientFactory: {
async createClient(config, ctx) {
@ -330,6 +333,9 @@ export async function localPullConfigForAdapter(
lastSuccessfulCursor: await localNotionRuntimeStore(project).readCursor(connectionId),
};
}
if (adapter.source === 'gdrive') {
return await gdriveConnectionToPullConfig(parseGdriveConnectionConfig(connection));
}
if (adapter.source === 'metricflow') {
const metricflow = connection.metricflow;
const metricflowConfig =

View file

@ -1,5 +1,5 @@
import { readFile, stat } from 'node:fs/promises';
import { join, normalize, resolve } from 'node:path';
import { isAbsolute, join, normalize, relative, resolve } from 'node:path';
import { tool } from 'ai';
import { z } from 'zod';
@ -10,8 +10,13 @@ interface ReadRawFileDeps {
const MAX_READ_RAW_FILE_BYTES = 120_000;
/**
 * Canonicalizes a requested raw path: collapses it with `path.normalize`, strips
 * leading separators, and converts backslashes to forward slashes so it can be
 * compared against the allow-list.
 */
function normalizeRawPath(path: string): string {
  const collapsed = normalize(path);
  const withoutLeadingSeparators = collapsed.replace(/^[/\\]+/, '');
  return withoutLeadingSeparators.replace(/\\/g, '/');
}
export function createReadRawFileTool(deps: ReadRawFileDeps) {
const stagedRoot = resolve(deps.stagedDir);
const allowedPaths = new Set([...deps.allowedPaths].map(normalizeRawPath));
return tool({
description:
"Read the full text content of a raw source file inside this WorkUnit. `path` must be relative to the staged bundle root (no leading slash, no `..`) and must appear in the WorkUnit's rawFiles or dependencyPaths list.",
@ -19,12 +24,13 @@ export function createReadRawFileTool(deps: ReadRawFileDeps) {
path: z.string().describe('Path relative to the staged bundle root. Example: "views/customers/customer.lkml".'),
}),
execute: async ({ path }) => {
const normalized = normalize(path).replace(/^[/\\]+/, '');
if (normalized.startsWith('..') || !deps.allowedPaths.has(normalized)) {
return `Error: path "${path}" is not accessible from this WorkUnit. Allowed paths: ${[...deps.allowedPaths].sort().join(', ')}`;
const normalized = normalizeRawPath(path);
if (normalized.startsWith('..') || !allowedPaths.has(normalized)) {
return `Error: path "${path}" is not accessible from this WorkUnit. Allowed paths: ${[...allowedPaths].sort().join(', ')}`;
}
const absolute = resolve(join(stagedRoot, normalized));
if (!absolute.startsWith(`${stagedRoot}/`) && absolute !== stagedRoot) {
const stagedRelative = relative(stagedRoot, absolute);
if (stagedRelative.startsWith('..') || isAbsolute(stagedRelative)) {
return `Error: path "${path}" is not accessible from this WorkUnit.`;
}
try {

View file

@ -1,5 +1,5 @@
import { readFile } from 'node:fs/promises';
import { join, normalize, resolve } from 'node:path';
import { isAbsolute, join, normalize, relative, resolve } from 'node:path';
import { tool } from 'ai';
import { z } from 'zod';
@ -8,8 +8,13 @@ interface ReadRawSpanDeps {
allowedPaths: Set<string>;
}
/**
 * Canonicalizes a requested raw path: collapses it with `path.normalize`, strips
 * leading separators, and converts backslashes to forward slashes so it can be
 * compared against the allow-list.
 */
function normalizeRawPath(path: string): string {
  const collapsed = normalize(path);
  const withoutLeadingSeparators = collapsed.replace(/^[/\\]+/, '');
  return withoutLeadingSeparators.replace(/\\/g, '/');
}
export function createReadRawSpanTool(deps: ReadRawSpanDeps) {
const stagedRoot = resolve(deps.stagedDir);
const allowedPaths = new Set([...deps.allowedPaths].map(normalizeRawPath));
return tool({
description:
'Read a 1-based inclusive line range from a raw source file. Use this to resolve a provenance pointer like `file.lkml#L15-28` without loading the whole file into context.',
@ -22,12 +27,13 @@ export function createReadRawSpanTool(deps: ReadRawSpanDeps) {
if (startLine > endLine) {
return `Error: startLine must be <= endLine (got startLine=${startLine}, endLine=${endLine})`;
}
const normalized = normalize(path).replace(/^[/\\]+/, '');
if (normalized.startsWith('..') || !deps.allowedPaths.has(normalized)) {
return `Error: path "${path}" is not accessible from this context. Allowed paths: ${[...deps.allowedPaths].sort().join(', ')}`;
const normalized = normalizeRawPath(path);
if (normalized.startsWith('..') || !allowedPaths.has(normalized)) {
return `Error: path "${path}" is not accessible from this context. Allowed paths: ${[...allowedPaths].sort().join(', ')}`;
}
const absolute = resolve(join(stagedRoot, normalized));
if (!absolute.startsWith(`${stagedRoot}/`) && absolute !== stagedRoot) {
const stagedRelative = relative(stagedRoot, absolute);
if (stagedRelative.startsWith('..') || isAbsolute(stagedRelative)) {
return `Error: path "${path}" is not accessible from this context.`;
}
try {

View file

@ -168,6 +168,18 @@ const notionConnectionSchema = z
})
.describe('Notion context-source connection.');
// Connection entry for the `gdrive` driver: a service-account key reference, the
// folder to ingest, and an optional recursion flag. Declared with `looseObject`,
// which presumably tolerates additional keys — confirm against the zod documentation.
const gdriveConnectionSchema = z
.looseObject({
driver: z.literal('gdrive'),
service_account_key_ref: z
.string()
.min(1)
.describe('Reference to a Google service-account JSON key file. Must use file:/absolute/path/to/key.json.'),
folder_id: z.string().min(1).describe('Google Drive folder ID to ingest.'),
recursive: z.boolean().optional().describe('When true, recursively traverse subfolders beneath folder_id.'),
})
.describe('Google Drive Google Docs context-source connection.');
const dbtConnectionSchema = z
.looseObject({
driver: z.literal('dbt'),
@ -202,6 +214,7 @@ export const connectionConfigSchema = z.discriminatedUnion('driver', [
lookerConnectionSchema,
lookmlConnectionSchema,
notionConnectionSchema,
gdriveConnectionSchema,
dbtConnectionSchema,
metricflowConnectionSchema,
]);

View file

@ -133,6 +133,7 @@ const sourceAdapterByDriver = new Map<string, string>([
['local_metabase', 'metabase'],
['looker', 'looker'],
['notion', 'notion'],
['gdrive', 'gdrive'],
['metricflow', 'metricflow'],
['dbt', 'dbt'],
['lookml', 'lookml'],

View file

@ -3,8 +3,16 @@ import { tmpdir } from 'node:os';
import { join, relative, resolve } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { localConnectionTypeForConfig } from './context/connections/local-warehouse-descriptor.js';
import {
parseGdriveConnectionConfig,
resolveGdriveServiceAccountKey,
} from './context/connections/gdrive-config.js';
import { resolveNotionConnectionAuthToken } from './context/connections/notion-config.js';
import { resolveKtxConfigReference } from './context/core/config-reference.js';
import {
createGoogleDocsClients,
} from './context/ingest/adapters/gdrive/gdrive-client.js';
import { GDRIVE_DOC_MIME_TYPE, gdriveServiceAccountKeySchema } from './context/ingest/adapters/gdrive/types.js';
import { cloneOrPull, testRepoConnection } from './context/ingest/repo-fetch.js';
import { DEFAULT_METABASE_CLIENT_CONFIG, MetabaseClient } from './context/ingest/adapters/metabase/client.js';
import { discoverMetabaseDatabases, type DiscoveredMetabaseDatabase } from './context/ingest/adapters/metabase/mapping.js';
@ -37,7 +45,7 @@ import {
type KtxSetupPromptOption,
} from './setup-prompts.js';
export type KtxSetupSourceType = 'dbt' | 'metricflow' | 'metabase' | 'looker' | 'lookml' | 'notion';
export type KtxSetupSourceType = 'dbt' | 'metricflow' | 'metabase' | 'looker' | 'lookml' | 'notion' | 'gdrive';
const DEFAULT_NOTION_MAX_KNOWLEDGE_CREATES_PER_RUN = 25;
@ -62,6 +70,9 @@ export interface KtxSetupSourcesArgs {
metabaseDatabaseId?: number;
notionCrawlMode?: 'all_accessible' | 'selected_roots';
notionRootPageIds?: string[];
gdriveServiceAccountKeyRef?: string;
gdriveFolderId?: string;
gdriveRecursive?: boolean;
runInitialSourceIngest: boolean;
skipSources: boolean;
}
@ -103,6 +114,7 @@ export interface KtxSetupSourcesDeps {
validateLooker?: (projectDir: string, connectionId: string) => Promise<SourceValidationResult>;
validateLookml?: (connection: KtxProjectConnectionConfig) => Promise<SourceValidationResult>;
validateNotion?: (connection: KtxProjectConnectionConfig) => Promise<SourceValidationResult>;
validateGdrive?: (connection: KtxProjectConnectionConfig) => Promise<SourceValidationResult>;
pickNotionRootPages?: typeof pickNotionRootPages;
discoverMetabaseDatabases?: (args: {
sourceUrl: string;
@ -125,6 +137,7 @@ const SOURCE_OPTIONS: Array<{ value: KtxSetupSourceType; label: string }> = [
{ value: 'metricflow', label: 'MetricFlow' },
{ value: 'looker', label: 'Looker' },
{ value: 'lookml', label: 'LookML' },
{ value: 'gdrive', label: 'Google Drive' },
];
const SOURCE_LABELS = Object.fromEntries(SOURCE_OPTIONS.map((option) => [option.value, option.label])) as Record<
@ -218,8 +231,10 @@ function credentialRef(value: string | undefined, label: string): string {
return ref;
}
type SharedSourceCredentialField = 'sourceAuthTokenRef' | 'sourceApiKeyRef' | 'sourceClientSecretRef';
type SourceCredentialFlag = {
field: 'sourceAuthTokenRef' | 'sourceApiKeyRef' | 'sourceClientSecretRef';
field: SharedSourceCredentialField | null;
flag: string;
};
@ -232,9 +247,10 @@ const SOURCE_CREDENTIAL_FLAG: Record<KtxSetupSourceType, SourceCredentialFlag> =
notion: { field: 'sourceAuthTokenRef', flag: '--source-auth-token-ref' },
metabase: { field: 'sourceApiKeyRef', flag: '--source-api-key-ref' },
looker: { field: 'sourceClientSecretRef', flag: '--source-client-secret-ref' },
gdrive: { field: null, flag: '--gdrive-service-account-key-ref' },
};
const ALL_SOURCE_CREDENTIAL_FLAGS: SourceCredentialFlag[] = [
const ALL_SOURCE_CREDENTIAL_FLAGS: Array<{ field: SharedSourceCredentialField; flag: string }> = [
{ field: 'sourceAuthTokenRef', flag: '--source-auth-token-ref' },
{ field: 'sourceApiKeyRef', flag: '--source-api-key-ref' },
{ field: 'sourceClientSecretRef', flag: '--source-client-secret-ref' },
@ -560,6 +576,22 @@ function buildNotionConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionC
};
}
/**
 * Builds the `gdrive` connection config from setup arguments.
 *
 * @param args - Setup arguments; reads the gdrive folder id, key ref, and recursive flag.
 * @returns Connection config with the trimmed folder id, the validated key ref, and
 *   `recursive` set to true only when the flag is exactly `true`.
 * @throws Error when the folder id is missing or blank; `credentialRef` may also
 *   throw for an unusable key ref.
 */
function buildGdriveConnection(args: KtxSetupSourcesArgs): KtxProjectConnectionConfig {
  const trimmedFolderId = args.gdriveFolderId?.trim();
  if (!trimmedFolderId) {
    throw new Error('Google Drive setup requires --gdrive-folder-id.');
  }

  // Validate the key ref only after the folder id check so the folder error surfaces first.
  const keyRef = credentialRef(args.gdriveServiceAccountKeyRef, 'Google Drive service account key ref');
  const recursive = args.gdriveRecursive === true;

  return {
    driver: 'gdrive',
    service_account_key_ref: keyRef,
    folder_id: trimmedFolderId,
    recursive,
  };
}
function sourcePathFromFileRepoUrl(repoUrl: string, subpath?: string): string {
const root = fileURLToPath(repoUrl);
return subpath ? join(root, subpath) : root;
@ -680,6 +712,17 @@ async function defaultValidateNotion(connection: KtxProjectConnectionConfig): Pr
return { ok: true, detail: `roots=${roots.length}` };
}
/**
 * Validates a Google Drive connection by loading the service-account key and
 * listing the configured folder once.
 *
 * @param connection - Connection config; parsed with `parseGdriveConnectionConfig`.
 * @returns `{ ok: true, detail: "docs=<n>" }`, where `n` counts the Google Docs among
 *   the files returned by that single `listFiles` call.
 * @throws Whatever config parsing, key resolution, `JSON.parse`, key-schema parsing,
 *   or the Drive request throws; nothing is caught here.
 */
async function defaultValidateGdrive(connection: KtxProjectConnectionConfig): Promise<SourceValidationResult> {
  const config = parseGdriveConnectionConfig(connection);
  const keyText = await resolveGdriveServiceAccountKey(config.service_account_key_ref);
  const clients = createGoogleDocsClients(gdriveServiceAccountKeySchema.parse(JSON.parse(keyText)));

  // The folder id is embedded in a single-quoted Drive query literal. Escape
  // backslashes and single quotes so an unexpected id cannot terminate the literal
  // and change the query. Well-formed ids contain neither and pass through unchanged.
  const folderIdLiteral = config.folder_id.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
  const result = await clients.drive.listFiles({
    q: `'${folderIdLiteral}' in parents and trashed = false`,
  });

  const docs = result.files.filter((file) => file.mimeType === GDRIVE_DOC_MIME_TYPE).length;
  return { ok: true, detail: `docs=${docs}` };
}
interface MappingJsonOutput {
connectionId: string;
refresh: { ok: boolean; output: string[] };
@ -1329,67 +1372,105 @@ async function promptForInteractiveSource(
]);
}
return await runSourcePromptSteps(initialState, (state) => [
if (source === 'notion') {
return await runSourcePromptSteps(initialState, (state) => [
...connectionSteps,
async (currentState) => {
const ref = await chooseSourceCredentialRef({
prompts,
projectDir: args.projectDir,
label: 'Notion integration token',
envName: 'NOTION_TOKEN',
secretFileName: `${currentState.sourceConnectionId ?? 'notion-main'}-token`,
existingRef: currentState.sourceAuthTokenRef,
});
if (ref === 'back') return 'back';
currentState.sourceAuthTokenRef = ref;
return 'next';
},
async (currentState) => {
const crawlMode = await prompts.select({
message: 'Which Notion pages should ktx ingest?',
options: [
{ value: 'all_accessible', label: 'All pages the integration can access' },
{ value: 'selected_roots', label: 'Specific pages and their subpages (choose them in a picker)' },
{ value: 'back', label: 'Back' },
],
});
if (crawlMode === 'back') return 'back';
currentState.notionCrawlMode = crawlMode === 'all_accessible' ? 'all_accessible' : 'selected_roots';
if (currentState.notionCrawlMode === 'all_accessible') {
delete currentState.notionRootPageIds;
}
return 'next';
},
...(state.notionCrawlMode === 'selected_roots'
? [
async (currentState: SourcePromptState) => {
const connectionId = currentState.sourceConnectionId ?? 'notion-main';
const result = await (deps.pickNotionRootPages ?? pickNotionRootPages)(
{
connectionId,
connection: {
driver: 'notion',
auth_token_ref: credentialRef(currentState.sourceAuthTokenRef, 'Notion token ref'),
crawl_mode: 'selected_roots',
root_page_ids: currentState.notionRootPageIds ?? [],
root_database_ids: [],
root_data_source_ids: [],
},
},
io,
);
if (result.kind === 'back') {
return 'back';
}
if (result.kind === 'unavailable') {
io.stderr.write(`${result.message}\n`);
return 'back';
}
currentState.notionRootPageIds = result.rootPageIds;
return 'next';
},
]
: []),
]);
}
return await runSourcePromptSteps(initialState, () => [
...connectionSteps,
async (currentState) => {
const ref = await chooseSourceCredentialRef({
prompts,
projectDir: args.projectDir,
label: 'Notion integration token',
envName: 'NOTION_TOKEN',
secretFileName: `${currentState.sourceConnectionId ?? 'notion-main'}-token`,
existingRef: currentState.sourceAuthTokenRef,
const keyRef = await promptText(prompts, {
message: 'Google Drive service account key file reference',
placeholder: 'file:/absolute/path/to/key.json',
...(currentState.gdriveServiceAccountKeyRef ? { initialValue: currentState.gdriveServiceAccountKeyRef } : {}),
});
if (ref === 'back') return 'back';
currentState.sourceAuthTokenRef = ref;
if (keyRef === undefined) return 'back';
currentState.gdriveServiceAccountKeyRef = keyRef.trim();
return 'next';
},
async (currentState) => {
const crawlMode = await prompts.select({
message: 'Which Notion pages should ktx ingest?',
const folderId = await promptText(prompts, {
message: 'Google Drive folder id',
...(currentState.gdriveFolderId ? { initialValue: currentState.gdriveFolderId } : {}),
});
if (folderId === undefined) return 'back';
currentState.gdriveFolderId = folderId.trim();
return 'next';
},
async (currentState) => {
const recursive = await prompts.select({
message: 'Include Google Docs from subfolders?',
options: [
{ value: 'all_accessible', label: 'All pages the integration can access' },
{ value: 'selected_roots', label: 'Specific pages and their subpages (choose them in a picker)' },
{ value: 'false', label: 'No' },
{ value: 'true', label: 'Yes' },
{ value: 'back', label: 'Back' },
],
});
if (crawlMode === 'back') return 'back';
currentState.notionCrawlMode = crawlMode === 'all_accessible' ? 'all_accessible' : 'selected_roots';
if (currentState.notionCrawlMode === 'all_accessible') {
delete currentState.notionRootPageIds;
}
if (recursive === 'back') return 'back';
currentState.gdriveRecursive = recursive === 'true';
return 'next';
},
...(state.notionCrawlMode === 'selected_roots'
? [
async (currentState: SourcePromptState) => {
const connectionId = currentState.sourceConnectionId ?? 'notion-main';
const result = await (deps.pickNotionRootPages ?? pickNotionRootPages)(
{
connectionId,
connection: {
driver: 'notion',
auth_token_ref: credentialRef(currentState.sourceAuthTokenRef, 'Notion token ref'),
crawl_mode: 'selected_roots',
root_page_ids: currentState.notionRootPageIds ?? [],
root_database_ids: [],
root_data_source_ids: [],
},
},
io,
);
if (result.kind === 'back') {
return 'back';
}
if (result.kind === 'unavailable') {
io.stderr.write(`${result.message}\n`);
return 'back';
}
currentState.notionRootPageIds = result.rootPageIds;
return 'next';
},
]
: []),
]);
}
@ -1559,6 +1640,13 @@ function sourceArgsFromExistingConnection(input: {
return sourceArgs;
}
if (input.source === 'gdrive') {
sourceArgs.gdriveServiceAccountKeyRef = stringField(input.connection.service_account_key_ref);
sourceArgs.gdriveFolderId = stringField(input.connection.folder_id);
sourceArgs.gdriveRecursive = input.connection.recursive === true;
return sourceArgs;
}
sourceArgs.sourceAuthTokenRef = stringField(input.connection.auth_token_ref);
sourceArgs.notionCrawlMode =
input.connection.crawl_mode === 'all_accessible' ? 'all_accessible' : 'selected_roots';
@ -1740,7 +1828,10 @@ function buildConnection(source: KtxSetupSourceType, args: KtxSetupSourcesArgs):
if (source === 'lookml') {
return buildLookmlConnection(args);
}
return buildNotionConnection(args);
if (source === 'notion') {
return buildNotionConnection(args);
}
return buildGdriveConnection(args);
}
async function validateSource(
@ -1765,7 +1856,10 @@ async function validateSource(
if (source === 'lookml') {
return await (deps.validateLookml ?? defaultValidateLookml)(args.connection);
}
return await (deps.validateNotion ?? defaultValidateNotion)(args.connection);
if (source === 'notion') {
return await (deps.validateNotion ?? defaultValidateNotion)(args.connection);
}
return await (deps.validateGdrive ?? defaultValidateGdrive)(args.connection);
}
async function createSourceSetupRollback(projectDir: string): Promise<() => Promise<void>> {

View file

@ -126,6 +126,9 @@ export type KtxSetupArgs =
metabaseDatabaseId?: number;
notionCrawlMode?: 'all_accessible' | 'selected_roots';
notionRootPageIds?: string[];
gdriveServiceAccountKeyRef?: string;
gdriveFolderId?: string;
gdriveRecursive?: boolean;
runInitialSourceIngest?: boolean;
skipSources?: boolean;
showEntryMenu?: boolean;
@ -167,7 +170,7 @@ export interface KtxSetupDeps {
setupUi?: KtxSetupUiAdapter;
}
const SOURCE_DRIVERS = new Set(['dbt', 'metricflow', 'metabase', 'looker', 'lookml', 'notion']);
const SOURCE_DRIVERS = new Set(['dbt', 'metricflow', 'metabase', 'looker', 'lookml', 'notion', 'gdrive']);
const KTX_DOCS_URL = 'https://docs.kaelio.com/ktx';
type KtxSetupEntryAction = 'setup' | 'new-project' | 'agents' | 'status' | 'demo' | 'exit';
@ -873,6 +876,11 @@ async function runKtxSetupInner(args: KtxSetupArgs, io: KtxCliIo, deps: KtxSetup
...(args.metabaseDatabaseId !== undefined ? { metabaseDatabaseId: args.metabaseDatabaseId } : {}),
...(args.notionCrawlMode ? { notionCrawlMode: args.notionCrawlMode } : {}),
...(args.notionRootPageIds ? { notionRootPageIds: args.notionRootPageIds } : {}),
...(args.gdriveServiceAccountKeyRef
? { gdriveServiceAccountKeyRef: args.gdriveServiceAccountKeyRef }
: {}),
...(args.gdriveFolderId ? { gdriveFolderId: args.gdriveFolderId } : {}),
...(args.gdriveRecursive !== undefined ? { gdriveRecursive: args.gdriveRecursive } : {}),
runInitialSourceIngest: args.runInitialSourceIngest ?? false,
skipSources: args.skipSources === true || !shouldRunSources || skipSourcesFromDatabaseMenu,
},

View file

@ -0,0 +1,97 @@
---
name: gdrive_synthesize
description: Synthesize durable KTX wiki pages from staged Google Drive document pulls. Load when a WorkUnit contains Google Doc raw files from `docs/**`.
callers: [memory_agent]
---
# Google Drive Doc Synthesis
Use this skill when a WorkUnit contains staged Google Drive content from `docs/**`.
## Role
Each WorkUnit is one Google Doc plus its metadata. Read the assigned raw files, then write a small set of durable wiki entries that capture reusable organizational knowledge. Write final memory directly; do not write candidates.
## Required Workflow
1. Read the WorkUnit notes and `rawFiles` list. Document content lives in `page.md`; `metadata.json` holds title, path, url, modified time, and Drive folder context.
2. For each assigned doc, call `read_raw_file`, or `read_raw_span` for oversized docs when the notes specify a span.
3. Search `wiki_search` for existing pages that overlap the WorkUnit topics. Prefer updating an existing page over creating a duplicate.
4. Use `context_evidence_search`, `context_evidence_read`, and `context_evidence_neighbors` when indexed document chunks would help reconcile related facts. Pass `chunkId` and `documentId` values verbatim as returned by the evidence tools.
5. Write durable business knowledge with `wiki_write`. Aim for a small number of high-quality pages per doc. Include `rawPaths` with the exact Google Drive raw files that support each page.
6. If a doc references warehouse, dbt, Looker, Metabase, or MetricFlow objects, you may verify them with `discover_data`, `entity_details`, `sql_execution`, `sl_discover`, or `sl_read_source`, but Google Drive docs are knowledge-only in v1. Do not create semantic-layer sources under the `gdrive` connection.
7. For every deleted raw path in the Eviction Set, call `eviction_list`, decide retention, then `emit_eviction_decision`. Do this even when no wiki write is needed.
## What To Capture
Capture durable, reusable company knowledge:
- policies, workflows, process rules, ownership conventions, and operating procedures
- product definitions, business terminology, and organizational guidance
- source-of-truth statements, caveats, conflict notes, and supersession guidance
- cross-system aliases that connect doc terminology to warehouse, dbt, Looker, Metabase, or MetricFlow names
Skip noisy or transient content:
- brainstorming notes with no durable rule
- task lists, meeting scheduling details, and time-bounded status updates
- duplicate docs with no new fact
- shallow summaries that add no reusable policy or definition
## Quality
Prefer fewer, stronger entries. Every wiki entry must cite at least one Google Doc by its title or path, and must include the doc's last-modified date when it is available. When evidence conflicts, write a conflict note inside the wiki page rather than choosing silently.
If one doc covers several related ideas, synthesize the shared durable rules instead of writing one thin page per paragraph. For oversized spans, read only the assigned span unless the WorkUnit explicitly asks for neighboring context.
Search existing wiki pages for the same `tables:` or `sl_refs:` frontmatter and for source-of-truth aliases before creating a new page. If an existing page already documents the same warehouse object or business concept, update it instead of creating a differently named duplicate.
## Citation Style
```md
## Agentic Harness
- The harness provides the operational framework that turns an agent prototype into a production system.
- Source: Google Doc - Harness, last modified 2026-05-24.
- Conflict note: An older internal note uses a narrower definition focused only on tool wiring; treat the current Google Doc as the durable operating definition unless replaced explicitly.
```
## Semantic-Layer Rules
- Google Drive docs are knowledge-only in v1; keep durable output in wiki pages.
- Do not create semantic-layer sources under the `gdrive` connection.
- If a doc references an existing warehouse or semantic-layer object and you can verify it, you may attach `sl_refs` in wiki output after confirmation.
- If a doc mentions a table or source that cannot be verified, keep the identifier in the wiki text and mark it as unverified. Use `emit_unmapped_fallback` only when the missing physical object itself is the important durable fact.
## Identifier Verification Protocol
Before writing a wiki page on any topic:
1. `discover_data({query: "<topic>"})` - see what wikis, SL sources, and raw
tables already exist. Prefer updating existing pages over creating new ones.
Before emitting any `schema.table` or `schema.table.column` into a wiki body,
`tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`:
2. `entity_details({connectionId, targets: [{display: "<identifier>"}]})` -
confirm the identifier resolves; inspect native types, FK/PK, and
sampleValues.
3. For literal values from the doc, such as status codes or plan tiers,
check whether they appear in `entity_details` sampleValues for the relevant
column. If sampleValues is short or the sample may have missed real values,
run a `sql_execution` probe with the same warehouse connection id:
`sql_execution({connectionId, sql: "SELECT DISTINCT <col> FROM <ref> LIMIT 50"})`.
4. If the candidate identifier still does not resolve, do whichever of the following applies:
- Use `sql_execution({connectionId, sql: "SELECT 1 FROM <ref> LIMIT 0"})`.
If it errors, the identifier is fictional.
- Wrap the identifier in `[unverified - from <rawPath>]` in the wiki body,
citing the exact raw path that mentioned it.
- When recording `emit_unmapped_fallback` with `no_physical_table`, include
the failing probe error in `clarification`.
5. Never copy placeholder strings from these instructions, such as
`<schema>.<table>`, `<ref>`, `<col>`, or `<identifier>`, into output.
## Tools
Allowed: `read_raw_file`, `read_raw_span`, `wiki_search`, `wiki_read`, `wiki_write`, `discover_data`, `entity_details`, `sql_execution`, `sl_discover`, `sl_read_source`, `context_evidence_search`, `context_evidence_read`, `context_evidence_neighbors`, `emit_unmapped_fallback`, `eviction_list`, `emit_eviction_decision`.
Not allowed: `context_candidate_write`, `context_candidate_mark`, `sl_write_source`, `sl_edit_source`, `sl_validate`.