diff --git a/apps/x/apps/main/src/browser/control-service.ts b/apps/x/apps/main/src/browser/control-service.ts new file mode 100644 index 00000000..04edd93e --- /dev/null +++ b/apps/x/apps/main/src/browser/control-service.ts @@ -0,0 +1,311 @@ +import { BrowserWindow } from 'electron'; +import type { IBrowserControlService } from '@x/core/dist/application/browser-control/service.js'; +import type { BrowserControlAction, BrowserControlInput, BrowserControlResult } from '@x/shared/dist/browser-control.js'; +import { browserViewManager } from './view.js'; + +const SEARCH_ENGINE_BASE_URL = 'https://www.google.com/search?q='; + +/** Schemes that must never load in the embedded browser; compared against the lowercased scheme name. */ +const BLOCKED_URL_SCHEMES = new Set(['javascript', 'file', 'chrome', 'chrome-extension', 'devtools', 'view-source', 'data', 'vbscript']); + +/** + * Turns free-form input (a URL, a bare host, or a search phrase) into a navigation target. + * + * Blocked schemes are matched by scheme name, so slash-less forms such as `file:/etc/passwd` are rejected too. + * `host:port` input such as `localhost:3000` gets an explicit scheme, because the generic `scheme:` test + * would otherwise mistake the host name for a URL scheme. A `word:` prefix followed by text containing + * whitespace (for example `site:example.com pricing`) is treated as a search phrase, not as a URL. + * + * @param target Raw text supplied by the caller. + * @returns A URL, a bare host returned unchanged, or a search-engine URL for anything else. + * @throws Error if the target is empty or uses a blocked scheme. + */ +function normalizeNavigationTarget(target: string): string { + const trimmed = target.trim(); + if (!trimmed) { + throw new Error('Navigation target cannot be empty.'); + } + + const lower = trimmed.toLowerCase(); + const hasWhitespace = /\s/.test(trimmed); + const scheme = /^([a-z][a-z0-9+.-]*):/.exec(lower)?.[1]; + if (scheme && BLOCKED_URL_SCHEMES.has(scheme)) { + throw new Error('That URL scheme is not allowed in the embedded browser.'); + } + + // A host followed by a numeric port is not a scheme; give it one so later layers do not misread it. + const hostWithPort = /^(localhost|[a-z][a-z0-9-]*(?:\.[a-z0-9-]+)+):\d{1,5}(?:[/?#].*)?$/.exec(lower); + if (hostWithPort && !hasWhitespace) { + return `${hostWithPort[1] === 'localhost' ? 'http' : 'https'}://${trimmed}`; + } + + // Real URLs pass through; `//` after the scheme marks a URL even when the path contains spaces. + if (scheme && (!hasWhitespace || lower.startsWith(`${scheme}://`))) { + return trimmed; + } + + const looksLikeHost = + lower.startsWith('localhost') + || /^[\w.-]+\.[a-z]{2,}/i.test(trimmed) + || /^\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?(?:\/.*)?$/.test(trimmed); + + if (looksLikeHost && !hasWhitespace) { + return trimmed; + } + + return `${SEARCH_ENGINE_BASE_URL}${encodeURIComponent(trimmed)}`; +} + +function emitPaneState(open: boolean): void { + const windows = BrowserWindow.getAllWindows(); + for (const win of windows) { + if (!win.isDestroyed() && win.webContents) { + win.webContents.send('browser:didRequestPaneState', { open }); + } + } +} + +function buildSuccessResult( + action: BrowserControlAction, + message: string, + page?: BrowserControlResult['page'], +): BrowserControlResult { + return { + success: true, + action, + message, + browser: browserViewManager.getState(), + ...(page ? 
{ page } : {}), + }; +} + +function buildErrorResult(action: BrowserControlAction, error: string): BrowserControlResult { + return { + success: false, + action, + error, + browser: browserViewManager.getState(), + }; +} + +export class ElectronBrowserControlService implements IBrowserControlService { + /** Asks every renderer window to show the browser pane and makes the embedded view visible. */ private ensurePaneOpen(): void { + emitPaneState(true); + browserViewManager.setVisible(true); + } + + /** Runs one browser-control action; failures inside the try block are returned as error results, not thrown. */ async execute( + input: BrowserControlInput, + ctx?: { signal?: AbortSignal }, + ): Promise<BrowserControlResult> { + const signal = ctx?.signal; + + try { + this.ensurePaneOpen(); // inside the try so a failure here becomes an error result instead of a rejection + switch (input.action) { + case 'open': { + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult('open', 'Opened the browser pane.', page); + } + + case 'get-state': + return buildSuccessResult('get-state', 'Read the current browser state.'); + + case 'new-tab': { + const target = input.target ? normalizeNavigationTarget(input.target) : undefined; + const result = await browserViewManager.newTab(target); + if (!result.ok) { + return buildErrorResult('new-tab', result.error ?? 'Failed to open a new tab.'); + } + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult( + 'new-tab', + target ? `Opened a new tab for ${target}.` : 'Opened a new tab.', + page, + ); + } + + case 'switch-tab': { + const tabId = input.tabId; + if (!tabId) { + return buildErrorResult('switch-tab', 'tabId is required for switch-tab.'); + } + const result = browserViewManager.switchTab(tabId); + if (!result.ok) { + return buildErrorResult('switch-tab', `No browser tab exists with id ${tabId}.`); + } + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? 
undefined; + return buildSuccessResult('switch-tab', `Switched to tab ${tabId}.`, page); + } + + case 'close-tab': { + const tabId = input.tabId; + if (!tabId) { + return buildErrorResult('close-tab', 'tabId is required for close-tab.'); + } + const result = browserViewManager.closeTab(tabId); + if (!result.ok) { + return buildErrorResult('close-tab', `Could not close tab ${tabId}.`); + } + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult('close-tab', `Closed tab ${tabId}.`, page); + } + + case 'navigate': { + const rawTarget = input.target; + if (!rawTarget) { + return buildErrorResult('navigate', 'target is required for navigate.'); + } + const target = normalizeNavigationTarget(rawTarget); + const result = await browserViewManager.navigate(target); + if (!result.ok) { + return buildErrorResult('navigate', result.error ?? `Failed to navigate to ${target}.`); + } + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult('navigate', `Navigated to ${target}.`, page); + } + + case 'back': { + const result = browserViewManager.back(); + if (!result.ok) { + return buildErrorResult('back', 'The active tab cannot go back.'); + } + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult('back', 'Went back in the active tab.', page); + } + + case 'forward': { + const result = browserViewManager.forward(); + if (!result.ok) { + return buildErrorResult('forward', 'The active tab cannot go forward.'); + } + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? 
undefined; + return buildSuccessResult('forward', 'Went forward in the active tab.', page); + } + + case 'reload': { + browserViewManager.reload(); + await browserViewManager.ensureActiveTabReady(signal); + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult('reload', 'Reloaded the active tab.', page); + } + + case 'read-page': { + const result = await browserViewManager.readPage( + { + maxElements: input.maxElements, + maxTextLength: input.maxTextLength, + }, + signal, + ); + if (!result.ok || !result.page) { + return buildErrorResult('read-page', result.error ?? 'Failed to read the current page.'); + } + return buildSuccessResult('read-page', 'Read the current page.', result.page); + } + + case 'click': { + const result = await browserViewManager.click( + { + index: input.index, + selector: input.selector, + snapshotId: input.snapshotId, + }, + signal, + ); + if (!result.ok) { + return buildErrorResult('click', result.error ?? 'Failed to click the requested element.'); + } + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult( + 'click', + result.description ? `Clicked ${result.description}.` : 'Clicked the requested element.', + page, + ); + } + + case 'type': { + const text = input.text; + if (text === undefined) { + return buildErrorResult('type', 'text is required for type.'); + } + const result = await browserViewManager.type( + { + index: input.index, + selector: input.selector, + snapshotId: input.snapshotId, + }, + text, + signal, + ); + if (!result.ok) { + return buildErrorResult('type', result.error ?? 'Failed to type into the requested element.'); + } + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult( + 'type', + result.description ? 
`Typed into ${result.description}.` : 'Typed into the requested element.', + page, + ); + } + + case 'press': { + const key = input.key; + if (!key) { + return buildErrorResult('press', 'key is required for press.'); + } + const result = await browserViewManager.press( + key, + { + index: input.index, + selector: input.selector, + snapshotId: input.snapshotId, + }, + signal, + ); + if (!result.ok) { + return buildErrorResult('press', result.error ?? `Failed to press ${key}.`); + } + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult( + 'press', + result.description ? `Pressed ${result.description}.` : `Pressed ${key}.`, + page, + ); + } + + case 'scroll': { + const result = await browserViewManager.scroll( + input.direction ?? 'down', + input.amount ?? 700, + signal, + ); + if (!result.ok) { + return buildErrorResult('scroll', result.error ?? 'Failed to scroll the page.'); + } + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult('scroll', `Scrolled ${input.direction ?? 'down'}.`, page); + } + + case 'wait': { + const duration = input.ms ?? 1000; + await browserViewManager.wait(duration, signal); + const page = await browserViewManager.readPageSummary(signal) ?? undefined; + return buildSuccessResult('wait', `Waited ${duration}ms for the page to settle.`, page); + } + } + } catch (error) { + return buildErrorResult( + input.action, + error instanceof Error ? 
error.message : 'Browser control failed unexpectedly.', + ); + } + } +} diff --git a/apps/x/apps/main/src/browser/view.ts b/apps/x/apps/main/src/browser/view.ts index 39f33a09..6b4978bb 100644 --- a/apps/x/apps/main/src/browser/view.ts +++ b/apps/x/apps/main/src/browser/view.ts @@ -1,6 +1,14 @@ import { randomUUID } from 'node:crypto'; import { EventEmitter } from 'node:events'; -import { BrowserWindow, WebContentsView, session, shell, type Session } from 'electron'; +import { BrowserWindow, WebContentsView, session, shell, type Session, type WebContents } from 'electron'; +import type { + BrowserPageElement, + BrowserPageSnapshot, + BrowserState, + BrowserTabState, +} from '@x/shared/dist/browser-control.js'; + +export type { BrowserPageSnapshot, BrowserState, BrowserTabState }; /** * Embedded browser pane implementation. @@ -22,6 +30,166 @@ const SPOOF_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'; const HOME_URL = 'https://www.google.com'; +const NAVIGATION_TIMEOUT_MS = 10000; +const POST_ACTION_IDLE_MS = 400; +const POST_ACTION_MAX_ELEMENTS = 25; +const POST_ACTION_MAX_TEXT_LENGTH = 4000; +const DEFAULT_READ_MAX_ELEMENTS = 50; +const DEFAULT_READ_MAX_TEXT_LENGTH = 8000; + +const INTERACTABLE_SELECTORS = [ + 'a[href]', + 'button', + 'input', + 'textarea', + 'select', + 'summary', + '[role="button"]', + '[role="link"]', + '[role="tab"]', + '[role="menuitem"]', + '[role="option"]', + '[contenteditable="true"]', + '[tabindex]:not([tabindex="-1"])', +].join(', '); + +const DOM_HELPERS_SOURCE = String.raw` +const truncateText = (value, max) => { + const normalized = String(value ?? 
'').replace(/\s+/g, ' ').trim(); + if (!normalized) return ''; + if (normalized.length <= max) return normalized; + const safeMax = Math.max(0, max - 3); + return normalized.slice(0, safeMax).trim() + '...'; +}; + +const cssEscapeValue = (value) => { + if (typeof CSS !== 'undefined' && typeof CSS.escape === 'function') { + return CSS.escape(value); + } + return String(value).replace(/[^a-zA-Z0-9_-]/g, (char) => '\\' + char); +}; + +const isVisibleElement = (element) => { + if (!(element instanceof Element)) return false; + const style = window.getComputedStyle(element); + if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') { + return false; + } + if (element.getAttribute('aria-hidden') === 'true') return false; + const rect = element.getBoundingClientRect(); + return rect.width > 0 && rect.height > 0; +}; + +const isDisabledElement = (element) => { + if (!(element instanceof Element)) return true; + if (element.getAttribute('aria-disabled') === 'true') return true; + return 'disabled' in element && Boolean(element.disabled); +}; + +const getElementRole = (element) => { + const explicitRole = element.getAttribute('role'); + if (explicitRole) return explicitRole; + if (element instanceof HTMLAnchorElement) return 'link'; + if (element instanceof HTMLButtonElement) return 'button'; + if (element instanceof HTMLInputElement) return element.type === 'checkbox' ? 
'checkbox' : 'input'; + if (element instanceof HTMLTextAreaElement) return 'textbox'; + if (element instanceof HTMLSelectElement) return 'combobox'; + if (element instanceof HTMLElement && element.isContentEditable) return 'textbox'; + return null; +}; + +const getElementType = (element) => { + if (element instanceof HTMLInputElement) return element.type || 'text'; + if (element instanceof HTMLTextAreaElement) return 'textarea'; + if (element instanceof HTMLSelectElement) return 'select'; + if (element instanceof HTMLButtonElement) return 'button'; + if (element instanceof HTMLElement && element.isContentEditable) return 'contenteditable'; + return null; +}; + +const getElementLabel = (element) => { + const ariaLabel = truncateText(element.getAttribute('aria-label') ?? '', 120); + if (ariaLabel) return ariaLabel; + + if ('labels' in element && element.labels && element.labels.length > 0) { + const labelText = truncateText( + Array.from(element.labels).map((label) => label.innerText || label.textContent || '').join(' '), + 120, + ); + if (labelText) return labelText; + } + + if (element.id) { + const label = document.querySelector('label[for="' + cssEscapeValue(element.id) + '"]'); + const labelText = truncateText(label?.textContent ?? '', 120); + if (labelText) return labelText; + } + + const placeholder = truncateText(element.getAttribute('placeholder') ?? '', 120); + if (placeholder) return placeholder; + + const text = truncateText( + element instanceof HTMLInputElement || element instanceof HTMLTextAreaElement + ? element.value + : element.textContent ?? '', + 120, + ); + return text || null; +}; + +const describeElement = (element) => { + const role = getElementRole(element) || element.tagName.toLowerCase(); + const label = getElementLabel(element); + return label ? 
role + ' "' + label + '"' : role; +}; + +const buildUniqueSelector = (element) => { + if (!(element instanceof Element)) return null; + + if (element.id) { + const idSelector = '#' + cssEscapeValue(element.id); + try { + if (document.querySelectorAll(idSelector).length === 1) return idSelector; + } catch {} + } + + const segments = []; + let current = element; + while (current && current instanceof Element && current !== document.documentElement) { + const tag = current.tagName.toLowerCase(); + if (!tag) break; + + let segment = tag; + const name = current.getAttribute('name'); + if (name) { + const nameSelector = tag + '[name="' + cssEscapeValue(name) + '"]'; + try { + if (document.querySelectorAll(nameSelector).length === 1) { + segments.unshift(nameSelector); + return segments.join(' > '); + } + } catch {} + } + + const parent = current.parentElement; + if (parent) { + const sameTagSiblings = Array.from(parent.children).filter((child) => child.tagName === current.tagName); + const position = sameTagSiblings.indexOf(current) + 1; + segment += ':nth-of-type(' + position + ')'; + } + + segments.unshift(segment); + const selector = segments.join(' > '); + try { + if (document.querySelectorAll(selector).length === 1) return selector; + } catch {} + + current = current.parentElement; + } + + return segments.length > 0 ? 
segments.join(' > ') : null; +}; +`; export interface BrowserBounds { x: number; @@ -30,30 +198,236 @@ export interface BrowserBounds { height: number; } -export interface BrowserTabState { - id: string; - url: string; - title: string; - canGoBack: boolean; - canGoForward: boolean; - loading: boolean; -} - -export interface BrowserState { - activeTabId: string | null; - tabs: BrowserTabState[]; -} - type BrowserTab = { id: string; view: WebContentsView; }; +type CachedSnapshot = { + snapshotId: string; + elements: Array<{ index: number; selector: string }>; +}; + +type RawBrowserPageElement = BrowserPageElement & { + selector: string; +}; + +type RawBrowserPageSnapshot = { + url: string; + title: string; + loading: boolean; + text: string; + elements: RawBrowserPageElement[]; +}; + +type ElementTarget = { + index?: number; + selector?: string; + snapshotId?: string; +}; + const EMPTY_STATE: BrowserState = { activeTabId: null, tabs: [], }; +/** Throws when `signal` is already aborted, reusing its reason when that reason is an Error. */ function abortIfNeeded(signal?: AbortSignal): void { + if (!signal?.aborted) return; + throw signal.reason instanceof Error ? signal.reason : new Error('Browser action aborted'); +} + +/** Resolves after `ms` milliseconds, or rejects as soon as `signal` aborts. */ async function sleep(ms: number, signal?: AbortSignal): Promise<void> { + abortIfNeeded(signal); // checked first so an aborted signal is honoured even for a non-positive delay + if (ms <= 0) return; + await new Promise<void>((resolve, reject) => { + const abortSignal = signal; + const timer = setTimeout(() => { + abortSignal?.removeEventListener('abort', onAbort); + resolve(); + }, ms); + + const onAbort = () => { + clearTimeout(timer); + abortSignal?.removeEventListener('abort', onAbort); + reject(abortSignal?.reason instanceof Error ? 
abortSignal.reason : new Error('Browser action aborted')); + }; + + abortSignal?.addEventListener('abort', onAbort, { once: true }); + }); +} + +function buildReadPageScript(maxElements: number, maxTextLength: number): string { + return `(() => { + ${DOM_HELPERS_SOURCE} + const candidates = Array.from(document.querySelectorAll(${JSON.stringify(INTERACTABLE_SELECTORS)})); + const elements = []; + const seenSelectors = new Set(); + + for (const candidate of candidates) { + if (!(candidate instanceof Element)) continue; + if (!isVisibleElement(candidate)) continue; + + const selector = buildUniqueSelector(candidate); + if (!selector || seenSelectors.has(selector)) continue; + seenSelectors.add(selector); + + elements.push({ + index: elements.length + 1, + selector, + tagName: candidate.tagName.toLowerCase(), + role: getElementRole(candidate), + type: getElementType(candidate), + label: getElementLabel(candidate), + text: truncateText(candidate.innerText || candidate.textContent || '', 120) || null, + placeholder: truncateText(candidate.getAttribute('placeholder') ?? '', 120) || null, + href: candidate instanceof HTMLAnchorElement ? candidate.href : candidate.getAttribute('href'), + disabled: isDisabledElement(candidate), + }); + + if (elements.length >= ${JSON.stringify(maxElements)}) break; + } + + return { + url: window.location.href, + title: document.title || '', + loading: document.readyState !== 'complete', + text: truncateText(document.body?.innerText || document.body?.textContent || '', ${JSON.stringify(maxTextLength)}), + elements, + }; + })()`; +} + +function buildClickScript(selector: string): string { + return `(() => { + ${DOM_HELPERS_SOURCE} + const element = document.querySelector(${JSON.stringify(selector)}); + if (!(element instanceof Element)) { + return { ok: false, error: 'Element not found.' }; + } + if (!isVisibleElement(element)) { + return { ok: false, error: 'Element is not visible.' 
}; + } + if (isDisabledElement(element)) { + return { ok: false, error: 'Element is disabled.' }; + } + + if (element instanceof HTMLElement) { + element.scrollIntoView({ block: 'center', inline: 'center' }); + element.focus({ preventScroll: true }); + element.click(); + } else { + element.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true, view: window })); + } + + return { ok: true, description: describeElement(element) }; + })()`; +} + +function buildTypeScript(selector: string, text: string): string { + return `(() => { + ${DOM_HELPERS_SOURCE} + const element = document.querySelector(${JSON.stringify(selector)}); + if (!(element instanceof Element)) { + return { ok: false, error: 'Element not found.' }; + } + if (!isVisibleElement(element)) { + return { ok: false, error: 'Element is not visible.' }; + } + if (isDisabledElement(element)) { + return { ok: false, error: 'Element is disabled.' }; + } + + const nextValue = ${JSON.stringify(text)}; + + const setNativeValue = (target, value) => { + const prototype = Object.getPrototypeOf(target); + const descriptor = Object.getOwnPropertyDescriptor(prototype, 'value'); + if (descriptor && typeof descriptor.set === 'function') { + descriptor.set.call(target, value); + } else { + target.value = value; + } + }; + + if (element instanceof HTMLInputElement || element instanceof HTMLTextAreaElement) { + if (element.readOnly) { + return { ok: false, error: 'Element is read-only.' 
}; + } + element.scrollIntoView({ block: 'center', inline: 'center' }); + element.focus({ preventScroll: true }); + setNativeValue(element, nextValue); + element.dispatchEvent(new InputEvent('input', { bubbles: true, data: nextValue, inputType: 'insertText' })); + element.dispatchEvent(new Event('change', { bubbles: true })); + return { ok: true, description: describeElement(element) }; + } + + if (element instanceof HTMLElement && element.isContentEditable) { + element.scrollIntoView({ block: 'center', inline: 'center' }); + element.focus({ preventScroll: true }); + element.textContent = nextValue; + element.dispatchEvent(new InputEvent('input', { bubbles: true, data: nextValue, inputType: 'insertText' })); + return { ok: true, description: describeElement(element) }; + } + + return { ok: false, error: 'Element does not accept text input.' }; + })()`; +} + +function buildFocusScript(selector: string): string { + return `(() => { + ${DOM_HELPERS_SOURCE} + const element = document.querySelector(${JSON.stringify(selector)}); + if (!(element instanceof Element)) { + return { ok: false, error: 'Element not found.' }; + } + if (!isVisibleElement(element)) { + return { ok: false, error: 'Element is not visible.' 
}; + } + if (element instanceof HTMLElement) { + element.scrollIntoView({ block: 'center', inline: 'center' }); + element.focus({ preventScroll: true }); + } + return { ok: true, description: describeElement(element) }; + })()`; +} + +function buildScrollScript(offset: number): string { + return `(() => { + window.scrollBy({ top: ${JSON.stringify(offset)}, left: 0, behavior: 'auto' }); + return { ok: true }; + })()`; +} + +/** Maps a caller-supplied key name onto an Electron accelerator key code; a literal space means Space, any other blank key falls back to Enter. */ function normalizeKeyCode(key: string): string { + const trimmed = key.trim(); + if (!trimmed) return key.includes(' ') ? 'Space' : 'Enter'; // trim() would otherwise turn a literal space into Enter + + const aliases: Record<string, string> = { + esc: 'Escape', + escape: 'Escape', + return: 'Enter', + enter: 'Enter', + tab: 'Tab', + space: 'Space', + spacebar: 'Space', + left: 'Left', + right: 'Right', + up: 'Up', + down: 'Down', + arrowleft: 'Left', + arrowright: 'Right', + arrowup: 'Up', + arrowdown: 'Down', + backspace: 'Backspace', + delete: 'Delete', + }; + + const alias = new Map(Object.entries(aliases)).get(trimmed.toLowerCase()); // Map lookup ignores inherited keys such as "constructor" + if (alias) return alias; + if (trimmed.length === 1) return trimmed.toUpperCase(); + return trimmed.charAt(0).toUpperCase() + trimmed.slice(1); +} + export class BrowserViewManager extends EventEmitter { private window: BrowserWindow | null = null; private browserSession: Session | null = null; @@ -63,6 +437,7 @@ export class BrowserViewManager extends EventEmitter { private attachedTabId: string | null = null; private visible = false; private bounds: BrowserBounds = { x: 0, y: 0, width: 0, height: 0 }; + private snapshotCache = new Map<string, CachedSnapshot>(); attach(window: BrowserWindow): void { this.window = window; @@ -74,6 +449,7 @@ export class BrowserViewManager extends EventEmitter { this.activeTabId = null; this.attachedTabId = null; this.visible = false; + this.snapshotCache.clear(); }); } @@ -98,6 +474,10 @@ export class BrowserViewManager extends EventEmitter { return this.getTab(this.activeTabId); } + private invalidateSnapshot(tabId: string): void { + this.snapshotCache.delete(tabId); + } + private normalizeUrl(rawUrl: 
string): string { let url = rawUrl.trim(); if (!/^[a-z][a-z0-9+.-]*:/i.test(url)) { @@ -128,11 +508,6 @@ export class BrowserViewManager extends EventEmitter { private wireEvents(tabId: string, view: WebContentsView): void { const wc = view.webContents; - const emit = () => this.emitState(); - - // Electron occasionally drops WebContentsView layout on navigation. - // Re-applying the cached bounds is cheap and keeps the active tab pinned - // to the renderer-computed viewport. const reapplyBounds = () => { if ( this.attachedTabId === tabId && @@ -144,15 +519,23 @@ export class BrowserViewManager extends EventEmitter { } }; - wc.on('did-start-navigation', reapplyBounds); - wc.on('did-navigate', () => { reapplyBounds(); emit(); }); - wc.on('did-navigate-in-page', () => { reapplyBounds(); emit(); }); - wc.on('did-start-loading', () => { reapplyBounds(); emit(); }); - wc.on('did-stop-loading', () => { reapplyBounds(); emit(); }); - wc.on('did-finish-load', () => { reapplyBounds(); emit(); }); + const invalidateAndEmit = () => { + this.invalidateSnapshot(tabId); + this.emitState(); + }; + + wc.on('did-start-navigation', () => { + this.invalidateSnapshot(tabId); + reapplyBounds(); + }); + wc.on('did-navigate', () => { reapplyBounds(); invalidateAndEmit(); }); + wc.on('did-navigate-in-page', () => { reapplyBounds(); invalidateAndEmit(); }); + wc.on('did-start-loading', () => { this.invalidateSnapshot(tabId); reapplyBounds(); this.emitState(); }); + wc.on('did-stop-loading', () => { reapplyBounds(); invalidateAndEmit(); }); + wc.on('did-finish-load', () => { reapplyBounds(); invalidateAndEmit(); }); wc.on('did-frame-finish-load', reapplyBounds); - wc.on('did-fail-load', () => { reapplyBounds(); emit(); }); - wc.on('page-title-updated', emit); + wc.on('did-fail-load', () => { reapplyBounds(); invalidateAndEmit(); }); + wc.on('page-title-updated', this.emitState.bind(this)); wc.setWindowOpenHandler(({ url }) => { if (this.isEmbeddedTabUrl(url)) { @@ -223,6 +606,7 @@ export 
class BrowserViewManager extends EventEmitter { this.tabs.set(tabId, tab); this.tabOrder.push(tabId); this.activeTabId = tabId; + this.invalidateSnapshot(tabId); this.syncAttachedView(); this.emitState(); @@ -244,12 +628,101 @@ export class BrowserViewManager extends EventEmitter { } private destroyTab(tab: BrowserTab): void { + this.invalidateSnapshot(tab.id); tab.view.webContents.removeAllListeners(); if (!tab.view.webContents.isDestroyed()) { tab.view.webContents.close(); } } + /** Waits until `wc` stops loading and stays idle for a short period; returns silently on timeout or once `wc` is destroyed. */ private async waitForWebContentsSettle( + wc: WebContents, + signal?: AbortSignal, + idleMs = POST_ACTION_IDLE_MS, + timeoutMs = NAVIGATION_TIMEOUT_MS, + ): Promise<void> { + const startedAt = Date.now(); + let sawLoading = !wc.isDestroyed() && wc.isLoading(); // never query a destroyed WebContents + + while (Date.now() - startedAt < timeoutMs) { + abortIfNeeded(signal); + if (wc.isDestroyed()) return; + + if (wc.isLoading()) { + sawLoading = true; + await sleep(100, signal); + continue; + } + + await sleep(sawLoading ? idleMs : Math.min(idleMs, 200), signal); + if (wc.isDestroyed() || !wc.isLoading()) return; // the tab may have been closed while sleeping + sawLoading = true; + } + } + + /** Runs `script` in the active tab (falling back to ensureInitialTab()) once the page has settled, and resolves with the script's result. */ private async executeOnActiveTab<T>(script: string, signal?: AbortSignal): Promise<T> { + abortIfNeeded(signal); + const activeTab = this.getActiveTab() ?? 
this.ensureInitialTab(); + await this.waitForWebContentsSettle(activeTab.view.webContents, signal); + abortIfNeeded(signal); + return activeTab.view.webContents.executeJavaScript(script, true) as Promise<T>; + } + + private cacheSnapshot(tabId: string, rawSnapshot: RawBrowserPageSnapshot, loading: boolean): BrowserPageSnapshot { + const snapshotId = randomUUID(); + const elements: BrowserPageElement[] = rawSnapshot.elements.map((element, index) => { + const { selector, ...rest } = element; + void selector; + return { + ...rest, + index: index + 1, + }; + }); + + this.snapshotCache.set(tabId, { + snapshotId, + elements: rawSnapshot.elements.map((element, index) => ({ + index: index + 1, + selector: element.selector, + })), + }); + + return { + snapshotId, + url: rawSnapshot.url, + title: rawSnapshot.title, + loading, + text: rawSnapshot.text, + elements, + }; + } + + private resolveElementSelector(tabId: string, target: ElementTarget): { ok: true; selector: string } | { ok: false; error: string } { + if (target.selector?.trim()) { + return { ok: true, selector: target.selector.trim() }; + } + + if (target.index == null) { + return { ok: false, error: 'Provide an element index or selector.' }; + } + + const cachedSnapshot = this.snapshotCache.get(tabId); + if (!cachedSnapshot) { + return { ok: false, error: 'No page snapshot is available yet. Call read-page first.' }; + } + + if (target.snapshotId && cachedSnapshot.snapshotId !== target.snapshotId) { + return { ok: false, error: 'The page changed since the last read-page call. Call read-page again.' 
}; + } + + const entry = cachedSnapshot.elements.find((element) => element.index === target.index); + if (!entry) { + return { ok: false, error: `No element found for index ${target.index}.` }; + } + + return { ok: true, selector: entry.selector }; + } + setVisible(visible: boolean): void { this.visible = visible; if (visible) { @@ -266,6 +739,11 @@ export class BrowserViewManager extends EventEmitter { } } + /** Waits for the active tab (or the tab returned by ensureInitialTab()) to finish loading and settle. */ async ensureActiveTabReady(signal?: AbortSignal): Promise<void> { + const activeTab = this.getActiveTab() ?? this.ensureInitialTab(); + await this.waitForWebContentsSettle(activeTab.view.webContents, signal); + } + async newTab(rawUrl?: string): Promise<{ ok: boolean; tabId?: string; error?: string }> { try { const tab = this.createTab(rawUrl?.trim() ? rawUrl : HOME_URL); @@ -313,6 +791,7 @@ export class BrowserViewManager extends EventEmitter { async navigate(rawUrl: string): Promise<{ ok: boolean; error?: string }> { try { const activeTab = this.getActiveTab() ?? this.ensureInitialTab(); + this.invalidateSnapshot(activeTab.id); await activeTab.view.webContents.loadURL(this.normalizeUrl(rawUrl)); return { ok: true }; } catch (err) { @@ -325,6 +804,7 @@ export class BrowserViewManager extends EventEmitter { if (!activeTab) return { ok: false }; const history = activeTab.view.webContents.navigationHistory; if (!history.canGoBack()) return { ok: false }; + this.invalidateSnapshot(activeTab.id); history.goBack(); return { ok: true }; } @@ -334,6 +814,7 @@ export class BrowserViewManager extends EventEmitter { if (!activeTab) return { ok: false }; const history = activeTab.view.webContents.navigationHistory; if (!history.canGoForward()) return { ok: false }; + this.invalidateSnapshot(activeTab.id); history.goForward(); return { ok: true }; } @@ -341,9 +822,184 @@ export class BrowserViewManager extends EventEmitter { reload(): void { const activeTab = this.getActiveTab(); if (!activeTab) return; + this.invalidateSnapshot(activeTab.id); 
activeTab.view.webContents.reload(); } + /** Reads the active tab into a snapshot and caches its element selectors for later index lookups; script failures are returned as `{ ok: false }`. */ async readPage( + options?: { maxElements?: number; maxTextLength?: number }, + signal?: AbortSignal, + ): Promise<{ ok: boolean; page?: BrowserPageSnapshot; error?: string }> { + try { + const activeTab = this.getActiveTab() ?? this.ensureInitialTab(); + const rawSnapshot = await this.executeOnActiveTab<RawBrowserPageSnapshot>( + buildReadPageScript( + options?.maxElements ?? DEFAULT_READ_MAX_ELEMENTS, + options?.maxTextLength ?? DEFAULT_READ_MAX_TEXT_LENGTH, + ), + signal, + ); + return { + ok: true, + page: this.cacheSnapshot(activeTab.id, rawSnapshot, activeTab.view.webContents.isLoading()), + }; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? error.message : 'Failed to read the current page.', + }; + } + } + + /** Reads a smaller post-action snapshot; resolves to null when the page could not be read. */ async readPageSummary(signal?: AbortSignal): Promise<BrowserPageSnapshot | null> { + const result = await this.readPage( + { + maxElements: POST_ACTION_MAX_ELEMENTS, + maxTextLength: POST_ACTION_MAX_TEXT_LENGTH, + }, + signal, + ); + return result.ok ? result.page ?? null : null; + } + + async click(target: ElementTarget, signal?: AbortSignal): Promise<{ ok: boolean; error?: string; description?: string }> { + const activeTab = this.getActiveTab(); + if (!activeTab) { + return { ok: false, error: 'No active browser tab is open.' }; + } + + const resolved = this.resolveElementSelector(activeTab.id, target); + if (!resolved.ok) return resolved; + + try { + const result = await this.executeOnActiveTab<{ ok: boolean; error?: string; description?: string }>( + buildClickScript(resolved.selector), + signal, + ); + if (!result.ok) return result; + this.invalidateSnapshot(activeTab.id); + await this.waitForWebContentsSettle(activeTab.view.webContents, signal); + return result; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? 
error.message : 'Failed to click the element.', + }; + } + } + + async type(target: ElementTarget, text: string, signal?: AbortSignal): Promise<{ ok: boolean; error?: string; description?: string }> { + const activeTab = this.getActiveTab(); + if (!activeTab) { + return { ok: false, error: 'No active browser tab is open.' }; + } + + const resolved = this.resolveElementSelector(activeTab.id, target); + if (!resolved.ok) return resolved; + + try { + const result = await this.executeOnActiveTab<{ ok: boolean; error?: string; description?: string }>( + buildTypeScript(resolved.selector, text), + signal, + ); + if (!result.ok) return result; + this.invalidateSnapshot(activeTab.id); + await this.waitForWebContentsSettle(activeTab.view.webContents, signal); + return result; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? error.message : 'Failed to type into the element.', + }; + } + } + + async press( + key: string, + target?: ElementTarget, + signal?: AbortSignal, + ): Promise<{ ok: boolean; error?: string; description?: string }> { + const activeTab = this.getActiveTab(); + if (!activeTab) { + return { ok: false, error: 'No active browser tab is open.' }; + } + + let description = 'active element'; + + if (target?.index != null || target?.selector?.trim()) { + const resolved = this.resolveElementSelector(activeTab.id, target); + if (!resolved.ok) return resolved; + + try { + const focusResult = await this.executeOnActiveTab<{ ok: boolean; error?: string; description?: string }>( + buildFocusScript(resolved.selector), + signal, + ); + if (!focusResult.ok) return focusResult; + description = focusResult.description ?? description; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? 
error.message : 'Failed to focus the element before pressing a key.', + }; + } + } + + try { + const wc = activeTab.view.webContents; + const keyCode = normalizeKeyCode(key); + wc.sendInputEvent({ type: 'keyDown', keyCode }); + if (keyCode.length === 1) { + wc.sendInputEvent({ type: 'char', keyCode }); + } + wc.sendInputEvent({ type: 'keyUp', keyCode }); + + this.invalidateSnapshot(activeTab.id); + await this.waitForWebContentsSettle(wc, signal); + + return { + ok: true, + description: `${keyCode} on ${description}`, + }; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? error.message : 'Failed to press the requested key.', + }; + } + } + + async scroll(direction: 'up' | 'down' = 'down', amount = 700, signal?: AbortSignal): Promise<{ ok: boolean; error?: string }> { + const activeTab = this.getActiveTab(); + if (!activeTab) { + return { ok: false, error: 'No active browser tab is open.' }; + } + + try { + const offset = Math.max(1, amount) * (direction === 'up' ? -1 : 1); + const result = await this.executeOnActiveTab<{ ok: boolean; error?: string }>( + buildScrollScript(offset), + signal, + ); + if (!result.ok) return result; + this.invalidateSnapshot(activeTab.id); + await sleep(250, signal); + return result; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? 
error.message : 'Failed to scroll the page.', + }; + } + } + + async wait(ms = 1000, signal?: AbortSignal): Promise { + await sleep(ms, signal); + const activeTab = this.getActiveTab(); + if (!activeTab) return; + await this.waitForWebContentsSettle(activeTab.view.webContents, signal); + } + getState(): BrowserState { return this.snapshotState(); } diff --git a/apps/x/apps/main/src/main.ts b/apps/x/apps/main/src/main.ts index e6e5c016..a690d207 100644 --- a/apps/x/apps/main/src/main.ts +++ b/apps/x/apps/main/src/main.ts @@ -31,8 +31,10 @@ import started from "electron-squirrel-startup"; import { execSync, exec, execFileSync } from "node:child_process"; import { promisify } from "node:util"; import { init as initChromeSync } from "@x/core/dist/knowledge/chrome-extension/server/server.js"; +import { registerContainerValues } from "@x/core/dist/di/container.js"; import { browserViewManager } from "./browser/view.js"; import { setupBrowserEventForwarding } from "./browser/ipc.js"; +import { ElectronBrowserControlService } from "./browser/control-service.js"; const execAsync = promisify(exec); @@ -221,6 +223,10 @@ app.whenReady().then(async () => { // Initialize all config files before UI can access them await initConfigs(); + registerContainerValues({ + browserControlService: new ElectronBrowserControlService(), + }); + setupIpcHandlers(); setupBrowserEventForwarding(); diff --git a/apps/x/apps/renderer/src/App.tsx b/apps/x/apps/renderer/src/App.tsx index 3a4cb5ac..20bfff1e 100644 --- a/apps/x/apps/renderer/src/App.tsx +++ b/apps/x/apps/renderer/src/App.tsx @@ -780,6 +780,19 @@ function App() { return cleanup }, [refreshVoiceAvailability]) + useEffect(() => { + const cleanup = window.ipc.on('browser:didRequestPaneState', (event) => { + if (event.open) { + setIsBrowserOpen(true) + setIsChatSidebarOpen(true) + setIsRightPaneMaximized(false) + return + } + setIsBrowserOpen(false) + }) + return cleanup + }, []) + const handleStartRecording = useCallback(() => { 
/** Labels shown while a browser-control action has not completed, keyed by action name. */
const BROWSER_PENDING_LABELS: Record<string, string> = {
  open: 'Opening browser...',
  'get-state': 'Reading browser state...',
  'new-tab': 'Opening new browser tab...',
  'switch-tab': 'Switching browser tab...',
  'close-tab': 'Closing browser tab...',
  navigate: 'Navigating browser...',
  back: 'Going back...',
  forward: 'Going forward...',
  reload: 'Reloading page...',
  'read-page': 'Reading page...',
  click: 'Clicking page element...',
  type: 'Typing into page...',
  press: 'Sending key press...',
  scroll: 'Scrolling page...',
  wait: 'Waiting for page...',
}

/**
 * Build the display label for a `browser-control` tool call.
 *
 * - Returns `null` for any other tool.
 * - While the call is not `completed`, returns a pending label: a specific
 *   one for `click`/`type` with a numeric `index` or `navigate` with a string
 *   `target`, otherwise the entry from `BROWSER_PENDING_LABELS` (or a generic
 *   fallback).
 * - Once completed, returns the error text when `result.success === false`,
 *   else the non-blank `result.message`, else `'Controlled browser'`.
 */
export const getBrowserControlLabel = (tool: ToolCall): string | null => {
  if (tool.name !== 'browser-control') return null

  const input = normalizeToolInput(tool.input) as Record<string, unknown> | undefined
  const result = tool.result as Record<string, unknown> | undefined
  // The action is taken from the input first, then from the result.
  const action = (input?.action as string | undefined) || (result?.action as string | undefined) || 'browser'

  if (tool.status !== 'completed') {
    if (action === 'click' && typeof input?.index === 'number') {
      return `Clicking element ${input.index}...`
    }
    if (action === 'type' && typeof input?.index === 'number') {
      return `Typing into element ${input.index}...`
    }
    if (action === 'navigate' && typeof input?.target === 'string') {
      return `Navigating to ${input.target}...`
    }
    return BROWSER_PENDING_LABELS[action] || 'Controlling browser...'
  }

  if (result?.success === false) {
    return typeof result.error === 'string' ? `Browser error: ${result.error}` : 'Browser action failed'
  }

  if (typeof result?.message === 'string' && result.message.trim()) {
    return result.message
  }

  return 'Controlled browser'
}
**Tracks (Auto-Updating Note Blocks):** When users ask you to **track**, **monitor**, **watch**, or **keep an eye on** something in a note — or say things like "every morning tell me X", "show the current Y in this note", "pin live updates of Z here" — load the \`tracks\` skill first. Also load it when a user presses Cmd+K with a note open and requests auto-refreshing content at the cursor. Track blocks are YAML-fenced scheduled blocks whose output is rewritten on each run — useful for weather, news, prices, status pages, and personal dashboards. +**Browser Control:** When users ask you to open a website, browse in-app, search the web in the embedded browser, or interact with a live webpage inside Rowboat, load the \`browser-control\` skill first. It explains the \`read-page -> indexed action -> refreshed page\` workflow for the browser pane. ## Learning About the User (save-to-memory) @@ -243,6 +244,7 @@ ${runtimeContextPrompt} - \`slack-checkConnection\`, \`slack-listAvailableTools\`, \`slack-executeAction\` - Slack integration (requires Slack to be connected via Composio). Use \`slack-listAvailableTools\` first to discover available tool slugs, then \`slack-executeAction\` to execute them. - \`web-search\` - Search the web. Returns rich results with full text, highlights, and metadata. The \`category\` parameter defaults to \`general\` (full web search) — only use a specific category like \`news\`, \`company\`, \`research paper\` etc. when the query is clearly about that type. For everyday queries (weather, restaurants, prices, how-to), use \`general\`. - \`app-navigation\` - Control the app UI: open notes, switch views, filter/search the knowledge base, manage saved views. **Load the \`app-navigation\` skill before using this tool.** +- \`browser-control\` - Control the embedded browser pane: open sites, inspect the live page, switch tabs, and interact with indexed page elements. 
**Load the \`browser-control\` skill before using this tool.** - \`save-to-memory\` - Save observations about the user to the agent memory system. Use this proactively during conversations. - \`composio-list-toolkits\`, \`composio-search-tools\`, \`composio-execute-tool\`, \`composio-connect-toolkit\` — Composio integration tools. Load the \`composio-integration\` skill for usage guidance. diff --git a/apps/x/packages/core/src/application/assistant/skills/browser-control/skill.ts b/apps/x/packages/core/src/application/assistant/skills/browser-control/skill.ts new file mode 100644 index 00000000..0a378b4d --- /dev/null +++ b/apps/x/packages/core/src/application/assistant/skills/browser-control/skill.ts @@ -0,0 +1,104 @@ +export const skill = String.raw` +# Browser Control Skill + +You have access to the **browser-control** tool, which controls Rowboat's embedded browser pane directly. + +Use this skill when the user asks you to open a website, browse in-app, search the web in the browser pane, click something on a page, fill a form, or otherwise interact with a live webpage inside Rowboat. + +## Core Workflow + +1. Start with ` + "`browser-control({ action: \"open\" })`" + ` if the browser pane may not already be open. +2. Use ` + "`browser-control({ action: \"read-page\" })`" + ` to inspect the current page. +3. The tool returns: + - ` + "`snapshotId`" + ` + - page ` + "`url`" + ` and ` + "`title`" + ` + - visible page text + - interactable elements with numbered ` + "`index`" + ` values +4. Prefer acting on those numbered indices with ` + "`click`" + ` / ` + "`type`" + ` / ` + "`press`" + `. +5. After each action, read the returned page snapshot before deciding the next step. + +## Actions + +### open +Open the browser pane and ensure an active tab exists. + +### get-state +Return the current browser tabs and active tab id. + +### new-tab +Open a new browser tab. 
+ +Parameters: +- ` + "`target`" + ` (optional): URL or plain-language search query + +### switch-tab +Switch to a tab by ` + "`tabId`" + `. + +### close-tab +Close a tab by ` + "`tabId`" + `. + +### navigate +Navigate the active tab. + +Parameters: +- ` + "`target`" + `: URL or plain-language search query + +Plain-language targets are converted into a search automatically. + +### back / forward / reload +Standard browser navigation controls. + +### read-page +Read the current page and return a compact snapshot. + +Parameters: +- ` + "`maxElements`" + ` (optional) +- ` + "`maxTextLength`" + ` (optional) + +### click +Click an element. + +Prefer: +- ` + "`index`" + `: element index from ` + "`read-page`" + ` + +Optional: +- ` + "`snapshotId`" + `: include it when acting on a recent snapshot +- ` + "`selector`" + `: fallback only when no usable index exists + +### type +Type into an input, textarea, or contenteditable element. + +Parameters: +- ` + "`text`" + `: text to enter +- plus the same target fields as ` + "`click`" + ` + +### press +Send a key press such as ` + "`Enter`" + `, ` + "`Tab`" + `, ` + "`Escape`" + `, or arrow keys. + +Parameters: +- ` + "`key`" + ` +- optional target fields if you need to focus a specific element first + +### scroll +Scroll the current page. + +Parameters: +- ` + "`direction`" + `: ` + "`\"up\"`" + ` or ` + "`\"down\"`" + ` (optional; defaults down) +- ` + "`amount`" + `: pixel distance (optional) + +### wait +Wait for the page to settle, useful after async UI changes. + +Parameters: +- ` + "`ms`" + `: milliseconds to wait (optional) + +## Important Rules + +- Prefer ` + "`read-page`" + ` before interacting. +- Prefer element ` + "`index`" + ` over CSS selectors. +- If the tool says the snapshot is stale, call ` + "`read-page`" + ` again. +- After navigation, clicking, typing, pressing, or scrolling, use the returned page snapshot instead of assuming the page state. +- Use Rowboat's browser for live interaction. 
Use web search tools for research where a live session is unnecessary. +`; + +export default skill; diff --git a/apps/x/packages/core/src/application/assistant/skills/index.ts b/apps/x/packages/core/src/application/assistant/skills/index.ts index 18f29b62..d22db680 100644 --- a/apps/x/packages/core/src/application/assistant/skills/index.ts +++ b/apps/x/packages/core/src/application/assistant/skills/index.ts @@ -11,6 +11,7 @@ import backgroundAgentsSkill from "./background-agents/skill.js"; import createPresentationsSkill from "./create-presentations/skill.js"; import appNavigationSkill from "./app-navigation/skill.js"; +import browserControlSkill from "./browser-control/skill.js"; import composioIntegrationSkill from "./composio-integration/skill.js"; import tracksSkill from "./tracks/skill.js"; @@ -105,6 +106,12 @@ const definitions: SkillDefinition[] = [ summary: "Create and manage track blocks — YAML-scheduled auto-updating content blocks in notes (weather, news, prices, status, dashboards). 
/**
 * Contract for the service that executes `browser-control` tool requests.
 */
export interface IBrowserControlService {
  /**
   * Execute one browser-control action.
   *
   * @param input Validated browser-control input describing the action.
   * @param ctx Optional execution context; `signal` lets the caller abort.
   * @returns The result of the action.
   */
  execute(
    input: BrowserControlInput,
    ctx?: { signal?: AbortSignal },
  ): Promise<BrowserControlResult>;
}
updateTrackBlock } from "../../knowledge/track/fileops.js"; +import type { IBrowserControlService } from "../browser-control/service.js"; // Parser libraries are loaded dynamically inside parseFile.execute() // to avoid pulling pdfjs-dist's DOM polyfills into the main bundle. // Import paths are computed so esbuild cannot statically resolve them. @@ -562,7 +564,7 @@ export const BuiltinTools: z.infer = { count: matches.length, tool: 'ripgrep', }; - } catch (rgError) { + } catch { // Fallback to basic grep if ripgrep not available or failed const grepArgs = [ '-rn', @@ -997,6 +999,39 @@ export const BuiltinTools: z.infer = { }, }, + // ============================================================================ + // Browser Control + // ============================================================================ + + 'browser-control': { + description: 'Control the embedded browser pane. Read the current page, inspect indexed interactable elements, and navigate/click/type/press keys in the active browser tab.', + inputSchema: BrowserControlInputSchema, + isAvailable: async () => { + try { + container.resolve('browserControlService'); + return true; + } catch { + return false; + } + }, + execute: async (input: BrowserControlInput, ctx?: ToolContext) => { + try { + const browserControlService = container.resolve('browserControlService'); + return await browserControlService.execute(input, { signal: ctx?.signal }); + } catch (error) { + return { + success: false, + action: input.action, + error: error instanceof Error ? 
import { z } from 'zod';

/** State of a single browser tab. */
export const BrowserTabStateSchema = z.object({
  id: z.string(),
  url: z.string(),
  title: z.string(),
  canGoBack: z.boolean(),
  canGoForward: z.boolean(),
  loading: z.boolean(),
});

/** Overall browser state: the active tab id (if any) and all tabs. */
export const BrowserStateSchema = z.object({
  activeTabId: z.string().nullable(),
  tabs: z.array(BrowserTabStateSchema),
});

/** One interactable element in a page snapshot, addressed by a positive `index`. */
export const BrowserPageElementSchema = z.object({
  index: z.number().int().positive(),
  tagName: z.string(),
  role: z.string().nullable(),
  type: z.string().nullable(),
  label: z.string().nullable(),
  text: z.string().nullable(),
  placeholder: z.string().nullable(),
  href: z.string().nullable(),
  disabled: z.boolean(),
});

/** Snapshot of a page: identity, loading flag, text and indexed elements. */
export const BrowserPageSnapshotSchema = z.object({
  snapshotId: z.string(),
  url: z.string(),
  title: z.string(),
  loading: z.boolean(),
  text: z.string(),
  elements: z.array(BrowserPageElementSchema),
});

/** Every action accepted by the browser-control tool. */
export const BrowserControlActionSchema = z.enum([
  'open',
  'get-state',
  'new-tab',
  'switch-tab',
  'close-tab',
  'navigate',
  'back',
  'forward',
  'reload',
  'read-page',
  'click',
  'type',
  'press',
  'scroll',
  'wait',
]);

// Fields that identify a page element; spread into the input schema below.
const BrowserElementTargetFields = {
  index: z.number().int().positive().optional(),
  selector: z.string().min(1).optional(),
  snapshotId: z.string().optional(),
} as const;

/**
 * Input for the browser-control tool.
 *
 * The object is strict (unknown keys are rejected). Per-action requirements
 * are enforced in `superRefine`:
 * - `switch-tab` / `close-tab` need `tabId`
 * - `navigate` needs `target`
 * - `type` needs `text` (an empty string is accepted)
 * - `press` needs `key`
 * - `click` / `type` need an element `index` or `selector`
 */
export const BrowserControlInputSchema = z.object({
  action: BrowserControlActionSchema,
  target: z.string().min(1).optional(),
  tabId: z.string().min(1).optional(),
  text: z.string().optional(),
  key: z.string().min(1).optional(),
  direction: z.enum(['up', 'down']).optional(),
  amount: z.number().int().positive().max(5000).optional(),
  ms: z.number().int().positive().max(30000).optional(),
  maxElements: z.number().int().positive().max(100).optional(),
  maxTextLength: z.number().int().positive().max(20000).optional(),
  ...BrowserElementTargetFields,
}).strict().superRefine((value, ctx) => {
  const needsElementTarget = value.action === 'click' || value.action === 'type';
  const hasElementTarget = value.index !== undefined || value.selector !== undefined;

  if ((value.action === 'switch-tab' || value.action === 'close-tab') && !value.tabId) {
    ctx.addIssue({
      code: z.ZodIssueCode.custom,
      path: ['tabId'],
      message: 'tabId is required for this action.',
    });
  }

  if ((value.action === 'navigate') && !value.target) {
    ctx.addIssue({
      code: z.ZodIssueCode.custom,
      path: ['target'],
      message: 'target is required for navigate.',
    });
  }

  // `text` is compared against undefined so that typing an empty string stays valid.
  if (value.action === 'type' && value.text === undefined) {
    ctx.addIssue({
      code: z.ZodIssueCode.custom,
      path: ['text'],
      message: 'text is required for type.',
    });
  }

  if (value.action === 'press' && !value.key) {
    ctx.addIssue({
      code: z.ZodIssueCode.custom,
      path: ['key'],
      message: 'key is required for press.',
    });
  }

  if (needsElementTarget && !hasElementTarget) {
    ctx.addIssue({
      code: z.ZodIssueCode.custom,
      path: ['index'],
      message: 'Provide an element index or selector.',
    });
  }
});

/** Result returned by the browser-control tool for any action. */
export const BrowserControlResultSchema = z.object({
  success: z.boolean(),
  action: BrowserControlActionSchema,
  message: z.string().optional(),
  error: z.string().optional(),
  browser: BrowserStateSchema,
  page: BrowserPageSnapshotSchema.optional(),
});

// Static types derived from the schemas above.
export type BrowserTabState = z.infer<typeof BrowserTabStateSchema>;
export type BrowserState = z.infer<typeof BrowserStateSchema>;
export type BrowserPageElement = z.infer<typeof BrowserPageElementSchema>;
export type BrowserPageSnapshot = z.infer<typeof BrowserPageSnapshotSchema>;
export type BrowserControlAction = z.infer<typeof BrowserControlActionSchema>;
export type BrowserControlInput = z.infer<typeof BrowserControlInputSchema>;
export type BrowserControlResult = z.infer<typeof BrowserControlResultSchema>;
import { ZListToolkitsResponse } from './composio.js'; +import { BrowserStateSchema } from './browser-control.js'; // ============================================================================ // Runtime Validation Schemas (Single Source of Truth) @@ -701,29 +702,15 @@ const ipcSchemas = { }, 'browser:getState': { req: z.null(), - res: z.object({ - activeTabId: z.string().nullable(), - tabs: z.array(z.object({ - id: z.string(), - url: z.string(), - title: z.string(), - canGoBack: z.boolean(), - canGoForward: z.boolean(), - loading: z.boolean(), - })), - }), + res: BrowserStateSchema, }, 'browser:didUpdateState': { + req: BrowserStateSchema, + res: z.null(), + }, + 'browser:didRequestPaneState': { req: z.object({ - activeTabId: z.string().nullable(), - tabs: z.array(z.object({ - id: z.string(), - url: z.string(), - title: z.string(), - canGoBack: z.boolean(), - canGoForward: z.boolean(), - loading: z.boolean(), - })), + open: z.boolean(), }), res: z.null(), },