browser control

This commit is contained in:
Arjun 2026-04-11 23:36:22 +05:30
parent 25f28564d8
commit 884d8601cd
14 changed files with 1355 additions and 51 deletions

View file

@ -0,0 +1,292 @@
import { BrowserWindow } from 'electron';
import type { IBrowserControlService } from '@x/core/dist/application/browser-control/service.js';
import type { BrowserControlAction, BrowserControlInput, BrowserControlResult } from '@x/shared/dist/browser-control.js';
import { browserViewManager } from './view.js';
/** Prefix of the search URL used when the input does not look like a URL or host. */
const SEARCH_ENGINE_BASE_URL = 'https://www.google.com/search?q=';

/**
 * Turns free-form user/agent input into something the embedded browser can load.
 *
 * Resolution order:
 *  1. Empty input and blocked schemes (`javascript`, `file`, `chrome`, `chrome-extension`) throw.
 *  2. `host:port` input gets an explicit scheme so the port is not mistaken for a URL scheme.
 *  3. Input that already carries a scheme is returned untouched.
 *  4. Input that looks like a bare host or IPv4 address (and has no whitespace) is returned untouched.
 *  5. Anything else becomes a search-engine query URL.
 *
 * @param target Raw navigation target (URL, host, or search terms).
 * @returns The trimmed URL/host, or a search URL for free text.
 * @throws Error when the target is empty or uses a blocked scheme.
 */
function normalizeNavigationTarget(target: string): string {
  const trimmed = target.trim();
  if (!trimmed) {
    throw new Error('Navigation target cannot be empty.');
  }

  // Match on the scheme name alone. Requiring "//" after it would let forms such as
  // "file:/etc/passwd" or "chrome:settings" slip past the blocklist.
  if (/^(?:javascript|file|chrome|chrome-extension):/i.test(trimmed)) {
    throw new Error('That URL scheme is not allowed in the embedded browser.');
  }

  // "localhost:3000" or "example.com:8080/path" would otherwise satisfy the generic scheme
  // test below ("localhost:" parses as a scheme) and be handed to the browser unloadable.
  // Only letter-initial hosts are affected; IPv4 literals are handled by the host check further down.
  const hostWithPort = /^(localhost|[a-z][\w-]*(?:\.[\w-]+)+):\d{1,5}(?:[/?#]\S*)?$/i.exec(trimmed);
  if (hostWithPort) {
    // NOTE(review): http for localhost, https elsewhere is a default chosen here — confirm it
    // matches what the view manager uses for scheme-less hosts.
    const scheme = hostWithPort[1]?.toLowerCase() === 'localhost' ? 'http' : 'https';
    return `${scheme}://${trimmed}`;
  }

  if (/^[a-z][a-z0-9+.-]*:/i.test(trimmed)) {
    return trimmed;
  }

  const looksLikeHost =
    trimmed.startsWith('localhost')
    || /^[\w.-]+\.[a-z]{2,}/i.test(trimmed)
    || /^\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?(?:\/.*)?$/.test(trimmed);
  if (looksLikeHost && !/\s/.test(trimmed)) {
    return trimmed;
  }

  return `${SEARCH_ENGINE_BASE_URL}${encodeURIComponent(trimmed)}`;
}
/**
 * Broadcasts the requested browser-pane state to every live application window.
 *
 * @param open Whether the renderer should show (`true`) or hide (`false`) the pane.
 */
function emitPaneState(open: boolean): void {
  for (const candidate of BrowserWindow.getAllWindows()) {
    // Skip windows that are already torn down or have no renderer to talk to.
    if (candidate.isDestroyed() || !candidate.webContents) {
      continue;
    }
    candidate.webContents.send('browser:didRequestPaneState', { open });
  }
}
/**
 * Assembles a successful control result, stamped with the current browser state.
 *
 * @param action The action that completed.
 * @param message Human-readable summary of what happened.
 * @param page Optional page snapshot; the `page` key is omitted entirely when not supplied.
 * @returns The success result.
 */
function buildSuccessResult(
  action: BrowserControlAction,
  message: string,
  page?: BrowserControlResult['page'],
): BrowserControlResult {
  const browser = browserViewManager.getState();
  if (!page) {
    return { success: true, action, message, browser };
  }
  return { success: true, action, message, browser, page };
}
/**
 * Assembles a failed control result, stamped with the current browser state.
 *
 * @param action The action that failed.
 * @param error Human-readable failure description.
 * @returns The failure result.
 */
function buildErrorResult(action: BrowserControlAction, error: string): BrowserControlResult {
  const browser = browserViewManager.getState();
  return { success: false, action, error, browser };
}
/**
 * Electron-backed implementation of the browser control service.
 *
 * Every action is translated into calls on the shared `browserViewManager`, and the outcome is
 * reported as a `BrowserControlResult`; `execute` converts thrown errors into failure results
 * instead of rejecting.
 */
export class ElectronBrowserControlService implements IBrowserControlService {
  /** Asks all renderer windows to open the browser pane and makes the embedded view visible. */
  private ensurePaneOpen(): void {
    emitPaneState(true);
    browserViewManager.setVisible(true);
  }

  /**
   * Finishes a successful action: optionally waits for the active tab to settle, reads the
   * compact page summary, and wraps both in a success result.
   *
   * @param action The action being reported.
   * @param message Human-readable summary for the result.
   * @param signal Optional abort signal forwarded to the view manager.
   * @param options.settle When true, waits for the active tab to finish loading first
   *   (used by actions that navigate or change tabs).
   */
  private async succeedWithPage(
    action: BrowserControlAction,
    message: string,
    signal: AbortSignal | undefined,
    options: { settle: boolean },
  ): Promise<BrowserControlResult> {
    if (options.settle) {
      await browserViewManager.ensureActiveTabReady(signal);
    }
    const page = (await browserViewManager.readPageSummary(signal)) ?? undefined;
    return buildSuccessResult(action, message, page);
  }

  /**
   * Runs one browser control action.
   *
   * @param input The action plus its action-specific arguments.
   * @param ctx Optional execution context; `ctx.signal` aborts in-flight waits.
   * @returns A success or failure result. Errors thrown while handling the action are caught
   *   and returned as failure results.
   */
  async execute(
    input: BrowserControlInput,
    ctx?: { signal?: AbortSignal },
  ): Promise<BrowserControlResult> {
    const signal = ctx?.signal;
    // Kept un-narrowed so it can be reported from the default branch and the catch block.
    const action: BrowserControlAction = input.action;

    try {
      // Inside the try so a failure to open the pane is reported as an error result too.
      this.ensurePaneOpen();

      // `return await` keeps rejections inside this try/catch.
      switch (input.action) {
        case 'open':
          return await this.succeedWithPage('open', 'Opened the browser pane.', signal, { settle: true });

        case 'get-state':
          return buildSuccessResult('get-state', 'Read the current browser state.');

        case 'new-tab': {
          const target = input.target ? normalizeNavigationTarget(input.target) : undefined;
          const result = await browserViewManager.newTab(target);
          if (!result.ok) {
            return buildErrorResult('new-tab', result.error ?? 'Failed to open a new tab.');
          }
          return await this.succeedWithPage(
            'new-tab',
            target ? `Opened a new tab for ${target}.` : 'Opened a new tab.',
            signal,
            { settle: true },
          );
        }

        case 'switch-tab': {
          const tabId = input.tabId;
          if (!tabId) {
            return buildErrorResult('switch-tab', 'tabId is required for switch-tab.');
          }
          const result = browserViewManager.switchTab(tabId);
          if (!result.ok) {
            return buildErrorResult('switch-tab', `No browser tab exists with id ${tabId}.`);
          }
          return await this.succeedWithPage('switch-tab', `Switched to tab ${tabId}.`, signal, { settle: true });
        }

        case 'close-tab': {
          const tabId = input.tabId;
          if (!tabId) {
            return buildErrorResult('close-tab', 'tabId is required for close-tab.');
          }
          const result = browserViewManager.closeTab(tabId);
          if (!result.ok) {
            return buildErrorResult('close-tab', `Could not close tab ${tabId}.`);
          }
          return await this.succeedWithPage('close-tab', `Closed tab ${tabId}.`, signal, { settle: true });
        }

        case 'navigate': {
          const rawTarget = input.target;
          if (!rawTarget) {
            return buildErrorResult('navigate', 'target is required for navigate.');
          }
          const target = normalizeNavigationTarget(rawTarget);
          const result = await browserViewManager.navigate(target);
          if (!result.ok) {
            return buildErrorResult('navigate', result.error ?? `Failed to navigate to ${target}.`);
          }
          return await this.succeedWithPage('navigate', `Navigated to ${target}.`, signal, { settle: true });
        }

        case 'back': {
          const result = browserViewManager.back();
          if (!result.ok) {
            return buildErrorResult('back', 'The active tab cannot go back.');
          }
          return await this.succeedWithPage('back', 'Went back in the active tab.', signal, { settle: true });
        }

        case 'forward': {
          const result = browserViewManager.forward();
          if (!result.ok) {
            return buildErrorResult('forward', 'The active tab cannot go forward.');
          }
          return await this.succeedWithPage('forward', 'Went forward in the active tab.', signal, { settle: true });
        }

        case 'reload': {
          browserViewManager.reload();
          return await this.succeedWithPage('reload', 'Reloaded the active tab.', signal, { settle: true });
        }

        case 'read-page': {
          const result = await browserViewManager.readPage(
            {
              maxElements: input.maxElements,
              maxTextLength: input.maxTextLength,
            },
            signal,
          );
          if (!result.ok || !result.page) {
            return buildErrorResult('read-page', result.error ?? 'Failed to read the current page.');
          }
          return buildSuccessResult('read-page', 'Read the current page.', result.page);
        }

        case 'click': {
          const result = await browserViewManager.click(
            {
              index: input.index,
              selector: input.selector,
              snapshotId: input.snapshotId,
            },
            signal,
          );
          if (!result.ok) {
            return buildErrorResult('click', result.error ?? 'Failed to click the requested element.');
          }
          return await this.succeedWithPage(
            'click',
            result.description ? `Clicked ${result.description}.` : 'Clicked the requested element.',
            signal,
            { settle: false },
          );
        }

        case 'type': {
          const text = input.text;
          // An empty string is a valid value to type; only a missing value is rejected.
          if (text === undefined) {
            return buildErrorResult('type', 'text is required for type.');
          }
          const result = await browserViewManager.type(
            {
              index: input.index,
              selector: input.selector,
              snapshotId: input.snapshotId,
            },
            text,
            signal,
          );
          if (!result.ok) {
            return buildErrorResult('type', result.error ?? 'Failed to type into the requested element.');
          }
          return await this.succeedWithPage(
            'type',
            result.description ? `Typed into ${result.description}.` : 'Typed into the requested element.',
            signal,
            { settle: false },
          );
        }

        case 'press': {
          const key = input.key;
          if (!key) {
            return buildErrorResult('press', 'key is required for press.');
          }
          const result = await browserViewManager.press(
            key,
            {
              index: input.index,
              selector: input.selector,
              snapshotId: input.snapshotId,
            },
            signal,
          );
          if (!result.ok) {
            return buildErrorResult('press', result.error ?? `Failed to press ${key}.`);
          }
          return await this.succeedWithPage(
            'press',
            result.description ? `Pressed ${result.description}.` : `Pressed ${key}.`,
            signal,
            { settle: false },
          );
        }

        case 'scroll': {
          const result = await browserViewManager.scroll(
            input.direction ?? 'down',
            input.amount ?? 700,
            signal,
          );
          if (!result.ok) {
            return buildErrorResult('scroll', result.error ?? 'Failed to scroll the page.');
          }
          return await this.succeedWithPage('scroll', `Scrolled ${input.direction ?? 'down'}.`, signal, {
            settle: false,
          });
        }

        case 'wait': {
          const duration = input.ms ?? 1000;
          await browserViewManager.wait(duration, signal);
          return await this.succeedWithPage('wait', `Waited ${duration}ms for the page to settle.`, signal, {
            settle: false,
          });
        }

        default:
          // Input arrives from outside the type system; without this branch an unknown action
          // would resolve to `undefined` despite the declared return type.
          return buildErrorResult(action, `Unsupported browser action: ${String(action)}.`);
      }
    } catch (error) {
      return buildErrorResult(
        action,
        error instanceof Error ? error.message : 'Browser control failed unexpectedly.',
      );
    }
  }
}

View file

@ -1,6 +1,14 @@
import { randomUUID } from 'node:crypto';
import { EventEmitter } from 'node:events';
import { BrowserWindow, WebContentsView, session, shell, type Session } from 'electron';
import { BrowserWindow, WebContentsView, session, shell, type Session, type WebContents } from 'electron';
import type {
BrowserPageElement,
BrowserPageSnapshot,
BrowserState,
BrowserTabState,
} from '@x/shared/dist/browser-control.js';
export type { BrowserPageSnapshot, BrowserState, BrowserTabState };
/**
* Embedded browser pane implementation.
@ -22,6 +30,166 @@ const SPOOF_UA =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36';
// URL loaded by newTab() when the caller does not supply one.
const HOME_URL = 'https://www.google.com';
// Default upper bound (ms) on how long waitForWebContentsSettle() polls before giving up.
const NAVIGATION_TIMEOUT_MS = 10000;
// Default quiet period (ms) a tab must stay not-loading before it counts as settled.
const POST_ACTION_IDLE_MS = 400;
// Element and text limits for the compact snapshot returned by readPageSummary().
const POST_ACTION_MAX_ELEMENTS = 25;
const POST_ACTION_MAX_TEXT_LENGTH = 4000;
// Element and text limits readPage() falls back to when the caller gives none.
const DEFAULT_READ_MAX_ELEMENTS = 50;
const DEFAULT_READ_MAX_TEXT_LENGTH = 8000;
/**
 * Comma-separated CSS selector list matching the elements the page reader treats as
 * interactable: native controls, common ARIA widget roles, editable regions, and anything
 * reachable by keyboard focus.
 */
const INTERACTABLE_SELECTORS = [
  'a[href]',
  ...['button', 'input', 'textarea', 'select', 'summary'],
  ...['button', 'link', 'tab', 'menuitem', 'option'].map((role) => `[role="${role}"]`),
  '[contenteditable="true"]',
  '[tabindex]:not([tabindex="-1"])',
].join(', ');
/**
 * JavaScript source (not TypeScript) that is prepended to every script executed inside a tab.
 *
 * It declares page-side helpers: `truncateText`, `cssEscapeValue`, `isVisibleElement`,
 * `isDisabledElement`, `getElementRole`, `getElementType`, `getElementLabel`, `describeElement`
 * and `buildUniqueSelector`. `String.raw` keeps backslashes (regex escapes, the `'\\'` literal)
 * exactly as written so they survive into the evaluated script.
 *
 * `getElementLabel` never falls back to the current value of a password input, so typed secrets
 * are not copied into page snapshots.
 */
const DOM_HELPERS_SOURCE = String.raw`
  const truncateText = (value, max) => {
    const normalized = String(value ?? '').replace(/\s+/g, ' ').trim();
    if (!normalized) return '';
    if (normalized.length <= max) return normalized;
    const safeMax = Math.max(0, max - 3);
    return normalized.slice(0, safeMax).trim() + '...';
  };

  const cssEscapeValue = (value) => {
    if (typeof CSS !== 'undefined' && typeof CSS.escape === 'function') {
      return CSS.escape(value);
    }
    return String(value).replace(/[^a-zA-Z0-9_-]/g, (char) => '\\' + char);
  };

  const isVisibleElement = (element) => {
    if (!(element instanceof Element)) return false;
    const style = window.getComputedStyle(element);
    if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') {
      return false;
    }
    if (element.getAttribute('aria-hidden') === 'true') return false;
    const rect = element.getBoundingClientRect();
    return rect.width > 0 && rect.height > 0;
  };

  const isDisabledElement = (element) => {
    if (!(element instanceof Element)) return true;
    if (element.getAttribute('aria-disabled') === 'true') return true;
    return 'disabled' in element && Boolean(element.disabled);
  };

  const getElementRole = (element) => {
    const explicitRole = element.getAttribute('role');
    if (explicitRole) return explicitRole;
    if (element instanceof HTMLAnchorElement) return 'link';
    if (element instanceof HTMLButtonElement) return 'button';
    if (element instanceof HTMLInputElement) return element.type === 'checkbox' ? 'checkbox' : 'input';
    if (element instanceof HTMLTextAreaElement) return 'textbox';
    if (element instanceof HTMLSelectElement) return 'combobox';
    if (element instanceof HTMLElement && element.isContentEditable) return 'textbox';
    return null;
  };

  const getElementType = (element) => {
    if (element instanceof HTMLInputElement) return element.type || 'text';
    if (element instanceof HTMLTextAreaElement) return 'textarea';
    if (element instanceof HTMLSelectElement) return 'select';
    if (element instanceof HTMLButtonElement) return 'button';
    if (element instanceof HTMLElement && element.isContentEditable) return 'contenteditable';
    return null;
  };

  const getElementLabel = (element) => {
    const ariaLabel = truncateText(element.getAttribute('aria-label') ?? '', 120);
    if (ariaLabel) return ariaLabel;

    if ('labels' in element && element.labels && element.labels.length > 0) {
      const labelText = truncateText(
        Array.from(element.labels).map((label) => label.innerText || label.textContent || '').join(' '),
        120,
      );
      if (labelText) return labelText;
    }

    if (element.id) {
      const label = document.querySelector('label[for="' + cssEscapeValue(element.id) + '"]');
      const labelText = truncateText(label?.textContent ?? '', 120);
      if (labelText) return labelText;
    }

    const placeholder = truncateText(element.getAttribute('placeholder') ?? '', 120);
    if (placeholder) return placeholder;

    // Never use the current value of a password field as its label: the label is exported
    // in page snapshots and would leak the secret.
    if (element instanceof HTMLInputElement && element.type === 'password') return null;

    const text = truncateText(
      element instanceof HTMLInputElement || element instanceof HTMLTextAreaElement
        ? element.value
        : element.textContent ?? '',
      120,
    );
    return text || null;
  };

  const describeElement = (element) => {
    const role = getElementRole(element) || element.tagName.toLowerCase();
    const label = getElementLabel(element);
    return label ? role + ' "' + label + '"' : role;
  };

  const buildUniqueSelector = (element) => {
    if (!(element instanceof Element)) return null;

    if (element.id) {
      const idSelector = '#' + cssEscapeValue(element.id);
      try {
        if (document.querySelectorAll(idSelector).length === 1) return idSelector;
      } catch {}
    }

    const segments = [];
    let current = element;
    while (current && current instanceof Element && current !== document.documentElement) {
      const tag = current.tagName.toLowerCase();
      if (!tag) break;

      let segment = tag;
      const name = current.getAttribute('name');
      if (name) {
        const nameSelector = tag + '[name="' + cssEscapeValue(name) + '"]';
        try {
          if (document.querySelectorAll(nameSelector).length === 1) {
            segments.unshift(nameSelector);
            return segments.join(' > ');
          }
        } catch {}
      }

      const parent = current.parentElement;
      if (parent) {
        const sameTagSiblings = Array.from(parent.children).filter((child) => child.tagName === current.tagName);
        const position = sameTagSiblings.indexOf(current) + 1;
        segment += ':nth-of-type(' + position + ')';
      }

      segments.unshift(segment);
      const selector = segments.join(' > ');
      try {
        if (document.querySelectorAll(selector).length === 1) return selector;
      } catch {}

      current = current.parentElement;
    }

    return segments.length > 0 ? segments.join(' > ') : null;
  };
`;
export interface BrowserBounds {
x: number;
@ -30,30 +198,236 @@ export interface BrowserBounds {
height: number;
}
/**
 * Serializable description of a single browser tab.
 *
 * NOTE(review): a type with this name is also imported from '@x/shared/dist/browser-control.js'
 * near the top of this file — presumably this local declaration is the superseded copy; confirm.
 */
export interface BrowserTabState {
id: string;
url: string;
title: string;
canGoBack: boolean;
canGoForward: boolean;
loading: boolean;
}
/**
 * Serializable description of the whole browser pane: the tab list plus the id of the active
 * tab (`null` when there is none).
 *
 * NOTE(review): also imported from '@x/shared/dist/browser-control.js' — see the note on
 * BrowserTabState; confirm which declaration is authoritative.
 */
export interface BrowserState {
activeTabId: string | null;
tabs: BrowserTabState[];
}
/** A live tab: its id paired with the Electron view that renders it. */
type BrowserTab = {
id: string;
view: WebContentsView;
};
/**
 * Per-tab record kept in `snapshotCache`: the id of the last snapshot handed out and the
 * index → selector mapping needed to resolve element indexes from that snapshot.
 */
type CachedSnapshot = {
snapshotId: string;
elements: Array<{ index: number; selector: string }>;
};
/** A page element as returned by the in-page read script, still carrying its CSS selector. */
type RawBrowserPageElement = BrowserPageElement & {
selector: string;
};
/** Result shape of the in-page read script, before a snapshot id is assigned. */
type RawBrowserPageSnapshot = {
url: string;
title: string;
loading: boolean;
text: string;
elements: RawBrowserPageElement[];
};
/**
 * How a caller identifies the element to act on: an explicit CSS `selector`, or an `index`
 * from a previous page snapshot (optionally pinned to that snapshot via `snapshotId`).
 */
type ElementTarget = {
index?: number;
selector?: string;
snapshotId?: string;
};
/** Browser state with no tabs and no active tab. */
const EMPTY_STATE: BrowserState = {
activeTabId: null,
tabs: [],
};
/**
 * Throws when the given signal has already been aborted; otherwise does nothing.
 *
 * @param signal Optional abort signal to inspect.
 * @throws The signal's own `reason` when it is an `Error`, otherwise a generic abort error.
 */
function abortIfNeeded(signal?: AbortSignal): void {
  if (signal?.aborted) {
    const reason: unknown = signal.reason;
    if (reason instanceof Error) {
      throw reason;
    }
    throw new Error('Browser action aborted');
  }
}
/**
 * Resolves after `ms` milliseconds, or rejects as soon as `signal` aborts.
 *
 * @param ms Delay in milliseconds; zero or negative values return immediately.
 * @param signal Optional abort signal that cancels the wait.
 * @throws The abort reason (or a generic abort error) if the signal is or becomes aborted.
 */
async function sleep(ms: number, signal?: AbortSignal): Promise<void> {
  if (ms <= 0) return;
  abortIfNeeded(signal);

  await new Promise<void>((done, fail) => {
    const handleAbort = (): void => {
      clearTimeout(pending);
      signal?.removeEventListener('abort', handleAbort);
      const reason: unknown = signal?.reason;
      fail(reason instanceof Error ? reason : new Error('Browser action aborted'));
    };

    const pending = setTimeout(() => {
      // Detach the listener so a long-lived signal does not accumulate handlers.
      signal?.removeEventListener('abort', handleAbort);
      done();
    }, ms);

    signal?.addEventListener('abort', handleAbort, { once: true });
  });
}
/**
 * Builds the script that snapshots the current page from inside the tab.
 *
 * The script collects visible interactable elements (each with a unique selector and
 * descriptive metadata) plus the page URL, title, ready state and truncated body text.
 *
 * @param maxElements Maximum number of elements to return; zero or less yields none.
 * @param maxTextLength Maximum length of the returned body text.
 * @returns JavaScript source of an immediately-invoked function that evaluates to the raw snapshot.
 */
function buildReadPageScript(maxElements: number, maxTextLength: number): string {
  return `(() => {
    ${DOM_HELPERS_SOURCE}

    const candidates = Array.from(document.querySelectorAll(${JSON.stringify(INTERACTABLE_SELECTORS)}));
    const elements = [];
    const seenSelectors = new Set();

    for (const candidate of candidates) {
      // Check the limit before collecting, so a limit of 0 really returns no elements.
      if (elements.length >= ${JSON.stringify(maxElements)}) break;
      if (!(candidate instanceof Element)) continue;
      if (!isVisibleElement(candidate)) continue;

      const selector = buildUniqueSelector(candidate);
      if (!selector || seenSelectors.has(selector)) continue;
      seenSelectors.add(selector);

      elements.push({
        index: elements.length + 1,
        selector,
        tagName: candidate.tagName.toLowerCase(),
        role: getElementRole(candidate),
        type: getElementType(candidate),
        label: getElementLabel(candidate),
        text: truncateText(candidate.innerText || candidate.textContent || '', 120) || null,
        placeholder: truncateText(candidate.getAttribute('placeholder') ?? '', 120) || null,
        href: candidate instanceof HTMLAnchorElement ? candidate.href : candidate.getAttribute('href'),
        disabled: isDisabledElement(candidate),
      });
    }

    return {
      url: window.location.href,
      title: document.title || '',
      loading: document.readyState !== 'complete',
      text: truncateText(document.body?.innerText || document.body?.textContent || '', ${JSON.stringify(maxTextLength)}),
      elements,
    };
  })()`;
}
/**
 * Builds the in-page script that clicks the element matching `selector`.
 *
 * The script reports `{ ok: false, error }` when the element is missing, not visible, or
 * disabled. Otherwise it scrolls the element into view, focuses it and clicks it (elements that
 * are not HTMLElements get a synthetic bubbling `click` MouseEvent instead) and reports
 * `{ ok: true, description }`.
 *
 * @param selector CSS selector; embedded with JSON.stringify so it is a valid JS string literal.
 * @returns JavaScript source of an immediately-invoked function.
 */
function buildClickScript(selector: string): string {
return `(() => {
${DOM_HELPERS_SOURCE}
const element = document.querySelector(${JSON.stringify(selector)});
if (!(element instanceof Element)) {
return { ok: false, error: 'Element not found.' };
}
if (!isVisibleElement(element)) {
return { ok: false, error: 'Element is not visible.' };
}
if (isDisabledElement(element)) {
return { ok: false, error: 'Element is disabled.' };
}
if (element instanceof HTMLElement) {
element.scrollIntoView({ block: 'center', inline: 'center' });
element.focus({ preventScroll: true });
element.click();
} else {
element.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true, view: window }));
}
return { ok: true, description: describeElement(element) };
})()`;
}
/**
 * Builds the in-page script that replaces the text of the element matching `selector`.
 *
 * The script reports `{ ok: false, error }` when the element is missing, not visible, disabled,
 * read-only, or not a text target. For inputs and textareas it sets the value through the
 * prototype's `value` setter when one exists, then dispatches bubbling `input` and `change`
 * events. For contenteditable elements it overwrites `textContent` and dispatches `input`.
 * The existing content is replaced, not appended to.
 *
 * NOTE(review): going through the prototype setter presumably exists so frameworks that
 * intercept the instance-level `value` property still see the change — confirm.
 *
 * @param selector CSS selector; embedded with JSON.stringify.
 * @param text The new value; embedded with JSON.stringify.
 * @returns JavaScript source of an immediately-invoked function.
 */
function buildTypeScript(selector: string, text: string): string {
return `(() => {
${DOM_HELPERS_SOURCE}
const element = document.querySelector(${JSON.stringify(selector)});
if (!(element instanceof Element)) {
return { ok: false, error: 'Element not found.' };
}
if (!isVisibleElement(element)) {
return { ok: false, error: 'Element is not visible.' };
}
if (isDisabledElement(element)) {
return { ok: false, error: 'Element is disabled.' };
}
const nextValue = ${JSON.stringify(text)};
const setNativeValue = (target, value) => {
const prototype = Object.getPrototypeOf(target);
const descriptor = Object.getOwnPropertyDescriptor(prototype, 'value');
if (descriptor && typeof descriptor.set === 'function') {
descriptor.set.call(target, value);
} else {
target.value = value;
}
};
if (element instanceof HTMLInputElement || element instanceof HTMLTextAreaElement) {
if (element.readOnly) {
return { ok: false, error: 'Element is read-only.' };
}
element.scrollIntoView({ block: 'center', inline: 'center' });
element.focus({ preventScroll: true });
setNativeValue(element, nextValue);
element.dispatchEvent(new InputEvent('input', { bubbles: true, data: nextValue, inputType: 'insertText' }));
element.dispatchEvent(new Event('change', { bubbles: true }));
return { ok: true, description: describeElement(element) };
}
if (element instanceof HTMLElement && element.isContentEditable) {
element.scrollIntoView({ block: 'center', inline: 'center' });
element.focus({ preventScroll: true });
element.textContent = nextValue;
element.dispatchEvent(new InputEvent('input', { bubbles: true, data: nextValue, inputType: 'insertText' }));
return { ok: true, description: describeElement(element) };
}
return { ok: false, error: 'Element does not accept text input.' };
})()`;
}
/**
 * Builds the in-page script that focuses the element matching `selector`.
 *
 * The script reports `{ ok: false, error }` when the element is missing or not visible.
 * Otherwise it scrolls an HTMLElement into view and focuses it, and reports
 * `{ ok: true, description }`. Unlike the click and type scripts, it does not reject
 * disabled elements.
 *
 * @param selector CSS selector; embedded with JSON.stringify.
 * @returns JavaScript source of an immediately-invoked function.
 */
function buildFocusScript(selector: string): string {
return `(() => {
${DOM_HELPERS_SOURCE}
const element = document.querySelector(${JSON.stringify(selector)});
if (!(element instanceof Element)) {
return { ok: false, error: 'Element not found.' };
}
if (!isVisibleElement(element)) {
return { ok: false, error: 'Element is not visible.' };
}
if (element instanceof HTMLElement) {
element.scrollIntoView({ block: 'center', inline: 'center' });
element.focus({ preventScroll: true });
}
return { ok: true, description: describeElement(element) };
})()`;
}
/**
 * Builds the in-page script that scrolls the window vertically.
 *
 * @param offset Signed pixel distance; positive scrolls down, negative scrolls up.
 * @returns JavaScript source of an immediately-invoked function that evaluates to `{ ok: true }`.
 */
function buildScrollScript(offset: number): string {
  const top = JSON.stringify(offset);
  const lines = [
    '(() => {',
    `window.scrollBy({ top: ${top}, left: 0, behavior: 'auto' });`,
    'return { ok: true };',
    '})()',
  ];
  return lines.join('\n');
}
/** Lower-cased friendly key names mapped to the key codes sent to the page. */
const KEY_CODE_ALIASES: ReadonlyMap<string, string> = new Map([
  ['esc', 'Escape'],
  ['escape', 'Escape'],
  ['return', 'Enter'],
  ['enter', 'Enter'],
  ['tab', 'Tab'],
  ['space', 'Space'],
  ['left', 'ArrowLeft'],
  ['right', 'ArrowRight'],
  ['up', 'ArrowUp'],
  ['down', 'ArrowDown'],
  ['arrowleft', 'ArrowLeft'],
  ['arrowright', 'ArrowRight'],
  ['arrowup', 'ArrowUp'],
  ['arrowdown', 'ArrowDown'],
  ['backspace', 'Backspace'],
  ['delete', 'Delete'],
]);

/**
 * Normalizes a user-supplied key name into the key code used for input events.
 *
 * - A string made only of spaces means the space bar (`'Space'`).
 * - Any other empty/whitespace-only input falls back to `'Enter'`.
 * - Known aliases are matched case-insensitively (`esc`, `return`, `left`, ...).
 * - A single character is upper-cased; longer names only get their first letter capitalized.
 *
 * @param key Raw key name.
 * @returns The normalized key code.
 */
function normalizeKeyCode(key: string): string {
  const trimmed = key.trim();
  if (!trimmed) {
    // trim() removes a literal space, so the space bar must be detected on the raw input.
    return /^ +$/.test(key) ? 'Space' : 'Enter';
  }

  // A Map avoids accidental hits on Object.prototype members such as "constructor".
  const alias = KEY_CODE_ALIASES.get(trimmed.toLowerCase());
  if (alias !== undefined) return alias;

  if (trimmed.length === 1) return trimmed.toUpperCase();
  return trimmed.charAt(0).toUpperCase() + trimmed.slice(1);
}
export class BrowserViewManager extends EventEmitter {
private window: BrowserWindow | null = null;
private browserSession: Session | null = null;
@ -63,6 +437,7 @@ export class BrowserViewManager extends EventEmitter {
private attachedTabId: string | null = null;
private visible = false;
private bounds: BrowserBounds = { x: 0, y: 0, width: 0, height: 0 };
private snapshotCache = new Map<string, CachedSnapshot>();
attach(window: BrowserWindow): void {
this.window = window;
@ -74,6 +449,7 @@ export class BrowserViewManager extends EventEmitter {
this.activeTabId = null;
this.attachedTabId = null;
this.visible = false;
this.snapshotCache.clear();
});
}
@ -98,6 +474,10 @@ export class BrowserViewManager extends EventEmitter {
return this.getTab(this.activeTabId);
}
/**
 * Drops the cached page snapshot for `tabId`, so element indexes from an earlier read can no
 * longer be resolved until the page is read again.
 */
private invalidateSnapshot(tabId: string): void {
this.snapshotCache.delete(tabId);
}
private normalizeUrl(rawUrl: string): string {
let url = rawUrl.trim();
if (!/^[a-z][a-z0-9+.-]*:/i.test(url)) {
@ -128,11 +508,6 @@ export class BrowserViewManager extends EventEmitter {
private wireEvents(tabId: string, view: WebContentsView): void {
const wc = view.webContents;
const emit = () => this.emitState();
// Electron occasionally drops WebContentsView layout on navigation.
// Re-applying the cached bounds is cheap and keeps the active tab pinned
// to the renderer-computed viewport.
const reapplyBounds = () => {
if (
this.attachedTabId === tabId &&
@ -144,15 +519,23 @@ export class BrowserViewManager extends EventEmitter {
}
};
wc.on('did-start-navigation', reapplyBounds);
wc.on('did-navigate', () => { reapplyBounds(); emit(); });
wc.on('did-navigate-in-page', () => { reapplyBounds(); emit(); });
wc.on('did-start-loading', () => { reapplyBounds(); emit(); });
wc.on('did-stop-loading', () => { reapplyBounds(); emit(); });
wc.on('did-finish-load', () => { reapplyBounds(); emit(); });
const invalidateAndEmit = () => {
this.invalidateSnapshot(tabId);
this.emitState();
};
wc.on('did-start-navigation', () => {
this.invalidateSnapshot(tabId);
reapplyBounds();
});
wc.on('did-navigate', () => { reapplyBounds(); invalidateAndEmit(); });
wc.on('did-navigate-in-page', () => { reapplyBounds(); invalidateAndEmit(); });
wc.on('did-start-loading', () => { this.invalidateSnapshot(tabId); reapplyBounds(); this.emitState(); });
wc.on('did-stop-loading', () => { reapplyBounds(); invalidateAndEmit(); });
wc.on('did-finish-load', () => { reapplyBounds(); invalidateAndEmit(); });
wc.on('did-frame-finish-load', reapplyBounds);
wc.on('did-fail-load', () => { reapplyBounds(); emit(); });
wc.on('page-title-updated', emit);
wc.on('did-fail-load', () => { reapplyBounds(); invalidateAndEmit(); });
wc.on('page-title-updated', this.emitState.bind(this));
wc.setWindowOpenHandler(({ url }) => {
if (this.isEmbeddedTabUrl(url)) {
@ -223,6 +606,7 @@ export class BrowserViewManager extends EventEmitter {
this.tabs.set(tabId, tab);
this.tabOrder.push(tabId);
this.activeTabId = tabId;
this.invalidateSnapshot(tabId);
this.syncAttachedView();
this.emitState();
@ -244,12 +628,101 @@ export class BrowserViewManager extends EventEmitter {
}
/**
 * Releases everything held for a tab: its cached snapshot, its event listeners and, if it is
 * still alive, the underlying web contents.
 */
private destroyTab(tab: BrowserTab): void {
  const { id, view } = tab;
  this.invalidateSnapshot(id);
  view.webContents.removeAllListeners();
  if (view.webContents.isDestroyed()) {
    return;
  }
  view.webContents.close();
}
/**
 * Waits until the given web contents has stopped loading and stayed idle for a short period.
 *
 * Polls every 100 ms while the page is loading. Once it is not loading, waits `idleMs` (capped
 * at 200 ms if no load was ever observed) and returns if the page is still idle. Gives up
 * silently after `timeoutMs`, and returns immediately if the web contents is destroyed.
 *
 * @param wc Web contents to observe.
 * @param signal Optional abort signal; aborting rejects the wait.
 * @param idleMs Quiet period required after loading stops.
 * @param timeoutMs Upper bound on the total wait.
 */
private async waitForWebContentsSettle(
  wc: WebContents,
  signal?: AbortSignal,
  idleMs = POST_ACTION_IDLE_MS,
  timeoutMs = NAVIGATION_TIMEOUT_MS,
): Promise<void> {
  // Querying a destroyed WebContents throws, so check before the first isLoading() call too.
  if (wc.isDestroyed()) return;

  const startedAt = Date.now();
  let sawLoading = wc.isLoading();

  while (Date.now() - startedAt < timeoutMs) {
    abortIfNeeded(signal);
    if (wc.isDestroyed()) return;

    if (wc.isLoading()) {
      sawLoading = true;
      await sleep(100, signal);
      continue;
    }

    await sleep(sawLoading ? idleMs : Math.min(idleMs, 200), signal);
    // The tab may have been closed during the idle sleep; re-check before touching it again.
    if (wc.isDestroyed() || !wc.isLoading()) return;
    sawLoading = true;
  }
}
/**
 * Runs `script` in the active tab (creating the initial tab if none exists) once the tab has
 * settled, and returns whatever the script evaluates to.
 *
 * @param script JavaScript source to evaluate in the page.
 * @param signal Optional abort signal, checked before and after waiting for the tab to settle.
 */
private async executeOnActiveTab<T>(script: string, signal?: AbortSignal): Promise<T> {
  abortIfNeeded(signal);

  const { webContents } = (this.getActiveTab() ?? this.ensureInitialTab()).view;
  await this.waitForWebContentsSettle(webContents, signal);

  abortIfNeeded(signal);
  const evaluation = webContents.executeJavaScript(script, true) as Promise<T>;
  return evaluation;
}
/**
 * Assigns a fresh snapshot id to a raw page read, remembers the index → selector mapping for
 * the tab, and returns the public snapshot with selectors stripped from its elements.
 *
 * @param tabId Tab the snapshot belongs to.
 * @param rawSnapshot Result of the in-page read script.
 * @param loading Loading flag to report (supplied by the caller rather than taken from the raw snapshot).
 */
private cacheSnapshot(tabId: string, rawSnapshot: RawBrowserPageSnapshot, loading: boolean): BrowserPageSnapshot {
  const snapshotId = randomUUID();
  const publicElements: BrowserPageElement[] = [];
  const selectorsByIndex: CachedSnapshot['elements'] = [];

  rawSnapshot.elements.forEach((rawElement, offset) => {
    // Indexes are renumbered from 1 in array order, regardless of what the script reported.
    const position = offset + 1;
    const { selector, ...publicFields } = rawElement;
    publicElements.push({ ...publicFields, index: position });
    selectorsByIndex.push({ index: position, selector });
  });

  this.snapshotCache.set(tabId, { snapshotId, elements: selectorsByIndex });

  const { url, title, text } = rawSnapshot;
  return { snapshotId, url, title, loading, text, elements: publicElements };
}
/**
 * Turns an element target into a concrete CSS selector.
 *
 * An explicit non-blank selector wins. Otherwise the index is looked up in the tab's cached
 * snapshot; when the target also names a snapshot id, it must match the cached one.
 *
 * @returns `{ ok: true, selector }` or `{ ok: false, error }` describing why resolution failed.
 */
private resolveElementSelector(tabId: string, target: ElementTarget): { ok: true; selector: string } | { ok: false; error: string } {
  const explicitSelector = target.selector?.trim();
  if (explicitSelector) {
    return { ok: true, selector: explicitSelector };
  }

  if (target.index == null) {
    return { ok: false, error: 'Provide an element index or selector.' };
  }

  const snapshot = this.snapshotCache.get(tabId);
  if (!snapshot) {
    return { ok: false, error: 'No page snapshot is available yet. Call read-page first.' };
  }

  const isStale = Boolean(target.snapshotId) && snapshot.snapshotId !== target.snapshotId;
  if (isStale) {
    return { ok: false, error: 'The page changed since the last read-page call. Call read-page again.' };
  }

  for (const candidate of snapshot.elements) {
    if (candidate.index === target.index) {
      return { ok: true, selector: candidate.selector };
    }
  }
  return { ok: false, error: `No element found for index ${target.index}.` };
}
setVisible(visible: boolean): void {
this.visible = visible;
if (visible) {
@ -266,6 +739,11 @@ export class BrowserViewManager extends EventEmitter {
}
}
/**
 * Waits for the active tab to finish loading, creating the initial tab first if none exists.
 *
 * @param signal Optional abort signal that cancels the wait.
 */
async ensureActiveTabReady(signal?: AbortSignal): Promise<void> {
  const tab = this.getActiveTab() ?? this.ensureInitialTab();
  const { webContents } = tab.view;
  await this.waitForWebContentsSettle(webContents, signal);
}
async newTab(rawUrl?: string): Promise<{ ok: boolean; tabId?: string; error?: string }> {
try {
const tab = this.createTab(rawUrl?.trim() ? rawUrl : HOME_URL);
@ -313,6 +791,7 @@ export class BrowserViewManager extends EventEmitter {
async navigate(rawUrl: string): Promise<{ ok: boolean; error?: string }> {
try {
const activeTab = this.getActiveTab() ?? this.ensureInitialTab();
this.invalidateSnapshot(activeTab.id);
await activeTab.view.webContents.loadURL(this.normalizeUrl(rawUrl));
return { ok: true };
} catch (err) {
@ -325,6 +804,7 @@ export class BrowserViewManager extends EventEmitter {
if (!activeTab) return { ok: false };
const history = activeTab.view.webContents.navigationHistory;
if (!history.canGoBack()) return { ok: false };
this.invalidateSnapshot(activeTab.id);
history.goBack();
return { ok: true };
}
@ -334,6 +814,7 @@ export class BrowserViewManager extends EventEmitter {
if (!activeTab) return { ok: false };
const history = activeTab.view.webContents.navigationHistory;
if (!history.canGoForward()) return { ok: false };
this.invalidateSnapshot(activeTab.id);
history.goForward();
return { ok: true };
}
@ -341,9 +822,184 @@ export class BrowserViewManager extends EventEmitter {
/** Reloads the active tab and discards its cached snapshot; does nothing when no tab is active. */
reload(): void {
  const tab = this.getActiveTab();
  if (tab) {
    this.invalidateSnapshot(tab.id);
    tab.view.webContents.reload();
  }
}
/**
 * Reads the active tab's page and caches the resulting snapshot for later index lookups.
 *
 * @param options Optional limits; missing values fall back to the default read limits.
 * @param signal Optional abort signal.
 * @returns `{ ok: true, page }` on success, `{ ok: false, error }` if the read fails.
 */
async readPage(
  options?: { maxElements?: number; maxTextLength?: number },
  signal?: AbortSignal,
): Promise<{ ok: boolean; page?: BrowserPageSnapshot; error?: string }> {
  try {
    const tab = this.getActiveTab() ?? this.ensureInitialTab();
    const elementLimit = options?.maxElements ?? DEFAULT_READ_MAX_ELEMENTS;
    const textLimit = options?.maxTextLength ?? DEFAULT_READ_MAX_TEXT_LENGTH;
    const script = buildReadPageScript(elementLimit, textLimit);

    const raw = await this.executeOnActiveTab<RawBrowserPageSnapshot>(script, signal);

    // Report the loading state seen from the main process after the read completed.
    const stillLoading = tab.view.webContents.isLoading();
    const page = this.cacheSnapshot(tab.id, raw, stillLoading);
    return { ok: true, page };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Failed to read the current page.';
    return { ok: false, error: message };
  }
}
/**
 * Reads a compact snapshot of the active page using the smaller post-action limits.
 *
 * @param signal Optional abort signal.
 * @returns The snapshot, or `null` when the page could not be read.
 */
async readPageSummary(signal?: AbortSignal): Promise<BrowserPageSnapshot | null> {
  const limits = {
    maxElements: POST_ACTION_MAX_ELEMENTS,
    maxTextLength: POST_ACTION_MAX_TEXT_LENGTH,
  };
  const summary = await this.readPage(limits, signal);
  if (!summary.ok) {
    return null;
  }
  return summary.page ?? null;
}
/**
 * Clicks the targeted element in the active tab, then waits for the tab to settle.
 *
 * @param target Element selector or snapshot index.
 * @param signal Optional abort signal.
 * @returns `{ ok: true, description }` on success, otherwise `{ ok: false, error }`.
 */
async click(target: ElementTarget, signal?: AbortSignal): Promise<{ ok: boolean; error?: string; description?: string }> {
  const tab = this.getActiveTab();
  if (!tab) {
    return { ok: false, error: 'No active browser tab is open.' };
  }

  const selection = this.resolveElementSelector(tab.id, target);
  if (!selection.ok) return selection;

  try {
    const script = buildClickScript(selection.selector);
    const outcome = await this.executeOnActiveTab<{ ok: boolean; error?: string; description?: string }>(
      script,
      signal,
    );
    if (outcome.ok) {
      // The click may have changed the page, so indexes from the old snapshot are no longer valid.
      this.invalidateSnapshot(tab.id);
      await this.waitForWebContentsSettle(tab.view.webContents, signal);
    }
    return outcome;
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Failed to click the element.';
    return { ok: false, error: message };
  }
}
/**
 * Replaces the text of the targeted element in the active tab, then waits for the tab to settle.
 *
 * @param target Element selector or snapshot index.
 * @param text New value for the element.
 * @param signal Optional abort signal.
 * @returns `{ ok: true, description }` on success, otherwise `{ ok: false, error }`.
 */
async type(target: ElementTarget, text: string, signal?: AbortSignal): Promise<{ ok: boolean; error?: string; description?: string }> {
  const tab = this.getActiveTab();
  if (!tab) {
    return { ok: false, error: 'No active browser tab is open.' };
  }

  const selection = this.resolveElementSelector(tab.id, target);
  if (!selection.ok) return selection;

  try {
    const script = buildTypeScript(selection.selector, text);
    const outcome = await this.executeOnActiveTab<{ ok: boolean; error?: string; description?: string }>(
      script,
      signal,
    );
    if (outcome.ok) {
      // Typing can trigger re-renders, so the cached snapshot is dropped.
      this.invalidateSnapshot(tab.id);
      await this.waitForWebContentsSettle(tab.view.webContents, signal);
    }
    return outcome;
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Failed to type into the element.';
    return { ok: false, error: message };
  }
}
/**
 * Sends a key press to the active tab, optionally focusing a target element first.
 *
 * The key is delivered as keyDown / (char) / keyUp input events. For single-character keys the
 * `char` event carries the character exactly as supplied, so pressing "a" types "a" and not
 * the upper-cased key code "A".
 *
 * @param key Key name or single character.
 * @param target Optional element to focus before pressing; defaults to whatever has focus.
 * @param signal Optional abort signal.
 * @returns `{ ok: true, description }` on success, otherwise `{ ok: false, error }`.
 */
async press(
  key: string,
  target?: ElementTarget,
  signal?: AbortSignal,
): Promise<{ ok: boolean; error?: string; description?: string }> {
  const activeTab = this.getActiveTab();
  if (!activeTab) {
    return { ok: false, error: 'No active browser tab is open.' };
  }

  let description = 'active element';
  if (target?.index != null || target?.selector?.trim()) {
    const resolved = this.resolveElementSelector(activeTab.id, target);
    if (!resolved.ok) return resolved;

    try {
      const focusResult = await this.executeOnActiveTab<{ ok: boolean; error?: string; description?: string }>(
        buildFocusScript(resolved.selector),
        signal,
      );
      if (!focusResult.ok) return focusResult;
      description = focusResult.description ?? description;
    } catch (error) {
      return {
        ok: false,
        error: error instanceof Error ? error.message : 'Failed to focus the element before pressing a key.',
      };
    }
  }

  try {
    const wc = activeTab.view.webContents;
    const keyCode = normalizeKeyCode(key);

    wc.sendInputEvent({ type: 'keyDown', keyCode });
    if (keyCode.length === 1) {
      // keyCode is upper-cased for the key events; the text inserted by the char event must
      // keep the caller's original case.
      const typed = key.trim();
      wc.sendInputEvent({ type: 'char', keyCode: typed.length === 1 ? typed : keyCode });
    }
    wc.sendInputEvent({ type: 'keyUp', keyCode });

    this.invalidateSnapshot(activeTab.id);
    await this.waitForWebContentsSettle(wc, signal);

    return {
      ok: true,
      description: `${keyCode} on ${description}`,
    };
  } catch (error) {
    return {
      ok: false,
      error: error instanceof Error ? error.message : 'Failed to press the requested key.',
    };
  }
}
/**
 * Scrolls the active tab's page up or down.
 *
 * The distance is clamped to at least 1 and negated for `'up'`, then passed to
 * `buildScrollScript`. After a successful scroll the tab's snapshot is
 * invalidated and the method sleeps 250 ms before returning.
 *
 * @param direction Scroll direction; defaults to `'down'`.
 * @param amount Scroll distance; defaults to 700.
 * @param signal Optional abort signal forwarded to the script run and the sleep.
 * @returns The page script's result, or `{ ok: false, error }` on failure.
 */
async scroll(direction: 'up' | 'down' = 'down', amount = 700, signal?: AbortSignal): Promise<{ ok: boolean; error?: string }> {
    const tab = this.getActiveTab();
    if (!tab) {
        return { ok: false, error: 'No active browser tab is open.' };
    }

    try {
        const magnitude = Math.max(1, amount);
        const signedOffset = direction === 'up' ? -magnitude : magnitude;
        const outcome = await this.executeOnActiveTab<{ ok: boolean; error?: string }>(
            buildScrollScript(signedOffset),
            signal,
        );
        if (outcome.ok) {
            this.invalidateSnapshot(tab.id);
            await sleep(250, signal);
        }
        return outcome;
    } catch (error) {
        const message = error instanceof Error ? error.message : 'Failed to scroll the page.';
        return { ok: false, error: message };
    }
}
/**
 * Sleeps for `ms` milliseconds, then waits for the active tab (if any) to settle.
 *
 * @param ms Delay passed to `sleep`; defaults to 1000.
 * @param signal Optional abort signal forwarded to both waits.
 */
async wait(ms = 1000, signal?: AbortSignal): Promise<void> {
    await sleep(ms, signal);

    const tab = this.getActiveTab();
    if (tab) {
        await this.waitForWebContentsSettle(tab.view.webContents, signal);
    }
}
/** Returns the browser state produced by `snapshotState()`. */
getState(): BrowserState {
    const state = this.snapshotState();
    return state;
}

View file

@ -31,8 +31,10 @@ import started from "electron-squirrel-startup";
import { execSync, exec, execFileSync } from "node:child_process";
import { promisify } from "node:util";
import { init as initChromeSync } from "@x/core/dist/knowledge/chrome-extension/server/server.js";
import { registerContainerValues } from "@x/core/dist/di/container.js";
import { browserViewManager } from "./browser/view.js";
import { setupBrowserEventForwarding } from "./browser/ipc.js";
import { ElectronBrowserControlService } from "./browser/control-service.js";
const execAsync = promisify(exec);
@ -221,6 +223,10 @@ app.whenReady().then(async () => {
// Initialize all config files before UI can access them
await initConfigs();
registerContainerValues({
browserControlService: new ElectronBrowserControlService(),
});
setupIpcHandlers();
setupBrowserEventForwarding();

View file

@ -780,6 +780,19 @@ function App() {
return cleanup
}, [refreshVoiceAvailability])
// Mirror 'browser:didRequestPaneState' IPC events into local pane state.
useEffect(() => {
  const unsubscribe = window.ipc.on('browser:didRequestPaneState', ({ open }) => {
    if (!open) {
      setIsBrowserOpen(false)
      return
    }
    // Opening the browser pane also opens the chat sidebar and un-maximizes the right pane.
    setIsBrowserOpen(true)
    setIsChatSidebarOpen(true)
    setIsRightPaneMaximized(false)
  })
  return unsubscribe
}, [])
const handleStartRecording = useCallback(() => {
setIsRecording(true)
isRecordingRef.current = true

View file

@ -231,6 +231,55 @@ export const getAppActionCardData = (tool: ToolCall): AppActionCardData | null =
}
}
/**
 * In-progress labels for `browser-control` tool calls, keyed by action name.
 * Used while a call has not yet completed.
 */
const BROWSER_PENDING_LABELS: Record<string, string> = {
  // Pane and tab management
  'open': 'Opening browser...',
  'get-state': 'Reading browser state...',
  'new-tab': 'Opening new browser tab...',
  'switch-tab': 'Switching browser tab...',
  'close-tab': 'Closing browser tab...',
  // Navigation
  'navigate': 'Navigating browser...',
  'back': 'Going back...',
  'forward': 'Going forward...',
  'reload': 'Reloading page...',
  // Page inspection and interaction
  'read-page': 'Reading page...',
  'click': 'Clicking page element...',
  'type': 'Typing into page...',
  'press': 'Sending key press...',
  'scroll': 'Scrolling page...',
  'wait': 'Waiting for page...',
}
/**
 * Builds the human-readable label shown for a `browser-control` tool call.
 *
 * While the call is not completed, a pending label is derived from the requested
 * action (with more specific wording for indexed click/type and for navigate).
 * Once completed, the result's error or message is surfaced instead.
 *
 * @param tool Tool call to describe.
 * @returns The label, or `null` when `tool` is not a `browser-control` call.
 */
export const getBrowserControlLabel = (tool: ToolCall): string | null => {
  if (tool.name !== 'browser-control') return null

  const input = normalizeToolInput(tool.input) as Record<string, unknown> | undefined
  const result = tool.result as Record<string, unknown> | undefined

  // Both payloads are untyped at runtime, so only accept `action` when it is a
  // real, non-empty string instead of trusting a cast.
  const nonEmptyString = (value: unknown): string | undefined =>
    typeof value === 'string' && value ? value : undefined
  const action = nonEmptyString(input?.action) ?? nonEmptyString(result?.action) ?? 'browser'

  if (tool.status !== 'completed') {
    if (action === 'click' && typeof input?.index === 'number') {
      return `Clicking element ${input.index}...`
    }
    if (action === 'type' && typeof input?.index === 'number') {
      return `Typing into element ${input.index}...`
    }
    if (action === 'navigate' && typeof input?.target === 'string') {
      return `Navigating to ${input.target}...`
    }
    // Own-property check: a plain-object lookup would otherwise resolve inherited
    // keys such as "constructor" or "toString" to functions rather than labels.
    const pendingLabel = Object.prototype.hasOwnProperty.call(BROWSER_PENDING_LABELS, action)
      ? BROWSER_PENDING_LABELS[action]
      : undefined
    return pendingLabel || 'Controlling browser...'
  }

  if (result?.success === false) {
    return typeof result.error === 'string' ? `Browser error: ${result.error}` : 'Browser action failed'
  }

  if (typeof result?.message === 'string' && result.message.trim()) {
    return result.message
  }

  return 'Controlled browser'
}
// Parse attached files from message content and return clean message + file paths.
export const parseAttachedFiles = (content: string): { message: string; files: string[] } => {
const attachedFilesRegex = /<attached-files>\s*([\s\S]*?)\s*<\/attached-files>/
@ -315,6 +364,7 @@ const TOOL_DISPLAY_NAMES: Record<string, string> = {
'web-search': 'Searching the web',
'save-to-memory': 'Saving to memory',
'app-navigation': 'Navigating app',
'browser-control': 'Controlling browser',
'composio-list-toolkits': 'Listing integrations',
'composio-search-tools': 'Searching tools',
'composio-execute-tool': 'Running tool',
@ -328,6 +378,8 @@ const TOOL_DISPLAY_NAMES: Record<string, string> = {
* Falls back to the raw tool name if no mapping exists.
*/
export const getToolDisplayName = (tool: ToolCall): string => {
const browserLabel = getBrowserControlLabel(tool)
if (browserLabel) return browserLabel
const composioData = getComposioActionCardData(tool)
if (composioData) return composioData.label
return TOOL_DISPLAY_NAMES[tool.name] || tool.name

View file

@ -71,6 +71,7 @@ Rowboat is an agentic assistant for everyday work - emails, meetings, projects,
**App Control:** When users ask you to open notes, show the bases or graph view, filter or search notes, or manage saved views, load the \`app-navigation\` skill first. It provides structured guidance for navigating the app UI and controlling the knowledge base view.
**Tracks (Auto-Updating Note Blocks):** When users ask you to **track**, **monitor**, **watch**, or **keep an eye on** something in a note or say things like "every morning tell me X", "show the current Y in this note", "pin live updates of Z here" load the \`tracks\` skill first. Also load it when a user presses Cmd+K with a note open and requests auto-refreshing content at the cursor. Track blocks are YAML-fenced scheduled blocks whose output is rewritten on each run — useful for weather, news, prices, status pages, and personal dashboards.
**Browser Control:** When users ask you to open a website, browse in-app, search the web in the embedded browser, or interact with a live webpage inside Rowboat, load the \`browser-control\` skill first. It explains the \`read-page -> indexed action -> refreshed page\` workflow for the browser pane.
## Learning About the User (save-to-memory)
@ -243,6 +244,7 @@ ${runtimeContextPrompt}
- \`slack-checkConnection\`, \`slack-listAvailableTools\`, \`slack-executeAction\` - Slack integration (requires Slack to be connected via Composio). Use \`slack-listAvailableTools\` first to discover available tool slugs, then \`slack-executeAction\` to execute them.
- \`web-search\` - Search the web. Returns rich results with full text, highlights, and metadata. The \`category\` parameter defaults to \`general\` (full web search) — only use a specific category like \`news\`, \`company\`, \`research paper\` etc. when the query is clearly about that type. For everyday queries (weather, restaurants, prices, how-to), use \`general\`.
- \`app-navigation\` - Control the app UI: open notes, switch views, filter/search the knowledge base, manage saved views. **Load the \`app-navigation\` skill before using this tool.**
- \`browser-control\` - Control the embedded browser pane: open sites, inspect the live page, switch tabs, and interact with indexed page elements. **Load the \`browser-control\` skill before using this tool.**
- \`save-to-memory\` - Save observations about the user to the agent memory system. Use this proactively during conversations.
- \`composio-list-toolkits\`, \`composio-search-tools\`, \`composio-execute-tool\`, \`composio-connect-toolkit\` — Composio integration tools. Load the \`composio-integration\` skill for usage guidance.

View file

@ -0,0 +1,104 @@
/**
 * Markdown guidance text for the `browser-control` skill.
 *
 * The value is assembled by string concatenation: only the first segment is
 * tagged with String.raw; every segment after a `+` is either an ordinary
 * template literal or a double-quoted string. Inline-code spans (text wrapped
 * in backticks) are spliced in as double-quoted strings because a backtick
 * cannot appear unescaped inside a template literal.
 */
export const skill = String.raw`
# Browser Control Skill
You have access to the **browser-control** tool, which controls Rowboat's embedded browser pane directly.
Use this skill when the user asks you to open a website, browse in-app, search the web in the browser pane, click something on a page, fill a form, or otherwise interact with a live webpage inside Rowboat.
## Core Workflow
1. Start with ` + "`browser-control({ action: \"open\" })`" + ` if the browser pane may not already be open.
2. Use ` + "`browser-control({ action: \"read-page\" })`" + ` to inspect the current page.
3. The tool returns:
- ` + "`snapshotId`" + `
- page ` + "`url`" + ` and ` + "`title`" + `
- visible page text
- interactable elements with numbered ` + "`index`" + ` values
4. Prefer acting on those numbered indices with ` + "`click`" + ` / ` + "`type`" + ` / ` + "`press`" + `.
5. After each action, read the returned page snapshot before deciding the next step.
## Actions
### open
Open the browser pane and ensure an active tab exists.
### get-state
Return the current browser tabs and active tab id.
### new-tab
Open a new browser tab.
Parameters:
- ` + "`target`" + ` (optional): URL or plain-language search query
### switch-tab
Switch to a tab by ` + "`tabId`" + `.
### close-tab
Close a tab by ` + "`tabId`" + `.
### navigate
Navigate the active tab.
Parameters:
- ` + "`target`" + `: URL or plain-language search query
Plain-language targets are converted into a search automatically.
### back / forward / reload
Standard browser navigation controls.
### read-page
Read the current page and return a compact snapshot.
Parameters:
- ` + "`maxElements`" + ` (optional)
- ` + "`maxTextLength`" + ` (optional)
### click
Click an element.
Prefer:
- ` + "`index`" + `: element index from ` + "`read-page`" + `
Optional:
- ` + "`snapshotId`" + `: include it when acting on a recent snapshot
- ` + "`selector`" + `: fallback only when no usable index exists
### type
Type into an input, textarea, or contenteditable element.
Parameters:
- ` + "`text`" + `: text to enter
- plus the same target fields as ` + "`click`" + `
### press
Send a key press such as ` + "`Enter`" + `, ` + "`Tab`" + `, ` + "`Escape`" + `, or arrow keys.
Parameters:
- ` + "`key`" + `
- optional target fields if you need to focus a specific element first
### scroll
Scroll the current page.
Parameters:
- ` + "`direction`" + `: ` + "`\"up\"`" + ` or ` + "`\"down\"`" + ` (optional; defaults down)
- ` + "`amount`" + `: pixel distance (optional)
### wait
Wait for the page to settle, useful after async UI changes.
Parameters:
- ` + "`ms`" + `: milliseconds to wait (optional)
## Important Rules
- Prefer ` + "`read-page`" + ` before interacting.
- Prefer element ` + "`index`" + ` over CSS selectors.
- If the tool says the snapshot is stale, call ` + "`read-page`" + ` again.
- After navigation, clicking, typing, pressing, or scrolling, use the returned page snapshot instead of assuming the page state.
- Use Rowboat's browser for live interaction. Use web search tools for research where a live session is unnecessary.
`;
export default skill;

View file

@ -11,6 +11,7 @@ import backgroundAgentsSkill from "./background-agents/skill.js";
import createPresentationsSkill from "./create-presentations/skill.js";
import appNavigationSkill from "./app-navigation/skill.js";
import browserControlSkill from "./browser-control/skill.js";
import composioIntegrationSkill from "./composio-integration/skill.js";
import tracksSkill from "./tracks/skill.js";
@ -105,6 +106,12 @@ const definitions: SkillDefinition[] = [
summary: "Create and manage track blocks — YAML-scheduled auto-updating content blocks in notes (weather, news, prices, status, dashboards). Insert at cursor (Cmd+K) or append to notes.",
content: tracksSkill,
},
{
id: "browser-control",
title: "Browser Control",
summary: "Control the embedded browser pane - open sites, inspect page state, and interact with indexed page elements.",
content: browserControlSkill,
},
];
const skillEntries = definitions.map((definition) => ({

View file

@ -0,0 +1,8 @@
import type { BrowserControlInput, BrowserControlResult } from '@x/shared/dist/browser-control.js';
/**
 * Contract for a service that carries out browser-control requests.
 */
export interface IBrowserControlService {
/**
 * Executes one browser-control request.
 *
 * @param input The request to execute.
 * @param ctx Optional context; `signal` is presumably used to cancel the request (confirm against the implementation).
 * @returns A promise resolving to the result of the request.
 */
execute(
input: BrowserControlInput,
ctx?: { signal?: AbortSignal },
): Promise<BrowserControlResult>;
}

View file

@ -17,6 +17,7 @@ import { WorkDir } from "../../config/config.js";
import { composioAccountsRepo } from "../../composio/repo.js";
import { executeAction as executeComposioAction, isConfigured as isComposioConfigured, searchTools as searchComposioTools } from "../../composio/client.js";
import { CURATED_TOOLKITS, CURATED_TOOLKIT_SLUGS } from "@x/shared/dist/composio.js";
import { BrowserControlInputSchema, type BrowserControlInput } from "@x/shared/dist/browser-control.js";
import type { ToolContext } from "./exec-tool.js";
import { generateText } from "ai";
import { createProvider } from "../../models/models.js";
@ -26,6 +27,7 @@ import { getGatewayProvider } from "../../models/gateway.js";
import { getAccessToken } from "../../auth/tokens.js";
import { API_URL } from "../../config/env.js";
import { updateContent, updateTrackBlock } from "../../knowledge/track/fileops.js";
import type { IBrowserControlService } from "../browser-control/service.js";
// Parser libraries are loaded dynamically inside parseFile.execute()
// to avoid pulling pdfjs-dist's DOM polyfills into the main bundle.
// Import paths are computed so esbuild cannot statically resolve them.
@ -562,7 +564,7 @@ export const BuiltinTools: z.infer<typeof BuiltinToolsSchema> = {
count: matches.length,
tool: 'ripgrep',
};
} catch (rgError) {
} catch {
// Fallback to basic grep if ripgrep not available or failed
const grepArgs = [
'-rn',
@ -997,6 +999,39 @@ export const BuiltinTools: z.infer<typeof BuiltinToolsSchema> = {
},
},
// ============================================================================
// Browser Control
// ============================================================================
'browser-control': {
    description: 'Control the embedded browser pane. Read the current page, inspect indexed interactable elements, and navigate/click/type/press keys in the active browser tab.',
    inputSchema: BrowserControlInputSchema,
    // The tool is offered only when a `browserControlService` can be resolved from the container.
    isAvailable: async () => {
        try {
            container.resolve<IBrowserControlService>('browserControlService');
        } catch {
            return false;
        }
        return true;
    },
    // Delegates to the resolved service; any thrown error is converted into a
    // failed result with an empty browser state instead of propagating.
    execute: async (input: BrowserControlInput, ctx?: ToolContext) => {
        try {
            const service = container.resolve<IBrowserControlService>('browserControlService');
            const outcome = await service.execute(input, { signal: ctx?.signal });
            return outcome;
        } catch (error) {
            const message = error instanceof Error ? error.message : 'Browser control is unavailable.';
            return {
                success: false,
                action: input.action,
                error: message,
                browser: {
                    activeTabId: null,
                    tabs: [],
                },
            };
        }
    },
},
// ============================================================================
// App Navigation
// ============================================================================

View file

@ -1,4 +1,4 @@
import { asClass, createContainer, InjectionMode } from "awilix";
import { asClass, asValue, createContainer, InjectionMode } from "awilix";
import { FSModelConfigRepo, IModelConfigRepo } from "../models/repo.js";
import { FSMcpConfigRepo, IMcpConfigRepo } from "../mcp/repo.js";
import { FSAgentsRepo, IAgentsRepo } from "../agents/repo.js";
@ -41,4 +41,11 @@ container.register({
slackConfigRepo: asClass<ISlackConfigRepo>(FSSlackConfigRepo).singleton(),
});
export default container;
export default container;
/**
 * Registers each entry of `values` on the shared container as a value registration.
 *
 * @param values Map of registration name to the value to register under that name.
 */
export function registerContainerValues(values: Record<string, unknown>): void {
    const pairs: Array<[string, ReturnType<typeof asValue>]> = [];
    for (const [name, instance] of Object.entries(values)) {
        pairs.push([name, asValue(instance)]);
    }
    container.register(Object.fromEntries(pairs));
}

View file

@ -0,0 +1,134 @@
import { z } from 'zod';
/** Schema for one browser tab: id, URL, title, back/forward availability and loading flag. */
export const BrowserTabStateSchema = z.object({
id: z.string(),
url: z.string(),
title: z.string(),
canGoBack: z.boolean(),
canGoForward: z.boolean(),
loading: z.boolean(),
});
/** Schema for overall browser state: the list of tabs and the active tab id (null when there is none). */
export const BrowserStateSchema = z.object({
activeTabId: z.string().nullable(),
tabs: z.array(BrowserTabStateSchema),
});
/**
 * Schema for one element listed in a page snapshot.
 * `index` is a positive integer; the descriptive fields are nullable strings.
 */
export const BrowserPageElementSchema = z.object({
index: z.number().int().positive(),
tagName: z.string(),
role: z.string().nullable(),
type: z.string().nullable(),
label: z.string().nullable(),
text: z.string().nullable(),
placeholder: z.string().nullable(),
href: z.string().nullable(),
disabled: z.boolean(),
});
/** Schema for a page snapshot: snapshot id, URL, title, loading flag, page text and its listed elements. */
export const BrowserPageSnapshotSchema = z.object({
snapshotId: z.string(),
url: z.string(),
title: z.string(),
loading: z.boolean(),
text: z.string(),
elements: z.array(BrowserPageElementSchema),
});
/** Enumerates every action name accepted in a browser-control request. */
export const BrowserControlActionSchema = z.enum([
'open',
'get-state',
'new-tab',
'switch-tab',
'close-tab',
'navigate',
'back',
'forward',
'reload',
'read-page',
'click',
'type',
'press',
'scroll',
'wait',
]);
// Optional fields that identify a page element: a positive integer index, a
// non-empty selector string, and a snapshot id. Spread into the input schema.
const BrowserElementTargetFields = {
index: z.number().int().positive().optional(),
selector: z.string().min(1).optional(),
snapshotId: z.string().optional(),
} as const;
/**
 * Schema for a browser-control request.
 *
 * The object is strict (unknown keys are rejected). On top of the per-field
 * validators, cross-field rules are enforced per action:
 * - `switch-tab` / `close-tab` require `tabId`
 * - `navigate` requires `target`
 * - `type` requires `text` (an empty string is accepted)
 * - `press` requires `key`
 * - `click` / `type` require `index` or `selector`
 */
export const BrowserControlInputSchema = z.object({
    action: BrowserControlActionSchema,
    target: z.string().min(1).optional(),
    tabId: z.string().min(1).optional(),
    text: z.string().optional(),
    key: z.string().min(1).optional(),
    direction: z.enum(['up', 'down']).optional(),
    amount: z.number().int().positive().max(5000).optional(),
    ms: z.number().int().positive().max(30000).optional(),
    maxElements: z.number().int().positive().max(100).optional(),
    maxTextLength: z.number().int().positive().max(20000).optional(),
    ...BrowserElementTargetFields,
}).strict().superRefine((input, ctx) => {
    // Reports a custom validation issue against a single field.
    const reportMissing = (field: string, message: string): void => {
        ctx.addIssue({
            code: z.ZodIssueCode.custom,
            path: [field],
            message,
        });
    };

    const { action } = input;
    const isTabAction = action === 'switch-tab' || action === 'close-tab';

    if (isTabAction && !input.tabId) {
        reportMissing('tabId', 'tabId is required for this action.');
    }
    if (action === 'navigate' && !input.target) {
        reportMissing('target', 'target is required for navigate.');
    }
    if (action === 'type' && input.text === undefined) {
        reportMissing('text', 'text is required for type.');
    }
    if (action === 'press' && !input.key) {
        reportMissing('key', 'key is required for press.');
    }

    const targetsElement = action === 'click' || action === 'type';
    const hasElementTarget = input.index !== undefined || input.selector !== undefined;
    if (targetsElement && !hasElementTarget) {
        reportMissing('index', 'Provide an element index or selector.');
    }
});
/**
 * Schema for a browser-control result: success flag, the action, an optional
 * message or error string, the browser state, and an optional page snapshot.
 */
export const BrowserControlResultSchema = z.object({
success: z.boolean(),
action: BrowserControlActionSchema,
message: z.string().optional(),
error: z.string().optional(),
browser: BrowserStateSchema,
page: BrowserPageSnapshotSchema.optional(),
});
// Static types inferred from the zod schemas so runtime validation and
// compile-time types share one definition.
export type BrowserTabState = z.infer<typeof BrowserTabStateSchema>;
export type BrowserState = z.infer<typeof BrowserStateSchema>;
export type BrowserPageElement = z.infer<typeof BrowserPageElementSchema>;
export type BrowserPageSnapshot = z.infer<typeof BrowserPageSnapshotSchema>;
export type BrowserControlAction = z.infer<typeof BrowserControlActionSchema>;
export type BrowserControlInput = z.infer<typeof BrowserControlInputSchema>;
export type BrowserControlResult = z.infer<typeof BrowserControlResultSchema>;

View file

@ -12,4 +12,5 @@ export * as blocks from './blocks.js';
export * as trackBlock from './track-block.js';
export * as frontmatter from './frontmatter.js';
export * as bases from './bases.js';
export * as browserControl from './browser-control.js';
export { PrefixLogger };

View file

@ -10,6 +10,7 @@ import { TrackEvent } from './track-block.js';
import { UserMessageContent } from './message.js';
import { RowboatApiConfig } from './rowboat-account.js';
import { ZListToolkitsResponse } from './composio.js';
import { BrowserStateSchema } from './browser-control.js';
// ============================================================================
// Runtime Validation Schemas (Single Source of Truth)
@ -701,29 +702,15 @@ const ipcSchemas = {
},
'browser:getState': {
req: z.null(),
res: z.object({
activeTabId: z.string().nullable(),
tabs: z.array(z.object({
id: z.string(),
url: z.string(),
title: z.string(),
canGoBack: z.boolean(),
canGoForward: z.boolean(),
loading: z.boolean(),
})),
}),
res: BrowserStateSchema,
},
'browser:didUpdateState': {
req: BrowserStateSchema,
res: z.null(),
},
'browser:didRequestPaneState': {
req: z.object({
activeTabId: z.string().nullable(),
tabs: z.array(z.object({
id: z.string(),
url: z.string(),
title: z.string(),
canGoBack: z.boolean(),
canGoForward: z.boolean(),
loading: z.boolean(),
})),
open: z.boolean(),
}),
res: z.null(),
},