/**
 * Looks up cached browser skills that apply to `url`.
 *
 * Best-effort: resolves to `undefined` when there is no URL, when the skill
 * index is not usable (loader status other than `ready` / `stale`), when
 * nothing matches, or when the lookup throws (the error is logged, not
 * propagated).
 *
 * @param url - URL of the page the browser just reported, if any.
 * @returns The `id`, `title` and `path` of each matching skill, or `undefined`.
 */
async function getSuggestedSkills(url: string | undefined): Promise<SuggestedBrowserSkill[] | undefined> {
  if (!url) {
    return undefined;
  }

  try {
    const loaded = await ensureLoaded();
    switch (loaded.status) {
      case 'ready':
      case 'stale': {
        // A stale index is still served; only `empty` / `error` fall through.
        const hits = matchSkillsForUrl(loaded.index, url);
        if (hits.length === 0) {
          return undefined;
        }
        return hits.map(({ id, title, path }) => ({ id, title, path }));
      }
      default:
        return undefined;
    }
  } catch (err) {
    console.warn('[browser-control] suggestedSkills lookup failed:', err);
    return undefined;
  }
}
{ ...success, suggestedSkills } : success; } case 'switch-tab': { @@ -99,7 +117,9 @@ export class ElectronBrowserControlService implements IBrowserControlService { } await browserViewManager.ensureActiveTabReady(signal); const page = await browserViewManager.readPageSummary(signal, { waitForReady: false }) ?? undefined; - return buildSuccessResult('navigate', `Navigated to ${target}.`, page); + const suggestedSkills = await getSuggestedSkills(page?.url); + const success = buildSuccessResult('navigate', `Navigated to ${target}.`, page); + return suggestedSkills ? { ...success, suggestedSkills } : success; } case 'back': { @@ -140,7 +160,9 @@ export class ElectronBrowserControlService implements IBrowserControlService { if (!result.ok || !result.page) { return buildErrorResult('read-page', result.error ?? 'Failed to read the current page.'); } - return buildSuccessResult('read-page', 'Read the current page.', result.page); + const suggestedSkills = await getSuggestedSkills(result.page.url); + const success = buildSuccessResult('read-page', 'Read the current page.', result.page); + return suggestedSkills ? { ...success, suggestedSkills } : success; } case 'click': { @@ -232,6 +254,20 @@ export class ElectronBrowserControlService implements IBrowserControlService { const page = await browserViewManager.readPageSummary(signal, { waitForReady: false }) ?? 
/**
 * Upper bound for a serialized eval result. Despite the name, it is compared
 * against the length of the JSON string (UTF-16 code units), not a byte count.
 */
const EVAL_RESULT_MAX_BYTES = 200_000;

/**
 * Converts an arbitrary value into a structure that `JSON.stringify` accepts.
 *
 * - `null`, `undefined`, strings, numbers and booleans pass through unchanged.
 * - `bigint` becomes its decimal string; functions and symbols become the
 *   placeholders `"[function]"` / `"[symbol]"`.
 * - Arrays and objects are copied recursively (own enumerable string keys only).
 * - A reference back to an object that is currently being expanded becomes
 *   `"[circular]"`. An object that merely appears more than once (siblings, a
 *   diamond) is serialized at every occurrence.
 * - If the JSON form is longer than `EVAL_RESULT_MAX_BYTES`, the result is
 *   `{ truncated: true, preview }` where `preview` is the leading slice of
 *   that JSON text.
 *
 * @param value - Any value.
 * @returns The coerced value, the truncation marker object, or `String(value)`
 *   if `JSON.stringify` throws.
 */
function safeSerialize(value: unknown): unknown {
  // Only the objects on the path from the root to the current node. Entries
  // are removed on the way back up so shared references are not mistaken for
  // cycles.
  const ancestors = new WeakSet<object>();

  const coerce = (v: unknown): unknown => {
    if (v === null || v === undefined) return v;
    if (typeof v === 'string' || typeof v === 'number' || typeof v === 'boolean') return v;
    if (typeof v === 'bigint') return v.toString();
    if (typeof v === 'function' || typeof v === 'symbol') return `[${typeof v}]`;
    if (typeof v !== 'object') return String(v);

    const obj = v as object;
    if (ancestors.has(obj)) return '[circular]';

    ancestors.add(obj);
    try {
      if (Array.isArray(obj)) return obj.map(coerce);
      const out: Record<string, unknown> = {};
      for (const [key, val] of Object.entries(obj)) {
        out[key] = coerce(val);
      }
      return out;
    } finally {
      ancestors.delete(obj);
    }
  };

  const coerced = coerce(value);
  try {
    const json = JSON.stringify(coerced);
    // `json` is undefined when the coerced value is `undefined`.
    if (json && json.length > EVAL_RESULT_MAX_BYTES) {
      return { truncated: true, preview: json.slice(0, EVAL_RESULT_MAX_BYTES) };
    }
  } catch {
    return String(value);
  }
  return coerced;
}
  /**
   * Evaluates caller-supplied JavaScript via `executeOnActiveTab` and returns
   * the outcome as a tagged result instead of throwing.
   *
   * @param code - JavaScript statements. They are placed inside the body of an
   *   async arrow function, so `await` is allowed and a value is produced only
   *   by an explicit `return`; a bare trailing expression resolves to
   *   `undefined`.
   * @param signal - Optional abort signal, forwarded to `executeOnActiveTab`.
   * @returns `{ ok: true, result }` with the value passed through
   *   `safeSerialize`, or `{ ok: false, error }` carrying the thrown error's
   *   message (or a generic message for non-Error throws).
   */
  async executeScript(code: string, signal?: AbortSignal): Promise<{ ok: true; result: unknown } | { ok: false; error: string }> {
    try {
      // The newline before `})` presumably keeps a trailing `//` comment in
      // `code` from swallowing the closing brace — NOTE(review): confirm intent.
      const wrapped = `(async () => { ${code} \n})()`;
      const raw = await this.executeOnActiveTab(wrapped, signal);
      const serialized = safeSerialize(raw);
      return { ok: true, result: serialized };
    } catch (error) {
      // Every failure inside the try block, including one thrown by
      // safeSerialize, is reported through the `ok: false` branch.
      return { ok: false, error: error instanceof Error ? error.message : 'Script evaluation failed.' };
    }
  }
These skills encode selectors, timing, and gotchas that would otherwise cost you several failed attempts to rediscover. If no skill matches, proceed — but do not skip this check. +5. Prefer acting on those numbered indices with ` + "`click`" + ` / ` + "`type`" + ` / ` + "`press`" + `. +6. After each action, read the returned page snapshot before deciding the next step — including re-checking ` + "`suggestedSkills`" + ` if the navigation landed you on a new domain. ## Actions @@ -92,12 +94,38 @@ Wait for the page to settle, useful after async UI changes. Parameters: - ` + "`ms`" + `: milliseconds to wait (optional) +### eval +Run arbitrary JavaScript in the active tab and return its value. Use this as an escape hatch when the structured actions above are insufficient — for example, submitting a form (` + "`form.submit()`" + `), reading DOM state (` + "`document.querySelector(...).textContent`" + `), or computing something that requires page-scoped APIs. + +Parameters: +- ` + "`code`" + `: JavaScript source. The code runs inside the body of an ` + "`async`" + ` IIFE, so you can ` + "`await`" + ` freely. Use an explicit ` + "`return`" + ` to send a value back — a bare trailing expression is not returned and yields ` + "`undefined`" + `. The returned value is serialized back. Non-serializable values (DOM nodes, functions) are coerced to placeholder strings. Large results are truncated. + +Example: +- ` + "`{ action: \"eval\", code: \"return document.querySelector('meta[name=user-login]')?.content ?? null\" }`" + ` + +Security: ` + "`eval`" + ` runs in the active tab's origin with the user's cookies. Do not exfiltrate credentials, cookies, or localStorage contents to third-party origins. + +## Companion Tools + +### http-fetch +Use for **unauthenticated** API calls (e.g., ` + "`api.github.com`" + `, public REST endpoints) where you don't need the browser's logged-in session. Often faster and cleaner than DOM scraping — many sites expose a public API that returns the same data. 
For authenticated requests that require the user's browser cookies, use ` + "`browser-control`" + ` with ` + "`action: \"eval\"`" + ` and call ` + "`fetch()`" + ` inside the page context instead. + +### load-browser-skill +Rowboat caches a library of browser skills (from ` + "`browser-use/browser-harness`" + `) indexed by both **domain** (github, linkedin, amazon, booking, …) and **interaction type** within a domain (e.g. ` + "`github/repo-actions`" + `, ` + "`github/scraping`" + `, ` + "`arxiv-bulk/*`" + `). Whenever ` + "`browser-control`" + ` returns a ` + "`suggestedSkills`" + ` array — which it does on ` + "`navigate`" + `, ` + "`new-tab`" + `, and ` + "`read-page`" + ` — treat it as a required reading step, not optional. Pick the entry that matches the current task (domain match first, then the interaction-specific variant if one exists) and call ` + "`load-browser-skill({ id: \"\" })`" + ` before attempting the action. + +You can also proactively call ` + "`load-browser-skill({ action: \"list\", site: \"\" })`" + ` when you know you're about to work on a site, to see what skills exist even if ` + "`suggestedSkills`" + ` is empty (e.g. before navigating). + +These skills are written against a Python harness, so treat them as **reference knowledge** — adapt the recipes into the actions above (especially ` + "`eval`" + ` + ` + "`http-fetch`" + ` for the ` + "`js(...)`" + ` and ` + "`http_get(...)`" + ` calls they use). The selectors, DOM gotchas, and sequencing are the durable part; the exact function names are not. + ## Important Rules - Prefer ` + "`read-page`" + ` before interacting. - Prefer element ` + "`index`" + ` over CSS selectors. - If the tool says the snapshot is stale, call ` + "`read-page`" + ` again. - After navigation, clicking, typing, pressing, or scrolling, use the returned page snapshot instead of assuming the page state. 
+- **Always check ` + "`suggestedSkills`" + ` after ` + "`navigate`" + `, ` + "`new-tab`" + `, or ` + "`read-page`" + `, and load the matching domain or interaction skill before acting.** Skipping this step is the single most common way to waste a dozen failed clicks on a site whose quirks are already documented. If the array is empty, proceed normally — but don't skip the check. +- Prefer structured actions (click/type/press) over ` + "`eval`" + ` when both work. Reach for ` + "`eval`" + ` when the site fights synthetic events, when you need to submit a form directly, or when you need to read DOM state the structured actions don't surface. +- For read-only data, check if ` + "`http-fetch`" + ` against the site's public API works before scraping the DOM. - Use Rowboat's browser for live interaction. Use web search tools for research where a live session is unnecessary. - Do not wrap browser URLs or browser pages in ` + "```filepath" + ` blocks. Filepath cards are only for real files on disk, not web pages or browser tabs. - If you mention a page the browser opened, use plain text for the URL/title instead of trying to create a clickable file card. 
diff --git a/apps/x/packages/core/src/application/browser-skills/index.ts b/apps/x/packages/core/src/application/browser-skills/index.ts new file mode 100644 index 00000000..2040c963 --- /dev/null +++ b/apps/x/packages/core/src/application/browser-skills/index.ts @@ -0,0 +1,3 @@ +export { ensureLoaded, readSkillContent, refreshFromRemote } from './loader.js'; +export type { SkillEntry, SkillsIndex, LoaderStatus } from './loader.js'; +export { matchSkillsForUrl } from './matcher.js'; diff --git a/apps/x/packages/core/src/application/browser-skills/loader.ts b/apps/x/packages/core/src/application/browser-skills/loader.ts new file mode 100644 index 00000000..3e68d7ca --- /dev/null +++ b/apps/x/packages/core/src/application/browser-skills/loader.ts @@ -0,0 +1,215 @@ +import * as path from 'node:path'; +import * as fs from 'node:fs/promises'; +import { WorkDir } from '../../config/config.js'; + +const REPO_OWNER = 'browser-use'; +const REPO_NAME = 'browser-harness'; +const REPO_BRANCH = 'main'; +const DOMAIN_SKILLS_PREFIX = 'domain-skills/'; + +const MANIFEST_TTL_MS = 24 * 60 * 60 * 1000; +const FETCH_TIMEOUT_MS = 20_000; + +export type SkillEntry = { + id: string; // e.g. "github/repo-actions" + site: string; // e.g. "github" + fileName: string; // e.g. "repo-actions.md" + title: string; // first H1 from the markdown, or a derived title + path: string; // relative repo path, e.g. 
/**
 * Returns the text of the first level-1 markdown heading (`# Title`).
 *
 * The `#` must start a line and be followed by at least one space or tab on
 * the same line. A lone `#` followed by a line break is not a heading, so the
 * text of a later line is never reported as the title.
 *
 * @param markdown - Markdown source to scan.
 * @param fallback - Value returned when no level-1 heading is found.
 * @returns The trimmed heading text, or `fallback`.
 */
function extractTitle(markdown: string, fallback: string): string {
  // `[ \t]` rather than `\s`: `\s` also matches line terminators and would let
  // the capture start on a following line.
  const match = /^#[ \t]+(.+?)[ \t]*$/m.exec(markdown);
  const title = match?.[1]?.trim();
  return title ? title : fallback;
}
type GithubTreeNode = { path: string; type: string; sha: string };

/**
 * Resolves the tree SHA of the configured branch and lists every markdown
 * blob under `DOMAIN_SKILLS_PREFIX`.
 *
 * Makes two GitHub API calls through `fetchWithTimeout`: one for the branch
 * (to obtain the tree SHA) and one for the recursive tree listing.
 *
 * @returns The tree SHA and the repository-relative paths of the skill files.
 * @throws Error when either request is not OK, when the branch response lacks
 *   a tree SHA, or when the tree response carries no `tree` array.
 */
async function fetchRepoTree(): Promise<{ treeSha: string; skillPaths: string[] }> {
  const branchUrl = `https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/branches/${REPO_BRANCH}`;
  const branchRes = await fetchWithTimeout(branchUrl);
  if (!branchRes.ok) {
    throw new Error(`GitHub branch fetch failed: ${branchRes.status} ${branchRes.statusText}`);
  }
  // The payload is external input, so every level is treated as optional.
  const branch = (await branchRes.json()) as { commit?: { commit?: { tree?: { sha?: string } } } };
  const treeSha = branch.commit?.commit?.tree?.sha;
  if (!treeSha) throw new Error('Could not resolve tree SHA from branch response.');

  const treeUrl = `https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/git/trees/${treeSha}?recursive=1`;
  const treeRes = await fetchWithTimeout(treeUrl);
  if (!treeRes.ok) {
    throw new Error(`GitHub tree fetch failed: ${treeRes.status} ${treeRes.statusText}`);
  }
  const tree = (await treeRes.json()) as { tree?: GithubTreeNode[]; truncated?: boolean };
  if (!Array.isArray(tree.tree)) {
    throw new Error('GitHub tree response did not contain a tree listing.');
  }
  if (tree.truncated) {
    // GitHub sets `truncated` when the recursive listing exceeded its limits;
    // the listing is still used, but skills may be missing from it.
    console.warn('[browser-skills] GitHub tree listing was truncated; some skills may be missing from the index.');
  }

  const skillPaths = tree.tree
    .filter((node) => node.type === 'blob' && node.path.startsWith(DOMAIN_SKILLS_PREFIX) && node.path.endsWith('.md'))
    .map((node) => node.path);

  return { treeSha, skillPaths };
}
/**
 * Downloads every skill file listed by `fetchRepoTree`, stores each one under
 * `skillsDir()`, and writes a new manifest describing what was stored.
 *
 * A file that fails to download or write is logged and left out of the index.
 * If the repository lists skills but none could be stored, the call throws
 * before the manifest is written, so an existing manifest is not replaced by
 * an empty one that would count as fresh until the TTL expires.
 *
 * @returns The index that was written to the manifest.
 * @throws Error when listing the repository fails, when no listed skill could
 *   be downloaded, or when the manifest cannot be written.
 */
export async function refreshFromRemote(): Promise<SkillsIndex> {
  await ensureCacheDir();
  const { treeSha, skillPaths } = await fetchRepoTree();
  const root = skillsDir();

  const downloaded = await Promise.all(skillPaths.map(async (repoPath): Promise<SkillEntry | null> => {
    const parsed = parseRepoPath(repoPath);
    if (!parsed) return null;

    const localPath = path.join(root, path.join(parsed.site, parsed.fileName));
    // The path comes from a remote listing; never write outside the cache.
    const relative = path.relative(root, localPath);
    if (relative === '' || relative.startsWith('..') || path.isAbsolute(relative)) {
      console.warn(`[browser-skills] Skipping ${repoPath}: resolves outside the skills cache.`);
      return null;
    }

    try {
      const content = await fetchRawFile(repoPath);
      await fs.mkdir(path.dirname(localPath), { recursive: true });
      await fs.writeFile(localPath, content, 'utf8');
      return {
        id: parsed.id,
        site: parsed.site,
        fileName: parsed.fileName,
        title: extractTitle(content, parsed.id),
        path: repoPath,
        localPath,
      };
    } catch (err) {
      console.warn(`[browser-skills] Failed to fetch ${repoPath}:`, err);
      return null;
    }
  }));

  const entries = downloaded.filter((entry): entry is SkillEntry => entry !== null);
  if (skillPaths.length > 0 && entries.length === 0) {
    throw new Error(`Failed to download any of the ${skillPaths.length} browser skill files.`);
  }

  entries.sort((a, b) => a.id.localeCompare(b.id));

  const index: SkillsIndex = {
    fetchedAt: Date.now(),
    treeSha,
    entries,
  };
  await writeManifest(index);
  return index;
}
/**
 * Expands a browser-harness `domain-skills/<site>/` folder name into the
 * hostname tokens it may stand for, e.g.
 *   "booking-com" -> ["booking-com", "bookingcom", "booking.com"]
 *   "github"      -> ["github"]
 *   "dev-to"      -> ["dev-to", "devto", "dev.to"]
 *
 * @param site - Folder name; compared case-insensitively.
 * @returns Unique, non-empty, lower-case candidates.
 */
function siteCandidates(site: string): string[] {
  const name = site.trim().toLowerCase();
  const candidates = new Set<string>([
    name,
    name.replace(/-/g, ''),
    name.replace(/-/g, '.'),
  ]);
  for (const tld of ['com', 'org', 'io']) {
    if (name.endsWith(`-${tld}`)) {
      candidates.add(`${name.slice(0, -(tld.length + 1))}.${tld}`);
    }
  }
  return Array.from(candidates).filter((candidate) => candidate.length > 0);
}

/**
 * Parses `url` and returns its lower-cased hostname without a trailing dot,
 * or `null` when `url` is not a valid absolute URL.
 */
function extractHostname(url: string): string | null {
  try {
    return new URL(url).hostname.toLowerCase().replace(/\.$/, '');
  } catch {
    return null;
  }
}

/**
 * Tests one candidate against a hostname on label boundaries.
 * A candidate containing a dot is treated as a domain and must equal the
 * hostname or be a dot-separated suffix of it. A candidate without a dot must
 * equal one whole hostname label. A raw substring test is deliberately not
 * used: it would let site "x" match "example.com".
 */
function hostnameMatches(hostname: string, candidate: string): boolean {
  if (candidate.includes('.')) {
    return hostname === candidate || hostname.endsWith(`.${candidate}`);
  }
  return hostname.split('.').includes(candidate);
}

/**
 * Returns the skills whose site folder matches the hostname of `url`.
 *
 * The signature is structural: any index whose entries carry a string `site`
 * is accepted, and the same entry objects are returned.
 *
 * @param index - Skills index; only `entries[].site` is read.
 * @param url - Absolute URL of the current page.
 * @param limit - Maximum number of entries to return (default 5); values
 *   below zero are treated as zero.
 * @returns Matching entries grouped by site in order of first appearance, or
 *   an empty array when `url` cannot be parsed.
 */
export function matchSkillsForUrl<T extends { site: string }>(
  index: { entries: readonly T[] },
  url: string,
  limit = 5,
): T[] {
  const hostname = extractHostname(url);
  if (!hostname) return [];

  const bySite = new Map<string, T[]>();
  for (const entry of index.entries) {
    // The index may come from a JSON cache, so check the field at runtime.
    if (typeof entry.site !== 'string') continue;
    const group = bySite.get(entry.site);
    if (group) {
      group.push(entry);
    } else {
      bySite.set(entry.site, [entry]);
    }
  }

  const matched: T[] = [];
  for (const [site, entries] of bySite) {
    if (siteCandidates(site).some((candidate) => hostnameMatches(hostname, candidate))) {
      matched.push(...entries);
    }
  }

  return matched.slice(0, Math.max(0, limit));
}
ensureLoaded as ensureBrowserSkillsLoaded, readSkillContent as readBrowserSkillContent, refreshFromRemote as refreshBrowserSkills } from "../browser-skills/index.js"; import type { ToolContext } from "./exec-tool.js"; import { generateText } from "ai"; import { createProvider } from "../../models/models.js"; @@ -994,6 +995,147 @@ export const BuiltinTools: z.infer = { }, }, + // ============================================================================ + // HTTP Fetch + // ============================================================================ + + 'http-fetch': { + description: 'Make a plain HTTP request (GET/POST/etc.) and return the response. Use this for API calls that do not require a logged-in browser session. For authenticated requests that need the user\'s active browser cookies, use browser-control with action "eval" and call fetch() inside the page context instead.', + inputSchema: z.object({ + url: z.string().url().describe('Absolute URL to fetch.'), + method: z.enum(['GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD']).optional().describe('HTTP method. Defaults to GET.'), + headers: z.record(z.string(), z.string()).optional().describe('Request headers.'), + body: z.string().optional().describe('Request body as a string. For JSON, stringify first and set Content-Type: application/json.'), + responseType: z.enum(['text', 'json']).optional().describe('How to parse the response body. Defaults to text.'), + timeoutMs: z.number().int().positive().max(60000).optional().describe('Request timeout in milliseconds. Defaults to 15000.'), + }), + execute: async (input: { + url: string; + method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE' | 'HEAD'; + headers?: Record; + body?: string; + responseType?: 'text' | 'json'; + timeoutMs?: number; + }) => { + const MAX_BODY_BYTES = 500_000; + const timeout = input.timeoutMs ?? 
15000; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeout); + try { + const response = await fetch(input.url, { + method: input.method ?? 'GET', + headers: input.headers, + body: input.body, + signal: controller.signal, + redirect: 'follow', + }); + const responseHeaders: Record = {}; + response.headers.forEach((value, key) => { responseHeaders[key] = value; }); + const rawText = await response.text(); + const truncated = rawText.length > MAX_BODY_BYTES; + const text = truncated ? rawText.slice(0, MAX_BODY_BYTES) : rawText; + let parsed: unknown = undefined; + if (input.responseType === 'json') { + try { + parsed = JSON.parse(rawText); + } catch (err) { + return { + success: false, + status: response.status, + statusText: response.statusText, + url: response.url, + headers: responseHeaders, + error: `Response was not valid JSON: ${err instanceof Error ? err.message : 'parse error'}`, + bodyPreview: text.slice(0, 2000), + }; + } + } + return { + success: response.ok, + status: response.status, + statusText: response.statusText, + url: response.url, + headers: responseHeaders, + body: input.responseType === 'json' ? parsed : text, + truncated, + }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? error.message : 'HTTP fetch failed.', + aborted: controller.signal.aborted, + }; + } finally { + clearTimeout(timer); + } + }, + }, + + // ============================================================================ + // Browser Skills (browser-use/browser-harness domain-skills cache) + // ============================================================================ + + 'load-browser-skill': { + description: 'Load a site-specific browser skill (from the browser-use/browser-harness domain-skills library) by id. Returns the full markdown content with selectors, gotchas, and recipes for the target site. Call this after browser-control responses surface a matching skill in suggestedSkills. 
Pass action="list" to see all available skills. Skills are fetched on first use and cached locally; pass action="refresh" to force an update from upstream.', + inputSchema: z.object({ + action: z.enum(['load', 'list', 'refresh']).optional().describe('load: fetch a skill by id (default). list: list all cached skills. refresh: re-fetch the library from upstream.'), + id: z.string().optional().describe('Skill id (e.g., "github/repo-actions") — required for load.'), + site: z.string().optional().describe('Filter list results to a single site (e.g., "github").'), + }), + execute: async (input: { action?: 'load' | 'list' | 'refresh'; id?: string; site?: string }) => { + const action = input.action ?? 'load'; + try { + if (action === 'refresh') { + const index = await refreshBrowserSkills(); + return { + success: true, + message: `Refreshed ${index.entries.length} skill${index.entries.length === 1 ? '' : 's'} from upstream.`, + count: index.entries.length, + treeSha: index.treeSha, + }; + } + + if (action === 'list') { + const status = await ensureBrowserSkillsLoaded(); + if (status.status === 'error') { + return { success: false, error: status.error }; + } + if (status.status === 'empty') { + return { success: false, error: 'No browser skills cached yet.' }; + } + const entries = status.index.entries + .filter((e) => !input.site || e.site === input.site) + .map((e) => ({ id: e.id, title: e.title, site: e.site })); + return { + success: true, + count: entries.length, + skills: entries, + cacheAgeMs: Date.now() - status.index.fetchedAt, + refreshing: status.status === 'stale' ? status.refreshing : false, + }; + } + + if (!input.id) { + return { success: false, error: 'id is required for load.' 
}; + } + const result = await readBrowserSkillContent(input.id); + if (!result.ok) { + return { success: false, error: result.error }; + } + return { + success: true, + id: result.entry.id, + title: result.entry.title, + site: result.entry.site, + path: result.entry.path, + content: result.content, + }; + } catch (err) { + return { success: false, error: err instanceof Error ? err.message : 'Failed to load browser skill.' }; + } + }, + }, + // ============================================================================ // Browser Control // ============================================================================ diff --git a/apps/x/packages/shared/src/browser-control.ts b/apps/x/packages/shared/src/browser-control.ts index e1418a5e..c8277712 100644 --- a/apps/x/packages/shared/src/browser-control.ts +++ b/apps/x/packages/shared/src/browser-control.ts @@ -51,6 +51,7 @@ export const BrowserControlActionSchema = z.enum([ 'press', 'scroll', 'wait', + 'eval', ]); const BrowserElementTargetFields = { @@ -70,6 +71,7 @@ export const BrowserControlInputSchema = z.object({ ms: z.number().int().positive().max(30000).optional(), maxElements: z.number().int().positive().max(100).optional(), maxTextLength: z.number().int().positive().max(20000).optional(), + code: z.string().min(1).max(50000).optional(), ...BrowserElementTargetFields, }).strict().superRefine((value, ctx) => { const needsElementTarget = value.action === 'click' || value.action === 'type'; @@ -114,6 +116,20 @@ export const BrowserControlInputSchema = z.object({ message: 'Provide an element index or selector.', }); } + + if (value.action === 'eval' && !value.code) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + path: ['code'], + message: 'code is required for eval.', + }); + } +}); + +export const SuggestedBrowserSkillSchema = z.object({ + id: z.string(), + title: z.string(), + path: z.string(), }); export const BrowserControlResultSchema = z.object({ @@ -123,6 +139,8 @@ export const 
BrowserControlResultSchema = z.object({ error: z.string().optional(), browser: BrowserStateSchema, page: BrowserPageSnapshotSchema.optional(), + result: z.unknown().optional(), + suggestedSkills: z.array(SuggestedBrowserSkillSchema).optional(), }); export type BrowserTabState = z.infer; @@ -132,3 +150,4 @@ export type BrowserPageSnapshot = z.infer; export type BrowserControlAction = z.infer; export type BrowserControlInput = z.infer; export type BrowserControlResult = z.infer; +export type SuggestedBrowserSkill = z.infer;