/**
 * Looks up cached browser skills that match the page currently shown in a tab.
 *
 * This is a best-effort hint: when there is no URL, when the loader reports
 * anything other than `ready` or `stale`, when nothing matches, or when the
 * lookup throws, the result is `undefined` so callers can simply omit the
 * `suggestedSkills` field. A thrown error is logged with `console.warn` and
 * never propagated.
 *
 * @param url URL of the page that was just read, or `undefined` if none is known.
 * @returns The `id`, `title` and `path` of each matching skill, or `undefined`
 *          when there is nothing to suggest.
 */
async function getSuggestedSkills(url: string | undefined): Promise<SuggestedBrowserSkill[] | undefined> {
    if (!url) return undefined;
    try {
        const status = await ensureLoaded();
        // Only `ready` and `stale` carry an index; a stale index is still usable.
        if (status.status !== 'ready' && status.status !== 'stale') return undefined;

        const matched = matchSkillsForUrl(status.index, url);
        if (matched.length === 0) return undefined;

        // Expose only the fields of the suggestion shape, not the on-disk location.
        return matched.map(({ id, title, path }) => ({ id, title, path }));
    } catch (err) {
        console.warn('[browser-control] suggestedSkills lookup failed:', err);
        return undefined;
    }
}
{ ...success, suggestedSkills } : success; } case 'switch-tab': { @@ -99,7 +117,9 @@ export class ElectronBrowserControlService implements IBrowserControlService { } await browserViewManager.ensureActiveTabReady(signal); const page = await browserViewManager.readPageSummary(signal, { waitForReady: false }) ?? undefined; - return buildSuccessResult('navigate', `Navigated to ${target}.`, page); + const suggestedSkills = await getSuggestedSkills(page?.url); + const success = buildSuccessResult('navigate', `Navigated to ${target}.`, page); + return suggestedSkills ? { ...success, suggestedSkills } : success; } case 'back': { @@ -140,7 +160,9 @@ export class ElectronBrowserControlService implements IBrowserControlService { if (!result.ok || !result.page) { return buildErrorResult('read-page', result.error ?? 'Failed to read the current page.'); } - return buildSuccessResult('read-page', 'Read the current page.', result.page); + const suggestedSkills = await getSuggestedSkills(result.page.url); + const success = buildSuccessResult('read-page', 'Read the current page.', result.page); + return suggestedSkills ? { ...success, suggestedSkills } : success; } case 'click': { diff --git a/apps/x/packages/core/src/application/assistant/skills/browser-control/skill.ts b/apps/x/packages/core/src/application/assistant/skills/browser-control/skill.ts index f1c06f0c..868ce8e8 100644 --- a/apps/x/packages/core/src/application/assistant/skills/browser-control/skill.ts +++ b/apps/x/packages/core/src/application/assistant/skills/browser-control/skill.ts @@ -14,8 +14,10 @@ Use this skill when the user asks you to open a website, browse in-app, search t - page ` + "`url`" + ` and ` + "`title`" + ` - visible page text - interactable elements with numbered ` + "`index`" + ` values -4. Prefer acting on those numbered indices with ` + "`click`" + ` / ` + "`type`" + ` / ` + "`press`" + `. -5. After each action, read the returned page snapshot before deciding the next step. 
+ - ` + "`suggestedSkills`" + ` — site-specific and interaction-specific skill hints for the current page +4. **Always inspect ` + "`suggestedSkills`" + ` before acting.** If any skill in the list matches what the user asked for (site or task), call ` + "`load-browser-skill({ id: \"\" })`" + ` *first*, read it in full, then plan your actions. These skills encode selectors, timing, and gotchas that would otherwise cost you several failed attempts to rediscover. If no skill matches, proceed — but do not skip this check. +5. Prefer acting on those numbered indices with ` + "`click`" + ` / ` + "`type`" + ` / ` + "`press`" + `. +6. After each action, read the returned page snapshot before deciding the next step — including re-checking ` + "`suggestedSkills`" + ` if the navigation landed you on a new domain. ## Actions @@ -92,12 +94,23 @@ Wait for the page to settle, useful after async UI changes. Parameters: - ` + "`ms`" + `: milliseconds to wait (optional) +## Companion Tools + +### load-browser-skill +Rowboat caches a library of browser skills (from ` + "`browser-use/browser-harness`" + `) indexed by both **domain** (github, linkedin, amazon, booking, …) and **interaction type** within a domain (e.g. ` + "`github/repo-actions`" + `, ` + "`github/scraping`" + `, ` + "`arxiv-bulk/*`" + `). Whenever ` + "`browser-control`" + ` returns a ` + "`suggestedSkills`" + ` array — which it does on ` + "`navigate`" + `, ` + "`new-tab`" + `, and ` + "`read-page`" + ` — treat it as a required reading step, not optional. Pick the entry that matches the current task (domain match first, then the interaction-specific variant if one exists) and call ` + "`load-browser-skill({ id: \"\" })`" + ` before attempting the action. + +You can also proactively call ` + "`load-browser-skill({ action: \"list\", site: \"\" })`" + ` when you know you're about to work on a site, to see what skills exist even if ` + "`suggestedSkills`" + ` is empty (e.g. before navigating). 
+ +These skills are written against a Python harness, so treat them as **reference knowledge**. Reuse the selectors, timing, and sequencing, but adapt them to Rowboat's structured browser actions. **Do not look for or call ` + "`http-fetch`" + `.** If a browser-harness recipe suggests ` + "`js(...)`" + ` or ` + "`http_get(...)`" + ` style shortcuts, treat those as non-portable and fall back to reading and interacting with the page itself. + ## Important Rules - Prefer ` + "`read-page`" + ` before interacting. - Prefer element ` + "`index`" + ` over CSS selectors. - If the tool says the snapshot is stale, call ` + "`read-page`" + ` again. - After navigation, clicking, typing, pressing, or scrolling, use the returned page snapshot instead of assuming the page state. +- **Always check ` + "`suggestedSkills`" + ` after ` + "`navigate`" + `, ` + "`new-tab`" + `, or ` + "`read-page`" + `, and load the matching domain or interaction skill before acting.** Skipping this step is the single most common way to waste a dozen failed clicks on a site whose quirks are already documented. If the array is empty, proceed normally — but don't skip the check. +- Do not try to use ` + "`http-fetch`" + `. If a browser-harness recipe mentions ` + "`http_get(...)`" + ` or a public API shortcut, adapt it to DOM-based browsing instead. - Use Rowboat's browser for live interaction. Use web search tools for research where a live session is unnecessary. - Do not wrap browser URLs or browser pages in ` + "```filepath" + ` blocks. Filepath cards are only for real files on disk, not web pages or browser tabs. - If you mention a page the browser opened, use plain text for the URL/title instead of trying to create a clickable file card. 
const REPO_OWNER = 'browser-use';
const REPO_NAME = 'browser-harness';
const REPO_BRANCH = 'main';
const DOMAIN_SKILLS_PREFIX = 'domain-skills/';

/** A cached manifest younger than this (24 hours) is served without refreshing. */
const MANIFEST_TTL_MS = 24 * 60 * 60 * 1000;
/** Upper bound for a single HTTP request, including reading its body. */
const FETCH_TIMEOUT_MS = 20_000;

/** One cached skill markdown file. */
export type SkillEntry = {
    id: string;        // e.g. "github/repo-actions"
    site: string;      // e.g. "github"
    fileName: string;  // e.g. "repo-actions.md"
    title: string;     // first H1 from the markdown, or a derived title
    path: string;      // relative repo path, e.g. "domain-skills/github/repo-actions.md"
    localPath: string; // absolute path on disk
};

/** The manifest persisted next to the cached skill files. */
export type SkillsIndex = {
    fetchedAt: number; // Date.now() at the time the manifest was built
    treeSha: string;   // SHA of the upstream git tree the entries were listed from
    entries: SkillEntry[];
};

/**
 * Result of `ensureLoaded`.
 * NOTE(review): no code in this module returns `empty`; it is kept because
 * callers already branch on it.
 */
export type LoaderStatus =
    | { status: 'ready'; index: SkillsIndex }
    | { status: 'stale'; index: SkillsIndex; refreshing: boolean }
    | { status: 'empty' }
    | { status: 'error'; error: string };

const cacheRoot = () => path.join(WorkDir, 'cache', 'browser-skills');
const skillsDir = () => path.join(cacheRoot(), 'domain-skills');
const manifestPath = () => path.join(cacheRoot(), 'manifest.json');

/** Creates the skills cache directory (and its parents) if missing. */
async function ensureCacheDir(): Promise<void> {
    await fs.mkdir(skillsDir(), { recursive: true });
}

/**
 * Reads the cached manifest.
 *
 * @returns The parsed manifest, or `null` when the file is missing, is not
 *          valid JSON, or has no `entries` array.
 */
async function readManifest(): Promise<SkillsIndex | null> {
    try {
        const raw = await fs.readFile(manifestPath(), 'utf8');
        const parsed = JSON.parse(raw) as SkillsIndex;
        if (!parsed.entries || !Array.isArray(parsed.entries)) return null;
        return parsed;
    } catch {
        return null;
    }
}

/** Persists the manifest as pretty-printed JSON, creating the cache directory first. */
async function writeManifest(index: SkillsIndex): Promise<void> {
    await ensureCacheDir();
    await fs.writeFile(manifestPath(), JSON.stringify(index, null, 2), 'utf8');
}

/**
 * Returns the text of the first level-1 markdown heading (`# Title`), trimmed,
 * or `fallback` when the document has none.
 *
 * Only spaces and tabs may separate `#` from the title, so a bare `#` line
 * cannot capture the following line as its title.
 */
function extractTitle(markdown: string, fallback: string): string {
    const match = markdown.match(/^#[ \t]+(.+?)[ \t]*$/m);
    if (match?.[1]) return match[1].trim();
    return fallback;
}

/**
 * Performs a `fetch` and hands the response to `consume`, aborting the request
 * if the whole exchange takes longer than `FETCH_TIMEOUT_MS`.
 *
 * The timer stays armed until `consume` settles, so a stalled body download is
 * bounded as well as the wait for response headers.
 *
 * @param url     Absolute URL to request.
 * @param consume Reads whatever is needed from the response (status, body).
 * @param init    Extra request options; its headers override the defaults.
 * @returns Whatever `consume` resolves to.
 */
async function fetchWithTimeout<T>(
    url: string,
    consume: (res: Response) => Promise<T>,
    init?: RequestInit,
): Promise<T> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

    // Merge through `Headers` so caller headers win whether they were given as
    // a plain object, an array of pairs, or a `Headers` instance.
    const headers = new Headers({
        'User-Agent': 'rowboat-browser-skills',
        Accept: 'application/vnd.github+json',
    });
    new Headers(init?.headers).forEach((value, name) => headers.set(name, value));

    try {
        const res = await fetch(url, { ...init, signal: controller.signal, headers });
        return await consume(res);
    } finally {
        clearTimeout(timer);
    }
}

type GithubTreeNode = { path: string; type: string; sha: string };

/**
 * Resolves the tree SHA of `REPO_BRANCH` and lists every markdown blob under
 * `DOMAIN_SKILLS_PREFIX`.
 *
 * @throws Error when either GitHub request is not OK, the branch response has
 *         no tree SHA, or the tree response has no `tree` array.
 */
async function fetchRepoTree(): Promise<{ treeSha: string; skillPaths: string[] }> {
    const branchUrl = `https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/branches/${REPO_BRANCH}`;
    const branch = await fetchWithTimeout(branchUrl, async (res) => {
        if (!res.ok) {
            throw new Error(`GitHub branch fetch failed: ${res.status} ${res.statusText}`);
        }
        return (await res.json()) as { commit?: { commit?: { tree?: { sha?: string } } } };
    });
    const treeSha = branch.commit?.commit?.tree?.sha;
    if (!treeSha) throw new Error('Could not resolve tree SHA from branch response.');

    const treeUrl = `https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/git/trees/${treeSha}?recursive=1`;
    const tree = await fetchWithTimeout(treeUrl, async (res) => {
        if (!res.ok) {
            throw new Error(`GitHub tree fetch failed: ${res.status} ${res.statusText}`);
        }
        return (await res.json()) as { tree?: GithubTreeNode[]; truncated?: boolean };
    });
    if (!Array.isArray(tree.tree)) {
        throw new Error('GitHub tree response did not contain a tree listing.');
    }
    if (tree.truncated) {
        console.warn('[browser-skills] GitHub tree listing was truncated; some skills may be missing.');
    }

    const skillPaths = tree.tree
        .filter((n) => n.type === 'blob' && n.path.startsWith(DOMAIN_SKILLS_PREFIX) && n.path.endsWith('.md'))
        .map((n) => n.path);

    return { treeSha, skillPaths };
}

/**
 * Downloads one file of the repository as text from raw.githubusercontent.com.
 *
 * @throws Error when the response is not OK.
 */
async function fetchRawFile(repoPath: string): Promise<string> {
    // Encode each segment so characters such as `#` or `?` stay part of the path.
    const encodedPath = repoPath.split('/').map(encodeURIComponent).join('/');
    const url = `https://raw.githubusercontent.com/${REPO_OWNER}/${REPO_NAME}/${REPO_BRANCH}/${encodedPath}`;
    return fetchWithTimeout(
        url,
        async (res) => {
            if (!res.ok) {
                throw new Error(`Raw file fetch failed for ${repoPath}: ${res.status} ${res.statusText}`);
            }
            return res.text();
        },
        { headers: { Accept: 'text/plain' } },
    );
}

/**
 * Splits a repo path such as `domain-skills/github/repo-actions.md` into the
 * skill id (`github/repo-actions`), site (`github`) and file name
 * (`repo-actions.md`; deeper paths keep their sub-directories).
 *
 * @returns `null` when the path is outside `DOMAIN_SKILLS_PREFIX`, sits directly
 *          under it (no site folder), or contains an empty, `.`, `..` or
 *          backslash-bearing segment. Those are rejected because the result is
 *          joined onto the local cache directory.
 */
function parseRepoPath(repoPath: string): { id: string; site: string; fileName: string } | null {
    if (!repoPath.startsWith(DOMAIN_SKILLS_PREFIX)) return null;
    const rel = repoPath.slice(DOMAIN_SKILLS_PREFIX.length);

    const firstSlash = rel.indexOf('/');
    if (firstSlash <= 0) return null;

    const unsafe = rel.split('/').some((seg) => seg === '' || seg === '.' || seg === '..' || seg.includes('\\'));
    if (unsafe) return null;

    return {
        id: rel.replace(/\.md$/, ''),
        site: rel.slice(0, firstSlash),
        fileName: rel.slice(firstSlash + 1),
    };
}

/**
 * Downloads every skill file listed upstream into the cache directory and
 * writes a new manifest.
 *
 * Individual download failures are logged and skipped. If downloads failed and
 * not a single skill could be stored, the function throws instead of writing an
 * empty manifest, which would otherwise count as fresh for `MANIFEST_TTL_MS`.
 *
 * @returns The manifest that was written, entries sorted by id.
 * @throws Error when the tree listing fails or no skill could be downloaded.
 */
export async function refreshFromRemote(): Promise<SkillsIndex> {
    await ensureCacheDir();
    const { treeSha, skillPaths } = await fetchRepoTree();

    let failures = 0;
    const results = await Promise.all(skillPaths.map(async (repoPath): Promise<SkillEntry | null> => {
        const parsed = parseRepoPath(repoPath);
        if (!parsed) return null;
        try {
            const content = await fetchRawFile(repoPath);
            const localPath = path.join(skillsDir(), parsed.site, parsed.fileName);
            await fs.mkdir(path.dirname(localPath), { recursive: true });
            await fs.writeFile(localPath, content, 'utf8');
            return {
                id: parsed.id,
                site: parsed.site,
                fileName: parsed.fileName,
                title: extractTitle(content, parsed.id),
                path: repoPath,
                localPath,
            };
        } catch (err) {
            failures += 1;
            console.warn(`[browser-skills] Failed to fetch ${repoPath}:`, err);
            return null;
        }
    }));

    const entries = results
        .filter((entry): entry is SkillEntry => entry !== null)
        .sort((a, b) => a.id.localeCompare(b.id));

    if (entries.length === 0 && failures > 0) {
        throw new Error(`Failed to download any browser skill (${failures} of ${skillPaths.length} fetches failed).`);
    }

    const index: SkillsIndex = {
        fetchedAt: Date.now(),
        treeSha,
        entries,
    };
    await writeManifest(index);
    return index;
}

/** The refresh currently running, shared so concurrent callers do not start a second one. */
let inFlightRefresh: Promise<SkillsIndex> | null = null;

/**
 * Returns the skills index, refreshing it from upstream when needed.
 *
 * - Manifest younger than `MANIFEST_TTL_MS` and no `forceRefresh`: `ready`.
 * - Older manifest and no `forceRefresh`: `stale` with the old index, while a
 *   background refresh is started (unless one is already running). A failure
 *   of that background refresh is only logged.
 * - No manifest, or `forceRefresh`: waits for a refresh and returns `ready`,
 *   or `error` if the refresh fails.
 *
 * Never rejects; failures are reported as `{ status: 'error' }`.
 */
export async function ensureLoaded(options?: { forceRefresh?: boolean }): Promise<LoaderStatus> {
    try {
        const existing = await readManifest();
        const force = Boolean(options?.forceRefresh);

        if (existing && !force) {
            const fresh = Date.now() - existing.fetchedAt < MANIFEST_TTL_MS;
            if (fresh) return { status: 'ready', index: existing };

            if (!inFlightRefresh) {
                // Fire-and-forget: the caller gets the stale index right away.
                inFlightRefresh = refreshFromRemote()
                    .catch((err) => {
                        console.warn('[browser-skills] Background refresh failed:', err);
                        return existing;
                    })
                    .finally(() => { inFlightRefresh = null; });
            }
            return { status: 'stale', index: existing, refreshing: true };
        }

        if (!inFlightRefresh) {
            inFlightRefresh = refreshFromRemote().finally(() => { inFlightRefresh = null; });
        }
        try {
            const index = await inFlightRefresh;
            return { status: 'ready', index };
        } catch (err) {
            return { status: 'error', error: err instanceof Error ? err.message : 'Failed to load skills.' };
        }
    } catch (err) {
        return { status: 'error', error: err instanceof Error ? err.message : 'Skill loader failed.' };
    }
}

/**
 * Loads the markdown of one cached skill.
 *
 * @param id Skill id as stored in the manifest, e.g. "github/repo-actions".
 * @returns `{ ok: true, content, entry }`, or `{ ok: false, error }` when the
 *          loader is in `error`/`empty` state, the id is unknown, or the cached
 *          file cannot be read.
 */
export async function readSkillContent(id: string): Promise<{ ok: true; content: string; entry: SkillEntry } | { ok: false; error: string }> {
    const status = await ensureLoaded();
    if (status.status === 'error' || status.status === 'empty') {
        return { ok: false, error: status.status === 'error' ? status.error : 'No skills cached yet.' };
    }
    const entry = status.index.entries.find((e) => e.id === id);
    if (!entry) return { ok: false, error: `Skill '${id}' not found.` };
    try {
        const content = await fs.readFile(entry.localPath, 'utf8');
        return { ok: true, content, entry };
    } catch (err) {
        return { ok: false, error: err instanceof Error ? err.message : 'Failed to read skill file.' };
    }
}
/**
 * Builds the hostname tokens a `domain-skills/<site>/` folder name may stand for.
 *
 * The folder name is lower-cased, then offered as-is, with hyphens removed, and
 * with hyphens turned into dots. A trailing `-com`, `-org` or `-io` is also
 * offered as a real TLD with the remaining hyphens kept:
 *   "booking-com" -> ["booking-com", "bookingcom", "booking.com"]
 *   "github"      -> ["github"]
 *   "dev-to"      -> ["dev-to", "devto", "dev.to"]
 *
 * Tokens that are empty or start/end with a dot are dropped, because they would
 * match unrelated hosts.
 */
function siteCandidates(site: string): string[] {
    const base = site.trim().toLowerCase();
    const candidates = new Set<string>([base, base.replace(/-/g, ''), base.replace(/-/g, '.')]);
    for (const tld of ['com', 'org', 'io']) {
        if (base.endsWith(`-${tld}`)) {
            candidates.add(`${base.slice(0, -(tld.length + 1))}.${tld}`);
        }
    }
    return Array.from(candidates).filter((c) => c.length > 0 && !c.startsWith('.') && !c.endsWith('.'));
}

/** Returns the lower-cased hostname of `url`, or `null` when `url` cannot be parsed. */
function extractHostname(url: string): string | null {
    try {
        return new URL(url).hostname.toLowerCase();
    } catch {
        return null;
    }
}

/**
 * True when `candidate` occurs in `hostname` aligned on label boundaries, i.e.
 * as one or more whole dot-separated labels ("github" in "gist.github.com",
 * "booking.com" in "www.booking.com"), but not as a fragment of a label
 * ("github" in "notgithub.org", "x" in "example.com").
 */
function hostnameHasLabels(hostname: string, candidate: string): boolean {
    return `.${hostname}.`.includes(`.${candidate}.`);
}

/**
 * Returns the cached skills whose site folder matches the host of `url`.
 *
 * Entries are grouped by `site` in order of first appearance; every entry of a
 * matching site is returned, in index order within the site, until `limit`
 * entries have been collected.
 *
 * @param index Skills manifest to search.
 * @param url   URL of the current page.
 * @param limit Maximum number of entries to return (default 5). Zero, negative
 *              or NaN yields an empty result.
 * @returns Matching entries, or `[]` when `url` has no parsable hostname.
 */
export function matchSkillsForUrl(index: SkillsIndex, url: string, limit = 5): SkillEntry[] {
    const hostname = extractHostname(url);
    if (!hostname || !(limit > 0)) return [];

    const bySite = new Map<string, SkillEntry[]>();
    for (const entry of index.entries) {
        const bucket = bySite.get(entry.site);
        if (bucket) {
            bucket.push(entry);
        } else {
            bySite.set(entry.site, [entry]);
        }
    }

    const matched: SkillEntry[] = [];
    for (const [site, entries] of bySite) {
        const hit = siteCandidates(site).some((c) => hostnameHasLabels(hostname, c));
        if (hit) matched.push(...entries);
    }

    return matched.slice(0, limit);
}
// ============================================================================
// Browser Skills (browser-use/browser-harness domain-skills cache)
// ============================================================================

// Tool entry with three actions: `load` (default) returns one skill's markdown,
// `list` enumerates the cached skills, `refresh` re-downloads the library.
// Every outcome, including a thrown error, is reported as a result object with
// a `success` flag.
'load-browser-skill': {
    description: 'Load a site-specific browser skill (from the browser-use/browser-harness domain-skills library) by id. Returns the full markdown content with selectors, gotchas, and recipes for the target site. Call this after browser-control responses surface a matching skill in suggestedSkills. Pass action="list" to see all available skills. Skills are fetched on first use and cached locally; pass action="refresh" to force an update from upstream.',
    inputSchema: z.object({
        action: z.enum(['load', 'list', 'refresh']).optional().describe('load: fetch a skill by id (default). list: list all cached skills. refresh: re-fetch the library from upstream.'),
        id: z.string().optional().describe('Skill id (e.g., "github/repo-actions") — required for load.'),
        site: z.string().optional().describe('Filter list results to a single site (e.g., "github").'),
    }),
    execute: async (input: { action?: 'load' | 'list' | 'refresh'; id?: string; site?: string }) => {
        const requested = input.action ?? 'load';
        try {
            switch (requested) {
                case 'refresh': {
                    const refreshed = await refreshBrowserSkills();
                    const total = refreshed.entries.length;
                    return {
                        success: true,
                        message: `Refreshed ${total} skill${total === 1 ? '' : 's'} from upstream.`,
                        count: total,
                        treeSha: refreshed.treeSha,
                    };
                }

                case 'list': {
                    const loader = await ensureBrowserSkillsLoaded();
                    if (loader.status === 'error') {
                        return { success: false, error: loader.error };
                    }
                    if (loader.status === 'empty') {
                        return { success: false, error: 'No browser skills cached yet.' };
                    }
                    // An absent `site` filter keeps every entry.
                    const skills = loader.index.entries
                        .filter((skill) => !input.site || skill.site === input.site)
                        .map(({ id, title, site }) => ({ id, title, site }));
                    return {
                        success: true,
                        count: skills.length,
                        skills,
                        cacheAgeMs: Date.now() - loader.index.fetchedAt,
                        refreshing: loader.status === 'stale' ? loader.refreshing : false,
                    };
                }

                default: {
                    // 'load', including the case where no action was given.
                    if (!input.id) {
                        return { success: false, error: 'id is required for load.' };
                    }
                    const loaded = await readBrowserSkillContent(input.id);
                    if (!loaded.ok) {
                        return { success: false, error: loaded.error };
                    }
                    const { entry, content } = loaded;
                    return {
                        success: true,
                        id: entry.id,
                        title: entry.title,
                        site: entry.site,
                        path: entry.path,
                        content,
                    };
                }
            }
        } catch (err) {
            return { success: false, error: err instanceof Error ? err.message : 'Failed to load browser skill.' };
        }
    },
},