From c6e42631dacb45b3aaec4c9600310adce1170b73 Mon Sep 17 00:00:00 2001 From: yzlin Date: Fri, 12 Apr 2024 14:15:40 +0800 Subject: [PATCH] improve browser, rm vision, add text view, comment out find_links --- examples/di/imitate_webpage.py | 6 +- examples/di/use_browser.py | 7 +- metagpt/tools/libs/browser.py | 144 +++++++++++++---------- tests/metagpt/tools/libs/test_browser.py | 22 ++-- 4 files changed, 101 insertions(+), 78 deletions(-) diff --git a/examples/di/imitate_webpage.py b/examples/di/imitate_webpage.py index 60ebab389..d181e0dfc 100644 --- a/examples/di/imitate_webpage.py +++ b/examples/di/imitate_webpage.py @@ -11,10 +11,10 @@ from metagpt.roles.di.data_interpreter import DataInterpreter async def main(): web_url = "https://pytorch.org/" prompt = f"""This is a URL of webpage: '{web_url}' . -Firstly, utilize Selenium and WebDriver for rendering. -Secondly, convert image to a webpage including HTML, CSS and JS in one go. +Firstly, open the page and take a screenshot of the page. +Secondly, convert the image to a webpage including HTML, CSS and JS in one go. Note: All required dependencies and environments have been fully installed and configured.""" - di = DataInterpreter(tools=["GPTvGenerator"]) + di = DataInterpreter(tools=["GPTvGenerator", "Browser"]) await di.run(prompt) diff --git a/examples/di/use_browser.py b/examples/di/use_browser.py index 6dfc8de24..a3a079ccc 100644 --- a/examples/di/use_browser.py +++ b/examples/di/use_browser.py @@ -2,20 +2,23 @@ import asyncio from metagpt.roles.di.data_interpreter import DataInterpreter -# an example to showcase navigation MG_LLM_CONFIG_REQ = """ This is a link to the doc site of MetaGPT project: https://docs.deepwisdom.ai/main/en/ Check where you can go to on the site and try to find out the list of LLM APIs supported by MetaGPT. Don't write all codes in one response, each time, just write code for one step. """ -# an example to showcase searching PAPER_LIST_REQ = """" At https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/, find the first paper whose title includes `multiagent`, open it and summarize its abstract. Don't write all codes in one response, each time, just write code for one step. """ +DESCRIBE_GITHUB_ISSUE_REQ = """ +Visit https://github.com/geekan/MetaGPT, navigate to Issues page, open the first issue related to DataInterpreter, then summarize what the issue is in one sentence. +Don't write all codes in one response, each time, just write code for one step. +""" + async def main(): di = DataInterpreter(tools=["Browser"], react_mode="react") diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py index 48b1cab69..b6a5b7cbf 100644 --- a/metagpt/tools/libs/browser.py +++ b/metagpt/tools/libs/browser.py @@ -3,26 +3,19 @@ from playwright.async_api import async_playwright from metagpt.const import DEFAULT_WORKSPACE_ROOT from metagpt.logs import ToolLogItem, log_tool_output_async from metagpt.tools.tool_registry import register_tool -from metagpt.utils.common import encode_image @register_tool() class Browser: """ A tool for browsing the web. Don't initialize a new instance of this class if one already exists. - Note: Combine searching, scrolling, extraction, and link finding together to achieve most effective browsing. DON'T stick to one method. + Note: Combine searching and scrolling together to achieve most effective browsing. DON'T stick to one method. """ def __init__(self): """initiate the browser, create pages placeholder later to be managed as {page_url: page object}""" self.browser = None - from metagpt.config2 import config - from metagpt.llm import LLM - - self.llm = LLM(llm_config=config.get_openai_llm()) - self.llm.model = "gpt-4-vision-preview" - # browser status management self.pages = {} self.current_page_url = None @@ -33,25 +26,26 @@ class Browser: self.playwright = await async_playwright().start() self.browser = await self.playwright.chromium.launch() - def _set_current_page(self, page, url): + async def _set_current_page(self, page, url): self.current_page = page self.current_page_url = url print("Now on page ", url) + print(await self._view()) async def open_new_page(self, url: str): - """open a new page in the browser, set it as the current page""" + """open a new page in the browser and view the page""" page = await self.browser.new_page() await page.goto(url) self.pages[url] = page - self._set_current_page(page, url) + await self._set_current_page(page, url) await log_tool_output_async( ToolLogItem(type="object", name="open_new_page", value=self.current_page), tool_name="Browser" ) async def switch_page(self, url: str): - """switch to an opened page in the browser, set it as the current page""" + """switch to an opened page in the browser and view the page""" if url in self.pages: - self._set_current_page(self.pages[url], url) + await self._set_current_page(self.pages[url], url) await log_tool_output_async( ToolLogItem(type="object", name="switch_page", value=self.current_page), tool_name="Browser" ) @@ -91,22 +85,7 @@ class Browser: position = await element.evaluate("e => ({ from_top: e.offsetTop, from_left: e.offsetLeft })") # Retrieve the surrounding block of text and links with their text - content = await element.evaluate( - """ - (element) => { - // const block = element.closest('p, div, section, article'); - const block = element.parentElement; - return { - text_block: block.innerText, - // Create an array of objects, each containing the text and href of a link - links: Array.from(block.querySelectorAll('a')).map(a => ({ - text: a.innerText, - href: a.href - })) - }; - } - """ - ) + content = await element.evaluate(SEARCH_CONTENT_JS) search_results.append( {"index": len(search_results), "content": content, "position": position, "element_obj": element} @@ -131,56 +110,53 @@ class Browser: index = len(search_results) - 1 element = search_results[index]["element_obj"] await element.scroll_into_view_if_needed() - print(f"Successfully scrolled to the {index}-th search result, consider extract more info around it.") await log_tool_output_async( ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser" ) + print(f"Successfully scrolled to the {index}-th search result") + print(await self._view()) - async def find_links(self) -> list: - """Finds all links in the current page and returns a list of dictionaries with link text and the URL. - Useful for navigating to more pages and exploring more resources. + # async def find_links(self) -> list: + # """Finds all links in the current page and returns a list of dictionaries with link text and the URL. + # Useful for navigating to more pages and exploring more resources. - Returns: - list: A list of dictionaries, each containing 'text' and 'href' keys. - """ - # Use a CSS selector to find all elements in the page. - links = await self.current_page.query_selector_all("a") + # Returns: + # list: A list of dictionaries, each containing 'text' and 'href' keys. + # """ + # # Use a CSS selector to find all elements in the page. + # links = await self.current_page.query_selector_all("a") - # Prepare an empty list to hold link information. - link_info = [] + # # Prepare an empty list to hold link information. + # link_info = [] - # Iterate over each link element to extract its text and href attributes. - for link in links: - text = await link.text_content() - href = await link.get_attribute("href") - link_info.append({"text": text, "href": href}) + # # Iterate over each link element to extract its text and href attributes. + # for link in links: + # text = await link.text_content() + # href = await link.get_attribute("href") + # link_info.append({"text": text, "href": href}) - print(f"Found {len(link_info)} links:\n\n{link_info}") + # print(f"Found {len(link_info)} links:\n\n{link_info}") - return link_info + # return link_info - async def extract_info_from_view(self, instruction: str) -> str: - """ - Extract useful info from the current page view. + async def screenshot(self, path: str = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png"): + """Take a screenshot of the current page and save it to the specified path.""" + await self.current_page.screenshot(path=path) + print(f"Screenshot saved to: {path}") - Args: - instruction (str): explain what info needs to be extracted - - Returns: - str: extracted info from current view - """ - img_path = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png" - await self.current_page.screenshot(path=img_path) - rsp = await self.llm.aask(msg=instruction, images=[encode_image(img_path)]) - return rsp + async def _view(self) -> str: + """simulate human viewing the current page, return the visible text with links""" + visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS) + return visible_text_with_links async def scroll_current_page(self, offset: int = 500): - """scroll the current page by offset pixels, negative value means scrolling up, returning the content observed after scrolling""" + """scroll the current page by offset pixels, negative value means scrolling up, will print out observed content after scrolling""" await self.current_page.evaluate(f"window.scrollBy(0, {offset})") - print(f"Scrolled current page by {offset} pixels. Perceive the scrolled view if needed") await log_tool_output_async( ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser" ) + print(f"Scrolled current page by {offset} pixels.") + print(await self._view()) def check_all_pages(self) -> dict: """return all pages opened in the browser, a dictionary with {page_url: page_title}, useful for understanding the current browser state""" @@ -195,3 +171,47 @@ class Browser: async def get_scroll_position(page): return await page.evaluate("() => ({ x: window.scrollX, y: window.scrollY })") + + +SEARCH_CONTENT_JS = """ +(element) => { + // const block = element.closest('p, div, section, article'); + const block = element.parentElement; + return { + text_block: block.innerText, + // Create an array of objects, each containing the text and href of a link + links: Array.from(block.querySelectorAll('a')).map(a => ({ + text: a.innerText, + href: a.href + })) + }; +} +""" + + +VIEW_CONTENT_JS = """ +() => { + return Array.from(document.querySelectorAll('body *')).filter(el => { + if (!(el.offsetWidth || el.offsetHeight || el.getClientRects().length)) return false; + const style = window.getComputedStyle(el); + if (style.display === 'none' || style.visibility !== 'visible' || style.opacity === '0') return false; + const rect = el.getBoundingClientRect(); + const elemCenter = { + x: rect.left + rect.width / 2, + y: rect.top + rect.height / 2 + }; + if (elemCenter.x < 0 || elemCenter.y < 0 || elemCenter.x > window.innerWidth || elemCenter.y > window.innerHeight) return false; + if (document.elementFromPoint(elemCenter.x, elemCenter.y) !== el) return false; + return true; + }).map(el => { + let text = el.innerText || ''; + text = text.trim(); + if (!text.length) return ''; + const parentAnchor = el.closest('a'); + if (parentAnchor && parentAnchor.href) { + return `${text} (${parentAnchor.href})`; + } + return text; + }).filter(text => text.length > 0).join("\\n"); +} +""" diff --git a/tests/metagpt/tools/libs/test_browser.py b/tests/metagpt/tools/libs/test_browser.py index 0c3009fef..ec0b5c848 100644 --- a/tests/metagpt/tools/libs/test_browser.py +++ b/tests/metagpt/tools/libs/test_browser.py @@ -54,22 +54,18 @@ async def test_search(browser): # scroll to search result await browser.scroll_to_search_result(search_results, index=0) - # perceive current view - rsp = await browser.extract_info_from_view("what is the command to run exactly?") - assert "metagpt" in rsp - await browser.close() -@pytest.mark.asyncio -async def test_find_links(browser): - await browser.start() +# @pytest.mark.asyncio +# async def test_find_links(browser): +# await browser.start() - await browser.open_new_page(TEST_URL) - link_info = await browser.find_links() - assert link_info +# await browser.open_new_page(TEST_URL) +# link_info = await browser.find_links() +# assert link_info - await browser.close() +# await browser.close() @pytest.mark.asyncio @@ -80,9 +76,13 @@ async def test_scroll(browser): await browser.scroll_current_page(offset=-500) assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 0} # no change if you scrol up from top + initial_view = await browser._view() await browser.scroll_current_page(offset=500) # scroll down assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 500} + scrolled_view = await browser._view() + + assert initial_view != scrolled_view await browser.scroll_current_page(offset=-200) # scroll up assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 300}