Merge branch 'di_mgx' into 'mgx_ops'

Improve browser, remove vision, add text view, and comment out find_links. See merge request pub/MetaGPT!45
2026-05-02 20:32:38 +02:00 · 2024-04-12 06:27:23 +00:00 · 2024-04-12 06:27:23 +00:00 · dab8ab1bdb
commit dab8ab1bdb
parent 3554c0276f c6e42631da
4 changed files with 101 additions and 78 deletions
--- a/examples/di/imitate_webpage.py
+++ b/examples/di/imitate_webpage.py
@ -11,10 +11,10 @@ from metagpt.roles.di.data_interpreter import DataInterpreter
 async def main():
    web_url = "https://pytorch.org/"
    prompt = f"""This is a URL of webpage: '{web_url}' .
-Firstly, utilize Selenium and WebDriver for rendering. 
-Secondly, convert image to a webpage including HTML, CSS and JS in one go.
+Firstly, open the page and take a screenshot of the page. 
+Secondly, convert the image to a webpage including HTML, CSS and JS in one go.
 Note: All required dependencies and environments have been fully installed and configured."""
-    di = DataInterpreter(tools=["GPTvGenerator"])
+    di = DataInterpreter(tools=["GPTvGenerator", "Browser"])

    await di.run(prompt)

--- a/examples/di/use_browser.py
+++ b/examples/di/use_browser.py
@ -2,20 +2,23 @@ import asyncio

 from metagpt.roles.di.data_interpreter import DataInterpreter

-# an example to showcase navigation
 MG_LLM_CONFIG_REQ = """
 This is a link to the doc site of MetaGPT project: https://docs.deepwisdom.ai/main/en/
 Check where you can go to on the site and try to find out the list of LLM APIs supported by MetaGPT.
 Don't write all codes in one response, each time, just write code for one step.
 """

-# an example to showcase searching
 PAPER_LIST_REQ = """"
 At https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
 find the first paper whose title includes `multiagent`, open it and summarize its abstract.
 Don't write all codes in one response, each time, just write code for one step.
 """

+DESCRIBE_GITHUB_ISSUE_REQ = """
+Visit https://github.com/geekan/MetaGPT, navigate to Issues page, open the first issue related to DataInterpreter, then summarize what the issue is in one sentence.
+Don't write all codes in one response, each time, just write code for one step.
+"""
+

 async def main():
    di = DataInterpreter(tools=["Browser"], react_mode="react")
--- a/metagpt/tools/libs/browser.py
+++ b/metagpt/tools/libs/browser.py
@ -3,26 +3,19 @@ from playwright.async_api import async_playwright
 from metagpt.const import DEFAULT_WORKSPACE_ROOT
 from metagpt.logs import ToolLogItem, log_tool_output_async
 from metagpt.tools.tool_registry import register_tool
-from metagpt.utils.common import encode_image


@register_tool()
 class Browser:
    """
    A tool for browsing the web. Don't initialize a new instance of this class if one already exists.
-    Note: Combine searching, scrolling, extraction, and link finding together to achieve most effective browsing. DON'T stick to one method.
+    Note: Combine searching and scrolling together to achieve most effective browsing. DON'T stick to one method.
    """

    def __init__(self):
        """initiate the browser, create pages placeholder later to be managed as {page_url: page object}"""
        self.browser = None

-        from metagpt.config2 import config
-        from metagpt.llm import LLM
-
-        self.llm = LLM(llm_config=config.get_openai_llm())
-        self.llm.model = "gpt-4-vision-preview"
-
        # browser status management
        self.pages = {}
        self.current_page_url = None
@ -33,25 +26,26 @@ class Browser:
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch()

-    def _set_current_page(self, page, url):
+    async def _set_current_page(self, page, url):
        self.current_page = page
        self.current_page_url = url
        print("Now on page ", url)
+        print(await self._view())

    async def open_new_page(self, url: str):
-        """open a new page in the browser, set it as the current page"""
+        """open a new page in the browser and view the page"""
        page = await self.browser.new_page()
        await page.goto(url)
        self.pages[url] = page
-        self._set_current_page(page, url)
+        await self._set_current_page(page, url)
        await log_tool_output_async(
            ToolLogItem(type="object", name="open_new_page", value=self.current_page), tool_name="Browser"
        )

    async def switch_page(self, url: str):
-        """switch to an opened page in the browser, set it as the current page"""
+        """switch to an opened page in the browser and view the page"""
        if url in self.pages:
-            self._set_current_page(self.pages[url], url)
+            await self._set_current_page(self.pages[url], url)
            await log_tool_output_async(
                ToolLogItem(type="object", name="switch_page", value=self.current_page), tool_name="Browser"
            )
@ -91,22 +85,7 @@ class Browser:
                position = await element.evaluate("e => ({ from_top: e.offsetTop, from_left: e.offsetLeft })")

                # Retrieve the surrounding block of text and links with their text
-                content = await element.evaluate(
-                    """
-                    (element) => {
-                        // const block = element.closest('p, div, section, article');
-                        const block = element.parentElement;
-                        return {
-                            text_block: block.innerText,
-                            // Create an array of objects, each containing the text and href of a link
-                            links: Array.from(block.querySelectorAll('a')).map(a => ({
-                                text: a.innerText, 
-                                href: a.href
-                            }))
-                        };
-                    }
-                """
-                )
+                content = await element.evaluate(SEARCH_CONTENT_JS)

                search_results.append(
                    {"index": len(search_results), "content": content, "position": position, "element_obj": element}
@ -131,56 +110,53 @@ class Browser:
            index = len(search_results) - 1
        element = search_results[index]["element_obj"]
        await element.scroll_into_view_if_needed()
-        print(f"Successfully scrolled to the {index}-th search result, consider extract more info around it.")
        await log_tool_output_async(
            ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser"
        )
+        print(f"Successfully scrolled to the {index}-th search result")
+        print(await self._view())

-    async def find_links(self) -> list:
-        """Finds all links in the current page and returns a list of dictionaries with link text and the URL.
-        Useful for navigating to more pages and exploring more resources.
+    # async def find_links(self) -> list:
+    #     """Finds all links in the current page and returns a list of dictionaries with link text and the URL.
+    #     Useful for navigating to more pages and exploring more resources.

-        Returns:
-            list: A list of dictionaries, each containing 'text' and 'href' keys.
-        """
-        # Use a CSS selector to find all <a> elements in the page.
-        links = await self.current_page.query_selector_all("a")
+    #     Returns:
+    #         list: A list of dictionaries, each containing 'text' and 'href' keys.
+    #     """
+    #     # Use a CSS selector to find all <a> elements in the page.
+    #     links = await self.current_page.query_selector_all("a")

-        # Prepare an empty list to hold link information.
-        link_info = []
+    #     # Prepare an empty list to hold link information.
+    #     link_info = []

-        # Iterate over each link element to extract its text and href attributes.
-        for link in links:
-            text = await link.text_content()
-            href = await link.get_attribute("href")
-            link_info.append({"text": text, "href": href})
+    #     # Iterate over each link element to extract its text and href attributes.
+    #     for link in links:
+    #         text = await link.text_content()
+    #         href = await link.get_attribute("href")
+    #         link_info.append({"text": text, "href": href})

-        print(f"Found {len(link_info)} links:\n\n{link_info}")
+    #     print(f"Found {len(link_info)} links:\n\n{link_info}")

-        return link_info
+    #     return link_info

-    async def extract_info_from_view(self, instruction: str) -> str:
-        """
-        Extract useful info from the current page view.
+    async def screenshot(self, path: str = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png"):
+        """Take a screenshot of the current page and save it to the specified path."""
+        await self.current_page.screenshot(path=path)
+        print(f"Screenshot saved to: {path}")

-        Args:
-            instruction (str): explain what info needs to be extracted
-
-        Returns:
-            str: extracted info from current view
-        """
-        img_path = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png"
-        await self.current_page.screenshot(path=img_path)
-        rsp = await self.llm.aask(msg=instruction, images=[encode_image(img_path)])
-        return rsp
+    async def _view(self) -> str:
+        """simulate human viewing the current page, return the visible text with links"""
+        visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
+        return visible_text_with_links

    async def scroll_current_page(self, offset: int = 500):
-        """scroll the current page by offset pixels, negative value means scrolling up, returning the content observed after scrolling"""
+        """scroll the current page by offset pixels, negative value means scrolling up, will print out observed content after scrolling"""
        await self.current_page.evaluate(f"window.scrollBy(0, {offset})")
-        print(f"Scrolled current page by {offset} pixels. Perceive the scrolled view if needed")
        await log_tool_output_async(
            ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser"
        )
+        print(f"Scrolled current page by {offset} pixels.")
+        print(await self._view())

    def check_all_pages(self) -> dict:
        """return all pages opened in the browser, a dictionary with {page_url: page_title}, useful for understanding the current browser state"""
@ -195,3 +171,47 @@ class Browser:

 async def get_scroll_position(page):
    return await page.evaluate("() => ({ x: window.scrollX, y: window.scrollY })")
+
+
+SEARCH_CONTENT_JS = """
+(element) => {
+    // const block = element.closest('p, div, section, article');
+    const block = element.parentElement;
+    return {
+        text_block: block.innerText,
+        // Create an array of objects, each containing the text and href of a link
+        links: Array.from(block.querySelectorAll('a')).map(a => ({
+            text: a.innerText, 
+            href: a.href
+        }))
+    };
+}
+"""
+
+
+VIEW_CONTENT_JS = """
+() => {
+    return Array.from(document.querySelectorAll('body *')).filter(el => {
+        if (!(el.offsetWidth || el.offsetHeight || el.getClientRects().length)) return false;
+        const style = window.getComputedStyle(el);
+        if (style.display === 'none' || style.visibility !== 'visible' || style.opacity === '0') return false;
+        const rect = el.getBoundingClientRect();
+        const elemCenter = {
+            x: rect.left + rect.width / 2,
+            y: rect.top + rect.height / 2
+        };
+        if (elemCenter.x < 0 || elemCenter.y < 0 || elemCenter.x > window.innerWidth || elemCenter.y > window.innerHeight) return false;
+        if (document.elementFromPoint(elemCenter.x, elemCenter.y) !== el) return false;
+        return true;
+    }).map(el => {
+        let text = el.innerText || '';
+        text = text.trim();
+        if (!text.length) return '';
+        const parentAnchor = el.closest('a');
+        if (parentAnchor && parentAnchor.href) {
+            return `${text} (${parentAnchor.href})`;
+        }
+        return text;
+    }).filter(text => text.length > 0).join("\\n");
+}
+"""
--- a/tests/metagpt/tools/libs/test_browser.py
+++ b/tests/metagpt/tools/libs/test_browser.py
@ -54,22 +54,18 @@ async def test_search(browser):
    # scroll to search result
    await browser.scroll_to_search_result(search_results, index=0)

-    # perceive current view
-    rsp = await browser.extract_info_from_view("what is the command to run exactly?")
-    assert "metagpt" in rsp
-
    await browser.close()


-@pytest.mark.asyncio
-async def test_find_links(browser):
-    await browser.start()
+# @pytest.mark.asyncio
+# async def test_find_links(browser):
+#     await browser.start()

-    await browser.open_new_page(TEST_URL)
-    link_info = await browser.find_links()
-    assert link_info
+#     await browser.open_new_page(TEST_URL)
+#     link_info = await browser.find_links()
+#     assert link_info

-    await browser.close()
+#     await browser.close()


@pytest.mark.asyncio
@ -80,9 +76,13 @@ async def test_scroll(browser):

    await browser.scroll_current_page(offset=-500)
    assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 0}  # no change if you scroll up from top
+    initial_view = await browser._view()

    await browser.scroll_current_page(offset=500)  # scroll down
    assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 500}
+    scrolled_view = await browser._view()
+
+    assert initial_view != scrolled_view

    await browser.scroll_current_page(offset=-200)  # scroll up
    assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 300}