From d8aea069fadf62bb2ab0f700625aa85003f8b987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?= Date: Mon, 8 Apr 2024 23:54:55 +0800 Subject: [PATCH 01/20] fixbug: terminal --- metagpt/logs.py | 2 +- metagpt/tools/libs/terminal.py | 11 +++++++++-- tests/metagpt/tools/libs/test_terminal.py | 6 ++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/metagpt/logs.py b/metagpt/logs.py index 480477e6b..4f87b62e8 100644 --- a/metagpt/logs.py +++ b/metagpt/logs.py @@ -26,7 +26,7 @@ class ToolLogItem(BaseModel): TOOL_LOG_END_MARKER = ToolLogItem( - type="str", name="end_marker", value="#END#" + type="str", name="end_marker", value="\x18\x19\x1B\x18" ) # A special log item to suggest the end of a stream log diff --git a/metagpt/tools/libs/terminal.py b/metagpt/tools/libs/terminal.py index a23ebb86a..351aa4f6f 100644 --- a/metagpt/tools/libs/terminal.py +++ b/metagpt/tools/libs/terminal.py @@ -39,7 +39,7 @@ class Terminal: # Send the command self.process.stdin.write(cmd + self.command_terminator) self.process.stdin.write( - f'echo "{TOOL_LOG_END_MARKER.value}"' + self.command_terminator + f'echo "{TOOL_LOG_END_MARKER.value}"' + self.command_terminator # write EOF ) # Unique marker to signal command end self.process.stdin.flush() log_tool_output( @@ -49,7 +49,14 @@ class Terminal: # Read the output until the unique marker is found while True: line = self.process.stdout.readline() - if line.strip() == TOOL_LOG_END_MARKER.value: + ix = line.rfind(TOOL_LOG_END_MARKER.value) + if ix >= 0: + line = line[0:ix] + if line: + log_tool_output( + output=ToolLogItem(name="output", value=line), tool_name="Terminal" + ) # log stdout in real-time + cmd_output.append(line) log_tool_output(TOOL_LOG_END_MARKER) break log_tool_output( diff --git a/tests/metagpt/tools/libs/test_terminal.py b/tests/metagpt/tools/libs/test_terminal.py index 97c33b977..98ed63dd8 100644 --- a/tests/metagpt/tools/libs/test_terminal.py +++ b/tests/metagpt/tools/libs/test_terminal.py @@ -1,3 +1,5 @@ +import pytest + from metagpt.const import DATA_PATH, METAGPT_ROOT from metagpt.tools.libs.terminal import Terminal @@ -13,3 +15,7 @@ def test_terminal(): terminal.run_command("cd data") output = terminal.run_command("pwd") assert output.strip() == str(DATA_PATH) + + +if __name__ == "__main__": + pytest.main([__file__, "-s"]) From 3e83a8d7fa77df187361b301c942f8b0a123710a Mon Sep 17 00:00:00 2001 From: stellahsr Date: Tue, 9 Apr 2024 16:35:59 +0800 Subject: [PATCH 02/20] update example --- examples/mgx/run_mgx.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/mgx/run_mgx.py b/examples/mgx/run_mgx.py index 4a1964d59..edfaff3d7 100644 --- a/examples/mgx/run_mgx.py +++ b/examples/mgx/run_mgx.py @@ -9,12 +9,13 @@ requirement = ( # "design a game using Gym (an open source Python library), including a graphical interface and interactive gameplay" # "帮我把pip的源设置成:https://pypi.tuna.tsinghua.edu.cn/simple" # "This is a website url does not require login: https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767 please write a similar web page,developed in vue language, The package.json dependency must be generated" - "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?" + # "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?" + "Create a 2048 Game" ) async def main(requirement: str = ""): - mgx = MGX(use_intent=True) + mgx = MGX(use_intent=True, tools=["software development"]) await mgx.run(requirement) From 2b56c367879e737696d3ecc5d305b3ff61c71cb4 Mon Sep 17 00:00:00 2001 From: stellahsr Date: Tue, 9 Apr 2024 17:07:03 +0800 Subject: [PATCH 03/20] update code --- metagpt/strategy/task_type.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metagpt/strategy/task_type.py b/metagpt/strategy/task_type.py index c5453f1aa..2bc53b964 100644 --- a/metagpt/strategy/task_type.py +++ b/metagpt/strategy/task_type.py @@ -67,7 +67,10 @@ class TaskType(Enum): name="email login", desc="For logging to an email.", ) - DEVELOP_SOFTWARE = TaskTypeDef(name="develop software", desc="Develop software.") + DEVELOP_SOFTWARE = TaskTypeDef( + name="develop software", + desc="SOP related to develop software such as Writes a PRD, Writes a design, Writes a project plan and Writes code to implement designed features according to the project plan", + ) @property def type_name(self): From eef0375c6c8ef4ddca9de542c33f428b1cbf8f26 Mon Sep 17 00:00:00 2001 From: stellahsr Date: Tue, 9 Apr 2024 17:37:49 +0800 Subject: [PATCH 04/20] update tools according to intention update TaskTypeDef for software development --- examples/mgx/run_mgx.py | 6 +++--- metagpt/actions/di/detect_intent.py | 6 ++++-- metagpt/actions/di/write_plan.py | 18 +++++++++--------- metagpt/roles/di/mgx.py | 7 ++++++- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/examples/mgx/run_mgx.py b/examples/mgx/run_mgx.py index edfaff3d7..86aa67ad7 100644 --- a/examples/mgx/run_mgx.py +++ b/examples/mgx/run_mgx.py @@ -9,13 +9,13 @@ requirement = ( # "design a game using Gym (an open source Python library), including a graphical interface and interactive gameplay" # "帮我把pip的源设置成:https://pypi.tuna.tsinghua.edu.cn/simple" # "This is a website url does not require login: https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767 please write a similar web page,developed in vue language, The package.json dependency must be generated" - # "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?" - "Create a 2048 Game" + "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?" + # "Create a 2048 Game" ) async def main(requirement: str = ""): - mgx = MGX(use_intent=True, tools=["software development"]) + mgx = MGX(use_intent=True, tools=[""]) await mgx.run(requirement) diff --git a/metagpt/actions/di/detect_intent.py b/metagpt/actions/di/detect_intent.py index 5d25e9fbe..d7187c4f3 100644 --- a/metagpt/actions/di/detect_intent.py +++ b/metagpt/actions/di/detect_intent.py @@ -98,10 +98,12 @@ To meet user requirements, the following standard operating procedure(SOP) must {sop} -### SOP Type -{sop_type} + """ +### SOP Type +# {sop_type} + class DetectIntent(Action): async def run(self, with_message: Message, **kwargs) -> Tuple[str, str]: diff --git a/metagpt/actions/di/write_plan.py b/metagpt/actions/di/write_plan.py index 83709bc1a..201280d9b 100644 --- a/metagpt/actions/di/write_plan.py +++ b/metagpt/actions/di/write_plan.py @@ -29,15 +29,15 @@ If you are modifying an existing plan, carefully follow the instruction, don't m If you encounter errors on the current task, revise and output the current single task only. Output a list of jsons following the format: ```json -[ - {{ - "task_id": str = "unique identifier for a task in plan, can be an ordinal", - "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task", - "instruction": "what you should do in this task, one short phrase or sentence. If the SOP description is provided in the `Knowledge` section, the reference to the SOP description should be included intact in the instruction.", - "task_type": "type of this task, should be one of Available Task Types. You can refer to the hints in the `SOP Type` section to make a selection if `SOP Type` section is provided.", - }}, - ... -] + [ + {{ + "task_id": str = "unique identifier for a task in plan, can be an ordinal", + "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task", + "instruction": "what you should do in this task, one short phrase or sentence", + "task_type": "type of this task, should be one of Available Task Types", + }}, + ... + ] ``` """ diff --git a/metagpt/roles/di/mgx.py b/metagpt/roles/di/mgx.py index 0fa7c77b6..9d2f182e3 100644 --- a/metagpt/roles/di/mgx.py +++ b/metagpt/roles/di/mgx.py @@ -4,10 +4,11 @@ import asyncio from typing import Dict -from metagpt.actions.di.detect_intent import DetectIntent +from metagpt.actions.di.detect_intent import DetectIntent, SOPItem from metagpt.logs import logger from metagpt.roles.di.data_interpreter import DataInterpreter from metagpt.schema import Message +from metagpt.tools.tool_recommend import BM25ToolRecommender class MGX(DataInterpreter): @@ -18,6 +19,10 @@ class MGX(DataInterpreter): todo = DetectIntent(context=self.context) request_with_sop, sop_type = await todo.run(user_msg) logger.info(f"{sop_type} {request_with_sop}") + if sop_type == SOPItem.SOFTWARE_DEVELOPMENT.type_name: + self.tool_recommender = BM25ToolRecommender(tools=["software development"]) + else: + self.tool_recommender = BM25ToolRecommender(tools=[""]) return request_with_sop async def _plan_and_act(self) -> Message: From 4d072997fac2a797baa23da95bf825eb0f0101a1 Mon Sep 17 00:00:00 2001 From: stellahsr Date: Tue, 9 Apr 2024 18:21:27 +0800 Subject: [PATCH 05/20] update --- metagpt/actions/di/detect_intent.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/metagpt/actions/di/detect_intent.py b/metagpt/actions/di/detect_intent.py index 79d49b285..d9b587e5c 100644 --- a/metagpt/actions/di/detect_intent.py +++ b/metagpt/actions/di/detect_intent.py @@ -99,9 +99,6 @@ To meet user requirements, the following standard operating procedure(SOP) must {sop} """ -### SOP Type -# {sop_type} - class DetectIntent(Action): async def run(self, with_message: Message, **kwargs) -> Tuple[str, str]: From b130751b01dff73604eda53ed41b72102bc6555d Mon Sep 17 00:00:00 2001 From: stellahsr Date: Wed, 10 Apr 2024 17:03:07 +0800 Subject: [PATCH 06/20] update prompt indention --- metagpt/actions/di/write_plan.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/metagpt/actions/di/write_plan.py b/metagpt/actions/di/write_plan.py index 04aed7e17..8d6eccf57 100644 --- a/metagpt/actions/di/write_plan.py +++ b/metagpt/actions/di/write_plan.py @@ -16,9 +16,7 @@ from metagpt.schema import Message, Plan, Task from metagpt.strategy.task_type import TaskType from metagpt.utils.common import CodeParser - -class WritePlan(Action): - PROMPT_TEMPLATE: str = """ +PROMPT_TEMPLATE: str = """ # Context: {context} # Available Task Types: @@ -29,21 +27,23 @@ If you are modifying an existing plan, carefully follow the instruction, don't m If you encounter errors on the current task, revise and output the current single task only. Output a list of jsons following the format: ```json - [ - {{ - "task_id": str = "unique identifier for a task in plan, can be an ordinal", - "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task", - "instruction": "what you should do in this task, one short phrase or sentence.", - "task_type": "type of this task, should be one of Available Task Types.", - }}, - ... - ] +[ + {{ + "task_id": str = "unique identifier for a task in plan, can be an ordinal", + "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task", + "instruction": "what you should do in this task, one short phrase or sentence.", + "task_type": "type of this task, should be one of Available Task Types.", + }}, + ... +] ``` - """ +""" + +class WritePlan(Action): async def run(self, context: list[Message], max_tasks: int = 5) -> str: task_type_desc = "\n".join([f"- **{tt.type_name}**: {tt.value.desc}" for tt in TaskType]) - prompt = self.PROMPT_TEMPLATE.format( + prompt = PROMPT_TEMPLATE.format( context="\n".join([str(ct) for ct in context]), max_tasks=max_tasks, task_type_desc=task_type_desc ) rsp = await self._aask(prompt) From 4fc24ac98dacc59b35093ca3e2fcffcc1e7c32fc Mon Sep 17 00:00:00 2001 From: stellahsr Date: Wed, 10 Apr 2024 17:14:59 +0800 Subject: [PATCH 07/20] bugfix: return last 500 outputs for !pip execution --- metagpt/actions/di/execute_nb_code.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metagpt/actions/di/execute_nb_code.py b/metagpt/actions/di/execute_nb_code.py index 873c11106..e78c5acf3 100644 --- a/metagpt/actions/di/execute_nb_code.py +++ b/metagpt/actions/di/execute_nb_code.py @@ -27,6 +27,8 @@ from metagpt.actions import Action from metagpt.const import DEFAULT_WORKSPACE_ROOT from metagpt.logs import ToolLogItem, log_tool_output, logger +INSTALL_KEEPLEN = 500 + class ExecuteNbCode(Action): """execute notebook code block, return result to llm, and display it.""" @@ -207,6 +209,7 @@ class ExecuteNbCode(Action): if "!pip" in code: success = False + outputs = outputs[-INSTALL_KEEPLEN:] file_path = DEFAULT_WORKSPACE_ROOT / "code.ipynb" nbformat.write(self.nb, file_path) From f8bea537c07bb090ad05b8faf62adae1e1d3a7a8 Mon Sep 17 00:00:00 2001 From: stellahsr Date: Wed, 10 Apr 2024 17:17:42 +0800 Subject: [PATCH 08/20] rm test path --- examples/di/rm_image_background.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/di/rm_image_background.py b/examples/di/rm_image_background.py index 524ecdb33..cb7900a0a 100644 --- a/examples/di/rm_image_background.py +++ b/examples/di/rm_image_background.py @@ -1,16 +1,15 @@ import asyncio -from metagpt.roles.di.mgx import MGX +from metagpt.roles.di.data_interpreter import DataInterpreter async def main(requirement: str = ""): - # di = DataInterpreter() - di = MGX(use_intent=False, tools=[""]) + di = DataInterpreter() await di.run(requirement) if __name__ == "__main__": - image_path = r"F:\deepWisdom\metaGPT\hsr\MetaGPT\examples\data\dog.beebf16d.jpg" - save_path = r"F:\deepWisdom\metaGPT\hsr\MetaGPT\examples\data\/image_rm_bg.png" + image_path = "/your/path/to/the/image.jpeg" + save_path = "/your/intended/save/path/for/image_rm_bg.png" requirement = f"This is a image, you need to use python toolkit rembg to remove the background of the image and save the result. image path:{image_path}; save path:{save_path}." asyncio.run(main(requirement)) From aa17c250569e1322b53dd34abc8cded88315905b Mon Sep 17 00:00:00 2001 From: stellahsr Date: Wed, 10 Apr 2024 17:29:44 +0800 Subject: [PATCH 09/20] merge --- examples/mgx/run_mgx.py | 1 + metagpt/roles/di/mgx.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/mgx/run_mgx.py b/examples/mgx/run_mgx.py index 1dc59d71a..54d6eb691 100644 --- a/examples/mgx/run_mgx.py +++ b/examples/mgx/run_mgx.py @@ -8,6 +8,7 @@ from metagpt.roles.di.mgx import MGX requirement = ( # "design a game using Gym (an open source Python library), including a graphical interface and interactive gameplay" 'Create a "2048 game"' + # "Create a 2048 Game" ) diff --git a/metagpt/roles/di/mgx.py b/metagpt/roles/di/mgx.py index 1715b9b6c..8e9a0dfae 100644 --- a/metagpt/roles/di/mgx.py +++ b/metagpt/roles/di/mgx.py @@ -4,10 +4,11 @@ import asyncio from typing import Dict -from metagpt.actions.di.detect_intent import DetectIntent +from metagpt.actions.di.detect_intent import DetectIntent, SOPItem from metagpt.logs import logger from metagpt.roles.di.data_interpreter import DataInterpreter from metagpt.schema import Message +from metagpt.tools.tool_recommend import BM25ToolRecommender class MGX(DataInterpreter): @@ -18,6 +19,10 @@ class MGX(DataInterpreter): todo = DetectIntent(context=self.context) request_with_sop, sop_type = await todo.run(user_msg) logger.info(f"{sop_type} {request_with_sop}") + if sop_type == SOPItem.SOFTWARE_DEVELOPMENT.type_name: + self.tool_recommender = BM25ToolRecommender(tools=["software development"]) + else: + self.tool_recommender = BM25ToolRecommender(tools=[""]) return request_with_sop async def _plan_and_act(self) -> Message: From 04a5529cb306f00612efd1bf5b8d2bf70a268e63 Mon Sep 17 00:00:00 2001 From: stellahsr Date: Wed, 10 Apr 2024 17:35:04 +0800 Subject: [PATCH 10/20] sync mgx_ops --- examples/mgx/run_mgx.py | 4 +++- metagpt/tools/libs/software_development.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/mgx/run_mgx.py b/examples/mgx/run_mgx.py index 54d6eb691..86aa67ad7 100644 --- a/examples/mgx/run_mgx.py +++ b/examples/mgx/run_mgx.py @@ -7,7 +7,9 @@ from metagpt.roles.di.mgx import MGX requirement = ( # "design a game using Gym (an open source Python library), including a graphical interface and interactive gameplay" - 'Create a "2048 game"' + # "帮我把pip的源设置成:https://pypi.tuna.tsinghua.edu.cn/simple" + # "This is a website url does not require login: https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767 please write a similar web page,developed in vue language, The package.json dependency must be generated" + "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?" # "Create a 2048 Game" ) diff --git a/metagpt/tools/libs/software_development.py b/metagpt/tools/libs/software_development.py index 9f74c58e1..291fc78d9 100644 --- a/metagpt/tools/libs/software_development.py +++ b/metagpt/tools/libs/software_development.py @@ -86,9 +86,8 @@ async def write_design(prd_path: str | Path) -> Path: Path: The path to the system design files under the project directory. Example: - >>> from metagpt.tools.libs.software_development import write_prd >>> from metagpt.tools.libs.software_development import write_design - >>> prd_path = await write_prd("Create a new feature for the application") + >>> prd_path = '/path/to/project_path/docs/prd' # Returned by `write_prd` >>> system_design_path = await write_desgin(prd_path) >>> print(system_design_path) '/path/to/project_path/docs/system_design/' From 523877854ae2717ac44a23a02d5ce594b0e099e6 Mon Sep 17 00:00:00 2001 From: yzlin Date: Thu, 11 Apr 2024 04:06:42 +0800 Subject: [PATCH 11/20] add browser, first version --- examples/di/use_browser.py | 26 ++++ metagpt/tools/libs/__init__.py | 2 + metagpt/tools/libs/browser.py | 183 +++++++++++++++++++++++ metagpt/utils/common.py | 4 +- tests/metagpt/tools/libs/test_browser.py | 90 +++++++++++ 5 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 examples/di/use_browser.py create mode 100644 metagpt/tools/libs/browser.py create mode 100644 tests/metagpt/tools/libs/test_browser.py diff --git a/examples/di/use_browser.py b/examples/di/use_browser.py new file mode 100644 index 000000000..6dfc8de24 --- /dev/null +++ b/examples/di/use_browser.py @@ -0,0 +1,26 @@ +import asyncio + +from metagpt.roles.di.data_interpreter import DataInterpreter + +# an example to showcase navigation +MG_LLM_CONFIG_REQ = """ +This is a link to the doc site of MetaGPT project: https://docs.deepwisdom.ai/main/en/ +Check where you can go to on the site and try to find out the list of LLM APIs supported by MetaGPT. +Don't write all codes in one response, each time, just write code for one step. +""" + +# an example to showcase searching +PAPER_LIST_REQ = """" +At https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/, +find the first paper whose title includes `multiagent`, open it and summarize its abstract. +Don't write all codes in one response, each time, just write code for one step. +""" + + +async def main(): + di = DataInterpreter(tools=["Browser"], react_mode="react") + await di.run(MG_LLM_CONFIG_REQ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/metagpt/tools/libs/__init__.py b/metagpt/tools/libs/__init__.py index fb96db735..92f73ea54 100644 --- a/metagpt/tools/libs/__init__.py +++ b/metagpt/tools/libs/__init__.py @@ -13,6 +13,7 @@ from metagpt.tools.libs import ( email_login, terminal, file_manager, + browser, ) from metagpt.tools.libs.software_development import ( write_prd, @@ -40,4 +41,5 @@ _ = ( git_archive, terminal, file_manager, + browser, ) # Avoid pre-commit error diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py new file mode 100644 index 000000000..0a73e9fc7 --- /dev/null +++ b/metagpt/tools/libs/browser.py @@ -0,0 +1,183 @@ +from playwright.async_api import async_playwright + +from metagpt.const import DEFAULT_WORKSPACE_ROOT +from metagpt.tools.tool_registry import register_tool +from metagpt.utils.common import encode_image + + +@register_tool() +class Browser: + """ + A tool for browsing the web. Don't initialize a new instance of this class if one already exists. + Note: Combine searching, scrolling, extraction, and link finding together to achieve most effective browsing. DON'T stick to one method. + """ + + def __init__(self): + """initiate the browser, create pages placeholder later to be managed as {page_url: page object}""" + self.browser = None + + from metagpt.config2 import config + from metagpt.llm import LLM + + self.llm = LLM(llm_config=config.get_openai_llm()) + self.llm.model = "gpt-4-vision-preview" + + # browser status management + self.pages = {} + self.current_page_url = None + self.current_page = None + + async def start(self): + """Starts Playwright and launches a browser""" + self.playwright = await async_playwright().start() + self.browser = await self.playwright.chromium.launch() + + async def open_new_page(self, url: str): + """open a new page in the browser, set it as the current page""" + page = await self.browser.new_page() + await page.goto(url) + self.pages[url] = page + self.current_page = page + self.current_page_url = url + print(f"Opened new page: {url}") + + async def switch_page(self, url: str): + """switch to an opened page in the browser, set it as the current page""" + if url in self.pages: + self.current_page = self.pages[url] + self.current_page_url = url + print(f"Switched to page: {url}") + else: + print(f"Page not found: {url}") + + async def search_content_all(self, search_term: str) -> list[dict]: + """search all occurences of search term in the current page and return the search results with their position. + Useful if you have a keyword or sentence in mind and want to quickly narrow down the content relevant to it. + + Args: + search_term (str): the search term + + Returns: + list[dict]: a list of dictionaries containing the elements and their positions, e.g. + [ + { + "index": ..., + "content": { + "text_block": ..., + "links": [ + {"text": ..., "href": ...}, + ... + ] + }, + "position": {from_top: ..., from_left: ...}, + }, + ... + ] + """ + locator = self.current_page.locator(f"text={search_term}") + count = await locator.count() + search_results = [] + for i in range(count): + element = locator.nth(i) + if await element.is_visible(): + position = await element.evaluate("e => ({ from_top: e.offsetTop, from_left: e.offsetLeft })") + + # Retrieve the surrounding block of text and links with their text + content = await element.evaluate( + """ + (element) => { + // const block = element.closest('p, div, section, article'); + const block = element.parentElement; + return { + text_block: block.innerText, + // Create an array of objects, each containing the text and href of a link + links: Array.from(block.querySelectorAll('a')).map(a => ({ + text: a.innerText, + href: a.href + })) + }; + } + """ + ) + + search_results.append( + {"index": len(search_results), "content": content, "position": position, "element_obj": element} + ) + + print(f"Found {len(search_results)} instances of the term '{search_term}':\n\n{search_results}") + + return search_results + + async def scroll_to_search_result(self, search_results: list[dict], index: int = 0): + """Scroll to the index-th search result, potentially for subsequent perception. + Useful if you have located a search result, the search result does not fulfill your requirement, and you need more information around that search result. Can only be used after search_all_content. + + Args: + search_results (list[dict]): search_results from search_content_all + index (int, optional): the index of the search result to scroll to. Index starts from 0. Defaults to 0. + """ + if not search_results: + return {} + if index >= len(search_results): + print(f"Index {index} is out of range. Scrolling to the last instance.") + index = len(search_results) - 1 + element = search_results[index]["element_obj"] + await element.scroll_into_view_if_needed() + print(f"Successfully scrolled to the {index}-th search result, consider extract more info around it.") + + async def find_links(self) -> list: + """Finds all links in the current page and returns a list of dictionaries with link text and the URL. + Useful for navigating to more pages and exploring more resources. + + Returns: + list: A list of dictionaries, each containing 'text' and 'href' keys. + """ + # Use a CSS selector to find all elements in the page. + links = await self.current_page.query_selector_all("a") + + # Prepare an empty list to hold link information. + link_info = [] + + # Iterate over each link element to extract its text and href attributes. + for link in links: + text = await link.text_content() + href = await link.get_attribute("href") + link_info.append({"text": text, "href": href}) + + print(f"Found {len(link_info)} links:\n\n{link_info}") + + return link_info + + async def extract_info_from_view(self, instruction: str) -> str: + """ + Extract useful info from the current page view. + + Args: + instruction (str): explain what info needs to be extracted + + Returns: + str: extracted info from current view + """ + img_path = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png" + await self.current_page.screenshot(path=img_path) + rsp = await self.llm.aask(msg=instruction, images=[encode_image(img_path)]) + return rsp + + async def scroll_current_page(self, offset: int = 500): + """scroll the current page by offset pixels, negative value means scrolling up, returning the content observed after scrolling""" + await self.current_page.evaluate(f"window.scrollBy(0, {offset})") + print(f"Scrolled current page by {offset} pixels. Perceive the scrolled view if needed") + + def check_all_pages(self) -> dict: + """return all pages opened in the browser, a dictionary with {page_url: page_title}, useful for understanding the current browser state""" + pages_info = {url: page.title() for url, page in self.pages.items()} + return pages_info + + async def close(self): + """close the browser and all pages""" + await self.browser.close() + await self.playwright.stop() + + +async def get_scroll_position(page): + return await page.evaluate("() => ({ x: window.scrollX, y: window.scrollY })") diff --git a/metagpt/utils/common.py b/metagpt/utils/common.py index 1340f32cb..ffc25ac05 100644 --- a/metagpt/utils/common.py +++ b/metagpt/utils/common.py @@ -783,13 +783,15 @@ def load_mc_skills_code(skill_names: list[str] = None, skills_dir: Path = None) return skills -def encode_image(image_path_or_pil: Union[Path, Image], encoding: str = "utf-8") -> str: +def encode_image(image_path_or_pil: Union[Path, Image, str], encoding: str = "utf-8") -> str: """encode image from file or PIL.Image into base64""" if isinstance(image_path_or_pil, Image.Image): buffer = BytesIO() image_path_or_pil.save(buffer, format="JPEG") bytes_data = buffer.getvalue() else: + if isinstance(image_path_or_pil, str): + image_path_or_pil = Path(image_path_or_pil) if not image_path_or_pil.exists(): raise FileNotFoundError(f"{image_path_or_pil} not exists") with open(str(image_path_or_pil), "rb") as image_file: diff --git a/tests/metagpt/tools/libs/test_browser.py b/tests/metagpt/tools/libs/test_browser.py new file mode 100644 index 000000000..0c3009fef --- /dev/null +++ b/tests/metagpt/tools/libs/test_browser.py @@ -0,0 +1,90 @@ +import pytest + +from metagpt.const import TEST_DATA_PATH +from metagpt.tools.libs.browser import Browser, get_scroll_position + +TEST_URL = "https://docs.deepwisdom.ai/main/en/guide/get_started/quickstart.html" + +TEST_SCREENSHOT_PATH = TEST_DATA_PATH / "screenshot.png" + + +@pytest.fixture(autouse=True) +def llm_mock(rsp_cache, mocker, request): + # An empty fixture to overwrite the global llm_mock fixture + # because in provider folder, we want to test the aask and aask functions for the specific models + pass + + +@pytest.fixture +def browser(): + browser_instance = Browser() + yield browser_instance + + +@pytest.mark.asyncio +async def test_open_and_switch_page(browser): + await browser.start() + + await browser.open_new_page("https://baidu.com") + await browser.open_new_page("https://tencent.com") + assert browser.current_page_url == "https://tencent.com" + await browser.switch_page("https://baidu.com") + assert browser.current_page_url == "https://baidu.com" + + await browser.close() + + +@pytest.mark.asyncio +async def test_search(browser): + await browser.start() + + # search all + await browser.open_new_page(TEST_URL) + search_term = "startup example" + search_results = await browser.search_content_all(search_term) + print(search_results) + # expected search result as of 20240410: + # [{'index': 0, 'content': {'text_block': 'Below is a breakdown of the software startup example. If you install MetaGPT with the git clone approach, simply run', 'links': [{'text': 'software startup example', 'href': 'https://github.com/geekan/MetaGPT/blob/main/metagpt/software_company.py'}]}, 'position': {'from_top': 640, 'from_left': 225}, 'element_obj': selector='text=startup example >> nth=0'>}] + first_result = search_results[0]["content"] + assert "software startup example" in first_result["text_block"] + assert first_result["links"] + assert first_result["links"][0]["href"] == "https://github.com/geekan/MetaGPT/blob/main/metagpt/software_company.py" + assert search_results[0]["position"] + + # scroll to search result + await browser.scroll_to_search_result(search_results, index=0) + + # perceive current view + rsp = await browser.extract_info_from_view("what is the command to run exactly?") + assert "metagpt" in rsp + + await browser.close() + + +@pytest.mark.asyncio +async def test_find_links(browser): + await browser.start() + + await browser.open_new_page(TEST_URL) + link_info = await browser.find_links() + assert link_info + + await browser.close() + + +@pytest.mark.asyncio +async def test_scroll(browser): + await browser.start() + + await browser.open_new_page(TEST_URL) + + await browser.scroll_current_page(offset=-500) + assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 0} # no change if you scrol up from top + + await browser.scroll_current_page(offset=500) # scroll down + assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 500} + + await browser.scroll_current_page(offset=-200) # scroll up + assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 300} + + await browser.close() From 391eda6a35c9d05170e925657b9e8a78401fc4ba Mon Sep 17 00:00:00 2001 From: yzlin Date: Thu, 11 Apr 2024 14:30:38 +0800 Subject: [PATCH 12/20] add async logs for browser --- metagpt/logs.py | 16 ++++++++++++++++ metagpt/tools/libs/browser.py | 26 ++++++++++++++++++++------ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/metagpt/logs.py b/metagpt/logs.py index 480477e6b..b208e0868 100644 --- a/metagpt/logs.py +++ b/metagpt/logs.py @@ -54,6 +54,11 @@ def log_tool_output(output: ToolLogItem | list[ToolLogItem], tool_name: str = "" _tool_output_log(output=output, tool_name=tool_name) +async def log_tool_output_async(output: ToolLogItem | list[ToolLogItem], tool_name: str = ""): + """async interface for logging tool output, used when output contains async object""" + await _tool_output_log_async(output=output, tool_name=tool_name) + + def set_llm_stream_logfunc(func): global _llm_stream_log _llm_stream_log = func @@ -64,9 +69,20 @@ def set_tool_output_logfunc(func): _tool_output_log = func +async def set_tool_output_logfunc_async(func): + # async version + global _tool_output_log_async + _tool_output_log_async = func + + _llm_stream_log = partial(print, end="") _tool_output_log = ( lambda *args, **kwargs: None ) # a dummy function to avoid errors if set_tool_output_logfunc is not called + + +async def _tool_output_log_async(*args, **kwargs): + # async version + pass diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py index 0a73e9fc7..48b1cab69 100644 --- a/metagpt/tools/libs/browser.py +++ b/metagpt/tools/libs/browser.py @@ -1,6 +1,7 @@ from playwright.async_api import async_playwright from metagpt.const import DEFAULT_WORKSPACE_ROOT +from metagpt.logs import ToolLogItem, log_tool_output_async from metagpt.tools.tool_registry import register_tool from metagpt.utils.common import encode_image @@ -32,21 +33,28 @@ class Browser: self.playwright = await async_playwright().start() self.browser = await self.playwright.chromium.launch() + def _set_current_page(self, page, url): + self.current_page = page + self.current_page_url = url + print("Now on page ", url) + async def open_new_page(self, url: str): """open a new page in the browser, set it as the current page""" page = await self.browser.new_page() await page.goto(url) self.pages[url] = page - self.current_page = page - self.current_page_url = url - print(f"Opened new page: {url}") + self._set_current_page(page, url) + await log_tool_output_async( + ToolLogItem(type="object", name="open_new_page", value=self.current_page), tool_name="Browser" + ) async def switch_page(self, url: str): """switch to an opened page in the browser, set it as the current page""" if url in self.pages: - self.current_page = self.pages[url] - self.current_page_url = url - print(f"Switched to page: {url}") + self._set_current_page(self.pages[url], url) + await log_tool_output_async( + ToolLogItem(type="object", name="switch_page", value=self.current_page), tool_name="Browser" + ) else: print(f"Page not found: {url}") @@ -124,6 +132,9 @@ class Browser: element = search_results[index]["element_obj"] await element.scroll_into_view_if_needed() print(f"Successfully scrolled to the {index}-th search result, consider extract more info around it.") + await log_tool_output_async( + ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser" + ) async def find_links(self) -> list: """Finds all links in the current page and returns a list of dictionaries with link text and the URL. @@ -167,6 +178,9 @@ class Browser: """scroll the current page by offset pixels, negative value means scrolling up, returning the content observed after scrolling""" await self.current_page.evaluate(f"window.scrollBy(0, {offset})") print(f"Scrolled current page by {offset} pixels. Perceive the scrolled view if needed") + await log_tool_output_async( + ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser" + ) def check_all_pages(self) -> dict: """return all pages opened in the browser, a dictionary with {page_url: page_title}, useful for understanding the current browser state""" From 5376a869298b26eafc74a952a352e81b712bdacb Mon Sep 17 00:00:00 2001 From: yzlin Date: Thu, 11 Apr 2024 16:55:52 +0800 Subject: [PATCH 13/20] fix tool convert bug and add more tests --- metagpt/tools/tool_convert.py | 5 ++--- tests/metagpt/tools/test_tool_convert.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/metagpt/tools/tool_convert.py b/metagpt/tools/tool_convert.py index 829269b1b..a84cbeea0 100644 --- a/metagpt/tools/tool_convert.py +++ b/metagpt/tools/tool_convert.py @@ -20,8 +20,7 @@ def convert_code_to_tool_schema(obj, include: list[str] = None) -> dict: continue # method_doc = inspect.getdoc(method) method_doc = get_class_method_docstring(obj, name) - if method_doc: - schema["methods"][name] = function_docstring_to_schema(method, method_doc) + schema["methods"][name] = function_docstring_to_schema(method, method_doc) elif inspect.isfunction(obj): schema = function_docstring_to_schema(obj, docstring) @@ -39,7 +38,7 @@ def convert_code_to_tool_schema_ast(code: str) -> list[dict]: return visitor.get_tool_schemas() -def function_docstring_to_schema(fn_obj, docstring) -> dict: +def function_docstring_to_schema(fn_obj, docstring="") -> dict: """ Converts a function's docstring into a schema dictionary. diff --git a/tests/metagpt/tools/test_tool_convert.py b/tests/metagpt/tools/test_tool_convert.py index 4798d32b0..5aa53ce4f 100644 --- a/tests/metagpt/tools/test_tool_convert.py +++ b/tests/metagpt/tools/test_tool_convert.py @@ -48,6 +48,14 @@ class DummyClass: pass +class DummySubClass(DummyClass): + """sub class docstring""" + + def sub_method(self, df: pd.DataFrame): + """sub method""" + pass + + def dummy_fn( df: pd.DataFrame, s: str, @@ -117,6 +125,18 @@ def test_convert_code_to_tool_schema_class(): assert schema == expected +def test_convert_code_to_tool_schema_subclass(): + schema = convert_code_to_tool_schema(DummySubClass) + assert "sub_method" in schema["methods"] # sub class method should be included + assert "fit" in schema["methods"] # parent class method should be included + + +def test_convert_code_to_tool_schema_include(): + schema = convert_code_to_tool_schema(DummyClass, include=["fit"]) + assert "fit" in schema["methods"] + assert "transform" not in schema["methods"] + + def test_convert_code_to_tool_schema_function(): expected = { "type": "function", From 914fc4358ce1389ac790dc3fab63bb3f294039a4 Mon Sep 17 00:00:00 2001 From: yzlin Date: Thu, 11 Apr 2024 21:13:53 +0800 Subject: [PATCH 14/20] rm web imitation sop, extend to web operation --- metagpt/actions/di/detect_intent.py | 52 ++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/metagpt/actions/di/detect_intent.py b/metagpt/actions/di/detect_intent.py index bfefcacff..eb7f479e1 100644 --- a/metagpt/actions/di/detect_intent.py +++ b/metagpt/actions/di/detect_intent.py @@ -39,35 +39,29 @@ class SOPItem(Enum): "Stage and commit changes for the project repository using Git.", ], ) - FIX_BUGS = SOPItemDef( - name="fix bugs", - description="Fix bugs in a given project.", - sop=[ - "Fix bugs in the project repository.", - "Stage and commit changes for the project repository using Git.", - ], - ) - FORMAT_REPO = SOPItemDef( - name="format repo", - description="download repository from git and format the project to MetaGPT project", - sop=[ - "Imports a project from a Git website and formats it to MetaGPT project format to enable incremental appending requirements.", - "Stage and commit changes for the project repository using Git.", - ], - ) - WEBPAGE_IMITATION = SOPItemDef( - name="webpage_imitation", - description="webpage browsing, imitation and other applications etc.", - sop=[ - "Utilize Selenium and WebDriver for rendering.", - "Capture a screenshot of the rendered webpage.", - "Convert image to a webpage including HTML, CSS and JS in one go.", - ], + # FIX_BUGS = SOPItemDef( + # name="fix bugs", + # description="Fix bugs in a given project.", + # sop=[ + # "Fix bugs in the project repository.", + # "Stage and commit changes for the project repository using Git.", + # ], + # ) + # FORMAT_REPO = SOPItemDef( + # name="format repo", + # description="download repository from git and format the project to MetaGPT project", + # sop=[ + # "Imports a project from a Git website and formats it to MetaGPT project format to enable incremental appending requirements.", + # "Stage and commit changes for the project repository using Git.", + # ], + # ) + WEB_OPERATION = SOPItemDef( + name="web operation", + description="web browsing, scraping, imitation and other interaction with the web", ) OTHER = SOPItemDef( name="other", description="Other intentions that do not fall into the above categories, including data science, machine learning, deep learning and text-to-image etc.", - sop=[], ) @property @@ -130,7 +124,13 @@ class DetectIntent(Action): async def main(): # Example usage of the DetectIntent action - user_requirements = ["Develop a 2048 game.", "Run data analysis on sklearn wine dataset"] + user_requirements = [ + "Develop a 2048 game.", + "Run data analysis on sklearn wine dataset", + "帮我把pip的源设置成:https://pypi.tuna.tsinghua.edu.cn/simple", + "This is a website url does not require login: https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767 please write a similar web page,developed in vue language, The package.json dependency must be generated", + "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?", + ] detect_intent = DetectIntent() for user_requirement in user_requirements: From 19cd30c3941a39a0f28ad144c493f41275d19fba Mon Sep 17 00:00:00 2001 From: yzlin Date: Thu, 11 Apr 2024 21:52:19 +0800 Subject: [PATCH 15/20] minor phrasing update --- metagpt/actions/di/detect_intent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagpt/actions/di/detect_intent.py b/metagpt/actions/di/detect_intent.py index eb7f479e1..c24f0679f 100644 --- a/metagpt/actions/di/detect_intent.py +++ b/metagpt/actions/di/detect_intent.py @@ -29,7 +29,7 @@ class SOPItemDef(BaseModel): class SOPItem(Enum): SOFTWARE_DEVELOPMENT = SOPItemDef( name="software development", - description="Intentions related to or including software development, such as developing or building software, games, app, websites, etc. Excluding bug fixes, report any issues, environment setup, operations and pip install.", + description="Software development intention including developing or building software, games, app, websites, etc. EXCLUDING bug fixes, report any issues, environment setup, terminal operations, and pip install.", sop=[ "Writes a PRD based on software requirements.", "Writes a design to the project repository, based on the PRD of the project.", @@ -61,7 +61,7 @@ class SOPItem(Enum): ) OTHER = SOPItemDef( name="other", - description="Other intentions that do not fall into the above categories, including data science, machine learning, deep learning and text-to-image etc.", + description="Other intentions that do not fall into the above categories, including data science, data analysis, machine learning, deep learning and text-to-image etc.", ) @property From 9139227a8a9bed282bce713644c572454f41b1e6 Mon Sep 17 00:00:00 2001 From: yzlin Date: Fri, 12 Apr 2024 10:27:56 +0800 Subject: [PATCH 16/20] add service deployer --- metagpt/tools/libs/deployer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 metagpt/tools/libs/deployer.py diff --git a/metagpt/tools/libs/deployer.py b/metagpt/tools/libs/deployer.py new file mode 100644 index 000000000..3c9c2f7e5 --- /dev/null +++ b/metagpt/tools/libs/deployer.py @@ -0,0 +1,11 @@ +from metagpt.logs import ToolLogItem, log_tool_output +from metagpt.tools.tool_registry import register_tool + + +# An un-implemented tool reserved for deploying a local service to public +@register_tool() +class Deployer: + """Deploy a local service to public. Used only for final deployment, you should NOT use it for development and testing.""" + + def deploy_to_public(self, local_url: str): + log_tool_output(ToolLogItem(name="local_url", value=local_url), tool_name="Deployer") From a9bbdc92a1202acf3c24572d2d06e7920756731a Mon Sep 17 00:00:00 2001 From: yzlin Date: Fri, 12 Apr 2024 10:32:21 +0800 Subject: [PATCH 17/20] add deployer --- metagpt/tools/libs/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metagpt/tools/libs/__init__.py b/metagpt/tools/libs/__init__.py index 92f73ea54..d807a7d41 100644 --- a/metagpt/tools/libs/__init__.py +++ b/metagpt/tools/libs/__init__.py @@ -14,6 +14,7 @@ from metagpt.tools.libs import ( terminal, file_manager, browser, + deployer, ) from metagpt.tools.libs.software_development import ( write_prd, @@ -42,4 +43,5 @@ _ = ( terminal, file_manager, browser, + deployer, ) # Avoid pre-commit error From 99774418aff62c7e3cbfc1bfc110a3014242beec Mon Sep 17 00:00:00 2001 From: seeker Date: Fri, 12 Apr 2024 11:38:15 +0800 Subject: [PATCH 18/20] update: The teminal tool adds Conda environment support for daemon mode running --- examples/di/run_flask.py | 20 ++++++++++ examples/di/use_github_repo.py | 3 +- metagpt/tools/libs/__init__.py | 2 + metagpt/tools/libs/terminal.py | 72 ++++++++++++++++++++++++++++------ 4 files changed, 85 insertions(+), 12 deletions(-) create mode 100644 examples/di/run_flask.py diff --git a/examples/di/run_flask.py b/examples/di/run_flask.py new file mode 100644 index 000000000..ed0f35b8e --- /dev/null +++ b/examples/di/run_flask.py @@ -0,0 +1,20 @@ +import asyncio + +from metagpt.roles.di.data_interpreter import DataInterpreter + + +USE_GOT_REPO_REQ = """ +Write a service using Flask, create a conda environment and run it, and call the service's interface for validation. +Notice: Don't write all codes in one response, each time, just write code for one step. +""" +# If you have created a conda environment, you can say: +# I have created the conda environment '{env_name}', please use this environment to execute. + + +async def main(): + di = DataInterpreter(tools=["Terminal", "FileManager"]) + await di.run(USE_GOT_REPO_REQ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/di/use_github_repo.py b/examples/di/use_github_repo.py index ad541d2d9..7327f4597 100644 --- a/examples/di/use_github_repo.py +++ b/examples/di/use_github_repo.py @@ -5,7 +5,8 @@ from metagpt.roles.di.data_interpreter import DataInterpreter USE_GOT_REPO_REQ = """ This is a link to the GOT github repo: https://github.com/spcl/graph-of-thoughts.git. Clone it, read the README to understand the usage, install it, and finally run the quick start example. -**Note the config for LLM is at `config/config_got.json`, use this path directly.** Don't write all codes in one response, each time, just write code for one step. +**Note the config for LLM is at `config/config_got.json`, it's outside the repo path, before using it, you need to copy it into graph-of-thoughts. +** Don't write all codes in one response, each time, just write code for one step. """ diff --git a/metagpt/tools/libs/__init__.py b/metagpt/tools/libs/__init__.py index cd70d9811..fb96db735 100644 --- a/metagpt/tools/libs/__init__.py +++ b/metagpt/tools/libs/__init__.py @@ -12,6 +12,7 @@ from metagpt.tools.libs import ( web_scraping, email_login, terminal, + file_manager, ) from metagpt.tools.libs.software_development import ( write_prd, @@ -38,4 +39,5 @@ _ = ( fix_bug, git_archive, terminal, + file_manager, ) # Avoid pre-commit error diff --git a/metagpt/tools/libs/terminal.py b/metagpt/tools/libs/terminal.py index a23ebb86a..ee0d155cb 100644 --- a/metagpt/tools/libs/terminal.py +++ b/metagpt/tools/libs/terminal.py @@ -1,4 +1,6 @@ import subprocess +import threading +from queue import Queue from metagpt.logs import TOOL_LOG_END_MARKER, ToolLogItem, log_tool_output from metagpt.tools.tool_registry import register_tool @@ -6,7 +8,12 @@ from metagpt.tools.tool_registry import register_tool @register_tool() class Terminal: - """A tool for running terminal commands. Don't initialize a new instance of this class if one already exists.""" + """ + A tool for running terminal commands. + Don't initialize a new instance of this class if one already exists. + For commands that need to be executed within a Conda environment, it is recommended + to use the `execute_in_conda_env` method. + """ def __init__(self): self.shell_command = ["bash"] # FIXME: should consider windows support later @@ -21,20 +28,31 @@ class Terminal: text=True, bufsize=1, # Line buffered ) + self.stdout_queue = Queue() - def run_command(self, cmd: str) -> str: + def run_command(self, cmd: str, daemon=False) -> str: """ - Run a command in the terminal and return the output. - When the command is being executed, stream the output to the terminal. - Maintains state across commands, such as current directory. + Executes a specified command in the terminal and streams the output back in real time. + This command maintains state across executions, such as the current directory, + allowing for sequential commands to be contextually aware. The output from the + command execution is placed into `stdout_queue`, which can be consumed as needed. Args: - cmd (str): The command to run in the terminal. + cmd (str): The command to execute in the terminal. + daemon (bool): If True, executes the command in a background thread, allowing + the main program to continue execution. The command's output is + collected asynchronously in daemon mode and placed into `stdout_queue`. Returns: - str: The output of the terminal command. + str: The command's output or an empty string if `daemon` is True. Remember that + when `daemon` is True, the output is collected into `stdout_queue` and must + be consumed from there. + + Note: + If `stdout_queue` is not periodically consumed, it could potentially grow indefinitely, + consuming memory. Ensure that there's a mechanism in place to consume this queue, + especially during long-running or output-heavy command executions. """ - cmd_output = [] # Send the command self.process.stdin.write(cmd + self.command_terminator) @@ -42,6 +60,38 @@ class Terminal: f'echo "{TOOL_LOG_END_MARKER.value}"' + self.command_terminator ) # Unique marker to signal command end self.process.stdin.flush() + if daemon: + threading.Thread(target=self._read_and_process_output, args=(cmd,), daemon=True).start() + return "" + else: + return self._read_and_process_output(cmd) + + def execute_in_conda_env(self, cmd: str, env, daemon=False) -> str: + """ + Executes a given command within a specified Conda environment automatically without + the need for manual activation. Users just need to provide the name of the Conda + environment and the command to execute. + + Args: + cmd (str): The command to execute within the Conda environment. + env (str, optional): The name of the Conda environment to activate before executing the command. + If not specified, the command will run in the current active environment. + daemon (bool): If True, the command is run in a background thread, similar to `run_command`, + affecting error logging and handling in the same manner. + + Returns: + str: The command's output, or an empty string if `daemon` is True, with output processed + asynchronously in that case. + + Note: + This function wraps `run_command`, prepending the necessary Conda activation commands + to ensure the specified environment is active for the command's execution. + """ + cmd = f"conda run -n {env} {cmd}" + return self.run_command(cmd, daemon=daemon) + + def _read_and_process_output(self, cmd): + cmd_output = [] log_tool_output( output=ToolLogItem(name="cmd", value=cmd + self.command_terminator), tool_name="Terminal" ) # log the command @@ -52,10 +102,10 @@ class Terminal: if line.strip() == TOOL_LOG_END_MARKER.value: log_tool_output(TOOL_LOG_END_MARKER) break - log_tool_output( - output=ToolLogItem(name="output", value=line), tool_name="Terminal" - ) # log stdout in real-time + # log stdout in real-time + log_tool_output(output=ToolLogItem(name="output", value=line), tool_name="Terminal") cmd_output.append(line) + self.stdout_queue.put(line) return "".join(cmd_output) From c6e42631dacb45b3aaec4c9600310adce1170b73 Mon Sep 17 00:00:00 2001 From: yzlin Date: Fri, 12 Apr 2024 14:15:40 +0800 Subject: [PATCH 19/20] improve browser, rm vision, add text view, comment out find_links --- examples/di/imitate_webpage.py | 6 +- examples/di/use_browser.py | 7 +- metagpt/tools/libs/browser.py | 144 +++++++++++++---------- tests/metagpt/tools/libs/test_browser.py | 22 ++-- 4 files changed, 101 insertions(+), 78 deletions(-) diff --git a/examples/di/imitate_webpage.py b/examples/di/imitate_webpage.py index 60ebab389..d181e0dfc 100644 --- a/examples/di/imitate_webpage.py +++ b/examples/di/imitate_webpage.py @@ -11,10 +11,10 @@ from metagpt.roles.di.data_interpreter import DataInterpreter async def main(): web_url = "https://pytorch.org/" prompt = f"""This is a URL of webpage: '{web_url}' . -Firstly, utilize Selenium and WebDriver for rendering. -Secondly, convert image to a webpage including HTML, CSS and JS in one go. +Firstly, open the page and take a screenshot of the page. +Secondly, convert the image to a webpage including HTML, CSS and JS in one go. Note: All required dependencies and environments have been fully installed and configured.""" - di = DataInterpreter(tools=["GPTvGenerator"]) + di = DataInterpreter(tools=["GPTvGenerator", "Browser"]) await di.run(prompt) diff --git a/examples/di/use_browser.py b/examples/di/use_browser.py index 6dfc8de24..a3a079ccc 100644 --- a/examples/di/use_browser.py +++ b/examples/di/use_browser.py @@ -2,20 +2,23 @@ import asyncio from metagpt.roles.di.data_interpreter import DataInterpreter -# an example to showcase navigation MG_LLM_CONFIG_REQ = """ This is a link to the doc site of MetaGPT project: https://docs.deepwisdom.ai/main/en/ Check where you can go to on the site and try to find out the list of LLM APIs supported by MetaGPT. Don't write all codes in one response, each time, just write code for one step. """ -# an example to showcase searching PAPER_LIST_REQ = """" At https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/, find the first paper whose title includes `multiagent`, open it and summarize its abstract. Don't write all codes in one response, each time, just write code for one step. """ +DESCRIBE_GITHUB_ISSUE_REQ = """ +Visit https://github.com/geekan/MetaGPT, navigate to Issues page, open the first issue related to DataInterpreter, then summarize what the issue is in one sentence. +Don't write all codes in one response, each time, just write code for one step. +""" + async def main(): di = DataInterpreter(tools=["Browser"], react_mode="react") diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py index 48b1cab69..b6a5b7cbf 100644 --- a/metagpt/tools/libs/browser.py +++ b/metagpt/tools/libs/browser.py @@ -3,26 +3,19 @@ from playwright.async_api import async_playwright from metagpt.const import DEFAULT_WORKSPACE_ROOT from metagpt.logs import ToolLogItem, log_tool_output_async from metagpt.tools.tool_registry import register_tool -from metagpt.utils.common import encode_image @register_tool() class Browser: """ A tool for browsing the web. Don't initialize a new instance of this class if one already exists. - Note: Combine searching, scrolling, extraction, and link finding together to achieve most effective browsing. DON'T stick to one method. + Note: Combine searching and scrolling together to achieve most effective browsing. DON'T stick to one method. """ def __init__(self): """initiate the browser, create pages placeholder later to be managed as {page_url: page object}""" self.browser = None - from metagpt.config2 import config - from metagpt.llm import LLM - - self.llm = LLM(llm_config=config.get_openai_llm()) - self.llm.model = "gpt-4-vision-preview" - # browser status management self.pages = {} self.current_page_url = None @@ -33,25 +26,26 @@ class Browser: self.playwright = await async_playwright().start() self.browser = await self.playwright.chromium.launch() - def _set_current_page(self, page, url): + async def _set_current_page(self, page, url): self.current_page = page self.current_page_url = url print("Now on page ", url) + print(await self._view()) async def open_new_page(self, url: str): - """open a new page in the browser, set it as the current page""" + """open a new page in the browser and view the page""" page = await self.browser.new_page() await page.goto(url) self.pages[url] = page - self._set_current_page(page, url) + await self._set_current_page(page, url) await log_tool_output_async( ToolLogItem(type="object", name="open_new_page", value=self.current_page), tool_name="Browser" ) async def switch_page(self, url: str): - """switch to an opened page in the browser, set it as the current page""" + """switch to an opened page in the browser and view the page""" if url in self.pages: - self._set_current_page(self.pages[url], url) + await self._set_current_page(self.pages[url], url) await log_tool_output_async( ToolLogItem(type="object", name="switch_page", value=self.current_page), tool_name="Browser" ) @@ -91,22 +85,7 @@ class Browser: position = await element.evaluate("e => ({ from_top: e.offsetTop, from_left: e.offsetLeft })") # Retrieve the surrounding block of text and links with their text - content = await element.evaluate( - """ - (element) => { - // const block = element.closest('p, div, section, article'); - const block = element.parentElement; - return { - text_block: block.innerText, - // Create an array of objects, each containing the text and href of a link - links: Array.from(block.querySelectorAll('a')).map(a => ({ - text: a.innerText, - href: a.href - })) - }; - } - """ - ) + content = await element.evaluate(SEARCH_CONTENT_JS) search_results.append( {"index": len(search_results), "content": content, "position": position, "element_obj": element} @@ -131,56 +110,53 @@ class Browser: index = len(search_results) - 1 element = search_results[index]["element_obj"] await element.scroll_into_view_if_needed() - print(f"Successfully scrolled to the {index}-th search result, consider extract more info around it.") await log_tool_output_async( ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser" ) + print(f"Successfully scrolled to the {index}-th search result") + print(await self._view()) - async def find_links(self) -> list: - """Finds all links in the current page and returns a list of dictionaries with link text and the URL. - Useful for navigating to more pages and exploring more resources. + # async def find_links(self) -> list: + # """Finds all links in the current page and returns a list of dictionaries with link text and the URL. + # Useful for navigating to more pages and exploring more resources. - Returns: - list: A list of dictionaries, each containing 'text' and 'href' keys. - """ - # Use a CSS selector to find all elements in the page. - links = await self.current_page.query_selector_all("a") + # Returns: + # list: A list of dictionaries, each containing 'text' and 'href' keys. + # """ + # # Use a CSS selector to find all elements in the page. + # links = await self.current_page.query_selector_all("a") - # Prepare an empty list to hold link information. - link_info = [] + # # Prepare an empty list to hold link information. + # link_info = [] - # Iterate over each link element to extract its text and href attributes. - for link in links: - text = await link.text_content() - href = await link.get_attribute("href") - link_info.append({"text": text, "href": href}) + # # Iterate over each link element to extract its text and href attributes. + # for link in links: + # text = await link.text_content() + # href = await link.get_attribute("href") + # link_info.append({"text": text, "href": href}) - print(f"Found {len(link_info)} links:\n\n{link_info}") + # print(f"Found {len(link_info)} links:\n\n{link_info}") - return link_info + # return link_info - async def extract_info_from_view(self, instruction: str) -> str: - """ - Extract useful info from the current page view. + async def screenshot(self, path: str = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png"): + """Take a screenshot of the current page and save it to the specified path.""" + await self.current_page.screenshot(path=path) + print(f"Screenshot saved to: {path}") - Args: - instruction (str): explain what info needs to be extracted - - Returns: - str: extracted info from current view - """ - img_path = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png" - await self.current_page.screenshot(path=img_path) - rsp = await self.llm.aask(msg=instruction, images=[encode_image(img_path)]) - return rsp + async def _view(self) -> str: + """simulate human viewing the current page, return the visible text with links""" + visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS) + return visible_text_with_links async def scroll_current_page(self, offset: int = 500): - """scroll the current page by offset pixels, negative value means scrolling up, returning the content observed after scrolling""" + """scroll the current page by offset pixels, negative value means scrolling up, will print out observed content after scrolling""" await self.current_page.evaluate(f"window.scrollBy(0, {offset})") - print(f"Scrolled current page by {offset} pixels. Perceive the scrolled view if needed") await log_tool_output_async( ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser" ) + print(f"Scrolled current page by {offset} pixels.") + print(await self._view()) def check_all_pages(self) -> dict: """return all pages opened in the browser, a dictionary with {page_url: page_title}, useful for understanding the current browser state""" @@ -195,3 +171,47 @@ class Browser: async def get_scroll_position(page): return await page.evaluate("() => ({ x: window.scrollX, y: window.scrollY })") + + +SEARCH_CONTENT_JS = """ +(element) => { + // const block = element.closest('p, div, section, article'); + const block = element.parentElement; + return { + text_block: block.innerText, + // Create an array of objects, each containing the text and href of a link + links: Array.from(block.querySelectorAll('a')).map(a => ({ + text: a.innerText, + href: a.href + })) + }; +} +""" + + +VIEW_CONTENT_JS = """ +() => { + return Array.from(document.querySelectorAll('body *')).filter(el => { + if (!(el.offsetWidth || el.offsetHeight || el.getClientRects().length)) return false; + const style = window.getComputedStyle(el); + if (style.display === 'none' || style.visibility !== 'visible' || style.opacity === '0') return false; + const rect = el.getBoundingClientRect(); + const elemCenter = { + x: rect.left + rect.width / 2, + y: rect.top + rect.height / 2 + }; + if (elemCenter.x < 0 || elemCenter.y < 0 || elemCenter.x > window.innerWidth || elemCenter.y > window.innerHeight) return false; + if (document.elementFromPoint(elemCenter.x, elemCenter.y) !== el) return false; + return true; + }).map(el => { + let text = el.innerText || ''; + text = text.trim(); + if (!text.length) return ''; + const parentAnchor = el.closest('a'); + if (parentAnchor && parentAnchor.href) { + return `${text} (${parentAnchor.href})`; + } + return text; + }).filter(text => text.length > 0).join("\\n"); +} +""" diff --git a/tests/metagpt/tools/libs/test_browser.py b/tests/metagpt/tools/libs/test_browser.py index 0c3009fef..ec0b5c848 100644 --- a/tests/metagpt/tools/libs/test_browser.py +++ b/tests/metagpt/tools/libs/test_browser.py @@ -54,22 +54,18 @@ async def test_search(browser): # scroll to search result await browser.scroll_to_search_result(search_results, index=0) - # perceive current view - rsp = await browser.extract_info_from_view("what is the command to run exactly?") - assert "metagpt" in rsp - await browser.close() -@pytest.mark.asyncio -async def test_find_links(browser): - await browser.start() +# @pytest.mark.asyncio +# async def test_find_links(browser): +# await browser.start() - await browser.open_new_page(TEST_URL) - link_info = await browser.find_links() - assert link_info +# await browser.open_new_page(TEST_URL) +# link_info = await browser.find_links() +# assert link_info - await browser.close() +# await browser.close() @pytest.mark.asyncio @@ -80,9 +76,13 @@ async def test_scroll(browser): await browser.scroll_current_page(offset=-500) assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 0} # no change if you scrol up from top + initial_view = await browser._view() await browser.scroll_current_page(offset=500) # scroll down assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 500} + scrolled_view = await browser._view() + + assert initial_view != scrolled_view await browser.scroll_current_page(offset=-200) # scroll up assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 300} From 5edf78f1507ed76c841b46cf8a697a3ca3b99b5b Mon Sep 17 00:00:00 2001 From: yzlin Date: Fri, 12 Apr 2024 16:01:10 +0800 Subject: [PATCH 20/20] update phrasing for software tool and sop --- metagpt/actions/di/detect_intent.py | 2 +- metagpt/tools/libs/software_development.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/metagpt/actions/di/detect_intent.py b/metagpt/actions/di/detect_intent.py index c24f0679f..6dd51dfd1 100644 --- a/metagpt/actions/di/detect_intent.py +++ b/metagpt/actions/di/detect_intent.py @@ -32,7 +32,7 @@ class SOPItem(Enum): description="Software development intention including developing or building software, games, app, websites, etc. EXCLUDING bug fixes, report any issues, environment setup, terminal operations, and pip install.", sop=[ "Writes a PRD based on software requirements.", - "Writes a design to the project repository, based on the PRD of the project.", + "Writes a system design to the project repository, based on the PRD of the project. Write high-level system design instead of the actual code.", "Writes a project plan to the project repository, based on the design of the project.", "Writes code to implement designed features according to the project plan and adds them to the project repository.", # "Run QA test on the project repository.", diff --git a/metagpt/tools/libs/software_development.py b/metagpt/tools/libs/software_development.py index f8a409878..7367a9331 100644 --- a/metagpt/tools/libs/software_development.py +++ b/metagpt/tools/libs/software_development.py @@ -73,9 +73,9 @@ async def write_prd(idea: str, project_path: Optional[str | Path] = None) -> Pat return ctx.repo.docs.prd.workdir -@register_tool(tags=["software development", "Architect"]) +@register_tool(tags=["Design", "software development", "Architect"]) async def write_design(prd_path: str | Path) -> Path: - """Writes a design to the project repository, based on the PRD of the project. + """Writes a system design to the project repository, based on the PRD of the project. Args: prd_path (str|Path): The path to the PRD files under the project directory. @@ -177,6 +177,7 @@ async def write_project_plan(system_design_path: str | Path) -> Path: @register_tool(tags=["software development", "Engineer"]) async def write_codes(task_path: str | Path, inc: bool = False) -> Path: """Writes code to implement designed features according to the project plan and adds them to the project repository. + In code writing tasks, prioritize calling this tool against writing code from scratch directly. Args: task_path (str|Path): The path to task files under the project directory.