Merge branch 'mgx_ops' into feature-terminal

2026-06-08 15:05:17 +02:00 · 2024-04-12 11:55:40 +08:00 · 2024-04-12 11:55:40 +08:00 · 73afb493de
commit 73afb493de
parent 99774418af 7bb9d706d3
21 changed files with 517 additions and 67 deletions
--- a/examples/di/use_browser.py
+++ b/examples/di/use_browser.py
@ -0,0 +1,26 @@
+import asyncio
+
+from metagpt.roles.di.data_interpreter import DataInterpreter
+
+# an example to showcase navigation
+MG_LLM_CONFIG_REQ = """
+This is a link to the doc site of MetaGPT project: https://docs.deepwisdom.ai/main/en/
+Check where you can go to on the site and try to find out the list of LLM APIs supported by MetaGPT.
+Don't write all codes in one response, each time, just write code for one step.
+"""
+
+# an example to showcase searching: find a paper by a title keyword, open it and summarize its abstract
+PAPER_LIST_REQ = """
+At https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
+find the first paper whose title includes `multiagent`, open it and summarize its abstract.
+Don't write all codes in one response, each time, just write code for one step.
+"""
+
+
+async def main():
+    di = DataInterpreter(tools=["Browser"], react_mode="react")
+    await di.run(MG_LLM_CONFIG_REQ)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/examples/mgx/run_mgx.py
+++ b/examples/mgx/run_mgx.py
@ -6,12 +6,16 @@ import asyncio
 from metagpt.roles.di.mgx import MGX

 requirement = (
-    "design a game using Gym (an open source Python library), including a graphical interface and interactive gameplay"
+    # "design a game using Gym (an open source Python library), including a graphical interface and interactive gameplay"
+    # "帮我把pip的源设置成：https://pypi.tuna.tsinghua.edu.cn/simple"
+    # "This is a website url does not require login: https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767 please write a similar web page,developed in vue language, The package.json dependency must be generated"
+    "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?"
+    # "Create a 2048 Game"
 )


 async def main(requirement: str = ""):
-    mgx = MGX(use_intent=True)
+    mgx = MGX(use_intent=True, tools=["<all>"])
    await mgx.run(requirement)


--- a/metagpt/actions/di/detect_intent.py
+++ b/metagpt/actions/di/detect_intent.py
@ -29,36 +29,38 @@ class SOPItemDef(BaseModel):
 class SOPItem(Enum):
    SOFTWARE_DEVELOPMENT = SOPItemDef(
        name="software development",
-        description="Intentions related to or including software development, such as developing or building software, games, app, websites, etc. Excluding bug fixes, report any issues.",
+        description="Software development intention including developing or building software, games, app, websites, etc. EXCLUDING bug fixes, report any issues, environment setup, terminal operations, and pip install.",
        sop=[
            "Writes a PRD based on software requirements.",
            "Writes a design to the project repository, based on the PRD of the project.",
            "Writes a project plan to the project repository, based on the design of the project.",
            "Writes code to implement designed features according to the project plan and adds them to the project repository.",
-            # "Run QA test on the project repository.",
            "Stage and commit changes for the project repository using Git.",
        ],
    )
-    FIX_BUGS = SOPItemDef(
-        name="fix bugs",
-        description="Fix bugs in a given project.",
-        sop=[
-            "Fix bugs in the project repository.",
-            "Stage and commit changes for the project repository using Git.",
-        ],
-    )
-    FORMAT_REPO = SOPItemDef(
-        name="format repo",
-        description="download repository from git and format the project to MetaGPT project",
-        sop=[
-            "Imports a project from a Git website and formats it to MetaGPT project format to enable incremental appending requirements.",
-            "Stage and commit changes for the project repository using Git.",
-        ],
+    # FIX_BUGS = SOPItemDef(
+    #     name="fix bugs",
+    #     description="Fix bugs in a given project.",
+    #     sop=[
+    #         "Fix bugs in the project repository.",
+    #         "Stage and commit changes for the project repository using Git.",
+    #     ],
+    # )
+    # FORMAT_REPO = SOPItemDef(
+    #     name="format repo",
+    #     description="download repository from git and format the project to MetaGPT project",
+    #     sop=[
+    #         "Imports a project from a Git website and formats it to MetaGPT project format to enable incremental appending requirements.",
+    #         "Stage and commit changes for the project repository using Git.",
+    #     ],
+    # )
+    WEB_OPERATION = SOPItemDef(
+        name="web operation",
+        description="web browsing, scraping, imitation and other interaction with the web",
    )
    OTHER = SOPItemDef(
        name="other",
-        description="Other intentions that do not fall into the above categories, including data science, machine learning, deep learning, etc.",
-        sop=[],
+        description="Other intentions that do not fall into the above categories, including data science, data analysis, machine learning, deep learning and text-to-image etc.",
    )

    @property
@ -86,8 +88,7 @@ Intention index:
 REQ_WITH_SOP = """
 {user_requirement}
 ## Knowledge
-To meet user requirements, the following standard operating procedure(SOP) must be used. 
-SOP descriptions cannot be modified; user requirements can only be appended to the end of corresponding steps.
+To meet user requirements, the following standard operating procedure(SOP) must be used:

 {sop}
 """
@ -97,7 +98,7 @@ class DetectIntent(Action):
    async def run(self, with_message: Message, **kwargs) -> Tuple[str, str]:
        user_requirement = with_message.content
        mappings = {i + 1: si for i, si in enumerate(SOPItem)}
-        intentions = "\n".join([f"{i+1}. {si.type_name}: {si.value.description}" for i, si in enumerate(SOPItem)])
+        intentions = "\n".join([f"{i + 1}. {si.type_name}: {si.value.description}" for i, si in enumerate(SOPItem)])
        prompt = DETECT_PROMPT.format(user_requirement=user_requirement, intentions=intentions)

        rsp = await self._aask(prompt)
@ -110,7 +111,7 @@ class DetectIntent(Action):

        req_with_sop = (
            REQ_WITH_SOP.format(
-                user_requirement=user_requirement, sop="\n".join([f"{i+1}. {v}" for i, v in enumerate(sop)])
+                user_requirement=user_requirement, sop="\n".join([f"{i + 1}. {v}" for i, v in enumerate(sop)])
            )
            if sop
            else user_requirement
@ -121,7 +122,13 @@ class DetectIntent(Action):

 async def main():
    # Example usage of the DetectIntent action
-    user_requirements = ["Develop a 2048 game.", "Run data analysis on sklearn wine dataset"]
+    user_requirements = [
+        "Develop a 2048 game.",
+        "Run data analysis on sklearn wine dataset",
+        "帮我把pip的源设置成：https://pypi.tuna.tsinghua.edu.cn/simple",
+        "This is a website url does not require login: https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767 please write a similar web page,developed in vue language, The package.json dependency must be generated",
+        "I would like to imitate the website available at https://demosc.chinaz.net/Files/DownLoad//moban/202404/moban7767. Could you please browse through it?",
+    ]
    detect_intent = DetectIntent()

    for user_requirement in user_requirements:
--- a/metagpt/actions/di/execute_nb_code.py
+++ b/metagpt/actions/di/execute_nb_code.py
@ -24,7 +24,10 @@ from rich.panel import Panel
 from rich.syntax import Syntax

 from metagpt.actions import Action
-from metagpt.logs import logger
+from metagpt.const import DEFAULT_WORKSPACE_ROOT
+from metagpt.logs import ToolLogItem, log_tool_output, logger
+
+INSTALL_KEEPLEN = 500


 class ExecuteNbCode(Action):
@ -43,7 +46,7 @@ class ExecuteNbCode(Action):
    ):
        super().__init__(
            nb=nb,
-            nb_client=NotebookClient(nb, timeout=timeout),
+            nb_client=NotebookClient(nb, timeout=timeout, resources={"metadata": {"path": DEFAULT_WORKSPACE_ROOT}}),
            timeout=timeout,
            console=Console(),
            interaction=("ipython" if self.is_ipython() else "terminal"),
@ -206,6 +209,11 @@ class ExecuteNbCode(Action):

            if "!pip" in code:
                success = False
+                outputs = outputs[-INSTALL_KEEPLEN:]
+
+            file_path = DEFAULT_WORKSPACE_ROOT / "code.ipynb"
+            nbformat.write(self.nb, file_path)
+            log_tool_output(ToolLogItem(name="file_path", value=file_path), tool_name="ExecuteNbCode")

            return outputs, success

--- a/metagpt/actions/di/write_plan.py
+++ b/metagpt/actions/di/write_plan.py
@ -16,34 +16,34 @@ from metagpt.schema import Message, Plan, Task
 from metagpt.strategy.task_type import TaskType
 from metagpt.utils.common import CodeParser

+PROMPT_TEMPLATE: str = """
+# Context:
+{context}
+# Available Task Types:
+{task_type_desc}
+# Task:
+Based on the context, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to {max_tasks} tasks.
+If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
+If you encounter errors on the current task, revise and output the current single task only.
+Output a list of jsons following the format:
+```json
+[
+    {{
+        "task_id": str = "unique identifier for a task in plan, can be an ordinal",
+        "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task",
+        "instruction": "what you should do in this task, one short phrase or sentence.",
+        "task_type": "type of this task, should be one of Available Task Types.",
+    }},
+    ...
+]
+```
+"""
+

 class WritePlan(Action):
-    PROMPT_TEMPLATE: str = """
-    # Context:
-    {context}
-    # Available Task Types:
-    {task_type_desc}
-    # Task:
-    Based on the context, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to {max_tasks} tasks.
-    If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.
-    If you encounter errors on the current task, revise and output the current single task only.
-    Output a list of jsons following the format:
-    ```json
-    [
-        {{
-            "task_id": str = "unique identifier for a task in plan, can be an ordinal",
-            "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task",
-            "instruction": "what you should do in this task, one short phrase or sentence",
-            "task_type": "type of this task, should be one of Available Task Types",
-        }},
-        ...
-    ]
-    ```
-    """
-
    async def run(self, context: list[Message], max_tasks: int = 5) -> str:
        task_type_desc = "\n".join([f"- **{tt.type_name}**: {tt.value.desc}" for tt in TaskType])
-        prompt = self.PROMPT_TEMPLATE.format(
+        prompt = PROMPT_TEMPLATE.format(
            context="\n".join([str(ct) for ct in context]), max_tasks=max_tasks, task_type_desc=task_type_desc
        )
        rsp = await self._aask(prompt)
--- a/metagpt/actions/project_management.py
+++ b/metagpt/actions/project_management.py
@ -70,6 +70,7 @@ class WriteTasks(Action):
                dependencies={system_design_doc.root_relative_path},
            )
        await self._update_requirements(task_doc)
+        await self.repo.resources.api_spec_and_task.save_pdf(doc=task_doc)
        return task_doc

    async def _run_new_tasks(self, context):
--- a/metagpt/configs/mermaid_config.py
+++ b/metagpt/configs/mermaid_config.py
@ -13,7 +13,7 @@ from metagpt.utils.yaml_model import YamlModel
 class MermaidConfig(YamlModel):
    """Config for Mermaid"""

-    engine: Literal["nodejs", "ink", "playwright", "pyppeteer"] = "nodejs"
+    engine: Literal["nodejs", "ink", "playwright", "pyppeteer", "none"] = "nodejs"
    path: str = "mmdc"  # mmdc
    puppeteer_config: str = ""
    pyppeteer_path: str = "/usr/bin/google-chrome-stable"
--- a/metagpt/const.py
+++ b/metagpt/const.py
@ -135,3 +135,6 @@ AGGREGATION = "Aggregate"
 # Timeout
 USE_CONFIG_TIMEOUT = 0  # Using llm.timeout configuration.
 LLM_API_TIMEOUT = 300
+
+# Assistant alias
+ASSISTANT_ALIAS = "response"
--- a/metagpt/logs.py
+++ b/metagpt/logs.py
@ -54,6 +54,11 @@ def log_tool_output(output: ToolLogItem | list[ToolLogItem], tool_name: str = ""
    _tool_output_log(output=output, tool_name=tool_name)


+async def log_tool_output_async(output: ToolLogItem | list[ToolLogItem], tool_name: str = ""):
+    """async interface for logging tool output, used when output contains async object"""
+    await _tool_output_log_async(output=output, tool_name=tool_name)
+
+
 def set_llm_stream_logfunc(func):
    global _llm_stream_log
    _llm_stream_log = func
@ -64,9 +69,20 @@ def set_tool_output_logfunc(func):
    _tool_output_log = func


+async def set_tool_output_logfunc_async(func):
+    # async version
+    global _tool_output_log_async
+    _tool_output_log_async = func
+
+
 _llm_stream_log = partial(print, end="")


 _tool_output_log = (
    lambda *args, **kwargs: None
 )  # a dummy function to avoid errors if set_tool_output_logfunc is not called
+
+
+async def _tool_output_log_async(*args, **kwargs):
+    # async version
+    pass
--- a/metagpt/roles/di/mgx.py
+++ b/metagpt/roles/di/mgx.py
@ -4,10 +4,11 @@
 import asyncio
 from typing import Dict

-from metagpt.actions.di.detect_intent import DetectIntent
+from metagpt.actions.di.detect_intent import DetectIntent, SOPItem
 from metagpt.logs import logger
 from metagpt.roles.di.data_interpreter import DataInterpreter
 from metagpt.schema import Message
+from metagpt.tools.tool_recommend import BM25ToolRecommender


 class MGX(DataInterpreter):
@ -18,6 +19,10 @@ class MGX(DataInterpreter):
        todo = DetectIntent(context=self.context)
        request_with_sop, sop_type = await todo.run(user_msg)
        logger.info(f"{sop_type} {request_with_sop}")
+        if sop_type == SOPItem.SOFTWARE_DEVELOPMENT.type_name:
+            self.tool_recommender = BM25ToolRecommender(tools=["software development"])
+        else:
+            self.tool_recommender = BM25ToolRecommender(tools=["<all>"])
        return request_with_sop

    async def _plan_and_act(self) -> Message:
@ -28,6 +33,7 @@ class MGX(DataInterpreter):
        if self.use_intent:  # add mode
            user_message = Message(content=goal, role="user")
            goal = await self._detect_intent(user_message)
+
        logger.info(f"Goal is {goal}")

        await self.planner.update_plan(goal=goal)
--- a/metagpt/strategy/planner.py
+++ b/metagpt/strategy/planner.py
@ -119,7 +119,7 @@ class Planner(BaseModel):
        If human confirms the task result, then we deem the task completed, regardless of whether the code run succeeds;
        if auto mode, then the code run has to succeed for the task to be considered completed.
        """
-        auto_run = auto_run or self.auto_run
+        auto_run = auto_run if auto_run is not None else self.auto_run
        if not auto_run:
            context = self.get_useful_memories()
            review, confirmed = await AskReview().run(
--- a/metagpt/strategy/task_type.py
+++ b/metagpt/strategy/task_type.py
@ -67,6 +67,10 @@ class TaskType(Enum):
        name="email login",
        desc="For logging to an email.",
    )
+    DEVELOP_SOFTWARE = TaskTypeDef(
+        name="develop software",
+        desc="SOP related to develop software such as Writes a PRD, Writes a design, Writes a project plan and Writes code to implement designed features according to the project plan",
+    )

    @property
    def type_name(self):
--- a/metagpt/tools/libs/init.py
+++ b/metagpt/tools/libs/init.py
@ -13,6 +13,7 @@ from metagpt.tools.libs import (
    email_login,
    terminal,
    file_manager,
+    browser,
 )
 from metagpt.tools.libs.software_development import (
    write_prd,
@ -40,4 +41,5 @@ _ = (
    git_archive,
    terminal,
    file_manager,
+    browser,
 )  # Avoid pre-commit error
--- a/metagpt/tools/libs/browser.py
+++ b/metagpt/tools/libs/browser.py
@ -0,0 +1,197 @@
+from playwright.async_api import async_playwright
+
+from metagpt.const import DEFAULT_WORKSPACE_ROOT
+from metagpt.logs import ToolLogItem, log_tool_output_async
+from metagpt.tools.tool_registry import register_tool
+from metagpt.utils.common import encode_image
+
+
+@register_tool()
+class Browser:
+    """
+    A tool for browsing the web. Don't initialize a new instance of this class if one already exists.
+    Note: Combine searching, scrolling, extraction, and link finding together to achieve most effective browsing. DON'T stick to one method.
+    """
+
+    def __init__(self):
+        """initiate the browser, create pages placeholder later to be managed as {page_url: page object}"""
+        self.browser = None
+
+        from metagpt.config2 import config
+        from metagpt.llm import LLM
+
+        self.llm = LLM(llm_config=config.get_openai_llm())
+        self.llm.model = "gpt-4-vision-preview"
+
+        # browser status management
+        self.pages = {}
+        self.current_page_url = None
+        self.current_page = None
+
+    async def start(self):
+        """Starts Playwright and launches a browser"""
+        self.playwright = await async_playwright().start()
+        self.browser = await self.playwright.chromium.launch()
+
+    def _set_current_page(self, page, url):
+        self.current_page = page
+        self.current_page_url = url
+        print("Now on page ", url)
+
+    async def open_new_page(self, url: str):
+        """open a new page in the browser, set it as the current page"""
+        page = await self.browser.new_page()
+        await page.goto(url)
+        self.pages[url] = page
+        self._set_current_page(page, url)
+        await log_tool_output_async(
+            ToolLogItem(type="object", name="open_new_page", value=self.current_page), tool_name="Browser"
+        )
+
+    async def switch_page(self, url: str):
+        """switch to an opened page in the browser, set it as the current page"""
+        if url in self.pages:
+            self._set_current_page(self.pages[url], url)
+            await log_tool_output_async(
+                ToolLogItem(type="object", name="switch_page", value=self.current_page), tool_name="Browser"
+            )
+        else:
+            print(f"Page not found: {url}")
+
+    async def search_content_all(self, search_term: str) -> list[dict]:
+        """search all occurrences of search term in the current page and return the search results with their position.
+        Useful if you have a keyword or sentence in mind and want to quickly narrow down the content relevant to it.
+
+        Args:
+            search_term (str): the search term
+
+        Returns:
+            list[dict]: a list of dictionaries containing the elements and their positions, e.g.
+            [
+                {
+                    "index": ...,
+                    "content": {
+                        "text_block": ...,
+                        "links": [
+                            {"text": ..., "href": ...},
+                            ...
+                        ]
+                    },
+                    "position": {from_top: ..., from_left: ...},
+                },
+                ...
+            ]
+        """
+        locator = self.current_page.locator(f"text={search_term}")
+        count = await locator.count()
+        search_results = []
+        for i in range(count):
+            element = locator.nth(i)
+            if await element.is_visible():
+                position = await element.evaluate("e => ({ from_top: e.offsetTop, from_left: e.offsetLeft })")
+
+                # Retrieve the surrounding block of text and links with their text
+                content = await element.evaluate(
+                    """
+                    (element) => {
+                        // const block = element.closest('p, div, section, article');
+                        const block = element.parentElement;
+                        return {
+                            text_block: block.innerText,
+                            // Create an array of objects, each containing the text and href of a link
+                            links: Array.from(block.querySelectorAll('a')).map(a => ({
+                                text: a.innerText, 
+                                href: a.href
+                            }))
+                        };
+                    }
+                """
+                )
+
+                search_results.append(
+                    {"index": len(search_results), "content": content, "position": position, "element_obj": element}
+                )
+
+        print(f"Found {len(search_results)} instances of the term '{search_term}':\n\n{search_results}")
+
+        return search_results
+
+    async def scroll_to_search_result(self, search_results: list[dict], index: int = 0):
+        """Scroll to the index-th search result, potentially for subsequent perception.
+        Useful if you have located a search result, the search result does not fulfill your requirement, and you need more information around that search result. Can only be used after search_content_all.
+
+        Args:
+            search_results (list[dict]): search_results from search_content_all
+            index (int, optional): the index of the search result to scroll to. Index starts from 0. Defaults to 0.
+        """
+        if not search_results:
+            return {}
+        if index >= len(search_results):
+            print(f"Index {index} is out of range. Scrolling to the last instance.")
+            index = len(search_results) - 1
+        element = search_results[index]["element_obj"]
+        await element.scroll_into_view_if_needed()
+        print(f"Successfully scrolled to the {index}-th search result, consider extract more info around it.")
+        await log_tool_output_async(
+            ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser"
+        )
+
+    async def find_links(self) -> list:
+        """Finds all links in the current page and returns a list of dictionaries with link text and the URL.
+        Useful for navigating to more pages and exploring more resources.
+
+        Returns:
+            list: A list of dictionaries, each containing 'text' and 'href' keys.
+        """
+        # Use a CSS selector to find all <a> elements in the page.
+        links = await self.current_page.query_selector_all("a")
+
+        # Prepare an empty list to hold link information.
+        link_info = []
+
+        # Iterate over each link element to extract its text and href attributes.
+        for link in links:
+            text = await link.text_content()
+            href = await link.get_attribute("href")
+            link_info.append({"text": text, "href": href})
+
+        print(f"Found {len(link_info)} links:\n\n{link_info}")
+
+        return link_info
+
+    async def extract_info_from_view(self, instruction: str) -> str:
+        """
+        Extract useful info from the current page view.
+
+        Args:
+            instruction (str): explain what info needs to be extracted
+
+        Returns:
+            str: extracted info from current view
+        """
+        img_path = DEFAULT_WORKSPACE_ROOT / "screenshot_temp.png"
+        await self.current_page.screenshot(path=img_path)
+        rsp = await self.llm.aask(msg=instruction, images=[encode_image(img_path)])
+        return rsp
+
+    async def scroll_current_page(self, offset: int = 500):
+        """scroll the current page by offset pixels, negative value means scrolling up, returning the content observed after scrolling"""
+        await self.current_page.evaluate(f"window.scrollBy(0, {offset})")
+        print(f"Scrolled current page by {offset} pixels. Perceive the scrolled view if needed")
+        await log_tool_output_async(
+            ToolLogItem(type="object", name="scroll_page", value=self.current_page), tool_name="Browser"
+        )
+
+    async def check_all_pages(self) -> dict:
+        """return all pages opened in the browser, a dictionary with {page_url: page_title}, useful for understanding the current browser state. This is a coroutine and must be awaited."""
+        pages_info = {url: await page.title() for url, page in self.pages.items()}  # title() is a coroutine in the async API
+        return pages_info
+
+    async def close(self):
+        """close the browser and all pages"""
+        await self.browser.close()
+        await self.playwright.stop()
+
+
+async def get_scroll_position(page):
+    return await page.evaluate("() => ({ x: window.scrollX, y: window.scrollY })")
--- a/metagpt/tools/libs/software_development.py
+++ b/metagpt/tools/libs/software_development.py
@ -5,11 +5,12 @@ from __future__ import annotations
 from pathlib import Path
 from typing import Optional

-from metagpt.const import BUGFIX_FILENAME, REQUIREMENT_FILENAME
+from metagpt.const import ASSISTANT_ALIAS, BUGFIX_FILENAME, REQUIREMENT_FILENAME
 from metagpt.logs import ToolLogItem, log_tool_output
 from metagpt.schema import BugFixContext, Message
 from metagpt.tools.tool_registry import register_tool
 from metagpt.utils.common import any_to_str
+from metagpt.utils.project_repo import ProjectRepo


@register_tool(tags=["software development", "ProductManager"])
@ -42,22 +43,33 @@ async def write_prd(idea: str, project_path: Optional[str | Path] = None) -> Pat
    from metagpt.context import Context
    from metagpt.roles import ProductManager

+    log_tool_output(output=[ToolLogItem(name=ASSISTANT_ALIAS, value=write_prd.__name__)], tool_name=write_prd.__name__)
+
    ctx = Context()
    if project_path and Path(project_path).exists():
        ctx.config.project_path = Path(project_path)
        ctx.config.inc = True
+
    role = ProductManager(context=ctx)
    msg = await role.run(with_message=Message(content=idea, cause_by=UserRequirement))
    await role.run(with_message=msg)

    outputs = [
-        ToolLogItem(name="PRD File", value=str(ctx.repo.docs.prd.workdir / i))
+        ToolLogItem(name="Intermedia PRD File", value=str(ctx.repo.docs.prd.workdir / i))
        for i in ctx.repo.docs.prd.changed_files.keys()
    ]
-    for i in ctx.repo.resources.competitive_analysis.changed_files.keys():
-        outputs.append(
+    outputs.extend(
+        [
+            ToolLogItem(name="PRD File", value=str(ctx.repo.resources.prd.workdir / i))
+            for i in ctx.repo.resources.prd.changed_files.keys()
+        ]
+    )
+    outputs.extend(
+        [
            ToolLogItem(name="Competitive Analysis", value=str(ctx.repo.resources.competitive_analysis.workdir / i))
-        )
+            for i in ctx.repo.resources.competitive_analysis.changed_files.keys()
+        ]
+    )
    log_tool_output(output=outputs, tool_name=write_prd.__name__)

    return ctx.repo.docs.prd.workdir
@ -85,6 +97,10 @@ async def write_design(prd_path: str | Path) -> Path:
    from metagpt.context import Context
    from metagpt.roles import Architect

+    log_tool_output(
+        output=[ToolLogItem(name=ASSISTANT_ALIAS, value=write_design.__name__)], tool_name=write_design.__name__
+    )
+
    ctx = Context()
    prd_path = Path(prd_path)
    project_path = (Path(prd_path) if not prd_path.is_file() else prd_path.parent) / "../.."
@ -132,6 +148,11 @@ async def write_project_plan(system_design_path: str | Path) -> Path:
    from metagpt.context import Context
    from metagpt.roles import ProjectManager

+    log_tool_output(
+        output=[ToolLogItem(name=ASSISTANT_ALIAS, value=write_project_plan.__name__)],
+        tool_name=write_project_plan.__name__,
+    )
+
    ctx = Context()
    system_design_path = Path(system_design_path)
    project_path = (system_design_path if not system_design_path.is_file() else system_design_path.parent) / "../.."
@ -141,9 +162,15 @@ async def write_project_plan(system_design_path: str | Path) -> Path:
    await role.run(with_message=Message(content="", cause_by=WriteDesign))

    outputs = [
-        ToolLogItem(name="Project Plan", value=str(ctx.repo.docs.task.workdir / i))
+        ToolLogItem(name="Intermedia Project Plan", value=str(ctx.repo.docs.task.workdir / i))
        for i in ctx.repo.docs.task.changed_files.keys()
    ]
+    outputs.extend(
+        [
+            ToolLogItem(name="Project Plan", value=str(ctx.repo.resources.api_spec_and_task.workdir / i))
+            for i in ctx.repo.resources.api_spec_and_task.changed_files.keys()
+        ]
+    )
    log_tool_output(output=outputs, tool_name=write_project_plan.__name__)

    return ctx.repo.docs.task.workdir
@ -179,6 +206,10 @@ async def write_codes(task_path: str | Path, inc: bool = False) -> Path:
    from metagpt.context import Context
    from metagpt.roles import Engineer

+    log_tool_output(
+        output=[ToolLogItem(name=ASSISTANT_ALIAS, value=write_codes.__name__)], tool_name=write_codes.__name__
+    )
+
    ctx = Context()
    ctx.config.inc = inc
    task_path = Path(task_path)
@ -222,6 +253,10 @@ async def run_qa_test(src_path: str | Path) -> Path:
    from metagpt.environment import Environment
    from metagpt.roles import QaEngineer

+    log_tool_output(
+        output=[ToolLogItem(name=ASSISTANT_ALIAS, value=run_qa_test.__name__)], tool_name=run_qa_test.__name__
+    )
+
    ctx = Context()
    src_path = Path(src_path)
    project_path = (src_path if not src_path.is_file() else src_path.parent) / ".."
@ -270,6 +305,8 @@ async def fix_bug(project_path: str | Path, issue: str) -> Path:
    from metagpt.context import Context
    from metagpt.roles import Engineer

+    log_tool_output(output=[ToolLogItem(name=ASSISTANT_ALIAS, value=fix_bug.__name__)], tool_name=fix_bug.__name__)
+
    ctx = Context()
    ctx.set_repo_dir(project_path)
    ctx.src_workspace = ctx.git_repo.workdir / ctx.git_repo.workdir.name
@ -325,11 +362,21 @@ async def git_archive(project_path: str | Path) -> str:
    """
    from metagpt.context import Context

+    log_tool_output(
+        output=[ToolLogItem(name=ASSISTANT_ALIAS, value=git_archive.__name__)], tool_name=git_archive.__name__
+    )
+
    ctx = Context()
-    ctx.set_repo_dir(project_path)
+    project_dir = ProjectRepo.search_project_path(project_path)
+    if not project_dir:  # abort before any git operation when no project directory was found
+        raise ValueError(f"{project_path} is not a valid git repository.")
+    ctx.set_repo_dir(project_dir)
+    files = " ".join(ctx.git_repo.changed_files.keys())
+    outputs = [ToolLogItem(name="cmd", value=f"git add {files}")]
+    log_tool_output(output=outputs, tool_name=git_archive.__name__)
    ctx.git_repo.archive()

-    outputs = [ToolLogItem(name="Git Commit", value=str(ctx.repo.workdir))]
+    outputs = [ToolLogItem(name="cmd", value="git commit -m 'Archive'")]
    log_tool_output(output=outputs, tool_name=git_archive.__name__)

    return ctx.git_repo.log()
@ -358,6 +405,10 @@ async def import_git_repo(url: str) -> Path:
    from metagpt.actions.import_repo import ImportRepo
    from metagpt.context import Context

+    log_tool_output(
+        output=[ToolLogItem(name=ASSISTANT_ALIAS, value=import_git_repo.__name__)], tool_name=import_git_repo.__name__
+    )
+
    ctx = Context()
    action = ImportRepo(repo_path=url, context=ctx)
    await action.run()
--- a/metagpt/tools/tool_convert.py
+++ b/metagpt/tools/tool_convert.py
@ -20,8 +20,7 @@ def convert_code_to_tool_schema(obj, include: list[str] = None) -> dict:
                continue
            # method_doc = inspect.getdoc(method)
            method_doc = get_class_method_docstring(obj, name)
-            if method_doc:
-                schema["methods"][name] = function_docstring_to_schema(method, method_doc)
+            schema["methods"][name] = function_docstring_to_schema(method, method_doc)

    elif inspect.isfunction(obj):
        schema = function_docstring_to_schema(obj, docstring)
@ -39,7 +38,7 @@ def convert_code_to_tool_schema_ast(code: str) -> list[dict]:
    return visitor.get_tool_schemas()


-def function_docstring_to_schema(fn_obj, docstring) -> dict:
+def function_docstring_to_schema(fn_obj, docstring="") -> dict:
    """
    Converts a function's docstring into a schema dictionary.

--- a/metagpt/utils/common.py
+++ b/metagpt/utils/common.py
@ -783,13 +783,15 @@ def load_mc_skills_code(skill_names: list[str] = None, skills_dir: Path = None)
    return skills


-def encode_image(image_path_or_pil: Union[Path, Image], encoding: str = "utf-8") -> str:
+def encode_image(image_path_or_pil: Union[Path, Image, str], encoding: str = "utf-8") -> str:
    """encode image from file or PIL.Image into base64"""
    if isinstance(image_path_or_pil, Image.Image):
        buffer = BytesIO()
        image_path_or_pil.save(buffer, format="JPEG")
        bytes_data = buffer.getvalue()
    else:
+        if isinstance(image_path_or_pil, str):
+            image_path_or_pil = Path(image_path_or_pil)
        if not image_path_or_pil.exists():
            raise FileNotFoundError(f"{image_path_or_pil} not exists")
        with open(str(image_path_or_pil), "rb") as image_file:
--- a/metagpt/utils/mermaid.py
+++ b/metagpt/utils/mermaid.py
@ -81,6 +81,8 @@ async def mermaid_to_file(engine, mermaid_code, output_file_without_suffix, widt
            from metagpt.utils.mmdc_ink import mermaid_to_file

            return await mermaid_to_file(mermaid_code, output_file_without_suffix)
+        elif engine == "none":
+            return 0
        else:
            logger.warning(f"Unsupported mermaid engine: {engine}")
    return 0
--- a/metagpt/utils/project_repo.py
+++ b/metagpt/utils/project_repo.py
@ -10,6 +10,7 @@
 from __future__ import annotations

 from pathlib import Path
+from typing import Optional

 from metagpt.const import (
    CLASS_VIEW_FILE_REPO,
@ -148,3 +149,14 @@ class ProjectRepo(FileRepository):
    @property
    def src_relative_path(self) -> Path | None:
        return self._srcs_path
+
+    @staticmethod
+    def search_project_path(filename: str | Path) -> Optional[Path]:
+        """Return the nearest directory at or above `filename` that contains `.git`, else None."""
+        start = Path(filename)
+        start = (start.parent if start.is_file() else start).resolve()
+        # Use Path.parents, not a compare against "/", so the walk also terminates on Windows drives.
+        for root in (start, *start.parents):
+            if (root / ".git").exists():
+                return root
+        return None
--- a/tests/metagpt/tools/libs/test_browser.py
+++ b/tests/metagpt/tools/libs/test_browser.py
@ -0,0 +1,90 @@
+import pytest
+
+from metagpt.const import TEST_DATA_PATH
+from metagpt.tools.libs.browser import Browser, get_scroll_position
+
+TEST_URL = "https://docs.deepwisdom.ai/main/en/guide/get_started/quickstart.html"
+
+TEST_SCREENSHOT_PATH = TEST_DATA_PATH / "screenshot.png"
+
+
+@pytest.fixture(autouse=True)
+def llm_mock(rsp_cache, mocker, request):
+    # An empty fixture to overwrite the global llm_mock fixture
+    # because in provider folder, we want to test the aask functions for the specific models
+    pass
+
+
+@pytest.fixture
+def browser():
+    browser_instance = Browser()
+    yield browser_instance
+
+
+@pytest.mark.asyncio
+async def test_open_and_switch_page(browser):
+    await browser.start()
+
+    await browser.open_new_page("https://baidu.com")
+    await browser.open_new_page("https://tencent.com")
+    assert browser.current_page_url == "https://tencent.com"
+    await browser.switch_page("https://baidu.com")
+    assert browser.current_page_url == "https://baidu.com"
+
+    await browser.close()
+
+
+@pytest.mark.asyncio
+async def test_search(browser):
+    await browser.start()
+
+    # search all
+    await browser.open_new_page(TEST_URL)
+    search_term = "startup example"
+    search_results = await browser.search_content_all(search_term)
+    print(search_results)
+    # expected search result as of 20240410:
+    # [{'index': 0, 'content': {'text_block': 'Below is a breakdown of the software startup example. If you install MetaGPT with the git clone approach, simply run', 'links': [{'text': 'software startup example', 'href': 'https://github.com/geekan/MetaGPT/blob/main/metagpt/software_company.py'}]}, 'position': {'from_top': 640, 'from_left': 225}, 'element_obj': <Locator frame=<Frame name= url='https://docs.deepwisdom.ai/main/en/guide/get_started/quickstart.html'> selector='text=startup example >> nth=0'>}]
+    first_result = search_results[0]["content"]
+    assert "software startup example" in first_result["text_block"]
+    assert first_result["links"]
+    assert first_result["links"][0]["href"] == "https://github.com/geekan/MetaGPT/blob/main/metagpt/software_company.py"
+    assert search_results[0]["position"]
+
+    # scroll to search result
+    await browser.scroll_to_search_result(search_results, index=0)
+
+    # perceive current view
+    rsp = await browser.extract_info_from_view("what is the command to run exactly?")
+    assert "metagpt" in rsp
+
+    await browser.close()
+
+
+@pytest.mark.asyncio
+async def test_find_links(browser):
+    await browser.start()
+
+    await browser.open_new_page(TEST_URL)
+    link_info = await browser.find_links()
+    assert link_info
+
+    await browser.close()
+
+
+@pytest.mark.asyncio
+async def test_scroll(browser):
+    await browser.start()
+
+    await browser.open_new_page(TEST_URL)
+
+    await browser.scroll_current_page(offset=-500)
+    assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 0}  # no change if you scroll up from top
+
+    await browser.scroll_current_page(offset=500)  # scroll down
+    assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 500}
+
+    await browser.scroll_current_page(offset=-200)  # scroll up
+    assert await get_scroll_position(browser.current_page) == {"x": 0, "y": 300}
+
+    await browser.close()
--- a/tests/metagpt/tools/test_tool_convert.py
+++ b/tests/metagpt/tools/test_tool_convert.py
@ -48,6 +48,14 @@ class DummyClass:
        pass


+class DummySubClass(DummyClass):
+    """sub class docstring"""
+
+    def sub_method(self, df: pd.DataFrame):
+        """sub method"""
+        pass
+
+
 def dummy_fn(
    df: pd.DataFrame,
    s: str,
@ -117,6 +125,18 @@ def test_convert_code_to_tool_schema_class():
    assert schema == expected


+def test_convert_code_to_tool_schema_subclass():
+    schema = convert_code_to_tool_schema(DummySubClass)
+    assert "sub_method" in schema["methods"]  # sub class method should be included
+    assert "fit" in schema["methods"]  # parent class method should be included
+
+
+def test_convert_code_to_tool_schema_include():
+    schema = convert_code_to_tool_schema(DummyClass, include=["fit"])
+    assert "fit" in schema["methods"]
+    assert "transform" not in schema["methods"]
+
+
 def test_convert_code_to_tool_schema_function():
    expected = {
        "type": "function",