diff --git a/examples/crawl_webpage.py b/examples/crawl_webpage.py
new file mode 100644
index 000000000..35413d2ff
--- /dev/null
+++ b/examples/crawl_webpage.py
@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+"""
+@Date : 2024/01/24 15:11:27
+@Author : orange-crow
+@File : crawl_webpage.py
+"""
+
+from metagpt.roles.code_interpreter import CodeInterpreter
+
+
+async def main():
+    prompt = """Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
+    and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key data*"""
+    ci = CodeInterpreter(goal=prompt, use_tools=True)
+
+    await ci.run(prompt)
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
diff --git a/metagpt/actions/execute_code.py b/metagpt/actions/execute_code.py
index 6591f479f..6a4a9abb8 100644
--- a/metagpt/actions/execute_code.py
+++ b/metagpt/actions/execute_code.py
@@ -123,7 +123,10 @@ class ExecutePyCode(ExecuteCode, Action):
             return parsed_output
 
         for i, output in enumerate(outputs):
-            if output["output_type"] == "stream":
+            if output["output_type"] == "stream" and not any(
+                tag in output["text"]
+                for tag in ["| INFO | metagpt", "| ERROR | metagpt", "| WARNING | metagpt"]
+            ):
                 parsed_output += output["text"]
             elif output["output_type"] == "display_data":
                 if "image/png" in output["data"]:
diff --git a/metagpt/roles/code_interpreter.py b/metagpt/roles/code_interpreter.py
index 26be17fd4..026fec562 100644
--- a/metagpt/roles/code_interpreter.py
+++ b/metagpt/roles/code_interpreter.py
@@ -79,7 +79,7 @@ class CodeInterpreter(Role):
                 if ReviewConst.CHANGE_WORD[0] in review:
                     counter = 0  # redo the task again with help of human suggestions
 
-        return code["code"] if code["language"] != "markdown" else "", result, success
+        return code["code"] if code.get("language") != "markdown" else "", result, success
 
     async def _write_code(self):
         todo = WriteCodeByGenerate() if not self.use_tools else WriteCodeWithTools(selected_tools=self.tools)
diff --git a/metagpt/tools/libs/web_scraping.py b/metagpt/tools/libs/web_scraping.py
index e8e73f123..921fca809 100644
--- a/metagpt/tools/libs/web_scraping.py
+++ b/metagpt/tools/libs/web_scraping.py
@@ -19,4 +19,4 @@ async def scrape_web_playwright(url, *urls):
     web = await PlaywrightWrapper("chromium").run(url, *urls)
 
     # Return the inner text content of the web page
-    return {"inner_text": web.inner_text, "html": web.html}
+    return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}
diff --git a/tests/metagpt/tools/libs/test_web_scraping.py b/tests/metagpt/tools/libs/test_web_scraping.py
new file mode 100644
index 000000000..c11960e68
--- /dev/null
+++ b/tests/metagpt/tools/libs/test_web_scraping.py
@@ -0,0 +1,23 @@
+import pytest
+
+from metagpt.tools.libs.web_scraping import scrape_web_playwright
+
+
+@pytest.mark.asyncio
+async def test_scrape_web_playwright():
+    test_url = "https://www.deepwisdom.ai"
+
+    result = await scrape_web_playwright(test_url)
+
+    # Assert that the result is a dictionary
+    assert isinstance(result, dict)
+
+    # Assert that the result contains 'inner_text' and 'html' keys
+    assert "inner_text" in result
+    assert "html" in result
+
+    # Assert startswith and endswith
+    assert not result["inner_text"].startswith(" ")
+    assert not result["inner_text"].endswith(" ")
+    assert not result["html"].startswith(" ")
+    assert not result["html"].endswith(" ")