Merge branch 'add-web-scraping-test' into 'code_intepreter'

Add web scraping test. See merge request agents/data_agents_opt!60
2026-06-08 15:05:17 +02:00 · 2024-01-25 07:04:38 +00:00 · 2024-01-25 07:04:38 +00:00 · 1fb77121f4
commit 1fb77121f4
parent 5f624844cc 54a08747db
5 changed files with 51 additions and 3 deletions
--- a/examples/crawl_webpage.py
+++ b/examples/crawl_webpage.py
@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+"""
+@Date    :   2024/01/24 15:11:27
+@Author  :   orange-crow
+@File    :   crawl_webpage.py
+"""
+
+from metagpt.roles.code_interpreter import CodeInterpreter
+
+
+async def main():
+    prompt = """Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
+    and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key data*"""
+    ci = CodeInterpreter(goal=prompt, use_tools=True)
+
+    await ci.run(prompt)
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
--- a/metagpt/actions/execute_code.py
+++ b/metagpt/actions/execute_code.py
@ -123,7 +123,10 @@ class ExecutePyCode(ExecuteCode, Action):
            return parsed_output

        for i, output in enumerate(outputs):
-            if output["output_type"] == "stream":
+            if output["output_type"] == "stream" and not any(
+                tag in output["text"]
+                for tag in ["| INFO     | metagpt", "| ERROR    | metagpt", "| WARNING  | metagpt"]
+            ):
                parsed_output += output["text"]
            elif output["output_type"] == "display_data":
                if "image/png" in output["data"]:
--- a/metagpt/roles/code_interpreter.py
+++ b/metagpt/roles/code_interpreter.py
@ -79,7 +79,7 @@ class CodeInterpreter(Role):
                if ReviewConst.CHANGE_WORD[0] in review:
                    counter = 0  # redo the task again with help of human suggestions

-        return code["code"] if code["language"] != "markdown" else "", result, success
+        return code["code"] if code.get("language") != "markdown" else "", result, success

    async def _write_code(self):
        todo = WriteCodeByGenerate() if not self.use_tools else WriteCodeWithTools(selected_tools=self.tools)
--- a/metagpt/tools/libs/web_scraping.py
+++ b/metagpt/tools/libs/web_scraping.py
@ -19,4 +19,4 @@ async def scrape_web_playwright(url, *urls):
    web = await PlaywrightWrapper("chromium").run(url, *urls)

    # Return the inner text content of the web page
-    return {"inner_text": web.inner_text, "html": web.html}
+    return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}
--- a/tests/metagpt/tools/libs/test_web_scraping.py
+++ b/tests/metagpt/tools/libs/test_web_scraping.py
@ -0,0 +1,23 @@
+import pytest
+
+from metagpt.tools.libs.web_scraping import scrape_web_playwright
+
+
+@pytest.mark.asyncio
+async def test_scrape_web_playwright():
+    test_url = "https://www.deepwisdom.ai"
+
+    result = await scrape_web_playwright(test_url)
+
+    # Assert that the result is a dictionary
+    assert isinstance(result, dict)
+
+    # Assert that the result contains 'inner_text' and 'html' keys
+    assert "inner_text" in result
+    assert "html" in result
+
+    # Assert startswith and endswith
+    assert not result["inner_text"].startswith(" ")
+    assert not result["inner_text"].endswith(" ")
+    assert not result["html"].startswith(" ")
+    assert not result["html"].endswith(" ")