Merge branch 'add-web-scraping-test' into 'code_intepreter'

Add web scraping test

See merge request agents/data_agents_opt!60
This commit is contained in:
林义章 2024-01-25 07:04:38 +00:00
commit 1fb77121f4
5 changed files with 51 additions and 3 deletions

22
examples/crawl_webpage.py Normal file
View file

@ -0,0 +1,22 @@
# -*- encoding: utf-8 -*-
"""
@Date : 2024/01/24 15:11:27
@Author : orange-crow
@File : crawl_webpage.py
"""
from metagpt.roles.code_interpreter import CodeInterpreter
async def main():
    """Run a tool-enabled ``CodeInterpreter`` on a web-scraping requirement.

    The same requirement text is passed twice: as the ``goal`` keyword when the
    role is constructed, and as the argument to ``run``.
    """
    # The continuation line stays at column 0 so that no indentation
    # whitespace becomes part of the prompt text.
    requirement = """Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key data*"""
    interpreter = CodeInterpreter(goal=requirement, use_tools=True)
    await interpreter.run(requirement)
if __name__ == "__main__":
    import asyncio

    # asyncio.run drives the coroutine returned by main() to completion.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)

View file

@ -123,7 +123,10 @@ class ExecutePyCode(ExecuteCode, Action):
return parsed_output
for i, output in enumerate(outputs):
if output["output_type"] == "stream":
if output["output_type"] == "stream" and not any(
tag in output["text"]
for tag in ["| INFO | metagpt", "| ERROR | metagpt", "| WARNING | metagpt"]
):
parsed_output += output["text"]
elif output["output_type"] == "display_data":
if "image/png" in output["data"]:

View file

@ -79,7 +79,7 @@ class CodeInterpreter(Role):
if ReviewConst.CHANGE_WORD[0] in review:
counter = 0 # redo the task again with help of human suggestions
return code["code"] if code["language"] != "markdown" else "", result, success
return code["code"] if code.get("language") != "markdown" else "", result, success
async def _write_code(self):
todo = WriteCodeByGenerate() if not self.use_tools else WriteCodeWithTools(selected_tools=self.tools)

View file

@ -19,4 +19,4 @@ async def scrape_web_playwright(url, *urls):
web = await PlaywrightWrapper("chromium").run(url, *urls)
# Return the inner text content of the web page
return {"inner_text": web.inner_text, "html": web.html}
return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}

View file

@ -0,0 +1,23 @@
import pytest
from metagpt.tools.libs.web_scraping import scrape_web_playwright
@pytest.mark.asyncio
async def test_scrape_web_playwright():
    """Check the shape and whitespace trimming of ``scrape_web_playwright``.

    The scraper is awaited with a single URL. The result must be a dict that
    contains ``inner_text`` and ``html`` entries, and neither value may carry
    leading or trailing whitespace of any kind.

    NOTE(review): this requests a live URL, so it presumably needs network
    access and an installed Playwright browser -- confirm before running in CI.
    """
    test_url = "https://www.deepwisdom.ai"

    result = await scrape_web_playwright(test_url)

    # Assert that the result is a dictionary
    assert isinstance(result, dict)

    for key in ("inner_text", "html"):
        # Assert that the result contains the expected key
        assert key in result, f"missing key: {key}"

        # Compare against str.strip() rather than checking only for " ":
        # a value that begins or ends with a newline or tab would slip past
        # startswith(" ") / endswith(" ").
        value = result[key]
        assert value == value.strip(), f"{key} has leading/trailing whitespace"