Mirror of https://github.com/FoundationAgents/MetaGPT.git, synced 2026-04-26 01:06:27 +02:00.
Merge pull request #69 from shenchucheng/main: add a web-page scraping feature implemented with Playwright/Selenium.
Commit 007c8c0457 — 13 changed files with 479 additions and 30 deletions.
|
|
@ -12,6 +12,8 @@ import pytest
|
|||
|
||||
from metagpt.logs import logger
|
||||
from metagpt.provider.openai_api import OpenAIGPTAPI as GPTAPI
|
||||
import asyncio
|
||||
import re
|
||||
|
||||
|
||||
class Context:
|
||||
|
|
@ -38,3 +40,31 @@ def llm_api():
|
|||
def mock_llm():
    """Provide a stub LLM object for tests that don't need real completions."""
    stub = Mock()
    return stub
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def proxy():
    """Start a minimal local HTTP/HTTPS forward proxy and return its URL.

    The proxy listens on 127.0.0.1 on an OS-assigned port and prints each
    request head to stdout, so tests can verify (via the ``capfd`` fixture)
    that traffic really went through the proxy.

    Returns:
        str: proxy URL of the form ``http://127.0.0.1:<port>``.
    """
    # Parses the HTTP request line, e.g. b"GET http://host:port/path " or
    # b"CONNECT host:443 " — captures method, full URI, host and optional port.
    pattern = re.compile(
        rb"(?P<method>[a-zA-Z]+) (?P<uri>(\w+://)?(?P<host>[^\s\'\"<>\[\]{}|/:]+)(:(?P<port>\d+))?[^\s\'\"<>\[\]{}|]*) "
    )

    async def pipe(reader, writer):
        # Copy bytes one direction until EOF, then close the write side.
        while not reader.at_eof():
            writer.write(await reader.read(2048))
        writer.close()

    async def handle_client(reader, writer):
        # Read only the request head (up to the blank line).
        data = await reader.readuntil(b"\r\n\r\n")
        print(f"Proxy: {data}")  # checking with capfd fixture
        # NOTE(review): assumes the head always matches `pattern`; a
        # non-matching request would raise AttributeError on .group().
        infos = pattern.match(data)
        host, port = infos.group("host"), infos.group("port")
        port = int(port) if port else 80  # default HTTP port when none given
        remote_reader, remote_writer = await asyncio.open_connection(host, port)
        if data.startswith(b"CONNECT"):
            # HTTPS tunnel: acknowledge, then relay raw bytes both ways.
            writer.write(b"HTTP/1.1 200 Connection Established\r\n\r\n")
        else:
            # Plain HTTP: forward the request head we already consumed.
            remote_writer.write(data)
        # Relay both directions concurrently until either side hits EOF.
        await asyncio.gather(pipe(reader, remote_writer), pipe(remote_reader, writer))

    # Port 0 lets the OS pick a free port; the actual address is read back
    # from the server socket below.
    server = asyncio.get_event_loop().run_until_complete(asyncio.start_server(handle_client, "127.0.0.1", 0))
    return "http://{}:{}".format(*server.sockets[0].getsockname())
|
||||
|
|
|
|||
tests/metagpt/tools/test_web_browser_engine.py — new file, 25 lines
|
|
@ -0,0 +1,25 @@
|
|||
import pytest
|
||||
from metagpt.config import Config
|
||||
from metagpt.tools import web_browser_engine, WebBrowserEngineType
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "browser_type, url, urls",
    [
        (WebBrowserEngineType.PLAYWRIGHT, "https://fuzhi.ai", ("https://fuzhi.ai",)),
        (WebBrowserEngineType.SELENIUM, "https://fuzhi.ai", ("https://fuzhi.ai",)),
    ],
    ids=["playwright", "selenium"],
)
async def test_scrape_web_page(browser_type, url, urls):
    """Scrape one page, then a batch, through the engine facade and check content."""
    engine = web_browser_engine.WebBrowserEngine(browser_type)

    # Single-URL run returns the page text as a string.
    page = await engine.run(url)
    assert isinstance(page, str)
    assert "深度赋智" in page

    # Multi-URL run returns one result per URL (the first arg plus *urls).
    if urls:
        pages = await engine.run(url, *urls)
        assert isinstance(pages, list)
        assert len(pages) == len(urls) + 1
        for text in pages:
            assert "深度赋智" in text
|
||||
tests/metagpt/tools/test_web_browser_engine_playwright.py — new file, 34 lines
|
|
@ -0,0 +1,34 @@
|
|||
import pytest
|
||||
from metagpt.config import CONFIG
|
||||
from metagpt.tools import web_browser_engine_playwright
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "browser_type, use_proxy, kwargs, url, urls",
    [
        ("chromium", {"proxy": True}, {}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
        ("firefox", {}, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
        ("webkit", {}, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
    ],
    ids=["chromium-normal", "firefox-normal", "webkit-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, kwargs, url, urls, proxy, capfd):
    """Scrape via the Playwright wrapper, optionally routing through the local proxy fixture.

    Fixes vs. original: renamed the misspelled ``kwagrs`` parameter to
    ``kwargs`` (consistently in the parametrize argnames), and snapshot
    ``CONFIG.global_proxy`` BEFORE the ``try`` so the ``finally`` restore
    can never raise NameError if the read itself fails.
    """
    # Snapshot outside the try: finally must always have a value to restore.
    global_proxy = CONFIG.global_proxy
    try:
        if use_proxy:
            CONFIG.global_proxy = proxy
        browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type, **kwargs)
        result = await browser.run(url)
        assert isinstance(result, str)
        assert "Deepwisdom" in result

        if urls:
            results = await browser.run(url, *urls)
            assert isinstance(results, list)
            assert len(results) == len(urls) + 1
            assert all(("Deepwisdom" in i) for i in results)
        if use_proxy:
            # The proxy fixture prints each request head; seeing it on stdout
            # proves the traffic actually went through the proxy.
            assert "Proxy:" in capfd.readouterr().out
    finally:
        # Restore the global config so other tests are unaffected.
        CONFIG.global_proxy = global_proxy
|
||||
tests/metagpt/tools/test_web_browser_engine_selenium.py — new file, 34 lines
|
|
@ -0,0 +1,34 @@
|
|||
import pytest
|
||||
from metagpt.config import CONFIG
|
||||
from metagpt.tools import web_browser_engine_selenium
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "browser_type, use_proxy, url, urls",
    [
        ("chrome", True, "https://fuzhi.ai", ("https://fuzhi.ai",)),
        ("firefox", False, "https://fuzhi.ai", ("https://fuzhi.ai",)),
        ("edge", False, "https://fuzhi.ai", ("https://fuzhi.ai",)),
    ],
    ids=["chrome-normal", "firefox-normal", "edge-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd):
    """Scrape via the Selenium wrapper, optionally routing through the local proxy fixture.

    Fix vs. original: snapshot ``CONFIG.global_proxy`` BEFORE the ``try`` so
    the ``finally`` restore can never raise NameError if the read itself fails.
    """
    # Snapshot outside the try: finally must always have a value to restore.
    global_proxy = CONFIG.global_proxy
    try:
        if use_proxy:
            CONFIG.global_proxy = proxy
        browser = web_browser_engine_selenium.SeleniumWrapper(browser_type)
        result = await browser.run(url)
        assert isinstance(result, str)
        assert "Deepwisdom" in result

        if urls:
            results = await browser.run(url, *urls)
            assert isinstance(results, list)
            assert len(results) == len(urls) + 1
            assert all(("Deepwisdom" in i) for i in results)
        if use_proxy:
            # The proxy fixture prints each request head; seeing it on stdout
            # proves the traffic actually went through the proxy.
            assert "Proxy:" in capfd.readouterr().out
    finally:
        # Restore the global config so other tests are unaffected.
        CONFIG.global_proxy = global_proxy
|
||||
Loading…
Add table
Add a link
Reference in a new issue