Merge pull request #69 from shenchucheng/main

Add web page scraping feature implemented by Playwright/Selenium
2026-04-26 01:06:27 +02:00 · 2023-07-25 11:28:34 +08:00 · 2023-07-25 11:28:34 +08:00 · 007c8c0457
commit 007c8c0457
parent a538f9a3ca e44410b3ad
13 changed files with 479 additions and 30 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -12,6 +12,8 @@ import pytest

 from metagpt.logs import logger
 from metagpt.provider.openai_api import OpenAIGPTAPI as GPTAPI
+import asyncio
+import re


 class Context:
@ -38,3 +40,85 @@ def llm_api():
 def mock_llm():
     # Create a mock LLM for testing
     return Mock()
+
+
+@pytest.fixture(scope="session")
+def proxy():
+    """Start a minimal local HTTP/HTTPS forward proxy and yield its URL.
+
+    The proxy listens on an ephemeral port of 127.0.0.1 and is served by its own
+    event loop in a daemon thread, so it keeps answering regardless of which
+    event loop the async tests run on. Each request head it receives is printed
+    with a ``Proxy:`` prefix so tests can detect proxy usage through ``capfd``.
+
+    Yields:
+        str: Proxy URL in the form ``http://127.0.0.1:<port>``.
+    """
+    import threading
+
+    # Request line "<METHOD> <URI> ": the URI is an absolute URL for plain HTTP or
+    # "host:port" for CONNECT; the named groups expose the target host and port.
+    pattern = re.compile(
+        rb"(?P<method>[a-zA-Z]+) (?P<uri>(\w+://)?(?P<host>[^\s\'\"<>\[\]{}|/:]+)(:(?P<port>\d+))?[^\s\'\"<>\[\]{}|]*) "
+    )
+
+    async def pipe(reader, writer):
+        """Copy bytes from ``reader`` to ``writer`` until EOF, then close ``writer``."""
+        try:
+            while not reader.at_eof():
+                writer.write(await reader.read(2048))
+                await writer.drain()  # back-pressure: do not buffer without bound
+        except ConnectionError:
+            pass  # peer hung up mid-transfer; nothing left to forward
+        finally:
+            writer.close()
+
+    async def handle_client(reader, writer):
+        """Read one request head, connect to its target and relay both directions."""
+        try:
+            data = await reader.readuntil(b"\r\n\r\n")
+        except (asyncio.IncompleteReadError, asyncio.LimitOverrunError, ConnectionError):
+            writer.close()  # client left or sent a partial/over-long head
+            return
+        print(f"Proxy: {data}")  # checking with capfd fixture
+        infos = pattern.match(data)
+        if infos is None:
+            # Unparseable request line: refuse it instead of crashing the handler.
+            writer.write(b"HTTP/1.1 400 Bad Request\r\n\r\n")
+            writer.close()
+            return
+        host, port = infos.group("host"), infos.group("port")
+        port = int(port) if port else 80
+        try:
+            remote_reader, remote_writer = await asyncio.open_connection(host, port)
+        except OSError:
+            writer.write(b"HTTP/1.1 502 Bad Gateway\r\n\r\n")
+            writer.close()
+            return
+        if data.startswith(b"CONNECT"):
+            # Tunnel request: acknowledge it, then relay the raw bytes.
+            writer.write(b"HTTP/1.1 200 Connection Established\r\n\r\n")
+        else:
+            # Plain HTTP: replay the already-consumed request head upstream.
+            remote_writer.write(data)
+        await asyncio.gather(pipe(reader, remote_writer), pipe(remote_reader, writer))
+
+    # A private loop in a background thread keeps the proxy independent of the
+    # (possibly per-test) event loop that drives the async tests.
+    loop = asyncio.new_event_loop()
+    server = loop.run_until_complete(asyncio.start_server(handle_client, "127.0.0.1", 0))
+    worker = threading.Thread(target=loop.run_forever, name="test-proxy", daemon=True)
+    worker.start()
+    try:
+        yield "http://{}:{}".format(*server.sockets[0].getsockname())
+    finally:
+        loop.call_soon_threadsafe(loop.stop)
+        worker.join()
+        server.close()
+        # Cancel relays that are still open so the loop can be closed cleanly.
+        pending = asyncio.all_tasks(loop)
+        for task in pending:
+            task.cancel()
+        if pending:
+            loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+        loop.close()
--- a/tests/metagpt/tools/test_web_browser_engine.py
+++ b/tests/metagpt/tools/test_web_browser_engine.py
@ -0,0 +1,35 @@
+import pytest
+from metagpt.config import Config
+from metagpt.tools import web_browser_engine, WebBrowserEngineType
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "browser_type, url, urls",
+    [
+        (WebBrowserEngineType.PLAYWRIGHT, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+        (WebBrowserEngineType.SELENIUM, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+    ],
+    ids=["playwright", "selenium"],
+)
+async def test_scrape_web_page(browser_type, url, urls):
+    """Scrape a page through ``WebBrowserEngine`` with one URL, then with several.
+
+    A single URL must come back as one ``str``; extra URLs must come back as a
+    ``list`` holding one page per requested URL. Every page is expected to
+    contain the marker text.
+    """
+    engine = web_browser_engine.WebBrowserEngine(browser_type)
+
+    page = await engine.run(url)
+    assert isinstance(page, str)
+    assert "深度赋智" in page
+
+    if not urls:
+        return
+
+    pages = await engine.run(url, *urls)
+    assert isinstance(pages, list)
+    assert len(pages) == 1 + len(urls)
+    for text in pages:
+        assert "深度赋智" in text
--- a/tests/metagpt/tools/test_web_browser_engine_playwright.py
+++ b/tests/metagpt/tools/test_web_browser_engine_playwright.py
@ -0,0 +1,41 @@
+import pytest
+from metagpt.config import CONFIG
+from metagpt.tools import web_browser_engine_playwright
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "browser_type, use_proxy, kwargs, url, urls",
+    [
+        ("chromium", True, {}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+        ("firefox", False, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+        ("webkit", False, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+    ],
+    ids=["chromium-normal", "firefox-normal", "webkit-normal"],
+)
+async def test_scrape_web_page(browser_type, use_proxy, kwargs, url, urls, proxy, capfd):
+    """Scrape a page with ``PlaywrightWrapper``, optionally through the test proxy.
+
+    With ``use_proxy`` set, ``CONFIG.global_proxy`` points at the ``proxy`` fixture
+    while the test runs, and the proxy's ``Proxy:`` log line must show up in the
+    captured output. The previous proxy setting is restored afterwards.
+    """
+    # Saved before entering the try block so the finally clause can always restore it.
+    global_proxy = CONFIG.global_proxy
+    try:
+        if use_proxy:
+            CONFIG.global_proxy = proxy
+        browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type, **kwargs)
+        result = await browser.run(url)
+        assert isinstance(result, str)
+        assert "Deepwisdom" in result
+
+        if urls:
+            results = await browser.run(url, *urls)
+            assert isinstance(results, list)
+            assert len(results) == len(urls) + 1
+            assert all(("Deepwisdom" in i) for i in results)
+        if use_proxy:
+            assert "Proxy:" in capfd.readouterr().out
+    finally:
+        CONFIG.global_proxy = global_proxy
--- a/tests/metagpt/tools/test_web_browser_engine_selenium.py
+++ b/tests/metagpt/tools/test_web_browser_engine_selenium.py
@ -0,0 +1,41 @@
+import pytest
+from metagpt.config import CONFIG
+from metagpt.tools import web_browser_engine_selenium
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "browser_type, use_proxy, url, urls",
+    [
+        ("chrome", True, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+        ("firefox", False, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+        ("edge", False, "https://fuzhi.ai", ("https://fuzhi.ai",)),
+    ],
+    ids=["chrome-normal", "firefox-normal", "edge-normal"],
+)
+async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd):
+    """Scrape a page with ``SeleniumWrapper``, optionally through the test proxy.
+
+    With ``use_proxy`` set, ``CONFIG.global_proxy`` points at the ``proxy`` fixture
+    while the test runs, and the proxy's ``Proxy:`` log line must show up in the
+    captured output. The previous proxy setting is restored afterwards.
+    """
+    # Saved before entering the try block so the finally clause can always restore it.
+    global_proxy = CONFIG.global_proxy
+    try:
+        if use_proxy:
+            CONFIG.global_proxy = proxy
+        browser = web_browser_engine_selenium.SeleniumWrapper(browser_type)
+        result = await browser.run(url)
+        assert isinstance(result, str)
+        assert "Deepwisdom" in result
+
+        if urls:
+            results = await browser.run(url, *urls)
+            assert isinstance(results, list)
+            assert len(results) == len(urls) + 1
+            assert all(("Deepwisdom" in i) for i in results)
+        if use_proxy:
+            assert "Proxy:" in capfd.readouterr().out
+    finally:
+        CONFIG.global_proxy = global_proxy