Merge pull request #69 from shenchucheng/main

Add web page scraping feature implemented by Playwright/Selenium
This commit is contained in:
geekan 2023-07-25 11:28:34 +08:00 committed by GitHub
commit 007c8c0457
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 479 additions and 30 deletions

View file

@ -12,6 +12,8 @@ import pytest
from metagpt.logs import logger
from metagpt.provider.openai_api import OpenAIGPTAPI as GPTAPI
import asyncio
import re
class Context:
@ -38,3 +40,31 @@ def llm_api():
def mock_llm():
# Create a mock LLM for testing
return Mock()
@pytest.fixture(scope="session")
def proxy():
pattern = re.compile(
rb"(?P<method>[a-zA-Z]+) (?P<uri>(\w+://)?(?P<host>[^\s\'\"<>\[\]{}|/:]+)(:(?P<port>\d+))?[^\s\'\"<>\[\]{}|]*) "
)
async def pipe(reader, writer):
while not reader.at_eof():
writer.write(await reader.read(2048))
writer.close()
async def handle_client(reader, writer):
data = await reader.readuntil(b"\r\n\r\n")
print(f"Proxy: {data}") # checking with capfd fixture
infos = pattern.match(data)
host, port = infos.group("host"), infos.group("port")
port = int(port) if port else 80
remote_reader, remote_writer = await asyncio.open_connection(host, port)
if data.startswith(b"CONNECT"):
writer.write(b"HTTP/1.1 200 Connection Established\r\n\r\n")
else:
remote_writer.write(data)
await asyncio.gather(pipe(reader, remote_writer), pipe(remote_reader, writer))
server = asyncio.get_event_loop().run_until_complete(asyncio.start_server(handle_client, "127.0.0.1", 0))
return "http://{}:{}".format(*server.sockets[0].getsockname())

View file

@ -0,0 +1,25 @@
import pytest
from metagpt.config import Config
from metagpt.tools import web_browser_engine, WebBrowserEngineType
@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, url, urls",
[
(WebBrowserEngineType.PLAYWRIGHT, "https://fuzhi.ai", ("https://fuzhi.ai",)),
(WebBrowserEngineType.SELENIUM, "https://fuzhi.ai", ("https://fuzhi.ai",)),
],
ids=["playwright", "selenium"],
)
async def test_scrape_web_page(browser_type, url, urls):
browser = web_browser_engine.WebBrowserEngine(browser_type)
result = await browser.run(url)
assert isinstance(result, str)
assert "深度赋智" in result
if urls:
results = await browser.run(url, *urls)
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("深度赋智" in i) for i in results)

View file

@ -0,0 +1,34 @@
import pytest
from metagpt.config import CONFIG
from metagpt.tools import web_browser_engine_playwright
@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, use_proxy, kwagrs, url, urls",
[
("chromium", {"proxy": True}, {}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
("firefox", {}, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
("webkit", {}, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)),
],
ids=["chromium-normal", "firefox-normal", "webkit-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy, capfd):
try:
global_proxy = CONFIG.global_proxy
if use_proxy:
CONFIG.global_proxy = proxy
browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type, **kwagrs)
result = await browser.run(url)
assert isinstance(result, str)
assert "Deepwisdom" in result
if urls:
results = await browser.run(url, *urls)
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("Deepwisdom" in i) for i in results)
if use_proxy:
assert "Proxy:" in capfd.readouterr().out
finally:
CONFIG.global_proxy = global_proxy

View file

@ -0,0 +1,34 @@
import pytest
from metagpt.config import CONFIG
from metagpt.tools import web_browser_engine_selenium
@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, use_proxy, url, urls",
[
("chrome", True, "https://fuzhi.ai", ("https://fuzhi.ai",)),
("firefox", False, "https://fuzhi.ai", ("https://fuzhi.ai",)),
("edge", False, "https://fuzhi.ai", ("https://fuzhi.ai",)),
],
ids=["chrome-normal", "firefox-normal", "edge-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd):
try:
global_proxy = CONFIG.global_proxy
if use_proxy:
CONFIG.global_proxy = proxy
browser = web_browser_engine_selenium.SeleniumWrapper(browser_type)
result = await browser.run(url)
assert isinstance(result, str)
assert "Deepwisdom" in result
if urls:
results = await browser.run(url, *urls)
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("Deepwisdom" in i) for i in results)
if use_proxy:
assert "Proxy:" in capfd.readouterr().out
finally:
CONFIG.global_proxy = global_proxy