Replace *.deepwisdom.ai with a local HTTP server in web-scraping tests

This commit is contained in:
shenchucheng 2024-02-21 15:40:23 +08:00
parent ac755e7668
commit d53cfd39f7
6 changed files with 50 additions and 35 deletions

View file

@@ -4,8 +4,8 @@ from metagpt.tools.libs.web_scraping import scrape_web_playwright
@pytest.mark.asyncio
async def test_scrape_web_playwright():
test_url = "https://www.deepwisdom.ai"
async def test_scrape_web_playwright(http_server):
server, test_url = await http_server()
result = await scrape_web_playwright(test_url)
@@ -21,3 +21,4 @@ async def test_scrape_web_playwright():
assert not result["inner_text"].endswith(" ")
assert not result["html"].startswith(" ")
assert not result["html"].endswith(" ")
await server.stop()

View file

@@ -9,14 +9,16 @@ from metagpt.utils.parse_html import WebPage
@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, url, urls",
"browser_type",
[
(WebBrowserEngineType.PLAYWRIGHT, "https://deepwisdom.ai", ("https://deepwisdom.ai",)),
(WebBrowserEngineType.SELENIUM, "https://deepwisdom.ai", ("https://deepwisdom.ai",)),
WebBrowserEngineType.PLAYWRIGHT,
WebBrowserEngineType.SELENIUM,
],
ids=["playwright", "selenium"],
)
async def test_scrape_web_page(browser_type, url, urls):
async def test_scrape_web_page(browser_type, http_server):
server, url = await http_server()
urls = [url, url, url]
browser = web_browser_engine.WebBrowserEngine(engine=browser_type)
result = await browser.run(url)
assert isinstance(result, WebPage)
@@ -27,6 +29,7 @@ async def test_scrape_web_page(browser_type, url, urls):
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
await server.stop()
if __name__ == "__main__":

View file

@@ -9,18 +9,28 @@ from metagpt.utils.parse_html import WebPage
@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, use_proxy, kwagrs, url, urls",
"browser_type, use_proxy, kwagrs,",
[
("chromium", {"proxy": True}, {}, "https://www.deepwisdom.ai", ("https://www.deepwisdom.ai",)),
("firefox", {}, {"ignore_https_errors": True}, "https://www.deepwisdom.ai", ("https://www.deepwisdom.ai",)),
("webkit", {}, {"ignore_https_errors": True}, "https://www.deepwisdom.ai", ("https://www.deepwisdom.ai",)),
("chromium", {"proxy": True}, {}),
(
"firefox",
{},
{"ignore_https_errors": True},
),
(
"webkit",
{},
{"ignore_https_errors": True},
),
],
ids=["chromium-normal", "firefox-normal", "webkit-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy, capfd):
async def test_scrape_web_page(browser_type, use_proxy, kwagrs, proxy, capfd, http_server):
server, url = await http_server()
urls = [url, url, url]
proxy_url = None
if use_proxy:
server, proxy_url = await proxy()
proxy_server, proxy_url = await proxy()
browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type=browser_type, proxy=proxy_url, **kwagrs)
result = await browser.run(url)
assert isinstance(result, WebPage)
@@ -32,8 +42,10 @@ async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
if use_proxy:
server.close()
proxy_server.close()
await proxy_server.wait_closed()
assert "Proxy:" in capfd.readouterr().out
await server.stop()
if __name__ == "__main__":

View file

@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import browsers
import pytest
@@ -10,51 +11,48 @@ from metagpt.utils.parse_html import WebPage
@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, use_proxy, url, urls",
"browser_type, use_proxy,",
[
pytest.param(
"chrome",
True,
"https://deepwisdom.ai",
("https://deepwisdom.ai",),
False,
marks=pytest.mark.skipif(not browsers.get("chrome"), reason="chrome browser not found"),
),
pytest.param(
"firefox",
False,
"https://deepwisdom.ai",
("https://deepwisdom.ai",),
marks=pytest.mark.skipif(not browsers.get("firefox"), reason="firefox browser not found"),
),
pytest.param(
"edge",
False,
"https://deepwisdom.ai",
("https://deepwisdom.ai",),
marks=pytest.mark.skipif(not browsers.get("msedge"), reason="edge browser not found"),
),
],
ids=["chrome-normal", "firefox-normal", "edge-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd):
async def test_scrape_web_page(browser_type, use_proxy, proxy, capfd, http_server):
# Prerequisites
# firefox, chrome, Microsoft Edge
server, url = await http_server()
urls = [url, url, url]
proxy_url = None
if use_proxy:
server, proxy_url = await proxy()
proxy_server, proxy_url = await proxy()
browser = web_browser_engine_selenium.SeleniumWrapper(browser_type=browser_type, proxy=proxy_url)
result = await browser.run(url)
assert isinstance(result, WebPage)
assert "MetaGPT" in result.inner_text
if urls:
results = await browser.run(url, *urls)
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
results = await browser.run(url, *urls)
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
if use_proxy:
server.close()
assert "Proxy:" in capfd.readouterr().out
proxy_server.close()
await proxy_server.wait_closed()
assert "Proxy: localhost" in capfd.readouterr().out
await server.stop()
if __name__ == "__main__":