Mirror of https://github.com/FoundationAgents/MetaGPT.git (synced 2026-05-05 05:42:37 +02:00)

Unified WebPage object return for the WebBrowserEngine API

Commit: ede23b2fe9
Parent: c62c870ab9
7 changed files with 228 additions and 69 deletions
metagpt/tools/web_browser_engine.py

@@ -1,22 +1,20 @@
 #!/usr/bin/env python

 from __future__ import annotations

 import asyncio
-import importlib
-
-from typing import Any, Callable, Coroutine, overload
+import importlib
+from typing import Any, Callable, Coroutine, Literal, overload

 from metagpt.config import CONFIG
 from metagpt.tools import WebBrowserEngineType
-from bs4 import BeautifulSoup
+from metagpt.utils.parse_html import WebPage


 class WebBrowserEngine:
     def __init__(
         self,
         engine: WebBrowserEngineType | None = None,
-        run_func: Callable[..., Coroutine[Any, Any, str | list[str]]] | None = None,
-        parse_func: Callable[[str], str] | None = None,
+        run_func: Callable[..., Coroutine[Any, Any, WebPage | list[WebPage]]] | None = None,
     ):
         engine = engine or CONFIG.web_browser_engine
@@ -30,30 +28,25 @@ class WebBrowserEngine:
             run_func = run_func
         else:
             raise NotImplementedError
-        self.parse_func = parse_func or get_page_content
         self.run_func = run_func
         self.engine = engine

     @overload
-    async def run(self, url: str) -> str:
+    async def run(self, url: str) -> WebPage:
         ...

     @overload
-    async def run(self, url: str, *urls: str) -> list[str]:
+    async def run(self, url: str, *urls: str) -> list[WebPage]:
         ...

-    async def run(self, url: str, *urls: str) -> str | list[str]:
-        page = await self.run_func(url, *urls)
-        if isinstance(page, str):
-            return self.parse_func(page)
-        return [self.parse_func(i) for i in page]
-
-
-def get_page_content(page: str):
-    soup = BeautifulSoup(page, "html.parser")
-    return "\n".join(i.text.strip() for i in soup.find_all(["h1", "h2", "h3", "h4", "h5", "p", "pre"]))
+    async def run(self, url: str, *urls: str) -> WebPage | list[WebPage]:
+        return await self.run_func(url, *urls)


 if __name__ == "__main__":
-    text = asyncio.run(WebBrowserEngine().run("https://fuzhi.ai/"))
-    print(text)
+    import fire
+
+    async def main(url: str, *urls: str, engine_type: Literal["playwright", "selenium"] = "playwright", **kwargs):
+        return await WebBrowserEngine(WebBrowserEngineType(engine_type), **kwargs).run(url, *urls)
+
+    fire.Fire(main)
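For orientation, a minimal caller-side sketch of the unified API (illustrative only, not part of the commit; it assumes the class above lives in metagpt.tools.web_browser_engine and that CONFIG provides a default engine):

    import asyncio

    from metagpt.tools.web_browser_engine import WebBrowserEngine


    async def demo():
        # run() now resolves to a WebPage object instead of pre-parsed text
        page = await WebBrowserEngine().run("https://example.com")
        print(page.url, page.title)       # request metadata travels with the page
        print(page.inner_text[:200])      # visible text captured by the engine

        # several URLs yield a list of WebPage objects
        pages = await WebBrowserEngine().run("https://example.com", "https://example.org")
        print([p.title for p in pages])


    asyncio.run(demo())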

metagpt/tools/web_browser_engine_playwright.py

@@ -2,12 +2,15 @@
 from __future__ import annotations

 import asyncio
-from pathlib import Path
+import sys
+from pathlib import Path
 from typing import Literal

 from playwright.async_api import async_playwright

 from metagpt.config import CONFIG
 from metagpt.logs import logger
+from metagpt.utils.parse_html import WebPage


 class PlaywrightWrapper:
@@ -16,7 +19,7 @@ class PlaywrightWrapper:
     To use this module, you should have the `playwright` Python package installed and ensure that
     the required browsers are also installed. You can install playwright by running the command
     `pip install metagpt[playwright]` and download the necessary browser binaries by running the
-    command `playwright install` for the first time."
+    command `playwright install` for the first time.
     """

     def __init__(
@@ -40,27 +43,30 @@ class PlaywrightWrapper:
         self._context_kwargs = context_kwargs
         self._has_run_precheck = False

-    async def run(self, url: str, *urls: str) -> str | list[str]:
+    async def run(self, url: str, *urls: str) -> WebPage | list[WebPage]:
         async with async_playwright() as ap:
             browser_type = getattr(ap, self.browser_type)
             await self._run_precheck(browser_type)
             browser = await browser_type.launch(**self.launch_kwargs)
-
-            async def _scrape(url):
-                context = await browser.new_context(**self._context_kwargs)
-                page = await context.new_page()
-                async with page:
-                    try:
-                        await page.goto(url)
-                        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-                        content = await page.content()
-                        return content
-                    except Exception as e:
-                        return f"Fail to load page content for {e}"
+            _scrape = self._scrape

             if urls:
-                return await asyncio.gather(_scrape(url), *(_scrape(i) for i in urls))
-            return await _scrape(url)
+                return await asyncio.gather(_scrape(browser, url), *(_scrape(browser, i) for i in urls))
+            return await _scrape(browser, url)
+
+    async def _scrape(self, browser, url):
+        context = await browser.new_context(**self._context_kwargs)
+        page = await context.new_page()
+        async with page:
+            try:
+                await page.goto(url)
+                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                html = await page.content()
+                inner_text = await page.evaluate("() => document.body.innerText")
+            except Exception as e:
+                inner_text = f"Fail to load page content for {e}"
+                html = ""
+            return WebPage(inner_text=inner_text, html=html, url=url)

     async def _run_precheck(self, browser_type):
         if self._has_run_precheck:
@@ -72,6 +78,10 @@ class PlaywrightWrapper:
             if CONFIG.global_proxy:
                 kwargs["env"] = {"ALL_PROXY": CONFIG.global_proxy}
             await _install_browsers(self.browser_type, **kwargs)
+
+            if self._has_run_precheck:
+                return
+
             if not executable_path.exists():
                 parts = executable_path.parts
                 available_paths = list(Path(*parts[:-3]).glob(f"{self.browser_type}-*"))
@@ -85,25 +95,37 @@
         self._has_run_precheck = True


+def _get_install_lock():
+    global _install_lock
+    if _install_lock is None:
+        _install_lock = asyncio.Lock()
+    return _install_lock
+
+
 async def _install_browsers(*browsers, **kwargs) -> None:
-    process = await asyncio.create_subprocess_exec(
-        sys.executable,
-        "-m",
-        "playwright",
-        "install",
-        *browsers,
-        "--with-deps",
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE,
-        **kwargs,
-    )
+    async with _get_install_lock():
+        browsers = [i for i in browsers if i not in _install_cache]
+        if not browsers:
+            return
+        process = await asyncio.create_subprocess_exec(
+            sys.executable,
+            "-m",
+            "playwright",
+            "install",
+            *browsers,
+            # "--with-deps",
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+            **kwargs,
+        )

-    await asyncio.gather(_log_stream(process.stdout, logger.info), _log_stream(process.stderr, logger.warning))
+        await asyncio.gather(_log_stream(process.stdout, logger.info), _log_stream(process.stderr, logger.warning))

-    if await process.wait() == 0:
-        logger.info(f"Install browser for playwright successfully.")
-    else:
-        logger.warning(f"Fail to install browser for playwright.")
+        if await process.wait() == 0:
+            logger.info("Install browser for playwright successfully.")
+        else:
+            logger.warning("Fail to install browser for playwright.")
+        _install_cache.update(browsers)


 async def _log_stream(sr, log_func):
@@ -114,8 +136,14 @@ async def _log_stream(sr, log_func):
         log_func(f"[playwright install browser]: {line.decode().strip()}")


+_install_lock: asyncio.Lock = None
+_install_cache = set()
+
+
 if __name__ == "__main__":
-    for i in ("chromium", "firefox", "webkit"):
-        text = asyncio.run(PlaywrightWrapper(i).run("https://httpbin.org/ip"))
-        print(text)
-        print(i)
+    import fire
+
+    async def main(url: str, *urls: str, browser_type: str = "chromium", **kwargs):
+        return await PlaywrightWrapper(browser_type, **kwargs).run(url, *urls)
+
+    fire.Fire(main)
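The rewritten install path serializes concurrent `playwright install` runs behind a lazily created module-level lock and skips browsers already handled in this process. A standalone sketch of that pattern (hypothetical names, for illustration; the real code shells out to `python -m playwright install` instead of printing):

    import asyncio

    _install_lock = None      # asyncio.Lock, created lazily inside a running event loop
    _install_cache = set()    # browser names already installed by this process


    def _get_install_lock():
        global _install_lock
        if _install_lock is None:
            _install_lock = asyncio.Lock()
        return _install_lock


    async def install(*browsers):
        async with _get_install_lock():                  # one installer at a time
            pending = [b for b in browsers if b not in _install_cache]
            if not pending:
                return                                   # nothing left to do
            print("installing:", pending)                # stand-in for the subprocess call
            _install_cache.update(pending)


    async def main():
        # concurrent callers race for the lock; "chromium" is only installed once
        await asyncio.gather(install("chromium"), install("chromium"), install("firefox"))


    asyncio.run(main())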

metagpt/tools/web_browser_engine_selenium.py

@@ -2,16 +2,17 @@
 from __future__ import annotations

 import asyncio
-from copy import deepcopy
 import importlib
+from concurrent import futures
+from copy import deepcopy
 from typing import Literal
-from metagpt.config import CONFIG
-import asyncio

 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
-from concurrent import futures

+from metagpt.config import CONFIG
+from metagpt.utils.parse_html import WebPage


 class SeleniumWrapper:
@@ -48,7 +49,7 @@ class SeleniumWrapper:
         self.loop = loop
         self.executor = executor

-    async def run(self, url: str, *urls: str) -> str | list[str]:
+    async def run(self, url: str, *urls: str) -> WebPage | list[WebPage]:
         await self._run_precheck()

         _scrape = lambda url: self.loop.run_in_executor(self.executor, self._scrape_website, url)
@@ -69,9 +70,15 @@

     def _scrape_website(self, url):
         with self._get_driver() as driver:
-            driver.get(url)
-            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-            return driver.page_source
+            try:
+                driver.get(url)
+                WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+                inner_text = driver.execute_script("return document.body.innerText;")
+                html = driver.page_source
+            except Exception as e:
+                inner_text = f"Fail to load page content for {e}"
+                html = ""
+            return WebPage(inner_text=inner_text, html=html, url=url)


 _webdriver_manager_types = {
@@ -97,6 +104,7 @@ def _gen_get_driver_func(browser_type, *args, executable_path=None):
     def _get_driver():
         options = Options()
         options.add_argument("--headless")
+        options.add_argument("--enable-javascript")
         if browser_type == "chrome":
             options.add_argument("--no-sandbox")
         for i in args:
@@ -107,5 +115,9 @@


 if __name__ == "__main__":
-    text = asyncio.run(SeleniumWrapper("chrome").run("https://fuzhi.ai/"))
-    print(text)
+    import fire
+
+    async def main(url: str, *urls: str, browser_type: str = "chrome", **kwargs):
+        return await SeleniumWrapper(browser_type, **kwargs).run(url, *urls)
+
+    fire.Fire(main)
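SeleniumWrapper keeps the WebDriver work synchronous and pushes it onto an executor via `loop.run_in_executor`, which is what lets `run()` stay awaitable and scrape several URLs concurrently. A generic sketch of that pattern (illustrative only, with a dummy blocking function in place of the real Selenium calls):

    import asyncio
    import time
    from concurrent import futures


    def blocking_scrape(url):
        # stands in for driver.get(url), the explicit wait, and page_source
        time.sleep(1)
        return f"<html>content of {url}</html>"


    async def scrape_many(urls):
        loop = asyncio.get_running_loop()
        executor = futures.ThreadPoolExecutor()
        # each blocking call runs in a worker thread; gather awaits them concurrently
        tasks = [loop.run_in_executor(executor, blocking_scrape, u) for u in urls]
        return await asyncio.gather(*tasks)


    print(asyncio.run(scrape_many(["https://example.com", "https://example.org"])))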

metagpt/utils/parse_html.py (new file, 57 lines)

@@ -0,0 +1,57 @@
#!/usr/bin/env python
from __future__ import annotations

from typing import Generator, Optional
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from pydantic import BaseModel


class WebPage(BaseModel):
    inner_text: str
    html: str
    url: str

    class Config:
        underscore_attrs_are_private = True

    _soup: Optional[BeautifulSoup] = None
    _title: Optional[str] = None

    @property
    def soup(self) -> BeautifulSoup:
        if self._soup is None:
            self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    @property
    def title(self):
        if self._title is None:
            title_tag = self.soup.find("title")
            self._title = title_tag.text.strip() if title_tag is not None else ""
        return self._title

    def get_links(self) -> Generator[str, None, None]:
        for i in self.soup.find_all("a", href=True):
            url = i["href"]
            if url.startswith("data:"):
                continue
            if not url.startswith(("http://", "https://")):
                url = urljoin(self.url, url)
            yield url


def get_html_content(page: str, base: str):
    soup = _get_soup(page)

    return soup.get_text(strip=True)


def _get_soup(page: str):
    soup = BeautifulSoup(page, "html.parser")
    # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
    for s in soup(["style", "script", "[document]", "head", "title"]):
        s.extract()

    return soup
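A quick sketch of how the new model behaves on its own (illustrative; the HTML snippet and URL here are made up):

    from metagpt.utils.parse_html import WebPage, get_html_content

    html = '<html><head><title>Demo</title></head><body><p>Hi</p><a href="/about">about</a></body></html>'
    page = WebPage(inner_text="Hi about", html=html, url="https://example.com/index.html")

    print(page.title)              # "Demo", parsed lazily from html via BeautifulSoup
    print(list(page.get_links()))  # relative hrefs are resolved against page.url
    print(get_html_content(html, "https://example.com"))  # visible text with markup stripped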

tests/metagpt/tools/test_web_browser_engine_playwright.py

@@ -1,4 +1,5 @@
 import pytest
+
 from metagpt.config import CONFIG
 from metagpt.tools import web_browser_engine_playwright

@@ -20,6 +21,7 @@ async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy
         CONFIG.global_proxy = proxy
         browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type, **kwagrs)
         result = await browser.run(url)
+        result = result.inner_text
         assert isinstance(result, str)
         assert "Deepwisdom" in result


tests/metagpt/tools/test_web_browser_engine_selenium.py

@@ -1,4 +1,5 @@
 import pytest
+
 from metagpt.config import CONFIG
 from metagpt.tools import web_browser_engine_selenium

@@ -20,6 +21,7 @@ async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd)
         CONFIG.global_proxy = proxy
         browser = web_browser_engine_selenium.SeleniumWrapper(browser_type)
         result = await browser.run(url)
+        result = result.inner_text
         assert isinstance(result, str)
         assert "Deepwisdom" in result

@@ -27,7 +29,7 @@ async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd)
         results = await browser.run(url, *urls)
         assert isinstance(results, list)
         assert len(results) == len(urls) + 1
-        assert all(("Deepwisdom" in i) for i in results)
+        assert all(("Deepwisdom" in i.inner_text) for i in results)
         if use_proxy:
             assert "Proxy:" in capfd.readouterr().out
     finally:

tests/metagpt/utils/test_parse_html.py (new file, 65 lines)

@@ -0,0 +1,65 @@
from metagpt.utils import parse_html

PAGE = """
<!DOCTYPE html>
<html>
<head>
    <title>Random HTML Example</title>
</head>
<body>
    <h1>This is a Heading</h1>
    <p>This is a paragraph with <a href="test">a link</a> and some <em>emphasized</em> text.</p>
    <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
    </ul>
    <ol>
        <li>Numbered Item 1</li>
        <li>Numbered Item 2</li>
        <li>Numbered Item 3</li>
    </ol>
    <table>
        <tr>
            <th>Header 1</th>
            <th>Header 2</th>
        </tr>
        <tr>
            <td>Row 1, Cell 1</td>
            <td>Row 1, Cell 2</td>
        </tr>
        <tr>
            <td>Row 2, Cell 1</td>
            <td>Row 2, Cell 2</td>
        </tr>
    </table>
    <img src="image.jpg" alt="Sample Image">
    <form action="/submit" method="post">
        <label for="name">Name:</label>
        <input type="text" id="name" name="name" required>
        <label for="email">Email:</label>
        <input type="email" id="email" name="email" required>
        <button type="submit">Submit</button>
    </form>
    <div class="box">
        <p>This is a div with a class "box".</p>
        <p><a href="https://metagpt.com">a link</a></p>
    </div>
</body>
</html>
"""

CONTENT = 'This is a HeadingThis is a paragraph witha linkand someemphasizedtext.Item 1Item 2Item 3Numbered Item 1Numbered '\
    'Item 2Numbered Item 3Header 1Header 2Row 1, Cell 1Row 1, Cell 2Row 2, Cell 1Row 2, Cell 2Name:Email:SubmitThis is a div '\
    'with a class "box".a link'


def test_web_page():
    page = parse_html.WebPage(inner_text=CONTENT, html=PAGE, url="http://example.com")
    assert page.title == "Random HTML Example"
    assert list(page.get_links()) == ["http://example.com/test", "https://metagpt.com"]


def test_get_page_content():
    ret = parse_html.get_html_content(PAGE, "http://example.com")
    assert ret == CONTENT
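Assuming the paths above, the new parse_html tests need no browser and can be exercised in isolation; a minimal sketch, equivalent to calling pytest on that file from the repository root:

    import pytest

    pytest.main(["-q", "tests/metagpt/utils/test_parse_html.py"])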