conflict fixes

This commit is contained in:
brucemeek 2023-08-15 07:46:02 -05:00
parent 193178b7d1
commit 3ce123a7bf
3 changed files with 69 additions and 63 deletions

View file

@ -2,12 +2,15 @@
from __future__ import annotations
import asyncio
from pathlib import Path
import sys
from pathlib import Path
from typing import Literal
from playwright.async_api import async_playwright
from metagpt.config import CONFIG
from metagpt.logs import logger
from metagpt.utils.parse_html import WebPage
class PlaywrightWrapper:
@ -16,7 +19,7 @@ class PlaywrightWrapper:
To use this module, you should have the `playwright` Python package installed and ensure that
the required browsers are also installed. You can install playwright by running the command
`pip install metagpt[playwright]` and download the necessary browser binaries by running the
command `playwright install` for the first time."
command `playwright install` for the first time.
"""
def __init__(
@ -40,27 +43,30 @@ class PlaywrightWrapper:
self._context_kwargs = context_kwargs
self._has_run_precheck = False
async def run(self, url: str, *urls: str) -> str | list[str]:
async def run(self, url: str, *urls: str) -> WebPage | list[WebPage]:
async with async_playwright() as ap:
browser_type = getattr(ap, self.browser_type)
await self._run_precheck(browser_type)
browser = await browser_type.launch(**self.launch_kwargs)
async def _scrape(url):
context = await browser.new_context(**self._context_kwargs)
page = await context.new_page()
async with page:
try:
await page.goto(url)
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
content = await page.content()
return content
except Exception as e:
return f"Fail to load page content for {e}"
_scrape = self._scrape
if urls:
return await asyncio.gather(_scrape(url), *(_scrape(i) for i in urls))
return await _scrape(url)
return await asyncio.gather(_scrape(browser, url), *(_scrape(browser, i) for i in urls))
return await _scrape(browser, url)
async def _scrape(self, browser, url):
context = await browser.new_context(**self._context_kwargs)
page = await context.new_page()
async with page:
try:
await page.goto(url)
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
html = await page.content()
inner_text = await page.evaluate("() => document.body.innerText")
except Exception as e:
inner_text = f"Fail to load page content for {e}"
html = ""
return WebPage(inner_text=inner_text, html=html, url=url)
async def _run_precheck(self, browser_type):
if self._has_run_precheck:
@ -72,6 +78,10 @@ class PlaywrightWrapper:
if CONFIG.global_proxy:
kwargs["env"] = {"ALL_PROXY": CONFIG.global_proxy}
await _install_browsers(self.browser_type, **kwargs)
if self._has_run_precheck:
return
if not executable_path.exists():
parts = executable_path.parts
available_paths = list(Path(*parts[:-3]).glob(f"{self.browser_type}-*"))
@ -85,25 +95,37 @@ class PlaywrightWrapper:
self._has_run_precheck = True
def _get_install_lock():
global _install_lock
if _install_lock is None:
_install_lock = asyncio.Lock()
return _install_lock
async def _install_browsers(*browsers, **kwargs) -> None:
process = await asyncio.create_subprocess_exec(
sys.executable,
"-m",
"playwright",
"install",
*browsers,
"--with-deps",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
**kwargs,
)
async with _get_install_lock():
browsers = [i for i in browsers if i not in _install_cache]
if not browsers:
return
process = await asyncio.create_subprocess_exec(
sys.executable,
"-m",
"playwright",
"install",
*browsers,
# "--with-deps",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
**kwargs,
)
await asyncio.gather(_log_stream(process.stdout, logger.info), _log_stream(process.stderr, logger.warning))
await asyncio.gather(_log_stream(process.stdout, logger.info), _log_stream(process.stderr, logger.warning))
if await process.wait() == 0:
logger.info(f"Install browser for playwright successfully.")
else:
logger.warning(f"Fail to install browser for playwright.")
if await process.wait() == 0:
logger.info("Install browser for playwright successfully.")
else:
logger.warning("Fail to install browser for playwright.")
_install_cache.update(browsers)
async def _log_stream(sr, log_func):
@ -114,8 +136,14 @@ async def _log_stream(sr, log_func):
log_func(f"[playwright install browser]: {line.decode().strip()}")
_install_lock: asyncio.Lock = None
_install_cache = set()
if __name__ == "__main__":
for i in ("chromium", "firefox", "webkit"):
text = asyncio.run(PlaywrightWrapper(i).run("https://httpbin.org/ip"))
print(text)
print(i)
import fire
async def main(url: str, *urls: str, browser_type: str = "chromium", **kwargs):
return await PlaywrightWrapper(browser_type, **kwargs).run(url, *urls)
fire.Fire(main)