add view_page_element_to_scrape tool

This commit is contained in:
shenchucheng 2024-06-25 11:04:43 +08:00
parent 877b06bfdc
commit bd675d5178
4 changed files with 57 additions and 19 deletions

View file

@ -28,7 +28,7 @@ from metagpt.utils.report import BrowserReporter
@register_tool(
tags=["web", "browse", "scrape"],
tags=["web", "browse"],
include_functions=[
"click",
"close_tab",
@ -197,3 +197,10 @@ class Browser:
async def view(self):
    """Return a text snapshot of the page currently held by this browser.

    The snapshot contains the URL of ``self.page`` followed by the first
    element of the result of ``parse_accessibility_tree`` applied to
    ``self.accessibility_tree``.

    Returns:
        str: The formatted "URL / OBSERVATION" report.
    """
    tree = self.accessibility_tree
    observation = parse_accessibility_tree(tree)
    # Only the first element of the parsed result is shown to the caller.
    return f"Current _Browser Viewer\n URL: {self.page.url}\nOBSERVATION:\n{observation[0]}\n"
async def __aenter__(self):
    """Enter an ``async with`` block.

    Awaits ``self.start()`` and then returns this instance, so that
    ``async with Browser() as browser`` binds ``browser`` to it.
    """
    await self.start()
    return self
async def __aexit__(self, *args, **kwargs):
    """Leave an ``async with`` block by awaiting ``self.stop()``.

    The exception details supplied by the runtime are accepted but not
    inspected. Nothing is returned, so an exception raised inside the
    ``async with`` body is not suppressed.
    """
    await self.stop()

View file

@ -1,20 +1,50 @@
import contextlib
from uuid import uuid4
from metagpt.tools.libs.browser import Browser
from metagpt.tools.tool_registry import register_tool
from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
from metagpt.utils.file import MemoryFileSystem
from metagpt.utils.parse_html import simplify_html
@register_tool(tags=["web scraping", "web"])
async def scrape_web_playwright(url):
"""
Asynchronously scrape and save the HTML structure and inner text content of a web page using Playwright.
@register_tool(tags=["web scraping"])
async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None:
"""View the HTML content of the web page at `url` to understand its structure. When executed, the content is printed out.
Args:
url (str): The main URL to fetch inner text from.
Returns:
dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.
url (str): The URL of the web page to scrape.
requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required.
"""
# Create a PlaywrightWrapper instance for the Chromium browser
web = await PlaywrightWrapper().run(url)
async with Browser() as browser:
await browser.goto(url)
page = browser.page
html = await page.content()
html = simplify_html(html, url=page.url, keep_links=keep_links)
mem_fs = MemoryFileSystem()
filename = f"{uuid4().hex}.html"
with mem_fs.open(filename, "w") as f:
f.write(html)
# Return the inner text content of the web page
return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}
# Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback.
with contextlib.suppress(Exception):
from metagpt.rag.engines import SimpleEngine # avoid circular import
# TODO make `from_docs` asynchronous
engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
nodes = await engine.aretrieve(requirement)
html = "\n".join(i.text for i in nodes)
mem_fs.rm_file(filename)
print(html)
# async def get_elements_outerhtml(self, element_ids: list[int]):
# """Inspect the outer HTML of the elements in Current Browser Viewer.
# """
# page = self.page
# data = []
# for element_id in element_ids:
# html = await get_element_outer_html(page, get_backend_node_id(element_id, self.accessibility_tree))
# data.append(html)
# return "\n".join(f"[{element_id}]. {html}" for element_id, html in zip(element_ids, data))