mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-21 14:05:17 +02:00
add view_page_element_to_scrape tool
This commit is contained in:
parent
877b06bfdc
commit
bd675d5178
4 changed files with 57 additions and 19 deletions
|
|
@ -6,9 +6,7 @@
|
|||
"""
|
||||
|
||||
from metagpt.roles.di.data_interpreter import DataInterpreter
|
||||
|
||||
__import__("metagpt.tools.libs.browser", fromlist=["Browser"]) # To skip pre-commit check
|
||||
|
||||
from metagpt.tools.libs.web_scraping import view_page_element_to_scrape
|
||||
|
||||
PAPER_LIST_REQ = """"
|
||||
Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
|
||||
|
|
@ -34,7 +32,7 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
|
|||
|
||||
|
||||
async def main():
|
||||
di = DataInterpreter(tools=["Browser"])
|
||||
di = DataInterpreter(tools=[view_page_element_to_scrape.__name__])
|
||||
|
||||
await di.run(ECOMMERCE_REQ)
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import json
|
|||
import os
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import fsspec
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.callbacks.base import CallbackManager
|
||||
from llama_index.core.embeddings import BaseEmbedding
|
||||
|
|
@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine):
|
|||
llm: LLM = None,
|
||||
retriever_configs: list[BaseRetrieverConfig] = None,
|
||||
ranker_configs: list[BaseRankerConfig] = None,
|
||||
fs: Optional[fsspec.AbstractFileSystem] = None,
|
||||
) -> "SimpleEngine":
|
||||
"""From docs.
|
||||
|
||||
|
|
@ -96,11 +98,12 @@ class SimpleEngine(RetrieverQueryEngine):
|
|||
llm: Must be supported by llama index. Default OpenAI.
|
||||
retriever_configs: Configuration for retrievers. If more than one config, will use SimpleHybridRetriever.
|
||||
ranker_configs: Configuration for rankers.
|
||||
fs: File system to use.
|
||||
"""
|
||||
if not input_dir and not input_files:
|
||||
raise ValueError("Must provide either `input_dir` or `input_files`.")
|
||||
|
||||
documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
|
||||
documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data()
|
||||
cls._fix_document_metadata(documents)
|
||||
|
||||
transformations = transformations or cls._default_transformations()
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ from metagpt.utils.report import BrowserReporter
|
|||
|
||||
|
||||
@register_tool(
|
||||
tags=["web", "browse", "scrape"],
|
||||
tags=["web", "browse"],
|
||||
include_functions=[
|
||||
"click",
|
||||
"close_tab",
|
||||
|
|
@ -197,3 +197,10 @@ class Browser:
|
|||
async def view(self):
|
||||
observation = parse_accessibility_tree(self.accessibility_tree)
|
||||
return f"Current _Browser Viewer\n URL: {self.page.url}\nOBSERVATION:\n{observation[0]}\n"
|
||||
|
||||
async def __aenter__(self):
|
||||
await self.start()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, *args, **kwargs):
|
||||
await self.stop()
|
||||
|
|
|
|||
|
|
@ -1,20 +1,50 @@
|
|||
import contextlib
|
||||
from uuid import uuid4
|
||||
|
||||
from metagpt.tools.libs.browser import Browser
|
||||
from metagpt.tools.tool_registry import register_tool
|
||||
from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
|
||||
from metagpt.utils.file import MemoryFileSystem
|
||||
from metagpt.utils.parse_html import simplify_html
|
||||
|
||||
|
||||
@register_tool(tags=["web scraping", "web"])
|
||||
async def scrape_web_playwright(url):
|
||||
"""
|
||||
Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright.
|
||||
@register_tool(tags=["web scraping"])
|
||||
async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None:
|
||||
"""View the HTML content of the page at the given URL to understand its structure. When executed, the content will be printed out.
|
||||
|
||||
Args:
|
||||
url (str): The main URL to fetch inner text from.
|
||||
|
||||
Returns:
|
||||
dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.
|
||||
url (str): The URL of the web page to scrape.
|
||||
requirement (str): A clear and detailed requirement; it is used to focus the inspection on the desired elements.
|
||||
keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required.
|
||||
"""
|
||||
# Create a PlaywrightWrapper instance for the Chromium browser
|
||||
web = await PlaywrightWrapper().run(url)
|
||||
async with Browser() as browser:
|
||||
await browser.goto(url)
|
||||
page = browser.page
|
||||
html = await page.content()
|
||||
html = simplify_html(html, url=page.url, keep_links=keep_links)
|
||||
mem_fs = MemoryFileSystem()
|
||||
filename = f"{uuid4().hex}.html"
|
||||
with mem_fs.open(filename, "w") as f:
|
||||
f.write(html)
|
||||
|
||||
# Return the inner text content of the web page
|
||||
return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}
|
||||
# Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback.
|
||||
with contextlib.suppress(Exception):
|
||||
from metagpt.rag.engines import SimpleEngine # avoid circular import
|
||||
|
||||
# TODO make `from_docs` asynchronous
|
||||
engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
|
||||
nodes = await engine.aretrieve(requirement)
|
||||
html = "\n".join(i.text for i in nodes)
|
||||
|
||||
mem_fs.rm_file(filename)
|
||||
print(html)
|
||||
|
||||
|
||||
# async def get_elements_outerhtml(self, element_ids: list[int]):
|
||||
# """Inspect the outer HTML of the elements in Current Browser Viewer.
|
||||
# """
|
||||
# page = self.page
|
||||
# data = []
|
||||
# for element_id in element_ids:
|
||||
# html = await get_element_outer_html(page, get_backend_node_id(element_id, self.accessibility_tree))
|
||||
# data.append(html)
|
||||
# return "\n".join(f"[{element_id}]. {html}" for element_id, html in zip(element_ids, data))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue