diff --git a/examples/di/crawl_webpage.py b/examples/di/crawl_webpage.py
index 92e3c32b0..c4e1b6599 100644
--- a/examples/di/crawl_webpage.py
+++ b/examples/di/crawl_webpage.py
@@ -6,9 +6,7 @@
 """
 from metagpt.roles.di.data_interpreter import DataInterpreter
-
-__import__("metagpt.tools.libs.browser", fromlist=["Browser"])  # To skip pre-commit check
-
+from metagpt.tools.libs.web_scraping import view_page_element_to_scrape
 
 PAPER_LIST_REQ = """"
 Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
@@ -34,7 +32,7 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
 
 
 async def main():
-    di = DataInterpreter(tools=["Browser"])
+    di = DataInterpreter(tools=[view_page_element_to_scrape.__name__])
     await di.run(ECOMMERCE_REQ)
 
 
diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py
index c237dcf69..8a9ccaffd 100644
--- a/metagpt/rag/engines/simple.py
+++ b/metagpt/rag/engines/simple.py
@@ -4,6 +4,7 @@
 import json
 import os
 from typing import Any, Optional, Union
+import fsspec
 from llama_index.core import SimpleDirectoryReader
 from llama_index.core.callbacks.base import CallbackManager
 from llama_index.core.embeddings import BaseEmbedding
@@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine):
         llm: LLM = None,
         retriever_configs: list[BaseRetrieverConfig] = None,
         ranker_configs: list[BaseRankerConfig] = None,
+        fs: Optional[fsspec.AbstractFileSystem] = None,
     ) -> "SimpleEngine":
         """From docs.
@@ -96,11 +98,12 @@
             llm: Must be supported by llama index. Default OpenAI.
             retriever_configs: Configuration for retrievers. If more than one config, will use SimpleHybridRetriever.
             ranker_configs: Configuration for rankers.
+            fs: File system to read input files from; defaults to the local file system.
         """
         if not input_dir and not input_files:
             raise ValueError("Must provide either `input_dir` or `input_files`.")
-        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
+        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data()
         cls._fix_document_metadata(documents)
 
         transformations = transformations or cls._default_transformations()
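Threading `fs` through `from_docs` lets the engine index documents living on any fsspec-compatible filesystem, which is what the new `view_page_element_to_scrape` tool below relies on. A minimal sketch of the new parameter, assuming MetaGPT's default LLM/embedding configuration is available; the file name and query are illustrative:

```python
import asyncio

from fsspec.implementations.memory import MemoryFileSystem

from metagpt.rag.engines import SimpleEngine


async def demo():
    # Write a document to an in-memory filesystem instead of local disk.
    mem_fs = MemoryFileSystem()
    with mem_fs.open("notes.txt", "w") as f:
        f.write("MetaGPT is a multi-agent framework.")

    # `fs` is forwarded to SimpleDirectoryReader, so the input file
    # never has to exist on the local disk.
    engine = SimpleEngine.from_docs(input_files=["notes.txt"], fs=mem_fs)
    nodes = await engine.aretrieve("What is MetaGPT?")
    print("\n".join(node.text for node in nodes))


asyncio.run(demo())
```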
""" if not input_dir and not input_files: raise ValueError("Must provide either `input_dir` or `input_files`.") - documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data() + documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data() cls._fix_document_metadata(documents) transformations = transformations or cls._default_transformations() diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py index df9d43135..9d24d4baf 100644 --- a/metagpt/tools/libs/browser.py +++ b/metagpt/tools/libs/browser.py @@ -28,7 +28,7 @@ from metagpt.utils.report import BrowserReporter @register_tool( - tags=["web", "browse", "scrape"], + tags=["web", "browse"], include_functions=[ "click", "close_tab", @@ -197,3 +197,10 @@ class Browser: async def view(self): observation = parse_accessibility_tree(self.accessibility_tree) return f"Current _Browser Viewer\n URL: {self.page.url}\nOBSERVATION:\n{observation[0]}\n" + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, *args, **kwargs): + await self.stop() diff --git a/metagpt/tools/libs/web_scraping.py b/metagpt/tools/libs/web_scraping.py index bc34b1306..489c3a472 100644 --- a/metagpt/tools/libs/web_scraping.py +++ b/metagpt/tools/libs/web_scraping.py @@ -1,20 +1,50 @@ +import contextlib +from uuid import uuid4 + +from metagpt.tools.libs.browser import Browser from metagpt.tools.tool_registry import register_tool -from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper +from metagpt.utils.file import MemoryFileSystem +from metagpt.utils.parse_html import simplify_html -@register_tool(tags=["web scraping", "web"]) -async def scrape_web_playwright(url): - """ - Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright. +@register_tool(tags=["web scraping"]) +async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None: + """view the HTML content of current page to understand the structure. When executed, the content will be printed out Args: - url (str): The main URL to fetch inner text from. - - Returns: - dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'. + url (str): The URL of the web page to scrape. + requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements. + keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required """ - # Create a PlaywrightWrapper instance for the Chromium browser - web = await PlaywrightWrapper().run(url) + async with Browser() as browser: + await browser.goto(url) + page = browser.page + html = await page.content() + html = simplify_html(html, url=page.url, keep_links=keep_links) + mem_fs = MemoryFileSystem() + filename = f"{uuid4().hex}.html" + with mem_fs.open(filename, "w") as f: + f.write(html) - # Return the inner text content of the web page - return {"inner_text": web.inner_text.strip(), "html": web.html.strip()} + # Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback. 
diff --git a/metagpt/tools/libs/web_scraping.py b/metagpt/tools/libs/web_scraping.py
index bc34b1306..489c3a472 100644
--- a/metagpt/tools/libs/web_scraping.py
+++ b/metagpt/tools/libs/web_scraping.py
@@ -1,20 +1,50 @@
+import contextlib
+from uuid import uuid4
+
+from metagpt.tools.libs.browser import Browser
 from metagpt.tools.tool_registry import register_tool
-from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
+from metagpt.utils.file import MemoryFileSystem
+from metagpt.utils.parse_html import simplify_html
 
 
-@register_tool(tags=["web scraping", "web"])
-async def scrape_web_playwright(url):
-    """
-    Asynchronously Scrape and save the HTML structure and inner text content of a web page using Playwright.
+@register_tool(tags=["web scraping"])
+async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None:
+    """View the HTML content of a web page to understand its structure. When executed, the content will be printed out.
 
     Args:
-        url (str): The main URL to fetch inner text from.
-
-    Returns:
-        dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.
+        url (str): The URL of the web page to scrape.
+        requirement (str): A clear and detailed requirement that focuses the inspection on the desired elements.
+        keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required.
     """
-    # Create a PlaywrightWrapper instance for the Chromium browser
-    web = await PlaywrightWrapper().run(url)
+    async with Browser() as browser:
+        await browser.goto(url)
+        page = browser.page
+        html = await page.content()
+        html = simplify_html(html, url=page.url, keep_links=keep_links)
+        mem_fs = MemoryFileSystem()
+        filename = f"{uuid4().hex}.html"
+        with mem_fs.open(filename, "w") as f:
+            f.write(html)
 
-    # Return the inner text content of the web page
-    return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}
+        # RAG is an optional optimization; if it fails, fall back to the simplified HTML.
+        with contextlib.suppress(Exception):
+            from metagpt.rag.engines import SimpleEngine  # avoid circular import
+
+            # TODO: make `from_docs` asynchronous
+            engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
+            nodes = await engine.aretrieve(requirement)
+            html = "\n".join(i.text for i in nodes)
+
+        mem_fs.rm_file(filename)
+        print(html)
+
+
+# async def get_elements_outerhtml(self, element_ids: list[int]):
+#     """Inspect the outer HTML of the elements in Current Browser Viewer.
+#     """
+#     page = self.page
+#     data = []
+#     for element_id in element_ids:
+#         html = await get_element_outer_html(page, get_backend_node_id(element_id, self.accessibility_tree))
+#         data.append(html)
+#     return "\n".join(f"[{element_id}]. {html}" for element_id, html in zip(element_ids, data))
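For a quick smoke test outside the `DataInterpreter` flow, the tool can be awaited directly; note that it prints the (possibly RAG-filtered) simplified HTML rather than returning it. A sketch with illustrative arguments:

```python
import asyncio

from metagpt.tools.libs.web_scraping import view_page_element_to_scrape

# Prints the simplified HTML, filtered by RAG retrieval when available.
asyncio.run(
    view_page_element_to_scrape(
        url="https://example.com",
        requirement="Locate the table listing product names and prices.",
        keep_links=False,
    )
)
```

Within `DataInterpreter`, the tool is registered by function name, which is why `crawl_webpage.py` now passes `view_page_element_to_scrape.__name__` instead of the `"Browser"` class name.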