diff --git a/examples/di/crawl_webpage.py b/examples/di/crawl_webpage.py
index b8226f4f4..10b230f2b 100644
--- a/examples/di/crawl_webpage.py
+++ b/examples/di/crawl_webpage.py
@@ -6,16 +6,19 @@
 """
 
 from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.tools.libs.browser import Browser as _
+
 
 PAPER_LIST_REQ = """"
 Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
-and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*
+and save it to a csv file. paper title must include `multiagent` or `large language model`.
+**Notice: view the page element before writing scraping code**
 """
 
 ECOMMERCE_REQ = """
 Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
-**Notice: Firstly parse the web page encoding and the text HTML structure;
-The first page product name, price, product URL, and image URL must be saved in the csv;**
+The first page product name, price, product URL, and image URL must be saved in the csv.
+**Notice: view the page element before writing scraping code**
 """
 
 NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;
@@ -25,11 +28,12 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
 3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间;
 4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。
 5. 将全部结果存在本地csv中
+**Notice: view the page element before writing scraping code**
 """
 
 
 async def main():
-    di = DataInterpreter(tools=["scrape_web_playwright"])
+    di = DataInterpreter(tools=["Browser"])
 
     await di.run(ECOMMERCE_REQ)
 
diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py
index 5c5810308..623b3f350 100644
--- a/metagpt/rag/engines/simple.py
+++ b/metagpt/rag/engines/simple.py
@@ -4,6 +4,7 @@ import json
 import os
 from typing import Any, Optional, Union
 
+from fsspec import AbstractFileSystem
 from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
 from llama_index.core.callbacks.base import CallbackManager
 from llama_index.core.embeddings import BaseEmbedding
@@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine):
         llm: LLM = None,
         retriever_configs: list[BaseRetrieverConfig] = None,
         ranker_configs: list[BaseRankerConfig] = None,
+        fs: Optional[AbstractFileSystem] = None,
     ) -> "SimpleEngine":
         """From docs.
 
@@ -100,7 +102,7 @@ class SimpleEngine(RetrieverQueryEngine):
         if not input_dir and not input_files:
             raise ValueError("Must provide either `input_dir` or `input_files`.")
 
-        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
+        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data()
         cls._fix_document_metadata(documents)
 
         index = VectorStoreIndex.from_documents(
diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py
index 7fde804fe..8d6daec11 100644
--- a/metagpt/tools/libs/browser.py
+++ b/metagpt/tools/libs/browser.py
@@ -1,9 +1,12 @@
 from __future__ import annotations
+import contextlib
 
 from playwright.async_api import async_playwright
-
+from metagpt.utils.file import MemoryFileSystem
+from uuid import uuid4
 from metagpt.const import DEFAULT_WORKSPACE_ROOT
 from metagpt.tools.tool_registry import register_tool
+from metagpt.utils.parse_html import simplify_html
 from metagpt.utils.report import BrowserReporter
 
 
@@ -35,16 +38,49 @@ class Browser:
         print("Now on page ", url)
         await self._view()
 
-    async def open_new_page(self, url: str):
+    async def open_new_page(self, url: str, timeout: float = 30000):
         """open a new page in the browser and view the page"""
         async with self.reporter as reporter:
             page = await self.browser.new_page()
             await reporter.async_report(url, "url")
-            await page.goto(url)
+            await page.goto(url, timeout=timeout)
             self.pages[url] = page
             await self._set_current_page(page, url)
             await reporter.async_report(page, "page")
 
+    async def view_page_element_to_scrape(self, requirement: str, keep_links: bool = False) -> None:
+        """view the HTML content of current page to understand the structure. When executed, the content will be printed out
+
+        Args:
+            requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
+            keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
+        """
+        html = await self.current_page.content()
+        html = simplify_html(html, url=self.current_page.url, keep_links=keep_links)
+        mem_fs = MemoryFileSystem()
+        filename = f"{uuid4().hex}.html"
+        try:
+            with mem_fs.open(filename, "w") as f:
+                f.write(html)
+            # Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback.
+            with contextlib.suppress(Exception):
+                from metagpt.rag.engines import SimpleEngine  # avoid circular import
+
+                # TODO make `from_docs` asynchronous
+                engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
+                nodes = await engine.aretrieve(requirement)
+                html = "\n".join(i.text for i in nodes) or html  # empty retrieval: keep the simplified HTML
+        finally:
+            with contextlib.suppress(FileNotFoundError):  # the temp file is removed even on error or cancellation
+                mem_fs.rm_file(filename)
+        print(html)
+
+    async def get_page_content(self) -> str:
+        """Get the HTML content of current page."""
+        # Only surrounding whitespace is trimmed; the markup itself is returned unchanged.
+        raw_html = await self.current_page.content()
+        return raw_html.strip()
+
     async def switch_page(self, url: str):
         """switch to an opened page in the browser and view the page"""
         if url in self.pages:
@@ -152,8 +188,8 @@ class Browser:
 
     async def _view(self, keep_len: int = 5000) -> str:
         """simulate human viewing the current page, return the visible text with links"""
-        visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
-        print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
+        # visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
+        # print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
         # html_content = await self._view_page_html(keep_len=keep_len)
         # print("The html content: ", html_content)
 
diff --git a/metagpt/utils/file.py b/metagpt/utils/file.py
index f62b44eb8..a8ed482d9 100644
--- a/metagpt/utils/file.py
+++ b/metagpt/utils/file.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 
 import aiofiles
+from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem
 
 from metagpt.logs import logger
 from metagpt.utils.exceptions import handle_exception
@@ -68,3 +69,10 @@ class File:
             content = b"".join(chunks)
             logger.debug(f"Successfully read file, the path of file: {file_path}")
             return content
+
+
+class MemoryFileSystem(_MemoryFileSystem):
+    # Coerces ``path`` to ``str`` before the parent's normalisation; presumably to accept ``pathlib.Path`` - confirm.
+    @classmethod
+    def _strip_protocol(cls, path):
+        return super()._strip_protocol(str(path))
diff --git a/metagpt/utils/parse_html.py b/metagpt/utils/parse_html.py
index 65aa3f236..3aac8ca6c 100644
--- a/metagpt/utils/parse_html.py
+++ b/metagpt/utils/parse_html.py
@@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, PrivateAttr
 
+import htmlmin
+
 
 class WebPage(BaseModel):
     inner_text: str
@@ -38,6 +40,22 @@ class WebPage(BaseModel):
             elif url.startswith(("http://", "https://")):
                 yield urljoin(self.url, url)
 
+    def get_slim_soup(self, keep_links: bool = False):
+        """Return the parsed page with media tags removed and non-empty attributes other than ``class`` stripped."""
+        slim = _get_soup(self.html)
+        kept = {"class", "href"} if keep_links else {"class"}  # ``href`` survives only on request
+        for tag in slim.find_all(True):
+            # Snapshot the candidate names first so deletion does not disturb the iteration.
+            for attr in [a for a in tag.attrs if a not in kept]:
+                # Attributes whose value is empty are left in place.
+                if tag[attr]:
+                    del tag[attr]
+        # Remove media elements along with everything nested inside them.
+        for media in slim.find_all(["svg", "img", "video", "audio"]):
+            media.decompose()
+
+        return slim
+
 
 def get_html_content(page: str, base: str):
     soup = _get_soup(page)
@@ -48,7 +66,12 @@ def get_html_content(page: str, base: str):
 def _get_soup(page: str):
     soup = BeautifulSoup(page, "html.parser")
     # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
-    for s in soup(["style", "script", "[document]", "head", "title"]):
+    for s in soup(["style", "script", "[document]", "head", "title", "footer"]):
         s.extract()
 
     return soup
+
+
+def simplify_html(html: str, url: str, keep_links: bool = False):
+    slim_soup = WebPage(inner_text="", html=html, url=url).get_slim_soup(keep_links)  # slimmed tree
+    return htmlmin.minify(slim_soup.decode(), remove_comments=True, remove_empty_space=True)
diff --git a/requirements.txt b/requirements.txt
index b40c69c9f..83a904156 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -71,4 +71,6 @@ dashscope==1.14.1
 rank-bm25==0.2.2  # for tool recommendation
 gymnasium==0.29.1
 pylint~=3.0.3
-pygithub~=2.3
\ No newline at end of file
+pygithub~=2.3
+htmlmin
+fsspec