From 43340b79eb55c46b99e2c0422bdd8c208cc10434 Mon Sep 17 00:00:00 2001 From: seehi <6580@pm.me> Date: Fri, 9 Aug 2024 18:22:50 +0800 Subject: [PATCH 1/3] solve anti-robot in web search --- metagpt/actions/research.py | 8 +++++++- metagpt/actions/search_enhanced_qa.py | 14 +++++++++++++- metagpt/tools/web_browser_engine_playwright.py | 8 +++----- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/metagpt/actions/research.py b/metagpt/actions/research.py index 7b7a2e911..0522fbd19 100644 --- a/metagpt/actions/research.py +++ b/metagpt/actions/research.py @@ -180,7 +180,13 @@ class CollectLinks(Action): results = self.rank_func(results) return [i["link"] for i in results[:num_results]] - async def _search_urls(self, query: str, max_results: int) -> list[str]: + async def _search_urls(self, query: str, max_results: int) -> list[dict[str, str]]: + """Use search_engine to get urls + + Returns: + e.g. [{"title": "...", "link": "...", "snippet", "..."}] + """ + return await self.search_engine.run(query, max_results=max_results, as_string=False) diff --git a/metagpt/actions/search_enhanced_qa.py b/metagpt/actions/search_enhanced_qa.py index ebf945fd6..c2538dbfb 100644 --- a/metagpt/actions/search_enhanced_qa.py +++ b/metagpt/actions/search_enhanced_qa.py @@ -74,6 +74,14 @@ class SearchEnhancedQA(Action): java_script_enabled: bool = Field( default=False, description="Whether or not to enable JavaScript in the web browser context. Defaults to False." ) + user_agent: str = Field( + default="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81", + description="Specific user agent to use in browser", + ) + extra_http_headers: dict = Field( + default={"sec-ch-ua": 'Chromium";v="125", "Not.A/Brand";v="24'}, + description="An object containing additional HTTP headers to be sent with every request.", + ) max_chars_per_webpage_summary: int = Field( default=4000, description="Maximum summary length for each web page content." ) @@ -86,7 +94,11 @@ class SearchEnhancedQA(Action): def initialize(self): if self.web_browse_and_summarize_action is None: self.web_browser_engine = WebBrowserEngine.from_browser_config( - self.config.browser, proxy=self.config.proxy, java_script_enabled=self.java_script_enabled + self.config.browser, + proxy=self.config.proxy, + java_script_enabled=self.java_script_enabled, + extra_http_headers=self.extra_http_headers, + user_agent=self.user_agent, ) self.web_browse_and_summarize_action = WebBrowseAndSummarize(web_browser_engine=self.web_browser_engine) diff --git a/metagpt/tools/web_browser_engine_playwright.py b/metagpt/tools/web_browser_engine_playwright.py index 33f6ae3a5..f38a3b296 100644 --- a/metagpt/tools/web_browser_engine_playwright.py +++ b/metagpt/tools/web_browser_engine_playwright.py @@ -39,11 +39,9 @@ class PlaywrightWrapper(BaseModel): if not any(str.startswith(i, "--proxy-server=") for i in args): launch_kwargs["proxy"] = {"server": self.proxy} - if "ignore_https_errors" in kwargs: - self.context_kwargs["ignore_https_errors"] = kwargs["ignore_https_errors"] - - if "java_script_enabled" in kwargs: - self.context_kwargs["java_script_enabled"] = kwargs["java_script_enabled"] + for key in ["ignore_https_errors", "java_script_enabled", "extra_http_headers", "user_agent"]: + if key in kwargs: + self.context_kwargs[key] = kwargs[key] async def run(self, url: str, *urls: str, per_page_timeout: float = None) -> WebPage | list[WebPage]: async with async_playwright() as ap: From 75b015bdf0e684282f2cee5b3bbd55be7555ac53 Mon Sep 17 00:00:00 2001 From: seehi <6580@pm.me> Date: Fri, 9 Aug 2024 18:27:31 +0800 Subject: [PATCH 2/3] solve anti-robot in web search --- metagpt/actions/research.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagpt/actions/research.py b/metagpt/actions/research.py index 0522fbd19..5e670520c 100644 --- a/metagpt/actions/research.py +++ b/metagpt/actions/research.py @@ -181,7 +181,7 @@ class CollectLinks(Action): return [i["link"] for i in results[:num_results]] async def _search_urls(self, query: str, max_results: int) -> list[dict[str, str]]: - """Use search_engine to get urls + """Use search_engine to get urls. Returns: e.g. [{"title": "...", "link": "...", "snippet", "..."}] From 8f6f71ba7efc2c8e39602c29ba050d549133818b Mon Sep 17 00:00:00 2001 From: seehi <6580@pm.me> Date: Fri, 9 Aug 2024 18:30:28 +0800 Subject: [PATCH 3/3] solve anti-robot in web search --- metagpt/actions/search_enhanced_qa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagpt/actions/search_enhanced_qa.py b/metagpt/actions/search_enhanced_qa.py index c2538dbfb..1d7944d61 100644 --- a/metagpt/actions/search_enhanced_qa.py +++ b/metagpt/actions/search_enhanced_qa.py @@ -76,7 +76,7 @@ class SearchEnhancedQA(Action): ) user_agent: str = Field( default="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81", - description="Specific user agent to use in browser", + description="Specific user agent to use in browser.", ) extra_http_headers: dict = Field( default={"sec-ch-ua": 'Chromium";v="125", "Not.A/Brand";v="24'},