diff --git a/metagpt/actions/research.py b/metagpt/actions/research.py index 7b7a2e911..0522fbd19 100644 --- a/metagpt/actions/research.py +++ b/metagpt/actions/research.py @@ -180,7 +180,13 @@ class CollectLinks(Action): results = self.rank_func(results) return [i["link"] for i in results[:num_results]] - async def _search_urls(self, query: str, max_results: int) -> list[str]: + async def _search_urls(self, query: str, max_results: int) -> list[dict[str, str]]: + """Use search_engine to get urls + + Returns: + e.g. [{"title": "...", "link": "...", "snippet": "..."}] + """ + return await self.search_engine.run(query, max_results=max_results, as_string=False) diff --git a/metagpt/actions/search_enhanced_qa.py b/metagpt/actions/search_enhanced_qa.py index ebf945fd6..c2538dbfb 100644 --- a/metagpt/actions/search_enhanced_qa.py +++ b/metagpt/actions/search_enhanced_qa.py @@ -74,6 +74,14 @@ class SearchEnhancedQA(Action): java_script_enabled: bool = Field( default=False, description="Whether or not to enable JavaScript in the web browser context. Defaults to False." ) + user_agent: str = Field( + default="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81", + description="Specific user agent to use in browser", + ) + extra_http_headers: dict = Field( + default={"sec-ch-ua": 'Chromium";v="125", "Not.A/Brand";v="24'}, + description="An object containing additional HTTP headers to be sent with every request.", + ) max_chars_per_webpage_summary: int = Field( default=4000, description="Maximum summary length for each web page content." 
) @@ -86,7 +94,11 @@ class SearchEnhancedQA(Action): def initialize(self): if self.web_browse_and_summarize_action is None: self.web_browser_engine = WebBrowserEngine.from_browser_config( - self.config.browser, proxy=self.config.proxy, java_script_enabled=self.java_script_enabled + self.config.browser, + proxy=self.config.proxy, + java_script_enabled=self.java_script_enabled, + extra_http_headers=self.extra_http_headers, + user_agent=self.user_agent, ) self.web_browse_and_summarize_action = WebBrowseAndSummarize(web_browser_engine=self.web_browser_engine) diff --git a/metagpt/tools/web_browser_engine_playwright.py b/metagpt/tools/web_browser_engine_playwright.py index 33f6ae3a5..f38a3b296 100644 --- a/metagpt/tools/web_browser_engine_playwright.py +++ b/metagpt/tools/web_browser_engine_playwright.py @@ -39,11 +39,9 @@ class PlaywrightWrapper(BaseModel): if not any(str.startswith(i, "--proxy-server=") for i in args): launch_kwargs["proxy"] = {"server": self.proxy} - if "ignore_https_errors" in kwargs: - self.context_kwargs["ignore_https_errors"] = kwargs["ignore_https_errors"] - - if "java_script_enabled" in kwargs: - self.context_kwargs["java_script_enabled"] = kwargs["java_script_enabled"] + for key in ["ignore_https_errors", "java_script_enabled", "extra_http_headers", "user_agent"]: + if key in kwargs: + self.context_kwargs[key] = kwargs[key] async def run(self, url: str, *urls: str, per_page_timeout: float = None) -> WebPage | list[WebPage]: async with async_playwright() as ap: