From 6177dfcf8bd0b20c00bf3ddb49b85ce7348611a8 Mon Sep 17 00:00:00 2001
From: seehi <6580@pm.me>
Date: Mon, 5 Aug 2024 10:26:38 +0800
Subject: [PATCH] optimize the summary of web content

---
 metagpt/actions/research.py           |  7 +++--
 metagpt/actions/search_enhanced_qa.py | 37 ++++++++++++++++++++++-----
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/metagpt/actions/research.py b/metagpt/actions/research.py
index 2a32341f0..7b7a2e911 100644
--- a/metagpt/actions/research.py
+++ b/metagpt/actions/research.py
@@ -149,18 +149,21 @@ class CollectLinks(Action):
             ret[query] = await self._search_and_rank_urls(topic, query, url_per_query)
         return ret
 
-    async def _search_and_rank_urls(self, topic: str, query: str, num_results: int = 4) -> list[str]:
+    async def _search_and_rank_urls(
+        self, topic: str, query: str, num_results: int = 4, max_num_results: int = None
+    ) -> list[str]:
         """Search and rank URLs based on a query.
 
         Args:
             topic: The research topic.
             query: The search query.
             num_results: The number of URLs to collect.
+            max_num_results: Number of search results to fetch for ranking; when omitted, max(num_results * 2, 6).
 
         Returns:
             A list of ranked URLs.
         """
-        max_results = max(num_results * 2, 6)
+        max_results = max_num_results or max(num_results * 2, 6)
         results = await self._search_urls(query, max_results=max_results)
         _results = "\n".join(f"{i}: {j}" for i, j in zip(range(max_results), results))
         prompt = COLLECT_AND_RANKURLS_PROMPT.format(topic=topic, query=query, results=_results)
diff --git a/metagpt/actions/search_enhanced_qa.py b/metagpt/actions/search_enhanced_qa.py
index e3340c17b..6d7229580 100644
--- a/metagpt/actions/search_enhanced_qa.py
+++ b/metagpt/actions/search_enhanced_qa.py
@@ -11,6 +11,7 @@ from metagpt.actions.research import CollectLinks, WebBrowseAndSummarize
 from metagpt.logs import logger
 from metagpt.tools.web_browser_engine import WebBrowserEngine
 from metagpt.utils.common import CodeParser
+from metagpt.utils.parse_html import WebPage
 
 REWRITE_QUERY_PROMPT = """
 Role: You are a highly efficient assistant that provide a better search query for web search engine to answer the given question.
@@ -72,6 +73,13 @@ class SearchEnhancedQA(Action):
     java_script_enabled: bool = Field(
         default=False, description="Whether or not to enable JavaScript in the web browser context. Defaults to False."
     )
+    max_chars_per_webpage_summary: int = Field(
+        default=4000, description="Maximum summary length for each web page content."
+    )
+    max_search_results: int = Field(
+        default=10,
+        description="Maximum number of search results (links) to collect using the collect_links_action. This controls the number of potential sources for answering the question.",
+    )
 
     @model_validator(mode="after")
     def initialize(self):
@@ -188,7 +196,7 @@ class SearchEnhancedQA(Action):
 
         logger.info(f"The Relevant links are: {relevant_urls}")
 
-        web_summaries = await self._summarize_web_content(relevant_urls, query)
+        web_summaries = await self._summarize_web_content(relevant_urls)
         if not web_summaries:
             logger.warning(f"No summaries generated for query: {query}")
             return []
@@ -207,21 +215,38 @@ class SearchEnhancedQA(Action):
             list[str]: Ranked list of relevant URLs.
         """
 
-        return await self.collect_links_action._search_and_rank_urls(topic=query, query=query)
+        return await self.collect_links_action._search_and_rank_urls(
+            topic=query, query=query, max_num_results=self.max_search_results
+        )
 
-    async def _summarize_web_content(self, urls: list[str], query: str) -> dict[str, str]:
+    async def _summarize_web_content(self, urls: list[str]) -> dict[str, str]:
         """Fetch and summarize content from given URLs.
 
         Args:
             urls (list[str]): List of URLs to summarize.
-            query (str): The original query for context.
 
         Returns:
             dict[str, str]: Mapping of URLs to their summaries.
         """
+        contents = await self._fetch_web_contents(urls)
 
-        return await self.web_browse_and_summarize_action.run(
-            *urls, query=query, use_concurrent_summarization=True, per_page_timeout=self.per_page_timeout
+        summaries = {}
+        for content in contents:
+            url = content.url
+            inner_text = content.inner_text.replace("\n", "")
+
+            if self.web_browse_and_summarize_action._is_content_invalid(inner_text):
+                logger.warning(f"Invalid content detected for URL {url}: {inner_text[:10]}...")
+                continue
+
+            summary = inner_text[: self.max_chars_per_webpage_summary]
+            summaries[url] = summary
+
+        return summaries
+
+    async def _fetch_web_contents(self, urls: list[str]) -> list[WebPage]:
+        return await self.web_browse_and_summarize_action._fetch_web_contents(
+            *urls, per_page_timeout=self.per_page_timeout
         )
 
     async def _generate_answer(self, query: str, context: str) -> str: