optimize the summary of web content

2026-07-17 16:41:05 +02:00 · 2024-08-05 10:26:38 +08:00 · 2024-08-05 10:26:38 +08:00 · 6177dfcf8b
commit 6177dfcf8b
parent 654ed131d7
2 changed files with 36 additions and 8 deletions
--- a/metagpt/actions/research.py
+++ b/metagpt/actions/research.py
@ -149,18 +149,21 @@ class CollectLinks(Action):
            ret[query] = await self._search_and_rank_urls(topic, query, url_per_query)
        return ret

-    async def _search_and_rank_urls(self, topic: str, query: str, num_results: int = 4) -> list[str]:
+    async def _search_and_rank_urls(
+        self, topic: str, query: str, num_results: int = 4, max_num_results: int = None
+    ) -> list[str]:
        """Search and rank URLs based on a query.

        Args:
            topic: The research topic.
            query: The search query.
            num_results: The number of URLs to collect.
+            max_num_results: Max number of search results to request; defaults to max(num_results * 2, 6) if unset.

        Returns:
            A list of ranked URLs.
        """
-        max_results = max(num_results * 2, 6)
+        max_results = max_num_results or max(num_results * 2, 6)
        results = await self._search_urls(query, max_results=max_results)
        _results = "\n".join(f"{i}: {j}" for i, j in zip(range(max_results), results))
        prompt = COLLECT_AND_RANKURLS_PROMPT.format(topic=topic, query=query, results=_results)
--- a/metagpt/actions/search_enhanced_qa.py
+++ b/metagpt/actions/search_enhanced_qa.py
@ -11,6 +11,7 @@ from metagpt.actions.research import CollectLinks, WebBrowseAndSummarize
 from metagpt.logs import logger
 from metagpt.tools.web_browser_engine import WebBrowserEngine
 from metagpt.utils.common import CodeParser
+from metagpt.utils.parse_html import WebPage

 REWRITE_QUERY_PROMPT = """
 Role: You are a highly efficient assistant that provide a better search query for web search engine to answer the given question.
@ -72,6 +73,13 @@ class SearchEnhancedQA(Action):
    java_script_enabled: bool = Field(
        default=False, description="Whether or not to enable JavaScript in the web browser context. Defaults to False."
    )
+    max_chars_per_webpage_summary: int = Field(
+        default=4000, description="Maximum summary length for each web page content."
+    )
+    max_search_results: int = Field(
+        default=10,
+        description="Maximum number of search results (links) to collect using the collect_links_action. This controls the number of potential sources for answering the question.",
+    )

    @model_validator(mode="after")
    def initialize(self):
@ -188,7 +196,7 @@ class SearchEnhancedQA(Action):

        logger.info(f"The Relevant links are: {relevant_urls}")

-        web_summaries = await self._summarize_web_content(relevant_urls, query)
+        web_summaries = await self._summarize_web_content(relevant_urls)
        if not web_summaries:
            logger.warning(f"No summaries generated for query: {query}")
            return []
@ -207,21 +215,38 @@ class SearchEnhancedQA(Action):
            list[str]: Ranked list of relevant URLs.
        """

-        return await self.collect_links_action._search_and_rank_urls(topic=query, query=query)
+        return await self.collect_links_action._search_and_rank_urls(
+            topic=query, query=query, max_num_results=self.max_search_results
+        )

-    async def _summarize_web_content(self, urls: list[str], query: str) -> dict[str, str]:
+    async def _summarize_web_content(self, urls: list[str]) -> dict[str, str]:
        """Fetch and summarize content from given URLs.

        Args:
            urls (list[str]): List of URLs to summarize.
-            query (str): The original query for context.

        Returns:
            dict[str, str]: Mapping of URLs to their summaries.
        """
+        contents = await self._fetch_web_contents(urls)

-        return await self.web_browse_and_summarize_action.run(
-            *urls, query=query, use_concurrent_summarization=True, per_page_timeout=self.per_page_timeout
+        summaries = {}
+        for content in contents:
+            url = content.url
+            inner_text = content.inner_text.replace("\n", "")
+
+            if self.web_browse_and_summarize_action._is_content_invalid(inner_text):
+                logger.warning(f"Invalid content detected for URL {url}: {inner_text[:10]}...")
+                continue
+
+            summary = inner_text[: self.max_chars_per_webpage_summary]
+            summaries[url] = summary
+
+        return summaries
+
+    async def _fetch_web_contents(self, urls: list[str]) -> list[WebPage]:
+        return await self.web_browse_and_summarize_action._fetch_web_contents(
+            *urls, per_page_timeout=self.per_page_timeout
        )

    async def _generate_answer(self, query: str, context: str) -> str: