Merge pull request #161 from shenchucheng/main

Feature/researcher
2026-06-29 15:59:42 +02:00 · 2023-08-10 20:21:24 +08:00 · 2023-08-10 20:21:24 +08:00 · 2e1e647628
commit 2e1e647628
parent 9ee2421aec fc88a0edd5
27 changed files with 1282 additions and 246 deletions
--- a/metagpt/actions/init.py
+++ b/metagpt/actions/init.py
@ -22,6 +22,7 @@ from metagpt.actions.write_code_review import WriteCodeReview
 from metagpt.actions.write_prd import WritePRD
 from metagpt.actions.write_prd_review import WritePRDReview
 from metagpt.actions.write_test import WriteTest
+from metagpt.actions.research import CollectLinks, WebBrowseAndSummarize, ConductResearch


 class ActionType(Enum):
@ -40,3 +41,6 @@ class ActionType(Enum):
    WRITE_TASKS = WriteTasks
    ASSIGN_TASKS = AssignTasks
    SEARCH_AND_SUMMARIZE = SearchAndSummarize
+    COLLECT_LINKS = CollectLinks
+    WEB_BROWSE_AND_SUMMARIZE = WebBrowseAndSummarize
+    CONDUCT_RESEARCH = ConductResearch
--- a/metagpt/actions/research.py
+++ b/metagpt/actions/research.py
@ -0,0 +1,277 @@
+#!/usr/bin/env python
+
+from __future__ import annotations
+
+import asyncio
+import json
+from typing import Callable
+
+from pydantic import parse_obj_as
+
+from metagpt.actions import Action
+from metagpt.config import CONFIG
+from metagpt.logs import logger
+from metagpt.tools.search_engine import SearchEngine
+from metagpt.tools.web_browser_engine import WebBrowserEngine, WebBrowserEngineType
+from metagpt.utils.text import generate_prompt_chunk, reduce_message_length
+
+LANG_PROMPT = "Please respond in {language}."
+
+RESEARCH_BASE_SYSTEM = """You are an AI critical thinker research assistant. Your sole purpose is to write well \
+written, critically acclaimed, objective and structured reports on the given text."""
+
+RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is:\n#TOPIC#\n{topic}"
+
+SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic for Google search. \
+Your response must be in JSON format, for example: ["keyword1", "keyword2"]."""
+
+SUMMARIZE_SEARCH_PROMPT = """### Requirements
+1. The keywords related to your research topic and the search results are shown in the "Search Result Information" section.
+2. Provide up to {decomposition_nums} queries related to your research topic base on the search results.
+3. Please respond in the following JSON format: ["query1", "query2", "query3", ...].
+
+### Search Result Information
+{search_results}
+"""
+
+COLLECT_AND_RANKURLS_PROMPT = """### Topic
+{topic}
+### Query
+{query}
+
+### The online search results
+{results}
+
+### Requirements
+Please remove irrelevant search results that are not related to the query or topic. Then, sort the remaining search results \
+based on the link credibility. If two results have equal credibility, prioritize them based on the relevance. Provide the
+ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
+"""
+
+WEB_BROWSE_AND_SUMMARIZE_PROMPT = '''### Requirements
+1. Utilize the text in the "Reference Information" section to respond to the question "{query}".
+2. If the question cannot be directly answered using the text, but the text is related to the research topic, please provide \
+a comprehensive summary of the text.
+3. If the text is entirely unrelated to the research topic, please reply with a simple text "Not relevant."
+4. Include all relevant factual information, numbers, statistics, etc., if available.
+
+### Reference Information
+{content}
+'''
+
+
+CONDUCT_RESEARCH_PROMPT = '''### Reference Information
+{content}
+
+### Requirements
+Please provide a detailed research report in response to the following topic: "{topic}", using the information provided \
+above. The report must meet the following requirements:
+
+- Focus on directly addressing the chosen topic.
+- Ensure a well-structured and in-depth presentation, incorporating relevant facts and figures where available.
+- Present data and findings in an intuitive manner, utilizing feature comparative tables, if applicable.
+- The report should have a minimum word count of 2,000 and be formatted with Markdown syntax following APA style guidelines.
+- Include all source URLs in APA format at the end of the report.
+'''
+
+
+class CollectLinks(Action):
+    """Action class to collect links from a search engine."""
+    def __init__(
+        self,
+        name: str = "",
+        *args,
+        rank_func: Callable[[list[str]], None] | None = None,
+        **kwargs,
+    ):
+        super().__init__(name, *args, **kwargs)
+        self.desc = "Collect links from a search engine."
+        self.search_engine = SearchEngine()
+        self.rank_func = rank_func
+
+    async def run(
+        self,
+        topic: str,
+        decomposition_nums: int = 4,
+        url_per_query: int = 4,
+        system_text: str | None = None,
+    ) -> dict[str, list[str]]:
+        """Run the action to collect links.
+
+        Args:
+            topic: The research topic.
+            decomposition_nums: The number of search questions to generate.
+            url_per_query: The number of URLs to collect per search question.
+            system_text: The system text.
+
+        Returns:
+            A dictionary containing the search questions as keys and the collected URLs as values.
+        """
+        system_text = system_text if system_text else RESEARCH_TOPIC_SYSTEM.format(topic=topic)
+        keywords = await self._aask(SEARCH_TOPIC_PROMPT, [system_text])
+        try:
+            keywords = json.loads(keywords)
+            keywords = parse_obj_as(list[str], keywords)
+        except Exception as e:
+            logger.exception(f"fail to get keywords related to the research topic \"{topic}\" for {e}")
+            keywords = [topic]
+        results = await asyncio.gather(*(self.search_engine.run(i, as_string=False) for i in keywords))
+
+        def gen_msg():
+            while True:
+                search_results = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results))
+                prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search_results=search_results)
+                yield prompt
+                remove = max(results, key=len)
+                remove.pop()
+                if len(remove) == 0:
+                    break
+        prompt = reduce_message_length(gen_msg(), self.llm.model, system_text, CONFIG.max_tokens_rsp)
+        logger.debug(prompt)
+        queries = await self._aask(prompt, [system_text])
+        try:
+            queries = json.loads(queries)
+            queries = parse_obj_as(list[str], queries)
+        except Exception as e:
+            logger.exception(f"fail to break down the research question due to {e}")
+            queries = keywords
+        ret = {}
+        for query in queries:
+            ret[query] = await self._search_and_rank_urls(topic, query, url_per_query)
+        return ret
+
+    async def _search_and_rank_urls(self, topic: str, query: str, num_results: int = 4) -> list[str]:
+        """Search and rank URLs based on a query.
+
+        Args:
+            topic: The research topic.
+            query: The search query.
+            num_results: The number of URLs to collect.
+
+        Returns:
+            A list of ranked URLs.
+        """
+        max_results = max(num_results * 2, 6)
+        results = await self.search_engine.run(query, max_results=max_results, as_string=False)
+        _results = "\n".join(f"{i}: {j}" for i, j in zip(range(max_results), results))
+        prompt = COLLECT_AND_RANKURLS_PROMPT.format(topic=topic, query=query, results=_results)
+        logger.debug(prompt)
+        indices = await self._aask(prompt)
+        try:
+            indices = json.loads(indices)
+            assert all(isinstance(i, int) for i in indices)
+        except Exception as e:
+            logger.exception(f"fail to rank results for {e}")
+            indices = list(range(max_results))
+        results = [results[i] for i in indices]
+        if self.rank_func:
+            results = self.rank_func(results)
+        return [i["link"] for i in results[:num_results]]
+
+
+class WebBrowseAndSummarize(Action):
+    """Action class to explore the web and provide summaries of articles and webpages."""
+    def __init__(
+        self,
+        *args,
+        browse_func: Callable[[list[str]], None] | None = None,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        if CONFIG.model_for_researcher_summary:
+            self.llm.model = CONFIG.model_for_researcher_summary
+        self.web_browser_engine = WebBrowserEngine(
+            engine=WebBrowserEngineType.CUSTOM if browse_func else None,
+            run_func=browse_func,
+        )
+        self.desc = "Explore the web and provide summaries of articles and webpages."
+
+    async def run(
+        self,
+        url: str,
+        *urls: str,
+        query: str,
+        system_text: str = RESEARCH_BASE_SYSTEM,
+    ) -> dict[str, str]:
+        """Run the action to browse the web and provide summaries.
+
+        Args:
+            url: The main URL to browse.
+            urls: Additional URLs to browse.
+            query: The research question.
+            system_text: The system text.
+
+        Returns:
+            A dictionary containing the URLs as keys and their summaries as values.
+        """
+        contents = await self.web_browser_engine.run(url, *urls)
+        if not urls:
+            contents = [contents]
+
+        summaries = {}
+        prompt_template = WEB_BROWSE_AND_SUMMARIZE_PROMPT.format(query=query, content="{}")
+        for u, content in zip([url, *urls], contents):
+            content = content.inner_text
+            chunk_summaries = []
+            for prompt in generate_prompt_chunk(content, prompt_template, self.llm.model, system_text, CONFIG.max_tokens_rsp):
+                logger.debug(prompt)
+                summary = await self._aask(prompt, [system_text])
+                if summary == "Not relevant.":
+                    continue
+                chunk_summaries.append(summary)
+
+            if not chunk_summaries:
+                summaries[u] = None
+                continue
+
+            if len(chunk_summaries) == 1:
+                summaries[u] = chunk_summaries[0]
+                continue
+
+            content = "\n".join(chunk_summaries)
+            prompt = WEB_BROWSE_AND_SUMMARIZE_PROMPT.format(query=query, content=content)
+            summary = await self._aask(prompt, [system_text])
+            summaries[u] = summary
+        return summaries
+
+
+class ConductResearch(Action):
+    """Action class to conduct research and generate a research report."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if CONFIG.model_for_researcher_report:
+            self.llm.model = CONFIG.model_for_researcher_report
+
+    async def run(
+        self,
+        topic: str,
+        content: str,
+        system_text: str = RESEARCH_BASE_SYSTEM,
+    ) -> str:
+        """Run the action to conduct research and generate a research report.
+
+        Args:
+            topic: The research topic.
+            content: The content for research.
+            system_text: The system text.
+
+        Returns:
+            The generated research report.
+        """
+        prompt = CONDUCT_RESEARCH_PROMPT.format(topic=topic, content=content)
+        logger.debug(prompt)
+        self.llm.auto_max_tokens = True
+        return await self._aask(prompt, [system_text])
+
+
+def get_research_system_text(topic: str, language: str):
+    """Get the system text for conducting research.
+
+    Args:
+        topic: The research topic.
+        language: The language for the system text.
+
+    Returns:
+        The system text for conducting research.
+    """
+    return " ".join((RESEARCH_TOPIC_SYSTEM.format(topic=topic), LANG_PROMPT.format(language=language)))