From e43ae21d985a65776d7c01a18a8e34eac7d85271 Mon Sep 17 00:00:00 2001
From: shenchucheng <chuchengshen@fuzhi.ai>
Date: Wed, 9 Aug 2023 20:28:31 +0800
Subject: [PATCH] Optimize: Update research prompt formats, link filtering, and max-token fallback.

---
 examples/research.py                   |  2 +-
 metagpt/actions/research.py            | 53 ++++++++++----------------
 metagpt/provider/openai_api.py         |  2 +-
 metagpt/roles/researcher.py            |  4 +-
 metagpt/utils/parse_html.py            | 12 +++---
 metagpt/utils/text.py                  |  8 ++--
 metagpt/utils/token_counter.py         |  6 ++-
 tests/metagpt/roles/test_researcher.py |  2 +-
 tests/metagpt/utils/test_parse_html.py |  3 ++
 9 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/examples/research.py b/examples/research.py
index 8625eaa1b..344f8d0e9 100644
--- a/examples/research.py
+++ b/examples/research.py
@@ -6,7 +6,7 @@ from metagpt.roles.researcher import RESEARCH_PATH, Researcher
 
 
 async def main():
-    topic = "dataiku .vs datarobot"
+    topic = "dataiku vs. datarobot"
     role = Researcher(language="en-us")
     await role.run(topic)
     print(f"save report to {RESEARCH_PATH / f'{topic}.md'}.")
diff --git a/metagpt/actions/research.py b/metagpt/actions/research.py
index b350b0929..81eb876dd 100644
--- a/metagpt/actions/research.py
+++ b/metagpt/actions/research.py
@@ -18,43 +18,34 @@ from metagpt.utils.text import generate_prompt_chunk, reduce_message_length
 LANG_PROMPT = "Please respond in {language}."
 
 RESEARCH_BASE_SYSTEM = """You are an AI critical thinker research assistant. Your sole purpose is to write well \
-written, critically acclaimed, objective and structured reports on given text."""
+written, critically acclaimed, objective and structured reports on the given text."""
 
-RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is \"{topic}\"."
+RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is:\n#TOPIC#\n{topic}"
 
-SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic that require Google search. \
+SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic for Google search. \
 Your response must be in JSON format, for example: ["keyword1", "keyword2"]."""
 
 SUMMARIZE_SEARCH_PROMPT = """### Requirements
-1. The keywords related to your research topic and the search results are shown in the "Reference Information" section.
+1. The keywords related to your research topic and the search results are shown in the "Search Result Information" section.
 2. Provide up to {decomposition_nums} queries related to your research topic base on the search results.
-3. Please respond in JSON format as follows: ["query1", "query2", "query3", ...].
+3. Please respond in the following JSON format: ["query1", "query2", "query3", ...].
 
-### Reference Information
-{search}
+### Search Result Information
+{search_results}
 """
 
-DECOMPOSITION_PROMPT = """You are a researcher, and before delving into an topic, you break it down into several \
-sub-questions. These sub-questions can be researched through online searches to gather objective opinions about the given \
-topic.
+COLLECT_AND_RANKURLS_PROMPT = """### Topic
+{topic}
+### Query
+{query}
 
----
-The topic is: {topic}
+### The online search results
+{results}
 
----
-Now, please break down the provided topic into {decomposition_nums} search questions. You should respond with an array of \
-strings in JSON format like ["question1", "question2", ...].
-"""
-
-COLLECT_AND_RANKURLS_PROMPT = """### Reference Information
-1. Topic: "{topic}"
-2. Query: "{query}"
-3. The online search results: {results}
-
----
+### Requirements
 Please remove irrelevant search results that are not related to the query or topic. Then, sort the remaining search results \
-based on link credibility. If two results have equal credibility, prioritize them based on relevance. Provide the ranked \
-results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
+based on the link credibility. If two results have equal credibility, prioritize them based on the relevance. Provide the \
+ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
 """
 
 WEB_BROWSE_AND_SUMMARIZE_PROMPT = '''### Requirements
@@ -74,7 +65,7 @@ CONDUCT_RESEARCH_PROMPT = '''### Reference Information
 
 ### Requirements
 Please provide a detailed research report in response to the following topic: "{topic}", using the information provided \
-above. The report must adhere to the following requirements:
+above. The report must meet the following requirements:
 
 - Focus on directly addressing the chosen topic.
 - Ensure a well-structured and in-depth presentation, incorporating relevant facts and figures where available.
@@ -117,9 +108,7 @@ class CollectLinks(Action):
             A dictionary containing the search questions as keys and the collected URLs as values.
         """
         system_text = system_text if system_text else RESEARCH_TOPIC_SYSTEM.format(topic=topic)
-        search_topic_prompt = SEARCH_TOPIC_PROMPT.format(topic=topic)
-        logger.debug(search_topic_prompt)
-        keywords = await self._aask(search_topic_prompt, [system_text])
+        keywords = await self._aask(SEARCH_TOPIC_PROMPT, [system_text])
         try:
             keywords = json.loads(keywords)
             keywords = parse_obj_as(list[str], keywords)
@@ -130,8 +119,8 @@ class CollectLinks(Action):
 
         def gen_msg():
             while True:
-                search = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results))
-                prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search=search)
+                search_results = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results))
+                prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search_results=search_results)
                 yield prompt
                 remove = max(results, key=len)
                 remove.pop()
@@ -144,7 +133,7 @@ class CollectLinks(Action):
             queries = json.loads(queries)
             queries = parse_obj_as(list[str], queries)
         except Exception as e:
-            logger.exception(f"fail to break down the research question for {e}")
+            logger.exception(f"fail to break down the research question due to {e}")
             queries = keywords
         ret = {}
         for query in queries:
diff --git a/metagpt/provider/openai_api.py b/metagpt/provider/openai_api.py
index b87c142e6..e10c78c8f 100644
--- a/metagpt/provider/openai_api.py
+++ b/metagpt/provider/openai_api.py
@@ -270,4 +270,4 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter):
     def get_max_tokens(self, messages: list[dict]):
         if not self.auto_max_tokens:
             return CONFIG.max_tokens_rsp
-        return get_max_completion_tokens(messages, self.model)
+        return get_max_completion_tokens(messages, self.model, CONFIG.max_tokens_rsp)
diff --git a/metagpt/roles/researcher.py b/metagpt/roles/researcher.py
index 6ad3c2215..9e32820ed 100644
--- a/metagpt/roles/researcher.py
+++ b/metagpt/roles/researcher.py
@@ -22,7 +22,7 @@ class Report(BaseModel):
 class Researcher(Role):
     def __init__(
         self,
-        name: str = "Bob",
+        name: str = "David",
         profile: str = "Researcher",
         goal: str = "Gather information and conduct research",
         constraints: str = "Ensure accuracy and relevance of information",
@@ -88,4 +88,4 @@ class Researcher(Role):
 
 if __name__ == "__main__":
     role = Researcher(language="en-us")
-    asyncio.run(role.run("dataiku .vs datarobot"))
+    asyncio.run(role.run("dataiku vs. datarobot"))
diff --git a/metagpt/utils/parse_html.py b/metagpt/utils/parse_html.py
index 4631005cf..62de26541 100644
--- a/metagpt/utils/parse_html.py
+++ b/metagpt/utils/parse_html.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 from typing import Generator, Optional
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 
 from bs4 import BeautifulSoup
 from pydantic import BaseModel
@@ -35,11 +35,11 @@ class WebPage(BaseModel):
     def get_links(self) -> Generator[str, None, None]:
         for i in self.soup.find_all("a", href=True):
             url = i["href"]
-            if url.startswith("data:"):
-                continue
-            if not url.startswith(("http://", "https://")):
-                url = urljoin(self.url, url)
-            yield url
+            result = urlparse(url)
+            if not result.scheme and result.path:
+                yield urljoin(self.url, url)
+            elif url.startswith(("http://", "https://")):
+                yield urljoin(self.url, url)
 
 
 def get_html_content(page: str, base: str):
diff --git a/metagpt/utils/text.py b/metagpt/utils/text.py
index 6bae1d9dd..6acb31b67 100644
--- a/metagpt/utils/text.py
+++ b/metagpt/utils/text.py
@@ -4,19 +4,19 @@ from metagpt.utils.token_counter import TOKEN_MAX, count_string_tokens
 
 
 def reduce_message_length(msgs: Generator[str, None, None], model_name: str, system_text: str, reserved: int = 0,) -> str:
-    """Reduce the length of messages to fit within the maximum token size.
+    """Reduce the length of concatenated message segments to fit within the maximum token size.
 
     Args:
-        msgs: A generator of strings representing the messages.
+        msgs: A generator of strings representing progressively shorter valid prompts.
         model_name: The name of the encoding to use. (e.g., "gpt-3.5-turbo")
         system_text: The system prompts.
         reserved: The number of reserved tokens.
 
     Returns:
-        The reduced message.
+        The concatenated message segments reduced to fit within the maximum token size.
 
     Raises:
-        RuntimeError: If it fails to reduce the message length.
+        RuntimeError: If it fails to reduce the concatenated message length.
     """
     max_token = TOKEN_MAX.get(model_name, 2048) - count_string_tokens(system_text, model_name) - reserved
     for msg in msgs:
diff --git a/metagpt/utils/token_counter.py b/metagpt/utils/token_counter.py
index 364eb8aca..591bb60f0 100644
--- a/metagpt/utils/token_counter.py
+++ b/metagpt/utils/token_counter.py
@@ -96,7 +96,7 @@ def count_string_tokens(string: str, model_name: str) -> int:
     return len(encoding.encode(string))
 
 
-def get_max_completion_tokens(messages: list[dict], model: str): 
+def get_max_completion_tokens(messages: list[dict], model: str, default: int) -> int: 
     """Calculate the maximum number of completion tokens for a given model and list of messages.
 
     Args:
@@ -106,4 +106,6 @@ def get_max_completion_tokens(messages: list[dict], model: str):
     Returns:
         The maximum number of completion tokens.
     """
-    return TOKEN_MAX.get(model, 4096) - count_message_tokens(messages)
+    if model not in TOKEN_MAX:
+        return default
+    return TOKEN_MAX[model] - count_message_tokens(messages)
diff --git a/tests/metagpt/roles/test_researcher.py b/tests/metagpt/roles/test_researcher.py
index 0efa59be6..01b5dae3b 100644
--- a/tests/metagpt/roles/test_researcher.py
+++ b/tests/metagpt/roles/test_researcher.py
@@ -25,7 +25,7 @@ async def mock_llm_ask(self, prompt: str, system_msgs):
 @pytest.mark.asyncio
 async def test_researcher(mocker):
     with TemporaryDirectory() as dirname:
-        topic = "dataiku .vs datarobot"
+        topic = "dataiku vs. datarobot"
         mocker.patch("metagpt.provider.base_gpt_api.BaseGPTAPI.aask", mock_llm_ask)
         researcher.RESEARCH_PATH = Path(dirname)
         await researcher.Researcher().run(topic)
diff --git a/tests/metagpt/utils/test_parse_html.py b/tests/metagpt/utils/test_parse_html.py
index d7a4d0898..42be416a6 100644
--- a/tests/metagpt/utils/test_parse_html.py
+++ b/tests/metagpt/utils/test_parse_html.py
@@ -44,6 +44,9 @@ PAGE = """
     <div class="box">
         <p>This is a div with a class "box".</p>
         <p><a href="https://metagpt.com">a link</a></p>
+        <p><a href="#section2"></a></p>
+        <p><a href="ftp://192.168.1.1:8080"></a></p>
+        <p><a href="javascript:alert('Hello');"></a></p>
     </div>
 </body>
 </html>