Optimize: Update research prompt formats, link filtering in get_links, and the max-token fallback.

2026-07-05 16:02:14 +02:00 · 2023-08-09 20:28:31 +08:00 · 2023-08-09 20:28:31 +08:00 · e43ae21d98
commit e43ae21d98
parent 2d8a3f1296
9 changed files with 43 additions and 49 deletions
--- a/examples/research.py
+++ b/examples/research.py
@ -6,7 +6,7 @@ from metagpt.roles.researcher import RESEARCH_PATH, Researcher


 async def main():
-    topic = "dataiku .vs datarobot"
+    topic = "dataiku vs. datarobot"
    role = Researcher(language="en-us")
    await role.run(topic)
    print(f"save report to {RESEARCH_PATH / f'{topic}.md'}.")
--- a/metagpt/actions/research.py
+++ b/metagpt/actions/research.py
@ -18,43 +18,34 @@ from metagpt.utils.text import generate_prompt_chunk, reduce_message_length
 LANG_PROMPT = "Please respond in {language}."

 RESEARCH_BASE_SYSTEM = """You are an AI critical thinker research assistant. Your sole purpose is to write well \
-written, critically acclaimed, objective and structured reports on given text."""
+written, critically acclaimed, objective and structured reports on the given text."""

-RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is \"{topic}\"."
+RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is:\n#TOPIC#\n{topic}"

-SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic that require Google search. \
+SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic for Google search. \
 Your response must be in JSON format, for example: ["keyword1", "keyword2"]."""

 SUMMARIZE_SEARCH_PROMPT = """### Requirements
-1. The keywords related to your research topic and the search results are shown in the "Reference Information" section.
+1. The keywords related to your research topic and the search results are shown in the "Search Result Information" section.
 2. Provide up to {decomposition_nums} queries related to your research topic base on the search results.
-3. Please respond in JSON format as follows: ["query1", "query2", "query3", ...].
+3. Please respond in the following JSON format: ["query1", "query2", "query3", ...].

-### Reference Information
-{search}
+### Search Result Information
+{search_results}
 """

-DECOMPOSITION_PROMPT = """You are a researcher, and before delving into an topic, you break it down into several \
-sub-questions. These sub-questions can be researched through online searches to gather objective opinions about the given \
-topic.
+COLLECT_AND_RANKURLS_PROMPT = """### Topic
+{topic}
+### Query
+{query}

---
-The topic is: {topic}
+### The online search results
+{results}

---
-Now, please break down the provided topic into {decomposition_nums} search questions. You should respond with an array of \
-strings in JSON format like ["question1", "question2", ...].
-"""
-
-COLLECT_AND_RANKURLS_PROMPT = """### Reference Information
-1. Topic: "{topic}"
-2. Query: "{query}"
-3. The online search results: {results}
-
---
+### Requirements
 Please remove irrelevant search results that are not related to the query or topic. Then, sort the remaining search results \
-based on link credibility. If two results have equal credibility, prioritize them based on relevance. Provide the ranked \
-results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
+based on the link credibility. If two results have equal credibility, prioritize them based on the relevance. Provide the
+ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
 """

 WEB_BROWSE_AND_SUMMARIZE_PROMPT = '''### Requirements
@ -74,7 +65,7 @@ CONDUCT_RESEARCH_PROMPT = '''### Reference Information

 ### Requirements
 Please provide a detailed research report in response to the following topic: "{topic}", using the information provided \
-above. The report must adhere to the following requirements:
+above. The report must meet the following requirements:

 - Focus on directly addressing the chosen topic.
 - Ensure a well-structured and in-depth presentation, incorporating relevant facts and figures where available.
@ -117,9 +108,7 @@ class CollectLinks(Action):
            A dictionary containing the search questions as keys and the collected URLs as values.
        """
        system_text = system_text if system_text else RESEARCH_TOPIC_SYSTEM.format(topic=topic)
-        search_topic_prompt = SEARCH_TOPIC_PROMPT.format(topic=topic)
-        logger.debug(search_topic_prompt)
-        keywords = await self._aask(search_topic_prompt, [system_text])
+        keywords = await self._aask(SEARCH_TOPIC_PROMPT, [system_text])
        try:
            keywords = json.loads(keywords)
            keywords = parse_obj_as(list[str], keywords)
@ -130,8 +119,8 @@ class CollectLinks(Action):

        def gen_msg():
            while True:
-                search = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results))
-                prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search=search)
+                search_results = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results))
+                prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search_results=search_results)
                yield prompt
                remove = max(results, key=len)
                remove.pop()
@ -144,7 +133,7 @@ class CollectLinks(Action):
            queries = json.loads(queries)
            queries = parse_obj_as(list[str], queries)
        except Exception as e:
-            logger.exception(f"fail to break down the research question for {e}")
+            logger.exception(f"fail to break down the research question due to {e}")
            queries = keywords
        ret = {}
        for query in queries:
--- a/metagpt/provider/openai_api.py
+++ b/metagpt/provider/openai_api.py
@ -270,4 +270,4 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter):
    def get_max_tokens(self, messages: list[dict]):
        if not self.auto_max_tokens:
            return CONFIG.max_tokens_rsp
-        return get_max_completion_tokens(messages, self.model)
+        return get_max_completion_tokens(messages, self.model, CONFIG.max_tokens_rsp)
--- a/metagpt/roles/researcher.py
+++ b/metagpt/roles/researcher.py
@ -22,7 +22,7 @@ class Report(BaseModel):
 class Researcher(Role):
    def __init__(
        self,
-        name: str = "Bob",
+        name: str = "David",
        profile: str = "Researcher",
        goal: str = "Gather information and conduct research",
        constraints: str = "Ensure accuracy and relevance of information",
@ -88,4 +88,4 @@ class Researcher(Role):

 if __name__ == "__main__":
    role = Researcher(language="en-us")
-    asyncio.run(role.run("dataiku .vs datarobot"))
+    asyncio.run(role.run("dataiku vs. datarobot"))
--- a/metagpt/utils/parse_html.py
+++ b/metagpt/utils/parse_html.py
@ -2,7 +2,7 @@
 from __future__ import annotations

 from typing import Generator, Optional
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse

 from bs4 import BeautifulSoup
 from pydantic import BaseModel
@ -35,11 +35,11 @@ class WebPage(BaseModel):
    def get_links(self) -> Generator[str, None, None]:
        for i in self.soup.find_all("a", href=True):
            url = i["href"]
-            if url.startswith("data:"):
-                continue
-            if not url.startswith(("http://", "https://")):
-                url = urljoin(self.url, url)
-            yield url
+            result = urlparse(url)
+            if not result.scheme and result.path:
+                yield urljoin(self.url, url)
+            elif url.startswith(("http://", "https://")):
+                yield urljoin(self.url, url)


 def get_html_content(page: str, base: str):
--- a/metagpt/utils/text.py
+++ b/metagpt/utils/text.py
@ -4,19 +4,19 @@ from metagpt.utils.token_counter import TOKEN_MAX, count_string_tokens


 def reduce_message_length(msgs: Generator[str, None, None], model_name: str, system_text: str, reserved: int = 0,) -> str:
-    """Reduce the length of messages to fit within the maximum token size.
+    """Reduce the length of concatenated message segments to fit within the maximum token size.

    Args:
-        msgs: A generator of strings representing the messages.
+        msgs: A generator of strings representing progressively shorter valid prompts.
        model_name: The name of the encoding to use. (e.g., "gpt-3.5-turbo")
        system_text: The system prompts.
        reserved: The number of reserved tokens.

    Returns:
-        The reduced message.
+        The concatenated message segments reduced to fit within the maximum token size.

    Raises:
-        RuntimeError: If it fails to reduce the message length.
+        RuntimeError: If it fails to reduce the concatenated message length.
    """
    max_token = TOKEN_MAX.get(model_name, 2048) - count_string_tokens(system_text, model_name) - reserved
    for msg in msgs:
--- a/metagpt/utils/token_counter.py
+++ b/metagpt/utils/token_counter.py
@ -96,7 +96,7 @@ def count_string_tokens(string: str, model_name: str) -> int:
    return len(encoding.encode(string))


-def get_max_completion_tokens(messages: list[dict], model: str): 
+def get_max_completion_tokens(messages: list[dict], model: str, default: int) -> int: 
    """Calculate the maximum number of completion tokens for a given model and list of messages.

    Args:
@ -106,4 +106,6 @@ def get_max_completion_tokens(messages: list[dict], model: str):
    Returns:
        The maximum number of completion tokens.
    """
-    return TOKEN_MAX.get(model, 4096) - count_message_tokens(messages)
+    if model not in TOKEN_MAX:
+        return default
+    return TOKEN_MAX[model] - count_message_tokens(messages)
--- a/tests/metagpt/roles/test_researcher.py
+++ b/tests/metagpt/roles/test_researcher.py
@ -25,7 +25,7 @@ async def mock_llm_ask(self, prompt: str, system_msgs):
@pytest.mark.asyncio
 async def test_researcher(mocker):
    with TemporaryDirectory() as dirname:
-        topic = "dataiku .vs datarobot"
+        topic = "dataiku vs. datarobot"
        mocker.patch("metagpt.provider.base_gpt_api.BaseGPTAPI.aask", mock_llm_ask)
        researcher.RESEARCH_PATH = Path(dirname)
        await researcher.Researcher().run(topic)
--- a/tests/metagpt/utils/test_parse_html.py
+++ b/tests/metagpt/utils/test_parse_html.py
@ -44,6 +44,9 @@ PAGE = """
    <div class="box">
        <p>This is a div with a class "box".</p>
        <p><a href="https://metagpt.com">a link</a></p>
+        <p><a href="#section2"></a></p>
+        <p><a href="ftp://192.168.1.1:8080"></a></p>
+        <p><a href="javascript:alert('Hello');"></a></p>
    </div>
 </body>
 </html>