From e43ae21d985a65776d7c01a18a8e34eac7d85271 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Wed, 9 Aug 2023 20:28:31 +0800 Subject: [PATCH] Optimize: Update certain prompt formats. --- examples/research.py | 2 +- metagpt/actions/research.py | 53 ++++++++++---------------- metagpt/provider/openai_api.py | 2 +- metagpt/roles/researcher.py | 4 +- metagpt/utils/parse_html.py | 12 +++--- metagpt/utils/text.py | 8 ++-- metagpt/utils/token_counter.py | 6 ++- tests/metagpt/roles/test_researcher.py | 2 +- tests/metagpt/utils/test_parse_html.py | 3 ++ 9 files changed, 43 insertions(+), 49 deletions(-) diff --git a/examples/research.py b/examples/research.py index 8625eaa1b..344f8d0e9 100644 --- a/examples/research.py +++ b/examples/research.py @@ -6,7 +6,7 @@ from metagpt.roles.researcher import RESEARCH_PATH, Researcher async def main(): - topic = "dataiku .vs datarobot" + topic = "dataiku vs. datarobot" role = Researcher(language="en-us") await role.run(topic) print(f"save report to {RESEARCH_PATH / f'{topic}.md'}.") diff --git a/metagpt/actions/research.py b/metagpt/actions/research.py index b350b0929..81eb876dd 100644 --- a/metagpt/actions/research.py +++ b/metagpt/actions/research.py @@ -18,43 +18,34 @@ from metagpt.utils.text import generate_prompt_chunk, reduce_message_length LANG_PROMPT = "Please respond in {language}." RESEARCH_BASE_SYSTEM = """You are an AI critical thinker research assistant. Your sole purpose is to write well \ -written, critically acclaimed, objective and structured reports on given text.""" +written, critically acclaimed, objective and structured reports on the given text.""" -RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is \"{topic}\"." +RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is:\n#TOPIC#\n{topic}" -SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic that require Google search. \ +SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic for Google search. \ Your response must be in JSON format, for example: ["keyword1", "keyword2"].""" SUMMARIZE_SEARCH_PROMPT = """### Requirements -1. The keywords related to your research topic and the search results are shown in the "Reference Information" section. +1. The keywords related to your research topic and the search results are shown in the "Search Result Information" section. 2. Provide up to {decomposition_nums} queries related to your research topic base on the search results. -3. Please respond in JSON format as follows: ["query1", "query2", "query3", ...]. +3. Please respond in the following JSON format: ["query1", "query2", "query3", ...]. -### Reference Information -{search} +### Search Result Information +{search_results} """ -DECOMPOSITION_PROMPT = """You are a researcher, and before delving into an topic, you break it down into several \ -sub-questions. These sub-questions can be researched through online searches to gather objective opinions about the given \ -topic. +COLLECT_AND_RANKURLS_PROMPT = """### Topic +{topic} +### Query +{query} ---- -The topic is: {topic} +### The online search results +{results} ---- -Now, please break down the provided topic into {decomposition_nums} search questions. You should respond with an array of \ -strings in JSON format like ["question1", "question2", ...]. -""" - -COLLECT_AND_RANKURLS_PROMPT = """### Reference Information -1. Topic: "{topic}" -2. Query: "{query}" -3. The online search results: {results} - ---- +### Requirements Please remove irrelevant search results that are not related to the query or topic. Then, sort the remaining search results \ -based on link credibility. If two results have equal credibility, prioritize them based on relevance. Provide the ranked \ -results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words. +based on the link credibility. If two results have equal credibility, prioritize them based on the relevance. Provide the +ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words. """ WEB_BROWSE_AND_SUMMARIZE_PROMPT = '''### Requirements @@ -74,7 +65,7 @@ CONDUCT_RESEARCH_PROMPT = '''### Reference Information ### Requirements Please provide a detailed research report in response to the following topic: "{topic}", using the information provided \ -above. The report must adhere to the following requirements: +above. The report must meet the following requirements: - Focus on directly addressing the chosen topic. - Ensure a well-structured and in-depth presentation, incorporating relevant facts and figures where available. @@ -117,9 +108,7 @@ class CollectLinks(Action): A dictionary containing the search questions as keys and the collected URLs as values. """ system_text = system_text if system_text else RESEARCH_TOPIC_SYSTEM.format(topic=topic) - search_topic_prompt = SEARCH_TOPIC_PROMPT.format(topic=topic) - logger.debug(search_topic_prompt) - keywords = await self._aask(search_topic_prompt, [system_text]) + keywords = await self._aask(SEARCH_TOPIC_PROMPT, [system_text]) try: keywords = json.loads(keywords) keywords = parse_obj_as(list[str], keywords) @@ -130,8 +119,8 @@ class CollectLinks(Action): def gen_msg(): while True: - search = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results)) - prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search=search) + search_results = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results)) + prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search_results=search_results) yield prompt remove = max(results, key=len) remove.pop() @@ -144,7 +133,7 @@ class CollectLinks(Action): queries = json.loads(queries) queries = parse_obj_as(list[str], queries) except Exception as e: - logger.exception(f"fail to break down the research question for {e}") + logger.exception(f"fail to break down the research question due to {e}") queries = keywords ret = {} for query in queries: diff --git a/metagpt/provider/openai_api.py b/metagpt/provider/openai_api.py index b87c142e6..e10c78c8f 100644 --- a/metagpt/provider/openai_api.py +++ b/metagpt/provider/openai_api.py @@ -270,4 +270,4 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter): def get_max_tokens(self, messages: list[dict]): if not self.auto_max_tokens: return CONFIG.max_tokens_rsp - return get_max_completion_tokens(messages, self.model) + return get_max_completion_tokens(messages, self.model, CONFIG.max_tokens_rsp) diff --git a/metagpt/roles/researcher.py b/metagpt/roles/researcher.py index 6ad3c2215..9e32820ed 100644 --- a/metagpt/roles/researcher.py +++ b/metagpt/roles/researcher.py @@ -22,7 +22,7 @@ class Report(BaseModel): class Researcher(Role): def __init__( self, - name: str = "Bob", + name: str = "David", profile: str = "Researcher", goal: str = "Gather information and conduct research", constraints: str = "Ensure accuracy and relevance of information", @@ -88,4 +88,4 @@ class Researcher(Role): if __name__ == "__main__": role = Researcher(language="en-us") - asyncio.run(role.run("dataiku .vs datarobot")) + asyncio.run(role.run("dataiku vs. datarobot")) diff --git a/metagpt/utils/parse_html.py b/metagpt/utils/parse_html.py index 4631005cf..62de26541 100644 --- a/metagpt/utils/parse_html.py +++ b/metagpt/utils/parse_html.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import Generator, Optional -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from pydantic import BaseModel @@ -35,11 +35,11 @@ class WebPage(BaseModel): def get_links(self) -> Generator[str, None, None]: for i in self.soup.find_all("a", href=True): url = i["href"] - if url.startswith("data:"): - continue - if not url.startswith(("http://", "https://")): - url = urljoin(self.url, url) - yield url + result = urlparse(url) + if not result.scheme and result.path: + yield urljoin(self.url, url) + elif url.startswith(("http://", "https://")): + yield urljoin(self.url, url) def get_html_content(page: str, base: str): diff --git a/metagpt/utils/text.py b/metagpt/utils/text.py index 6bae1d9dd..6acb31b67 100644 --- a/metagpt/utils/text.py +++ b/metagpt/utils/text.py @@ -4,19 +4,19 @@ from metagpt.utils.token_counter import TOKEN_MAX, count_string_tokens def reduce_message_length(msgs: Generator[str, None, None], model_name: str, system_text: str, reserved: int = 0,) -> str: - """Reduce the length of messages to fit within the maximum token size. + """Reduce the length of concatenated message segments to fit within the maximum token size. Args: - msgs: A generator of strings representing the messages. + msgs: A generator of strings representing progressively shorter valid prompts. model_name: The name of the encoding to use. (e.g., "gpt-3.5-turbo") system_text: The system prompts. reserved: The number of reserved tokens. Returns: - The reduced message. + The concatenated message segments reduced to fit within the maximum token size. Raises: - RuntimeError: If it fails to reduce the message length. + RuntimeError: If it fails to reduce the concatenated message length. """ max_token = TOKEN_MAX.get(model_name, 2048) - count_string_tokens(system_text, model_name) - reserved for msg in msgs: diff --git a/metagpt/utils/token_counter.py b/metagpt/utils/token_counter.py index 364eb8aca..591bb60f0 100644 --- a/metagpt/utils/token_counter.py +++ b/metagpt/utils/token_counter.py @@ -96,7 +96,7 @@ def count_string_tokens(string: str, model_name: str) -> int: return len(encoding.encode(string)) -def get_max_completion_tokens(messages: list[dict], model: str): +def get_max_completion_tokens(messages: list[dict], model: str, default: int) -> int: """Calculate the maximum number of completion tokens for a given model and list of messages. Args: @@ -106,4 +106,6 @@ def get_max_completion_tokens(messages: list[dict], model: str): Returns: The maximum number of completion tokens. """ - return TOKEN_MAX.get(model, 4096) - count_message_tokens(messages) + if model not in TOKEN_MAX: + return default + return TOKEN_MAX[model] - count_message_tokens(messages) diff --git a/tests/metagpt/roles/test_researcher.py b/tests/metagpt/roles/test_researcher.py index 0efa59be6..01b5dae3b 100644 --- a/tests/metagpt/roles/test_researcher.py +++ b/tests/metagpt/roles/test_researcher.py @@ -25,7 +25,7 @@ async def mock_llm_ask(self, prompt: str, system_msgs): @pytest.mark.asyncio async def test_researcher(mocker): with TemporaryDirectory() as dirname: - topic = "dataiku .vs datarobot" + topic = "dataiku vs. datarobot" mocker.patch("metagpt.provider.base_gpt_api.BaseGPTAPI.aask", mock_llm_ask) researcher.RESEARCH_PATH = Path(dirname) await researcher.Researcher().run(topic) diff --git a/tests/metagpt/utils/test_parse_html.py b/tests/metagpt/utils/test_parse_html.py index d7a4d0898..42be416a6 100644 --- a/tests/metagpt/utils/test_parse_html.py +++ b/tests/metagpt/utils/test_parse_html.py @@ -44,6 +44,9 @@ PAGE = """

This is a div with a class "box".

a link

+

+

+