Optimize: Update certain prompt formats.

This commit is contained in:
shenchucheng 2023-08-09 20:28:31 +08:00
parent 2d8a3f1296
commit e43ae21d98
9 changed files with 43 additions and 49 deletions

View file

@ -6,7 +6,7 @@ from metagpt.roles.researcher import RESEARCH_PATH, Researcher
async def main():
topic = "dataiku .vs datarobot"
topic = "dataiku vs. datarobot"
role = Researcher(language="en-us")
await role.run(topic)
print(f"save report to {RESEARCH_PATH / f'{topic}.md'}.")

View file

@ -18,43 +18,34 @@ from metagpt.utils.text import generate_prompt_chunk, reduce_message_length
LANG_PROMPT = "Please respond in {language}."
RESEARCH_BASE_SYSTEM = """You are an AI critical thinker research assistant. Your sole purpose is to write well \
written, critically acclaimed, objective and structured reports on given text."""
written, critically acclaimed, objective and structured reports on the given text."""
RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is \"{topic}\"."
RESEARCH_TOPIC_SYSTEM = "You are an AI researcher assistant, and your research topic is:\n#TOPIC#\n{topic}"
SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic that require Google search. \
SEARCH_TOPIC_PROMPT = """Please provide up to 2 necessary keywords related to your research topic for Google search. \
Your response must be in JSON format, for example: ["keyword1", "keyword2"]."""
SUMMARIZE_SEARCH_PROMPT = """### Requirements
1. The keywords related to your research topic and the search results are shown in the "Reference Information" section.
1. The keywords related to your research topic and the search results are shown in the "Search Result Information" section.
2. Provide up to {decomposition_nums} queries related to your research topic base on the search results.
3. Please respond in JSON format as follows: ["query1", "query2", "query3", ...].
3. Please respond in the following JSON format: ["query1", "query2", "query3", ...].
### Reference Information
{search}
### Search Result Information
{search_results}
"""
DECOMPOSITION_PROMPT = """You are a researcher, and before delving into an topic, you break it down into several \
sub-questions. These sub-questions can be researched through online searches to gather objective opinions about the given \
topic.
COLLECT_AND_RANKURLS_PROMPT = """### Topic
{topic}
### Query
{query}
---
The topic is: {topic}
### The online search results
{results}
---
Now, please break down the provided topic into {decomposition_nums} search questions. You should respond with an array of \
strings in JSON format like ["question1", "question2", ...].
"""
COLLECT_AND_RANKURLS_PROMPT = """### Reference Information
1. Topic: "{topic}"
2. Query: "{query}"
3. The online search results: {results}
---
### Requirements
Please remove irrelevant search results that are not related to the query or topic. Then, sort the remaining search results \
based on link credibility. If two results have equal credibility, prioritize them based on relevance. Provide the ranked \
results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
based on the link credibility. If two results have equal credibility, prioritize them based on the relevance. Provide the
ranked results' indices in JSON format, like [0, 1, 3, 4, ...], without including other words.
"""
WEB_BROWSE_AND_SUMMARIZE_PROMPT = '''### Requirements
@ -74,7 +65,7 @@ CONDUCT_RESEARCH_PROMPT = '''### Reference Information
### Requirements
Please provide a detailed research report in response to the following topic: "{topic}", using the information provided \
above. The report must adhere to the following requirements:
above. The report must meet the following requirements:
- Focus on directly addressing the chosen topic.
- Ensure a well-structured and in-depth presentation, incorporating relevant facts and figures where available.
@ -117,9 +108,7 @@ class CollectLinks(Action):
A dictionary containing the search questions as keys and the collected URLs as values.
"""
system_text = system_text if system_text else RESEARCH_TOPIC_SYSTEM.format(topic=topic)
search_topic_prompt = SEARCH_TOPIC_PROMPT.format(topic=topic)
logger.debug(search_topic_prompt)
keywords = await self._aask(search_topic_prompt, [system_text])
keywords = await self._aask(SEARCH_TOPIC_PROMPT, [system_text])
try:
keywords = json.loads(keywords)
keywords = parse_obj_as(list[str], keywords)
@ -130,8 +119,8 @@ class CollectLinks(Action):
def gen_msg():
while True:
search = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results))
prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search=search)
search_results = "\n".join(f"#### Keyword: {i}\n Search Result: {j}\n" for (i, j) in zip(keywords, results))
prompt = SUMMARIZE_SEARCH_PROMPT.format(decomposition_nums=decomposition_nums, search_results=search_results)
yield prompt
remove = max(results, key=len)
remove.pop()
@ -144,7 +133,7 @@ class CollectLinks(Action):
queries = json.loads(queries)
queries = parse_obj_as(list[str], queries)
except Exception as e:
logger.exception(f"fail to break down the research question for {e}")
logger.exception(f"fail to break down the research question due to {e}")
queries = keywords
ret = {}
for query in queries:

View file

@ -270,4 +270,4 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter):
def get_max_tokens(self, messages: list[dict]):
if not self.auto_max_tokens:
return CONFIG.max_tokens_rsp
return get_max_completion_tokens(messages, self.model)
return get_max_completion_tokens(messages, self.model, CONFIG.max_tokens_rsp)

View file

@ -22,7 +22,7 @@ class Report(BaseModel):
class Researcher(Role):
def __init__(
self,
name: str = "Bob",
name: str = "David",
profile: str = "Researcher",
goal: str = "Gather information and conduct research",
constraints: str = "Ensure accuracy and relevance of information",
@ -88,4 +88,4 @@ class Researcher(Role):
if __name__ == "__main__":
role = Researcher(language="en-us")
asyncio.run(role.run("dataiku .vs datarobot"))
asyncio.run(role.run("dataiku vs. datarobot"))

View file

@ -2,7 +2,7 @@
from __future__ import annotations
from typing import Generator, Optional
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from pydantic import BaseModel
@ -35,11 +35,11 @@ class WebPage(BaseModel):
def get_links(self) -> Generator[str, None, None]:
for i in self.soup.find_all("a", href=True):
url = i["href"]
if url.startswith("data:"):
continue
if not url.startswith(("http://", "https://")):
url = urljoin(self.url, url)
yield url
result = urlparse(url)
if not result.scheme and result.path:
yield urljoin(self.url, url)
elif url.startswith(("http://", "https://")):
yield urljoin(self.url, url)
def get_html_content(page: str, base: str):

View file

@ -4,19 +4,19 @@ from metagpt.utils.token_counter import TOKEN_MAX, count_string_tokens
def reduce_message_length(msgs: Generator[str, None, None], model_name: str, system_text: str, reserved: int = 0,) -> str:
"""Reduce the length of messages to fit within the maximum token size.
"""Reduce the length of concatenated message segments to fit within the maximum token size.
Args:
msgs: A generator of strings representing the messages.
msgs: A generator of strings representing progressively shorter valid prompts.
model_name: The name of the encoding to use. (e.g., "gpt-3.5-turbo")
system_text: The system prompts.
reserved: The number of reserved tokens.
Returns:
The reduced message.
The concatenated message segments reduced to fit within the maximum token size.
Raises:
RuntimeError: If it fails to reduce the message length.
RuntimeError: If it fails to reduce the concatenated message length.
"""
max_token = TOKEN_MAX.get(model_name, 2048) - count_string_tokens(system_text, model_name) - reserved
for msg in msgs:

View file

@ -96,7 +96,7 @@ def count_string_tokens(string: str, model_name: str) -> int:
return len(encoding.encode(string))
def get_max_completion_tokens(messages: list[dict], model: str):
def get_max_completion_tokens(messages: list[dict], model: str, default: int) -> int:
"""Calculate the maximum number of completion tokens for a given model and list of messages.
Args:
@ -106,4 +106,6 @@ def get_max_completion_tokens(messages: list[dict], model: str):
Returns:
The maximum number of completion tokens.
"""
return TOKEN_MAX.get(model, 4096) - count_message_tokens(messages)
if model not in TOKEN_MAX:
return default
return TOKEN_MAX[model] - count_message_tokens(messages)

View file

@ -25,7 +25,7 @@ async def mock_llm_ask(self, prompt: str, system_msgs):
@pytest.mark.asyncio
async def test_researcher(mocker):
with TemporaryDirectory() as dirname:
topic = "dataiku .vs datarobot"
topic = "dataiku vs. datarobot"
mocker.patch("metagpt.provider.base_gpt_api.BaseGPTAPI.aask", mock_llm_ask)
researcher.RESEARCH_PATH = Path(dirname)
await researcher.Researcher().run(topic)

View file

@ -44,6 +44,9 @@ PAGE = """
<div class="box">
<p>This is a div with a class "box".</p>
<p><a href="https://metagpt.com">a link</a></p>
<p><a href="#section2"></a></p>
<p><a href="ftp://192.168.1.1:8080"></a></p>
<p><a href="javascript:alert('Hello');"></a></p>
</div>
</body>
</html>