diff --git a/.well-known/skills.yaml b/.well-known/skills.yaml index 5ccb8094b..7a035910c 100644 --- a/.well-known/skills.yaml +++ b/.well-known/skills.yaml @@ -3,15 +3,41 @@ entities: skills: - name: text_to_speech description: Text-to-speech + id: text_to_speech.text_to_speech requisite: - AZURE_TTS_SUBSCRIPTION_KEY - AZURE_TTS_REGION + arguments: + text: 'The text used for voice conversion. Required.' + lang: 'The value can contain a language code such as en (English), or a locale such as en-US (English - United States). The optional parameter values are "English", "Chinese". Default value: "Chinese".' + voice: 'Default value: "zh-CN-XiaomoNeural".' + style: 'Speaking style to express different emotions like cheerfulness, empathy, and calm. The optional parameter values are "affectionate", "angry", "calm", "cheerful", "depressed", "disgruntled", "embarrassed", "envious", "fearful", "gentle", "sad", "serious". Default value: "affectionate".' + role: 'With roles, the same voice can act as a different age and gender. The optional parameter values are "Girl", "Boy", "OlderAdultFemale", "OlderAdultMale", "SeniorFemale", "SeniorMale", "YoungAdultFemale", "YoungAdultMale". Default value: "Girl".' + examples: + - ask: 'A girl says "hello world"' + answer: 'text_to_speech(text="hello world", role="Girl")' + - ask: 'A boy affectionate says "hello world"' + answer: 'text_to_speech(text="hello world", role="Boy", style="affectionate")' + - ask: 'A boy says "你好"' + answer: 'text_to_speech(text="你好", role="Boy", lang="Chinese")' + returns: + type: string + format: base64 + - name: text_to_image description: Create a drawing based on the text. + id: text_to_image.text_to_image requisite: - OPENAI_API_KEY - METAGPT_TEXT_TO_IMAGE_MODEL - - name: text_to_embedding - description: Convert the text into embeddings. - requisite: - - OPENAI_API_KEY + arguments: + text: 'The text used for image conversion. Required.' + size_type: 'Default value: "512x512".' 
+ examples: + - ask: 'Draw a girl' + answer: 'text_to_image(text="Draw a girl", size_type="512x512")' + - ask: 'Draw an apple' + answer: 'text_to_image(text="Draw an apple", size_type="512x512")' + returns: + type: string + format: base64 diff --git a/metagpt/actions/skill_action.py b/metagpt/actions/skill_action.py index e69de29bb..8cc7b6c42 100644 --- a/metagpt/actions/skill_action.py +++ b/metagpt/actions/skill_action.py @@ -0,0 +1,88 @@ +import ast +import importlib + +from metagpt.actions import Action, ActionOutput +from metagpt.learn.skill_loader import Skill +from metagpt.logs import logger + + +class ArgumentsParingAction(Action): + def __init__(self, options, last_talk: str, skill: Skill, context=None, llm=None, **kwargs): + super(ArgumentsParingAction, self).__init__(options=options, name='', context=context, llm=llm) + self.skill = skill + self.ask = last_talk + self.rsp = None + self.args = None + + @property + def prompt(self): + prompt = f"{self.skill.name} function parameters description:\n" + for k, v in self.skill.arguments.items(): + prompt += f"parameter `{k}`: {v}\n" + prompt += "\n" + prompt += "Examples:\n" + for e in self.skill.examples: + prompt += f"If want you to do `{e.ask}`, return `{e.answer}` brief and clear.\n" + prompt += f"\nNow I want you to do `{self.ask}`, return in examples format above, brief and clear." 
+ return prompt + + async def run(self, *args, **kwargs) -> ActionOutput: + prompt = self.prompt + logger.info(prompt) + rsp = await self.llm.aask(msg=prompt, system_msgs=[]) + logger.info(rsp) + self.args = ArgumentsParingAction.parse_arguments(skill_name=self.skill.name, txt=rsp) + self.rsp = ActionOutput(content=rsp) + return self.rsp + + @staticmethod + def parse_arguments(skill_name, txt) -> dict: + prefix = skill_name + "(" + if prefix not in txt: + logger.error(f"{skill_name} not in {txt}") + return None + if ")" not in txt: + logger.error(f"')' not in {txt}") + return None + begin_ix = txt.find(prefix) + end_ix = txt.rfind(")") + args_txt = txt[begin_ix + len(prefix): end_ix] + logger.info(args_txt) + fake_expression = f"dict({args_txt})" + parsed_expression = ast.parse(fake_expression, mode='eval') + args = {} + for keyword in parsed_expression.body.keywords: + key = keyword.arg + value = ast.literal_eval(keyword.value) + args[key] = value + return args + + +class SkillAction(Action): + def __init__(self, options, skill: Skill, args: dict, context=None, llm=None, **kwargs): + super(SkillAction, self).__init__(options=options, name='', context=context, llm=llm) + self._skill = skill + self._args = args + self.rsp = None + + async def run(self, *args, **kwargs) -> str | ActionOutput | None: + """Run action""" + self.rsp = self.find_and_call_function(self._skill.name, args=self._args, **self.options) + return ActionOutput(content=self.rsp, instruct_content=self._skill.json()) + + @staticmethod + def find_and_call_function(function_name, args, **kwargs): + try: + module = importlib.import_module("metagpt.learn") + function = getattr(module, function_name) + # 调用函数并返回结果 + result = function(**args, **kwargs) + return result + except (ModuleNotFoundError, AttributeError): + logger.error(f"{function_name} not found") + return None + + +if __name__ == '__main__': + ArgumentsParingAction.parse_arguments(skill_name="text_to_image", + txt='`text_to_image(text="Draw an 
apple", size_type="512x512")`') diff --git a/metagpt/actions/talk_action.py b/metagpt/actions/talk_action.py index 4275a1b9e..5485456c5 100644 --- a/metagpt/actions/talk_action.py +++ b/metagpt/actions/talk_action.py @@ -4,7 +4,7 @@ from metagpt.logs import logger class TalkAction(Action): - def __init__(self, options, name: str = '', talk='', history_summary='', context=None, llm=None): + def __init__(self, options, name: str = '', talk='', history_summary='', context=None, llm=None, **kwargs): context = context or {} context["talk"] = talk context["history_summery"] = history_summary diff --git a/metagpt/learn/__init__.py b/metagpt/learn/__init__.py index 28b8739c3..c8270dbfb 100644 --- a/metagpt/learn/__init__.py +++ b/metagpt/learn/__init__.py @@ -5,3 +5,11 @@ @Author : alexanderwu @File : __init__.py """ + +from metagpt.learn.text_to_image import text_to_image +from metagpt.learn.text_to_speech import text_to_speech + +__all__ = [ + "text_to_image", + "text_to_speech", +] \ No newline at end of file diff --git a/metagpt/learn/skill_loader.py b/metagpt/learn/skill_loader.py index eeca12871..46ead728d 100644 --- a/metagpt/learn/skill_loader.py +++ b/metagpt/learn/skill_loader.py @@ -1,14 +1,26 @@ from pathlib import Path -from typing import List, Dict +from typing import List, Dict, Optional import yaml from pydantic import BaseModel +class Example(BaseModel): + ask: str + answer: str + +class Returns(BaseModel): + type: str + format: Optional[str] = None + class Skill(BaseModel): name: str description: str + id: str requisite: List[str] + arguments: Dict + examples: List[Example] + returns: Returns class EntitySkills(BaseModel): @@ -26,13 +38,26 @@ class SkillLoader: skills = yaml.safe_load(file) self._skills = SkillsDeclaration(**skills) - def get_skill_list(self, entity_name: str = "Assistant"): - if not self._skills or entity_name not in self._skills.entities: + def get_skill_list(self, entity_name: str = "Assistant") -> Dict: + entity_skills = 
self.get_entity(entity_name) + if not entity_skills: return {} - entity_skills = self._skills.entities.get(entity_name) description_to_name_mappings = {} for s in entity_skills.skills: description_to_name_mappings[s.description] = s.name return description_to_name_mappings + + def get_skill(self, name, entity_name: str = "Assistant") -> Skill: + entity = self.get_entity(entity_name) + if not entity: + return None + for sk in entity.skills: + if sk.name == name: + return sk + + def get_entity(self, name) -> EntitySkills: + if not self._skills: + return None + return self._skills.entities.get(name) \ No newline at end of file diff --git a/metagpt/learn/text_to_embedding.py b/metagpt/learn/text_to_embedding.py index 38fd7c0cb..6d0cefcdb 100644 --- a/metagpt/learn/text_to_embedding.py +++ b/metagpt/learn/text_to_embedding.py @@ -16,7 +16,7 @@ from metagpt.utils.common import initialize_environment @skill_metadata(name="Text to Embedding", description="Convert the text into embeddings.", requisite="`OPENAI_API_KEY`") -def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key=""): +def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key="", **kwargs): """Text to embedding :param text: The text used for embedding. diff --git a/metagpt/learn/text_to_image.py b/metagpt/learn/text_to_image.py index d123e116a..2f946e239 100644 --- a/metagpt/learn/text_to_image.py +++ b/metagpt/learn/text_to_image.py @@ -17,7 +17,7 @@ from metagpt.utils.common import initialize_environment @skill_metadata(name="Text to image", description="Create a drawing based on the text.", requisite="`OPENAI_API_KEY` or `METAGPT_TEXT_TO_IMAGE_MODEL`") -def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url=""): +def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url="", **kwargs): """Text to image :param text: The text used for image conversion. 
@@ -27,8 +27,14 @@ def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url :return: The image data is returned in Base64 encoding. """ initialize_environment() + image_declaration = "data:image/png;base64," if os.environ.get("METAGPT_TEXT_TO_IMAGE_MODEL") or model_url: - return oas3_metagpt_text_to_image(text, size_type, model_url) + data = oas3_metagpt_text_to_image(text, size_type, model_url) + return image_declaration + data if data else "" if os.environ.get("OPENAI_API_KEY") or openai_api_key: - return oas3_openai_text_to_image(text, size_type, openai_api_key) + data = oas3_openai_text_to_image(text, size_type, openai_api_key) + return image_declaration + data if data else "" + raise EnvironmentError + + diff --git a/metagpt/learn/text_to_speech.py b/metagpt/learn/text_to_speech.py index 5631ef45e..90dd878a1 100644 --- a/metagpt/learn/text_to_speech.py +++ b/metagpt/learn/text_to_speech.py @@ -17,7 +17,7 @@ from metagpt.utils.common import initialize_environment description="Text-to-speech", requisite="`AZURE_TTS_SUBSCRIPTION_KEY` and `AZURE_TTS_REGION`") def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affectionate", role="Girl", - subscription_key="", region=""): + subscription_key="", region="", **kwargs): """Text to speech For more details, check out:`https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` @@ -32,8 +32,10 @@ def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affect """ initialize_environment() + audio_declaration = "data:audio/wav;base64," if (os.environ.get("AZURE_TTS_SUBSCRIPTION_KEY") and os.environ.get("AZURE_TTS_REGION")) or \ (subscription_key and region): - return oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region) + data = oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region) + return audio_declaration + data if data else data raise EnvironmentError diff --git 
a/metagpt/memory/brain_memory.py b/metagpt/memory/brain_memory.py index 97319859a..68e930144 100644 --- a/metagpt/memory/brain_memory.py +++ b/metagpt/memory/brain_memory.py @@ -35,9 +35,15 @@ class BrainMemory(pydantic.BaseModel): return "\n".join(texts) def move_to_solution(self): - while len(self.history) > 1: - msg = self.history.pop() - self.solution.append(msg) + if len(self.history) < 2: + return + msgs = self.history[:-1] + self.solution.extend(msgs) + if not self.history[-1].is_contain(MessageType.Talk.value): + self.solution.append(self.history[-1]) + self.history = [] + else: + self.history = self.history[-1:] @property def last_talk(self): diff --git a/metagpt/provider/openai_api.py b/metagpt/provider/openai_api.py index e98acbd75..27f22e491 100644 --- a/metagpt/provider/openai_api.py +++ b/metagpt/provider/openai_api.py @@ -153,26 +153,10 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter): self.rpm = int(self._options.get("RPM", 10)) async def _achat_completion_stream(self, messages: list[dict]) -> str: - max_try = 5 - response = None - for i in range(max_try): - try: - response = await openai.ChatCompletion.acreate( + response = await self.async_retry_call(openai.ChatCompletion.acreate, **self._cons_kwargs(messages), stream=True ) - break - except openai.error.RateLimitError as e: - random_time = random.uniform(0, 3) # 生成0到5秒之间的随机时间 - rounded_time = round(random_time, 1) # 保留一位小数,以实现0.1秒的精度 - logger.warning(f"Exception:{e}, sleeping for {rounded_time} seconds") - await asyncio.sleep(rounded_time) - continue - except Exception as e: - error_str = traceback.format_exc() - logger.error(f"Exception:{e}, stack:{error_str}") - raise e - # create variables to collect the stream of chunks collected_chunks = [] collected_messages = [] @@ -213,12 +197,12 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter): return kwargs async def _achat_completion(self, messages: list[dict]) -> dict: - rsp = await self.llm.ChatCompletion.acreate(**self._cons_kwargs(messages)) + rsp = 
await self.async_retry_call(self.llm.ChatCompletion.acreate, **self._cons_kwargs(messages)) self._update_costs(rsp.get("usage")) return rsp def _chat_completion(self, messages: list[dict]) -> dict: - rsp = self.llm.ChatCompletion.create(**self._cons_kwargs(messages)) + rsp = self.retry_call(self.llm.ChatCompletion.create, **self._cons_kwargs(messages)) self._update_costs(rsp) return rsp @@ -398,4 +382,43 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter): if match: return match.group(1), match.group(2) else: - return None, input_string \ No newline at end of file + return None, input_string + + @staticmethod + async def async_retry_call(func, *args, **kwargs): + for i in range(OpenAIGPTAPI.MAX_TRY): + try: + rsp = await func(*args, **kwargs) + return rsp + except openai.error.RateLimitError as e: + random_time = random.uniform(0, 3) # 生成0到5秒之间的随机时间 + rounded_time = round(random_time, 1) # 保留一位小数,以实现0.1秒的精度 + logger.warning(f"Exception:{e}, sleeping for {rounded_time} seconds") + await asyncio.sleep(rounded_time) + continue + except openai.error.APIConnectionError as e: + logger.warning(f"Exception:{e}") + continue + except Exception as e: + error_str = traceback.format_exc() + logger.error(f"Exception:{e}, stack:{error_str}") + raise e + + @staticmethod + def retry_call(func, *args, **kwargs): + for i in range(OpenAIGPTAPI.MAX_TRY): + try: + rsp = func(*args, **kwargs) + return rsp + except openai.error.RateLimitError as e: + logger.warning(f"Exception:{e}") + continue + except openai.error.APIConnectionError as e: + logger.warning(f"Exception:{e}") + continue + except Exception as e: + error_str = traceback.format_exc() + logger.error(f"Exception:{e}, stack:{error_str}") + raise e + + MAX_TRY = 5 diff --git a/metagpt/roles/assistant.py b/metagpt/roles/assistant.py index dfbd406bc..032d73ca5 100644 --- a/metagpt/roles/assistant.py +++ b/metagpt/roles/assistant.py @@ -15,8 +15,8 @@ """ import asyncio - from metagpt.actions import ActionOutput +from 
metagpt.actions.skill_action import SkillAction, ArgumentsParingAction from metagpt.actions.talk_action import TalkAction from metagpt.config import Config from metagpt.learn.skill_loader import SkillLoader @@ -53,7 +53,7 @@ class Assistant(Role): logger.info(prompt) rsp = await self._llm.aask(prompt, []) logger.info(rsp) - return await self._plan(rsp) + return await self._plan(rsp, last_talk=last_talk) async def act(self) -> ActionOutput: result = await self._rc.todo.run(**self._options) @@ -88,8 +88,18 @@ class Assistant(Role): return True async def skill_handler(self, text, **kwargs) -> bool: - skill = - pass + last_talk = kwargs.get("last_talk") + skill = self.skills.get_skill(text) + if not skill: + logger.info(f"skill not found: {text}") + return await self.talk_handler(text=last_talk, **kwargs) + action = ArgumentsParingAction(options=self.options, skill=skill, llm=self._llm, **kwargs) + await action.run(**kwargs) + if action.args is None: + return await self.talk_handler(text=last_talk, **kwargs) + action = SkillAction(options=self.options, skill=skill, args=action.args, llm=self._llm) + self.add_to_do(action) + return True async def refine_memory(self) -> str: history_text = self.memory.history_text @@ -97,7 +107,7 @@ class Assistant(Role): if history_text == "": return last_talk history_summary = await self._llm.get_context_title(history_text, max_words=20) - if await self._llm.is_related(last_talk, history_summary): # 合并相关内容 + if last_talk and await self._llm.is_related(last_talk, history_summary): # 合并相关内容 last_talk = await self._llm.rewrite(sentence=last_talk, context=history_text) return last_talk @@ -109,11 +119,20 @@ class Assistant(Role): from metagpt.provider.openai_api import OpenAIGPTAPI return OpenAIGPTAPI.extract_info(input_string) + def get_memory(self) -> str: + return self.memory.json() + + def load_memory(self, jsn): + try: + self.memory = BrainMemory(**jsn) + except Exception as e: + logger.exception(f"load error:{e}, data:{jsn}") + async 
def main(): options = Config().runtime_options cost_manager = CostManager(**options) - topic = "dataiku vs. datarobot" + topic = "draw an apple" role = Assistant(options=options, cost_manager=cost_manager, language="Chinese") await role.talk(topic) while True: @@ -121,8 +140,9 @@ async def main(): if not has_action: break msg = await role.act() - print(msg) + logger.info(msg) # 获取用户终端输入 + logger.info("Enter prompt") talk = input("You: ") await role.talk(talk) diff --git a/metagpt/roles/role.py b/metagpt/roles/role.py index 1bb73f884..47f494c69 100644 --- a/metagpt/roles/role.py +++ b/metagpt/roles/role.py @@ -325,4 +325,12 @@ class Role: self._actions.append(act) def add_to_do(self, act): - self._rc.todo = act \ No newline at end of file + self._rc.todo = act + + async def think(self) -> bool: + return await self._think() + + async def act(self) -> ActionOutput: + msg = await self._act() + return ActionOutput(content=msg.content, + instruct_content=msg.instruct_content) diff --git a/metagpt/schema.py b/metagpt/schema.py index e1cd011c6..909313886 100644 --- a/metagpt/schema.py +++ b/metagpt/schema.py @@ -67,6 +67,9 @@ class Message: intersection = set(tags) & self.tags return len(intersection) > 0 + def is_contain(self, tag): + return self.is_contain_tags([tag]) + @dataclass class UserMessage(Message):