feat: +common talk role

2026-07-26 17:11:07 +02:00 · 2023-08-26 16:59:12 +08:00 · 2023-08-26 16:59:12 +08:00 · 2c593bedea
commit 2c593bedea
parent 4fe3d6e879
13 changed files with 261 additions and 46 deletions
--- a/.well-known/skills.yaml
+++ b/.well-known/skills.yaml
@ -3,15 +3,41 @@ entities:
     skills:
     - name: text_to_speech
       description: Text-to-speech
+       id: text_to_speech.text_to_speech
       requisite:
       - AZURE_TTS_SUBSCRIPTION_KEY
       - AZURE_TTS_REGION
+       arguments:
+         text: 'The text used for voice conversion. Required.'
+         lang: 'The value can contain a language code such as en (English), or a locale such as en-US (English - United States). The optional parameter values are "English", "Chinese". Default value: "Chinese".'
+         voice: 'Default value: "zh-CN-XiaomoNeural".'
+         style: 'Speaking style to express different emotions like cheerfulness, empathy, and calm. The optional parameter values are "affectionate", "angry", "calm", "cheerful", "depressed", "disgruntled", "embarrassed", "envious", "fearful", "gentle", "sad", "serious". Default value: "affectionate".'
+         role: 'With roles, the same voice can act as a different age and gender. The optional parameter values are "Girl", "Boy", "OlderAdultFemale", "OlderAdultMale", "SeniorFemale", "SeniorMale", "YoungAdultFemale", "YoungAdultMale". Default value: "Girl".'
+       examples:
+         - ask: 'A girl says "hello world"'
+           answer: 'text_to_speech(text="hello world", role="Girl")'
+         - ask: 'A boy affectionate says "hello world"'
+           answer: 'text_to_speech(text="hello world", role="Boy", style="affectionate")'
+         - ask: 'A boy says "你好"'
+           answer: 'text_to_speech(text="你好", role="Boy", lang="Chinese")'
+       returns:
+         type: string
+         format: base64
+
     - name: text_to_image
       description: Create a drawing based on the text.
+       id: text_to_image.text_to_image
       requisite:
       - OPENAI_API_KEY
       - METAGPT_TEXT_TO_IMAGE_MODEL
-     - name: text_to_embedding
-       description: Convert the text into embeddings.
-       requisite:
-       - OPENAI_API_KEY
+       arguments:
+         text: 'The text used for image conversion. Required.'
+         size_type: 'Default value: "512x512".'
+       examples:
+         - ask: 'Draw a girl'
+           answer: 'text_to_image(text="Draw a girl", size_type="512x512")'
+         - ask: 'Draw an apple'
+           answer: 'text_to_image(text="Draw an apple", size_type="512x512")'
+       returns:
+         type: string
+         format: base64
--- a/metagpt/actions/skill_action.py
+++ b/metagpt/actions/skill_action.py
@ -0,0 +1,88 @@
+import ast
+import importlib
+
+from metagpt.actions import Action, ActionOutput
+from metagpt.learn.skill_loader import Skill
+from metagpt.logs import logger
+
+
+class ArgumentsParingAction(Action):
+    """Ask the LLM to express the user's request as a call to a skill function.
+
+    The prompt is built from the skill's argument descriptions and examples;
+    the reply is parsed into a keyword-argument dict stored in ``self.args``.
+    """
+
+    def __init__(self, options, last_talk: str, skill: Skill, context=None, llm=None, **kwargs):
+        """Store the skill to call and the user request (`last_talk`) to convert."""
+        super(ArgumentsParingAction, self).__init__(options=options, name='', context=context, llm=llm)
+        self.skill = skill
+        self.ask = last_talk
+        self.rsp = None
+        self.args = None  # set by `run`; None means the reply could not be parsed
+
+    @property
+    def prompt(self):
+        """Return the few-shot prompt: parameter descriptions, examples, then the request."""
+        prompt = f"{self.skill.name} function parameters description:\n"
+        for k, v in self.skill.arguments.items():
+            prompt += f"parameter `{k}`: {v}\n"
+        prompt += "\n"
+        prompt += "Examples:\n"
+        for e in self.skill.examples:
+            prompt += f"If want you to do `{e.ask}`, return `{e.answer}` brief and clear.\n"
+        prompt += f"\nNow I want you to do `{self.ask}`, return in examples format above, brief and clear."
+        return prompt
+
+    async def run(self, *args, **kwargs) -> ActionOutput:
+        """Send the prompt to the LLM, parse the reply into `self.args`, and return the raw reply."""
+        prompt = self.prompt
+        logger.info(prompt)
+        rsp = await self.llm.aask(msg=prompt, system_msgs=[])
+        logger.info(rsp)
+        self.args = ArgumentsParingAction.parse_arguments(skill_name=self.skill.name, txt=rsp)
+        self.rsp = ActionOutput(content=rsp)
+        return self.rsp
+
+    @staticmethod
+    def parse_arguments(skill_name, txt) -> dict | None:
+        """Extract keyword arguments from text containing ``skill_name(key=value, ...)``.
+
+        :param skill_name: Function name expected in front of the opening parenthesis.
+        :param txt: Text to search; anything before the call (e.g. a backtick) is ignored.
+        :return: Mapping of argument name to literal value, or None when no
+            well-formed call with literal keyword arguments can be found.
+        """
+        prefix = skill_name + "("
+        begin_ix = txt.find(prefix)
+        if begin_ix < 0:
+            logger.error(f"{skill_name} not in {txt}")
+            return None
+        args_begin = begin_ix + len(prefix)
+        end_ix = txt.rfind(")")
+        # The closing parenthesis must come after the opening one to delimit the arguments.
+        if end_ix < args_begin:
+            logger.error(f"')' not in {txt}")
+            return None
+        args_txt = txt[args_begin:end_ix]
+        logger.info(args_txt)
+        try:
+            # Parse the arguments as if they were passed to `dict(...)` so that only
+            # the syntax tree is inspected; nothing from the reply is executed.
+            call = ast.parse(f"dict({args_txt})", mode='eval').body
+            if not isinstance(call, ast.Call) or not isinstance(call.func, ast.Name):
+                # Trailing text such as `) (note` turns the expression into something
+                # other than the single `dict(...)` call we built.
+                raise ValueError("not a single call expression")
+            args = {}
+            for keyword in call.keywords:
+                if keyword.arg is None:
+                    # `**mapping` has no argument name and cannot be forwarded as a keyword.
+                    raise ValueError("`**` unpacking is not supported")
+                args[keyword.arg] = ast.literal_eval(keyword.value)
+        except (SyntaxError, ValueError, TypeError) as e:
+            # The reply is free text from the LLM: a malformed one must yield None so the
+            # caller can fall back instead of crashing.
+            logger.error(f"cannot parse arguments `{args_txt}`: {e}")
+            return None
+        return args
+
+
+class SkillAction(Action):
+    """Run a skill function exported by `metagpt.learn` with already-parsed arguments."""
+
+    def __init__(self, options, skill: Skill, args: dict, context=None, llm=None, **kwargs):
+        """Store the skill to run and the keyword arguments to call it with."""
+        super(SkillAction, self).__init__(options=options, name='', context=context, llm=llm)
+        self._skill = skill
+        self._args = args
+        self.rsp = None
+
+    async def run(self, *args, **kwargs) -> str | ActionOutput | None:
+        """Call the skill function and wrap its result in an ActionOutput.
+
+        `self.options` is forwarded to the skill as extra keyword arguments.
+        """
+        self.rsp = self.find_and_call_function(self._skill.name, args=self._args, **self.options)
+        return ActionOutput(content=self.rsp, instruct_content=self._skill.json())
+
+    @staticmethod
+    def find_and_call_function(function_name, args, **kwargs):
+        """Look up `function_name` in `metagpt.learn` and call it.
+
+        :param function_name: Attribute name of the skill function in `metagpt.learn`.
+        :param args: Keyword arguments parsed for the skill.
+        :param kwargs: Extra keyword arguments forwarded to the skill.
+        :return: The skill's result, or None when the function cannot be found.
+        """
+        # Only the lookup is guarded: an AttributeError raised inside the skill
+        # itself must not be misreported as "not found" and silently dropped.
+        try:
+            module = importlib.import_module("metagpt.learn")
+            function = getattr(module, function_name)
+        except (ModuleNotFoundError, AttributeError):
+            logger.error(f"{function_name} not found")
+            return None
+        # Call the function and return its result.
+        return function(**args, **kwargs)
+
+
+if __name__ == '__main__':
+    # Manual smoke check: parse a sample reply for the `text_to_image` skill.
+    ArgumentsParingAction.parse_arguments(skill_name="text_to_image",
+                                          txt='`text_to_image(text="Draw an apple", size_type="512x512")`')
--- a/metagpt/actions/talk_action.py
+++ b/metagpt/actions/talk_action.py
@ -4,7 +4,7 @@ from metagpt.logs import logger


 class TalkAction(Action):
-    def __init__(self, options, name: str = '', talk='', history_summary='', context=None, llm=None):
+    def __init__(self, options, name: str = '', talk='', history_summary='', context=None, llm=None, **kwargs):
        context = context or {}
        context["talk"] = talk
        context["history_summery"] = history_summary
--- a/metagpt/learn/__init__.py
+++ b/metagpt/learn/__init__.py
@ -5,3 +5,11 @@
@Author  : alexanderwu
@File    : __init__.py
 """
+
+from metagpt.learn.text_to_image import text_to_image
+from metagpt.learn.text_to_speech import text_to_speech
+
+__all__ = [
+    "text_to_image",
+    "text_to_speech",
+]
--- a/metagpt/learn/skill_loader.py
+++ b/metagpt/learn/skill_loader.py
@ -1,14 +1,26 @@
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Optional

 import yaml
 from pydantic import BaseModel


+class Example(BaseModel):
+    """One `examples` entry of a skill declaration: a request and the expected call."""
+    ask: str
+    answer: str
+
+class Returns(BaseModel):
+    """The `returns` entry of a skill declaration: a type plus an optional format."""
+    type: str
+    format: Optional[str] = None
+
 class Skill(BaseModel):
+    """One skill entry as declared in the skills YAML file."""
    name: str
    description: str
+    id: str
    requisite: List[str]
+    # Maps each argument name to its description text.
+    arguments: Dict
+    examples: List[Example]
+    returns: Returns


 class EntitySkills(BaseModel):
@ -26,13 +38,26 @@ class SkillLoader:
            skills = yaml.safe_load(file)
        self._skills = SkillsDeclaration(**skills)

-    def get_skill_list(self, entity_name: str = "Assistant"):
-        if not self._skills or entity_name not in self._skills.entities:
+    def get_skill_list(self, entity_name: str = "Assistant") -> Dict:
+        """Return a mapping of skill description to skill name for `entity_name`.
+
+        An empty dict is returned when the entity cannot be found.
+        """
+        entity_skills = self.get_entity(entity_name)
+        if not entity_skills:
            return {}
-        entity_skills = self._skills.entities.get(entity_name)

        description_to_name_mappings = {}
        for s in entity_skills.skills:
            description_to_name_mappings[s.description] = s.name

        return description_to_name_mappings
+
+    def get_skill(self, name, entity_name: str = "Assistant") -> Optional[Skill]:
+        """Return the skill called `name` declared for `entity_name`.
+
+        Returns None when the entity is missing or has no skill with that name.
+        """
+        entity = self.get_entity(entity_name)
+        if not entity:
+            return None
+        for sk in entity.skills:
+            if sk.name == name:
+                return sk
+
+    def get_entity(self, name) -> Optional[EntitySkills]:
+        """Return the skills declared for entity `name`, or None if nothing is loaded or it is unknown."""
+        if not self._skills:
+            return None
+        return self._skills.entities.get(name)
--- a/metagpt/learn/text_to_embedding.py
+++ b/metagpt/learn/text_to_embedding.py
@ -16,7 +16,7 @@ from metagpt.utils.common import initialize_environment
@skill_metadata(name="Text to Embedding",
                description="Convert the text into embeddings.",
                requisite="`OPENAI_API_KEY`")
-def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key=""):
+def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key="", **kwargs):
    """Text to embedding

    :param text: The text used for embedding.
--- a/metagpt/learn/text_to_image.py
+++ b/metagpt/learn/text_to_image.py
@ -17,7 +17,7 @@ from metagpt.utils.common import initialize_environment
@skill_metadata(name="Text to image",
                description="Create a drawing based on the text.",
                requisite="`OPENAI_API_KEY` or `METAGPT_TEXT_TO_IMAGE_MODEL`")
-def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url=""):
+def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url="", **kwargs):
    """Text to image

    :param text: The text used for image conversion.
@ -27,8 +27,14 @@ def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url
    :return: The image data is returned in Base64 encoding.
    """
    initialize_environment()
+    image_declaration = "data:image/png;base64,"
    if os.environ.get("METAGPT_TEXT_TO_IMAGE_MODEL") or model_url:
-        return oas3_metagpt_text_to_image(text, size_type, model_url)
+        data = oas3_metagpt_text_to_image(text, size_type, model_url)
+        return image_declaration + data if data else ""
    if os.environ.get("OPENAI_API_KEY") or openai_api_key:
-        return oas3_openai_text_to_image(text, size_type, openai_api_key)
+        data = oas3_openai_text_to_image(text, size_type, openai_api_key)
+        return image_declaration + data if data else ""
+
    raise EnvironmentError
+
+
--- a/metagpt/learn/text_to_speech.py
+++ b/metagpt/learn/text_to_speech.py
@ -17,7 +17,7 @@ from metagpt.utils.common import initialize_environment
                description="Text-to-speech",
                requisite="`AZURE_TTS_SUBSCRIPTION_KEY` and `AZURE_TTS_REGION`")
 def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affectionate", role="Girl",
-                   subscription_key="", region=""):
+                   subscription_key="", region="", **kwargs):
    """Text to speech
    For more details, check out:`https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts`

@ -32,8 +32,10 @@ def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affect

    """
    initialize_environment()
+    audio_declaration = "data:audio/wav;base64,"
    if (os.environ.get("AZURE_TTS_SUBSCRIPTION_KEY") and os.environ.get("AZURE_TTS_REGION")) or \
            (subscription_key and region):
-        return oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region)
+        data = oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region)
+        return audio_declaration + data if data else data

    raise EnvironmentError
--- a/metagpt/memory/brain_memory.py
+++ b/metagpt/memory/brain_memory.py
@ -35,9 +35,15 @@ class BrainMemory(pydantic.BaseModel):
        return "\n".join(texts)

    def move_to_solution(self):
-        while len(self.history) > 1:
-            msg = self.history.pop()
-            self.solution.append(msg)
+        # With fewer than two messages in `history` there is nothing to move.
+        if len(self.history) < 2:
+            return
+        # Everything except the latest message is appended to `solution` in order.
+        msgs = self.history[:-1]
+        self.solution.extend(msgs)
+        # The latest message stays in `history` only when it is tagged as Talk;
+        # otherwise it is moved to `solution` too and `history` is emptied.
+        if not self.history[-1].is_contain(MessageType.Talk.value):
+            self.solution.append(self.history[-1])
+            self.history = []
+        else:
+            self.history = self.history[-1:]

    @property
    def last_talk(self):
--- a/metagpt/provider/openai_api.py
+++ b/metagpt/provider/openai_api.py
@ -153,26 +153,10 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter):
        self.rpm = int(self._options.get("RPM", 10))

    async def _achat_completion_stream(self, messages: list[dict]) -> str:
-        max_try = 5
-        response = None
-        for i in range(max_try):
-            try:
-                response = await openai.ChatCompletion.acreate(
+        response = await self.async_retry_call(openai.ChatCompletion.acreate,
                    **self._cons_kwargs(messages),
                    stream=True
                )
-                break
-            except openai.error.RateLimitError as e:
-                random_time = random.uniform(0, 3)  # 生成0到5秒之间的随机时间
-                rounded_time = round(random_time, 1)  # 保留一位小数，以实现0.1秒的精度
-                logger.warning(f"Exception:{e}, sleeping for {rounded_time} seconds")
-                await asyncio.sleep(rounded_time)
-                continue
-            except Exception as e:
-                error_str = traceback.format_exc()
-                logger.error(f"Exception:{e}, stack:{error_str}")
-                raise e
-
        # create variables to collect the stream of chunks
        collected_chunks = []
        collected_messages = []
@ -213,12 +197,12 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter):
        return kwargs

    async def _achat_completion(self, messages: list[dict]) -> dict:
-        rsp = await self.llm.ChatCompletion.acreate(**self._cons_kwargs(messages))
+        # The request goes through `async_retry_call` instead of calling `acreate` directly.
+        rsp = await self.async_retry_call(self.llm.ChatCompletion.acreate, **self._cons_kwargs(messages))
        self._update_costs(rsp.get("usage"))
        return rsp

    def _chat_completion(self, messages: list[dict]) -> dict:
-        rsp = self.llm.ChatCompletion.create(**self._cons_kwargs(messages))
+        # The request goes through `retry_call` instead of calling `create` directly.
+        rsp = self.retry_call(self.llm.ChatCompletion.create, **self._cons_kwargs(messages))
+        # NOTE(review): `_achat_completion` passes `rsp.get("usage")` to `_update_costs`,
+        # while the whole response is passed here — confirm which form is expected.
        self._update_costs(rsp)
        return rsp

@ -398,4 +382,43 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter):
        if match:
            return match.group(1), match.group(2)
        else:
-            return None, input_string
+            return None, input_string
+
+    @staticmethod
+    async def async_retry_call(func,  *args, **kwargs):
+        """Await `func(*args, **kwargs)`, retrying on rate-limit and connection errors.
+
+        At most `OpenAIGPTAPI.MAX_TRY` attempts are made. A rate-limit error is
+        followed by a short random sleep; a connection error is retried at once.
+
+        :return: Whatever `func` returns on the first successful attempt.
+        :raises: Any other exception immediately (after logging), or the last
+            retryable error once all attempts are used up.
+        """
+        last_error = None
+        for _ in range(OpenAIGPTAPI.MAX_TRY):
+            try:
+                return await func(*args, **kwargs)
+            except openai.error.RateLimitError as e:
+                last_error = e
+                # Back off for a random 0-3 seconds, rounded to 0.1 s precision.
+                rounded_time = round(random.uniform(0, 3), 1)
+                logger.warning(f"Exception:{e}, sleeping for {rounded_time} seconds")
+                await asyncio.sleep(rounded_time)
+            except openai.error.APIConnectionError as e:
+                last_error = e
+                logger.warning(f"Exception:{e}")
+            except Exception as e:
+                error_str = traceback.format_exc()
+                logger.error(f"Exception:{e}, stack:{error_str}")
+                raise e
+        # Every attempt failed with a retryable error: surface it rather than
+        # returning None, which callers would dereference as a response.
+        if last_error is not None:
+            raise last_error
+
+    @staticmethod
+    def retry_call(func, *args, **kwargs):
+        """Call `func(*args, **kwargs)`, retrying on rate-limit and connection errors.
+
+        At most `OpenAIGPTAPI.MAX_TRY` attempts are made, with no delay between them.
+
+        :return: Whatever `func` returns on the first successful attempt.
+        :raises: Any other exception immediately (after logging), or the last
+            retryable error once all attempts are used up.
+        """
+        last_error = None
+        for _ in range(OpenAIGPTAPI.MAX_TRY):
+            try:
+                return func(*args, **kwargs)
+            except (openai.error.RateLimitError, openai.error.APIConnectionError) as e:
+                last_error = e
+                logger.warning(f"Exception:{e}")
+            except Exception as e:
+                error_str = traceback.format_exc()
+                logger.error(f"Exception:{e}, stack:{error_str}")
+                raise e
+        # Every attempt failed with a retryable error: surface it rather than
+        # returning None, which callers would treat as a response.
+        if last_error is not None:
+            raise last_error
+
+    # Upper bound on attempts made by `async_retry_call` and `retry_call`.
+    MAX_TRY = 5
--- a/metagpt/roles/assistant.py
+++ b/metagpt/roles/assistant.py
@ -15,8 +15,8 @@
 """
 import asyncio

-
 from metagpt.actions import ActionOutput
+from metagpt.actions.skill_action import SkillAction, ArgumentsParingAction
 from metagpt.actions.talk_action import TalkAction
 from metagpt.config import Config
 from metagpt.learn.skill_loader import SkillLoader
@ -53,7 +53,7 @@ class Assistant(Role):
        logger.info(prompt)
        rsp = await self._llm.aask(prompt, [])
        logger.info(rsp)
-        return await self._plan(rsp)
+        return await self._plan(rsp, last_talk=last_talk)

    async def act(self) -> ActionOutput:
        result = await self._rc.todo.run(**self._options)
@ -88,8 +88,18 @@ class Assistant(Role):
        return True

    async def skill_handler(self, text, **kwargs) -> bool:
-        skill =
-        pass
+        """Queue the skill named `text` as the next action, or fall back to plain talk.
+
+        `kwargs["last_talk"]` is the user request; it is used to derive the skill
+        arguments and is the text handed to `talk_handler` when the skill is unknown
+        or its arguments cannot be parsed.
+
+        :return: True once a `SkillAction` is queued, otherwise the result of `talk_handler`.
+        """
+        last_talk = kwargs.get("last_talk")
+        skill = self.skills.get_skill(text)
+        if not skill:
+            # Report the miss only when the lookup actually failed.
+            logger.info(f"skill not found: {text}")
+            return await self.talk_handler(text=last_talk, **kwargs)
+        action = ArgumentsParingAction(options=self.options, skill=skill, llm=self._llm, **kwargs)
+        await action.run(**kwargs)
+        if action.args is None:
+            return await self.talk_handler(text=last_talk, **kwargs)
+        action = SkillAction(options=self.options, skill=skill, args=action.args, llm=self._llm)
+        self.add_to_do(action)
+        return True

    async def refine_memory(self) -> str:
        history_text = self.memory.history_text
@ -97,7 +107,7 @@ class Assistant(Role):
        if history_text == "":
            return last_talk
        history_summary = await self._llm.get_context_title(history_text, max_words=20)
-        if await self._llm.is_related(last_talk, history_summary):  # 合并相关内容
+        if last_talk and await self._llm.is_related(last_talk, history_summary):  # 合并相关内容
            last_talk = await self._llm.rewrite(sentence=last_talk, context=history_text)
            return last_talk

@ -109,11 +119,20 @@ class Assistant(Role):
        from metagpt.provider.openai_api import OpenAIGPTAPI
        return OpenAIGPTAPI.extract_info(input_string)

+    def get_memory(self) -> str:
+        """Return `self.memory` serialized through its `json()` method."""
+        return self.memory.json()
+
+    def load_memory(self, jsn):
+        """Replace `self.memory` with a BrainMemory built from the mapping `jsn`.
+
+        Any failure is logged and the current memory is left unchanged.
+        """
+        try:
+            self.memory = BrainMemory(**jsn)
+        except Exception as e:
+            logger.exception(f"load error:{e}, data:{jsn}")
+

 async def main():
    options = Config().runtime_options
    cost_manager = CostManager(**options)
-    topic = "dataiku vs. datarobot"
+    topic = "draw an apple"
    role = Assistant(options=options, cost_manager=cost_manager, language="Chinese")
    await role.talk(topic)
    while True:
@ -121,8 +140,9 @@ async def main():
        if not has_action:
            break
        msg = await role.act()
-        print(msg)
+        logger.info(msg)
        # 获取用户终端输入
+        logger.info("Enter prompt")
        talk = input("You: ")
        await role.talk(talk)

--- a/metagpt/roles/role.py
+++ b/metagpt/roles/role.py
@ -325,4 +325,12 @@ class Role:
        self._actions.append(act)

    def add_to_do(self, act):
-        self._rc.todo = act
+        self._rc.todo = act
+
+    async def think(self) -> bool:
+        """Public wrapper that awaits `_think` and returns its result."""
+        return await self._think()
+
+    async def act(self) -> ActionOutput:
+        """Await `_act` and repackage its `content` and `instruct_content` as an ActionOutput."""
+        msg = await self._act()
+        return ActionOutput(content=msg.content,
+                            instruct_content=msg.instruct_content)
--- a/metagpt/schema.py
+++ b/metagpt/schema.py
@ -67,6 +67,9 @@ class Message:
        intersection = set(tags) & self.tags
        return len(intersection) > 0

+    def is_contain(self, tag):
+        """Return True if this message carries `tag` (single-tag form of `is_contain_tags`)."""
+        return self.is_contain_tags([tag])
+

@dataclass
 class UserMessage(Message):