diff --git a/.well-known/skills.yaml b/.well-known/skills.yaml index 5ccb8094b..7a035910c 100644 --- a/.well-known/skills.yaml +++ b/.well-known/skills.yaml @@ -3,15 +3,41 @@ entities: skills: - name: text_to_speech description: Text-to-speech + id: text_to_speech.text_to_speech requisite: - AZURE_TTS_SUBSCRIPTION_KEY - AZURE_TTS_REGION + arguments: + text: 'The text used for voice conversion. Required.' + lang: 'The value can contain a language code such as en (English), or a locale such as en-US (English - United States). The optional parameter values are "English", "Chinese". Default value: "Chinese".' + voice: 'Default value: "zh-CN-XiaomoNeural".' + style: 'Speaking style to express different emotions like cheerfulness, empathy, and calm. The optional parameter values are "affectionate", "angry", "calm", "cheerful", "depressed", "disgruntled", "embarrassed", "envious", "fearful", "gentle", "sad", "serious". Default value: "affectionate".' + role: 'With roles, the same voice can act as a different age and gender. The optional parameter values are "Girl", "Boy", "OlderAdultFemale", "OlderAdultMale", "SeniorFemale", "SeniorMale", "YoungAdultFemale", "YoungAdultMale". Default value: "Girl".' + examples: + - ask: 'A girl says "hello world"' + answer: 'text_to_speech(text="hello world", role="Girl")' + - ask: 'A boy affectionate says "hello world"' + answer: 'text_to_speech(text="hello world", role="Boy", style="affectionate")' + - ask: 'A boy says "你好"' + answer: 'text_to_speech(text="你好", role="Boy", lang="Chinese")' + returns: + type: string + format: base64 + - name: text_to_image description: Create a drawing based on the text. + id: text_to_image.text_to_image requisite: - OPENAI_API_KEY - METAGPT_TEXT_TO_IMAGE_MODEL - - name: text_to_embedding - description: Convert the text into embeddings. - requisite: - - OPENAI_API_KEY + arguments: + text: 'The text used for image conversion. Required.' + size_type: 'Default value: "512x512".' 
+ examples: + - ask: 'Draw a girl' + answer: 'text_to_image(text="Draw a girl", size_type="512x512")' + - ask: 'Draw an apple' + answer: 'text_to_image(text="Draw an apple", size_type="512x512")' + returns: + type: string + format: base64 diff --git a/metagpt/actions/skill_action.py b/metagpt/actions/skill_action.py index e69de29bb..8cc7b6c42 100644 --- a/metagpt/actions/skill_action.py +++ b/metagpt/actions/skill_action.py @@ -0,0 +1,88 @@ +import ast +import importlib + +from metagpt.actions import Action, ActionOutput +from metagpt.learn.skill_loader import Skill +from metagpt.logs import logger + + +class ArgumentsParingAction(Action): + def __init__(self, options, last_talk: str, skill: Skill, context=None, llm=None, **kwargs): + super(ArgumentsParingAction, self).__init__(options=options, name='', context=context, llm=llm) + self.skill = skill + self.ask = last_talk + self.rsp = None + self.args = None + + @property + def prompt(self): + prompt = f"{self.skill.name} function parameters description:\n" + for k, v in self.skill.arguments.items(): + prompt += f"parameter `{k}`: {v}\n" + prompt += "\n" + prompt += "Examples:\n" + for e in self.skill.examples: + prompt += f"If want you to do `{e.ask}`, return `{e.answer}` brief and clear.\n" + prompt += f"\nNow I want you to do `{self.ask}`, return in examples format above, brief and clear." 
+ return prompt + + async def run(self, *args, **kwargs) -> ActionOutput: + prompt = self.prompt + logger.info(prompt) + rsp = await self.llm.aask(msg=prompt, system_msgs=[]) + logger.info(rsp) + self.args = ArgumentsParingAction.parse_arguments(skill_name=self.skill.name, txt=rsp) + self.rsp = ActionOutput(content=rsp) + return self.rsp + + @staticmethod + def parse_arguments(skill_name, txt) -> dict: + prefix = skill_name + "(" + if prefix not in txt: + logger.error(f"{skill_name} not in {txt}") + return None + if ")" not in txt: + logger.error(f"')' not in {txt}") + return None + begin_ix = txt.find(prefix) + end_ix = txt.rfind(")") + args_txt = txt[begin_ix + len(prefix): end_ix] + logger.info(args_txt) + fake_expression = f"dict({args_txt})" + parsed_expression = ast.parse(fake_expression, mode='eval') + args = {} + for keyword in parsed_expression.body.keywords: + key = keyword.arg + value = ast.literal_eval(keyword.value) + args[key] = value + return args + + +class SkillAction(Action): + def __init__(self, options, skill: Skill, args: dict, context=None, llm=None, **kwargs): + super(SkillAction, self).__init__(options=options, name='', context=context, llm=llm) + self._skill = skill + self._args = args + self.rsp = None + + async def run(self, *args, **kwargs) -> str | ActionOutput | None: + """Run action""" + self.rsp = self.find_and_call_function(self._skill.name, args=self._args, **self.options) + return ActionOutput(content=self.rsp, instruct_content=self._skill.json()) + + @staticmethod + def find_and_call_function(function_name, args, **kwargs): + try: + module = importlib.import_module("metagpt.learn") + function = getattr(module, function_name) + # 调用函数并返回结果 + result = function(**args, **kwargs) + return result + except (ModuleNotFoundError, AttributeError): + logger.error(f"{function_name} not found") + return None + + +if __name__ == '__main__': + ArgumentsParingAction.parse_arguments(skill_name="text_to_image", + txt='`text_to_image(text="Draw an 
apple", size_type="512x512")`') diff --git a/metagpt/actions/talk_action.py b/metagpt/actions/talk_action.py index 4275a1b9e..5485456c5 100644 --- a/metagpt/actions/talk_action.py +++ b/metagpt/actions/talk_action.py @@ -4,7 +4,7 @@ from metagpt.logs import logger class TalkAction(Action): - def __init__(self, options, name: str = '', talk='', history_summary='', context=None, llm=None): + def __init__(self, options, name: str = '', talk='', history_summary='', context=None, llm=None, **kwargs): context = context or {} context["talk"] = talk context["history_summery"] = history_summary diff --git a/metagpt/learn/__init__.py b/metagpt/learn/__init__.py index 28b8739c3..c8270dbfb 100644 --- a/metagpt/learn/__init__.py +++ b/metagpt/learn/__init__.py @@ -5,3 +5,11 @@ @Author : alexanderwu @File : __init__.py """ + +from metagpt.learn.text_to_image import text_to_image +from metagpt.learn.text_to_speech import text_to_speech + +__all__ = [ + "text_to_image", + "text_to_speech", +] \ No newline at end of file diff --git a/metagpt/learn/skill_loader.py b/metagpt/learn/skill_loader.py index eeca12871..46ead728d 100644 --- a/metagpt/learn/skill_loader.py +++ b/metagpt/learn/skill_loader.py @@ -1,14 +1,26 @@ from pathlib import Path -from typing import List, Dict +from typing import List, Dict, Optional import yaml from pydantic import BaseModel +class Example(BaseModel): + ask: str + answer: str + +class Returns(BaseModel): + type: str + format: Optional[str] = None + class Skill(BaseModel): name: str description: str + id: str requisite: List[str] + arguments: Dict + examples: List[Example] + returns: Returns class EntitySkills(BaseModel): @@ -26,13 +38,26 @@ class SkillLoader: skills = yaml.safe_load(file) self._skills = SkillsDeclaration(**skills) - def get_skill_list(self, entity_name: str = "Assistant"): - if not self._skills or entity_name not in self._skills.entities: + def get_skill_list(self, entity_name: str = "Assistant") -> Dict: + entity_skills = 
self.get_entity(entity_name) + if not entity_skills: return {} - entity_skills = self._skills.entities.get(entity_name) description_to_name_mappings = {} for s in entity_skills.skills: description_to_name_mappings[s.description] = s.name return description_to_name_mappings + + def get_skill(self, name, entity_name: str = "Assistant") -> Skill: + entity = self.get_entity(entity_name) + if not entity: + return None + for sk in entity.skills: + if sk.name == name: + return sk + + def get_entity(self, name) -> EntitySkills: + if not self._skills: + return None + return self._skills.entities.get(name) \ No newline at end of file diff --git a/metagpt/learn/text_to_embedding.py b/metagpt/learn/text_to_embedding.py index 38fd7c0cb..6d0cefcdb 100644 --- a/metagpt/learn/text_to_embedding.py +++ b/metagpt/learn/text_to_embedding.py @@ -16,7 +16,7 @@ from metagpt.utils.common import initialize_environment @skill_metadata(name="Text to Embedding", description="Convert the text into embeddings.", requisite="`OPENAI_API_KEY`") -def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key=""): +def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key="", **kwargs): """Text to embedding :param text: The text used for embedding. diff --git a/metagpt/learn/text_to_image.py b/metagpt/learn/text_to_image.py index d123e116a..2f946e239 100644 --- a/metagpt/learn/text_to_image.py +++ b/metagpt/learn/text_to_image.py @@ -17,7 +17,7 @@ from metagpt.utils.common import initialize_environment @skill_metadata(name="Text to image", description="Create a drawing based on the text.", requisite="`OPENAI_API_KEY` or `METAGPT_TEXT_TO_IMAGE_MODEL`") -def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url=""): +def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url="", **kwargs): """Text to image :param text: The text used for image conversion. 
@@ -27,8 +27,14 @@ def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url :return: The image data is returned in Base64 encoding. """ initialize_environment() + image_declaration = "data:image/png;base64," if os.environ.get("METAGPT_TEXT_TO_IMAGE_MODEL") or model_url: - return oas3_metagpt_text_to_image(text, size_type, model_url) + data = oas3_metagpt_text_to_image(text, size_type, model_url) + return image_declaration + data if data else "" if os.environ.get("OPENAI_API_KEY") or openai_api_key: - return oas3_openai_text_to_image(text, size_type, openai_api_key) + data = oas3_openai_text_to_image(text, size_type, openai_api_key) + return image_declaration + data if data else "" + raise EnvironmentError + + diff --git a/metagpt/learn/text_to_speech.py b/metagpt/learn/text_to_speech.py index 5631ef45e..90dd878a1 100644 --- a/metagpt/learn/text_to_speech.py +++ b/metagpt/learn/text_to_speech.py @@ -17,7 +17,7 @@ from metagpt.utils.common import initialize_environment description="Text-to-speech", requisite="`AZURE_TTS_SUBSCRIPTION_KEY` and `AZURE_TTS_REGION`") def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affectionate", role="Girl", - subscription_key="", region=""): + subscription_key="", region="", **kwargs): """Text to speech For more details, check out:`https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` @@ -32,8 +32,10 @@ def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affect """ initialize_environment() + audio_declaration = "data:audio/wav;base64," if (os.environ.get("AZURE_TTS_SUBSCRIPTION_KEY") and os.environ.get("AZURE_TTS_REGION")) or \ (subscription_key and region): - return oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region) + data = oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region) + return audio_declaration + data if data else data raise EnvironmentError diff --git 
a/metagpt/memory/brain_memory.py b/metagpt/memory/brain_memory.py index 97319859a..68e930144 100644 --- a/metagpt/memory/brain_memory.py +++ b/metagpt/memory/brain_memory.py @@ -35,9 +35,15 @@ class BrainMemory(pydantic.BaseModel): return "\n".join(texts) def move_to_solution(self): - while len(self.history) > 1: - msg = self.history.pop() - self.solution.append(msg) + if len(self.history) < 2: + return + msgs = self.history[:-1] + self.solution.extend(msgs) + if not self.history[-1].is_contain(MessageType.Talk.value): + self.solution.append(self.history[-1]) + self.history = [] + else: + self.history = self.history[-1:] @property def last_talk(self): diff --git a/metagpt/provider/openai_api.py b/metagpt/provider/openai_api.py index e98acbd75..27f22e491 100644 --- a/metagpt/provider/openai_api.py +++ b/metagpt/provider/openai_api.py @@ -153,26 +153,10 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter): self.rpm = int(self._options.get("RPM", 10)) async def _achat_completion_stream(self, messages: list[dict]) -> str: - max_try = 5 - response = None - for i in range(max_try): - try: - response = await openai.ChatCompletion.acreate( + response = await self.async_retry_call(openai.ChatCompletion.acreate, **self._cons_kwargs(messages), stream=True ) - break - except openai.error.RateLimitError as e: - random_time = random.uniform(0, 3) # 生成0到5秒之间的随机时间 - rounded_time = round(random_time, 1) # 保留一位小数,以实现0.1秒的精度 - logger.warning(f"Exception:{e}, sleeping for {rounded_time} seconds") - await asyncio.sleep(rounded_time) - continue - except Exception as e: - error_str = traceback.format_exc() - logger.error(f"Exception:{e}, stack:{error_str}") - raise e - # create variables to collect the stream of chunks collected_chunks = [] collected_messages = [] @@ -213,12 +197,12 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter): return kwargs async def _achat_completion(self, messages: list[dict]) -> dict: - rsp = await self.llm.ChatCompletion.acreate(**self._cons_kwargs(messages)) + rsp = 
await self.async_retry_call(self.llm.ChatCompletion.acreate, **self._cons_kwargs(messages)) self._update_costs(rsp.get("usage")) return rsp def _chat_completion(self, messages: list[dict]) -> dict: - rsp = self.llm.ChatCompletion.create(**self._cons_kwargs(messages)) + rsp = self.retry_call(self.llm.ChatCompletion.create, **self._cons_kwargs(messages)) self._update_costs(rsp) return rsp @@ -398,4 +382,43 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter): if match: return match.group(1), match.group(2) else: - return None, input_string \ No newline at end of file + return None, input_string + + @staticmethod + async def async_retry_call(func, *args, **kwargs): + for i in range(OpenAIGPTAPI.MAX_TRY): + try: + rsp = await func(*args, **kwargs) + return rsp + except openai.error.RateLimitError as e: + random_time = random.uniform(0, 3) # 生成0到5秒之间的随机时间 + rounded_time = round(random_time, 1) # 保留一位小数,以实现0.1秒的精度 + logger.warning(f"Exception:{e}, sleeping for {rounded_time} seconds") + await asyncio.sleep(rounded_time) + continue + except openai.error.APIConnectionError as e: + logger.warning(f"Exception:{e}") + continue + except Exception as e: + error_str = traceback.format_exc() + logger.error(f"Exception:{e}, stack:{error_str}") + raise e + + @staticmethod + def retry_call(func, *args, **kwargs): + for i in range(OpenAIGPTAPI.MAX_TRY): + try: + rsp = func(*args, **kwargs) + return rsp + except openai.error.RateLimitError as e: + logger.warning(f"Exception:{e}") + continue + except openai.error.APIConnectionError as e: + logger.warning(f"Exception:{e}") + continue + except Exception as e: + error_str = traceback.format_exc() + logger.error(f"Exception:{e}, stack:{error_str}") + raise e + + MAX_TRY = 5 diff --git a/metagpt/roles/assistant.py b/metagpt/roles/assistant.py index dfbd406bc..032d73ca5 100644 --- a/metagpt/roles/assistant.py +++ b/metagpt/roles/assistant.py @@ -15,8 +15,8 @@ """ import asyncio - from metagpt.actions import ActionOutput +from 
metagpt.actions.skill_action import SkillAction, ArgumentsParingAction from metagpt.actions.talk_action import TalkAction from metagpt.config import Config from metagpt.learn.skill_loader import SkillLoader @@ -53,7 +53,7 @@ class Assistant(Role): logger.info(prompt) rsp = await self._llm.aask(prompt, []) logger.info(rsp) - return await self._plan(rsp) + return await self._plan(rsp, last_talk=last_talk) async def act(self) -> ActionOutput: result = await self._rc.todo.run(**self._options) @@ -88,8 +88,18 @@ class Assistant(Role): return True async def skill_handler(self, text, **kwargs) -> bool: - skill = - pass + last_talk = kwargs.get("last_talk") + skill = self.skills.get_skill(text) + if not skill: + logger.info(f"skill not found: {text}") + return await self.talk_handler(text=last_talk, **kwargs) + action = ArgumentsParingAction(options=self.options, skill=skill, llm=self._llm, **kwargs) + await action.run(**kwargs) + if action.args is None: + return await self.talk_handler(text=last_talk, **kwargs) + action = SkillAction(options=self.options, skill=skill, args=action.args, llm=self._llm) + self.add_to_do(action) + return True async def refine_memory(self) -> str: history_text = self.memory.history_text @@ -97,7 +107,7 @@ class Assistant(Role): if history_text == "": return last_talk history_summary = await self._llm.get_context_title(history_text, max_words=20) - if await self._llm.is_related(last_talk, history_summary): # 合并相关内容 + if last_talk and await self._llm.is_related(last_talk, history_summary): # 合并相关内容 last_talk = await self._llm.rewrite(sentence=last_talk, context=history_text) return last_talk @@ -109,11 +119,20 @@ class Assistant(Role): from metagpt.provider.openai_api import OpenAIGPTAPI return OpenAIGPTAPI.extract_info(input_string) + def get_memory(self) -> str: + return self.memory.json() + + def load_memory(self, jsn): + try: + self.memory = BrainMemory(**jsn) + except Exception as e: + logger.exception(f"load error:{e}, data:{jsn}") + async 
def main(): options = Config().runtime_options cost_manager = CostManager(**options) - topic = "dataiku vs. datarobot" + topic = "draw an apple" role = Assistant(options=options, cost_manager=cost_manager, language="Chinese") await role.talk(topic) while True: @@ -121,8 +140,9 @@ async def main(): if not has_action: break msg = await role.act() - print(msg) + logger.info(msg) # 获取用户终端输入 + logger.info("Enter prompt") talk = input("You: ") await role.talk(talk) diff --git a/metagpt/roles/role.py b/metagpt/roles/role.py index 1bb73f884..47f494c69 100644 --- a/metagpt/roles/role.py +++ b/metagpt/roles/role.py @@ -325,4 +325,12 @@ class Role: self._actions.append(act) def add_to_do(self, act): - self._rc.todo = act \ No newline at end of file + self._rc.todo = act + + async def think(self) -> bool: + return await self._think() + + async def act(self) -> ActionOutput: + msg = await self._act() + return ActionOutput(content=msg.content, + instruct_content=msg.instruct_content) diff --git a/metagpt/schema.py b/metagpt/schema.py index e1cd011c6..909313886 100644 --- a/metagpt/schema.py +++ b/metagpt/schema.py @@ -67,6 +67,9 @@ class Message: intersection = set(tags) & self.tags return len(intersection) > 0 + def is_contain(self, tag): + return self.is_contain_tags([tag]) + @dataclass class UserMessage(Message):