Merge pull request #48 from iorisa/feature/talk_prompt

feat: +科大讯飞TTS; update skill spec
2026-07-11 16:22:15 +02:00 · 2023-09-25 20:01:30 +08:00 · 2023-09-25 20:01:30 +08:00 · 2e19f79f47
commit 2e19f79f47
parent 58631ca0ca 56bf0b9b97
11 changed files with 534 additions and 96 deletions
--- a/.well-known/metagpt_oas3_api.yaml
+++ b/.well-known/metagpt_oas3_api.yaml
@ -13,10 +13,17 @@ servers:
 paths:
  /tts/azsure:
    x-prerequisite:
-      - name: AZURE_TTS_SUBSCRIPTION_KEY
-        description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
-      - name: AZURE_TTS_REGION
-        description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+      configurations:
+        AZURE_TTS_SUBSCRIPTION_KEY:
+          type: string
+          description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+        AZURE_TTS_REGION:
+          type: string
+          description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+      required:
+        allOf:
+          - AZURE_TTS_SUBSCRIPTION_KEY
+          - AZURE_TTS_REGION
    post:
      summary: "Convert Text to Base64-encoded .wav File Stream"
      description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
@ -73,10 +80,81 @@ paths:
        '500':
          description: "Internal Server Error"

+  /tts/iflytek:
+    x-prerequisite:
+      configurations:
+        IFLYTEK_APP_ID:
+          type: string
+          description: "Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`"
+        IFLYTEK_API_KEY:
+          type: string
+          description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`"
+        IFLYTEK_API_SECRET:
+          type: string
+          description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`"
+      required:
+        allOf:
+          - IFLYTEK_APP_ID
+          - IFLYTEK_API_KEY
+          - IFLYTEK_API_SECRET
+    post:
+      summary: "Convert Text to Base64-encoded .mp3 File Stream"
+      description: "For more details, check out: [iFlyTek](https://console.xfyun.cn/services/tts)"
+      operationId: iflytek_tts.oas3_iflytek_tts
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - text
+              properties:
+                text:
+                  type: string
+                  description: Text to convert
+                voice:
+                  type: string
+                  description: "Voice style, see: [iFlyTek Text-to_Speech](https://www.xfyun.cn/doc/tts/online_tts/API.html#%E6%8E%A5%E5%8F%A3%E8%B0%83%E7%94%A8%E6%B5%81%E7%A8%8B)"
+                  default: "xiaoyan"
+                app_id:
+                  type: string
+                  description: "Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`"
+                  default: ""
+                api_key:
+                  type: string
+                  description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`"
+                  default: ""
+                api_secret:
+                  type: string
+                  description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`"
+                  default: ""
+      responses:
+        '200':
+          description: "Base64-encoded .mp3 file data if successful, otherwise an empty string."
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  wav_data:
+                    type: string
+                    format: base64
+        '400':
+          description: "Bad Request"
+        '500':
+          description: "Internal Server Error"
+
+
  /txt2img/openai:
    x-prerequisite:
-      - name: OPENAI_API_KEY
-        description: "OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys`"
+      configurations:
+        OPENAI_API_KEY:
+          type: string
+          description: "OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys`"
+      required:
+        allOf:
+          - OPENAI_API_KEY
    post:
      summary: "Convert Text to Base64-encoded Image Data Stream"
      operationId: openai_text_to_image.oas3_openai_text_to_image
@ -116,8 +194,13 @@ paths:
          description: "Internal Server Error"
  /txt2embedding/openai:
    x-prerequisite:
-      - name: OPENAI_API_KEY
-        description: "OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys`"
+      configurations:
+        OPENAI_API_KEY:
+          type: string
+          description: "OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys`"
+      required:
+        allOf:
+          - OPENAI_API_KEY
    post:
      summary: Text to embedding
      operationId: openai_text_to_embedding.oas3_openai_text_to_embedding
@ -158,8 +241,13 @@ paths:

  /txt2image/metagpt:
    x-prerequisite:
-      - name: METAGPT_TEXT_TO_IMAGE_MODEL_URL
-        description: "Model url."
+      configurations:
+        METAGPT_TEXT_TO_IMAGE_MODEL_URL:
+          type: string
+          description: "Model url."
+      required:
+        allOf:
+          - METAGPT_TEXT_TO_IMAGE_MODEL_URL
    post:
      summary: "Text to Image"
      description: "Generate an image from the provided text using the MetaGPT Text-to-Image API."
--- a/.well-known/skills.yaml
+++ b/.well-known/skills.yaml
@ -1,66 +1,161 @@
+skillapi: "0.1.0"
+
+info:
+  title: "Agent Skill Specification"
+  version: "1.0"
+
 entities:
  Assistant:
-     skills:
-     - name: text_to_speech
-       description: Text-to-speech
-       id: text_to_speech.text_to_speech
-       x-prerequisite:
-         - name: AZURE_TTS_SUBSCRIPTION_KEY
-           description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
-         - name: AZURE_TTS_REGION
-           description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
-       arguments:
-         text: 'The text used for voice conversion. Required.'
-         lang: 'The value can contain a language code such as en (English), or a locale such as en-US (English - United States). The optional parameter are "English", "Chinese". Default value: "Chinese".'
-         voice: 'Default value: "zh-CN-XiaomoNeural".'
-         style: 'Speaking style to express different emotions like cheerfulness, empathy, and calm. The optional parameter values are "affectionate", "angry", "calm", "cheerful", "depressed", "disgruntled", "embarrassed", "envious", "fearful", "gentle", "sad", "serious". Default value: "affectionate".'
-         role: 'With roles, the same voice can act as a different age and gender. The optional parameter values are "Girl", "Boy", "OlderAdultFemale", "OlderAdultMale", "SeniorFemale", "SeniorMale", "YoungAdultFemale", "YoungAdultMale". Default value: "Girl".'
-       examples:
-         - ask: 'A girl says "hello world"'
-           answer: 'text_to_speech(text="hello world", role="Girl")'
-         - ask: 'A boy affectionate says "hello world"'
-           answer: 'text_to_speech(text="hello world", role="Boy", style="affectionate")'
-         - ask: 'A boy says "你好"'
-           answer: 'text_to_speech(text="hello world", role="Boy", lang="Chinese")'
-       returns:
-         type: string
-         format: base64
+    summary: assistant
+    description: assistant
+    skills:
+      - name: text_to_speech
+        description: Generate a voice file from the input text, text-to-speech
+        id: text_to_speech.text_to_speech
+        x-prerequisite:
+          configurations:
+            AZURE_TTS_SUBSCRIPTION_KEY:
+              type: string
+              description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+            AZURE_TTS_REGION:
+              type: string
+              description: "For more details, check out: [Azure Text-to_Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+            IFLYTEK_APP_ID:
+              type: string
+              description: "Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`"
+            IFLYTEK_API_KEY:
+              type: string
+              description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`"
+            IFLYTEK_API_SECRET:
+              type: string
+              description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`"
+          required:
+            oneOf:
+              - allOf:
+                - AZURE_TTS_SUBSCRIPTION_KEY
+                - AZURE_TTS_REGION
+              - allOf:
+                - IFLYTEK_APP_ID
+                - IFLYTEK_API_KEY
+                - IFLYTEK_API_SECRET
+        parameters:
+          text:
+            description: 'The text used for voice conversion.'
+            required: true
+            type: string
+          lang:
+            description: 'The value can contain a language code such as en (English), or a locale such as en-US (English - United States).'
+            type: string
+            enum:
+              - English
+              - Chinese
+            default: Chinese
+          voice:
+            description: Name of voice styles
+            type: string
+            default: zh-CN-XiaomoNeural
+          style:
+            type: string
+            description: Speaking style to express different emotions like cheerfulness, empathy, and calm.
+            enum:
+              - affectionate
+              - angry
+              - calm
+              - cheerful
+              - depressed
+              - disgruntled
+              - embarrassed
+              - envious
+              - fearful
+              - gentle
+              - sad
+              - serious
+            default: affectionate
+          role:
+            type: string
+            description: With roles, the same voice can act as a different age and gender.
+            enum:
+              - Girl
+              - Boy
+              - OlderAdultFemale
+              - OlderAdultMale
+              - SeniorFemale
+              - SeniorMale
+              - YoungAdultFemale
+              - YoungAdultMale
+            default: Girl
+        examples:
+           - ask: 'A girl says "hello world"'
+             answer: 'text_to_speech(text="hello world", role="Girl")'
+           - ask: 'A boy affectionate says "hello world"'
+             answer: 'text_to_speech(text="hello world", role="Boy", style="affectionate")'
+           - ask: 'A boy says "你好"'
+             answer: 'text_to_speech(text="你好", role="Boy", lang="Chinese")'
+        returns:
+          type: string
+          format: base64

-     - name: text_to_image
-       description: Create a drawing based on the text.
-       id: text_to_image.text_to_image
-       x-prerequisite:
-       - name: OPENAI_API_KEY
-         description: "OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys`"
-       - name: METAGPT_TEXT_TO_IMAGE_MODEL_URL
-         description: "Model url."
-       arguments:
-         text: 'The text used for image conversion. Required.'
-         size_type: 'Default value: "512x512".'
-       examples:
-         - ask: 'Draw a girl'
-           answer: 'text_to_image(text="Draw a girl", size_type="512x512")'
-         - ask: 'Draw an apple'
-           answer: 'text_to_image(text="Draw an apple", size_type="512x512")'
-       returns:
-         type: string
-         format: base64
+      - name: text_to_image
+        description: Create a drawing based on the text.
+        id: text_to_image.text_to_image
+        x-prerequisite:
+          configurations:
+            OPENAI_API_KEY:
+              type: string
+              description: "OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys`"
+            METAGPT_TEXT_TO_IMAGE_MODEL_URL:
+              type: string
+              description: "Model url."
+          required:
+            oneOf:
+              - OPENAI_API_KEY
+              - METAGPT_TEXT_TO_IMAGE_MODEL_URL
+        parameters:
+          text:
+            description: 'The text used for image conversion.'
+            type: string
+            required: true
+          size_type:
+            description: size type
+            type: string
+            default: "512x512"
+        examples:
+          - ask: 'Draw a girl'
+            answer: 'text_to_image(text="Draw a girl", size_type="512x512")'
+          - ask: 'Draw an apple'
+            answer: 'text_to_image(text="Draw an apple", size_type="512x512")'
+        returns:
+          type: string
+          format: base64

-     - name: web_search
-       description: Perform Google searches to provide real-time information.
-       id: web_search.web_search
-       x-prerequisite:
-       - name: SEARCH_ENGINE
-         description: "Supported values: serpapi/google/serper/ddg"
-       - name: SERPER_API_KEY
-         description: "SERPER API KEY, For more details, checkout: `https://serper.dev/api-key`"
-       arguments:
-         query: 'The search query. Required.'
-         max_results: 'The number of search results to retrieve. Default value: 6.'
-       examples:
-         - ask: 'Search for information about artificial intelligence'
-           answer: 'web_search(query="Search for information about artificial intelligence", max_results=6)'
-         - ask: 'Find news articles about climate change'
-           answer: 'web_search(query="Find news articles about climate change", max_results=6)'
-       returns:
-         type: string
+      - name: web_search
+        description: Perform Google searches to provide real-time information.
+        id: web_search.web_search
+        x-prerequisite:
+          configurations:
+            SEARCH_ENGINE:
+              type: string
+              description: "Supported values: serpapi/google/serper/ddg"
+            SERPER_API_KEY:
+              type: string
+              description: "SERPER API KEY, For more details, checkout: `https://serper.dev/api-key`"
+          required:
+            allOf:
+              - SEARCH_ENGINE
+              - SERPER_API_KEY
+        parameters:
+          query:
+            type: string
+            description: 'The search query.'
+            required: true
+          max_results:
+            type: number
+            default: 6
+            description: 'The number of search results to retrieve.'
+        examples:
+          - ask: 'Search for information about artificial intelligence'
+            answer: 'web_search(query="Search for information about artificial intelligence", max_results=6)'
+          - ask: 'Find news articles about climate change'
+            answer: 'web_search(query="Find news articles about climate change", max_results=6)'
+        returns:
+          type: string
--- a/metagpt/actions/talk_action.py
+++ b/metagpt/actions/talk_action.py
@ -50,7 +50,7 @@ class TalkAction(Action):
        return prompt

    @property
-    def prompt_bad(self):
+    def prompt_gpt4(self):
        kvs = {
            "{role}": CONFIG.agent_description or "",
            "{history}": self._history_summary or "",
--- a/metagpt/learn/skill_loader.py
+++ b/metagpt/learn/skill_loader.py
@ -25,29 +25,43 @@ class Returns(BaseModel):
    format: Optional[str] = None


-class Prerequisite(BaseModel):
-    name: str
-    type: Optional[str] = None
-    description: Optional[str] = None
-    default: Optional[str] = None
+class Parameter(BaseModel):
+    """Declaration of one skill parameter, as listed under a skill's `parameters` mapping."""
+
+    type: str
+    # Free-text explanation of the parameter; None when the declaration omits it.
+    description: Optional[str] = None


 class Skill(BaseModel):
    name: str
-    description: str
-    id: str
-    x_prerequisite: Optional[List[Prerequisite]] = Field(default=None, alias="x-prerequisite")
-    arguments: Dict
+    description: str = None
+    id: str = None
+    x_prerequisite: Dict = Field(default=None, alias="x-prerequisite")
+    parameters: Dict[str, Parameter] = None
    examples: List[Example]
    returns: Returns

+    @property
+    def arguments(self) -> Dict:
+        """Map every declared parameter name to its description ("" when it has none)."""
+        declared = self.parameters or {}
+        return {key: (param.description or "") for key, param in declared.items()}

-class EntitySkills(BaseModel):
+
+class Entity(BaseModel):
+    name: str = None
    skills: List[Skill]


+class Components(BaseModel):
+    """Placeholder model for the optional `components` section; it declares no fields yet."""
+
+    pass
+
+
 class SkillsDeclaration(BaseModel):
-    entities: Dict[str, EntitySkills]
+    skillapi: str
+    entities: Dict[str, Entity]
+    components: Components = None


 class SkillLoader:
@ -60,8 +74,8 @@ class SkillLoader:

    def get_skill_list(self, entity_name: str = "Assistant") -> Dict:
        """Return the skill name based on the skill description."""
-        entity_skills = self.get_entity(entity_name)
-        if not entity_skills:
+        entity = self.get_entity(entity_name)
+        if not entity:
            return {}

        agent_skills = CONFIG.agent_skills
@ -73,7 +87,7 @@ class SkillLoader:

        names = [AgentSkill(**i).name for i in agent_skills]
        description_to_name_mappings = {}
-        for s in entity_skills.skills:
+        for s in entity.skills:
            if s.name not in names:
                continue
            description_to_name_mappings[s.description] = s.name
@ -89,8 +103,21 @@ class SkillLoader:
            if sk.name == name:
                return sk

-    def get_entity(self, name) -> EntitySkills:
+    def get_entity(self, name) -> Entity:
        """Return a list of skills for the entity."""
        if not self._skills:
            return None
        return self._skills.entities.get(name)
+
+
+if __name__ == "__main__":
+    # Manual check: enable a sample set of agent skills and print the description -> name mapping
+    # for the skills that the default entity declares.
+    CONFIG.agent_skills = [
+        {"id": 1, "name": "text_to_speech", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 2, "name": "text_to_image", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 3, "name": "ai_call", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 4, "name": "data_analysis", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 5, "name": "crawler", "type": "builtin", "config": {"engine": "ddg"}, "enabled": True},
+        {"id": 6, "name": "knowledge", "type": "builtin", "config": {}, "enabled": True},
+    ]
+    loader = SkillLoader()
+    print(loader.get_skill_list())
--- a/metagpt/learn/text_to_speech.py
+++ b/metagpt/learn/text_to_speech.py
@ -11,6 +11,7 @@ import openai
 from metagpt.config import CONFIG
 from metagpt.const import BASE64_FORMAT
 from metagpt.tools.azure_tts import oas3_azsure_tts
+from metagpt.tools.iflytek_tts import oas3_iflytek_tts
 from metagpt.utils.s3 import S3


@ -22,6 +23,9 @@ async def text_to_speech(
    role="Girl",
    subscription_key="",
    region="",
+    iflytek_app_id="",
+    iflytek_api_key="",
+    iflytek_api_secret="",
    **kwargs,
 ):
    """Text to speech
@ -34,16 +38,35 @@ async def text_to_speech(
    :param text: The text used for voice conversion.
    :param subscription_key: key is used to access your Azure AI service API, see: `https://portal.azure.com/` > `Resource Management` > `Keys and Endpoint`
    :param region: This is the location (or region) of your resource. You may need to use this field when making calls to this API.
-    :return: Returns the Base64-encoded .wav file data if successful, otherwise an empty string.
+    :param iflytek_app_id: Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`
+    :param iflytek_api_key: WebAPI argument, see: `https://console.xfyun.cn/services/tts`
+    :param iflytek_api_secret: WebAPI argument, see: `https://console.xfyun.cn/services/tts`
+    :return: Returns the Base64-encoded .wav/.mp3 file data if successful, otherwise an empty string.

    """
-    audio_declaration = "data:audio/wav;base64,"
+
    if (CONFIG.AZURE_TTS_SUBSCRIPTION_KEY and CONFIG.AZURE_TTS_REGION) or (subscription_key and region):
+        audio_declaration = "data:audio/wav;base64,"
        base64_data = await oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region)
        s3 = S3()
        url = await s3.cache(data=base64_data, file_ext=".wav", format=BASE64_FORMAT)
        if url:
            return f"[{text}]({url})"
        return audio_declaration + base64_data if base64_data else base64_data
+    if (CONFIG.IFLYTEK_APP_ID and CONFIG.IFLYTEK_API_KEY and CONFIG.IFLYTEK_API_SECRET) or (
+        iflytek_app_id and iflytek_api_key and iflytek_api_secret
+    ):
+        audio_declaration = "data:audio/mp3;base64,"
+        base64_data = await oas3_iflytek_tts(
+            text=text, app_id=iflytek_app_id, api_key=iflytek_api_key, api_secret=iflytek_api_secret
+        )
+        s3 = S3()
+        url = await s3.cache(data=base64_data, file_ext=".mp3", format=BASE64_FORMAT)
+        if url:
+            return f"[{text}]({url})"
+        return audio_declaration + base64_data if base64_data else base64_data

-    raise openai.error.InvalidRequestError(message="AZURE_TTS_SUBSCRIPTION_KEY and AZURE_TTS_REGION error", param={})
+    raise openai.error.InvalidRequestError(
+        message="AZURE_TTS_SUBSCRIPTION_KEY, AZURE_TTS_REGION, IFLYTEK_APP_ID, IFLYTEK_API_KEY, IFLYTEK_API_SECRET error",
+        param={},
+    )
--- a/metagpt/provider/openai_api.py
+++ b/metagpt/provider/openai_api.py
@ -228,4 +228,4 @@ class OpenAIGPTAPI(BaseGPTAPI, RateLimiter):
        from metagpt.memory.brain_memory import BrainMemory

        memory = BrainMemory(llm_type=LLMType.OPENAI.value, historical_summary=text, cacheable=False)
-        return await memory.summarize(llm=self, max_length=max_words, keep_language=keep_language)
+        return await memory.summarize(llm=self, max_words=max_words, keep_language=keep_language)
--- a/metagpt/tools/iflytek_tts.py
+++ b/metagpt/tools/iflytek_tts.py
@ -0,0 +1,162 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/17
+@Author  : mashenquan
+@File    : iflytek_tts.py
+@Desc    : iFLYTEK TTS OAS3 api, which provides text-to-speech functionality
+"""
+import asyncio
+import base64
+import hashlib
+import hmac
+import json
+import uuid
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from time import mktime
+from typing import Optional
+from urllib.parse import urlencode
+from wsgiref.handlers import format_date_time
+
+import aiofiles
+import websockets as websockets
+from pydantic import BaseModel
+
+from metagpt.config import CONFIG
+from metagpt.logs import logger
+
+
+class IFlyTekTTSStatus(Enum):
+    """Frame status codes; `synthesize_speech` stops reading once STATUS_LAST_FRAME is seen."""
+
+    STATUS_FIRST_FRAME = 0  # The first frame
+    STATUS_CONTINUE_FRAME = 1  # The intermediate frame
+    STATUS_LAST_FRAME = 2  # The last frame
+
+
+class AudioData(BaseModel):
+    """Audio payload carried in the `data` field of an `IFlyTekTTSResponse`."""
+
+    audio: str  # audio chunk as text; written verbatim to the output file by `synthesize_speech`
+    status: int  # frame status, compared against `IFlyTekTTSStatus` values
+    ced: str  # NOTE(review): presumably the synthesis progress marker of the iFlyTek API -- confirm
+
+
+class IFlyTekTTSResponse(BaseModel):
+    """One JSON message received from the iFlyTek TTS websocket."""
+
+    code: int
+    message: str
+    data: Optional[AudioData] = None  # a message without data ends the receive loop
+    sid: str
+
+
+DEFAULT_IFLYTEK_VOICE = "xiaoyan"
+
+
+class IFlyTekTTS(object):
+    """Minimal client for the iFlyTek online text-to-speech websocket API."""
+
+    def __init__(self, app_id: str, api_key: str, api_secret: str):
+        """
+        Any empty argument falls back to the matching `CONFIG.IFLYTEK_*` setting.
+
+        :param app_id: Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`
+        :param api_key: WebAPI argument, see: `https://console.xfyun.cn/services/tts`
+        :param api_secret: WebAPI argument, see: `https://console.xfyun.cn/services/tts`
+        """
+        self.app_id = app_id or CONFIG.IFLYTEK_APP_ID
+        self.api_key = api_key or CONFIG.IFLYTEK_API_KEY
+        # Read the IFLYTEK_API_SECRET setting, the same one `oas3_iflytek_tts` falls back to.
+        self.api_secret = api_secret or CONFIG.IFLYTEK_API_SECRET
+
+    async def synthesize_speech(self, text, output_file: str, voice=DEFAULT_IFLYTEK_VOICE):
+        """Request speech for `text` and write the received audio chunks to `output_file`.
+
+        The whole text is sent in a single request frame. Response frames are then read until
+        one reports the last-frame status or carries no data.
+
+        :param text: The text used for voice conversion.
+        :param output_file: Path of the file that receives the `audio` text of every frame, in arrival order.
+        :param voice: Value sent as the `vcn` business parameter.
+        """
+        url = self._create_url()
+        data = {
+            "common": {"app_id": self.app_id},
+            "business": {"aue": "lame", "sfl": 1, "auf": "audio/L16;rate=16000", "vcn": voice, "tte": "utf8"},
+            "data": {"status": 2, "text": str(base64.b64encode(text.encode("utf-8")), "UTF8")},
+        }
+        req = json.dumps(data)
+        async with websockets.connect(url) as websocket:
+            # send request
+            await websocket.send(req)
+
+            # receive frames
+            # NOTE(review): chunks are appended as text. If every frame is Base64-encoded on its own
+            # (with padding), the concatenation may not decode as one Base64 string -- confirm.
+            async with aiofiles.open(str(output_file), "w") as writer:
+                while True:
+                    v = await websocket.recv()
+                    rsp = IFlyTekTTSResponse(**json.loads(v))
+                    if rsp.data:
+                        await writer.write(rsp.data.audio)
+                        if rsp.data.status != IFlyTekTTSStatus.STATUS_LAST_FRAME.value:
+                            continue
+                    else:
+                        # A frame without audio ends the stream early; record why instead of stopping silently.
+                        logger.error(f"iFlyTek TTS returned no audio: code={rsp.code}, message={rsp.message}, sid={rsp.sid}")
+                    break
+
+    def _create_url(self):
+        """Create the request url, including the signed `authorization`, `date` and `host` query parameters."""
+        url = "wss://tts-api.xfyun.cn/v2/tts"
+        # Generate a timestamp in RFC1123 format
+        now = datetime.now()
+        date = format_date_time(mktime(now.timetuple()))
+
+        # NOTE(review): the signed host is "ws-api.xfyun.cn" while the endpoint host is
+        # "tts-api.xfyun.cn"; left unchanged -- confirm against the iFlyTek authentication docs.
+        signature_origin = "host: " + "ws-api.xfyun.cn" + "\n"
+        signature_origin += "date: " + date + "\n"
+        signature_origin += "GET " + "/v2/tts " + "HTTP/1.1"
+        # Sign the string with HMAC-SHA256 keyed by the API secret, then Base64-encode the digest
+        signature_sha = hmac.new(
+            self.api_secret.encode("utf-8"), signature_origin.encode("utf-8"), digestmod=hashlib.sha256
+        ).digest()
+        signature_sha = base64.b64encode(signature_sha).decode(encoding="utf-8")
+
+        authorization_origin = 'api_key="%s", algorithm="%s", headers="%s", signature="%s"' % (
+            self.api_key,
+            "hmac-sha256",
+            "host date request-line",
+            signature_sha,
+        )
+        authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode(encoding="utf-8")
+        # Combine the authentication parameters of the request into a dictionary.
+        v = {"authorization": authorization, "date": date, "host": "ws-api.xfyun.cn"}
+        # Concatenate the authentication parameters to generate the URL.
+        url = url + "?" + urlencode(v)
+        return url
+
+
+# Export
+async def oas3_iflytek_tts(text: str, voice: str = "", app_id: str = "", api_key: str = "", api_secret: str = ""):
+    """Text to speech
+    For more details, check out:`https://www.xfyun.cn/doc/tts/online_tts/API.html`
+
+    Empty credential arguments fall back to the `CONFIG.IFLYTEK_*` settings. The audio is staged
+    in a temporary file next to this module, which is removed before returning.
+
+    :param voice: Default `xiaoyan`. For more details, checkout: `https://www.xfyun.cn/doc/tts/online_tts/API.html#%E6%8E%A5%E5%8F%A3%E8%B0%83%E7%94%A8%E6%B5%81%E7%A8%8B`
+    :param text: The text used for voice conversion.
+    :param app_id: Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`
+    :param api_key: WebAPI argument, see: `https://console.xfyun.cn/services/tts`
+    :param api_secret: WebAPI argument, see: `https://console.xfyun.cn/services/tts`
+    :return: Returns the Base64-encoded .mp3 file data if successful, otherwise an empty string.
+
+    """
+    if not app_id:
+        app_id = CONFIG.IFLYTEK_APP_ID
+    if not api_key:
+        api_key = CONFIG.IFLYTEK_API_KEY
+    if not api_secret:
+        api_secret = CONFIG.IFLYTEK_API_SECRET
+    if not voice:
+        voice = CONFIG.IFLYTEK_VOICE or DEFAULT_IFLYTEK_VOICE
+
+    filename = Path(__file__).parent / (uuid.uuid4().hex + ".mp3")
+    try:
+        tts = IFlyTekTTS(app_id=app_id, api_key=api_key, api_secret=api_secret)
+        await tts.synthesize_speech(text=text, output_file=str(filename), voice=voice)
+        async with aiofiles.open(str(filename), mode="r") as reader:
+            base64_string = await reader.read()
+    except Exception as e:
+        logger.error(f"text:{text}, error:{e}")
+        base64_string = ""
+    finally:
+        # The file only exists if synthesis got as far as opening it (e.g. not when the websocket
+        # connection fails); an unconditional unlink() would raise FileNotFoundError from here
+        # and defeat the "empty string on failure" contract.
+        if filename.exists():
+            filename.unlink()
+
+    return base64_string
+
+
+if __name__ == "__main__":
+    # Manual smoke test. Credentials are intentionally not passed here: `oas3_iflytek_tts` falls
+    # back to CONFIG.IFLYTEK_APP_ID / IFLYTEK_API_KEY / IFLYTEK_API_SECRET, so no real keys
+    # need to be committed to source control.
+    asyncio.run(oas3_iflytek_tts(text="你好，hello"))
--- a/metagpt/tools/metagpt_oas3_api_svc.py
+++ b/metagpt/tools/metagpt_oas3_api_svc.py
@ -7,8 +7,8 @@
@Desc    : MetaGPT OpenAPI Specification 3.0 REST API service
 """
 import asyncio
-from pathlib import Path
 import sys
+from pathlib import Path

 import connexion

@ -17,7 +17,7 @@ sys.path.append(str(Path(__file__).resolve().parent.parent.parent))  # fix-bug:

 def oas_http_svc():
    """Start the OAS 3.0 OpenAPI HTTP service"""
-    app = connexion.AioHttpApp(__name__, specification_dir='../../.well-known/')
+    app = connexion.AioHttpApp(__name__, specification_dir="../../.well-known/")
    app.add_api("metagpt_oas3_api.yaml")
    app.add_api("openapi.yaml")
    app.run(port=8080)
@ -35,6 +35,7 @@ async def async_main():


 def main():
+    print("http://localhost:8080/oas3/ui/")
    oas_http_svc()


--- a/metagpt/utils/s3.py
+++ b/metagpt/utils/s3.py
@ -132,7 +132,7 @@ class S3:

    async def cache(self, data: str, file_ext: str, format: str = "") -> str:
        """Save data to remote S3 and return url"""
-        object_name = str(uuid.uuid4()).replace("-", "") + file_ext
+        object_name = uuid.uuid4().hex + file_ext
        path = Path(__file__).parent
        pathname = path / object_name
        try:
--- a/requirements.txt
+++ b/requirements.txt
@ -42,4 +42,5 @@ connexion[swagger-ui]
 aiohttp_jinja2
 azure-cognitiveservices-speech==1.31.0
 aioboto3~=11.3.0
-redis==4.3.5
+redis==4.3.5
+websocket-client==1.6.2
+# metagpt/tools/iflytek_tts.py imports `websockets` (asyncio client), a different package from websocket-client
+websockets~=11.0
--- a/tests/metagpt/learn/test_skill_loader.py
+++ b/tests/metagpt/learn/test_skill_loader.py
@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/9/19
+@Author  : mashenquan
+@File    : test_skill_loader.py
+@Desc    : Unit tests.
+"""
+
+from metagpt.config import CONFIG
+from metagpt.learn.skill_loader import SkillLoader
+
+
+def test_suite():
+    """Enable a sample set of agent skills and check that the loader exposes the declared ones.
+
+    Verifies that `get_skill_list` returns non-empty description/name pairs and that every skill
+    of the "Assistant" entity reports a non-empty `arguments` mapping.
+    """
+    CONFIG.agent_skills = [
+        {"id": 1, "name": "text_to_speech", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 2, "name": "text_to_image", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 3, "name": "ai_call", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 4, "name": "data_analysis", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 5, "name": "crawler", "type": "builtin", "config": {"engine": "ddg"}, "enabled": True},
+        {"id": 6, "name": "knowledge", "type": "builtin", "config": {}, "enabled": True},
+        {"id": 7, "name": "web_search", "type": "builtin", "config": {}, "enabled": True},
+    ]
+    loader = SkillLoader()
+    skills = loader.get_skill_list()
+    assert skills
+    assert len(skills) >= 3
+    for desc, name in skills.items():
+        assert desc
+        assert name
+
+    entity = loader.get_entity("Assistant")
+    assert entity
+    assert entity.skills
+    for sk in entity.skills:
+        assert sk
+        assert sk.arguments
+
+
+if __name__ == "__main__":
+    test_suite()