diff --git a/.well-known/metagpt_oas3_api.yaml b/.well-known/metagpt_oas3_api.yaml index 56c6f42d5..1e3cecb10 100644 --- a/.well-known/metagpt_oas3_api.yaml +++ b/.well-known/metagpt_oas3_api.yaml @@ -73,6 +73,63 @@ paths: '500': description: "Internal Server Error" + /tts/iflytek: + x-prerequisite: + - name: IFLYTEK_APP_ID + description: "Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`" + - name: IFLYTEK_API_KEY + description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`" + - name: IFLYTEK_API_SECRET + description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`" + post: + summary: "Convert Text to Base64-encoded .mp3 File Stream" + description: "For more details, check out: [iFlyTek](https://console.xfyun.cn/services/tts)" + operationId: iflytek_tts.oas3_iflytek_tts + requestBody: + required: true + content: + application/json: + schema: + type: object + required: + - text + properties: + text: + type: string + description: Text to convert + voice: + type: string + description: "Voice style, see: [iFlyTek Text-to-Speech](https://www.xfyun.cn/doc/tts/online_tts/API.html#%E6%8E%A5%E5%8F%A3%E8%B0%83%E7%94%A8%E6%B5%81%E7%A8%8B)" + default: "xiaoyan" + app_id: + type: string + description: "Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts`" + default: "" + api_key: + type: string + description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`" + default: "" + api_secret: + type: string + description: "WebAPI argument, see: `https://console.xfyun.cn/services/tts`" + default: "" + responses: + '200': + description: "Base64-encoded .mp3 file data if successful, otherwise an empty string." 
+ content: + application/json: + schema: + type: object + properties: + wav_data: + type: string + format: base64 + '400': + description: "Bad Request" + '500': + description: "Internal Server Error" + + /txt2img/openai: x-prerequisite: - name: OPENAI_API_KEY diff --git a/metagpt/learn/text_to_speech.py b/metagpt/learn/text_to_speech.py index 81bc8512b..7c085c02f 100644 --- a/metagpt/learn/text_to_speech.py +++ b/metagpt/learn/text_to_speech.py @@ -11,6 +11,7 @@ import openai from metagpt.config import CONFIG from metagpt.const import BASE64_FORMAT from metagpt.tools.azure_tts import oas3_azsure_tts +from metagpt.tools.iflytek_tts import oas3_iflytek_tts from metagpt.utils.s3 import S3 @@ -22,6 +23,9 @@ async def text_to_speech( role="Girl", subscription_key="", region="", + iflytek_app_id="", + iflytek_api_key="", + iflytek_api_secret="", **kwargs, ): """Text to speech @@ -34,16 +38,35 @@ async def text_to_speech( :param text: The text used for voice conversion. :param subscription_key: key is used to access your Azure AI service API, see: `https://portal.azure.com/` > `Resource Management` > `Keys and Endpoint` :param region: This is the location (or region) of your resource. You may need to use this field when making calls to this API. - :return: Returns the Base64-encoded .wav file data if successful, otherwise an empty string. + :param iflytek_app_id: Application ID is used to access your iFlyTek service API, see: `https://console.xfyun.cn/services/tts` + :param iflytek_api_key: WebAPI argument, see: `https://console.xfyun.cn/services/tts` + :param iflytek_api_secret: WebAPI argument, see: `https://console.xfyun.cn/services/tts` + :return: Returns the Base64-encoded .wav/.mp3 file data if successful, otherwise an empty string. 
""" - audio_declaration = "data:audio/wav;base64," + if (CONFIG.AZURE_TTS_SUBSCRIPTION_KEY and CONFIG.AZURE_TTS_REGION) or (subscription_key and region): + audio_declaration = "data:audio/wav;base64," base64_data = await oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region) s3 = S3() url = await s3.cache(data=base64_data, file_ext=".wav", format=BASE64_FORMAT) if url: return f"[{text}]({url})" return audio_declaration + base64_data if base64_data else base64_data + if (CONFIG.IFLYTEK_APP_ID and CONFIG.IFLYTEK_API_KEY and CONFIG.IFLYTEK_API_SECRET) or ( + iflytek_app_id and iflytek_api_key and iflytek_api_secret + ): + audio_declaration = "data:audio/mp3;base64," + base64_data = await oas3_iflytek_tts( + text=text, app_id=iflytek_app_id, api_key=iflytek_api_key, api_secret=iflytek_api_secret + ) + s3 = S3() + url = await s3.cache(data=base64_data, file_ext=".mp3", format=BASE64_FORMAT) + if url: + return f"[{text}]({url})" + return audio_declaration + base64_data if base64_data else base64_data - raise openai.error.InvalidRequestError(message="AZURE_TTS_SUBSCRIPTION_KEY and AZURE_TTS_REGION error", param={}) + raise openai.error.InvalidRequestError( + message="AZURE_TTS_SUBSCRIPTION_KEY, AZURE_TTS_REGION, IFLYTEK_APP_ID, IFLYTEK_API_KEY, IFLYTEK_API_SECRET error", + param={}, + ) diff --git a/requirements.txt b/requirements.txt index 588b29e0b..2dd767026 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,4 +42,5 @@ connexion[swagger-ui] aiohttp_jinja2 azure-cognitiveservices-speech==1.31.0 aioboto3~=11.3.0 -redis==4.3.5 \ No newline at end of file +redis==4.3.5 +websocket-client \ No newline at end of file