feat: merge feature/skills

2026-07-26 17:11:07 +02:00 · 2023-08-18 20:31:16 +08:00 · 2023-08-18 20:31:16 +08:00 · fb137b018c
commit fb137b018c
parent 88e7698e72 df5a50f6e6
22 changed files with 1018 additions and 54 deletions
--- a/.gitignore
+++ b/.gitignore
@ -167,3 +167,5 @@ output.wav

 # output folder
 output
+tmp.png
+
--- a/.well-known/ai-plugin.json
+++ b/.well-known/ai-plugin.json
@ -0,0 +1,18 @@
+{
+  "schema_version": "v1",
+  "name_for_model": "text processing tools",
+  "name_for_human": "MetaGPT Text Plugin",
+  "description_for_model": "Plugins for text processing, including text-to-speech, text-to-image, text-to-embedding, text summarization, text-to-code, vector similarity calculation, web content crawling, and more.",
+  "description_for_human": "Plugins for text processing, including text-to-speech, text-to-image, text-to-embedding, text summarization, text-to-code, vector similarity calculation, web content crawling, and more.",
+  "auth": {
+    "type": "none"
+  },
+  "api": {
+    "type": "openapi",
+    "url": "https://github.com/iorisa/MetaGPT/blob/feature/oas3/.well-known/metagpt_oas3_api.yaml",
+    "has_user_authentication": false
+  },
+  "logo_url": "https://github.com/iorisa/MetaGPT/blob/feature/oas3/docs/resources/MetaGPT-logo.png",
+  "contact_email": "mashenquan@fuzhi.cn",
+  "legal_info_url": "https://github.com/iorisa/MetaGPT/blob/feature/oas3/docs/README_CN.md"
+}
--- a/.well-known/metagpt_oas3_api.yaml
+++ b/.well-known/metagpt_oas3_api.yaml
@ -0,0 +1,236 @@
+openapi: "3.0.0"
+
+info:
+  title: "MetaGPT Export OpenAPIs"
+  version: "1.0"
+servers:
+  - url: "/oas3"
+    variables:
+      port:
+        default: '8080'
+        description: HTTP service port
+
+paths:
+  /tts/azsure:
+    post:
+      summary: "Convert Text to Base64-encoded .wav File Stream"
+      description: "For more details, check out: [Azure Text-to-Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+      operationId: azure_tts.oas3_azsure_tts
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - text
+              properties:
+                text:
+                  type: string
+                  description: Text to convert
+                lang:
+                  type: string
+                  description: The language code or locale, e.g., en-US (English - United States)
+                  default: "zh-CN"
+                voice:
+                  type: string
+                  description: "Voice style, see: [Azure Text-to-Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts), [Voice Gallery](https://speech.microsoft.com/portal/voicegallery)"
+                  default: "zh-CN-XiaomoNeural"
+                style:
+                  type: string
+                  description: "Speaking style to express different emotions. For more details, check out: [Azure Text-to-Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+                  default: "affectionate"
+                role:
+                  type: string
+                  description: "Role to specify age and gender. For more details, check out: [Azure Text-to-Speech](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)"
+                  default: "Girl"
+                subscription_key:
+                  type: string
+                  description: "Key used to access Azure AI service API, see: [Azure Portal](https://portal.azure.com/) > `Resource Management` > `Keys and Endpoint`"
+                  default: ""
+                region:
+                  type: string
+                  description: "Location (or region) of your resource, see: [Azure Portal](https://portal.azure.com/) > `Resource Management` > `Keys and Endpoint`"
+                  default: ""
+      responses:
+        '200':
+          description: "Base64-encoded .wav file data if successful, otherwise an empty string."
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  wav_data:
+                    type: string
+                    format: base64
+        '400':
+          description: "Bad Request"
+        '500':
+          description: "Internal Server Error"
+
+  /txt2img/openai:
+    post:
+      summary: "Convert Text to Base64-encoded Image Data Stream"
+      operationId: openai_text_to_image.oas3_openai_text_to_image
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                text:
+                  type: string
+                  description: "The text used for image conversion."
+                size_type:
+                  type: string
+                  enum: ["256x256", "512x512", "1024x1024"]
+                  default: "1024x1024"
+                  description: "Size of the generated image."
+                openai_api_key:
+                  type: string
+                  default: ""
+                  description: "OpenAI API key. For more details, check out: `https://platform.openai.com/account/api-keys`"
+      responses:
+        '200':
+          description: "Base64-encoded image data."
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  image_data:
+                    type: string
+                    format: base64
+        '400':
+          description: "Bad Request"
+        '500':
+          description: "Internal Server Error"
+  /txt2embedding/openai:
+    post:
+      summary: Text to embedding
+      operationId: openai_text_to_embedding.oas3_openai_text_to_embedding
+      description: Retrieve an embedding for the provided text using the OpenAI API.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                input:
+                  type: string
+                  description: The text used for embedding.
+                model:
+                  type: string
+                  description: "ID of the model to use. For more details, check out: [models](https://api.openai.com/v1/models)"
+                  enum:
+                    - text-embedding-ada-002
+      responses:
+        "200":
+          description: Successful response
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ResultEmbedding"
+        "4XX":
+          description: Client error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "5XX":
+          description: Server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+  /txt2image/metagpt:
+    post:
+      summary: "Text to Image"
+      description: "Generate an image from the provided text using the MetaGPT Text-to-Image API."
+      operationId: metagpt_text_to_image.oas3_metagpt_text_to_image
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - text
+              properties:
+                text:
+                  type: string
+                  description: "The text used for image conversion."
+                size_type:
+                  type: string
+                  enum: ["512x512", "512x768"]
+                  default: "512x512"
+                  description: "Size of the generated image."
+                model_url:
+                  type: string
+                  description: "Model REST API URL for text-to-image."
+                  default: ""
+      responses:
+        '200':
+          description: "Base64-encoded image data."
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  image_data:
+                    type: string
+                    format: base64
+        '400':
+          description: "Bad Request"
+        '500':
+          description: "Internal Server Error"
+
+components:
+  schemas:
+    Embedding:
+      type: object
+      description: Represents an embedding vector returned by the embedding endpoint.
+      properties:
+        object:
+          type: string
+          example: embedding
+        embedding:
+          type: array
+          items:
+            type: number
+          example: [0.0023064255, -0.009327292]
+        index:
+          type: integer
+          example: 0
+    Usage:
+      type: object
+      properties:
+        prompt_tokens:
+          type: integer
+          example: 8
+        total_tokens:
+          type: integer
+          example: 8
+    ResultEmbedding:
+      type: object
+      properties:
+        object:
+          type: string
+          example: result_embedding
+        data:
+          type: array
+          items:
+            $ref: "#/components/schemas/Embedding"
+        model:
+          type: string
+          example: text-embedding-ada-002
+        usage:
+          $ref: "#/components/schemas/Usage"
+    Error:
+      type: object
+      properties:
+        error:
+          type: string
+          example: An error occurred
--- a/.well-known/openapi.yaml
+++ b/.well-known/openapi.yaml
@ -0,0 +1,35 @@
+openapi: "3.0.0"
+
+info:
+  title: Hello World
+  version: "1.0"
+servers:
+  - url: /openapi
+
+paths:
+  /greeting/{name}:
+    post:
+      summary: Generate greeting
+      description: Generates a greeting message.
+      operationId: hello.post_greeting
+      responses:
+        "200":
+          description: greeting response
+          content:
+            text/plain:
+              schema:
+                type: string
+                example: "hello dave!"
+      parameters:
+        - name: name
+          in: path
+          description: Name of the person to greet.
+          required: true
+          schema:
+            type: string
+            example: "dave"
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
--- a/config/config.yaml
+++ b/config/config.yaml
@ -70,3 +70,6 @@ SD_T2I_API: "/sdapi/v1/txt2img"
 ### for Research
 MODEL_FOR_RESEARCHER_SUMMARY: gpt-3.5-turbo
 MODEL_FOR_RESEARCHER_REPORT: gpt-3.5-turbo-16k
+
+### Meta Models
+#METAGPT_TEXT_TO_IMAGE_MODEL: MODEL_URL
--- a/metagpt/actions/azure_tts.py
+++ b/metagpt/actions/azure_tts.py
@ -1,53 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-@Time    : 2023/6/9 22:22
-@Author  : Leo Xiao
-@File    : azure_tts.py
-"""
-from azure.cognitiveservices.speech import AudioConfig, SpeechConfig, SpeechSynthesizer
-
-from metagpt.actions.action import Action
-from metagpt.config import Config
-
-
-class AzureTTS(Action):
-    def __init__(self, name, context=None, llm=None):
-        super().__init__(name, context, llm)
-        self.config = Config()
-
-    # 参数参考：https://learn.microsoft.com/zh-cn/azure/cognitive-services/speech-service/language-support?tabs=tts#voice-styles-and-roles
-    def synthesize_speech(self, lang, voice, role, text, output_file):
-        subscription_key = self.config.get('AZURE_TTS_SUBSCRIPTION_KEY')
-        region = self.config.get('AZURE_TTS_REGION')
-        speech_config = SpeechConfig(
-            subscription=subscription_key, region=region)
-
-        speech_config.speech_synthesis_voice_name = voice
-        audio_config = AudioConfig(filename=output_file)
-        synthesizer = SpeechSynthesizer(
-            speech_config=speech_config,
-            audio_config=audio_config)
-
-        # if voice=="zh-CN-YunxiNeural":
-        ssml_string = f"""
-            <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}' xmlns:mstts='http://www.w3.org/2001/mstts'>
-                <voice name='{voice}'>
-                    <mstts:express-as style='affectionate' role='{role}'>
-                        {text}
-                    </mstts:express-as>
-                </voice>
-            </speak>
-            """
-
-        synthesizer.speak_ssml_async(ssml_string).get()
-
-
-if __name__ == "__main__":
-    azure_tts = AzureTTS("azure_tts")
-    azure_tts.synthesize_speech(
-        "zh-CN",
-        "zh-CN-YunxiNeural",
-        "Boy",
-        "你好，我是卡卡",
-        "output.wav")
--- a/metagpt/learn/text_to_embedding.py
+++ b/metagpt/learn/text_to_embedding.py
@ -0,0 +1,26 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/18
+@Author  : mashenquan
+@File    : text_to_embedding.py
+@Desc    : Text-to-Embedding skill, which provides text-to-embedding functionality.
+"""
+import os
+
+from metagpt.tools.openai_text_to_embedding import oas3_openai_text_to_embedding
+from metagpt.utils.common import initialize_environment
+
+
+def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key=""):
+    """Convert text to an embedding through the OpenAI embedding tool.
+
+    :param text: The text used for embedding.
+    :param model: One of ['text-embedding-ada-002'], ID of the model to use. For more details, check out: `https://api.openai.com/v1/models`.
+    :param openai_api_key: OpenAI API key. When empty, `OPENAI_API_KEY` must be set in the environment. For more details, check out: `https://platform.openai.com/account/api-keys`
+    :return: The value returned by `oas3_openai_text_to_embedding` for the given arguments.
+    :raises EnvironmentError: If no API key is passed and `OPENAI_API_KEY` is not set.
+    """
+    initialize_environment()
+    if not (openai_api_key or os.environ.get("OPENAI_API_KEY")):
+        # Name the missing setting so the failure can be diagnosed from the traceback alone.
+        raise EnvironmentError("OpenAI API key is missing: pass `openai_api_key` or set `OPENAI_API_KEY`.")
+    return oas3_openai_text_to_embedding(text, model=model, openai_api_key=openai_api_key)
--- a/metagpt/learn/text_to_image.py
+++ b/metagpt/learn/text_to_image.py
@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/18
+@Author  : mashenquan
+@File    : text_to_image.py
+@Desc    : Text-to-Image skill, which provides text-to-image functionality.
+"""
+import os
+
+from metagpt.tools.metagpt_text_to_image import oas3_metagpt_text_to_image
+from metagpt.tools.openai_text_to_image import oas3_openai_text_to_image
+from metagpt.utils.common import initialize_environment
+
+
+def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url=""):
+    """Convert text to an image, preferring the MetaGPT model over OpenAI when both are configured.
+
+    :param text: The text used for image conversion.
+    :param size_type: If using OpenAI, the available size options are ['256x256', '512x512', '1024x1024'], while for MetaGPT, the options are ['512x512', '512x768'].
+    :param openai_api_key: OpenAI API key. For more details, check out: `https://platform.openai.com/account/api-keys`
+    :param model_url: MetaGPT model URL. When empty, `METAGPT_TEXT_TO_IMAGE_MODEL` from the environment is consulted.
+    :return: The image data is returned in Base64 encoding.
+    :raises EnvironmentError: If neither a MetaGPT model URL nor an OpenAI API key is available.
+    """
+    initialize_environment()
+    # The MetaGPT backend takes precedence whenever a model URL is available.
+    if model_url or os.environ.get("METAGPT_TEXT_TO_IMAGE_MODEL"):
+        return oas3_metagpt_text_to_image(text, size_type, model_url)
+    if openai_api_key or os.environ.get("OPENAI_API_KEY"):
+        return oas3_openai_text_to_image(text, size_type, openai_api_key)
+    # Name the missing settings so the failure can be diagnosed from the traceback alone.
+    raise EnvironmentError(
+        "No text-to-image backend is configured: pass `model_url` or set `METAGPT_TEXT_TO_IMAGE_MODEL`, "
+        "or pass `openai_api_key` or set `OPENAI_API_KEY`."
+    )
--- a/metagpt/learn/text_to_speech.py
+++ b/metagpt/learn/text_to_speech.py
@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/17
+@Author  : mashenquan
+@File    : text_to_speech.py
+@Desc    : Text-to-Speech skill, which provides text-to-speech functionality
+"""
+import os
+
+from metagpt.tools.azure_tts import oas3_azsure_tts
+from metagpt.utils.common import initialize_environment
+
+
+def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affectionate", role="Girl",
+                   subscription_key="", region=""):
+    """Convert text to speech through the Azure TTS tool.
+    For more details, check out: `https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts`
+
+    :param text: The text used for voice conversion.
+    :param lang: The value can contain a language code such as en (English), or a locale such as en-US (English - United States).
+    :param voice: Voice name. For more details, check out: `https://speech.microsoft.com/portal/voicegallery`
+    :param style: Speaking style to express different emotions like cheerfulness, empathy, and calm.
+    :param role: With roles, the same voice can act as a different age and gender.
+    :param subscription_key: Key used to access your Azure AI service API, see: `https://portal.azure.com/` > `Resource Management` > `Keys and Endpoint`. When empty, `AZURE_TTS_SUBSCRIPTION_KEY` from the environment is used.
+    :param region: The location (or region) of your resource. When empty, `AZURE_TTS_REGION` from the environment is used.
+    :return: Returns the Base64-encoded .wav file data if successful, otherwise an empty string.
+    :raises EnvironmentError: If the subscription key or the region cannot be resolved.
+    """
+    initialize_environment()
+    # Resolve each credential on its own: the underlying tool falls back to the environment
+    # per value, so mixing an argument with an environment variable is a valid configuration.
+    has_key = subscription_key or os.environ.get("AZURE_TTS_SUBSCRIPTION_KEY")
+    has_region = region or os.environ.get("AZURE_TTS_REGION")
+    if not (has_key and has_region):
+        raise EnvironmentError(
+            "Azure TTS credentials are missing: pass `subscription_key` and `region`, "
+            "or set `AZURE_TTS_SUBSCRIPTION_KEY` and `AZURE_TTS_REGION`."
+        )
+    return oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region)
--- a/metagpt/tools/azure_tts.py
+++ b/metagpt/tools/azure_tts.py
@ -0,0 +1,114 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/17
+@Author  : mashenquan
+@File    : azure_tts.py
+@Desc    : azure TTS OAS3 api, which provides text-to-speech functionality
+"""
+from pathlib import Path
+from uuid import uuid4
+import base64
+import sys
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))  # fix-bug: No module named 'metagpt'
+from metagpt.utils.common import initialize_environment
+from metagpt.logs import logger
+
+from azure.cognitiveservices.speech import AudioConfig, SpeechConfig, SpeechSynthesizer
+import os
+
+
+class AzureTTS:
+    """Azure Text-to-Speech: builds SSML and renders it to an audio file."""
+
+    def __init__(self, subscription_key, region):
+        """
+        :param subscription_key: Key used to access your Azure AI service API, see: `https://portal.azure.com/` > `Resource Management` > `Keys and Endpoint`. When empty, `AZURE_TTS_SUBSCRIPTION_KEY` from the environment is used.
+        :param region: The location (or region) of your resource. When empty, `AZURE_TTS_REGION` from the environment is used.
+        """
+        self.subscription_key = subscription_key if subscription_key else os.environ.get('AZURE_TTS_SUBSCRIPTION_KEY')
+        self.region = region if region else os.environ.get('AZURE_TTS_REGION')
+
+    @staticmethod
+    def _xml_escape(value, is_attribute=False):
+        """Escape `value` for SSML; with `is_attribute`, both quote characters are escaped as well."""
+        from xml.sax.saxutils import escape  # local import: leaves the module-level import list untouched
+
+        extra = {"'": "&apos;", '"': "&quot;"} if is_attribute else {}
+        return escape(str(value), extra)
+
+    # Parameter reference: https://learn.microsoft.com/zh-cn/azure/cognitive-services/speech-service/language-support?tabs=tts#voice-styles-and-roles
+    def synthesize_speech(self, lang, voice, text, output_file):
+        """Synthesize SSML content into `output_file`.
+
+        :param lang: Language code or locale written to the `xml:lang` attribute.
+        :param voice: Voice name, used both for the speech config and the `<voice>` element.
+        :param text: SSML markup placed inside the `<voice>` element. It is inserted verbatim, so it must
+            already be well-formed (for example the output of `role_style_text`).
+        :param output_file: Path of the audio file to write.
+        :return: The value returned by `speak_ssml_async(...).get()`.
+        """
+        speech_config = SpeechConfig(
+            subscription=self.subscription_key, region=self.region)
+        speech_config.speech_synthesis_voice_name = voice
+        audio_config = AudioConfig(filename=output_file)
+        synthesizer = SpeechSynthesizer(
+            speech_config=speech_config,
+            audio_config=audio_config)
+
+        # Attribute values are escaped so that a stray quote or `&` cannot break the document.
+        safe_lang = self._xml_escape(lang, is_attribute=True)
+        safe_voice = self._xml_escape(voice, is_attribute=True)
+        # More detail: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice
+        ssml_string = "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' " \
+                      f"xml:lang='{safe_lang}' xmlns:mstts='http://www.w3.org/2001/mstts'>" \
+                      f"<voice name='{safe_voice}'>{text}</voice></speak>"
+
+        return synthesizer.speak_ssml_async(ssml_string).get()
+
+    @staticmethod
+    def role_style_text(role, style, text):
+        """Wrap plain `text` in an `mstts:express-as` element carrying both `role` and `style`."""
+        role = AzureTTS._xml_escape(role, is_attribute=True)
+        style = AzureTTS._xml_escape(style, is_attribute=True)
+        text = AzureTTS._xml_escape(text)
+        return f'<mstts:express-as role="{role}" style="{style}">{text}</mstts:express-as>'
+
+    @staticmethod
+    def role_text(role, text):
+        """Wrap plain `text` in an `mstts:express-as` element carrying only `role`."""
+        role = AzureTTS._xml_escape(role, is_attribute=True)
+        text = AzureTTS._xml_escape(text)
+        return f'<mstts:express-as role="{role}">{text}</mstts:express-as>'
+
+    @staticmethod
+    def style_text(style, text):
+        """Wrap plain `text` in an `mstts:express-as` element carrying only `style`."""
+        style = AzureTTS._xml_escape(style, is_attribute=True)
+        text = AzureTTS._xml_escape(text)
+        return f'<mstts:express-as style="{style}">{text}</mstts:express-as>'
+
+
+# Export
+def oas3_azsure_tts(text, lang="", voice="", style="", role="", subscription_key="", region=""):
+    """Convert text to speech and return the audio as Base64.
+    For more details, check out: `https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts`
+
+    :param text: The text used for voice conversion.
+    :param lang: The value can contain a language code such as en (English), or a locale such as en-US (English - United States). Defaults to "zh-CN" when empty.
+    :param voice: Voice name, see `https://speech.microsoft.com/portal/voicegallery`. Defaults to "zh-CN-XiaomoNeural" when empty.
+    :param style: Speaking style to express different emotions like cheerfulness, empathy, and calm. Defaults to "affectionate" when empty.
+    :param role: With roles, the same voice can act as a different age and gender. Defaults to "Girl" when empty.
+    :param subscription_key: Key used to access your Azure AI service API, see: `https://portal.azure.com/` > `Resource Management` > `Keys and Endpoint`. Falls back to `AZURE_TTS_SUBSCRIPTION_KEY`.
+    :param region: The location (or region) of your resource. Falls back to `AZURE_TTS_REGION`.
+    :return: Returns the Base64-encoded .wav file data if successful, otherwise an empty string.
+
+    """
+    if not text:
+        return ""
+
+    lang = lang or "zh-CN"
+    voice = voice or "zh-CN-XiaomoNeural"
+    role = role or "Girl"
+    style = style or "affectionate"
+    subscription_key = subscription_key or os.environ.get("AZURE_TTS_SUBSCRIPTION_KEY")
+    region = region or os.environ.get("AZURE_TTS_REGION")
+
+    xml_value = AzureTTS.role_style_text(role=role, style=style, text=text)
+    tts = AzureTTS(subscription_key=subscription_key, region=region)
+    # Scratch file next to this module; a random name keeps concurrent requests apart.
+    filename = Path(__file__).resolve().parent / (uuid4().hex + ".wav")
+    try:
+        tts.synthesize_speech(lang=lang, voice=voice, text=xml_value, output_file=str(filename))
+        with open(str(filename), mode="rb") as reader:
+            base64_string = base64.b64encode(reader.read()).decode('utf-8')
+    except Exception as e:
+        # Best-effort API boundary: failures are logged and reported as an empty result.
+        logger.error(f"text:{text}, error:{e}")
+        return ""
+    finally:
+        # Clean up on the failure path too, so failed requests do not leave .wav files behind.
+        try:
+            filename.unlink()
+        except FileNotFoundError:
+            pass  # synthesis failed before the file was created
+        except OSError as e:
+            logger.error(f"failed to remove {filename}, error:{e}")
+
+    return base64_string
+
+
+if __name__ == "__main__":
+    # Manual smoke test: synthesize a short mixed Chinese/English sample with the default
+    # voice settings and print the Base64 result (an empty string on failure).
+    # NOTE(review): presumably initialize_environment() loads the Azure credentials — confirm.
+    initialize_environment()
+
+    v = oas3_azsure_tts("测试，test")
+    print(v)
--- a/metagpt/tools/hello.py
+++ b/metagpt/tools/hello.py
@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/5/2 16:03
+@Author  : mashenquan
+@File    : hello.py
+@Desc    : Implement the OpenAPI Specification 3.0 demo and use the following command to test the HTTP service:
+
+        curl -X 'POST' \
+        'http://localhost:8080/openapi/greeting/dave' \
+        -H 'accept: text/plain' \
+        -H 'Content-Type: application/json' \
+        -d '{}'
+"""
+
+import connexion
+
+
+# openapi implement
+def post_greeting(name: str) -> str:
+    """Return the greeting line for `name`, terminated by a newline."""
+    greeting = "Hello {}\n".format(name)
+    return greeting
+
+
+if __name__ == "__main__":
+    # Serve the demo spec from the repository's `.well-known` directory on port 8080.
+    app = connexion.AioHttpApp(__name__, specification_dir='../../.well-known/')
+    app.add_api("openapi.yaml", arguments={"title": "Hello World Example"})
+    app.run(port=8080)
--- a/metagpt/tools/metagpt_oas3_api_svc.py
+++ b/metagpt/tools/metagpt_oas3_api_svc.py
@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/17
+@Author  : mashenquan
+@File    : metagpt_oas3_api_svc.py
+@Desc    : MetaGPT OpenAPI Specification 3.0 REST API service
+"""
+import asyncio
+from pathlib import Path
+import sys
+
+import connexion
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))  # fix-bug: No module named 'metagpt'
+from metagpt.utils.common import initialize_environment
+
+
+def oas_http_svc():
+    """Start the OAS 3.0 OpenAPI HTTP service on port 8080, serving both bundled specs."""
+    initialize_environment()
+
+    service = connexion.FlaskApp(__name__, specification_dir='../../.well-known/')
+    # Registration order is kept: the MetaGPT API first, then the hello-world demo.
+    for spec_file in ("metagpt_oas3_api.yaml", "openapi.yaml"):
+        service.add_api(spec_file)
+    service.run(port=8080)
+
+
+async def async_main():
+    """Run the OAS 3.0 OpenAPI HTTP service on a worker thread and wait for it to finish.
+
+    The coroutine stays suspended while the service is running, so the event loop remains free
+    for other tasks. It completes when the service stops and re-raises any exception the
+    service raised.
+    """
+    loop = asyncio.get_running_loop()
+    # Awaiting the executor future replaces the former one-second polling loop and its debug
+    # output, and lets a service failure surface instead of being silently dropped.
+    await loop.run_in_executor(None, oas_http_svc)
+
+
+def main():
+    """Run the HTTP service in the calling thread by delegating to `oas_http_svc`."""
+    oas_http_svc()
+
+
+if __name__ == "__main__":
+    # Alternative entry point that starts the same service from an event loop:
+    # asyncio.run(async_main())
+    main()
--- a/metagpt/tools/metagpt_text_to_image.py
+++ b/metagpt/tools/metagpt_text_to_image.py
@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/18
+@Author  : mashenquan
+@File    : metagpt_text_to_image.py
+@Desc    : MetaGPT Text-to-Image OAS3 api, which provides text-to-image functionality.
+"""
+import base64
+import os
+import sys
+from pathlib import Path
+from typing import List, Dict
+
+import requests
+from pydantic import BaseModel
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))  # fix-bug: No module named 'metagpt'
+from metagpt.utils.common import initialize_environment
+from metagpt.logs import logger
+
+
+class MetaGPTText2Image:
+    """Client for the MetaGPT text-to-image model REST API."""
+
+    def __init__(self, model_url):
+        """
+        :param model_url: Model REST API URL. When empty, `METAGPT_TEXT_TO_IMAGE_MODEL` from the environment is used.
+        """
+        self.model_url = model_url if model_url else os.environ.get('METAGPT_TEXT_TO_IMAGE_MODEL')
+
+    def text_2_image(self, text, size_type="512x512", timeout=600):
+        """Text to image
+
+        :param text: The text used for image conversion.
+        :param size_type: One of ['512x512', '512x768'], formatted as "<width>x<height>".
+        :param timeout: Seconds to wait for the model service before giving up, so that an unresponsive
+            server cannot block the caller forever.
+        :return: The image data is returned in Base64 encoding, or an empty string if the request fails
+            or the response contains no image.
+        :raises ValueError: If `size_type` does not contain integer dimensions.
+        :raises IndexError: If `size_type` does not contain an "x" separator.
+        """
+
+        headers = {
+            "Content-Type": "application/json"
+        }
+        dims = size_type.split("x")
+        data = {
+            "prompt": text,
+            "negative_prompt": "(easynegative:0.8),black, dark,Low resolution",
+            "override_settings": {"sd_model_checkpoint": "galaxytimemachinesGTM_photoV20"},
+            "seed": -1,
+            "batch_size": 1,
+            "n_iter": 1,
+            "steps": 20,
+            "cfg_scale": 11,
+            "width": int(dims[0]),
+            "height": int(dims[1]),
+            "restore_faces": False,
+            "tiling": False,
+            "do_not_save_samples": False,
+            "do_not_save_grid": False,
+            "enable_hr": False,
+            "hr_scale": 2,
+            "hr_upscaler": "Latent",
+            "hr_second_pass_steps": 0,
+            "hr_resize_x": 0,
+            "hr_resize_y": 0,
+            "hr_upscale_to_x": 0,
+            "hr_upscale_to_y": 0,
+            "truncate_x": 0,
+            "truncate_y": 0,
+            "applied_old_hires_behavior_to": None,
+            "eta": None,
+            "sampler_index": "DPM++ SDE Karras",
+            "alwayson_scripts": {},
+        }
+
+        class ImageResult(BaseModel):
+            # Response fields read from the model service.
+            images: List
+            parameters: Dict
+
+        try:
+            response = requests.post(self.model_url, headers=headers, json=data, timeout=timeout)
+            response.raise_for_status()  # Raise an exception for 4xx or 5xx responses
+            result = ImageResult(**response.json())
+        except (requests.exceptions.RequestException, ValueError) as e:
+            # ValueError covers a body that is not valid JSON or does not match ImageResult.
+            logger.error(f"An error occurred:{e}")
+            return ""
+        if not result.images:
+            return ""
+        return result.images[0]
+
+
+# Export
+def oas3_metagpt_text_to_image(text, size_type: str = "512x512", model_url=""):
+    """Generate an image from text with the MetaGPT text-to-image model.
+
+    :param text: The text used for image conversion; an empty value yields an empty result.
+    :param size_type: One of ['512x512', '512x768']
+    :param model_url: Model REST API URL; falls back to `METAGPT_TEXT_TO_IMAGE_MODEL` when empty.
+    :return: The image data is returned in Base64 encoding.
+    """
+    if text:
+        endpoint = model_url if model_url else os.environ.get('METAGPT_TEXT_TO_IMAGE_MODEL')
+        client = MetaGPTText2Image(endpoint)
+        return client.text_2_image(text, size_type=size_type)
+    return ""
+
+
+if __name__ == "__main__":
+    # Manual smoke test: generate one image, save it as tmp.png and print the Base64 payload.
+    initialize_environment()
+
+    # The exported function is `oas3_metagpt_text_to_image`; the former `..._text_2_image`
+    # spelling does not exist in this module and raised NameError.
+    v = oas3_metagpt_text_to_image("Panda emoji")
+    data = base64.b64decode(v)
+    with open("tmp.png", mode="wb") as writer:
+        writer.write(data)
+    print(v)
--- a/metagpt/tools/openai_text_to_embedding.py
+++ b/metagpt/tools/openai_text_to_embedding.py
@ -0,0 +1,92 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/18
+@Author  : mashenquan
+@File    : openai_text_to_embedding.py
+@Desc    : OpenAI Text-to-Embedding OAS3 api, which provides text-to-embedding functionality.
+            For more details, checkout: `https://platform.openai.com/docs/api-reference/embeddings/object`
+"""
+import os
+from pathlib import Path
+from typing import List
+
+import requests
+from pydantic import BaseModel
+import sys
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))  # fix-bug: No module named 'metagpt'
+from metagpt.utils.common import initialize_environment
+from metagpt.logs import logger
+
+
+class Embedding(BaseModel):
+    """A single embedding vector as returned by the embedding endpoint."""
+    # The object type, which is always "embedding".
+    object: str
+    # The embedding vector, which is a list of floats. The length of the vector depends on the
+    # model, as listed in the embedding guide.
+    embedding: List[float]
+    # The index of the embedding in the list of embeddings.
+    index: int
+
+
+class Usage(BaseModel):
+    """Token counts attached to an embedding result (the `usage` field of `ResultEmbedding`)."""
+    # NOTE(review): presumably the number of tokens in the submitted input — confirm against the OpenAI API reference.
+    prompt_tokens: int
+    # NOTE(review): presumably the total number of tokens billed for the request — confirm.
+    total_tokens: int
+
+
+class ResultEmbedding(BaseModel):
+    """Top-level shape of an embedding response: the embeddings plus model and usage metadata."""
+    object: str  # Object type of the response.
+    data: List[Embedding]  # The returned embeddings.
+    model: str  # ID of the model named in the response.
+    usage: Usage  # Token counts for the request.
+
+
+class OpenAIText2Embedding:
+    """Client for the OpenAI embeddings endpoint."""
+
+    def __init__(self, openai_api_key):
+        """
+        :param openai_api_key: OpenAI API key. When empty, `OPENAI_API_KEY` from the environment is used. For more details, check out: `https://platform.openai.com/account/api-keys`
+        """
+        self.openai_api_key = openai_api_key if openai_api_key else os.environ.get('OPENAI_API_KEY')
+
+    def text_2_embedding(self, text, model="text-embedding-ada-002", timeout=60):
+        """Text to embedding
+
+        :param text: The text used for embedding.
+        :param model: One of ['text-embedding-ada-002'], ID of the model to use. For more details, check out: `https://api.openai.com/v1/models`.
+        :param timeout: Seconds to wait for the API before giving up, so that an unresponsive
+            server cannot block the caller forever.
+        :return: A json object of :class:`ResultEmbedding` class if successful, otherwise `{}`.
+        """
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.openai_api_key}"
+        }
+        data = {"input": text, "model": model}
+        try:
+            response = requests.post("https://api.openai.com/v1/embeddings", headers=headers, json=data,
+                                     timeout=timeout)
+            response.raise_for_status()  # Raise an exception for 4xx or 5xx responses
+            return response.json()
+        except (requests.exceptions.RequestException, ValueError) as e:
+            # ValueError covers a successful response whose body is not valid JSON.
+            logger.error(f"An error occurred:{e}")
+        return {}
+
+
+# Export
+def oas3_openai_text_to_embedding(text, model="text-embedding-ada-002", openai_api_key=""):
+    """Text to embedding
+
+    :param text: The text used for embedding. An empty value short-circuits without calling the API.
+    :param model: One of ['text-embedding-ada-002'], ID of the model to use. For more details, check out: `https://api.openai.com/v1/models`.
+    :param openai_api_key: OpenAI API key; falls back to `OPENAI_API_KEY` when empty. For more details, check out: `https://platform.openai.com/account/api-keys`
+    :return: A json object of :class:`ResultEmbedding` class if successful, otherwise `{}`.
+    """
+    if not text:
+        # Keep the failure value consistent with the documented contract and with
+        # `OpenAIText2Embedding.text_2_embedding`: an empty dict, not an empty string.
+        return {}
+    if not openai_api_key:
+        openai_api_key = os.environ.get("OPENAI_API_KEY")
+    return OpenAIText2Embedding(openai_api_key).text_2_embedding(text, model=model)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: request an embedding for a sample phrase and print the raw result.
+    # NOTE(review): presumably initialize_environment() loads OPENAI_API_KEY — confirm.
+    initialize_environment()
+
+    v = oas3_openai_text_to_embedding("Panda emoji")
+    print(v)
--- a/metagpt/tools/openai_text_to_image.py
+++ b/metagpt/tools/openai_text_to_image.py
@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/17
+@Author  : mashenquan
+@File    : openai_text_to_image.py
+@Desc    : OpenAI Text-to-Image OAS3 api, which provides text-to-image functionality.
+"""
+import base64
+import os
+import sys
+from pathlib import Path
+from typing import List
+
+import requests
+from pydantic import BaseModel
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))  # fix-bug: No module named 'metagpt'
+from metagpt.utils.common import initialize_environment
+from metagpt.logs import logger
+
+
+class OpenAIText2Image:
+    """Client for the OpenAI image-generation endpoint that returns images as Base64 text."""
+
+    def __init__(self, openai_api_key):
+        """
+        :param openai_api_key: OpenAI API key; when empty, the `OPENAI_API_KEY` environment variable is used. For more details, check out: `https://platform.openai.com/account/api-keys`
+        """
+        self.openai_api_key = openai_api_key if openai_api_key else os.environ.get('OPENAI_API_KEY')
+
+    def text_2_image(self, text, size_type="1024x1024"):
+        """Text to image
+
+        :param text: The text used for image conversion.
+        :param size_type: One of ['256x256', '512x512', '1024x1024']
+        :return: The image data in Base64 encoding, or `""` if the request fails or the response cannot be parsed.
+        """
+
+        class ImageUrl(BaseModel):
+            url: str
+
+        class ImageResult(BaseModel):
+            data: List[ImageUrl]
+            created: int
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.openai_api_key}"
+        }
+        data = {"prompt": text, "n": 1, "size": size_type}
+        try:
+            response = requests.post("https://api.openai.com/v1/images/generations", headers=headers, json=data)
+            response.raise_for_status()  # Raise an exception for 4xx or 5xx responses
+            result = ImageResult(**response.json())
+        except (requests.exceptions.RequestException, ValueError) as e:
+            # ValueError covers a body that is not valid JSON as well as pydantic's
+            # ValidationError (a ValueError subclass) for an unexpected response shape,
+            # so every failure mode yields the documented "" instead of propagating.
+            logger.error(f"An error occurred:{e}")
+            return ""
+        if result.data:
+            # Only one image is requested (n=1), so the first entry is the result.
+            return OpenAIText2Image.get_image_data(result.data[0].url)
+        return ""
+
+    @staticmethod
+    def get_image_data(url):
+        """Fetch image data from a URL and encode it as Base64
+
+        :param url: Image url
+        :return: Base64-encoded image data, or `""` if the download fails.
+        """
+        try:
+            response = requests.get(url)
+            response.raise_for_status()  # Raise an exception for 4xx or 5xx responses
+            image_data = response.content
+            base64_image = base64.b64encode(image_data).decode("utf-8")
+            return base64_image
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"An error occurred:{e}")
+            return ""
+
+
+# Export
+def oas3_openai_text_to_image(text, size_type: str = "1024x1024", openai_api_key=""):
+    """Generate an image for `text` and return it Base64-encoded.
+
+    :param text: The text used for image conversion.
+    :param size_type: One of ['256x256', '512x512', '1024x1024']
+    :param openai_api_key: OpenAI API key; when empty, the `OPENAI_API_KEY` environment variable is used. For more details, check out: `https://platform.openai.com/account/api-keys`
+    :return: The image data is returned in Base64 encoding; `""` when `text` is empty.
+    """
+    if text:
+        # Prefer the explicit key, otherwise take the one from the environment.
+        api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
+        return OpenAIText2Image(api_key).text_2_image(text, size_type=size_type)
+    return ""
+
+
+if __name__ == "__main__":
+    # Manual smoke run: load `config/config.yaml` into `os.environ`, then generate
+    # one image and print its Base64 text.
+    initialize_environment()
+
+    v = oas3_openai_text_to_image("Panda emoji")
+    print(v)
--- a/metagpt/utils/common.py
+++ b/metagpt/utils/common.py
@ -4,14 +4,18 @@
@Time    : 2023/4/29 16:07
@Author  : alexanderwu
@File    : common.py
+@Modified By: mashenquan, 2023-8-17, add `initialize_environment()` to load `config/config.yaml` to `os.environ`
 """
 import ast
 import contextlib
 import inspect
 import os
 import re
+from pathlib import Path
 from typing import List, Tuple

+import yaml
+
 from metagpt.logs import logger


@ -254,3 +258,12 @@ def parse_recipient(text):
    pattern = r"## Send To:\s*([A-Za-z]+)\s*?"  # hard code for now
    recipient = re.search(pattern, text)
    return recipient.group(1) if recipient else ""
+
+
+def initialize_environment():
+    """Load `config/config.yaml` to `os.environ`
+
+    Every top-level key of the YAML file is exported as an environment variable,
+    with both key and value converted to `str`. An empty file exports nothing.
+
+    :raises OSError: if `config/config.yaml` cannot be opened.
+    """
+    yaml_file_path = Path(__file__).resolve().parent.parent.parent / "config/config.yaml"
+    # Read as UTF-8 explicitly so the result does not depend on the platform locale.
+    with open(str(yaml_file_path), "r", encoding="utf-8") as yaml_file:
+        # `yaml.safe_load` returns None for an empty document; treat that as "no settings".
+        data = yaml.safe_load(yaml_file) or {}
+    for k, v in data.items():
+        # `os.environ` accepts only strings, for keys as well as values.
+        os.environ[str(k)] = str(v)
--- a/requirements.txt
+++ b/requirements.txt
@ -38,4 +38,6 @@ typing_extensions==4.5.0
 aiofiles
 libcst==1.0.1
 qdrant-client==1.4.0
+connexion[swagger-ui]
+aiohttp_jinja2

--- a/tests/metagpt/learn/init.py
+++ b/tests/metagpt/learn/init.py
--- a/tests/metagpt/learn/test_text_to_embedding.py
+++ b/tests/metagpt/learn/test_text_to_embedding.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/18
+@Author  : mashenquan
+@File    : test_text_to_embedding.py
+@Desc    : Unit tests.
+"""
+
+import asyncio
+import base64
+
+from pydantic import BaseModel
+
+from metagpt.learn.text_to_embedding import text_to_embedding
+
+
+async def mock_text_to_embedding():
+    """Call `text_to_embedding` on sample inputs and check that embedding data comes back."""
+    class Input(BaseModel):
+        input: str
+
+    inputs = [
+        {"input": "Panda emoji"}
+    ]
+
+    for i in inputs:
+        seed = Input(**i)
+        data = text_to_embedding(seed.input)
+        # `ResultEmbedding` is not imported in this module, so check the returned
+        # mapping directly: a successful result carries a non-empty `data` entry.
+        assert len(data.get("data") or []) > 0
+
+
+def test_suite():
+    """Run the async embedding check to completion on the current event loop."""
+    event_loop = asyncio.get_event_loop()
+    event_loop.run_until_complete(event_loop.create_task(mock_text_to_embedding()))
+
+
+if __name__ == '__main__':
+    # Run the suite when this file is executed directly as a script.
+    test_suite()
--- a/tests/metagpt/learn/test_text_to_image.py
+++ b/tests/metagpt/learn/test_text_to_image.py
@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/18
+@Author  : mashenquan
+@File    : test_text_to_image.py
+@Desc    : Unit tests.
+"""
+import asyncio
+import base64
+
+from pydantic import BaseModel
+
+from metagpt.learn.text_to_image import text_to_image
+
+
+async def mock_text_to_image():
+    """Generate an image for each sample prompt and verify the result is non-empty, valid Base64."""
+    class Input(BaseModel):
+        input: str
+        size_type: str
+
+    # NOTE(review): `size_type` is validated on the seed but never forwarded to
+    # `text_to_image` - confirm whether that is intended.
+    samples = ({"input": "Panda emoji", "size_type": "512x512"},)
+
+    for sample in samples:
+        seed = Input(**sample)
+        base64_data = text_to_image(seed.input)
+        assert base64_data != ""
+        print(f"{seed.input} -> {base64_data}")
+        # `validate=True` makes b64decode reject characters outside the Base64 alphabet.
+        assert base64.b64decode(base64_data, validate=True)
+
+
+def test_suite():
+    """Run the async image check to completion on the current event loop."""
+    event_loop = asyncio.get_event_loop()
+    event_loop.run_until_complete(event_loop.create_task(mock_text_to_image()))
+
+
+if __name__ == '__main__':
+    # Run the suite when this file is executed directly as a script.
+    test_suite()
--- a/tests/metagpt/learn/test_text_to_speech.py
+++ b/tests/metagpt/learn/test_text_to_speech.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/8/18
+@Author  : mashenquan
+@File    : test_text_to_speech.py
+@Desc    : Unit tests.
+"""
+import asyncio
+import base64
+
+from pydantic import BaseModel
+
+from metagpt.learn.text_to_speech import text_to_speech
+
+
+async def mock_text_to_speech():
+    """Synthesize speech for each sample text and verify the result is non-empty, valid Base64."""
+    class Input(BaseModel):
+        input: str
+
+    samples = ({"input": "Panda emoji"},)
+
+    for sample in samples:
+        seed = Input(**sample)
+        base64_data = text_to_speech(seed.input)
+        assert base64_data != ""
+        print(f"{seed.input} -> {base64_data}")
+        # `validate=True` makes b64decode reject characters outside the Base64 alphabet.
+        assert base64.b64decode(base64_data, validate=True)
+
+
+def test_suite():
+    """Run the async speech check to completion on the current event loop."""
+    event_loop = asyncio.get_event_loop()
+    event_loop.run_until_complete(event_loop.create_task(mock_text_to_speech()))
+
+
+if __name__ == '__main__':
+    # Run the suite when this file is executed directly as a script.
+    test_suite()
--- a/tests/metagpt/actions/test_azure_tts.py
+++ b/tests/metagpt/actions/test_azure_tts.py
@ -4,8 +4,13 @@
@Time    : 2023/7/1 22:50
@Author  : alexanderwu
@File    : test_azure_tts.py
+@Modified By: mashenquan, 2023-8-17, move to `tools` folder.
 """
-from metagpt.actions.azure_tts import AzureTTS
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent))  # fix-bug: No module named 'metagpt'
+from metagpt.tools.azure_tts import AzureTTS


 def test_azure_tts():