diff --git a/.gitignore b/.gitignore index e326e8372..1a9741e91 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,5 @@ output.wav # output folder output +tmp.png + diff --git a/.well-known/metagpt_oas3_api.yaml b/.well-known/metagpt_oas3_api.yaml index 7ae10579c..a226181a5 100644 --- a/.well-known/metagpt_oas3_api.yaml +++ b/.well-known/metagpt_oas3_api.yaml @@ -71,7 +71,7 @@ paths: /txt2img/openai: post: summary: "Convert Text to Base64-encoded Image Data Stream" - operationId: openai_text_2_image.oas3_openai_text_2_image + operationId: openai_text_to_image.oas3_openai_text_to_image requestBody: required: true content: @@ -109,7 +109,7 @@ paths: /txt2embedding/openai: post: summary: Text to embedding - operationId: openai_text_2_embedding.oas3_openai_text_2_embedding + operationId: openai_text_to_embedding.oas3_openai_text_to_embedding description: Retrieve an embedding for the provided text using the OpenAI API. requestBody: content: @@ -144,6 +144,49 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + + /txt2image/metagpt: + post: + summary: "Text to Image" + description: "Generate an image from the provided text using the MetaGPT Text-to-Image API." + operationId: metagpt_text_to_image.oas3_metagpt_text_to_image + requestBody: + required: true + content: + application/json: + schema: + type: object + required: + - text + properties: + text: + type: string + description: "The text used for image conversion." + size_type: + type: string + enum: ["512x512", "512x768"] + default: "512x512" + description: "Size of the generated image." + model_url: + type: string + description: "Model reset API URL for text-to-image." + default: "" + responses: + '200': + description: "Base64-encoded image data." 
+ content: + application/json: + schema: + type: object + properties: + image_data: + type: string + format: base64 + '400': + description: "Bad Request" + '500': + description: "Internal Server Error" + components: schemas: Embedding: diff --git a/config/config.yaml b/config/config.yaml index 303f4824b..6e9a61931 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -70,3 +70,6 @@ SD_T2I_API: "/sdapi/v1/txt2img" ### for Research MODEL_FOR_RESEARCHER_SUMMARY: gpt-3.5-turbo MODEL_FOR_RESEARCHER_REPORT: gpt-3.5-turbo-16k + +### Meta Models +#METAGPT_TEXT_TO_IMAGE_MODEL: MODEL_URL \ No newline at end of file diff --git a/metagpt/learn/text_to_embedding.py b/metagpt/learn/text_to_embedding.py new file mode 100644 index 000000000..281815ca6 --- /dev/null +++ b/metagpt/learn/text_to_embedding.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2023/8/18 +@Author : mashenquan +@File : text_to_embedding.py +@Desc : Text-to-Embedding skill, which provides text-to-embedding functionality. +""" +import os + +from metagpt.tools.openai_text_to_embedding import oas3_openai_text_to_embedding +from metagpt.utils.common import initialize_environment + + +def text_to_embedding(text, model="text-embedding-ada-002", openai_api_key=""): + """Text to embedding + + :param text: The text used for embedding. + :param model: One of ['text-embedding-ada-002'], ID of the model to use. For more details, checkout: `https://api.openai.com/v1/models`. + :param openai_api_key: OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys` + :return: A json object of :class:`ResultEmbedding` class if successful, otherwise `{}`. 
+ """ + initialize_environment() + if os.environ.get("OPENAI_API_KEY") or openai_api_key: + return oas3_openai_text_to_embedding(text, model=model, openai_api_key=openai_api_key) + raise EnvironmentError diff --git a/metagpt/learn/text_to_image.py b/metagpt/learn/text_to_image.py new file mode 100644 index 000000000..0932dfe07 --- /dev/null +++ b/metagpt/learn/text_to_image.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2023/8/18 +@Author : mashenquan +@File : text_to_image.py +@Desc : Text-to-Image skill, which provides text-to-image functionality. +""" +import os + +from metagpt.tools.metagpt_text_to_image import oas3_metagpt_text_to_image +from metagpt.tools.openai_text_to_image import oas3_openai_text_to_image +from metagpt.utils.common import initialize_environment + + +def text_to_image(text, size_type: str = "512x512", openai_api_key="", model_url=""): + """Text to image + + :param text: The text used for image conversion. + :param openai_api_key: OpenAI API key, For more details, checkout: `https://platform.openai.com/account/api-keys` + :param size_type: If using OPENAI, the available size options are ['256x256', '512x512', '1024x1024'], while for MetaGPT, the options are ['512x512', '512x768']. + :param model_url: MetaGPT model url + :return: The image data is returned in Base64 encoding. 
+ """ + initialize_environment() + if os.environ.get("METAGPT_TEXT_TO_IMAGE_MODEL") or model_url: + return oas3_metagpt_text_to_image(text, size_type, model_url) + if os.environ.get("OPENAI_API_KEY") or openai_api_key: + return oas3_openai_text_to_image(text, size_type, openai_api_key) + raise EnvironmentError diff --git a/metagpt/learn/text_to_speech.py b/metagpt/learn/text_to_speech.py new file mode 100644 index 000000000..1b81097b8 --- /dev/null +++ b/metagpt/learn/text_to_speech.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2023/8/17 +@Author : mashenquan +@File : text_to_speech.py +@Desc : Text-to-Speech skill, which provides text-to-speech functionality +""" +import os + +from metagpt.tools.azure_tts import oas3_azsure_tts +from metagpt.utils.common import initialize_environment + + +def text_to_speech(text, lang="zh-CN", voice="zh-CN-XiaomoNeural", style="affectionate", role="Girl", + subscription_key="", region=""): + """Text to speech + For more details, check out:`https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` + + :param lang: The value can contain a language code such as en (English), or a locale such as en-US (English - United States). For more details, checkout: `https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` + :param voice: For more details, checkout: `https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts`, `https://speech.microsoft.com/portal/voicegallery` + :param style: Speaking style to express different emotions like cheerfulness, empathy, and calm. For more details, checkout: `https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` + :param role: With roles, the same voice can act as a different age and gender. 
For more details, checkout: `https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` + :param text: The text used for voice conversion. + :param subscription_key: key is used to access your Azure AI service API, see: `https://portal.azure.com/` > `Resource Management` > `Keys and Endpoint` + :param region: This is the location (or region) of your resource. You may need to use this field when making calls to this API. + :return: Returns the Base64-encoded .wav file data if successful, otherwise an empty string. + + """ + initialize_environment() + if (os.environ.get("AZURE_TTS_SUBSCRIPTION_KEY") and os.environ.get("AZURE_TTS_REGION")) or \ + (subscription_key and region): + return oas3_azsure_tts(text, lang, voice, style, role, subscription_key, region) + + raise EnvironmentError diff --git a/metagpt/tools/azure_tts.py b/metagpt/tools/azure_tts.py index 2ec1539ef..21e8f1b6c 100644 --- a/metagpt/tools/azure_tts.py +++ b/metagpt/tools/azure_tts.py @@ -62,7 +62,7 @@ class AzureTTS: # Export def oas3_azsure_tts(text, lang="", voice="", style="", role="", subscription_key="", region=""): - """oas3/tts/azsure + """Text to speech For more details, check out:`https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` :param lang: The value can contain a language code such as en (English), or a locale such as en-US (English - United States). For more details, checkout: `https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts` diff --git a/metagpt/tools/metagpt_text_to_image.py b/metagpt/tools/metagpt_text_to_image.py new file mode 100644 index 000000000..393215df0 --- /dev/null +++ b/metagpt/tools/metagpt_text_to_image.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2023/8/18 +@Author : mashenquan +@File : metagpt_text_to_image.py +@Desc : MetaGPT Text-to-Image OAS3 api, which provides text-to-image functionality. 
+""" +import base64 +import os +import sys +from pathlib import Path +from typing import List, Dict + +import requests +from pydantic import BaseModel + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) # fix-bug: No module named 'metagpt' +from metagpt.utils.common import initialize_environment +from metagpt.logs import logger + + +class MetaGPTText2Image: + def __init__(self, model_url): + """ + :param model_url: Model reset api url + """ + self.model_url = model_url if model_url else os.environ.get('METAGPT_TEXT_TO_IMAGE_MODEL') + + def text_2_image(self, text, size_type="512x512"): + """Text to image + + :param text: The text used for image conversion. + :param size_type: One of ['512x512', '512x768'] + :return: The image data is returned in Base64 encoding. + """ + + headers = { + "Content-Type": "application/json" + } + dims = size_type.split("x") + data = { + "prompt": text, + "negative_prompt": "(easynegative:0.8),black, dark,Low resolution", + "override_settings": {"sd_model_checkpoint": "galaxytimemachinesGTM_photoV20"}, + "seed": -1, + "batch_size": 1, + "n_iter": 1, + "steps": 20, + "cfg_scale": 11, + "width": int(dims[0]), + "height": int(dims[1]), # 768, + "restore_faces": False, + "tiling": False, + "do_not_save_samples": False, + "do_not_save_grid": False, + "enable_hr": False, + "hr_scale": 2, + "hr_upscaler": "Latent", + "hr_second_pass_steps": 0, + "hr_resize_x": 0, + "hr_resize_y": 0, + "hr_upscale_to_x": 0, + "hr_upscale_to_y": 0, + "truncate_x": 0, + "truncate_y": 0, + "applied_old_hires_behavior_to": None, + "eta": None, + "sampler_index": "DPM++ SDE Karras", + "alwayson_scripts": {}, + } + + class ImageResult(BaseModel): + images: List + parameters: Dict + + try: + response = requests.post(self.model_url, headers=headers, json=data) + response.raise_for_status() # Raise an exception for 4xx or 5xx responses + result = ImageResult(**response.json()) + if len(result.images) == 0: + return "" + return result.images[0] + 
+ except requests.exceptions.RequestException as e: + logger.error(f"An error occurred:{e}") + return "" + + + # Export + def oas3_metagpt_text_to_image(text, size_type: str = "512x512", model_url=""): + """Text to image + + :param text: The text used for image conversion. + :param model_url: Model reset api + :param size_type: One of ['512x512', '512x768'] + :return: The image data is returned in Base64 encoding. + """ + if not text: + return "" + if not model_url: + model_url = os.environ.get('METAGPT_TEXT_TO_IMAGE_MODEL') + return MetaGPTText2Image(model_url).text_2_image(text, size_type=size_type) + + + if __name__ == "__main__": + initialize_environment() + + v = oas3_metagpt_text_to_image("Panda emoji") + data = base64.b64decode(v) + with open("tmp.png", mode="wb") as writer: + writer.write(data) + print(v) diff --git a/metagpt/tools/openai_text_2_embedding.py b/metagpt/tools/openai_text_to_embedding.py similarity index 94% rename from metagpt/tools/openai_text_2_embedding.py rename to metagpt/tools/openai_text_to_embedding.py index eb90a1ea9..9eddd5bc1 100644 --- a/metagpt/tools/openai_text_2_embedding.py +++ b/metagpt/tools/openai_text_to_embedding.py @@ -3,7 +3,7 @@ """ @Time : 2023/8/18 @Author : mashenquan -@File : openai_text_2_embedding.py +@File : openai_text_to_embedding.py @Desc : OpenAI Text-to-Embedding OAS3 api, which provides text-to-embedding functionality. For more details, checkout: `https://platform.openai.com/docs/api-reference/embeddings/object` """ @@ -70,7 +70,7 @@ class OpenAIText2Embedding: # Export -def oas3_openai_text_2_embedding(text, model="text-embedding-ada-002", openai_api_key=""): +def oas3_openai_text_to_embedding(text, model="text-embedding-ada-002", openai_api_key=""): """Text to embedding :param text: The text used for embedding. 
@@ -88,5 +88,5 @@ def oas3_openai_text_2_embedding(text, model="text-embedding-ada-002", openai_ap if __name__ == "__main__": initialize_environment() - v = oas3_openai_text_2_embedding("Panda emoji") + v = oas3_openai_text_to_embedding("Panda emoji") print(v) diff --git a/metagpt/tools/openai_text_2_image.py b/metagpt/tools/openai_text_to_image.py similarity index 94% rename from metagpt/tools/openai_text_2_image.py rename to metagpt/tools/openai_text_to_image.py index 50c007626..6ec96d166 100644 --- a/metagpt/tools/openai_text_2_image.py +++ b/metagpt/tools/openai_text_to_image.py @@ -3,7 +3,7 @@ """ @Time : 2023/8/17 @Author : mashenquan -@File : openai_text_2_image.py +@File : openai_text_to_image.py @Desc : OpenAI Text-to-Image OAS3 api, which provides text-to-image functionality. """ import base64 @@ -78,7 +78,7 @@ class OpenAIText2Image: # Export -def oas3_openai_text_2_image(text, size_type: str = "1024x1024", openai_api_key=""): +def oas3_openai_text_to_image(text, size_type: str = "1024x1024", openai_api_key=""): """Text to image :param text: The text used for image conversion. @@ -96,5 +96,5 @@ def oas3_openai_text_2_image(text, size_type: str = "1024x1024", openai_api_key= if __name__ == "__main__": initialize_environment() - v = oas3_openai_text_2_image("Panda emoji") + v = oas3_openai_text_to_image("Panda emoji") print(v) diff --git a/tests/metagpt/learn/__init__.py b/tests/metagpt/learn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/metagpt/learn/test_text_to_embedding.py b/tests/metagpt/learn/test_text_to_embedding.py new file mode 100644 index 000000000..c85e5dde8 --- /dev/null +++ b/tests/metagpt/learn/test_text_to_embedding.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2023/8/18 +@Author : mashenquan +@File : test_text_to_embedding.py +@Desc : Unit tests. 
+""" + +import asyncio +from metagpt.tools.openai_text_to_embedding import ResultEmbedding + +from pydantic import BaseModel + +from metagpt.learn.text_to_embedding import text_to_embedding + + +async def mock_text_to_embedding(): + class Input(BaseModel): + input: str + + inputs = [ + {"input": "Panda emoji"} + ] + + for i in inputs: + seed = Input(**i) + data = text_to_embedding(seed.input) + v = ResultEmbedding(**data) + assert len(v.data) > 0 + + +def test_suite(): + loop = asyncio.get_event_loop() + task = loop.create_task(mock_text_to_embedding()) + loop.run_until_complete(task) + + +if __name__ == '__main__': + test_suite() diff --git a/tests/metagpt/learn/test_text_to_image.py b/tests/metagpt/learn/test_text_to_image.py new file mode 100644 index 000000000..545c8a3ef --- /dev/null +++ b/tests/metagpt/learn/test_text_to_image.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2023/8/18 +@Author : mashenquan +@File : test_text_to_image.py +@Desc : Unit tests. +""" +import asyncio +import base64 + +from pydantic import BaseModel + +from metagpt.learn.text_to_image import text_to_image + + +async def mock_text_to_image(): + class Input(BaseModel): + input: str + size_type: str + + inputs = [ + {"input": "Panda emoji", "size_type": "512x512"} + ] + + for i in inputs: + seed = Input(**i) + base64_data = text_to_image(seed.input) + assert base64_data != "" + print(f"{seed.input} -> {base64_data}") + assert base64.b64decode(base64_data, validate=True) + + +def test_suite(): + loop = asyncio.get_event_loop() + task = loop.create_task(mock_text_to_image()) + loop.run_until_complete(task) + + +if __name__ == '__main__': + test_suite() diff --git a/tests/metagpt/learn/test_text_to_speech.py b/tests/metagpt/learn/test_text_to_speech.py new file mode 100644 index 000000000..dbb599e38 --- /dev/null +++ b/tests/metagpt/learn/test_text_to_speech.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2023/8/18 +@Author : mashenquan +@File : test_text_to_speech.py +@Desc 
: Unit tests. +""" +import asyncio +import base64 + +from pydantic import BaseModel + +from metagpt.learn.text_to_speech import text_to_speech + + +async def mock_text_to_speech(): + class Input(BaseModel): + input: str + + inputs = [ + {"input": "Panda emoji"} + ] + + for i in inputs: + seed = Input(**i) + base64_data = text_to_speech(seed.input) + assert base64_data != "" + print(f"{seed.input} -> {base64_data}") + assert base64.b64decode(base64_data, validate=True) + + +def test_suite(): + loop = asyncio.get_event_loop() + task = loop.create_task(mock_text_to_speech()) + loop.run_until_complete(task) + + +if __name__ == '__main__': + test_suite() \ No newline at end of file