diff --git a/config/config.yaml b/config/config.yaml
index 79ebae863..d8fab693e 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -86,6 +86,11 @@ TIMEOUT: 60 # Timeout for llm invocation
 #AZURE_TTS_SUBSCRIPTION_KEY: "YOUR_API_KEY"
 #AZURE_TTS_REGION: "eastus"
 
+#### for OPENAI VISION
+
+#OPENAI_VISION_MODEL: "YOUR_VISION_MODEL_NAME"
+#VISION_MAX_TOKENS: 4096
+
 #### for Stable Diffusion
 ## Use SD service, based on https://github.com/AUTOMATIC1111/stable-diffusion-webui
 #SD_URL: "YOUR_SD_URL"
diff --git a/examples/imitate_webpage.py b/examples/imitate_webpage.py
new file mode 100644
index 000000000..6c12c7eda
--- /dev/null
+++ b/examples/imitate_webpage.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/15
+@Author  : mannaandpoem
+@File    : imitate_webpage.py
+"""
+import asyncio
+
+from metagpt.roles.code_interpreter import CodeInterpreter
+
+
+async def main(web_url: str = "https://pytorch.org/") -> None:
+    """Ask a CodeInterpreter to render a webpage and rebuild it from the image.
+
+    Args:
+        web_url: Address of the webpage to imitate.
+    """
+    prompt = f"""This is a URL of webpage: '{web_url}' .
+Firstly, utilize Selenium and WebDriver for rendering.
+Secondly, convert image to a webpage including HTML, CSS and JS in one go.
+Finally, save webpage in a text file.
+Note: All required dependencies and environments have been fully installed and configured."""
+    # The same text is the role's goal and the message that starts the run.
+    ci = CodeInterpreter(goal=prompt, use_tools=True)
+
+    await ci.run(prompt)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/metagpt/prompts/tool_type.py b/metagpt/prompts/tool_type.py
index ec848bbe4..43ead78a6 100644
--- a/metagpt/prompts/tool_type.py
+++ b/metagpt/prompts/tool_type.py
@@ -37,3 +37,9 @@ The current task is about evaluating a model, please note the following:
 - Ensure that the evaluated data is same processed as the training data. If not, remember use object in 'Done Tasks' to transform the data.
 - Use trained model from previous task result directly, do not mock or reload model yourself.
""" + +# Prompt for using tools of "vision" type +VISION_PROMPT = """ +The current task is about converting image into webpage code. please note the following: +- Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow. +""" \ No newline at end of file diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py index 41c8708b2..84b9cbd12 100644 --- a/metagpt/tools/__init__.py +++ b/metagpt/tools/__init__.py @@ -16,6 +16,7 @@ from metagpt.prompts.tool_type import ( FEATURE_ENGINEERING_PROMPT, MODEL_TRAIN_PROMPT, MODEL_EVALUATE_PROMPT, + VISION_PROMPT ) @@ -76,6 +77,12 @@ TOOL_TYPE_MAPPINGS = { desc="Related to text2image, image2image using stable diffusion model.", usage_prompt="", ), + "vision": ToolType( + name="vision", + module=str(TOOL_LIBS_PATH / "vision"), + desc="Only for converting image into webpage code.", + usage_prompt=VISION_PROMPT, + ), "other": ToolType( name="other", module="", diff --git a/metagpt/tools/functions/libs/vision.py b/metagpt/tools/functions/libs/vision.py new file mode 100644 index 000000000..b10ad7608 --- /dev/null +++ b/metagpt/tools/functions/libs/vision.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +@Time : 2024/01/12 +@Author : mannaandpoem +@File : vision.py +""" +from pathlib import Path + +import requests + +import base64 + +from metagpt.config import CONFIG + +OPENAI_API_BASE = CONFIG.OPENAI_BASE_URL +API_KEY = CONFIG.OPENAI_API_KEY +MODEL = CONFIG.OPENAI_VISION_MODEL +MAX_TOKENS = CONFIG.VISION_MAX_TOKENS + +ANALYZE_LAYOUT_PROMPT = """You are now a UI/UX, please generate layout information for this image: + +NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design. 
+As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry. """ + +GENERATE_PROMPT = """You are now a UI/UX and Web Developer. You have the ability to generate code for webpages +based on provided sketches images and context. +Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript. + +NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design. +As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry. + +Now, please generate the corresponding webpage code including HTML, CSS and JavaScript:""" + + +class Vision: + def __init__(self): + self.api_key = API_KEY + self.api_base = OPENAI_API_BASE + self.model = MODEL + self.max_tokens = MAX_TOKENS + + def analyze_layout(self, image_path): + return self.get_result(image_path, ANALYZE_LAYOUT_PROMPT) + + def generate_web_pages(self, image_path): + layout = self.analyze_layout(image_path) + prompt = GENERATE_PROMPT + "\n\n # Context\n The layout information of the sketch image is: \n" + layout + result = self.get_result(image_path, prompt) + return result + + def get_result(self, image_path, prompt): + base64_image = self.encode_image(image_path) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}" + } + payload = { + "model": self.model, + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"} + } + ] + } + ], + "max_tokens": self.max_tokens, + } + response = requests.post(f"{self.api_base}/chat/completions", headers=headers, json=payload) + + if response.status_code != 200: + raise ValueError(f"Request failed with status {response.status_code}, {response.text}") + else: + return response.json()["choices"][0]["message"]["content"] + + @staticmethod + def 
encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + @staticmethod + def save_webpages(image_path, webpages) -> Path: + # 在当前目录下创建一个名为webpages的文件夹,用于存储html、css和js文件 + webpages_path = Path(image_path).parent / "webpages" + webpages_path.mkdir(exist_ok=True) + + try: + index_path = webpages_path / "index.html" + index = webpages.split("```html")[1].split("```")[0] + except IndexError: + raise ValueError("No html code found in the result, please check your image and try again.") + + try: + if "styles.css" in index: + style_path = webpages_path / "styles.css" + elif "style.css" in index: + style_path = webpages_path / "style.css" + else: + style_path = None + style = webpages.split("```css")[1].split("```")[0] if style_path else "" + + if "scripts.js" in index: + js_path = webpages_path / "scripts.js" + elif "script.js" in index: + js_path = webpages_path / "script.js" + else: + js_path = None + js = webpages.split("```javascript")[1].split("```")[0] if js_path else "" + except IndexError: + raise ValueError("No css or js code found in the result, please check your image and try again.") + + try: + with open(index_path, "w") as f: + f.write(index) + if style_path: + with open(style_path, "w") as f: + f.write(style) + if js_path: + with open(js_path, "w") as f: + f.write(js) + except FileNotFoundError as e: + raise FileNotFoundError(f"Cannot save the webpages to {str(webpages_path)}") from e + + return webpages_path diff --git a/metagpt/tools/functions/schemas/vision.yml b/metagpt/tools/functions/schemas/vision.yml new file mode 100644 index 000000000..4cb247419 --- /dev/null +++ b/metagpt/tools/functions/schemas/vision.yml @@ -0,0 +1,36 @@ +Vision: + type: class + description: "Class for generating web pages at once." + methods: + __init__: + description: "Initialize Vision class with default values." 
+
+    generate_web_pages:
+      description: "Generate web pages including all code(HTML, CSS and JavaScript) in one go based on the image."
+      parameters:
+        properties:
+          image_path:
+            type: str
+            description: "The path of the image file"
+        required:
+          - image_path
+      returns:
+        type: str
+        description: "Generated webpages content."
+
+    save_webpages:
+      description: "Save webpages including all code(HTML, CSS and JavaScript) at once"
+      parameters:
+        properties:
+          image_path:
+            type: str
+            description: "The path of the image file"
+          webpages:
+            type: str
+            description: "The generated webpages content"
+        required:
+          - image_path
+          - webpages
+      returns:
+        type: Path
+        description: "The path of the saved webpages"
diff --git a/tests/metagpt/tools/functions/libs/test_vision.py b/tests/metagpt/tools/functions/libs/test_vision.py
new file mode 100644
index 000000000..f4f97c46a
--- /dev/null
+++ b/tests/metagpt/tools/functions/libs/test_vision.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/15
+@Author  : mannaandpoem
+@File    : test_vision.py
+"""
+import pytest
+
+from metagpt import logs
+from metagpt.tools.functions.libs.vision import Vision
+
+
+@pytest.fixture
+def mock_webpages():
+    """Canned model reply with one html, one css and one javascript block."""
+    return (
+        '```html\n<html>\n<link rel="stylesheet" href="styles.css">\n'
+        '<script src="scripts.js"></script>\n</html>\n```\n'
+        "```css\n.class { ... }\n```\n"
+        "```javascript\nfunction() { ... }\n```\n"
+    )
+
+
+def test_vision_generate_webpages(mocker, mock_webpages):
+    # Patch only the HTTP layer so the two-step prompt flow itself is exercised.
+    get_result = mocker.patch(
+        "metagpt.tools.functions.libs.vision.Vision.get_result",
+        return_value=mock_webpages,
+    )
+    rsp = Vision().generate_web_pages(image_path="image.png")
+    logs.logger.info(rsp)
+    assert get_result.call_count == 2
+    assert "html" in rsp
+    assert "css" in rsp
+    assert "javascript" in rsp
+
+
+def test_save_webpages(tmp_path, mock_webpages):
+    # Write below pytest's tmp_path so the working tree stays clean.
+    webpages_dir = Vision.save_webpages(image_path=tmp_path / "image.png", webpages=mock_webpages)
+    logs.logger.info(webpages_dir)
+    assert (webpages_dir / "index.html").exists()
+    assert ".class" in (webpages_dir / "styles.css").read_text(encoding="utf-8")
+    assert "function()" in (webpages_dir / "scripts.js").read_text(encoding="utf-8")