diff --git a/metagpt/prompts/tool_type.py b/metagpt/prompts/tool_type.py
index ec848bbe4..43ead78a6 100644
--- a/metagpt/prompts/tool_type.py
+++ b/metagpt/prompts/tool_type.py
@@ -37,3 +37,9 @@ The current task is about evaluating a model, please note the following:
 - Ensure that the evaluated data is same processed as the training data. If not, remember use object in 'Done Tasks' to transform the data.
 - Use trained model from previous task result directly, do not mock or reload model yourself.
 """
+
+# Prompt for using tools of "vision" type
+VISION_PROMPT = """
+The current task is about converting image into webpage code. please note the following:
+- Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
+"""
diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index 4b3528795..045ede622 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -17,6 +17,7 @@ from metagpt.prompts.tool_type import (
     FEATURE_ENGINEERING_PROMPT,
     MODEL_TRAIN_PROMPT,
     MODEL_EVALUATE_PROMPT,
+    VISION_PROMPT,
 )
 
 
@@ -71,6 +72,12 @@ TOOL_TYPE_MAPPINGS = {
         desc="Only for evaluating model.",
         usage_prompt=MODEL_EVALUATE_PROMPT,
     ),
+    "vision": ToolType(
+        name="vision",
+        module=str(TOOL_LIBS_PATH / "vision"),
+        desc="Only for converting image into webpage code.",
+        usage_prompt=VISION_PROMPT,
+    ),
     "other": ToolType(
         name="other",
         module="",
diff --git a/metagpt/tools/functions/libs/vision.py b/metagpt/tools/functions/libs/vision.py
new file mode 100644
index 000000000..b653c9300
--- /dev/null
+++ b/metagpt/tools/functions/libs/vision.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time : 2024/01/12
+@Author : mannaandpoem
+@File : vision.py
+"""
+import base64
+import mimetypes
+
+import requests
+
+# NOTE(review): placeholder values; supply real settings through configuration
+# and never commit a live API key.
+OPENAI_API_BASE = "..."
+API_KEY = "sk-..."
+MODEL = "..."
+MAX_TOKENS = 4096
+# Connect/read timeout in seconds, so a stalled connection cannot block the caller forever.
+REQUEST_TIMEOUT = 120
+
+
+class Vision:
+    """Generate webpage code from a sketch image through a chat completions endpoint."""
+
+    def __init__(self):
+        """Copy the module-level API key, model name and token limit onto the instance."""
+        self.api_key = API_KEY
+        self.model = MODEL
+        self.max_tokens = MAX_TOKENS
+
+    def analyze_layout(
+        self,
+        image_path,
+        prompt="You are now a UI/UX, please generate layout information for this image: \n\n"
+        "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design."
+        "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it."
+    ):
+        """Ask the model for the layout information of an image.
+
+        Args:
+            image_path: Path of the image file to analyze.
+            prompt: Instruction text sent together with the image.
+
+        Returns:
+            The text content of the model's reply.
+        """
+        print(f"analyze_layout: {image_path}")
+        return self.get_result(image_path, prompt)
+
+    def generate_web_pages(
+        self,
+        image_path,
+        prompt="You are now a UI/UX and Web Developer. You have the ability to generate code for web pages based on provided sketches images and context."
+        "Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript. "
+        "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design. "
+        "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it."
+        "\n\nNow, please generate the corresponding webpage code including HTML, CSS and JavaScript:"
+    ):
+        """Generate HTML, CSS and JavaScript for the page sketched in an image.
+
+        The layout is analyzed first and appended to ``prompt`` as context.
+
+        Args:
+            image_path: Path of the sketch image file.
+            prompt: Base instruction text for the code generation request.
+
+        Returns:
+            The text content of the model's reply.
+        """
+        layout = self.analyze_layout(image_path)
+        prompt += "\n\n # Context\n The layout information of the sketch image is: \n" + layout
+        return self.get_result(image_path, prompt)
+
+    def get_result(self, image_path, prompt):
+        """Send the prompt and the image in one user message and return the reply text.
+
+        Args:
+            image_path: Path of the image file to attach.
+            prompt: Text part of the user message.
+
+        Returns:
+            The ``content`` of the first choice in the JSON response.
+
+        Raises:
+            requests.HTTPError: If the endpoint answers with an error status.
+            requests.Timeout: If the connection or a read exceeds ``REQUEST_TIMEOUT`` seconds.
+        """
+        base64_image = self.encode_image(image_path)
+        # Label the data URL with the file's actual image type; keep JPEG as the fallback.
+        mime_type = mimetypes.guess_type(str(image_path))[0]
+        if not mime_type or not mime_type.startswith("image/"):
+            mime_type = "image/jpeg"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": self.max_tokens,
+        }
+        response = requests.post(
+            f"{OPENAI_API_BASE}/chat/completions",
+            headers=headers,
+            json=payload,
+            timeout=REQUEST_TIMEOUT,
+        )
+        # Fail with the HTTP error itself rather than a KeyError on an error body.
+        response.raise_for_status()
+        return response.json()["choices"][0]["message"]["content"]
+
+    @staticmethod
+    def encode_image(image_path):
+        """Return the content of the file at ``image_path`` as a base64 string."""
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+if __name__ == "__main__":
+    vision = Vision()
+    rsp = vision.generate_web_pages(image_path="./img.png")
+    print(rsp)
diff --git a/metagpt/tools/functions/schemas/vision.yml b/metagpt/tools/functions/schemas/vision.yml
new file mode 100644
index 000000000..795854e75
--- /dev/null
+++ b/metagpt/tools/functions/schemas/vision.yml
@@ -0,0 +1,20 @@
+Vision:
+  type: class
+  description: "Class for generating web pages at once."
+  methods:
+    __init__:
+      description: "Initialize Vision class with default values."
+
+    generate_web_pages:
+      description: "Generate web pages including all code(HTML, CSS and JavaScript) in one go based on the image."
+      parameters:
+        properties:
+          image_path:
+            type: str
+            description: "The path of the image file"
+
+        required:
+          - image_path
+      returns:
+        type: str
+        description: "Generated web page content."