add vision tool for code_interpreter

2026-07-08 16:12:16 +02:00 · 2024-01-12 18:30:48 +08:00 · 2024-01-12 18:30:48 +08:00 · 40f5d5e40e
commit 40f5d5e40e
parent a46d3d5b1c
4 changed files with 114 additions and 0 deletions
--- a/metagpt/prompts/tool_type.py
+++ b/metagpt/prompts/tool_type.py
@ -37,3 +37,9 @@ The current task is about evaluating a model, please note the following:
 - Ensure that the evaluated data is same processed as the training data. If not, remember use object in 'Done Tasks' to transform the data.
 - Use trained model from previous task result directly, do not mock or reload model yourself.
 """
+
+# Usage prompt for tools of the "vision" type (converting an image into webpage code).
+# It instructs the model to emit HTML, CSS and JavaScript together in a single step.
+VISION_PROMPT = """
+The current task is about converting image into webpage code. please note the following:
+- Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
+"""
--- a/metagpt/tools/init.py
+++ b/metagpt/tools/init.py
@ -17,6 +17,7 @@ from metagpt.prompts.tool_type import (
    FEATURE_ENGINEERING_PROMPT,
    MODEL_TRAIN_PROMPT,
    MODEL_EVALUATE_PROMPT,
+    VISION_PROMPT
 )


@ -71,6 +72,12 @@ TOOL_TYPE_MAPPINGS = {
        desc="Only for evaluating model.",
        usage_prompt=MODEL_EVALUATE_PROMPT,
    ),
+    "vision": ToolType(
+        name="vision",
+        module=str(TOOL_LIBS_PATH / "vision"),
+        desc="Only for converting image into webpage code.",
+        usage_prompt=VISION_PROMPT,
+    ),
    "other": ToolType(
        name="other",
        module="",
--- a/metagpt/tools/functions/libs/vision.py
+++ b/metagpt/tools/functions/libs/vision.py
@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/12
+@Author  : mannaandpoem
+@File    : vision.py
+"""
+import base64
+import mimetypes
+
+import requests
+
+# Settings used by Vision when it posts to the chat completions endpoint.
+# NOTE(review): these values look like placeholders; a real API key should be
+# supplied through configuration or the environment, never committed to the repo.
+OPENAI_API_BASE = "..."  # base URL; "/chat/completions" is appended when posting
+API_KEY = "sk-..."  # sent as the Bearer token in the Authorization header
+MODEL = "..."  # value of "model" in the request payload
+MAX_TOKENS = 4096  # value of "max_tokens" in the request payload
+
+
+class Vision:
+    """Turn a design sketch image into webpage code through a chat completions API.
+
+    Every request posts a text prompt together with the base64-encoded image to
+    ``{OPENAI_API_BASE}/chat/completions`` and returns the text content of the
+    first choice in the response.
+    """
+
+    # (connect, read) timeouts in seconds, so a stalled endpoint cannot block forever.
+    request_timeout = (10, 300)
+
+    def __init__(self):
+        """Initialize the client from the module-level API_KEY, MODEL and MAX_TOKENS."""
+        self.api_key = API_KEY
+        self.model = MODEL
+        self.max_tokens = MAX_TOKENS
+
+    def analyze_layout(
+        self,
+        image_path: str,
+        prompt: str = "You are now a UI/UX, please generate layout information for this image: \n\n"
+        "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design. "
+        "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it.",
+    ) -> str:
+        """Ask the model to describe the layout of the image.
+
+        Args:
+            image_path: Path of the image file to analyze.
+            prompt: Instruction sent alongside the image.
+
+        Returns:
+            The model's layout description.
+        """
+        print(f"analyze_layout: {image_path}")
+        return self.get_result(image_path, prompt)
+
+    def generate_web_pages(
+        self,
+        image_path: str,
+        prompt: str = "You are now a UI/UX and Web Developer. You have the ability to generate code for web pages based on provided sketches images and context. "
+        "Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript. "
+        "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design. "
+        "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it."
+        "\n\nNow, please generate the corresponding webpage code including HTML, CSS and JavaScript:",
+    ) -> str:
+        """Generate webpage code (HTML, CSS and JavaScript) for the image.
+
+        The image is sent twice: first to obtain a layout description, then again
+        with that description appended to ``prompt`` as extra context.
+
+        Args:
+            image_path: Path of the image file to convert.
+            prompt: Instruction sent alongside the image.
+
+        Returns:
+            The generated web page content.
+        """
+        layout = self.analyze_layout(image_path)
+        prompt += "\n\n # Context\n The layout information of the sketch image is: \n" + layout
+        return self.get_result(image_path, prompt)
+
+    def get_result(self, image_path: str, prompt: str) -> str:
+        """Send ``prompt`` plus the image to the chat completions endpoint.
+
+        Args:
+            image_path: Path of the image file to attach.
+            prompt: Text part of the user message.
+
+        Returns:
+            The message content of the first choice.
+
+        Raises:
+            requests.HTTPError: If the endpoint answers with an error status.
+            requests.Timeout: If the endpoint does not answer within ``request_timeout``.
+            RuntimeError: If the response body lacks the expected choices/message shape.
+        """
+        base64_image = self.encode_image(image_path)
+        # Label the data URL with the file's real image type instead of always JPEG.
+        mime_type = self._guess_image_mime(image_path)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+            "max_tokens": self.max_tokens,
+        }
+        response = requests.post(
+            f"{OPENAI_API_BASE}/chat/completions",
+            headers=headers,
+            json=payload,
+            timeout=self.request_timeout,
+        )
+        # Surface HTTP failures directly rather than as a KeyError on the error body.
+        response.raise_for_status()
+        data = response.json()
+        try:
+            return data["choices"][0]["message"]["content"]
+        except (KeyError, IndexError, TypeError) as err:
+            raise RuntimeError(f"Unexpected chat completions response: {repr(data)[:500]}") from err
+
+    @staticmethod
+    def _guess_image_mime(image_path) -> str:
+        """Return the image MIME type implied by the file name, defaulting to JPEG."""
+        guessed, _ = mimetypes.guess_type(str(image_path))
+        if guessed and guessed.startswith("image/"):
+            return guessed
+        return "image/jpeg"
+
+    @staticmethod
+    def encode_image(image_path) -> str:
+        """Read the file at ``image_path`` and return its bytes as a base64 string."""
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+
+if __name__ == "__main__":
+    # Manual smoke run: convert the sample sketch and print the generated page code.
+    print(Vision().generate_web_pages(image_path="./img.png"))
--- a/metagpt/tools/functions/schemas/vision.yml
+++ b/metagpt/tools/functions/schemas/vision.yml
@ -0,0 +1,20 @@
+# Tool schema for the Vision class: only generate_web_pages is exposed as a method
+# with parameters; image_path is its single, required argument.
+Vision:
+  type: class
+  description: "Class for generating web pages at once."
+  methods:
+    __init__:
+      description: "Initialize Vision class with default values."
+
+    generate_web_pages:
+      description: "Generate web pages including all code(HTML, CSS and JavaScript) in one go based on the image."
+      parameters:
+        properties:
+          image_path:
+            type: str
+            description: "The path of the image file"
+        # `required` belongs next to `properties`, inside `parameters`.
+        required:
+          - image_path
+      returns:
+        type: str
+        description: "Generated web page content."