Merge branch 'code_intepreter_add_vision' into 'code_intepreter'

add vision tool for code_interpreter See merge request agents/data_agents_opt!47
2026-05-05 22:02:38 +02:00 · 2024-01-17 10:05:48 +00:00 · 2024-01-17 10:05:48 +00:00 · 42a106ca26
commit 42a106ca26
parent 7f0c752f8d 66db86ae2a
7 changed files with 257 additions and 0 deletions
--- a/config/config.yaml
+++ b/config/config.yaml
@ -86,6 +86,11 @@ TIMEOUT: 60 # Timeout for llm invocation
 #AZURE_TTS_SUBSCRIPTION_KEY: "YOUR_API_KEY"
 #AZURE_TTS_REGION: "eastus"

+#### for OPENAI VISION
+
+#OPENAI_VISION_MODEL: "YOUR_VISION_MODEL_NAME"
+#VISION_MAX_TOKENS: 4096
+
 #### for Stable Diffusion
 ## Use SD service, based on https://github.com/AUTOMATIC1111/stable-diffusion-webui
 #SD_URL: "YOUR_SD_URL"
--- a/examples/imitate_webpage.py
+++ b/examples/imitate_webpage.py
@ -0,0 +1,26 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/15
+@Author  : mannaandpoem
+@File    : imitate_webpage.py
+"""
+from metagpt.roles.code_interpreter import CodeInterpreter
+
+
+async def main():
+    """Ask a CodeInterpreter to reproduce a web page from its rendered image.
+
+    Builds a natural-language task around a fixed URL and runs a
+    tool-enabled CodeInterpreter on it.
+    """
+    web_url = 'https://pytorch.org/'
+    # The prompt is sent verbatim to the interpreter, so its wording and
+    # whitespace are part of the program's behavior.
+    prompt = f"""This is a URL of webpage: '{web_url}' .
+Firstly, utilize Selenium and WebDriver for rendering. 
+Secondly, convert image to a webpage including HTML, CSS and JS in one go. 
+Finally, save webpage in a text file. 
+Note: All required dependencies and environments have been fully installed and configured."""
+    # The same text is used both as the role's goal and as the run input.
+    ci = CodeInterpreter(goal=prompt, use_tools=True)
+
+    await ci.run(prompt)
+
+
+# Script entry point: run the async main() to completion on a fresh event loop.
+if __name__ == '__main__':
+    import asyncio
+
+    asyncio.run(main())
--- a/metagpt/prompts/tool_type.py
+++ b/metagpt/prompts/tool_type.py
@ -37,3 +37,9 @@ The current task is about evaluating a model, please note the following:
 - Ensure that the evaluated data is same processed as the training data. If not, remember use object in 'Done Tasks' to transform the data.
 - Use trained model from previous task result directly, do not mock or reload model yourself.
 """
+
+# Usage prompt for tools of the "vision" type. It tells the model to produce
+# the HTML, CSS and JavaScript in one step instead of several separate steps.
+# NOTE(review): the first sentence continues with a lowercase "please" after a
+# period, while the model-evaluation prompt above uses a comma there; confirm
+# whether the wording should be aligned.
+VISION_PROMPT = """
+The current task is about converting image into webpage code. please note the following:
+- Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
+"""
--- a/metagpt/tools/init.py
+++ b/metagpt/tools/init.py
@ -16,6 +16,7 @@ from metagpt.prompts.tool_type import (
    FEATURE_ENGINEERING_PROMPT,
    MODEL_TRAIN_PROMPT,
    MODEL_EVALUATE_PROMPT,
+    VISION_PROMPT
 )


@ -76,6 +77,12 @@ TOOL_TYPE_MAPPINGS = {
        desc="Related to text2image, image2image using stable diffusion model.",
        usage_prompt="",
    ),
+    "vision": ToolType(
+        name="vision",
+        module=str(TOOL_LIBS_PATH / "vision"),
+        desc="Only for converting image into webpage code.",
+        usage_prompt=VISION_PROMPT,
+    ),
    "other": ToolType(
        name="other",
        module="",
--- a/metagpt/tools/functions/libs/vision.py
+++ b/metagpt/tools/functions/libs/vision.py
@ -0,0 +1,129 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/12
+@Author  : mannaandpoem
+@File    : vision.py
+"""
+import base64
+import mimetypes
+import re
+from pathlib import Path
+
+import requests
+
+from metagpt.config import CONFIG
+
+# Endpoint, credentials and model settings, read from CONFIG once when this
+# module is imported.
+# NOTE(review): what CONFIG yields for keys that are not set is not visible
+# here; confirm OPENAI_VISION_MODEL and VISION_MAX_TOKENS are configured.
+OPENAI_API_BASE = CONFIG.OPENAI_BASE_URL
+API_KEY = CONFIG.OPENAI_API_KEY
+MODEL = CONFIG.OPENAI_VISION_MODEL
+MAX_TOKENS = CONFIG.VISION_MAX_TOKENS
+
+# Instruction sent together with the image by Vision.analyze_layout; asks the
+# model for layout information about the image.
+ANALYZE_LAYOUT_PROMPT = """You are now a UI/UX, please generate layout information for this image:
+
+NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design.
+As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry. """
+
+# Instruction used by Vision.generate_web_pages; the layout text returned by
+# the first request is appended to it under a "# Context" heading.
+GENERATE_PROMPT = """You are now a UI/UX and Web Developer. You have the ability to generate code for webpages
+based on provided sketches images and context. 
+Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript.
+
+NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design.
+As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry.
+
+Now, please generate the corresponding webpage code including HTML, CSS and JavaScript:"""
+
+
+class Vision:
+    """Convert a design image into webpage code via an OpenAI-compatible chat endpoint.
+
+    The model is queried twice per image: once for a textual description of
+    the layout, then for the HTML/CSS/JavaScript code with that description
+    supplied as extra context.
+    """
+
+    # Seconds to wait for one HTTP call; without a timeout `requests` can
+    # block forever on a stalled connection.
+    REQUEST_TIMEOUT = 600
+
+    def __init__(self):
+        """Copy the module-level configuration values onto the instance."""
+        self.api_key = API_KEY
+        self.api_base = OPENAI_API_BASE
+        self.model = MODEL
+        self.max_tokens = MAX_TOKENS
+
+    def analyze_layout(self, image_path):
+        """Ask the model to describe the layout of the image at `image_path`.
+
+        Returns:
+            The model's reply text.
+        """
+        return self.get_result(image_path, ANALYZE_LAYOUT_PROMPT)
+
+    def generate_web_pages(self, image_path):
+        """Generate webpage code (HTML, CSS and JavaScript) for the image.
+
+        The layout description from `analyze_layout` is appended to
+        `GENERATE_PROMPT` as context before the second request is made.
+
+        Returns:
+            The raw model reply; `save_webpages` expects fenced code blocks in it.
+        """
+        layout = self.analyze_layout(image_path)
+        prompt = GENERATE_PROMPT + "\n\n # Context\n The layout information of the sketch image is: \n" + layout
+        return self.get_result(image_path, prompt)
+
+    def get_result(self, image_path, prompt):
+        """Send `prompt` together with the image to the chat completions endpoint.
+
+        Args:
+            image_path: Path of the image file, uploaded as a base64 data URL.
+            prompt: Text instruction that accompanies the image.
+
+        Returns:
+            The message content of the first choice in the response.
+
+        Raises:
+            ValueError: If the endpoint answers with a status other than 200.
+            requests.RequestException: On connection errors or when the call
+                takes longer than `REQUEST_TIMEOUT`.
+        """
+        base64_image = self.encode_image(image_path)
+        # Label the data URL with the real image type (e.g. image/png for a
+        # screenshot); keep the former hard-coded JPEG as the fallback.
+        mime_type = mimetypes.guess_type(str(image_path))[0]
+        if not mime_type or not mime_type.startswith("image/"):
+            mime_type = "image/jpeg"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": self.max_tokens,
+        }
+        response = requests.post(
+            f"{self.api_base}/chat/completions",
+            headers=headers,
+            json=payload,
+            timeout=self.REQUEST_TIMEOUT,
+        )
+
+        if response.status_code != 200:
+            raise ValueError(f"Request failed with status {response.status_code}, {response.text}")
+        return response.json()["choices"][0]["message"]["content"]
+
+    @staticmethod
+    def encode_image(image_path):
+        """Return the content of the file at `image_path` as a base64 text string."""
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    @staticmethod
+    def _extract_code_block(text, *languages):
+        """Return the body of the first fenced block tagged with one of `languages`.
+
+        The body runs from just after the tag to the next fence (or to the end
+        of `text` when the fence is never closed). Returns None when no such
+        block exists. The word boundary keeps "js" from matching "json".
+        """
+        tags = "|".join(re.escape(language) for language in languages)
+        match = re.search(rf"```(?:{tags})\b(.*?)(?:```|\Z)", text, re.DOTALL)
+        return match.group(1) if match else None
+
+    @staticmethod
+    def save_webpages(image_path, webpages) -> Path:
+        """Write the code blocks found in `webpages` into a "webpages" folder.
+
+        The folder is created next to the image file. `index.html` is always
+        written; a stylesheet or script file is written only when the html
+        references one of the known file names (styles.css / style.css,
+        scripts.js / script.js).
+
+        Args:
+            image_path: Path of the source image; only its parent directory is used.
+            webpages: Model reply containing fenced html, css and javascript blocks.
+
+        Returns:
+            Path of the folder the files were written to.
+
+        Raises:
+            ValueError: If the html block is missing, or a stylesheet/script is
+                referenced by the html but its code block is missing.
+            FileNotFoundError: If the files cannot be created in the target folder.
+        """
+        # Validate the reply first so a malformed reply leaves nothing on disk.
+        index = Vision._extract_code_block(webpages, "html")
+        if index is None:
+            raise ValueError("No html code found in the result, please check your image and try again.")
+
+        # Each entry: (file names the html may reference, accepted fence tags).
+        asset_kinds = (
+            (("styles.css", "style.css"), ("css",)),
+            (("scripts.js", "script.js"), ("javascript", "js")),
+        )
+        assets = {}
+        for file_names, languages in asset_kinds:
+            referenced = next((name for name in file_names if name in index), None)
+            if referenced is None:
+                continue
+            code = Vision._extract_code_block(webpages, *languages)
+            if code is None:
+                raise ValueError("No css or js code found in the result, please check your image and try again.")
+            assets[referenced] = code
+
+        webpages_path = Path(image_path).parent / "webpages"
+        webpages_path.mkdir(parents=True, exist_ok=True)
+
+        try:
+            # Explicit UTF-8 so non-ASCII page text does not depend on the
+            # platform's default encoding.
+            (webpages_path / "index.html").write_text(index, encoding="utf-8")
+            for file_name, code in assets.items():
+                (webpages_path / file_name).write_text(code, encoding="utf-8")
+        except FileNotFoundError as e:
+            raise FileNotFoundError(f"Cannot save the webpages to {str(webpages_path)}") from e
+
+        return webpages_path
--- a/metagpt/tools/functions/schemas/vision.yml
+++ b/metagpt/tools/functions/schemas/vision.yml
@ -0,0 +1,36 @@
+# Schema describing the Vision class and the methods listed below: each
+# method's description, parameters, required arguments and return value.
+# NOTE(review): in both methods `required` is nested beside `parameters`
+# rather than inside it; confirm this is the level the schema loader expects.
+Vision:
+  type: class
+  description: "Class for generating web pages at once."
+  methods:
+    __init__:
+      description: "Initialize Vision class with default values."
+
+    generate_web_pages:
+      description: "Generate web pages including all code(HTML, CSS and JavaScript) in one go based on the image."
+      parameters:
+        properties:
+          image_path:
+            type: str
+            description: "The path of the image file"
+      required:
+        - image_path
+      returns:
+        type: str
+        description: "Generated webpages content."
+
+    save_webpages:
+      description: "Save webpages including all code(HTML, CSS and JavaScript) at once"
+      parameters:
+        properties:
+          image_path:
+            type: str
+            description: "The path of the image file"
+          webpages:
+            type: str
+            description: "The generated webpages content"
+      required:
+        - image_path
+        - webpages
+      returns:
+        type: Path
+        description: "The path of the saved webpages"
--- a/tests/metagpt/tools/functions/libs/test_vision.py
+++ b/tests/metagpt/tools/functions/libs/test_vision.py
@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/15
+@Author  : mannaandpoem
+@File    : test_vision.py
+"""
+import pytest
+
+from metagpt import logs
+from metagpt.tools.functions.libs.vision import Vision
+
+
+@pytest.fixture
+def mock_webpages():
+    """Return a canned model reply with one fenced html, css and javascript block.
+
+    The html block references ``scripts.js`` and ``styles.css``, the file names
+    ``Vision.save_webpages`` looks for when deciding which assets to write.
+    """
+    return """```html\n<html>\n<script src="scripts.js"></script>
+<link rel="stylesheet" href="styles.css">\n</html>\n```\n
+```css\n.class { ... }\n```\n
+```javascript\nfunction() { ... }\n```\n"""
+
+
+def test_vision_generate_webpages(mocker, mock_webpages):
+    """generate_web_pages should return the model's final reply.
+
+    Only the HTTP layer (``Vision.get_result``) is stubbed, so the real
+    ``generate_web_pages`` logic runs: one request for the layout and one for
+    the code. Patching ``generate_web_pages`` itself would test only the mock.
+    """
+    get_result = mocker.patch(
+        "metagpt.tools.functions.libs.vision.Vision.get_result",
+        return_value=mock_webpages
+    )
+    image_path = "image.png"
+    vision = Vision()
+    rsp = vision.generate_web_pages(image_path=image_path)
+    logs.logger.info(rsp)
+    assert get_result.call_count == 2
+    assert "html" in rsp
+    assert "css" in rsp
+    assert "javascript" in rsp
+
+
+def test_save_webpages(mock_webpages, tmp_path):
+    """save_webpages should write index.html plus the referenced css and js files.
+
+    The image path points into pytest's temporary directory so the test does
+    not leave a ``webpages`` folder in the working directory.
+    """
+    image_path = str(tmp_path / "image.png")
+    webpages_dir = Vision.save_webpages(image_path=image_path, webpages=mock_webpages)
+    logs.logger.info(webpages_dir)
+    assert webpages_dir == tmp_path / "webpages"
+    assert webpages_dir.exists()
+    assert (webpages_dir / "index.html").exists()
+    assert (webpages_dir / "styles.css").exists()
+    assert (webpages_dir / "scripts.js").exists()
+
+