diff --git a/config/config.yaml b/config/config.yaml
index 79ebae863..d8fab693e 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -86,6 +86,11 @@ TIMEOUT: 60 # Timeout for llm invocation
#AZURE_TTS_SUBSCRIPTION_KEY: "YOUR_API_KEY"
#AZURE_TTS_REGION: "eastus"
+#### for OPENAI VISION
+
+#OPENAI_VISION_MODEL: "YOUR_VISION_MODEL_NAME"
+#VISION_MAX_TOKENS: 4096
+
#### for Stable Diffusion
## Use SD service, based on https://github.com/AUTOMATIC1111/stable-diffusion-webui
#SD_URL: "YOUR_SD_URL"
diff --git a/examples/imitate_webpage.py b/examples/imitate_webpage.py
new file mode 100644
index 000000000..6c12c7eda
--- /dev/null
+++ b/examples/imitate_webpage.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time : 2024/01/15
+@Author : mannaandpoem
+@File : imitate_webpage.py
+"""
+import asyncio
+
+from metagpt.roles.code_interpreter import CodeInterpreter
+
+
+async def main():
+    """Run a tool-enabled CodeInterpreter on a requirement to render a web page and rebuild it as code."""
+    web_url = "https://pytorch.org/"
+    steps = (
+        f"This is a URL of webpage: '{web_url}' .",
+        "Firstly, utilize Selenium and WebDriver for rendering.",
+        "Secondly, convert image to a webpage including HTML, CSS and JS in one go.",
+        "Finally, save webpage in a text file.",
+        "Note: All required dependencies and environments have been fully installed and configured.",
+    )
+    requirement = "\n".join(steps)
+    interpreter = CodeInterpreter(goal=requirement, use_tools=True)
+
+    await interpreter.run(requirement)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/metagpt/prompts/tool_type.py b/metagpt/prompts/tool_type.py
index ec848bbe4..43ead78a6 100644
--- a/metagpt/prompts/tool_type.py
+++ b/metagpt/prompts/tool_type.py
@@ -37,3 +37,9 @@ The current task is about evaluating a model, please note the following:
- Ensure that the evaluated data is same processed as the training data. If not, remember use object in 'Done Tasks' to transform the data.
- Use trained model from previous task result directly, do not mock or reload model yourself.
"""
+
+# Usage prompt for the "vision" tool type: converting an image into webpage code
+VISION_PROMPT = """
+The current task is about converting image into webpage code. please note the following:
+- Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
+"""
\ No newline at end of file
diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index 41c8708b2..84b9cbd12 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -16,6 +16,7 @@ from metagpt.prompts.tool_type import (
FEATURE_ENGINEERING_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_EVALUATE_PROMPT,
+ VISION_PROMPT
)
@@ -76,6 +77,12 @@ TOOL_TYPE_MAPPINGS = {
desc="Related to text2image, image2image using stable diffusion model.",
usage_prompt="",
),
+ "vision": ToolType(
+ name="vision",
+ module=str(TOOL_LIBS_PATH / "vision"),
+ desc="Only for converting image into webpage code.",
+ usage_prompt=VISION_PROMPT,
+ ),
"other": ToolType(
name="other",
module="",
diff --git a/metagpt/tools/functions/libs/vision.py b/metagpt/tools/functions/libs/vision.py
new file mode 100644
index 000000000..b10ad7608
--- /dev/null
+++ b/metagpt/tools/functions/libs/vision.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time : 2024/01/12
+@Author : mannaandpoem
+@File : vision.py
+"""
+import base64
+import mimetypes
+import re
+from pathlib import Path
+
+import requests
+
+from metagpt.config import CONFIG
+
+OPENAI_API_BASE = CONFIG.OPENAI_BASE_URL
+API_KEY = CONFIG.OPENAI_API_KEY
+MODEL = CONFIG.OPENAI_VISION_MODEL
+MAX_TOKENS = CONFIG.VISION_MAX_TOKENS
+
+# (connect, read) timeouts in seconds; without them requests.post waits forever on a stalled endpoint.
+REQUEST_TIMEOUT = (10, 300)
+
+ANALYZE_LAYOUT_PROMPT = """You are now a UI/UX, please generate layout information for this image:
+
+NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design.
+As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry. """
+
+GENERATE_PROMPT = """You are now a UI/UX and Web Developer. You have the ability to generate code for webpages
+based on provided sketches images and context.
+Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript.
+
+NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design.
+As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry.
+
+Now, please generate the corresponding webpage code including HTML, CSS and JavaScript:"""
+
+
+class Vision:
+    """Turn a sketch or screenshot image into webpage code through an OpenAI chat-completions endpoint."""
+
+    def __init__(self):
+        """Copy the endpoint, credential, model name and token limit from the module-level config values."""
+        self.api_key = API_KEY
+        self.api_base = OPENAI_API_BASE
+        self.model = MODEL
+        self.max_tokens = MAX_TOKENS
+
+    def analyze_layout(self, image_path):
+        """Return the model's layout description for the image at ``image_path``."""
+        return self.get_result(image_path, ANALYZE_LAYOUT_PROMPT)
+
+    def generate_web_pages(self, image_path):
+        """Return the model's HTML/CSS/JavaScript answer for the image at ``image_path``.
+
+        The layout is analysed first and appended to the generation prompt as context, so two requests are sent.
+        """
+        layout = self.analyze_layout(image_path)
+        prompt = GENERATE_PROMPT + "\n\n # Context\n The layout information of the sketch image is: \n" + layout
+        return self.get_result(image_path, prompt)
+
+    def get_result(self, image_path, prompt):
+        """Send ``prompt`` together with the image to ``/chat/completions`` and return the reply text.
+
+        Raises:
+            ValueError: If the endpoint answers with a status code other than 200.
+            requests.exceptions.Timeout: If the endpoint does not answer within ``REQUEST_TIMEOUT``.
+        """
+        base64_image = self.encode_image(image_path)
+        # Label the data URL with the real image type (e.g. PNG); keep the former JPEG label as the fallback.
+        mime_type = mimetypes.guess_type(str(image_path))[0] or ""
+        if not mime_type.startswith("image/"):
+            mime_type = "image/jpeg"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
+                    ],
+                }
+            ],
+            "max_tokens": self.max_tokens,
+        }
+        # Tolerate a configured base URL that ends with a slash.
+        url = f"{str(self.api_base).rstrip('/')}/chat/completions"
+        response = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
+
+        if response.status_code != 200:
+            raise ValueError(f"Request failed with status {response.status_code}, {response.text}")
+        return response.json()["choices"][0]["message"]["content"]
+
+    @staticmethod
+    def encode_image(image_path):
+        """Return the file at ``image_path`` as a base64-encoded string."""
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    @staticmethod
+    def save_webpages(image_path, webpages) -> Path:
+        """Write the fenced code blocks of ``webpages`` to a ``webpages`` folder next to the image.
+
+        ``index.html`` is always written. A stylesheet or script file is written only when the HTML mentions
+        ``styles.css``/``style.css`` or ``scripts.js``/``script.js``, and it is saved under that file name.
+
+        Returns:
+            The directory that holds the saved files.
+
+        Raises:
+            ValueError: If the HTML block, or a CSS/JS block the HTML refers to, is missing.
+            FileNotFoundError: If a target path does not exist when the files are written.
+        """
+        # Create a "webpages" folder beside the image to hold the HTML, CSS and JS files.
+        webpages_path = Path(image_path).parent / "webpages"
+        webpages_path.mkdir(parents=True, exist_ok=True)
+
+        index = Vision._extract_code(webpages, "html")
+        if index is None:
+            raise ValueError("No html code found in the result, please check your image and try again.")
+        files = {webpages_path / "index.html": index}
+
+        # (file names the HTML may link, fence tags the model may use) for each optional asset.
+        assets = (
+            (("styles.css", "style.css"), ("css",)),
+            (("scripts.js", "script.js"), ("javascript", "js")),
+        )
+        for file_names, fence_tags in assets:
+            file_name = next((name for name in file_names if name in index), None)
+            if file_name is None:
+                continue  # the page does not link this asset, so there is nothing to save
+            code = Vision._extract_code(webpages, *fence_tags)
+            if code is None:
+                raise ValueError("No css or js code found in the result, please check your image and try again.")
+            files[webpages_path / file_name] = code
+
+        try:
+            for path, content in files.items():
+                # Generated pages often contain non-ASCII text; do not depend on the platform default encoding.
+                path.write_text(content, encoding="utf-8")
+        except FileNotFoundError as e:
+            raise FileNotFoundError(f"Cannot save the webpages to {str(webpages_path)}") from e
+
+        return webpages_path
+
+    @staticmethod
+    def _extract_code(text, *fence_tags):
+        """Return the body of the first code fence tagged with one of ``fence_tags``, or None if absent."""
+        tags = "|".join(re.escape(tag) for tag in fence_tags)
+        # ``\b`` stops the ``js`` tag from matching a ``json`` fence; ``\Z`` accepts a reply that was cut off
+        # before its closing fence, as the previous split-based parsing did.
+        match = re.search(rf"```(?:{tags})\b(.*?)(?:```|\Z)", text, flags=re.DOTALL)
+        return match.group(1) if match else None
diff --git a/metagpt/tools/functions/schemas/vision.yml b/metagpt/tools/functions/schemas/vision.yml
new file mode 100644
index 000000000..4cb247419
--- /dev/null
+++ b/metagpt/tools/functions/schemas/vision.yml
@@ -0,0 +1,36 @@
+Vision:
+ type: class
+ description: "Class for generating webpage code (HTML, CSS and JavaScript) from an image in one go."
+ methods:
+ __init__:
+ description: "Initialize Vision class with default values."
+
+ generate_web_pages:
+ description: "Generate web pages including all code (HTML, CSS and JavaScript) in one go based on the image."
+ parameters:
+ properties:
+ image_path:
+ type: str
+ description: "The path of the image file"
+ required:
+ - image_path
+ returns:
+ type: str
+ description: "Generated webpages content."
+
+ save_webpages:
+ description: "Save webpages including all code (HTML, CSS and JavaScript) at once."
+ parameters:
+ properties:
+ image_path:
+ type: str
+ description: "The path of the image file"
+ webpages:
+ type: str
+ description: "The generated webpages content"
+ required:
+ - image_path
+ - webpages
+ returns:
+ type: Path
+ description: "The path of the saved webpages"
\ No newline at end of file
diff --git a/tests/metagpt/tools/functions/libs/test_vision.py b/tests/metagpt/tools/functions/libs/test_vision.py
new file mode 100644
index 000000000..f4f97c46a
--- /dev/null
+++ b/tests/metagpt/tools/functions/libs/test_vision.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time : 2024/01/15
+@Author : mannaandpoem
+@File : test_vision.py
+"""
+import pytest
+
+from metagpt import logs
+from metagpt.tools.functions.libs.vision import Vision
+
+
+@pytest.fixture
+def mock_webpages():
+    """Model reply with one fenced block per language; the HTML links both the stylesheet and the script."""
+    return (
+        '```html\n<html>\n<link rel="stylesheet" href="styles.css">\n'
+        '<script src="scripts.js"></script>\n</html>\n```\n'
+        "```css\n.class { ... }\n```\n"
+        "```javascript\nfunction() { ... }\n```\n"
+    )
+
+
+def test_vision_generate_webpages(mocker, mock_webpages):
+    """generate_web_pages sends a layout request and a generation request and returns the reply."""
+    # Stub only the network call; patching generate_web_pages itself would leave nothing under test.
+    get_result = mocker.patch(
+        "metagpt.tools.functions.libs.vision.Vision.get_result",
+        return_value=mock_webpages,
+    )
+    rsp = Vision().generate_web_pages(image_path="image.png")
+    logs.logger.info(rsp)
+    assert get_result.call_count == 2
+    assert "html" in rsp
+    assert "css" in rsp
+    assert "javascript" in rsp
+
+
+def test_save_webpages(tmp_path, mock_webpages):
+    """save_webpages writes index.html plus the linked stylesheet and script next to the image."""
+    # Use pytest's temporary directory so the run leaves no "webpages" folder in the working tree.
+    image_path = tmp_path / "image.png"
+    webpages_dir = Vision.save_webpages(image_path=image_path, webpages=mock_webpages)
+    logs.logger.info(webpages_dir)
+    assert webpages_dir == tmp_path / "webpages"
+    assert "<html>" in (webpages_dir / "index.html").read_text()
+    assert ".class" in (webpages_dir / "styles.css").read_text()
+    assert "function()" in (webpages_dir / "scripts.js").read_text()