1. add vision config in config.yaml

2. add imitate_webpage.py in example
3. update vision.py
2026-07-23 17:01:08 +02:00 · 2024-01-15 11:13:35 +08:00 · 2024-01-15 11:13:35 +08:00 · f45a368be2
commit f45a368be2
parent 40f5d5e40e
3 changed files with 65 additions and 25 deletions
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -86,6 +86,20 @@ TIMEOUT: 60 # Timeout for llm invocation
 #AZURE_TTS_SUBSCRIPTION_KEY: "YOUR_API_KEY"
 #AZURE_TTS_REGION: "eastus"

+#### for OPENAI VISION
+
+OPENAI_VISION_URL: "https://openai-forward.metadl.com/v1"
+OPENAI_VISION_KEY: "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"
+OPENAI_VISION_MODEL: "gpt-4-vision-preview"
+VISION_MAX_TOKENS: 4096
+
+#### for AZURE VISION
+
+#AZURE_VISION_URL: "YOUR_AZURE_ENDPOINT"
+#AZURE_VISION_KEY: "YOUR_API_KEY"
+#AZURE_VISION_REGION: "YOUR_VISION_REGION_NAME"
+#VISION_MAX_TOKENS: 4096
+
 #### for Stable Diffusion
 ## Use SD service, based on https://github.com/AUTOMATIC1111/stable-diffusion-webui
 #SD_URL: "YOUR_SD_URL"
--- a/examples/imitate_webpage.py
+++ b/examples/imitate_webpage.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/15
+@Author  : mannaandpoem
+@File    : imitate_webpage.py
+"""
+from metagpt.roles.code_interpreter import CodeInterpreter
+
+
+async def main():
+    prompt = """This is a URL of webpage: https://cn.bing.com/
+Firstly, utilize Selenium and WebDriver for rendering. 
+Secondly, convert image to a webpage including HTML, CSS and JS in one go. 
+Finally, save webpage in a text file. 
+Note: All required dependencies and environments have been fully installed and configured."""
+    ci = CodeInterpreter(goal=prompt, use_tools=True)
+
+    await ci.run(prompt)
+
+
+if __name__ == '__main__':
+    import asyncio
+
+    asyncio.run(main())
--- a/metagpt/tools/functions/libs/vision.py
+++ b/metagpt/tools/functions/libs/vision.py
@@ -9,39 +9,40 @@ import requests

 import base64

-OPENAI_API_BASE = "..."
-API_KEY = "sk-..."
-MODEL = "..."
-MAX_TOKENS = 4096
+from metagpt.config import CONFIG
+
+OPENAI_API_BASE = CONFIG.OPENAI_VISION_URL
+API_KEY = CONFIG.OPENAI_VISION_KEY
+MODEL = CONFIG.OPENAI_VISION_MODEL
+MAX_TOKENS = CONFIG.VISION_MAX_TOKENS
+
+ANALYZE_LAYOUT_PROMPT = """You are now a UI/UX, please generate layout information for this image:
+
+NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design.
+As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry. """
+
+GENERATE_PROMPT = """You are now a UI/UX and Web Developer. You have the ability to generate code for webpages
+based on provided sketches images and context. 
+Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript.
+
+NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design.
+As the design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry.
+
+Now, please generate the corresponding webpage code including HTML, CSS and JavaScript:"""


 class Vision:
    def __init__(self):
        self.api_key = API_KEY
        self.model = MODEL
-        self.max_tokens = MAX_TOKENS
+        self.max_tokens = 4096

-    def analyze_layout(
-            self,
-            image_path,
-            prompt="You are now a UI/UX, please generate layout information for this image: \n\n"
-                   "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design."
-                   "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it."
-    ):
-        print(f"analyze_layout: {image_path}")
-        return self.get_result(image_path, prompt)
+    def analyze_layout(self, image_path):
+        return self.get_result(image_path, ANALYZE_LAYOUT_PROMPT)

-    def generate_web_pages(
-            self,
-            image_path,
-            prompt="You are now a UI/UX and Web Developer. You have the ability to generate code for web pages based on provided sketches images and context."
-                   "Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript. "
-                   "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design. "
-                   "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it."
-                   "\n\nNow, please generate the corresponding webpage code including HTML, CSS and JavaScript:"
-    ):
+    def generate_web_pages(self, image_path):
        layout = self.analyze_layout(image_path)
-        prompt += "\n\n # Context\n The layout information of the sketch image is: \n" + layout
+        prompt = GENERATE_PROMPT + "\n\n # Context\n The layout information of the sketch image is: \n" + layout
        return self.get_result(image_path, prompt)

    def get_result(self, image_path, prompt):
@@ -78,4 +79,4 @@ class Vision:
 if __name__ == "__main__":
    vision = Vision()
    rsp = vision.generate_web_pages(image_path="./img.png")
-    print(rsp)
+    print(rsp)