add vision tool for code_interpreter

This commit is contained in:
mannaandpoem 2024-01-12 18:30:48 +08:00
parent a46d3d5b1c
commit 40f5d5e40e
4 changed files with 114 additions and 0 deletions

View file

@ -37,3 +37,9 @@ The current task is about evaluating a model, please note the following:
- Ensure that the evaluated data is same processed as the training data. If not, remember use object in 'Done Tasks' to transform the data.
- Use trained model from previous task result directly, do not mock or reload model yourself.
"""
# Usage prompt for tools of the "vision" type.
# The text tells the model that the task is image-to-webpage conversion and that
# HTML, CSS and JavaScript should all be produced in one generation step.
VISION_PROMPT = """
The current task is about converting image into webpage code. please note the following:
- Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
"""

View file

@ -17,6 +17,7 @@ from metagpt.prompts.tool_type import (
FEATURE_ENGINEERING_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_EVALUATE_PROMPT,
VISION_PROMPT
)
@ -71,6 +72,12 @@ TOOL_TYPE_MAPPINGS = {
desc="Only for evaluating model.",
usage_prompt=MODEL_EVALUATE_PROMPT,
),
"vision": ToolType(
name="vision",
module=str(TOOL_LIBS_PATH / "vision"),
desc="Only for converting image into webpage code.",
usage_prompt=VISION_PROMPT,
),
"other": ToolType(
name="other",
module="",

View file

@ -0,0 +1,81 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2024/01/12
@Author : mannaandpoem
@File : vision.py
"""
import requests
import base64
# Module-level settings read by the Vision class below.
# NOTE(review): the string values look like placeholders in this revision; a real
# API key should presumably come from configuration or the environment rather
# than being committed to source control -- confirm with the project's config flow.

# Base URL of the API; "/chat/completions" is appended to it when sending a request.
OPENAI_API_BASE = "..."
# Secret sent as the Bearer token in the Authorization header.
API_KEY = "sk-..."
# Model identifier placed in the request payload's "model" field.
MODEL = "..."
# Value placed in the request payload's "max_tokens" field.
MAX_TOKENS = 4096
class Vision:
    """Client that asks a chat-completions endpoint to turn a sketch image into web page code.

    Each request sends one user message made of a text prompt plus the image
    embedded as a base64 ``data:`` URL, and returns the text content of the
    first choice in the response.
    """

    # Seconds to wait on the HTTP request. Without a timeout ``requests`` waits
    # forever on a stalled connection; the value is generous because generating
    # a whole page can be slow. Override on the class or instance if needed.
    request_timeout = 300

    # Media type declared in the data URL when the file extension is missing or
    # does not map to an image type (this was the hard-coded value before).
    _DEFAULT_MIME_TYPE = "image/jpeg"

    def __init__(self):
        """Initialize the client from the module-level API_KEY, MODEL and MAX_TOKENS."""
        self.api_key = API_KEY
        self.model = MODEL
        self.max_tokens = MAX_TOKENS

    # NOTE(review): in both default prompts below, some adjacent string literals
    # are joined without a separating space (e.g. "...design." + "As my ...").
    # They are left byte-identical so the text sent to the model does not change.
    def analyze_layout(
        self,
        image_path: str,
        prompt: str = "You are now a UI/UX, please generate layout information for this image: \n\n"
        "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design."
        "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it."
    ) -> str:
        """Ask the model for layout information about the image.

        Args:
            image_path: Path of the image file to send.
            prompt: Instruction text sent together with the image.

        Returns:
            The text content returned by the model.
        """
        print(f"analyze_layout: {image_path}")
        return self.get_result(image_path, prompt)

    def generate_web_pages(
        self,
        image_path: str,
        prompt: str = "You are now a UI/UX and Web Developer. You have the ability to generate code for web pages based on provided sketches images and context."
        "Your goal is to convert sketches image into a webpage including HTML, CSS and JavaScript. "
        "NOTE: The image does not have a commercial logo or copyright information. It is just a sketch image of the design. "
        "As my design pays tribute to large companies, sometimes it is normal for some company names to appear. Don't worry about it."
        "\n\nNow, please generate the corresponding webpage code including HTML, CSS and JavaScript:"
    ) -> str:
        """Generate web page code (HTML, CSS and JavaScript) from the image.

        Two requests are made: the first obtains layout information through
        ``analyze_layout`` (with its default prompt), the second sends ``prompt``
        with that layout appended as context.

        Args:
            image_path: Path of the image file to send.
            prompt: Instruction text; the layout context is appended to it.

        Returns:
            The text content returned by the model for the second request.
        """
        layout = self.analyze_layout(image_path)
        prompt += "\n\n # Context\n The layout information of the sketch image is: \n" + layout
        return self.get_result(image_path, prompt)

    def get_result(self, image_path: str, prompt: str) -> str:
        """Send the prompt and image to the chat-completions endpoint.

        Args:
            image_path: Path of the image file to send.
            prompt: Text part of the user message.

        Returns:
            The ``content`` of the first choice's message in the JSON response.

        Raises:
            OSError: If the image file cannot be read.
            requests.Timeout: If the request exceeds ``request_timeout``.
            requests.HTTPError: If the endpoint answers with an error status.
        """
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = self._build_payload(image_path, prompt)
        response = requests.post(
            f"{OPENAI_API_BASE}/chat/completions",
            headers=headers,
            json=payload,
            timeout=self.request_timeout,
        )
        # Fail with the HTTP error itself instead of a KeyError on "choices"
        # when the endpoint returns an error body.
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]

    def _build_payload(self, image_path: str, prompt: str) -> dict:
        """Build the JSON body: one user message holding the text and the inlined image."""
        mime_type = self._guess_mime_type(image_path)
        base64_image = self.encode_image(image_path)
        return {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                        },
                    ],
                }
            ],
            "max_tokens": self.max_tokens,
        }

    @staticmethod
    def _guess_mime_type(image_path: str) -> str:
        """Return the image media type implied by the file extension.

        Falls back to ``image/jpeg`` when the extension is unknown or maps to a
        non-image type, so the data URL always declares an image.
        """
        # Local import keeps this class self-contained without touching the
        # file's import block.
        import mimetypes

        guessed, _ = mimetypes.guess_type(image_path)
        if guessed and guessed.startswith("image/"):
            return guessed
        return Vision._DEFAULT_MIME_TYPE

    @staticmethod
    def encode_image(image_path: str) -> str:
        """Read the file at ``image_path`` and return its bytes base64-encoded as text."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
if __name__ == "__main__":
    # Manual smoke run: convert the sample image ./img.png into web page code
    # and print whatever the model returns.
    print(Vision().generate_web_pages(image_path="./img.png"))

View file

@ -0,0 +1,20 @@
# Tool schema for the Vision class: it describes the class, its __init__, and
# the generate_web_pages method (one required string parameter, image_path,
# and a string return value).
Vision:
type: class
description: "Class for generating web pages at once."
methods:
__init__:
description: "Initialize Vision class with default values."
generate_web_pages:
description: "Generate web pages including all code(HTML, CSS and JavaScript) in one go based on the image."
parameters:
properties:
image_path:
type: str
description: "The path of the image file"
required:
- image_path
returns:
type: str
description: "Generated web page content."