feat: +pic2txt

This commit is contained in:
莘权 马 2024-06-27 14:35:05 +08:00
parent 9dc8d7307b
commit 742ff0e80a
11 changed files with 137 additions and 1 deletions

View file

@ -0,0 +1,78 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2024/6/27
@Author : mashenquan
@File : pic2txt.py
"""
from typing import List
from tenacity import retry, stop_after_attempt, wait_random_exponential
from metagpt.actions import Action
from metagpt.logs import logger
from metagpt.tools.tool_registry import register_tool
from metagpt.utils.common import encode_image, general_after_log, to_markdown_code_block
@register_tool(include_functions=["run"])
class Pic2Txt(Action):
    """Write out textual user requirements from pictures of them.

    Pic2Txt deals with the following situation:
    1. Given a picture about the user requirements, write out the textual user requirements.
    """

    async def run(
        self,
        *,
        image_paths: List[str],
        textual_user_requirement: str = "",
        acknowledge: str = "",
        legacy_output: str = "",
        evaluation_conclusion: str = "",
        additional_technical_requirements: str = "",
    ) -> str:
        """Fill in ``PROMPT`` and ask the LLM for the intact textual user requirements.

        All arguments are keyword-only.

        Args:
            image_paths: Pictures to send along with the prompt; each entry is
                passed through ``encode_image`` first.
            textual_user_requirement: Textual fragments of the user requirements.
            acknowledge: Additional information related to the user requirements.
            legacy_output: The textual user requirements generated last time,
                to be improved.
            evaluation_conclusion: The issues raised about ``legacy_output``.
            additional_technical_requirements: Technical requirements the
                generated text must meet.

        Returns:
            The reply of the LLM.
        """
        # Encode the pictures before building the prompt, as the images are
        # the first thing that can fail.
        encoded_images = [encode_image(path) for path in image_paths]

        # Three of the sections are wrapped by ``to_markdown_code_block``;
        # the other two are inserted verbatim.
        sections = {
            "textual_user_requirement": textual_user_requirement,
            "acknowledge": to_markdown_code_block(val=acknowledge),
            "legacy_output": to_markdown_code_block(val=legacy_output),
            "evaluation_conclusion": evaluation_conclusion,
            "additional_technical_requirements": to_markdown_code_block(val=additional_technical_requirements),
        }
        filled_prompt = PROMPT.format(**sections)
        return await self._write(filled_prompt, base64_images=encoded_images)

    @retry(
        stop=stop_after_attempt(6),
        wait=wait_random_exponential(min=1, max=20),
        after=general_after_log(logger),
    )
    async def _write(self, prompt: str, base64_images: List[str]) -> str:
        """Send the prompt together with the encoded images to the LLM.

        The call is wrapped by tenacity: at most 6 attempts, with a random
        exponential wait configured with ``min=1`` and ``max=20`` between
        them; ``general_after_log(logger)`` runs after each attempt.
        """
        return await self.llm.aask(prompt, images=base64_images)
# Prompt template filled in by `Pic2Txt.run` via `str.format`.
# Placeholders: {textual_user_requirement}, {acknowledge}, {legacy_output},
# {evaluation_conclusion}, {additional_technical_requirements}.
# The sections above the `---` separator carry the inputs; the text below it
# is the instruction to the model.
# NOTE(review): one instruction line is written in Chinese. In English it
# reads: "You need to convert the content of the pictures into a textual
# description and merge it into "Textual User Requirements" to generate the
# complete user requirements."
PROMPT = """
## Textual User Requirements
{textual_user_requirement}
## Acknowledge
{acknowledge}
## Legacy Outputs
{legacy_output}
## Evaluation Conclusion
{evaluation_conclusion}
## Additional Technical Requirements
{additional_technical_requirements}
---
You are a tool that generates an intact textual user requirements given a few of textual fragments of user requirements and some fragments of UI pictures.
The content of "Textual User Requirements" provides a few of textual fragments of user requirements;
The content of "Acknowledge" provides additional information related to the user requirements;
"Legacy Outputs" contains the intact textual user requirements generated by you last time, which you can improve by addressing the issues raised in "Evaluation Conclusion";
"Additional Technical Requirements" specifies the additional technical requirements that the generated textual user requirements must meet;
你需要将图片中的内容转换成文字描述合并到"Textual User Requirements"以生成完整的用户需求
Return the intact textual user requirements according to the given fragments of the user requirement of "Textual User Requirements" and the UI pictures;
"""

View file

@ -65,7 +65,7 @@ class BaseLLM(ABC):
# image url or image base64
url = image if image.startswith("http") else f"data:image/jpeg;base64,{image}"
# it can with multiple-image inputs
content.append({"type": "image_url", "image_url": url})
content.append({"type": "image_url", "image_url": {"url": url}})
return {"role": "user", "content": content}
def _assistant_msg(self, msg: str) -> dict[str, str]: