feat: + pic2txt

This commit is contained in:
莘权 马 2024-06-28 13:50:47 +08:00
parent 5f55590a57
commit dcb76de45e
6 changed files with 55 additions and 19 deletions

View file

@ -20,7 +20,7 @@ from metagpt.utils.common import encode_image, general_after_log, to_markdown_co
@register_tool(include_functions=["run"])
class Pic2Txt(Action):
"""Pic2Txt deal with the following situations:
1. Given a picture about the user requirements, write out the textual user requirements.
Given some pictures depicting user requirements alongside contextual description, write out the intact textual user requirements.
"""
async def run(
@ -28,11 +28,36 @@ class Pic2Txt(Action):
*,
image_paths: List[str],
textual_user_requirement: str = "",
acknowledge: str = "",
legacy_output: str = "",
evaluation_conclusion: str = "",
additional_technical_requirements: str = "",
) -> str:
"""
Given some pictures depicting user requirements alongside contextual description, write out the intact textual user requirements.
Args:
image_paths (List[str]): A list of file paths to the input image(s) depicting user requirements.
textual_user_requirement (str, optional): Textual user requirement that accompanies the given images, if any.
legacy_output (str, optional): The intact textual user requirements generated by you last time, if any.
evaluation_conclusion (str, optional): Conclusion or evaluation based on the processed requirements.
additional_technical_requirements (str, optional): Any supplementary technical details relevant to the process.
Returns:
str: Textual representation of user requirements extracted from the provided image(s).
Raises:
ValueError: If image_paths list is empty.
OSError: If there is an issue accessing or reading the image files.
Example:
>>> images = ["requirements/pic/1.png", "requirements/pic/2.png", "requirements/pic/3.png"]
>>> textual_user_requirements = "User requirement paragraph 1 ..., ![](1.png). paragraph 2...![](2.png)..."
>>> action = Pic2Txt()
>>> intact_textual_user_requirements = await action.run(image_paths=images, textual_user_requirement=textual_user_requirements)
>>> print(intact_textual_user_requirements)
"User requirement paragraph 1 ..., ![...](1.png) This picture describes... paragraph 2...![...](2.png)..."
"""
descriptions = {}
for i in image_paths:
filename = Path(i)

View file

@ -40,8 +40,17 @@ from metagpt.utils.token_counter import (
)
@register_provider([LLMType.OPENAI, LLMType.FIREWORKS, LLMType.OPEN_LLM, LLMType.MOONSHOT, LLMType.MISTRAL, LLMType.YI,
LLMType.OPEN_ROUTER])
@register_provider(
[
LLMType.OPENAI,
LLMType.FIREWORKS,
LLMType.OPEN_LLM,
LLMType.MOONSHOT,
LLMType.MISTRAL,
LLMType.YI,
LLMType.OPEN_ROUTER,
]
)
class OpenAILLM(BaseLLM):
"""Check https://platform.openai.com/examples for examples"""

View file

@ -83,7 +83,7 @@ class DataAnalyst(DataInterpreter):
# print(*context, sep="\n" + "*" * 5 + "\n")
async with ThoughtReporter(enable_llm_stream=True):
rsp = await self.llm.aask(context)
self.commands = json.loads(CodeParser.parse_code(block=None, lang='json', text=rsp))
self.commands = json.loads(CodeParser.parse_code(block=None, lang="json", text=rsp))
self.rc.working_memory.add(Message(content=rsp, role="assistant"))
await run_commands(self, self.commands, self.rc.working_memory)

View file

@ -11,7 +11,11 @@ from pydantic import model_validator
from metagpt.actions import Action
from metagpt.actions.di.run_command import RunCommand
from metagpt.logs import logger
from metagpt.prompts.di.role_zero import CMD_PROMPT, ROLE_INSTRUCTION, JSON_REPAIR_PROMPT
from metagpt.prompts.di.role_zero import (
CMD_PROMPT,
JSON_REPAIR_PROMPT,
ROLE_INSTRUCTION,
)
from metagpt.roles import Role
from metagpt.schema import AIMessage, Message, UserMessage
from metagpt.strategy.experience_retriever import DummyExpRetriever, ExpRetriever
@ -21,8 +25,8 @@ from metagpt.tools.libs.editor import Editor
from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
from metagpt.tools.tool_registry import register_tool
from metagpt.utils.common import CodeParser
from metagpt.utils.repair_llm_raw_output import RepairType, repair_llm_raw_output
from metagpt.utils.report import ThoughtReporter
from metagpt.utils.repair_llm_raw_output import repair_llm_raw_output, RepairType
@register_tool(include_functions=["ask_human", "reply_to_human"])
@ -166,7 +170,7 @@ class RoleZero(Role):
try:
commands = CodeParser.parse_code(block=None, lang="json", text=self.command_rsp)
commands = json.loads(repair_llm_raw_output(output=commands, req_keys=[None], repair_type=RepairType.JSON))
except json.JSONDecodeError as e:
except json.JSONDecodeError:
commands = await self.llm.aask(msg=JSON_REPAIR_PROMPT.format(json_data=self.command_rsp))
commands = json.loads(CodeParser.parse_code(block=None, lang="json", text=commands))
except Exception as e:

View file

@ -9,9 +9,10 @@
from metagpt.actions import UserRequirement, WritePRD
from metagpt.actions.prepare_documents import PrepareDocuments
from metagpt.actions.requirement_analysis.requirement.pic2txt import Pic2Txt
from metagpt.roles.di.role_zero import RoleZero
from metagpt.roles.role import RoleReactMode
from metagpt.utils.common import any_to_name, any_to_str
from metagpt.utils.common import any_to_name, any_to_str, tool2name
from metagpt.utils.git_repository import GitRepository
@ -32,9 +33,9 @@ class ProductManager(RoleZero):
constraints: str = "utilize the same language as the user requirements for seamless communication"
todo_action: str = any_to_name(WritePRD)
instruction: str = """Use WritePRD tool to write PRD"""
instruction: str = """Use WritePRD tool to write PRD if a PRD is required; Use `Pic2Txt` tool to write out an intact textual user requirements if an intact textual user requiremnt is required given some images alongside the contextual textual descriptions;"""
max_react_loop: int = 1 # FIXME: Read and edit files requires more steps, consider later
tools: list[str] = ["Editor:write,read,write_content", "RoleZero", "WritePRD"]
tools: list[str] = ["Editor:write,read,write_content", "RoleZero", "WritePRD", Pic2Txt.__name__]
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
@ -47,12 +48,9 @@ class ProductManager(RoleZero):
def _update_tool_execution(self):
wp = WritePRD()
self.tool_execution_map.update(
{
"WritePRD.run": wp.run,
"WritePRD": wp.run, # alias
}
)
self.tool_execution_map.update(tool2name(WritePRD, ["run"], wp.run))
pic2txt = Pic2Txt()
self.tool_execution_map.update(tool2name(Pic2Txt, ["run"], pic2txt.run))
async def _think(self) -> bool:
"""Decide what to do"""

View file

@ -13,11 +13,11 @@ async def test_pic2txt(context):
TEST_DATA_PATH / "requirements/pic/3.png",
]
textual_user_requirements = await aread(filename=TEST_DATA_PATH / "requirements/1.original_requirement.txt")
acknowledge = await aread(filename=TEST_DATA_PATH / "requirements/1.acknowledge.md")
action = Pic2Txt(context=context)
rsp = await action.run(
image_paths=images, textual_user_requirement=textual_user_requirements, acknowledge=acknowledge
image_paths=images,
textual_user_requirement=textual_user_requirements,
)
assert rsp