feat: + pic2txt

2026-05-04 13:22:39 +02:00 · 2024-06-28 13:50:47 +08:00 · 2024-06-28 13:50:47 +08:00 · dcb76de45e
commit dcb76de45e
parent 5f55590a57
6 changed files with 55 additions and 19 deletions
--- a/metagpt/actions/requirement_analysis/requirement/pic2txt.py
+++ b/metagpt/actions/requirement_analysis/requirement/pic2txt.py
@ -20,7 +20,7 @@ from metagpt.utils.common import encode_image, general_after_log, to_markdown_co
@register_tool(include_functions=["run"])
 class Pic2Txt(Action):
    """Pic2Txt deal with the following situations:
-    1. Given a picture about the user requirements, write out the textual user requirements.
+    Given some pictures depicting user requirements alongside contextual description, write out the intact textual user requirements.
    """

    async def run(
@ -28,11 +28,36 @@ class Pic2Txt(Action):
        *,
        image_paths: List[str],
        textual_user_requirement: str = "",
-        acknowledge: str = "",
        legacy_output: str = "",
        evaluation_conclusion: str = "",
        additional_technical_requirements: str = "",
    ) -> str:
+        """
+        Given some pictures depicting user requirements alongside contextual description, write out the intact textual user requirements
+
+        Args:
+            image_paths (List[str]): A list of file paths to the input image(s) depicting user requirements.
+            textual_user_requirement (str, optional): Textual user requirement that accompanies the given images, if any.
+            legacy_output (str, optional): The intact textual user requirements generated by you last time, if any.
+            evaluation_conclusion (str, optional): Conclusion or evaluation based on the processed requirements.
+            additional_technical_requirements (str, optional): Any supplementary technical details relevant to the process.
+
+        Returns:
+            str: Textual representation of user requirements extracted from the provided image(s).
+
+        Raises:
+            ValueError: If image_paths list is empty.
+            OSError: If there is an issue accessing or reading the image files.
+
+        Example:
+            >>> images = ["requirements/pic/1.png", "requirements/pic/2.png", "requirements/pic/3.png"]
+            >>> textual_user_requirements = "User requirement paragraph 1 ..., ![](1.png). paragraph 2...![](2.png)..."
+            >>> action = Pic2Txt()
+            >>> intact_textual_user_requirements = await action.run(image_paths=images, textual_user_requirement=textual_user_requirements)
+            >>> print(intact_textual_user_requirements)
+            "User requirement paragraph 1 ..., ![...](1.png) This picture describes... paragraph 2...![...](2.png)..."
+
+        """
        descriptions = {}
        for i in image_paths:
            filename = Path(i)
--- a/metagpt/provider/openai_api.py
+++ b/metagpt/provider/openai_api.py
@ -40,8 +40,17 @@ from metagpt.utils.token_counter import (
 )


-@register_provider([LLMType.OPENAI, LLMType.FIREWORKS, LLMType.OPEN_LLM, LLMType.MOONSHOT, LLMType.MISTRAL, LLMType.YI,
-                    LLMType.OPEN_ROUTER])
+@register_provider(
+    [
+        LLMType.OPENAI,
+        LLMType.FIREWORKS,
+        LLMType.OPEN_LLM,
+        LLMType.MOONSHOT,
+        LLMType.MISTRAL,
+        LLMType.YI,
+        LLMType.OPEN_ROUTER,
+    ]
+)
 class OpenAILLM(BaseLLM):
    """Check https://platform.openai.com/examples for examples"""

--- a/metagpt/roles/di/data_analyst.py
+++ b/metagpt/roles/di/data_analyst.py
@ -83,7 +83,7 @@ class DataAnalyst(DataInterpreter):
        # print(*context, sep="\n" + "*" * 5 + "\n")
        async with ThoughtReporter(enable_llm_stream=True):
            rsp = await self.llm.aask(context)
-        self.commands = json.loads(CodeParser.parse_code(block=None, lang='json', text=rsp))
+        self.commands = json.loads(CodeParser.parse_code(block=None, lang="json", text=rsp))
        self.rc.working_memory.add(Message(content=rsp, role="assistant"))

        await run_commands(self, self.commands, self.rc.working_memory)
--- a/metagpt/roles/di/role_zero.py
+++ b/metagpt/roles/di/role_zero.py
@ -11,7 +11,11 @@ from pydantic import model_validator
 from metagpt.actions import Action
 from metagpt.actions.di.run_command import RunCommand
 from metagpt.logs import logger
-from metagpt.prompts.di.role_zero import CMD_PROMPT, ROLE_INSTRUCTION, JSON_REPAIR_PROMPT
+from metagpt.prompts.di.role_zero import (
+    CMD_PROMPT,
+    JSON_REPAIR_PROMPT,
+    ROLE_INSTRUCTION,
+)
 from metagpt.roles import Role
 from metagpt.schema import AIMessage, Message, UserMessage
 from metagpt.strategy.experience_retriever import DummyExpRetriever, ExpRetriever
@ -21,8 +25,8 @@ from metagpt.tools.libs.editor import Editor
 from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
 from metagpt.tools.tool_registry import register_tool
 from metagpt.utils.common import CodeParser
+from metagpt.utils.repair_llm_raw_output import RepairType, repair_llm_raw_output
 from metagpt.utils.report import ThoughtReporter
-from metagpt.utils.repair_llm_raw_output import repair_llm_raw_output, RepairType


@register_tool(include_functions=["ask_human", "reply_to_human"])
@ -166,7 +170,7 @@ class RoleZero(Role):
        try:
            commands = CodeParser.parse_code(block=None, lang="json", text=self.command_rsp)
            commands = json.loads(repair_llm_raw_output(output=commands, req_keys=[None], repair_type=RepairType.JSON))
-        except json.JSONDecodeError as e:
+        except json.JSONDecodeError:
            commands = await self.llm.aask(msg=JSON_REPAIR_PROMPT.format(json_data=self.command_rsp))
            commands = json.loads(CodeParser.parse_code(block=None, lang="json", text=commands))
        except Exception as e:
--- a/metagpt/roles/product_manager.py
+++ b/metagpt/roles/product_manager.py
@ -9,9 +9,10 @@

 from metagpt.actions import UserRequirement, WritePRD
 from metagpt.actions.prepare_documents import PrepareDocuments
+from metagpt.actions.requirement_analysis.requirement.pic2txt import Pic2Txt
 from metagpt.roles.di.role_zero import RoleZero
 from metagpt.roles.role import RoleReactMode
-from metagpt.utils.common import any_to_name, any_to_str
+from metagpt.utils.common import any_to_name, any_to_str, tool2name
 from metagpt.utils.git_repository import GitRepository


@ -32,9 +33,9 @@ class ProductManager(RoleZero):
    constraints: str = "utilize the same language as the user requirements for seamless communication"
    todo_action: str = any_to_name(WritePRD)

-    instruction: str = """Use WritePRD tool to write PRD"""
+    instruction: str = """Use WritePRD tool to write PRD if a PRD is required; Use `Pic2Txt` tool to write out an intact textual user requirements if an intact textual user requiremnt is required given some images alongside the contextual textual descriptions;"""
    max_react_loop: int = 1  # FIXME: Read and edit files requires more steps, consider later
-    tools: list[str] = ["Editor:write,read,write_content", "RoleZero", "WritePRD"]
+    tools: list[str] = ["Editor:write,read,write_content", "RoleZero", "WritePRD", Pic2Txt.__name__]

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
@ -47,12 +48,9 @@ class ProductManager(RoleZero):

    def _update_tool_execution(self):
        wp = WritePRD()
-        self.tool_execution_map.update(
-            {
-                "WritePRD.run": wp.run,
-                "WritePRD": wp.run,  # alias
-            }
-        )
+        self.tool_execution_map.update(tool2name(WritePRD, ["run"], wp.run))
+        pic2txt = Pic2Txt()
+        self.tool_execution_map.update(tool2name(Pic2Txt, ["run"], pic2txt.run))

    async def _think(self) -> bool:
        """Decide what to do"""
--- a/tests/metagpt/actions/requirement_analysis/requirement/test_pic2txt.py
+++ b/tests/metagpt/actions/requirement_analysis/requirement/test_pic2txt.py
@ -13,11 +13,11 @@ async def test_pic2txt(context):
        TEST_DATA_PATH / "requirements/pic/3.png",
    ]
    textual_user_requirements = await aread(filename=TEST_DATA_PATH / "requirements/1.original_requirement.txt")
-    acknowledge = await aread(filename=TEST_DATA_PATH / "requirements/1.acknowledge.md")

    action = Pic2Txt(context=context)
    rsp = await action.run(
-        image_paths=images, textual_user_requirement=textual_user_requirements, acknowledge=acknowledge
+        image_paths=images,
+        textual_user_requirement=textual_user_requirements,
    )
    assert rsp