feat: +pic2txt

2026-06-23 15:48:11 +02:00 · 2024-06-27 22:13:41 +08:00 · 2024-06-27 22:13:41 +08:00 · 632452e2a1
commit 632452e2a1
parent 742ff0e80a
12 changed files with 79 additions and 38 deletions
--- a/metagpt/actions/requirement_analysis/framework/init.py
+++ b/metagpt/actions/requirement_analysis/framework/init.py
@@ -54,7 +54,7 @@ async def save_framework(
    output_dir = (
        Path(output_dir)
        if output_dir
-        else DEFAULT_WORKSPACE_ROOT / (datetime.now().strftime("%Y%m%d%H%M%S") + uuid.uuid4().hex[0:8])
+        else DEFAULT_WORKSPACE_ROOT / (datetime.now().strftime("%Y%m%d%H%M%ST") + uuid.uuid4().hex[0:8])
    )
    output_dir.mkdir(parents=True, exist_ok=True)

--- a/metagpt/actions/requirement_analysis/framework/evaluate_framework.py
+++ b/metagpt/actions/requirement_analysis/framework/evaluate_framework.py
@@ -99,8 +99,8 @@ Parts not mentioned in the "Legacy TRD" will be handled by other TRDs, therefore
 Do the parameters of the interface of the external system used in the code comply with it's specifications in 'Acknowledge'?
 Is there a lack of necessary configuration files?
 Return a markdown JSON object with:
-- a "is_pass" key containing a true boolean value if there is not any issue in the "Legacy Outputs";
 - an "issues" key containing a string list of natural text about the issues that need to addressed, found in the "Legacy Outputs" if any exits, each issue found must provide a detailed description and include reasons;
 - a "conclusion" key containing the evaluation conclusion;
 - a "misalignment" key containing the judgement detail of the natural text string list about the misalignment with "Legacy TRD";
+- a "is_pass" key containing a true boolean value if there is not any issue in the "Legacy Outputs";
 """
--- a/metagpt/actions/requirement_analysis/requirement/pic2txt.py
+++ b/metagpt/actions/requirement_analysis/requirement/pic2txt.py
@@ -5,6 +5,8 @@
@Author  : mashenquan
@File    : pic2txt.py
 """
+import json
+from pathlib import Path
 from typing import List

 from tenacity import retry, stop_after_attempt, wait_random_exponential
@@ -31,23 +33,41 @@ class Pic2Txt(Action):
        evaluation_conclusion: str = "",
        additional_technical_requirements: str = "",
    ) -> str:
-        base64_images = [encode_image(i) for i in image_paths]
+        descriptions = {}
+        for i in image_paths:
+            filename = Path(i)
+            base64_image = encode_image(filename)
+            rsp = await self._pic2txt(
+                "Generate a paragraph of text based on the content of the image, the language of the text is consistent with the language in the image.",
+                base64_image=base64_image,
+            )
+            descriptions[filename.name] = rsp
+
        prompt = PROMPT.format(
            textual_user_requirement=textual_user_requirement,
-            acknowledge=to_markdown_code_block(val=acknowledge),
+            acknowledge=to_markdown_code_block(val=json.dumps(descriptions), type_="json"),
            legacy_output=to_markdown_code_block(val=legacy_output),
            evaluation_conclusion=evaluation_conclusion,
            additional_technical_requirements=to_markdown_code_block(val=additional_technical_requirements),
        )
-        return await self._write(prompt, base64_images=base64_images)
+        return await self._write(prompt)

    @retry(
        wait=wait_random_exponential(min=1, max=20),
        stop=stop_after_attempt(6),
        after=general_after_log(logger),
    )
-    async def _write(self, prompt: str, base64_images: List[str]) -> str:
-        rsp = await self.llm.aask(prompt, images=base64_images)
+    async def _write(self, prompt: str) -> str:
+        rsp = await self.llm.aask(prompt)
+        return rsp
+
+    @retry(
+        wait=wait_random_exponential(min=1, max=20),
+        stop=stop_after_attempt(6),
+        after=general_after_log(logger),
+    )
+    async def _pic2txt(self, prompt: str, base64_image: str) -> str:
+        rsp = await self.llm.aask(prompt, images=base64_image)
        return rsp


@@ -70,9 +90,9 @@ PROMPT = """
 ---
 You are a tool that generates an intact textual user requirements given a few of textual fragments of user requirements and some fragments of UI pictures.
 The content of "Textual User Requirements" provides a few of textual fragments of user requirements;
-The content of "Acknowledge" provides additional information related to the user requirements;
+The content of "Acknowledge" provides the descriptions of pictures used in "Textual User Requirements";
 "Legacy Outputs" contains the intact textual user requirements generated by you last time, which you can improve by addressing the issues raised in "Evaluation Conclusion";
 "Additional Technical Requirements" specifies the additional technical requirements that the generated textual user requirements must meet;
-你需要将图片中的内容转换成文字描述，合并到"Textual User Requirements"，以生成完整的用户需求；
+You need to merge the text content of the corresponding image in the "Acknowledge" into the "Textual User Requirements" to generate a complete, natural and coherent description of the user requirements;
 Return the intact textual user requirements according to the given fragments of the user requirement of "Textual User Requirements" and the UI pictures;
 """
--- a/metagpt/actions/requirement_analysis/trd/evaluate_trd.py
+++ b/metagpt/actions/requirement_analysis/trd/evaluate_trd.py
@@ -107,9 +107,9 @@ If there are interaction events with external systems in "TRD Design", you must
 Does the sequence of steps in "Interaction Events" cause performance or cost issues? Please provide detailed descriptions and reasons;
 It is problematic if the data stream composed of input/output contains passive or irrelevant data;
 Return a markdown JSON object with:
-- a "is_pass" key containing a true boolean value if there is not any issue in the "TRD Design";
 - an "issues" key containing a string list of natural text about the issues that need to be addressed, found in the "TRD Design" if any exist, each issue found must provide a detailed description and include reasons;
 - a "conclusion" key containing the evaluation conclusion;
 - a "correspondence_between" key containing the judgement detail of the natural text string list about the correspondence between "Interaction Events" and "TRD Design" steps;
 - a "misalignment" key containing the judgement detail of the natural text string list about the misalignment with "User Requirements";
+- a "is_pass" key containing a true boolean value if there is not any issue in the "TRD Design";
 """
--- a/metagpt/tools/libs/software_development.py
+++ b/metagpt/tools/libs/software_development.py
@@ -2,6 +2,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import annotations

+import uuid
+from datetime import datetime
 from pathlib import Path
 from typing import Optional

@@ -16,7 +18,7 @@ from metagpt.actions.requirement_analysis.trd import (
    EvaluateTRD,
    WriteTRD,
 )
-from metagpt.const import ASSISTANT_ALIAS, TEST_DATA_PATH
+from metagpt.const import ASSISTANT_ALIAS, DEFAULT_WORKSPACE_ROOT, TEST_DATA_PATH
 from metagpt.context import Context
 from metagpt.logs import ToolLogItem, log_tool_output, logger
 from metagpt.tools.tool_registry import register_tool
@@ -200,6 +202,12 @@ async def write_framework(
    evaluation_conclusion = ""
    acknowledgement = await mock_asearch_acknowledgement(use_case_actors)  # Replaced by acknowledgement_repo later.
    loop_count = 0
+    output_dir = (
+        Path(output_dir)
+        if output_dir
+        else DEFAULT_WORKSPACE_ROOT / (datetime.now().strftime("%Y%m%d%H%M%ST") + uuid.uuid4().hex[0:8])
+    )
+    file_list = []
    while not is_pass and (context.cost_manager.total_cost < context.cost_manager.max_budget):
        try:
            framework = await write_framework.run(
@@ -226,9 +234,9 @@ async def write_framework(
        logger.info(f"Loop {loop_count}")
        if context.cost_manager.total_cost < 1 and loop_count > max_loop:
            break
+        file_list = await save_framework(dir_data=framework, trd=trd, output_dir=output_dir)
+        logger.info(f"Output:\n{file_list}")

-    file_list = await save_framework(dir_data=framework, trd=trd, output_dir=output_dir)
-    logger.info(f"Output:\n{file_list}")
    return "## Software Framework" + "".join([f"\n- {i}" for i in file_list])


@@ -237,13 +245,12 @@ async def write_trd_and_framework(
    use_case_actors: str,
    user_requirements: str,
    additional_technical_requirements: str,
-    investment: float = 17.0,
+    investment: float = 50.0,
    output_dir: Optional[str] = "",
    context: Optional[Context] = None,
 ) -> str:
    context = context or Context(cost_manager=CostManager(max_budget=investment))
    trd = await write_trd(use_case_actors=use_case_actors, user_requirements=user_requirements, context=context)
-
    return await write_framework(
        use_case_actors=use_case_actors,
        trd=trd,