Add parse_record & parse_record_an, add RecordLogItem & modify ActionOp

RecordLogItem is the same as ReflectLogItem
2026-05-21 14:05:17 +02:00 · 2024-01-29 19:45:20 +08:00 · 2024-01-29 19:45:20 +08:00 · 3659bfcf79
commit 3659bfcf79
parent bf7ade039d
4 changed files with 157 additions and 7 deletions
--- a/examples/andriod_assistant/actions/manual_record.py
+++ b/examples/andriod_assistant/actions/manual_record.py
@ -24,8 +24,7 @@ class ManualRecord(Action):
    name: str = "ManualRecord"

    async def run(
-            self, demo_name: str, task_desc: str, task_dir: Path, env: AndroidEnv,
-            grid_on: bool = False
+            self, demo_name: str, task_dir: Path, env: AndroidEnv
    ):

        # Question 这里是将通过ADB获取的东西存到本地的路径的吧
@ -41,10 +40,10 @@ class ManualRecord(Action):
            # TODO exit
            return
        step = 0
-        # Question 直接使用 OS 构建路径合适吗？
-        record_path = os.path.join(task_dir, "record.txt")
+        record_path = Path(task_dir) / "record.txt"
        record_file = open(record_path, "w")
        while True:
+            # TODO: could the step needed by ParseRecord be obtained from this function and passed along as a parameter?
            step += 1
            clickable_list = []
            focusable_list = []
--- a/examples/andriod_assistant/actions/parse_record.py
+++ b/examples/andriod_assistant/actions/parse_record.py
@ -3,12 +3,120 @@
 # @Desc   : parse record to generate learned standard operations in stage=learn & mode=manual,
 #           LIKE scripts/document_generation.py

-from examples.andriod_assistant.prompts.operation_prompt import *
+import re
+import ast
+import json
+import time
+from pathlib import Path
+
+from examples.andriod_assistant.prompts.operation_prompt import (
+    tap_doc_template,
+    text_doc_template,
+    long_press_doc_template,
+    swipe_doc_template,
+    refine_doc_suffix
+)
+from examples.andriod_assistant.utils.schema import RecordLogItem, RunState, ActionOp, \
+    SwipeOp, AndroidActionOutput
+from examples.andriod_assistant.actions.parse_record_an import RECORD_PARSE_NODE
+from metagpt.config2 import config
+from metagpt.environment.android_env.android_env import AndroidEnv
+from metagpt.utils.common import encode_image
+from metagpt.logs import logger
 from metagpt.actions.action import Action


 class ParseRecord(Action):
    name: str = "ParseRecord"

-    async def run(self):
-        pass
+    async def run(
+            self, app_name: str, demo_name: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
+    ):
+        doc_count = 0
+        record_path = Path(task_dir) / "record.txt"
+
+        with open(record_path, "r") as record_file:
+            record_step_count = len(record_file.readlines()) - 1
+            record_file.seek(0)
+            for step in range(1, record_step_count + 1):
+                img_before_base64 = encode_image(task_dir.joinpath(f"{task_dir}_{step}_labeled.png"))
+                img_after_base64 = encode_image(task_dir.joinpath(f"{task_dir}_{step + 1}_labeled.png"))
+                rec = record_file.readline().strip()
+                action, resource_id = rec.split(":::")
+                action_type = action.split("(")[0]
+                # Build the prompt
+                action_param = re.findall(r"\((.*?)\)", action)[0]
+                if action_type == ActionOp.TAP.value:
+                    prompt_template = tap_doc_template
+                    context = prompt_template.format(ui_element=action_param)
+                elif action_type == ActionOp.TEXT.value:
+                    input_area, input_text = action_param.split(":sep:")
+                    prompt_template = text_doc_template
+                    context = prompt_template.format(ui_element=input_area)
+                elif action_type == ActionOp.LONG_PRESS.value:
+                    prompt_template = long_press_doc_template
+                    context = prompt_template.format(ui_element=action_param)
+                elif action_type == ActionOp.SWIPE.value:
+                    swipe_area, swipe_dir = action_param.split(":sep:")
+                    if swipe_dir == SwipeOp.UP.value or swipe_dir == SwipeOp.DOWN.value:
+                        action_type = ActionOp.VERTICAL_SWIPE.value
+                    elif swipe_dir == SwipeOp.LEFT.value or swipe_dir == SwipeOp.RIGHT.value:
+                        action_type = ActionOp.HORIZONTAL_SWIPE.value
+                    prompt_template = swipe_doc_template
+                    context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
+                else:
+                    break
+                task_desc_path = task_dir.joinpath("task_desc.txt")
+                task_desc = open(task_desc_path, "r").read()
+                context = context.format(task_desc=task_desc)
+
+                doc_name = resource_id + ".txt"
+                doc_path = docs_dir.joinpath(doc_name)
+
+                if doc_path.exists():
+                    doc_content = ast.literal_eval(open(doc_path).read())
+                    if doc_content[action_type]:
+                        if config.get_other("doc_refine"):
+                            refine_context = refine_doc_suffix.format(old_doc=doc_content[action_type])
+                            context += refine_context
+                            logger.info(
+                                f"Documentation for the element {resource_id} already exists. The doc will be "
+                                f"refined based on the latest demo.")
+                        else:
+                            logger.info(
+                                f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE "
+                                f"in the config file if needed.")
+                            continue
+                else:
+                    doc_content = {
+                        "tap": "",
+                        "text": "",
+                        "v_swipe": "",
+                        "h_swipe": "",
+                        "long_press": ""
+                    }
+
+                logger.info(f"Waiting for GPT-4V to generate documentation for the element {resource_id}")
+                node = await RECORD_PARSE_NODE.fill(context=context, llm=self.llm,
+                                                    images=[img_before_base64, img_after_base64])
+                if "error" in node.content:
+                    return AndroidActionOutput(action_state=RunState.FAIL)
+
+                log_path = task_dir.joinpath(f"log_{app_name}_{demo_name}.txt")
+                prompt = node.compile(context=context, schema="json", mode="auto")
+                msg = node.content
+                doc_content[action_type] = msg
+
+                with open(log_path, "a") as logfile:
+                    log_item = RecordLogItem(step=step, prompt=prompt, image_before=img_before_base64,
+                                             image_after=img_after_base64, response=node.content)
+                    # TODO: change how the log item is serialized (dumps)
+                    logfile.write(json.dumps(log_item) + "\n")
+                with open(doc_path, "w") as outfile:
+                    outfile.write(str(doc_content))
+                doc_count += 1
+                logger.info(f"Documentation generated and saved to {doc_path}")
+
+                time.sleep(config.get_other("request_interval"))
+
+            logger.info(f"Documentation generation phase completed. {doc_count} docs generated.")
--- a/examples/andriod_assistant/actions/parse_record_an.py
+++ b/examples/andriod_assistant/actions/parse_record_an.py
@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc   : the ActionNode to parse record
+
+from metagpt.actions.action_node import ActionNode
+
+OBSERVATION = ActionNode(
+    key="Observation", expected_type=str,
+    instruction="Provide a description of your observations of the two images. "
+                "Subsequently, delineate the distinctions between the first image and the second one.",
+    example=""
+)
+
+THOUGHT = ActionNode(
+    key="Thought",
+    expected_type=str,
+    instruction="Consider the impact of Action acting on UI elements.",
+    example="",
+)
+
+DESCRIPTION = ActionNode(
+    key="Description",
+    expected_type=str,
+    instruction="Describe the functionality of the UI element concisely in one or two sentences Do not include "
+                "the numeric tag in your description",
+    example="",
+)
+
+NODES = [OBSERVATION, THOUGHT, DESCRIPTION]
+
+RECORD_PARSE_NODE = ActionNode.from_children("RecordParse", NODES)
--- a/examples/andriod_assistant/utils/schema.py
+++ b/examples/andriod_assistant/utils/schema.py
@ -11,6 +11,8 @@ class ActionOp(Enum):
    LONG_PRESS = "long_press"
    TEXT = "text"
    SWIPE = "swipe"
+    VERTICAL_SWIPE = "v_swipe"
+    HORIZONTAL_SWIPE = "h_swipe"
    GRID = "grid"
    STOP = "stop"

@ -57,6 +59,15 @@ class ReflectLogItem(BaseModel):
    response: str = Field(default="")


+class RecordLogItem(BaseModel):
+    """log content for record parse, same as ReflectLogItem"""
+    step: int = Field(default=0)
+    prompt: str = Field(default="")
+    image_before: str = Field(default="")
+    image_after: str = Field(default="")
+    response: str = Field(default="")
+
+
 class DocContent(BaseModel):
    tap: str = Field(default="")
    text: str = Field(default="")
@ -125,6 +136,7 @@ class SwipeGridOp(BaseGridOpParam):
    end_area: int = Field(default=-1)
    end_subarea: str = Field(default="")

+
 # end =================== define different Action Op and its params =============