From 3659bfcf79136248f6ebf943dfef7e22ef556d53 Mon Sep 17 00:00:00 2001
From: didi <2020201387@ruc.edu.cn>
Date: Mon, 29 Jan 2024 19:45:20 +0800
Subject: [PATCH] Adds parse_record & parse_record_an , add RecordLogItem &
 modify ActionOp

RecordLogItem is same as ReflectLogItem
---
 .../actions/manual_record.py                  |   7 +-
 .../andriod_assistant/actions/parse_record.py | 128 +++++++++++++++++-
 .../actions/parse_record_an.py                |  31 +++++
 examples/andriod_assistant/utils/schema.py    |  12 ++
 4 files changed, 171 insertions(+), 7 deletions(-)
 create mode 100644 examples/andriod_assistant/actions/parse_record_an.py

diff --git a/examples/andriod_assistant/actions/manual_record.py b/examples/andriod_assistant/actions/manual_record.py
index 5b19471a6..57c354dfe 100644
--- a/examples/andriod_assistant/actions/manual_record.py
+++ b/examples/andriod_assistant/actions/manual_record.py
@@ -24,8 +24,7 @@ class ManualRecord(Action):
     name: str = "ManualRecord"
 
     async def run(
-            self, demo_name: str, task_desc: str, task_dir: Path, env: AndroidEnv,
-            grid_on: bool = False
+            self, demo_name: str, task_dir: Path, env: AndroidEnv
     ):
 
         # Question 这里是将通过ADB获取的东西存到本地的路径的吧
@@ -41,10 +40,10 @@ class ManualRecord(Action):
             # TODO exit
             return
         step = 0
-        # Question 直接使用 OS 构建路径合适吗?
-        record_path = os.path.join(task_dir, "record.txt")
+        record_path = Path(task_dir) / "record.txt"
         record_file = open(record_path, "w")
         while True:
+            # TODO: could the ParseRecord step be obtained from this function and passed on as a parameter?
             step += 1
             clickable_list = []
             focusable_list = []
diff --git a/examples/andriod_assistant/actions/parse_record.py b/examples/andriod_assistant/actions/parse_record.py
index 77f49fbd3..ed794fc4c 100644
--- a/examples/andriod_assistant/actions/parse_record.py
+++ b/examples/andriod_assistant/actions/parse_record.py
@@ -3,12 +3,134 @@
 # @Desc   : parse record to generate learned standard operations in stage=learn & mode=manual,
 #           LIKE scripts/document_generation.py
 
-from examples.andriod_assistant.prompts.operation_prompt import *
+import ast
+import asyncio
+import json
+import re
+from pathlib import Path
+
+from examples.andriod_assistant.prompts.operation_prompt import (
+    tap_doc_template,
+    text_doc_template,
+    long_press_doc_template,
+    swipe_doc_template,
+    refine_doc_suffix
+)
+from examples.andriod_assistant.utils.schema import RecordLogItem, RunState, ActionOp, \
+    SwipeOp, AndroidActionOutput
+from examples.andriod_assistant.actions.parse_record_an import RECORD_PARSE_NODE
+from metagpt.config2 import config
+from metagpt.environment.android_env.android_env import AndroidEnv
+from metagpt.utils.common import encode_image
+from metagpt.logs import logger
 from metagpt.actions.action import Action
 
 
 class ParseRecord(Action):
     name: str = "ParseRecord"
 
-    async def run(self):
-        pass
+    async def run(
+            self, app_name: str, demo_name: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
+    ):
+        """Generate per-element documentation from a manually recorded demo.
+
+        Each line of ``task_dir/record.txt`` is read as ``action(params):::resource_id``;
+        for every step the before/after screenshots and a prompt are sent to the LLM
+        and the answer is stored in ``docs_dir/<resource_id>.txt``.
+
+        Returns an ``AndroidActionOutput`` in ``RunState.FAIL`` as soon as an LLM
+        response contains "error"; otherwise returns ``None``.
+        """
+        doc_count = 0
+        record_path = Path(task_dir) / "record.txt"
+
+        with open(record_path, "r") as record_file:
+            # The last line of the record is not treated as a step (hence the "- 1").
+            record_step_count = len(record_file.readlines()) - 1
+            record_file.seek(0)
+            for step in range(1, record_step_count + 1):
+                # FIXME(review): the screenshot name embeds the whole ``task_dir`` path;
+                # ``demo_name`` was probably intended -- confirm against ManualRecord.
+                img_before_base64 = encode_image(task_dir.joinpath(f"{task_dir}_{step}_labeled.png"))
+                img_after_base64 = encode_image(task_dir.joinpath(f"{task_dir}_{step + 1}_labeled.png"))
+                rec = record_file.readline().strip()
+                action, resource_id = rec.split(":::")
+                action_type = action.split("(")[0]
+                # Build the prompt
+                action_param = re.findall(r"\((.*?)\)", action)[0]
+                if action_type == ActionOp.TAP.value:
+                    prompt_template = tap_doc_template
+                    context = prompt_template.format(ui_element=action_param)
+                elif action_type == ActionOp.TEXT.value:
+                    input_area, input_text = action_param.split(":sep:")
+                    prompt_template = text_doc_template
+                    context = prompt_template.format(ui_element=input_area)
+                elif action_type == ActionOp.LONG_PRESS.value:
+                    prompt_template = long_press_doc_template
+                    context = prompt_template.format(ui_element=action_param)
+                elif action_type == ActionOp.SWIPE.value:
+                    swipe_area, swipe_dir = action_param.split(":sep:")
+                    if swipe_dir == SwipeOp.UP.value or swipe_dir == SwipeOp.DOWN.value:
+                        action_type = ActionOp.VERTICAL_SWIPE.value
+                    elif swipe_dir == SwipeOp.LEFT.value or swipe_dir == SwipeOp.RIGHT.value:
+                        action_type = ActionOp.HORIZONTAL_SWIPE.value
+                    prompt_template = swipe_doc_template
+                    context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
+                else:
+                    logger.warning(f"Unsupported action type {action_type!r} at step {step}; stop parsing.")
+                    break
+                task_desc_path = task_dir.joinpath("task_desc.txt")
+                task_desc = task_desc_path.read_text()
+                context = context.format(task_desc=task_desc)
+
+                doc_name = resource_id + ".txt"
+                doc_path = docs_dir.joinpath(doc_name)
+
+                if doc_path.exists():
+                    doc_content = ast.literal_eval(doc_path.read_text())
+                    if doc_content[action_type]:
+                        if config.get_other("doc_refine"):
+                            refine_context = refine_doc_suffix.format(old_doc=doc_content[action_type])
+                            context += refine_context
+                            logger.info(
+                                f"Documentation for the element {resource_id} already exists. The doc will be "
+                                f"refined based on the latest demo.")
+                        else:
+                            logger.info(
+                                f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE "
+                                f"in the config file if needed.")
+                            continue
+                else:
+                    doc_content = {
+                        "tap": "",
+                        "text": "",
+                        "v_swipe": "",
+                        "h_swipe": "",
+                        "long_press": ""
+                    }
+
+                logger.info(f"Waiting for GPT-4V to generate documentation for the element {resource_id}")
+                node = await RECORD_PARSE_NODE.fill(context=context, llm=self.llm,
+                                                    images=[img_before_base64, img_after_base64])
+                if "error" in node.content:
+                    return AndroidActionOutput(action_state=RunState.FAIL)
+
+                log_path = task_dir.joinpath(f"log_{app_name}_{demo_name}.txt")
+                prompt = node.compile(context=context, schema="json", mode="auto")
+                msg = node.content
+                doc_content[action_type] = msg
+
+                with open(log_path, "a") as logfile:
+                    log_item = RecordLogItem(step=step, prompt=prompt, image_before=img_before_base64,
+                                             image_after=img_after_base64, response=node.content)
+                    # json.dumps cannot serialize a pydantic model directly; dump it to a dict first.
+                    logfile.write(json.dumps(log_item.model_dump()) + "\n")
+                with open(doc_path, "w") as outfile:
+                    outfile.write(str(doc_content))
+                doc_count += 1
+                logger.info(f"Documentation generated and saved to {doc_path}")
+
+                # Pause between LLM requests without blocking the event loop.
+                await asyncio.sleep(config.get_other("request_interval"))
+
+        logger.info(f"Documentation generation phase completed. {doc_count} docs generated.")
diff --git a/examples/andriod_assistant/actions/parse_record_an.py b/examples/andriod_assistant/actions/parse_record_an.py
new file mode 100644
index 000000000..b81eaec0c
--- /dev/null
+++ b/examples/andriod_assistant/actions/parse_record_an.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc   : the ActionNode to parse record
+
+from metagpt.actions.action_node import ActionNode
+
+OBSERVATION = ActionNode(
+    key="Observation", expected_type=str,
+    instruction="Provide a description of your observations of the two images. "
+                "Subsequently, delineate the distinctions between the first image and the second one.",
+    example=""
+)
+
+THOUGHT = ActionNode(
+    key="Thought",
+    expected_type=str,
+    instruction="Consider the impact of Action acting on UI elements.",
+    example="",
+)
+
+DESCRIPTION = ActionNode(
+    key="Description",
+    expected_type=str,
+    instruction="Describe the functionality of the UI element concisely in one or two sentences Do not include "
+                "the numeric tag in your description",
+    example="",
+)
+
+NODES = [OBSERVATION, THOUGHT, DESCRIPTION]
+
+RECORD_PARSE_NODE = ActionNode.from_children("RecordParse", NODES)
diff --git a/examples/andriod_assistant/utils/schema.py b/examples/andriod_assistant/utils/schema.py
index 9dfc95194..dae8c67d6 100644
--- a/examples/andriod_assistant/utils/schema.py
+++ b/examples/andriod_assistant/utils/schema.py
@@ -11,6 +11,8 @@ class ActionOp(Enum):
     LONG_PRESS = "long_press"
     TEXT = "text"
     SWIPE = "swipe"
+    VERTICAL_SWIPE = "v_swipe"
+    HORIZONTAL_SWIPE = "h_swipe"
     GRID = "grid"
     STOP = "stop"
 
@@ -57,6 +59,15 @@ class ReflectLogItem(BaseModel):
     response: str = Field(default="")
 
 
+class RecordLogItem(BaseModel):
+    """log content for record parse, same as ReflectLogItem"""
+    step: int = Field(default=0)
+    prompt: str = Field(default="")
+    image_before: str = Field(default="")
+    image_after: str = Field(default="")
+    response: str = Field(default="")
+
+
 class DocContent(BaseModel):
     tap: str = Field(default="")
     text: str = Field(default="")
@@ -125,6 +136,7 @@ class SwipeGridOp(BaseGridOpParam):
     end_area: int = Field(default=-1)
     end_subarea: str = Field(default="")
 
+
 
 # end =================== define different Action Op and its params =============
 