mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-21 14:05:17 +02:00
Add parse_record and parse_record_an, add RecordLogItem, and modify ActionOp
RecordLogItem is the same as ReflectLogItem
This commit is contained in:
parent
bf7ade039d
commit
3659bfcf79
4 changed files with 157 additions and 7 deletions
|
|
@ -24,8 +24,7 @@ class ManualRecord(Action):
|
|||
name: str = "ManualRecord"
|
||||
|
||||
async def run(
|
||||
self, demo_name: str, task_desc: str, task_dir: Path, env: AndroidEnv,
|
||||
grid_on: bool = False
|
||||
self, demo_name: str, task_dir: Path, env: AndroidEnv
|
||||
):
|
||||
|
||||
# Question 这里是将通过ADB获取的东西存到本地的路径的吧
|
||||
|
|
@ -41,10 +40,10 @@ class ManualRecord(Action):
|
|||
# TODO exit
|
||||
return
|
||||
step = 0
|
||||
# Question 直接使用 OS 构建路径合适吗?
|
||||
record_path = os.path.join(task_dir, "record.txt")
|
||||
record_path = Path(task_dir) / "record.txt"
|
||||
record_file = open(record_path, "w")
|
||||
while True:
|
||||
# TODO Parse Record Step 是否可以从这个函数中获取,进行参数的传递 ?
|
||||
step += 1
|
||||
clickable_list = []
|
||||
focusable_list = []
|
||||
|
|
|
|||
|
|
@ -3,12 +3,120 @@
|
|||
# @Desc : parse record to generate learned standard operations in stage=learn & mode=manual,
|
||||
# LIKE scripts/document_generation.py
|
||||
|
||||
from examples.andriod_assistant.prompts.operation_prompt import *
|
||||
import re
|
||||
import ast
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from examples.andriod_assistant.prompts.operation_prompt import (
|
||||
tap_doc_template,
|
||||
text_doc_template,
|
||||
long_press_doc_template,
|
||||
swipe_doc_template,
|
||||
refine_doc_suffix
|
||||
)
|
||||
from examples.andriod_assistant.utils.schema import RecordLogItem, RunState, ActionOp, \
|
||||
SwipeOp, AndroidActionOutput
|
||||
from examples.andriod_assistant.actions.parse_record_an import RECORD_PARSE_NODE
|
||||
from metagpt.config2 import config
|
||||
from metagpt.environment.android_env.android_env import AndroidEnv
|
||||
from metagpt.utils.common import encode_image
|
||||
from metagpt.logs import logger
|
||||
from metagpt.actions.action import Action
|
||||
|
||||
|
||||
class ParseRecord(Action):
    """Turn a manually recorded demo into per-element documentation files.

    For every recorded step the action sends the before/after labeled
    screenshots plus an action-specific prompt to the LLM through
    ``RECORD_PARSE_NODE`` and stores the answer in ``<docs_dir>/<resource_id>.txt``.
    """

    name: str = "ParseRecord"

    @staticmethod
    def _build_context(action_type: str, action_param: str):
        """Pick the prompt template for one recorded action.

        Args:
            action_type: The text before the first "(" of the recorded action.
            action_param: The text inside the first pair of parentheses.

        Returns:
            ``(doc_key, context)`` where ``doc_key`` is the key under which the
            generated text is stored in the doc dict (swipes are mapped to the
            vertical/horizontal swipe keys), or ``None`` when the action type
            is not one of tap / text / long_press / swipe.
        """
        if action_type == ActionOp.TAP.value:
            return action_type, tap_doc_template.format(ui_element=action_param)
        if action_type == ActionOp.TEXT.value:
            # The typed text itself is not needed for the prompt.
            input_area, _input_text = action_param.split(":sep:")
            return action_type, text_doc_template.format(ui_element=input_area)
        if action_type == ActionOp.LONG_PRESS.value:
            return action_type, long_press_doc_template.format(ui_element=action_param)
        if action_type == ActionOp.SWIPE.value:
            swipe_area, swipe_dir = action_param.split(":sep:")
            if swipe_dir in (SwipeOp.UP.value, SwipeOp.DOWN.value):
                action_type = ActionOp.VERTICAL_SWIPE.value
            elif swipe_dir in (SwipeOp.LEFT.value, SwipeOp.RIGHT.value):
                action_type = ActionOp.HORIZONTAL_SWIPE.value
            context = swipe_doc_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
            return action_type, context
        return None

    async def run(
        self, app_name: str, demo_name: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
    ):
        """Parse ``record.txt`` in ``task_dir`` and generate element docs.

        Args:
            app_name: Used only to build the log file name.
            demo_name: Used only to build the log file name.
            task_dir: Directory holding ``record.txt``, ``task_desc.txt`` and
                the log file written by this action.
            docs_dir: Directory where ``<resource_id>.txt`` docs are read/written.
            env: Android environment; not used by this method.

        Returns:
            ``AndroidActionOutput(action_state=RunState.FAIL)`` when the LLM
            response contains "error"; otherwise ``None``.
        """
        # Local import keeps the block self-contained; asyncio is stdlib.
        import asyncio

        task_dir = Path(task_dir)
        docs_dir = Path(docs_dir)
        doc_count = 0
        record_path = task_dir / "record.txt"
        log_path = task_dir.joinpath(f"log_{app_name}_{demo_name}.txt")

        with open(record_path, "r") as record_file:
            records = record_file.readlines()

        # The last line of the record is not a step to document
        # (presumably the terminating "stop" entry - TODO confirm).
        task_desc = None
        for step, raw_record in enumerate(records[:-1], start=1):
            # NOTE(review): the screenshot name is derived from task_dir, not
            # demo_name; kept as in the original - verify against the recorder.
            img_before_base64 = encode_image(task_dir.joinpath(f"{task_dir}_{step}_labeled.png"))
            img_after_base64 = encode_image(task_dir.joinpath(f"{task_dir}_{step + 1}_labeled.png"))

            # Record format: "<action>(<params>):::<resource_id>"
            action, resource_id = raw_record.strip().split(":::")
            action_param = re.findall(r"\((.*?)\)", action)[0]

            # Build the prompt
            built = self._build_context(action.split("(")[0], action_param)
            if built is None:
                # Unknown action type: stop processing the remaining steps.
                break
            action_type, context = built

            if task_desc is None:
                # Read once, lazily, and close the handle (the original leaked it
                # and re-read the file on every step).
                task_desc = task_dir.joinpath("task_desc.txt").read_text()
            context = context.format(task_desc=task_desc)

            doc_path = docs_dir.joinpath(resource_id + ".txt")
            if doc_path.exists():
                # Docs are stored as the str() of a dict, hence literal_eval.
                doc_content = ast.literal_eval(doc_path.read_text())
                old_doc = doc_content.get(action_type)
                if old_doc:
                    if config.get_other("doc_refine"):
                        context += refine_doc_suffix.format(old_doc=old_doc)
                        logger.info(
                            f"Documentation for the element {resource_id} already exists. The doc will be "
                            f"refined based on the latest demo.")
                    else:
                        logger.info(
                            f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE "
                            f"in the config file if needed.")
                        continue
            else:
                doc_content = {
                    "tap": "",
                    "text": "",
                    "v_swipe": "",
                    "h_swipe": "",
                    "long_press": "",
                }

            logger.info(f"Waiting for GPT-4V to generate documentation for the element {resource_id}")
            node = await RECORD_PARSE_NODE.fill(context=context, llm=self.llm,
                                                images=[img_before_base64, img_after_base64])
            if "error" in node.content:
                return AndroidActionOutput(action_state=RunState.FAIL)

            prompt = node.compile(context=context, schema="json", mode="auto")
            doc_content[action_type] = node.content

            log_item = RecordLogItem(step=step, prompt=prompt, image_before=img_before_base64,
                                     image_after=img_after_base64, response=node.content)
            with open(log_path, "a") as logfile:
                # json.dumps() cannot serialize a pydantic model; use the model's
                # own JSON serializer (assumes pydantic v2 - TODO confirm).
                logfile.write(log_item.model_dump_json() + "\n")
            with open(doc_path, "w") as outfile:
                outfile.write(str(doc_content))
            doc_count += 1
            logger.info(f"Documentation generated and saved to {doc_path}")

            # Non-blocking pause between requests; time.sleep would stall the event loop.
            await asyncio.sleep(config.get_other("request_interval"))

        logger.info(f"Documentation generation phase completed. {doc_count} docs generated.")
|
||||
|
|
|
|||
31
examples/andriod_assistant/actions/parse_record_an.py
Normal file
31
examples/andriod_assistant/actions/parse_record_an.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Desc : the ActionNode to parse record
|
||||
|
||||
from metagpt.actions.action_node import ActionNode
|
||||
|
||||
# Child node: what the model sees in the before/after screenshots.
OBSERVATION = ActionNode(
    key="Observation",
    expected_type=str,
    instruction="Provide a description of your observations of the two images. "
                "Subsequently, delineate the distinctions between the first image and the second one.",
    example="",
)

# Child node: the model's reasoning about the effect of the recorded action.
THOUGHT = ActionNode(
    key="Thought",
    expected_type=str,
    instruction="Consider the impact of Action acting on UI elements.",
    example="",
)

# Child node: the final text kept as the element's documentation.
# The instruction is two sentences; the separating period was missing.
DESCRIPTION = ActionNode(
    key="Description",
    expected_type=str,
    instruction="Describe the functionality of the UI element concisely in one or two sentences. Do not include "
                "the numeric tag in your description",
    example="",
)

NODES = [OBSERVATION, THOUGHT, DESCRIPTION]

# Parent node combining the three children above under the key "RecordParse".
RECORD_PARSE_NODE = ActionNode.from_children("RecordParse", NODES)
|
||||
|
|
@ -11,6 +11,8 @@ class ActionOp(Enum):
|
|||
LONG_PRESS = "long_press"
|
||||
TEXT = "text"
|
||||
SWIPE = "swipe"
|
||||
VERTICAL_SWIPE = "v_swipe"
|
||||
HORIZONTAL_SWIPE = "h_swipe"
|
||||
GRID = "grid"
|
||||
STOP = "stop"
|
||||
|
||||
|
|
@ -57,6 +59,15 @@ class ReflectLogItem(BaseModel):
|
|||
response: str = Field(default="")
|
||||
|
||||
|
||||
class RecordLogItem(BaseModel):
    """One log entry written while parsing a recorded demo.

    Declared with the same purpose as ReflectLogItem (per the original note).
    """

    # Index of the recorded step this entry belongs to.
    step: int = 0
    # Prompt text sent for this step.
    prompt: str = ""
    # Screenshot taken before the action (stored as a string).
    image_before: str = ""
    # Screenshot taken after the action (stored as a string).
    image_after: str = ""
    # Raw response text returned for this step.
    response: str = ""
|
||||
|
||||
|
||||
class DocContent(BaseModel):
|
||||
tap: str = Field(default="")
|
||||
text: str = Field(default="")
|
||||
|
|
@ -125,6 +136,7 @@ class SwipeGridOp(BaseGridOpParam):
|
|||
end_area: int = Field(default=-1)
|
||||
end_subarea: str = Field(default="")
|
||||
|
||||
|
||||
# end =================== define different Action Op and its params =============
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue