update android assistant

This commit is contained in:
better629 2024-03-29 12:21:32 +08:00
parent e3e49eee03
commit a9ec18af72
7 changed files with 76 additions and 66 deletions

View file

@ -161,4 +161,6 @@ class ManualRecord(Action):
return AndroidActionOutput(action_state=RunState.FAIL)
record_file.write(log_str)
time.sleep(3)
time.sleep(1)
return AndroidActionOutput(action_state=RunState.SUCCESS)

View file

@ -35,7 +35,7 @@ class ParseRecord(Action):
screenshot_before_path: Path = ""
screenshot_after_path: Path = ""
async def run(self, app_name: str, task_dir: Path, docs_dir: Path):
async def run(self, task_dir: Path, docs_dir: Path):
doc_count = 0
self.record_path = Path(task_dir) / "record.txt"
self.task_desc_path = Path(task_dir) / "task_desc.txt"
@ -112,7 +112,7 @@ class ParseRecord(Action):
)
if "error" in node.content:
return AndroidActionOutput(action_state=RunState.FAIL)
log_path = task_dir.joinpath(f"log_{app_name}.txt")
log_path = task_dir.joinpath("log_parse_record.txt")
prompt = node.compile(context=context, schema="json", mode="auto")
msg = node.content
doc_content[action_type] = msg
@ -132,3 +132,5 @@ class ParseRecord(Action):
logger.info(f"Documentation generated and saved to {doc_path}")
logger.info(f"Documentation generation phase completed. {doc_count} docs generated.")
return AndroidActionOutput(action_state=RunState.FINISH)

View file

@ -25,16 +25,16 @@ from metagpt.ext.android_assistant.prompts.assistant_prompt import (
from metagpt.ext.android_assistant.utils.schema import (
AndroidActionOutput,
AndroidElement,
GridOp,
LongPressGridOp,
LongPressOp,
GridOpParam,
LongPressGridOpParam,
LongPressOpParam,
OpLogItem,
RunState,
SwipeGridOp,
SwipeOp_3,
TapGridOp,
TapOp,
TextOp,
SwipeGridOpParam,
SwipeOpParam,
TapGridOpParam,
TapOpParam,
TextOpParam,
)
from metagpt.ext.android_assistant.utils.utils import (
area_to_xy,
@ -109,7 +109,6 @@ class ScreenshotParse(Action):
xml_path: Path = env.observe(
EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
)
width, height = env.device_shape
if not screenshot_path.exists() or not xml_path.exists():
return AndroidActionOutput(action_state=RunState.FAIL)
@ -153,33 +152,34 @@ class ScreenshotParse(Action):
op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on)
if op_param.param_state == RunState.FINISH:
logger.info(f"op_param: {op_param}")
return AndroidActionOutput(action_state=RunState.FINISH)
if op_param.param_state == RunState.FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if isinstance(op_param, TapOp):
if isinstance(op_param, TapOpParam):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOp):
elif isinstance(op_param, TextOpParam):
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOp):
elif isinstance(op_param, LongPressOpParam):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOp_3):
elif isinstance(op_param, SwipeOpParam):
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(
action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
)
elif isinstance(op_param, GridOp):
elif isinstance(op_param, GridOpParam):
grid_on = True
elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp):
elif isinstance(op_param, TapGridOpParam) or isinstance(op_param, LongPressGridOpParam):
x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols)
if isinstance(op_param, TapGridOp):
if isinstance(op_param, TapGridOpParam):
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
else:
# LongPressGridOp
# LongPressGridOpParam
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeGridOp):
elif isinstance(op_param, SwipeGridOpParam):
start_x, start_y = area_to_xy(
op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols
)
@ -190,12 +190,13 @@ class ScreenshotParse(Action):
action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y)
)
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if not grid_on:
obs, _, _, _, info = env.step(action)
action_res = info["res"]
if action_res == ADB_EXEC_FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if op_param.act_name != "grid":
grid_on = True
grid_on = False
return AndroidActionOutput(data={"grid_on": grid_on})

View file

@ -33,14 +33,14 @@ from metagpt.ext.android_assistant.utils.schema import (
AndroidElement,
Decision,
DocContent,
LongPressOp,
LongPressOpParam,
OpLogItem,
ReflectLogItem,
RunState,
SwipeOp,
SwipeOp_3,
TapOp,
TextOp,
SwipeOpParam,
TapOpParam,
TextOpParam,
)
from metagpt.ext.android_assistant.utils.utils import (
draw_bbox_multi,
@ -71,6 +71,9 @@ class SelfLearnAndReflect(Action):
for path in [task_dir, docs_dir]:
path.mkdir(parents=True, exist_ok=True)
resp = await self.run_self_learn(round_count, task_desc, last_act, task_dir, env)
if resp.action_state != RunState.SUCCESS:
return resp
resp = await self.run_reflect(round_count, task_desc, last_act, task_dir, docs_dir, env)
return resp
@ -111,17 +114,17 @@ class SelfLearnAndReflect(Action):
if op_param.param_state == RunState.FAIL:
return AndroidActionOutput(action_state=RunState.FAIL)
if isinstance(op_param, TapOp):
if isinstance(op_param, TapOpParam):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
elif isinstance(op_param, TextOp):
elif isinstance(op_param, TextOpParam):
action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
elif isinstance(op_param, LongPressOp):
elif isinstance(op_param, LongPressOpParam):
self.ui_area = op_param.area
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
elif isinstance(op_param, SwipeOp_3):
elif isinstance(op_param, SwipeOpParam):
self.ui_area = op_param.area
self.swipe_orient = op_param.swipe_orient
x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)

View file

@ -92,9 +92,10 @@ class AndroidAssistant(Role):
"""ignore old memory to make it run multi rounds inside a role"""
newest_msgs = self.rc.memory.get(k=1)
newest_msg = newest_msgs[0] if newest_msgs else None
if newest_msg and (RunState.FAIL.value in newest_msg.content):
if newest_msg and (RunState.SUCCESS.value.upper() not in newest_msg.content):
ignore_memory = False
logger.error("Latest action_state is FAIL, won't react in remainder rounds")
state_val = newest_msg.content.split(".")[-1] # RoundCount: 1, action_state: RunState.SUCCESS
logger.warning(f"Latest action_state is {state_val}, will run in the remainder rounds without `react`")
return await super()._observe(ignore_memory)
async def _act(self) -> Message:
@ -104,7 +105,6 @@ class AndroidAssistant(Role):
resp = await todo.run(task_dir=self.task_dir, task_desc=self.task_desc, env=self.rc.env)
elif isinstance(todo, ParseRecord):
resp = await todo.run(
app_name=config.get_other("app_name", "demo"),
task_dir=self.task_dir,
docs_dir=self.docs_dir,
)

View file

@ -96,26 +96,26 @@ class BaseOpParam(BaseModel):
param_state: RunState = Field(default=RunState.SUCCESS, description="return state when extract params")
class TapOp(BaseOpParam):
class TapOpParam(BaseOpParam):
area: int = Field(default=-1)
class TextOp(BaseOpParam):
class TextOpParam(BaseOpParam):
input_str: str = Field(default="")
class LongPressOp(BaseOpParam):
class LongPressOpParam(BaseOpParam):
area: int = Field(default=-1)
# Modify This SwipeOp to SwipeOp_3, Need better name
class SwipeOp_3(BaseOpParam):
# Renamed from SwipeOp_3 (originally SwipeOp) to SwipeOpParam; a better name may still be needed
class SwipeOpParam(BaseOpParam):
area: int = Field(default=-1)
swipe_orient: str = Field(default="up")
dist: str = Field(default="")
class GridOp(BaseModel):
class GridOpParam(BaseOpParam):
act_name: str = Field(default="")
@ -126,17 +126,17 @@ class BaseGridOpParam(BaseOpParam):
return f"{act_name}_grid"
class TapGridOp(BaseGridOpParam):
class TapGridOpParam(BaseGridOpParam):
area: int = Field(default=-1)
subarea: str = Field(default="")
class LongPressGridOp(BaseGridOpParam):
class LongPressGridOpParam(BaseGridOpParam):
area: int = Field(default=-1)
subarea: str = Field(default="")
class SwipeGridOp(BaseGridOpParam):
class SwipeGridOpParam(BaseGridOpParam):
start_area: int = Field(default=-1)
start_subarea: str = Field(default="")
end_area: int = Field(default=-1)

View file

@ -16,16 +16,16 @@ from metagpt.ext.android_assistant.utils.schema import (
BaseGridOpParam,
BaseOpParam,
Decision,
GridOp,
LongPressGridOp,
LongPressOp,
GridOpParam,
LongPressGridOpParam,
LongPressOpParam,
ReflectOp,
RunState,
SwipeGridOp,
SwipeOp_3,
TapGridOp,
TapOp,
TextOp,
SwipeGridOpParam,
SwipeOpParam,
TapGridOpParam,
TapOpParam,
TextOpParam,
)
from metagpt.logs import logger
@ -260,7 +260,9 @@ def reflect_parse_extarct(parsed_json: dict) -> ReflectOp:
return op
def screenshot_parse_extract(parsed_json: dict, grid_on: bool = False) -> Union[BaseOpParam, BaseGridOpParam, GridOp]:
def screenshot_parse_extract(
parsed_json: dict, grid_on: bool = False
) -> Union[BaseOpParam, BaseGridOpParam, GridOpParam]:
act = parsed_json.get("Action")
last_act = parsed_json.get("Summary")
act_name = act.split("(")[0]
@ -284,44 +286,44 @@ def op_params_clean(params: list[str]) -> list[Union[int, str]]:
return param_values
def screenshot_parse_extract_without_grid(act_name: str, act: str, last_act: str) -> Union[BaseOpParam, GridOp]:
def screenshot_parse_extract_without_grid(act_name: str, act: str, last_act: str) -> Union[BaseOpParam, GridOpParam]:
if act_name == ActionOp.TAP.value:
area = int(re.findall(r"tap\((.*?)\)", act)[0])
op = TapOp(act_name=act_name, area=area, last_act=last_act)
op = TapOpParam(act_name=act_name, area=area, last_act=last_act)
elif act_name == ActionOp.TEXT.value:
input_str = re.findall(r"text\((.*?)\)", act)[0][1:-1]
op = TextOp(act_name=act_name, input_str=input_str, last_act=last_act)
op = TextOpParam(act_name=act_name, input_str=input_str, last_act=last_act)
elif act_name == ActionOp.LONG_PRESS.value:
area = int(re.findall(r"long_press\((.*?)\)", act)[0])
op = LongPressOp(act_name=act_name, area=area, last_act=last_act)
op = LongPressOpParam(act_name=act_name, area=area, last_act=last_act)
elif act_name == ActionOp.SWIPE.value:
params = re.findall(r"swipe\((.*?)\)", act)[0].split(",")
params = op_params_clean(params) # area, swipe_orient, dist
op = SwipeOp_3(act_name=act_name, area=params[0], swipe_orient=params[1], dist=params[2], last_act=last_act)
op = SwipeOpParam(act_name=act_name, area=params[0], swipe_orient=params[1], dist=params[2], last_act=last_act)
elif act_name == ActionOp.GRID.value:
op = GridOp(act_name=act_name)
op = GridOpParam(act_name=act_name)
else:
op = BaseOpParam(param_state=RunState.FAIL)
return op
def screenshot_parse_extract_with_grid(act_name: str, act: str, last_act: str) -> Union[BaseGridOpParam, GridOp]:
def screenshot_parse_extract_with_grid(act_name: str, act: str, last_act: str) -> Union[BaseGridOpParam, GridOpParam]:
if act_name == ActionOp.TAP.value:
params = re.findall(r"tap\((.*?)\)", act)[0].split(",")
params = op_params_clean(params)
op = TapGridOp(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
op = TapGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
elif act_name == ActionOp.LONG_PRESS.value:
params = re.findall(r"long_press\((.*?)\)", act)[0].split(",")
params = op_params_clean(params)
op = LongPressGridOp(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
op = LongPressGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act)
elif act_name == ActionOp.SWIPE.value:
params = re.findall(r"swipe\((.*?)\)", act)[0].split(",")
params = op_params_clean(params)
op = SwipeGridOp(
op = SwipeGridOpParam(
act_name=act_name, start_area=params[0], start_subarea=params[1], end_area=params[2], end_subarea=params[3]
)
elif act_name == ActionOp.GRID.value:
op = GridOp(act_name=act_name)
op = GridOpParam(act_name=act_name)
else:
op = BaseGridOpParam(param_state=RunState.FAIL)
return op