diff --git a/metagpt/ext/android_assistant/actions/manual_record.py b/metagpt/ext/android_assistant/actions/manual_record.py index bdccbb72f..fa9235da4 100644 --- a/metagpt/ext/android_assistant/actions/manual_record.py +++ b/metagpt/ext/android_assistant/actions/manual_record.py @@ -161,4 +161,6 @@ class ManualRecord(Action): return AndroidActionOutput(action_state=RunState.FAIL) record_file.write(log_str) - time.sleep(3) + time.sleep(1) + + return AndroidActionOutput(action_state=RunState.SUCCESS) diff --git a/metagpt/ext/android_assistant/actions/parse_record.py b/metagpt/ext/android_assistant/actions/parse_record.py index 6974bfe46..c9b8d34fa 100644 --- a/metagpt/ext/android_assistant/actions/parse_record.py +++ b/metagpt/ext/android_assistant/actions/parse_record.py @@ -35,7 +35,7 @@ class ParseRecord(Action): screenshot_before_path: Path = "" screenshot_after_path: Path = "" - async def run(self, app_name: str, task_dir: Path, docs_dir: Path): + async def run(self, task_dir: Path, docs_dir: Path): doc_count = 0 self.record_path = Path(task_dir) / "record.txt" self.task_desc_path = Path(task_dir) / "task_desc.txt" @@ -112,7 +112,7 @@ class ParseRecord(Action): ) if "error" in node.content: return AndroidActionOutput(action_state=RunState.FAIL) - log_path = task_dir.joinpath(f"log_{app_name}.txt") + log_path = task_dir.joinpath("log_parse_record.txt") prompt = node.compile(context=context, schema="json", mode="auto") msg = node.content doc_content[action_type] = msg @@ -132,3 +132,5 @@ class ParseRecord(Action): logger.info(f"Documentation generated and saved to {doc_path}") logger.info(f"Documentation generation phase completed. {doc_count} docs generated.") + + return AndroidActionOutput(action_state=RunState.FINISH) diff --git a/metagpt/ext/android_assistant/actions/screenshot_parse.py b/metagpt/ext/android_assistant/actions/screenshot_parse.py index 3d0aa5891..e55d56f0a 100644 --- a/metagpt/ext/android_assistant/actions/screenshot_parse.py +++ b/metagpt/ext/android_assistant/actions/screenshot_parse.py @@ -25,16 +25,16 @@ from metagpt.ext.android_assistant.prompts.assistant_prompt import ( from metagpt.ext.android_assistant.utils.schema import ( AndroidActionOutput, AndroidElement, - GridOp, - LongPressGridOp, - LongPressOp, + GridOpParam, + LongPressGridOpParam, + LongPressOpParam, OpLogItem, RunState, - SwipeGridOp, - SwipeOp_3, - TapGridOp, - TapOp, - TextOp, + SwipeGridOpParam, + SwipeOpParam, + TapGridOpParam, + TapOpParam, + TextOpParam, ) from metagpt.ext.android_assistant.utils.utils import ( area_to_xy, @@ -109,7 +109,6 @@ class ScreenshotParse(Action): xml_path: Path = env.observe( EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir) ) - width, height = env.device_shape if not screenshot_path.exists() or not xml_path.exists(): return AndroidActionOutput(action_state=RunState.FAIL) @@ -153,33 +152,34 @@ class ScreenshotParse(Action): op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on) if op_param.param_state == RunState.FINISH: + logger.info(f"op_param: {op_param}") return AndroidActionOutput(action_state=RunState.FINISH) if op_param.param_state == RunState.FAIL: return AndroidActionOutput(action_state=RunState.FAIL) - if isinstance(op_param, TapOp): + if isinstance(op_param, TapOpParam): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y)) - elif isinstance(op_param, TextOp): + elif isinstance(op_param, TextOpParam): action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str) - elif isinstance(op_param, LongPressOp): + elif isinstance(op_param, LongPressOpParam): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y)) - elif isinstance(op_param, SwipeOp_3): + elif isinstance(op_param, SwipeOpParam): x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) action = EnvAction( action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist ) - elif isinstance(op_param, GridOp): + elif isinstance(op_param, GridOpParam): grid_on = True - elif isinstance(op_param, TapGridOp) or isinstance(op_param, LongPressGridOp): + elif isinstance(op_param, TapGridOpParam) or isinstance(op_param, LongPressGridOpParam): x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols) - if isinstance(op_param, TapGridOp): + if isinstance(op_param, TapGridOpParam): action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y)) else: - # LongPressGridOp + # LongPressGridOpParam action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y)) - elif isinstance(op_param, SwipeGridOp): + elif isinstance(op_param, SwipeGridOpParam): start_x, start_y = area_to_xy( op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols ) @@ -190,12 +190,13 @@ class ScreenshotParse(Action): action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y) ) - obs, _, _, _, info = env.step(action) - action_res = info["res"] - if action_res == ADB_EXEC_FAIL: - return AndroidActionOutput(action_state=RunState.FAIL) + if not grid_on: + obs, _, _, _, info = env.step(action) + action_res = info["res"] + if action_res == ADB_EXEC_FAIL: + return AndroidActionOutput(action_state=RunState.FAIL) if op_param.act_name != "grid": - grid_on = True + grid_on = False return AndroidActionOutput(data={"grid_on": grid_on}) diff --git a/metagpt/ext/android_assistant/actions/self_learn_and_reflect.py b/metagpt/ext/android_assistant/actions/self_learn_and_reflect.py index 0bfb38606..73f5073a6 100644 --- a/metagpt/ext/android_assistant/actions/self_learn_and_reflect.py +++ b/metagpt/ext/android_assistant/actions/self_learn_and_reflect.py @@ -33,14 +33,14 @@ from metagpt.ext.android_assistant.utils.schema import ( AndroidElement, Decision, DocContent, - LongPressOp, + LongPressOpParam, OpLogItem, ReflectLogItem, RunState, SwipeOp, - SwipeOp_3, - TapOp, - TextOp, + SwipeOpParam, + TapOpParam, + TextOpParam, ) from metagpt.ext.android_assistant.utils.utils import ( draw_bbox_multi, @@ -71,6 +71,9 @@ class SelfLearnAndReflect(Action): for path in [task_dir, docs_dir]: path.mkdir(parents=True, exist_ok=True) resp = await self.run_self_learn(round_count, task_desc, last_act, task_dir, env) + if resp.action_state != RunState.SUCCESS: + return resp + resp = await self.run_reflect(round_count, task_desc, last_act, task_dir, docs_dir, env) return resp @@ -111,17 +114,17 @@ class SelfLearnAndReflect(Action): if op_param.param_state == RunState.FAIL: return AndroidActionOutput(action_state=RunState.FAIL) - if isinstance(op_param, TapOp): + if isinstance(op_param, TapOpParam): self.ui_area = op_param.area x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y)) - elif isinstance(op_param, TextOp): + elif isinstance(op_param, TextOpParam): action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str) - elif isinstance(op_param, LongPressOp): + elif isinstance(op_param, LongPressOpParam): self.ui_area = op_param.area x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y)) - elif isinstance(op_param, SwipeOp_3): + elif isinstance(op_param, SwipeOpParam): self.ui_area = op_param.area self.swipe_orient = op_param.swipe_orient x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox) diff --git a/metagpt/ext/android_assistant/roles/android_assistant.py b/metagpt/ext/android_assistant/roles/android_assistant.py index e4d3f36d5..531c82261 100644 --- a/metagpt/ext/android_assistant/roles/android_assistant.py +++ b/metagpt/ext/android_assistant/roles/android_assistant.py @@ -92,9 +92,10 @@ class AndroidAssistant(Role): """ignore old memory to make it run multi rounds inside a role""" newest_msgs = self.rc.memory.get(k=1) newest_msg = newest_msgs[0] if newest_msgs else None - if newest_msg and (RunState.FAIL.value in newest_msg.content): + if newest_msg and (RunState.SUCCESS.value.upper() not in newest_msg.content): ignore_memory = False - logger.error("Latest action_state is FAIL, won't react in remainder rounds") + state_val = newest_msg.content.split(".")[-1] # RoundCount: 1, action_state: RunState.SUCCESS + logger.warning(f"Latest action_state is {state_val}, will run in the remainder rounds without `react`") return await super()._observe(ignore_memory) async def _act(self) -> Message: @@ -104,7 +105,6 @@ class AndroidAssistant(Role): resp = await todo.run(task_dir=self.task_dir, task_desc=self.task_desc, env=self.rc.env) elif isinstance(todo, ParseRecord): resp = await todo.run( - app_name=config.get_other("app_name", "demo"), task_dir=self.task_dir, docs_dir=self.docs_dir, ) diff --git a/metagpt/ext/android_assistant/utils/schema.py b/metagpt/ext/android_assistant/utils/schema.py index d7990de40..661f4abf4 100644 --- a/metagpt/ext/android_assistant/utils/schema.py +++ b/metagpt/ext/android_assistant/utils/schema.py @@ -96,26 +96,26 @@ class BaseOpParam(BaseModel): param_state: RunState = Field(default=RunState.SUCCESS, description="return state when extract params") -class TapOp(BaseOpParam): +class TapOpParam(BaseOpParam): area: int = Field(default=-1) -class TextOp(BaseOpParam): +class TextOpParam(BaseOpParam): input_str: str = Field(default="") -class LongPressOp(BaseOpParam): +class LongPressOpParam(BaseOpParam): area: int = Field(default=-1) -# Modify This SwipeOp to SwipeOp_3, Need better name -class SwipeOp_3(BaseOpParam): +# Modify This SwipeOp to SwipeOpParam, Need better name +class SwipeOpParam(BaseOpParam): area: int = Field(default=-1) swipe_orient: str = Field(default="up") dist: str = Field(default="") -class GridOp(BaseModel): +class GridOpParam(BaseOpParam): act_name: str = Field(default="") @@ -126,17 +126,17 @@ class BaseGridOpParam(BaseOpParam): return f"{act_name}_grid" -class TapGridOp(BaseGridOpParam): +class TapGridOpParam(BaseGridOpParam): area: int = Field(default=-1) subarea: str = Field(default="") -class LongPressGridOp(BaseGridOpParam): +class LongPressGridOpParam(BaseGridOpParam): area: int = Field(default=-1) subarea: str = Field(default="") -class SwipeGridOp(BaseGridOpParam): +class SwipeGridOpParam(BaseGridOpParam): start_area: int = Field(default=-1) start_subarea: str = Field(default="") end_area: int = Field(default=-1) diff --git a/metagpt/ext/android_assistant/utils/utils.py b/metagpt/ext/android_assistant/utils/utils.py index 67a0cb860..3a5ebc325 100644 --- a/metagpt/ext/android_assistant/utils/utils.py +++ b/metagpt/ext/android_assistant/utils/utils.py @@ -16,16 +16,16 @@ from metagpt.ext.android_assistant.utils.schema import ( BaseGridOpParam, BaseOpParam, Decision, - GridOp, - LongPressGridOp, - LongPressOp, + GridOpParam, + LongPressGridOpParam, + LongPressOpParam, ReflectOp, RunState, - SwipeGridOp, - SwipeOp_3, - TapGridOp, - TapOp, - TextOp, + SwipeGridOpParam, + SwipeOpParam, + TapGridOpParam, + TapOpParam, + TextOpParam, ) from metagpt.logs import logger @@ -260,7 +260,9 @@ def reflect_parse_extarct(parsed_json: dict) -> ReflectOp: return op -def screenshot_parse_extract(parsed_json: dict, grid_on: bool = False) -> Union[BaseOpParam, BaseGridOpParam, GridOp]: +def screenshot_parse_extract( + parsed_json: dict, grid_on: bool = False +) -> Union[BaseOpParam, BaseGridOpParam, GridOpParam]: act = parsed_json.get("Action") last_act = parsed_json.get("Summary") act_name = act.split("(")[0] @@ -284,44 +286,44 @@ def op_params_clean(params: list[str]) -> list[Union[int, str]]: return param_values -def screenshot_parse_extract_without_grid(act_name: str, act: str, last_act: str) -> Union[BaseOpParam, GridOp]: +def screenshot_parse_extract_without_grid(act_name: str, act: str, last_act: str) -> Union[BaseOpParam, GridOpParam]: if act_name == ActionOp.TAP.value: area = int(re.findall(r"tap\((.*?)\)", act)[0]) - op = TapOp(act_name=act_name, area=area, last_act=last_act) + op = TapOpParam(act_name=act_name, area=area, last_act=last_act) elif act_name == ActionOp.TEXT.value: input_str = re.findall(r"text\((.*?)\)", act)[0][1:-1] - op = TextOp(act_name=act_name, input_str=input_str, last_act=last_act) + op = TextOpParam(act_name=act_name, input_str=input_str, last_act=last_act) elif act_name == ActionOp.LONG_PRESS.value: area = int(re.findall(r"long_press\((.*?)\)", act)[0]) - op = LongPressOp(act_name=act_name, area=area, last_act=last_act) + op = LongPressOpParam(act_name=act_name, area=area, last_act=last_act) elif act_name == ActionOp.SWIPE.value: params = re.findall(r"swipe\((.*?)\)", act)[0].split(",") params = op_params_clean(params) # area, swipe_orient, dist - op = SwipeOp_3(act_name=act_name, area=params[0], swipe_orient=params[1], dist=params[2], last_act=last_act) + op = SwipeOpParam(act_name=act_name, area=params[0], swipe_orient=params[1], dist=params[2], last_act=last_act) elif act_name == ActionOp.GRID.value: - op = GridOp(act_name=act_name) + op = GridOpParam(act_name=act_name) else: op = BaseOpParam(param_state=RunState.FAIL) return op -def screenshot_parse_extract_with_grid(act_name: str, act: str, last_act: str) -> Union[BaseGridOpParam, GridOp]: +def screenshot_parse_extract_with_grid(act_name: str, act: str, last_act: str) -> Union[BaseGridOpParam, GridOpParam]: if act_name == ActionOp.TAP.value: params = re.findall(r"tap\((.*?)\)", act)[0].split(",") params = op_params_clean(params) - op = TapGridOp(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act) + op = TapGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act) elif act_name == ActionOp.LONG_PRESS.value: params = re.findall(r"long_press\((.*?)\)", act)[0].split(",") params = op_params_clean(params) - op = LongPressGridOp(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act) + op = LongPressGridOpParam(act_name=act_name, area=params[0], subarea=params[1], last_act=last_act) elif act_name == ActionOp.SWIPE.value: params = re.findall(r"swipe\((.*?)\)", act)[0].split(",") params = op_params_clean(params) - op = SwipeGridOp( + op = SwipeGridOpParam( act_name=act_name, start_area=params[0], start_subarea=params[1], end_area=params[2], end_subarea=params[3] ) elif act_name == ActionOp.GRID.value: - op = GridOp(act_name=act_name) + op = GridOpParam(act_name=act_name) else: op = BaseGridOpParam(param_state=RunState.FAIL) return op