diff --git a/examples/andriod_assistant/apps/demo_Contacts/log_Contacts_demo_Contacts_2024-01-30_21-50-19.txt b/examples/andriod_assistant/apps/demo_Contacts/log_Contacts_demo_Contacts_2024-01-30_21-50-19.txt new file mode 100644 index 000000000..59e2aecd7 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/log_Contacts_demo_Contacts_2024-01-30_21-50-19.txt @@ -0,0 +1,5 @@ +{"step": 1, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 9 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_1.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_2.png", "response": {"id": "chatcmpl-8miqk5n21ZtIdridhvSQyTZUzVel9", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element opens the app drawer, displaying a list of all the apps installed on the device.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622838, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 22, "prompt_tokens": 3101, "total_tokens": 3123}}} +{"step": 2, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 9 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_2.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_3.png", "response": {"id": "chatcmpl-8mirFP7if9MJFST6hNhwTAwS3fSrz", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element will open the Contacts application.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622869, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 11, "prompt_tokens": 3101, "total_tokens": 3112}}} +{"step": 3, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 6 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_3.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_4.png", "response": {"id": "chatcmpl-8mirf3RakbtpZK0zfvJjdXJ48rYNJ", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element allows the user to add a new contact to their contact list.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622895, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 18, "prompt_tokens": 3101, "total_tokens": 3119}}} +{"step": 4, "prompt": "I will give you the screenshot of a mobile app before and after typing in the input area labeled\nwith the number 4 on the screen. The numeric tag of each element is located at the center of the element. \nTyping in this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is \nto describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the \nUI element should focus on the general function. For example, if the change of the screenshot shows that the user typed \n\"How are you?\" in the chat box, you do not need to mention the actual text. Just say: \"This input area is used for the \nuser to type a message to send to the chat window.\". Never include the numeric tag of the UI element in your \ndescription. You can use pronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_4.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_5.png", "response": {"id": "chatcmpl-8mis5yw6Dt9iqFvUBfyKyThUpUBIR", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "The UI element is used for the user to enter the name of a new contact in the Contacts app.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622921, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 21, "prompt_tokens": 3112, "total_tokens": 3133}}} +{"step": 5, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 4 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_5.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_6.png", "response": {"id": "chatcmpl-8misV60JHJEblfhdkseEPxtj5sqqi", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element allows the user to enter a name for the new contact.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622947, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 17, "prompt_tokens": 3101, "total_tokens": 3118}}} diff --git a/examples/andriod_assistant/apps/demo_Contacts/record.txt b/examples/andriod_assistant/apps/demo_Contacts/record.txt new file mode 100644 index 000000000..33d4595fe --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/record.txt @@ -0,0 +1,10 @@ +tap(9):::android.view.ViewGroup_1067_236_android.widget.TextView_183_204_Apps_2 +tap(9):::com.android.launcher3.id_apps_list_view_com.android.launcher3.id_icon_Contacts_7 +tap(6):::com.android.contacts.id_floating_action_button_container_com.android.contacts.id_floating_action_button_addnewcontact_0 +text(4:sep:"zr"):::com.android.contacts.id_editors_android.widget.EditText_775_142_0 +tap(4):::com.android.contacts.id_editors_android.widget.EditText_775_142_0 +text(4:sep:"zr"):::com.android.contacts.id_editors_android.widget.EditText_775_142_0 +tap(6):::com.android.contacts.id_editors_android.widget.EditText_775_142_0 +text(6:sep:"+86 15231955333"):::com.android.contacts.id_editors_android.widget.EditText_775_142_0 +tap(2):::android.widget.LinearLayout_126_147_com.android.contacts.id_menu_save_Save_0 +stop diff --git a/examples/andriod_assistant/apps/demo_Contacts/task_desc.txt b/examples/andriod_assistant/apps/demo_Contacts/task_desc.txt new file mode 100644 index 000000000..8d54c2c80 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/task_desc.txt @@ -0,0 +1 @@ +create a new contact in Contacts app named zr ,with a phone number +86 15231955333 \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_1.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_1.xml new file mode 100644 index 000000000..0cf5c90dd --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_1.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_10.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_10.xml new file mode 100644 index 000000000..1c4204bd3 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_10.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_2.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_2.xml new file mode 100644 index 000000000..76ea0952b --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_2.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_3.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_3.xml new file mode 100644 index 000000000..dc8c728a6 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_3.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_4.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_4.xml new file mode 100644 index 000000000..38971bd2e --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_4.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_5.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_5.xml new file mode 100644 index 000000000..38971bd2e --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_5.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_6.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_6.xml new file mode 100644 index 000000000..88b29a07e --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_6.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_7.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_7.xml new file mode 100644 index 000000000..3053e960b --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_7.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_8.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_8.xml new file mode 100644 index 000000000..51ec4ddcc --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_8.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_9.xml b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_9.xml new file mode 100644 index 000000000..defd9fcd8 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_Contacts/xml/demo_Contacts_2024-01-30_21-50-19_9.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_docs/android.view.ViewGroup_1067_236_android.widget.TextView_183_204_Apps_2.txt b/examples/andriod_assistant/apps/demo_docs/android.view.ViewGroup_1067_236_android.widget.TextView_183_204_Apps_2.txt new file mode 100644 index 000000000..0d1cd0c48 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_docs/android.view.ViewGroup_1067_236_android.widget.TextView_183_204_Apps_2.txt @@ -0,0 +1 @@ +{'tap': 'Tapping this UI element opens the app drawer, displaying a list of all the apps installed on the device.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''} \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_docs/android.widget.LinearLayout_126_147_com.android.contacts.id_menu_save_Save_0.txt b/examples/andriod_assistant/apps/demo_docs/android.widget.LinearLayout_126_147_com.android.contacts.id_menu_save_Save_0.txt new file mode 100644 index 000000000..d0a49e563 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_docs/android.widget.LinearLayout_126_147_com.android.contacts.id_menu_save_Save_0.txt @@ -0,0 +1 @@ +{'tap': 'Tapping this UI element saves the new contact information that has been entered into the Contacts app.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''} \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_docs/com.android.contacts.id_editors_android.widget.EditText_775_142_0.txt b/examples/andriod_assistant/apps/demo_docs/com.android.contacts.id_editors_android.widget.EditText_775_142_0.txt new file mode 100644 index 000000000..300c39f98 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_docs/com.android.contacts.id_editors_android.widget.EditText_775_142_0.txt @@ -0,0 +1 @@ +{'tap': 'Tapping this UI element allows the user to enter a name for the new contact.', 'text': 'The UI element is used for the user to enter the name of a new contact in the Contacts app.', 'v_swipe': '', 'h_swipe': '', 'long_press': ''} \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_docs/com.android.contacts.id_floating_action_button_container_com.android.contacts.id_floating_action_button_addnewcontact_0.txt b/examples/andriod_assistant/apps/demo_docs/com.android.contacts.id_floating_action_button_container_com.android.contacts.id_floating_action_button_addnewcontact_0.txt new file mode 100644 index 000000000..b53b16480 --- /dev/null +++ b/examples/andriod_assistant/apps/demo_docs/com.android.contacts.id_floating_action_button_container_com.android.contacts.id_floating_action_button_addnewcontact_0.txt @@ -0,0 +1 @@ +{'tap': 'Tapping this UI element allows the user to add a new contact to their contact list.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''} \ No newline at end of file diff --git a/examples/andriod_assistant/apps/demo_docs/com.android.launcher3.id_apps_list_view_com.android.launcher3.id_icon_Contacts_7.txt b/examples/andriod_assistant/apps/demo_docs/com.android.launcher3.id_apps_list_view_com.android.launcher3.id_icon_Contacts_7.txt new file mode 100644 index 000000000..88ece6b0e --- /dev/null +++ b/examples/andriod_assistant/apps/demo_docs/com.android.launcher3.id_apps_list_view_com.android.launcher3.id_icon_Contacts_7.txt @@ -0,0 +1 @@ +{'tap': 'Tapping this UI element will open the Contacts application.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''} \ No newline at end of file diff --git a/examples/andriod_assistant/test.py b/examples/andriod_assistant/test.py index e170bd715..3210adb46 100644 --- a/examples/andriod_assistant/test.py +++ b/examples/andriod_assistant/test.py @@ -2,14 +2,105 @@ # -*- coding: utf-8 -*- # @Desc : test case (imgs from appagent's) + +import re +import ast +import json +import time +from pathlib import Path + +from prompts.operation_prompt import ( + tap_doc_template, + text_doc_template, + long_press_doc_template, + swipe_doc_template, + refine_doc_suffix +) +from utils.schema import ActionOp, SwipeOp +from actions.parse_record_an import RECORD_PARSE_NODE +from metagpt.config2 import config +from metagpt.utils.common import encode_image +from metagpt.logs import logger from metagpt.actions.action import Action -from examples.andriod_assistant.utils.schema import AndroidActionOutput -# TODO test for manual record +TEST_BEFORE_PATH = Path( + "examples/andriod_assistant/apps/demo_Contacts/labeled_screenshots/demo_Contacts_2024-01-30_21-50-19_1.png") +TEST_AFTER_PATH = Path( + "examples/andriod_assistant/apps/demo_Contacts/labeled_screenshots/demo_Contacts_2024-01-30_21-50-19_2.png") +RECORD_PATH = Path("examples/andriod_assistant/apps/demo_Contacts/record.txt") +TASK_DESC_PATH = Path("examples/andriod_assistant/apps/demo_Contacts/task_desc.txt") +DOCS_DIR = Path("examples/andriod_assistant/storage") +testaction = Action(name="test") # TODO test for parse record +# 仅使用一张图像进行测试 +img_before_base64 = encode_image(TEST_BEFORE_PATH) +img_after_base64 = encode_image(TEST_AFTER_PATH) -# TODO test for screenshot_parse +with open(RECORD_PATH, "r") as record_file: + rec = record_file.readline().strip() + action, resource_id = rec.split(":::") + action_type = action.split("(")[0] + action_param = re.findall(r"\((.*?)\)", action)[0] + if action_type == ActionOp.TAP.value: + prompt_template = tap_doc_template + context = prompt_template.format(ui_element=action_param) + elif action_type == ActionOp.TEXT.value: + input_area, input_text = action_param.split(":sep:") + prompt_template = text_doc_template + context = prompt_template.format(ui_element=input_area) + elif action_type == ActionOp.LONG_PRESS.value: + prompt_template = long_press_doc_template + context = prompt_template.format(ui_element=action_param) + elif action_type == ActionOp.SWIPE.value: + swipe_area, swipe_dir = action_param.split(":sep:") + if swipe_dir == SwipeOp.UP.value or swipe_dir == SwipeOp.DOWN.value: + action_type = ActionOp.VERTICAL_SWIPE.value + elif swipe_dir == SwipeOp.LEFT.value or swipe_dir == SwipeOp.RIGHT.value: + action_type = ActionOp.HORIZONTAL_SWIPE.value + prompt_template = swipe_doc_template + context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area) + else: + print("Error occurs") -# TODO add AndroidActionOutput to manual record & parse record + task_desc_path = TASK_DESC_PATH + task_desc = open(task_desc_path, "r").read() + context = context.format(task_desc=task_desc) + doc_name = resource_id + ".txt" + + doc_path = DOCS_DIR.joinpath(doc_name) + if doc_path.exists(): + doc_content = ast.literal_eval(open(doc_path).read()) + if doc_content[action_type]: + if config.get_other("doc_refine"): + refine_context = refine_doc_suffix.format(old_doc=doc_content[action_type]) + context += refine_context + logger.info( + f"Documentation for the element {resource_id} already exists. The doc will be " + f"refined based on the latest demo.") + else: + logger.info( + f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE " + f"in the config file if needed.") + else: + doc_content = { + "tap": "", + "text": "", + "v_swipe": "", + "h_swipe": "", + "long_press": "" + } + logger.info(f"Waiting for GPT-4V to generate documentation for the element {resource_id}") + + node = RECORD_PARSE_NODE.fill(context=context, llm=testaction.llm, + images=[img_before_base64, img_after_base64]) + + # log_path = task_dir.joinpath(f"log_{app_name}_{demo_name}.txt") + prompt = node.compile(context=context, schema="json", mode="auto") + msg = node.content + doc_content[action_type] = msg + + with open(doc_path, "w") as outfile: + outfile.write(str(doc_content)) + logger.info(f"Documentation generated and saved to {doc_path}")