Adds New Manual demo & docs in apps folder; Adds Storage folder as result folder

This commit is contained in:
didi 2024-01-30 23:15:06 +08:00
parent 99da01fecd
commit 75e4dfe182
19 changed files with 126 additions and 4 deletions

View file

@ -0,0 +1,5 @@
{"step": 1, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 9 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_1.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_2.png", "response": {"id": "chatcmpl-8miqk5n21ZtIdridhvSQyTZUzVel9", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element opens the app drawer, displaying a list of all the apps installed on the device.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622838, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 22, "prompt_tokens": 3101, "total_tokens": 3123}}}
{"step": 2, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 9 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_2.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_3.png", "response": {"id": "chatcmpl-8mirFP7if9MJFST6hNhwTAwS3fSrz", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element will open the Contacts application.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622869, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 11, "prompt_tokens": 3101, "total_tokens": 3112}}}
{"step": 3, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 6 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_3.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_4.png", "response": {"id": "chatcmpl-8mirf3RakbtpZK0zfvJjdXJ48rYNJ", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element allows the user to add a new contact to their contact list.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622895, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 18, "prompt_tokens": 3101, "total_tokens": 3119}}}
{"step": 4, "prompt": "I will give you the screenshot of a mobile app before and after typing in the input area labeled\nwith the number 4 on the screen. The numeric tag of each element is located at the center of the element. \nTyping in this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is \nto describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the \nUI element should focus on the general function. For example, if the change of the screenshot shows that the user typed \n\"How are you?\" in the chat box, you do not need to mention the actual text. Just say: \"This input area is used for the \nuser to type a message to send to the chat window.\". Never include the numeric tag of the UI element in your \ndescription. You can use pronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_4.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_5.png", "response": {"id": "chatcmpl-8mis5yw6Dt9iqFvUBfyKyThUpUBIR", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "The UI element is used for the user to enter the name of a new contact in the Contacts app.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622921, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 21, "prompt_tokens": 3112, "total_tokens": 3133}}}
{"step": 5, "prompt": "I will give you the screenshot of a mobile app before and after tapping the UI element labeled \nwith the number 4 on the screen. The numeric tag of each element is located at the center of the element. \nTapping this UI element is a necessary part of proceeding with a larger task, which is to create a new contact in Contacts app named zr ,with a phone number +86 15231955333. Your task is to \ndescribe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI \nelement should focus on the general function. For example, if the UI element is used to navigate to the chat window \nwith John, your description should not include the name of the specific person. Just say: \"Tapping this area will \nnavigate the user to the chat window\". Never include the numeric tag of the UI element in your description. You can use \npronouns such as \"the UI element\" to refer to the element.", "image_before": "demo_Contacts_2024-01-30_21-50-19_5.png", "image_after": "demo_Contacts_2024-01-30_21-50-19_6.png", "response": {"id": "chatcmpl-8misV60JHJEblfhdkseEPxtj5sqqi", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "Tapping this UI element allows the user to enter a name for the new contact.", "role": "assistant", "function_call": null, "tool_calls": null}}], "created": 1706622947, "model": "gpt-4-1106-vision-preview", "object": "chat.completion", "system_fingerprint": null, "usage": {"completion_tokens": 17, "prompt_tokens": 3101, "total_tokens": 3118}}}

View file

@ -0,0 +1,10 @@
tap(9):::android.view.ViewGroup_1067_236_android.widget.TextView_183_204_Apps_2
tap(9):::com.android.launcher3.id_apps_list_view_com.android.launcher3.id_icon_Contacts_7
tap(6):::com.android.contacts.id_floating_action_button_container_com.android.contacts.id_floating_action_button_addnewcontact_0
text(4:sep:"zr"):::com.android.contacts.id_editors_android.widget.EditText_775_142_0
tap(4):::com.android.contacts.id_editors_android.widget.EditText_775_142_0
text(4:sep:"zr"):::com.android.contacts.id_editors_android.widget.EditText_775_142_0
tap(6):::com.android.contacts.id_editors_android.widget.EditText_775_142_0
text(6:sep:"+86 15231955333"):::com.android.contacts.id_editors_android.widget.EditText_775_142_0
tap(2):::android.widget.LinearLayout_126_147_com.android.contacts.id_menu_save_Save_0
stop

View file

@ -0,0 +1 @@
create a new contact in Contacts app named zr ,with a phone number +86 15231955333

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
{'tap': 'Tapping this UI element opens the app drawer, displaying a list of all the apps installed on the device.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''}

View file

@ -0,0 +1 @@
{'tap': 'Tapping this UI element saves the new contact information that has been entered into the Contacts app.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''}

View file

@ -0,0 +1 @@
{'tap': 'Tapping this UI element allows the user to enter a name for the new contact.', 'text': 'The UI element is used for the user to enter the name of a new contact in the Contacts app.', 'v_swipe': '', 'h_swipe': '', 'long_press': ''}

View file

@ -0,0 +1 @@
{'tap': 'Tapping this UI element allows the user to add a new contact to their contact list.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''}

View file

@ -0,0 +1 @@
{'tap': 'Tapping this UI element will open the Contacts application.', 'text': '', 'v_swipe': '', 'h_swipe': '', 'long_press': ''}

View file

@ -2,14 +2,105 @@
# -*- coding: utf-8 -*-
# @Desc : test case (imgs from appagent's)
import re
import ast
import json
import time
from pathlib import Path
from prompts.operation_prompt import (
tap_doc_template,
text_doc_template,
long_press_doc_template,
swipe_doc_template,
refine_doc_suffix
)
from utils.schema import ActionOp, SwipeOp
from actions.parse_record_an import RECORD_PARSE_NODE
from metagpt.config2 import config
from metagpt.utils.common import encode_image
from metagpt.logs import logger
from metagpt.actions.action import Action
from examples.andriod_assistant.utils.schema import AndroidActionOutput
# Manual smoke test for the "parse record" step: take the first action of a
# recorded demo, build the matching documentation prompt, ask the LLM to
# describe the UI element from the before/after screenshots, and store the
# result as that element's doc file.

# TODO test for manual record
TEST_BEFORE_PATH = Path(
    "examples/andriod_assistant/apps/demo_Contacts/labeled_screenshots/demo_Contacts_2024-01-30_21-50-19_1.png")
TEST_AFTER_PATH = Path(
    "examples/andriod_assistant/apps/demo_Contacts/labeled_screenshots/demo_Contacts_2024-01-30_21-50-19_2.png")
RECORD_PATH = Path("examples/andriod_assistant/apps/demo_Contacts/record.txt")
TASK_DESC_PATH = Path("examples/andriod_assistant/apps/demo_Contacts/task_desc.txt")
DOCS_DIR = Path("examples/andriod_assistant/storage")

# A bare Action is created only so that its `llm` attribute can be handed to the node below.
testaction = Action(name="test")

# TODO test for parse record
# Only a single before/after screenshot pair is used for this test.
img_before_base64 = encode_image(TEST_BEFORE_PATH)
img_after_base64 = encode_image(TEST_AFTER_PATH)

# TODO test for screenshot_parse
# Only the first line of the record is parsed; it has the form
# "<action>(<params>):::<resource_id>".
with open(RECORD_PATH, "r") as record_file:
    rec = record_file.readline().strip()

action, resource_id = rec.split(":::")
action_type = action.split("(")[0]
action_param = re.findall(r"\((.*?)\)", action)[0]

# Select the prompt template for the action and fill in the element label.
if action_type == ActionOp.TAP.value:
    prompt_template = tap_doc_template
    context = prompt_template.format(ui_element=action_param)
elif action_type == ActionOp.TEXT.value:
    # Parameters look like "<area>:sep:<text>"; only the area goes into the prompt.
    input_area, input_text = action_param.split(":sep:")
    prompt_template = text_doc_template
    context = prompt_template.format(ui_element=input_area)
elif action_type == ActionOp.LONG_PRESS.value:
    prompt_template = long_press_doc_template
    context = prompt_template.format(ui_element=action_param)
elif action_type == ActionOp.SWIPE.value:
    swipe_area, swipe_dir = action_param.split(":sep:")
    # Swipes are documented per axis, so map the direction onto the
    # vertical/horizontal action type that is used as the doc key.
    if swipe_dir == SwipeOp.UP.value or swipe_dir == SwipeOp.DOWN.value:
        action_type = ActionOp.VERTICAL_SWIPE.value
    elif swipe_dir == SwipeOp.LEFT.value or swipe_dir == SwipeOp.RIGHT.value:
        action_type = ActionOp.HORIZONTAL_SWIPE.value
    prompt_template = swipe_doc_template
    context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
else:
    # Without a template there is no `context` to continue with, so fail
    # here with a clear message instead of a NameError further down.
    logger.error(f"Unsupported action type in record: {action_type!r}")
    raise ValueError(f"Unsupported action type in record: {action_type!r}")

# TODO add AndroidActionOutput to manual record & parse record
task_desc_path = TASK_DESC_PATH
# read_text() opens and closes the file itself, so no handle is leaked.
task_desc = task_desc_path.read_text()
context = context.format(task_desc=task_desc)

doc_name = resource_id + ".txt"
doc_path = DOCS_DIR.joinpath(doc_name)

if doc_path.exists():
    # Doc files are written below via str(dict), so they are read back as a
    # Python literal rather than as JSON.
    doc_content = ast.literal_eval(doc_path.read_text())
    if doc_content[action_type]:
        if config.get_other("doc_refine"):
            refine_context = refine_doc_suffix.format(old_doc=doc_content[action_type])
            context += refine_context
            logger.info(
                f"Documentation for the element {resource_id} already exists. The doc will be "
                f"refined based on the latest demo.")
        else:
            # NOTE(review): this branch only logs; the doc is still regenerated
            # and overwritten below. Confirm whether generation should be
            # skipped when refinement is turned off.
            logger.info(
                f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE "
                f"in the config file if needed.")
else:
    # First time this element is seen: start with an empty entry per action type.
    doc_content = {
        "tap": "",
        "text": "",
        "v_swipe": "",
        "h_swipe": "",
        "long_press": ""
    }

logger.info(f"Waiting for GPT-4V to generate documentation for the element {resource_id}")
# NOTE(review): if `fill` is a coroutine in the installed MetaGPT version, this
# call has to be awaited (e.g. inside an async test) - TODO confirm.
node = RECORD_PARSE_NODE.fill(context=context, llm=testaction.llm,
                              images=[img_before_base64, img_after_base64])
# log_path = task_dir.joinpath(f"log_{app_name}_{demo_name}.txt")
# `prompt` is not used yet; presumably intended for the log file sketched above.
prompt = node.compile(context=context, schema="json", mode="auto")
msg = node.content
doc_content[action_type] = msg

# Persist the updated doc as the str() of the dict (read back with ast.literal_eval).
with open(doc_path, "w") as outfile:
    outfile.write(str(doc_content))
logger.info(f"Documentation generated and saved to {doc_path}")