diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..ff6f19aab --- /dev/null +++ b/.coveragerc @@ -0,0 +1,7 @@ +[run] +source = + ./metagpt/ +omit = + */metagpt/environment/android/* + */metagpt/ext/android_assistant/* + */metagpt/ext/werewolf/* \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index a774d0ed1..01ab0342d 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -3,7 +3,7 @@ { "name": "Python 3", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile - "image": "mcr.microsoft.com/devcontainers/python:0-3.11", + "image": "metagpt/metagpt:latest", // Features to add to the dev container. More info: https://containers.dev/features. // "features": {}, @@ -18,7 +18,7 @@ ] } }, - + // Use 'postCreateCommand' to run commands after the container is created. "postCreateCommand": "./.devcontainer/postCreateCommand.sh" diff --git a/.gitattributes b/.gitattributes index 865da2ca2..e6436790e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -14,6 +14,7 @@ *.ico binary *.jpeg binary *.mp3 binary +*.mp4 binary *.zip binary *.bin binary diff --git a/.github/ISSUE_TEMPLATE/show_me_the_bug.md b/.github/ISSUE_TEMPLATE/show_me_the_bug.md index 504a2bd12..0c33f0319 100644 --- a/.github/ISSUE_TEMPLATE/show_me_the_bug.md +++ b/.github/ISSUE_TEMPLATE/show_me_the_bug.md @@ -19,6 +19,7 @@ - LLM type and model name: - System version: - Python version: +- MetaGPT version or branch: diff --git a/.github/workflows/build-package.yaml b/.github/workflows/build-package.yaml index 7f4fee53e..294a13f71 100644 --- a/.github/workflows/build-package.yaml +++ b/.github/workflows/build-package.yaml @@ -1,8 +1,9 @@ name: Build and upload python package on: + workflow_dispatch: release: - types: [created] + types: [created, published] jobs: deploy: diff --git a/.github/workflows/fulltest.yaml b/.github/workflows/fulltest.yaml index 70c800481..2ab6444fa 100644 --- a/.github/workflows/fulltest.yaml +++ b/.github/workflows/fulltest.yaml @@ -30,7 +30,10 @@ jobs: cache: 'pip' - name: Install dependencies run: | - sh tests/scripts/run_install_deps.sh + python -m pip install --upgrade pip + pip install -e .[test] + npm install -g @mermaid-js/mermaid-cli + playwright install --with-deps - name: Run reverse proxy script for ssh service if: contains(github.ref, '-debugger') continue-on-error: true diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index ed4bbb144..d350a87f1 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -11,6 +11,7 @@ on: jobs: pre-commit-check: runs-on: ubuntu-latest + environment: pre-commit steps: - name: Checkout Source Code uses: actions/checkout@v2 diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index afa9faba7..25f82b1e6 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -27,20 +27,57 @@ jobs: cache: 'pip' - name: Install dependencies run: | - sh tests/scripts/run_install_deps.sh + python -m pip install --upgrade pip + pip install -e .[test] + npm install -g @mermaid-js/mermaid-cli + playwright install --with-deps - name: Test with pytest run: | export ALLOW_OPENAI_API_CALL=0 mkdir -p ~/.metagpt && cp tests/config2.yaml ~/.metagpt/config2.yaml - pytest tests/ --doctest-modules --cov=./metagpt/ --cov-report=xml:cov.xml --cov-report=html:htmlcov --durations=20 | tee unittest.txt + pytest --continue-on-collection-errors tests/ \ + --ignore=tests/metagpt/environment/android_env \ + --ignore=tests/metagpt/ext/android_assistant \ + --ignore=tests/metagpt/ext/stanford_town \ + --ignore=tests/metagpt/provider/test_bedrock_api.py \ + --ignore=tests/metagpt/rag/factories/test_embedding.py \ + --ignore=tests/metagpt/ext/werewolf/actions/test_experience_operation.py \ + --ignore=tests/metagpt/provider/test_openai.py \ + --ignore=tests/metagpt/planner/test_action_planner.py \ + --ignore=tests/metagpt/planner/test_basic_planner.py \ + --ignore=tests/metagpt/actions/test_project_management.py \ + --ignore=tests/metagpt/actions/test_write_code.py \ + --ignore=tests/metagpt/actions/test_write_code_review.py \ + --ignore=tests/metagpt/actions/test_write_prd.py \ + --ignore=tests/metagpt/environment/werewolf_env/test_werewolf_ext_env.py \ + --ignore=tests/metagpt/memory/test_brain_memory.py \ + --ignore=tests/metagpt/roles/test_assistant.py \ + --ignore=tests/metagpt/roles/test_engineer.py \ + --ignore=tests/metagpt/serialize_deserialize/test_write_code_review.py \ + --ignore=tests/metagpt/test_environment.py \ + --ignore=tests/metagpt/test_llm.py \ + --ignore=tests/metagpt/tools/test_metagpt_oas3_api_svc.py \ + --ignore=tests/metagpt/tools/test_moderation.py \ + --ignore=tests/metagpt/tools/test_search_engine.py \ + --ignore=tests/metagpt/tools/test_tool_convert.py \ + --ignore=tests/metagpt/tools/test_web_browser_engine_playwright.py \ + --ignore=tests/metagpt/utils/test_mermaid.py \ + --ignore=tests/metagpt/utils/test_redis.py \ + --ignore=tests/metagpt/utils/test_tree.py \ + --ignore=tests/metagpt/serialize_deserialize/test_sk_agent.py \ + --ignore=tests/metagpt/utils/test_text.py \ + --ignore=tests/metagpt/actions/di/test_write_analysis_code.py \ + --ignore=tests/metagpt/provider/test_ark.py \ + --doctest-modules --cov=./metagpt/ --cov-report=xml:cov.xml --cov-report=html:htmlcov \ + --durations=20 | tee unittest.txt - name: Show coverage report run: | coverage report -m - name: Show failed tests and overall summary run: | grep -E "FAILED tests|ERROR tests|[0-9]+ passed," unittest.txt - failed_count=$(grep -E "FAILED|ERROR" unittest.txt | wc -l) - if [[ "$failed_count" -gt 0 ]]; then + failed_count=$(grep -E "FAILED tests|ERROR tests" unittest.txt | wc -l | tr -d '[:space:]') + if [[ $failed_count -gt 0 ]]; then echo "$failed_count failed lines found! Task failed." exit 1 fi diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..292433f80 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +recursive-include metagpt/ext/stanford_town/prompts *.txt +recursive-include metagpt/ext/stanford_town/static_dirs *.csv +recursive-include metagpt/ext/stanford_town/static_dirs *.json \ No newline at end of file diff --git a/README.md b/README.md index 9f129105c..3410e08fc 100644 --- a/README.md +++ b/README.md @@ -26,18 +26,16 @@ # MetaGPT: The Multi-Agent Framework
## News -🚀 Mar. 29, 2024: [v0.8.0](https://github.com/geekan/MetaGPT/releases/tag/v0.8.0) released. Now you can use Data Interpreter via pypi package import. Meanwhile, we integrated RAG module and supported multiple new LLMs. - -🚀 Mar. 14, 2024: Our **Data Interpreter** paper is on [arxiv](https://arxiv.org/abs/2402.18679). Check the [example](https://docs.deepwisdom.ai/main/en/DataInterpreter/) and [code](https://github.com/geekan/MetaGPT/tree/main/examples/di)! +🚀 Mar. 29, 2024: [v0.8.0](https://github.com/geekan/MetaGPT/releases/tag/v0.8.0) released. Now you can use Data Interpreter ([arxiv](https://arxiv.org/abs/2402.18679), [example](https://docs.deepwisdom.ai/main/en/DataInterpreter/), [code](https://github.com/geekan/MetaGPT/tree/main/examples/di)) via pypi package import. Meanwhile, we integrated RAG module and supported multiple new LLMs. 🚀 Feb. 08, 2024: [v0.7.0](https://github.com/geekan/MetaGPT/releases/tag/v0.7.0) released, supporting assigning different LLMs to different Roles. We also introduced [Data Interpreter](https://github.com/geekan/MetaGPT/blob/main/examples/di/README.md), a powerful agent capable of solving a wide range of real-world problems. 🚀 Jan. 16, 2024: Our paper [MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework -](https://arxiv.org/abs/2308.00352) accepted for oral presentation **(top 1.2%)** at ICLR 2024, **ranking #1** in the LLM-based Agent category. +](https://openreview.net/forum?id=VtmBAGCN7o) accepted for **oral presentation (top 1.2%)** at ICLR 2024, **ranking #1** in the LLM-based Agent category. 🚀 Jan. 03, 2024: [v0.6.0](https://github.com/geekan/MetaGPT/releases/tag/v0.6.0) released, new features include serialization, upgraded OpenAI package and supported multiple LLM, provided [minimal example for debate](https://github.com/geekan/MetaGPT/blob/main/examples/debate_simple.py) etc. -🚀 Dec. 15, 2023: [v0.5.0](https://github.com/geekan/MetaGPT/releases/tag/v0.5.0) released, introducing some experimental features such as **incremental development**, **multilingual**, **multiple programming languages**, etc. +🚀 Dec. 15, 2023: [v0.5.0](https://github.com/geekan/MetaGPT/releases/tag/v0.5.0) released, introducing some experimental features such as incremental development, multilingual, multiple programming languages, etc. 🔥 Nov. 08, 2023: MetaGPT is selected into [Open100: Top 100 Open Source achievements](https://www.benchcouncil.org/evaluation/opencs/annual.html). @@ -85,8 +83,8 @@ # Check https://docs.deepwisdom.ai/main/en/guide/get_started/configuration.html ```yaml llm: - api_type: "openai" # or azure / ollama / open_llm etc. Check LLMType for more options - model: "gpt-4-turbo-preview" # or gpt-3.5-turbo-1106 / gpt-4-1106-preview + api_type: "openai" # or azure / ollama / groq etc. Check LLMType for more options + model: "gpt-4-turbo" # or gpt-3.5-turbo base_url: "https://api.openai.com/v1" # or forward url / other llm url api_key: "YOUR_API_KEY" ``` @@ -107,7 +105,7 @@ ### Usage print(repo) # it will print the repo structure with files ``` -You can also use its [Data Interpreter](https://github.com/geekan/MetaGPT/tree/main/examples/di) +You can also use [Data Interpreter](https://github.com/geekan/MetaGPT/tree/main/examples/di) to write code: ```python import asyncio @@ -147,10 +145,13 @@ ## Tutorial ## Support -### Discard Join US -📢 Join Our [Discord Channel](https://discord.gg/ZRHeExS6xv)! +### Discord Join US -Looking forward to seeing you there! 🎉 +📢 Join Our [Discord Channel](https://discord.gg/ZRHeExS6xv)! Looking forward to seeing you there! 🎉 + +### Contributor form + +📝 [Fill out the form](https://airtable.com/appInfdG0eJ9J4NNL/pagK3Fh1sGclBvVkV/form) to become a contributor. We are looking forward to your participation! ### Contact Information @@ -165,16 +166,15 @@ ## Citation To stay updated with the latest research and development, follow [@MetaGPT_](https://twitter.com/MetaGPT_) on Twitter. -To cite [MetaGPT](https://arxiv.org/abs/2308.00352) or [Data Interpreter](https://arxiv.org/abs/2402.18679) in publications, please use the following BibTeX entries. +To cite [MetaGPT](https://openreview.net/forum?id=VtmBAGCN7o) or [Data Interpreter](https://arxiv.org/abs/2402.18679) in publications, please use the following BibTeX entries. ```bibtex -@misc{hong2023metagpt, - title={MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework}, - author={Sirui Hong and Mingchen Zhuge and Jonathan Chen and Xiawu Zheng and Yuheng Cheng and Ceyao Zhang and Jinlin Wang and Zili Wang and Steven Ka Shing Yau and Zijuan Lin and Liyang Zhou and Chenyu Ran and Lingfeng Xiao and Chenglin Wu and Jürgen Schmidhuber}, - year={2023}, - eprint={2308.00352}, - archivePrefix={arXiv}, - primaryClass={cs.AI} +@inproceedings{hong2024metagpt, + title={Meta{GPT}: Meta Programming for A Multi-Agent Collaborative Framework}, + author={Sirui Hong and Mingchen Zhuge and Jonathan Chen and Xiawu Zheng and Yuheng Cheng and Jinlin Wang and Ceyao Zhang and Zili Wang and Steven Ka Shing Yau and Zijuan Lin and Liyang Zhou and Chenyu Ran and Lingfeng Xiao and Chenglin Wu and J{\"u}rgen Schmidhuber}, + booktitle={The Twelfth International Conference on Learning Representations}, + year={2024}, + url={https://openreview.net/forum?id=VtmBAGCN7o} } @misc{hong2024data, title={Data Interpreter: An LLM Agent For Data Science}, @@ -184,6 +184,5 @@ ## Citation archivePrefix={arXiv}, primaryClass={cs.AI} } - ``` diff --git a/config/config2.example.yaml b/config/config2.example.yaml index c5454ec32..b82468eed 100644 --- a/config/config2.example.yaml +++ b/config/config2.example.yaml @@ -1,17 +1,24 @@ llm: - api_type: "openai" # or azure / ollama / open_llm etc. Check LLMType for more options + api_type: "openai" # or azure / ollama / groq etc. base_url: "YOUR_BASE_URL" api_key: "YOUR_API_KEY" - model: "gpt-4-turbo-preview" # or gpt-3.5-turbo-1106 / gpt-4-1106-preview + model: "gpt-4-turbo" # or gpt-3.5-turbo proxy: "YOUR_PROXY" # for LLM API requests # timeout: 600 # Optional. If set to 0, default value is 300. - pricing_plan: "" # Optional. If invalid, it will be automatically filled in with the value of the `model`. - # Azure-exclusive pricing plan mappings: - # - gpt-3.5-turbo 4k: "gpt-3.5-turbo-1106" - # - gpt-4-turbo: "gpt-4-turbo-preview" - # - gpt-4-turbo-vision: "gpt-4-vision-preview" - # - gpt-4 8k: "gpt-4" - # See for more: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/ + # Details: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/ + pricing_plan: "" # Optional. Use for Azure LLM when its model name is not the same as OpenAI's + + +# RAG Embedding. +# For backward compatibility, if the embedding is not set and the llm's api_type is either openai or azure, the llm's config will be used. +embedding: + api_type: "" # openai / azure / gemini / ollama etc. Check EmbeddingType for more options. + base_url: "" + api_key: "" + model: "" + api_version: "" + embed_batch_size: 100 + dimensions: # output dimension of embedding model repair_llm_output: true # when the output is not a valid json, try to repair it @@ -28,7 +35,7 @@ browser: mermaid: engine: "pyppeteer" - path: "/Applications/Google Chrome.app" + pyppeteer_path: "/Applications/Google Chrome.app" redis: host: "YOUR_HOST" @@ -52,3 +59,27 @@ iflytek_api_key: "YOUR_API_KEY" iflytek_api_secret: "YOUR_API_SECRET" metagpt_tti_url: "YOUR_MODEL_URL" + +omniparse: + api_key: "YOUR_API_KEY" + base_url: "YOUR_BASE_URL" + +models: +# "YOUR_MODEL_NAME_1 or YOUR_API_TYPE_1": # model: "gpt-4-turbo" # or gpt-3.5-turbo +# api_type: "openai" # or azure / ollama / groq etc. +# base_url: "YOUR_BASE_URL" +# api_key: "YOUR_API_KEY" +# proxy: "YOUR_PROXY" # for LLM API requests +# # timeout: 600 # Optional. If set to 0, default value is 300. +# # Details: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/ +# pricing_plan: "" # Optional. Use for Azure LLM when its model name is not the same as OpenAI's +# "YOUR_MODEL_NAME_2 or YOUR_API_TYPE_2": # api_type: "openai" # or azure / ollama / groq etc. +# api_type: "openai" # or azure / ollama / groq etc. +# base_url: "YOUR_BASE_URL" +# api_key: "YOUR_API_KEY" +# proxy: "YOUR_PROXY" # for LLM API requests +# # timeout: 600 # Optional. If set to 0, default value is 300. +# # Details: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/ +# pricing_plan: "" # Optional. Use for Azure LLM when its model name is not the same as OpenAI's + +agentops_api_key: "YOUR_AGENTOPS_API_KEY" # get key from https://app.agentops.ai/settings/projects diff --git a/config/config2.yaml b/config/config2.yaml index 8e5825b57..b3f24539c 100644 --- a/config/config2.yaml +++ b/config/config2.yaml @@ -1,7 +1,8 @@ # Full Example: https://github.com/geekan/MetaGPT/blob/main/config/config2.example.yaml # Reflected Code: https://github.com/geekan/MetaGPT/blob/main/metagpt/config2.py +# Config Docs: https://docs.deepwisdom.ai/main/en/guide/get_started/configuration.html llm: - api_type: "openai" # or azure / ollama / open_llm etc. Check LLMType for more options - model: "gpt-4-turbo-preview" # or gpt-3.5-turbo-1106 / gpt-4-1106-preview + api_type: "openai" # or azure / ollama / groq etc. + model: "gpt-4-turbo" # or gpt-3.5-turbo base_url: "https://api.openai.com/v1" # or forward url / other llm url api_key: "YOUR_API_KEY" \ No newline at end of file diff --git a/config/examples/anthropic-claude-3-5-sonnet.yaml b/config/examples/anthropic-claude-3-5-sonnet.yaml new file mode 100644 index 000000000..7c4df6064 --- /dev/null +++ b/config/examples/anthropic-claude-3-5-sonnet.yaml @@ -0,0 +1,5 @@ +llm: + api_type: 'claude' # or anthropic + base_url: 'https://api.anthropic.com' + api_key: 'YOUR_API_KEY' + model: 'claude-3-5-sonnet-20240620' # or 'claude-3-opus-20240229' \ No newline at end of file diff --git a/config/examples/aws-bedrock.yaml b/config/examples/aws-bedrock.yaml new file mode 100644 index 000000000..d44fe8386 --- /dev/null +++ b/config/examples/aws-bedrock.yaml @@ -0,0 +1,10 @@ +llm: + api_type: 'bedrock' + access_key: 'YOUR_API_KEY' + secret_key: 'YOUR_API_SECRET' + + region_name: "us-east-1" + model: "meta.llama2-70b-chat-v1" + # model: "anthropic.claude-3-sonnet-20240229-v1:0" + # model: "mistral.mixtral-8x7b-instruct-v0:1" + # model: "meta.llama2-13b-chat-v1" \ No newline at end of file diff --git a/config/examples/google-gemini.yaml b/config/examples/google-gemini.yaml new file mode 100644 index 000000000..82a22bdf5 --- /dev/null +++ b/config/examples/google-gemini.yaml @@ -0,0 +1,4 @@ +llm: + api_type: 'gemini' + api_key: 'YOUR_API_KEY' + model: 'gemini-pro' \ No newline at end of file diff --git a/config/examples/groq-llama3-70b.yaml b/config/examples/groq-llama3-70b.yaml new file mode 100644 index 000000000..93ff24b3d --- /dev/null +++ b/config/examples/groq-llama3-70b.yaml @@ -0,0 +1,5 @@ +llm: + # Visit https://console.groq.com/keys to create api key + base_url: "https://api.groq.com/openai/v1" + api_key: "YOUR_API_KEY" + model: "llama3-70b-8192" # llama3-8b-8192,llama3-70b-8192,llama2-70b-4096 ,mixtral-8x7b-32768,gemma-7b-it diff --git a/config/examples/huoshan_ark.yaml b/config/examples/huoshan_ark.yaml new file mode 100644 index 000000000..b0516359b --- /dev/null +++ b/config/examples/huoshan_ark.yaml @@ -0,0 +1,5 @@ +llm: + api_type: "ark" + model: "" # your model endpoint like ep-xxx + base_url: "https://ark.cn-beijing.volces.com/api/v3" + api_key: "" # your api-key like ey…… \ No newline at end of file diff --git a/config/examples/openai-gpt-3.5-turbo.yaml b/config/examples/openai-gpt-3.5-turbo.yaml new file mode 100644 index 000000000..41364842a --- /dev/null +++ b/config/examples/openai-gpt-3.5-turbo.yaml @@ -0,0 +1,5 @@ +llm: + api_key: "YOUR_API_KEY" + model: "gpt-3.5-turbo" + #proxy: "http://
+
+After remembering the location where you want to operate, a request similar to the one below will be output in the terminal. Reply to it and thereby direct the Android assistant to learn your demonstration action:
+
+```bash
+| INFO | examples.android_assistant.actions.manual_record:run:96 - Which element do you want to tap? Choose a numeric tag from 1 to 11:
+user_input: 8
+| INFO | examples.android_assistant.actions.manual_record:run:81 - Choose one of the following actions you want to perform on the current screen:
+tap, text, long_press, swipe, stop
+user_input: tap
+```
+
+### Automatic Execution Stage
+After the Android Assistant completes the learning stage, you can command it to complete tasks on the phone through text descriptions. By configuring the operation documents from the self-learning stage, the Android Assistant has richer prior knowledge, and its execution capabilities are further enhanced.
+You can instruct the Android Assistant to send messages in the "Messenger" app with the following command:
+```bash
+python run_assistant.py "Send 'When will we release this feature?' to +86 8888888" --stage "act" --mode "auto or manual" --app-name "Messenger"
+```
+Specifically, by selecting `auto` for `mode`, the Android assistant will employ the operational records compiled through self-exploration. Alternatively, if `manual` is chosen as the `mode`, the Android assistant will leverage the operation manuals accrued from learning via human demonstration.
+
+## Installation
+To use the Android Assistant, you first need to meet the following conditions:
+1. Complete the installation of the MetaGPT environment.
+2. Install [Android Debug Bridge (ADB)](https://developer.android.com/tools/adb?hl=zh-cn) on your PC, which enables interaction between your PC and Android devices.
+3. Install Android Studio and within it, install the Android emulator to provide an environment for the Android Assistant to learn and execute. For information on how to install the Android emulator, refer to [Quick Installation of Android Studio & Emulator](https://docs.expo.dev/workflow/android-studio-emulator/).
+4. (Optional) Connect your Android device to the USB port of your PC, which can also provide an environment for the Android Assistant to learn and execute.
+
+Note ⚠️: When operating with the Android emulator, the emulator model we use is Medium Phone, which is recommended for first-time users to complete the operation.
+
+After completing these operations, you can enter the following command to check if ADB is installed successfully and if the Android device is connected:
+```bash
+adb devices
+```
+
+## Usage
+The MetaGPT Android Assistant is designed within the MetaGPT framework as a collection of Roles and multiple Actions. You can run it by executing the `run_assistant.py` script. The specific parameter description of this script is as follows:
+```text
+Usage: run_assistant.py [OPTIONS] TASK_DESC
+
+ Run a Android Assistant
+
+Arguments:
+ TASK_DESC the task description you want the android assistant to learn or
+ act [required]
+
+Options:
+ --n-round INTEGER The max round to do an app operation task.
+ [default: 20]
+ --stage TEXT stage: learn / act [default: learn]
+ --mode TEXT mode: auto / manual , when state=learn
+ [default: auto]
+ --app-name TEXT the name of app you want to run [default:
+ demo]
+ --investment FLOAT Dollar amount to invest in the AI company.
+ [default: 5.0]
+ --refine-doc / --no-refine-doc Refine existing operation docs based on the
+ latest observation if True. [default: no-
+ refine-doc]
+ --min-dist INTEGER The minimum distance between elements to
+ prevent overlapping during the labeling
+ process. [default: 30]
+ --android-screenshot-dir TEXT The path to store screenshots on android
+ device. Make sure it exists. [default:
+ /sdcard/Pictures/Screenshots]
+ --android-xml-dir TEXT The path to store xml files for determining
+ UI elements localtion. Make sure it exists.
+ [default: /sdcard]
+ --device-id TEXT The Android device_id [default:
+ emulator-5554]
+ --help Show this message and exit.
+```
+
+## Acknowledgements
+The MetaGPT Android Assistant has referenced some ideas and code from the [AppAgent](https://github.com/mnotgod96/AppAgent) project. We thank the developers of the Appagent project.
+
+### Citation
+
+```bib
+@misc{yang2023appagent,
+ title={AppAgent: Multimodal Agents as Smartphone Users},
+ author={Chi Zhang and Zhao Yang and Jiaxuan Liu and Yucheng Han and Xin Chen and Zebiao Huang and Bin Fu and Gang Yu},
+ year={2023},
+ eprint={2312.13771},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
\ No newline at end of file
diff --git a/metagpt/ext/android_assistant/README_CN.md b/metagpt/ext/android_assistant/README_CN.md
new file mode 100644
index 000000000..a1abbe3b0
--- /dev/null
+++ b/metagpt/ext/android_assistant/README_CN.md
@@ -0,0 +1,113 @@
+# MetaGPT 安卓助理
+
+MetaGPT安卓助理是一款依托于先进的MetaGPT框架构建的多模态大语言模型驱动的智能辅助工具。
+它具备自我学习的能力,能够通过学习掌握用户的日常使用方式,同时能够根据用户的指令自动完成各类应用程序的操作任务,实现了用户双手的全面解放。
+接下来,我们将介绍MetaGPT安卓助理的功能以及如何使用它。
+
+## 功能
+
+MetaGPT 安卓助理的执行主要包含两个阶段,分别为自我学习与自动执行。下面,我们将从这两个阶段介绍MetaGPT 安卓助理的具体功能。
+
+### 自我学习阶段
+
+通过学习人类演示或基于人类指令对app进行探索,MetaGPT安卓助理可以对app的功能进行学习,生成相应的操作文档,为后续的“自动执行”阶段使用。对于任何给定的任务目标,进行约20轮的探索可以显著提高性能。
+
+通过设定`stage`为`learn`可要求安卓助理进入自我学习阶段。通过设定`mode`为`auto`,可要求安卓助理通过自动探索学习,通过设定`mode`为`manual`,可要求安卓助理通过人类手动演示学习。在使用章节,我们对脚本的参数进行了详细的说明。
+您可以尝试对“Messenger”应用程序进行自动探索和手动演示模式的实验,具体命令如下:
+
+```bash
+cd examples/android_assistant
+python run_assistant.py "Send 'When will we release this feature? to +86 8888888'" --stage "learn" --mode "auto or manual" --app-name "Messenger"
+```
+
+#### 基于人类演示的学习
+在要求安卓助理在自我学习阶段执行自我探索时,您可以解放您的双手,但在要求他根据您的指令进行学习时,你需要根据终端中的指令进行输入,以便安卓助理能够准确地学习您的操作方式。
+一个可能的例子如下:
+
+```bash
+cd examples/android_assistant
+python run_assistant.py "Send 'When will we release this feature? to +86 8888888'" --stage "learn" --mode "manual" --app-name "Messenger"
+```
+
+在运行这一指令后,你将首先看到一个在各个可交互的位置进行了标记的安卓屏幕的截图,如下图:
+
+
+
+在记住你要操作的位置之后,终端中将会输出与下面类似的要求,回复它,进而指挥安卓助理学习你的演示行为:
+
+```bash
+| INFO | examples.android_assistant.actions.manual_record:run:96 - Which element do you want to tap? Choose a numeric tag from 1 to 11:
+user_input: 8
+| INFO | examples.android_assistant.actions.manual_record:run:81 - Choose one of the following actions you want to perform on the current screen:
+tap, text, long_press, swipe, stop
+user_input: tap
+```
+### 自动执行阶段
+在安卓助理完成了自我学习阶段之后,您可以通过文本描述的方式,指挥安卓助理在手机中完成任务。通过为其配置自我学习阶段的操作文档,安卓助理具备了更丰富的前置知识,执行能力进一步得到提升。
+你可以通过以下指令,指挥安卓助理在“Messenger”应用中发送信息:
+```bash
+python run_assistant.py "Send 'When will we release this feature? to +86 8888888'" --stage "act" --mode "auto or manual" --app-name "Messenger"
+```
+其中,`mode`选择`auto`,安卓助理将使用自我探索中积累的操作文档;`mode`选择`manual`,安卓助理将使用人类演示学习中积累的操作文档。
+
+## 安装
+为了使用安卓助理,你首先需要满足以下条件:
+1. 完成MetaGPT环境的安装
+2. 在你的PC上安装[Android Debug Bridge(ADB)](https://developer.android.com/tools/adb?hl=zh-cn),ADB可以使你的PC与安卓设备进行交互。
+3. 安装Android Studio,在其中安装Android模拟器,以为安卓助手提供学习与执行的环境。关于如何安装Android模拟器,可以参考[快速安装Android Studio & Emulator](https://dev.weixin.qq.com/docs/framework/dev/framework/env/android-simulator.html)。
+4. (Optional) 将你的安卓设备连接到PC的USB端口上,这同样可以为安卓助手提供学习与执行的环境。
+
+注意 ⚠️:在使用Android模拟器进行操作时,我们使用的模拟器型号为Medium Phone,建议第一次尝试此类应用的用户使用这一型号完成操作。
+
+在完成这一系列操作之后,你可以输入以下命令检查ADB是否安装成功,以及安卓设备是否连接
+```bash
+adb devices
+```
+## 使用
+MetaGPT 安卓助理在MetaGPT框架中被设计为一个`Role`与多个`Action`的集合,你可以通过运行`run_assistant.py`脚本来运行它。这一脚本具体的参数说明如下:
+```text
+用法:run_assistant.py [选项] 任务描述
+
+ 运行一个安卓助手
+
+参数:
+ TASK_DESC 你希望安卓助手学习或执行的任务描述
+ [必需]
+
+选项:
+ --n-round 整数 执行应用程序操作任务的最大轮数。
+ [默认值:20]
+ --stage 文本 阶段:learn/act [默认值:learn]
+ --mode 文本 模式:auto/manual,当状态=learn时 [默认值:auto]
+ --app-name 文本 你想要运行的应用程序名称 [默认值:
+ 演示]
+ --investment 浮点数 投资于人工智能公司的美元金额。
+ [默认值:5.0]
+ --refine-doc / --no-refine-doc 如果为真,则根据最新的观察结果优化现有操作文档。
+ [默认值:--no-refine-doc]
+ --min-dist 整数 在标记过程中防止元素重叠的最小元素间距。
+ [默认值:30]
+ --android-screenshot-dir 文本 在安卓设备上存储截图的路径。确保其存在。
+ [默认值:/sdcard/Pictures/Screenshots]
+ --android-xml-dir 文本 存储用于确定UI元素位置的XML文件的路径。
+ 确保其存在。[默认值:/sdcard]
+ --device-id 文本 安卓device_id [默认值:
+ 模拟器-5554]
+ --help 显示此信息并退出。
+```
+
+## 致谢
+MetaGPT 安卓助理参考了 [AppAgent](https://github.com/mnotgod96/AppAgent) 项目的部分思路与代码,感谢 Appagent 项目的开发者们。
+
+### 引用
+
+```bib
+@misc{yang2023appagent,
+ title={AppAgent: Multimodal Agents as Smartphone Users},
+ author={Chi Zhang and Zhao Yang and Jiaxuan Liu and Yucheng Han and Xin Chen and Zebiao Huang and Bin Fu and Gang Yu},
+ year={2023},
+ eprint={2312.13771},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
\ No newline at end of file
diff --git a/metagpt/ext/android_assistant/__init__.py b/metagpt/ext/android_assistant/__init__.py
new file mode 100644
index 000000000..2bcf8efd0
--- /dev/null
+++ b/metagpt/ext/android_assistant/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc :
diff --git a/metagpt/ext/android_assistant/actions/__init__.py b/metagpt/ext/android_assistant/actions/__init__.py
new file mode 100644
index 000000000..2bcf8efd0
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc :
diff --git a/metagpt/ext/android_assistant/actions/manual_record.py b/metagpt/ext/android_assistant/actions/manual_record.py
new file mode 100644
index 000000000..bcfb2ed89
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/manual_record.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : manual record user interaction in stage=learn & mode=manual, LIKE scripts/step_recorder.py
+import time
+from pathlib import Path
+
+import cv2
+
+from metagpt.actions.action import Action
+from metagpt.config2 import config
+from metagpt.environment.android.android_env import AndroidEnv
+from metagpt.environment.android.const import ADB_EXEC_FAIL
+from metagpt.environment.android.env_space import (
+ EnvAction,
+ EnvActionType,
+ EnvObsParams,
+ EnvObsType,
+)
+from metagpt.ext.android_assistant.utils.schema import (
+ ActionOp,
+ AndroidActionOutput,
+ RunState,
+ SwipeOp,
+)
+from metagpt.ext.android_assistant.utils.utils import (
+ draw_bbox_multi,
+ elem_list_from_xml_tree,
+)
+from metagpt.logs import logger
+
+
+class ManualRecord(Action):
+ """do a human operation on the screen with human input"""
+
+ name: str = "ManualRecord"
+
+ useless_list: list[str] = [] # store useless elements uid
+ record_path: Path = ""
+ task_desc_path: Path = ""
+ screenshot_before_path: Path = ""
+ screenshot_after_path: Path = ""
+ xml_path: Path = ""
+
+ async def run(self, task_desc: str, task_dir: Path, env: AndroidEnv):
+ self.record_path = Path(task_dir) / "record.txt"
+ self.task_desc_path = Path(task_dir) / "task_desc.txt"
+ self.screenshot_before_path = Path(task_dir) / "raw_screenshots"
+ self.screenshot_after_path = Path(task_dir) / "labeled_screenshots"
+ self.xml_path = Path(task_dir) / "xml"
+ for path in [self.screenshot_before_path, self.screenshot_after_path, self.xml_path]:
+ path.mkdir(parents=True, exist_ok=True)
+
+ self.record_path.write_text("")
+ record_file = open(self.record_path, "w")
+ self.task_desc_path.write_text(task_desc)
+
+ step = 0
+ extra_config = config.extra
+ while True:
+ step += 1
+ screenshot_path: Path = env.observe(
+ EnvObsParams(
+ obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{step}", local_save_dir=self.screenshot_before_path
+ )
+ )
+ xml_path: Path = env.observe(
+ EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{step}", local_save_dir=self.xml_path)
+ )
+ if not screenshot_path.exists() or not xml_path.exists():
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ elem_list = elem_list_from_xml_tree(xml_path, self.useless_list, extra_config.get("min_dist", 30))
+
+ screenshot_labeled_path = Path(self.screenshot_after_path).joinpath(f"{step}_labeled.png")
+ labeled_img = draw_bbox_multi(screenshot_path, screenshot_labeled_path, elem_list)
+
+ cv2.namedWindow("image", cv2.WINDOW_NORMAL)
+ cv2.imshow("image", labeled_img)
+ cv2.waitKey(0)
+ cv2.destroyAllWindows()
+
+ user_input = "xxx"
+ logger.info(
+ "Choose one of the following actions you want to perform on the current screen:\n"
+ "tap, text, long_press, swipe, stop"
+ )
+
+ while (
+ user_input.lower() != ActionOp.TAP.value
+ and user_input.lower() != ActionOp.TEXT.value
+ and user_input.lower() != ActionOp.LONG_PRESS.value
+ and user_input.lower() != ActionOp.SWIPE.value
+ and user_input.lower() != ActionOp.STOP.value
+ ):
+ user_input = input("user_input: ")
+
+ if user_input.lower() == ActionOp.TAP.value:
+ logger.info(f"Which element do you want to tap? Choose a numeric tag from 1 to {len(elem_list)}:")
+ user_input = "xxx"
+ while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
+ user_input = input("user_input: ")
+ tl, br = elem_list[int(user_input) - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
+ log_str = f"tap({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
+ elif user_input.lower() == ActionOp.TEXT.value:
+ logger.info(
+ f"Which element do you want to input the text string? Choose a numeric tag from 1 to "
+ f"{len(elem_list)}:"
+ )
+ input_area = "xxx"
+ while not input_area.isnumeric() or int(input_area) > len(elem_list) or int(input_area) < 1:
+ input_area = input("user_input: ")
+ logger.info("Enter your input text below:")
+ user_input = ""
+ while not user_input:
+ user_input = input("user_input: ")
+ action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=user_input)
+ log_str = f"text({input_area}:sep:'{user_input}'):::{elem_list[int(input_area) - 1].uid}\n"
+ elif user_input.lower() == ActionOp.LONG_PRESS.value:
+ logger.info(
+ f"Which element do you want to long press? Choose a numeric tag from 1 to {len(elem_list)}:"
+ )
+ user_input = "xxx"
+ while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
+ user_input = input("user_input: ")
+ tl, br = elem_list[int(user_input) - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+ action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
+ log_str = f"long_press({int(user_input)}):::{elem_list[int(user_input) - 1].uid}\n"
+ elif user_input.lower() == ActionOp.SWIPE.value:
+ logger.info(
+ "What is the direction of your swipe? Choose one from the following options:\n"
+ "up, down, left, right"
+ )
+ user_input = ""
+ while (
+ user_input != SwipeOp.UP.value
+ and user_input != SwipeOp.DOWN.value
+ and user_input != SwipeOp.LEFT.value
+ and user_input != SwipeOp.RIGHT.value
+ ):
+ user_input = input("user_input: ")
+ swipe_dir = user_input
+ logger.info(f"Which element do you want to swipe? Choose a numeric tag from 1 to {len(elem_list)}:")
+ while not user_input.isnumeric() or int(user_input) > len(elem_list) or int(user_input) < 1:
+ user_input = input("user_input: ")
+ tl, br = elem_list[int(user_input) - 1].bbox
+ x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
+
+ action = EnvAction(action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=swipe_dir)
+ log_str = f"swipe({int(user_input)}:sep:{swipe_dir}):::{elem_list[int(user_input) - 1].uid}\n"
+ elif user_input.lower() == ActionOp.STOP.value:
+ record_file.write("stop\n")
+ record_file.close()
+ break
+ else:
+ break
+
+ obs, _, _, _, info = env.step(action)
+ action_res = info["res"]
+ if action_res == ADB_EXEC_FAIL:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+ record_file.write(log_str)
+
+ time.sleep(1)
+
+ return AndroidActionOutput(action_state=RunState.SUCCESS)
diff --git a/metagpt/ext/android_assistant/actions/parse_record.py b/metagpt/ext/android_assistant/actions/parse_record.py
new file mode 100644
index 000000000..304daf655
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/parse_record.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : parse record to generate learned standard operations in stage=learn & mode=manual,
+# LIKE scripts/document_generation.py
+
+import ast
+import re
+from pathlib import Path
+
+from metagpt.actions.action import Action
+from metagpt.config2 import config
+from metagpt.ext.android_assistant.actions.parse_record_an import RECORD_PARSE_NODE
+from metagpt.ext.android_assistant.prompts.operation_prompt import (
+ long_press_doc_template,
+ refine_doc_suffix,
+ swipe_doc_template,
+ tap_doc_template,
+ text_doc_template,
+)
+from metagpt.ext.android_assistant.utils.schema import (
+ ActionOp,
+ AndroidActionOutput,
+ RecordLogItem,
+ RunState,
+ SwipeOp,
+)
+from metagpt.logs import logger
+from metagpt.utils.common import encode_image
+
+
+class ParseRecord(Action):
+ name: str = "ParseRecord"
+ record_path: Path = ""
+ task_desc_path: Path = ""
+ screenshot_before_path: Path = ""
+ screenshot_after_path: Path = ""
+
+ async def run(self, task_dir: Path, docs_dir: Path):
+ doc_count = 0
+ self.record_path = Path(task_dir) / "record.txt"
+ self.task_desc_path = Path(task_dir) / "task_desc.txt"
+ self.screenshot_before_path = Path(task_dir) / "raw_screenshots"
+ self.screenshot_after_path = Path(task_dir) / "labeled_screenshots"
+ for path in [self.screenshot_before_path, self.screenshot_after_path]:
+ path.mkdir(parents=True, exist_ok=True)
+
+ task_desc = self.task_desc_path.read_text()
+ extra_config = config.extra
+
+ with open(self.record_path, "r") as record_file:
+ record_step_count = len(record_file.readlines()) - 1
+ record_file.seek(0)
+ for step in range(1, record_step_count + 1):
+ img_before_base64 = encode_image(self.screenshot_after_path.joinpath(f"{step}_labeled.png"))
+ img_after_base64 = encode_image(self.screenshot_after_path.joinpath(f"{step + 1}_labeled.png"))
+ rec = record_file.readline().strip()
+ action, resource_id = rec.split(":::")
+ action_type = action.split("(")[0]
+ # 构建Prompt
+ action_param = re.findall(r"\((.*?)\)", action)[0]
+ if action_type == ActionOp.TAP.value:
+ prompt_template = tap_doc_template
+ context = prompt_template.format(ui_element=action_param)
+ elif action_type == ActionOp.TEXT.value:
+ input_area, input_text = action_param.split(":sep:")
+ prompt_template = text_doc_template
+ context = prompt_template.format(ui_element=input_area)
+ elif action_type == ActionOp.LONG_PRESS.value:
+ prompt_template = long_press_doc_template
+ context = prompt_template.format(ui_element=action_param)
+ elif action_type == ActionOp.SWIPE.value:
+ swipe_area, swipe_dir = action_param.split(":sep:")
+ if swipe_dir == SwipeOp.UP.value or swipe_dir == SwipeOp.DOWN.value:
+ action_type = ActionOp.VERTICAL_SWIPE.value
+ elif swipe_dir == SwipeOp.LEFT.value or swipe_dir == SwipeOp.RIGHT.value:
+ action_type = ActionOp.HORIZONTAL_SWIPE.value
+ prompt_template = swipe_doc_template
+ context = prompt_template.format(swipe_dir=swipe_dir, ui_element=swipe_area)
+ else:
+ break
+ context = context.format(task_desc=task_desc)
+
+ doc_name = resource_id + ".txt"
+ doc_path = docs_dir.joinpath(doc_name)
+
+ if doc_path.exists():
+ try:
+ doc_content = ast.literal_eval(doc_path.read_text())
+ except Exception as exp:
+ logger.error(f"ast parse doc: {doc_path} failed, exp: {exp}")
+ continue
+
+ if doc_content[action_type]:
+ if extra_config.get("doc_refine", False):
+ refine_context = refine_doc_suffix.format(old_doc=doc_content[action_type])
+ context += refine_context
+ logger.info(
+ f"Documentation for the element {resource_id} already exists. The doc will be "
+ f"refined based on the latest demo."
+ )
+ else:
+ logger.info(
+ f"Documentation for the element {resource_id} already exists. Turn on DOC_REFINE "
+ f"in the config file if needed."
+ )
+ continue
+ else:
+ doc_content = {"tap": "", "text": "", "v_swipe": "", "h_swipe": "", "long_press": ""}
+
+ logger.info(f"Waiting for GPT-4V to generate documentation for the element {resource_id}")
+ node = await RECORD_PARSE_NODE.fill(
+ context=context, llm=self.llm, images=[img_before_base64, img_after_base64]
+ )
+ if "error" in node.content:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+ log_path = task_dir.joinpath("log_parse_record.txt")
+ prompt = node.compile(context=context, schema="json", mode="auto")
+ msg = node.content
+ doc_content[action_type] = msg
+
+ with open(log_path, "a") as logfile:
+ log_item = RecordLogItem(
+ step=step,
+ prompt=prompt,
+ image_before=img_before_base64,
+ image_after=img_after_base64,
+ response=node.content,
+ )
+ logfile.write(log_item.model_dump_json() + "\n")
+ with open(doc_path, "w") as outfile:
+ outfile.write(str(doc_content))
+ doc_count += 1
+ logger.info(f"Documentation generated and saved to {doc_path}")
+
+ logger.info(f"Documentation generation phase completed. {doc_count} docs generated.")
+
+ return AndroidActionOutput(action_state=RunState.FINISH)
diff --git a/metagpt/ext/android_assistant/actions/parse_record_an.py b/metagpt/ext/android_assistant/actions/parse_record_an.py
new file mode 100644
index 000000000..210c93e23
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/parse_record_an.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : the ActionNode to parse record
+
+from metagpt.actions.action_node import ActionNode
+
+OBSERVATION = ActionNode(
+ key="Observation",
+ expected_type=str,
+ instruction="Provide a description of your observations of the two images. "
+ "Subsequently, delineate the distinctions between the first image and the second one.",
+ example="",
+)
+
+THOUGHT = ActionNode(
+ key="Thought",
+ expected_type=str,
+ instruction="Consider the impact of Action acting on UI elements.",
+ example="",
+)
+
+DESCRIPTION = ActionNode(
+ key="Description",
+ expected_type=str,
+ instruction="Describe the functionality of the UI element concisely in one or two sentences Do not include "
+ "the numeric tag in your description",
+ example="",
+)
+
+NODES = [OBSERVATION, THOUGHT, DESCRIPTION]
+
+RECORD_PARSE_NODE = ActionNode.from_children("RecordParse", NODES)
diff --git a/metagpt/ext/android_assistant/actions/screenshot_parse.py b/metagpt/ext/android_assistant/actions/screenshot_parse.py
new file mode 100644
index 000000000..4d8bb0e1e
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/screenshot_parse.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : LIKE scripts/task_executor.py in stage=act
+
+import ast
+from pathlib import Path
+
+from metagpt.actions.action import Action
+from metagpt.config2 import config
+from metagpt.environment.android.android_env import AndroidEnv
+from metagpt.environment.android.const import ADB_EXEC_FAIL
+from metagpt.environment.android.env_space import (
+ EnvAction,
+ EnvActionType,
+ EnvObsParams,
+ EnvObsType,
+)
+from metagpt.ext.android_assistant.actions.screenshot_parse_an import (
+ SCREENSHOT_PARSE_NODE,
+)
+from metagpt.ext.android_assistant.prompts.assistant_prompt import (
+ screenshot_parse_template,
+ screenshot_parse_with_grid_template,
+)
+from metagpt.ext.android_assistant.utils.schema import (
+ AndroidActionOutput,
+ AndroidElement,
+ GridOpParam,
+ LongPressGridOpParam,
+ LongPressOpParam,
+ OpLogItem,
+ RunState,
+ SwipeGridOpParam,
+ SwipeOpParam,
+ TapGridOpParam,
+ TapOpParam,
+ TextOpParam,
+)
+from metagpt.ext.android_assistant.utils.utils import (
+ area_to_xy,
+ draw_bbox_multi,
+ draw_grid,
+ elem_bbox_to_xy,
+ screenshot_parse_extract,
+ traverse_xml_tree,
+)
+from metagpt.logs import logger
+from metagpt.utils.common import encode_image
+
+
+class ScreenshotParse(Action):
+ name: str = "ScreenshotParse"
+
+ def _makeup_ui_document(self, elem_list: list[AndroidElement], docs_idr: Path, use_exist_doc: bool = True) -> str:
+ if not use_exist_doc:
+ return ""
+
+ ui_doc = """
+You also have access to the following documentations that describes the functionalities of UI
+elements you can interact on the screen. These docs are crucial for you to determine the target of your
+next action. You should always prioritize these documented elements for interaction: """
+ for i, elem in enumerate(elem_list):
+ doc_path = docs_idr.joinpath(f"{elem.uid}.txt")
+ if not doc_path.exists():
+ continue
+ try:
+ doc_content = ast.literal_eval(doc_path.read_text())
+ except Exception as exp:
+ logger.error(f"ast parse doc: {doc_path} failed, exp: {exp}")
+ continue
+
+ ui_doc += f"Documentation of UI element labeled with the numeric tag '{i + 1}':\n"
+ if doc_content["tap"]:
+ ui_doc += f"This UI element is clickable. {doc_content['tap']}\n\n"
+ if doc_content["text"]:
+ ui_doc += (
+ f"This UI element can receive text input. The text input is used for the following "
+ f"purposes: {doc_content['text']}\n\n"
+ )
+ if doc_content["long_press"]:
+ ui_doc += f"This UI element is long clickable. {doc_content['long_press']}\n\n"
+ if doc_content["v_swipe"]:
+ ui_doc += (
+ f"This element can be swiped directly without tapping. You can swipe vertically on "
+ f"this UI element. {doc_content['v_swipe']}\n\n"
+ )
+ if doc_content["h_swipe"]:
+ ui_doc += (
+ f"This element can be swiped directly without tapping. You can swipe horizontally on "
+ f"this UI element. {doc_content['h_swipe']}\n\n"
+ )
+ return ui_doc
+
+ async def run(
+ self,
+ round_count: int,
+ task_desc: str,
+ last_act: str,
+ task_dir: Path,
+ docs_dir: Path,
+ grid_on: bool,
+ env: AndroidEnv,
+ ):
+ extra_config = config.extra
+ for path in [task_dir, docs_dir]:
+ path.mkdir(parents=True, exist_ok=True)
+ screenshot_path: Path = env.observe(
+ EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
+ )
+ xml_path: Path = env.observe(
+ EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
+ )
+ if not screenshot_path.exists() or not xml_path.exists():
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ clickable_list = []
+ focusable_list = []
+ traverse_xml_tree(xml_path, clickable_list, "clickable", True)
+ traverse_xml_tree(xml_path, focusable_list, "focusable", True)
+ elem_list: list[AndroidElement] = clickable_list.copy()
+ for elem in focusable_list:
+ bbox = elem.bbox
+ center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ close = False
+ for e in clickable_list:
+ bbox = e.bbox
+ center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
+ dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
+ if dist <= extra_config.get("min_dist", 30):
+ close = True
+ break
+ if not close:
+ elem_list.append(elem)
+
+ screenshot_labeled_path = task_dir.joinpath(f"{round_count}_labeled.png")
+ draw_bbox_multi(screenshot_path, screenshot_labeled_path, elem_list)
+ img_base64 = encode_image(screenshot_labeled_path)
+
+ parse_template = screenshot_parse_with_grid_template if grid_on else screenshot_parse_template
+
+ if grid_on:
+ env.rows, env.cols = draw_grid(screenshot_path, task_dir / f"{round_count}_grid.png")
+
+ ui_doc = self._makeup_ui_document(elem_list, docs_dir)
+ context = parse_template.format(ui_document=ui_doc, task_description=task_desc, last_act=last_act)
+ node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
+
+ if "error" in node.content:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ prompt = node.compile(context=context, schema="json", mode="auto")
+ OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_labeled_path), response=node.content)
+
+ op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on)
+ if op_param.param_state == RunState.FINISH:
+ logger.info(f"op_param: {op_param}")
+ return AndroidActionOutput(action_state=RunState.FINISH)
+ if op_param.param_state == RunState.FAIL:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ last_act = op_param.last_act
+ if isinstance(op_param, TapOpParam):
+ x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
+ action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
+ elif isinstance(op_param, TextOpParam):
+ action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
+ elif isinstance(op_param, LongPressOpParam):
+ x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
+ action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
+ elif isinstance(op_param, SwipeOpParam):
+ x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
+ action = EnvAction(
+ action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
+ )
+ elif isinstance(op_param, GridOpParam):
+ grid_on = True
+ elif isinstance(op_param, TapGridOpParam) or isinstance(op_param, LongPressGridOpParam):
+ x, y = area_to_xy(op_param.area, op_param.subarea, env.width, env.height, env.rows, env.cols)
+ if isinstance(op_param, TapGridOpParam):
+ action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
+ else:
+ # LongPressGridOpParam
+ action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
+ elif isinstance(op_param, SwipeGridOpParam):
+ start_x, start_y = area_to_xy(
+ op_param.start_area, op_param.start_subarea, env.width, env.height, env.rows, env.cols
+ )
+ end_x, end_y = area_to_xy(
+ op_param.end_area, op_param.end_subarea, env.width, env.height, env.rows, env.cols
+ )
+ action = EnvAction(
+ action_type=EnvActionType.USER_SWIPE_TO, coord=(start_x, start_y), tgt_coord=(end_x, end_y)
+ )
+
+ if not grid_on:
+ obs, _, _, _, info = env.step(action)
+ action_res = info["res"]
+ if action_res == ADB_EXEC_FAIL:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ if op_param.act_name != "grid":
+ grid_on = False
+
+ return AndroidActionOutput(data={"grid_on": grid_on, "last_act": last_act})
diff --git a/metagpt/ext/android_assistant/actions/screenshot_parse_an.py b/metagpt/ext/android_assistant/actions/screenshot_parse_an.py
new file mode 100644
index 000000000..eb23ba934
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/screenshot_parse_an.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : the ActionNode to parse screenshot
+
+from metagpt.actions.action_node import ActionNode
+
+OBSERVATION = ActionNode(
+ key="Observation", expected_type=str, instruction="Describe what you observe in the image", example=""
+)
+
+THOUGHT = ActionNode(
+ key="Thought",
+ expected_type=str,
+ instruction="To complete the given task, what is the next step I should do",
+ example="",
+)
+
+ACTION = ActionNode(
+ key="Action",
+ expected_type=str,
+ instruction="The function call with the correct parameters to proceed with the task. If you believe the task is "
+ "completed or there is nothing to be done, you should output FINISH. You cannot output anything else "
+ "except a function call or FINISH in this field.",
+ example="",
+)
+
+SUMMARY = ActionNode(
+ key="Summary",
+ expected_type=str,
+ instruction="Summarize your past actions along with your latest action in one or two sentences. Do not include "
+ "the numeric tag in your summary",
+ example="",
+)
+
+SUMMARY_GRID = ActionNode(
+ key="Summary",
+ expected_type=str,
+ instruction="Summarize your past actions along with your latest action in one or two sentences. Do not include "
+ "the grid area number in your summary",
+ example="",
+)
+
+NODES = [OBSERVATION, THOUGHT, ACTION, SUMMARY]
+
+NODES_GRID = [OBSERVATION, THOUGHT, ACTION, SUMMARY_GRID]
+
+SCREENSHOT_PARSE_NODE = ActionNode.from_children("ScreenshotParse", NODES)
+SCREENSHOT_PARSE_GRID_NODE = ActionNode.from_children("ScreenshotParseGrid", NODES_GRID)
diff --git a/metagpt/ext/android_assistant/actions/self_learn_and_reflect.py b/metagpt/ext/android_assistant/actions/self_learn_and_reflect.py
new file mode 100644
index 000000000..5e9cfbb45
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/self_learn_and_reflect.py
@@ -0,0 +1,231 @@
+# !/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : LIKE scripts/self_explorer.py in stage=learn & mode=auto self_explore_task stage
+
+import ast
+from pathlib import Path
+
+from metagpt.actions.action import Action
+from metagpt.config2 import config
+from metagpt.environment.android.android_env import AndroidEnv
+from metagpt.environment.android.const import ADB_EXEC_FAIL
+from metagpt.environment.android.env_space import (
+ EnvAction,
+ EnvActionType,
+ EnvObsParams,
+ EnvObsType,
+)
+from metagpt.ext.android_assistant.actions.screenshot_parse_an import (
+ SCREENSHOT_PARSE_NODE,
+)
+from metagpt.ext.android_assistant.actions.self_learn_reflect_an import (
+ SELF_LEARN_REFLECT_NODE,
+)
+from metagpt.ext.android_assistant.prompts.assistant_prompt import (
+ screenshot_parse_self_explore_reflect_template as reflect_template,
+)
+from metagpt.ext.android_assistant.prompts.assistant_prompt import (
+ screenshot_parse_self_explore_template,
+)
+from metagpt.ext.android_assistant.utils.schema import (
+ ActionOp,
+ AndroidActionOutput,
+ AndroidElement,
+ Decision,
+ DocContent,
+ LongPressOpParam,
+ OpLogItem,
+ ReflectLogItem,
+ RunState,
+ SwipeOp,
+ SwipeOpParam,
+ TapOpParam,
+ TextOpParam,
+)
+from metagpt.ext.android_assistant.utils.utils import (
+ draw_bbox_multi,
+ elem_bbox_to_xy,
+ elem_list_from_xml_tree,
+ reflect_parse_extarct,
+ screenshot_parse_extract,
+)
+from metagpt.logs import logger
+from metagpt.utils.common import encode_image
+
+
+class SelfLearnAndReflect(Action):
+ name: str = "SelfLearnAndReflect"
+
+ useless_list: list[str] = [] # store useless elements uid
+
+ screenshot_before_path: str = ""
+ screenshot_before_base64: str = ""
+ elem_list: list[AndroidElement] = []
+ swipe_orient: str = "up"
+ act_name: str = ""
+ ui_area: int = -1
+
+ async def run(
+ self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
+ ) -> AndroidActionOutput:
+ for path in [task_dir, docs_dir]:
+ path.mkdir(parents=True, exist_ok=True)
+ resp = await self.run_self_learn(round_count, task_desc, last_act, task_dir, env)
+ if resp.action_state != RunState.SUCCESS:
+ return resp
+
+ resp = await self.run_reflect(round_count, task_desc, last_act, task_dir, docs_dir, env)
+ return resp
+
+ async def run_self_learn(
+ self, round_count: int, task_desc: str, last_act: str, task_dir: Path, env: AndroidEnv
+ ) -> AndroidActionOutput:
+ extra_config = config.extra
+ screenshot_path: Path = env.observe(
+ EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_before", local_save_dir=task_dir)
+ )
+ xml_path: Path = env.observe(
+ EnvObsParams(obs_type=EnvObsType.GET_XML, xml_name=f"{round_count}", local_save_dir=task_dir)
+ )
+ if not screenshot_path.exists() or not xml_path.exists():
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ elem_list = elem_list_from_xml_tree(xml_path, self.useless_list, extra_config.get("min_dist", 30))
+
+ screenshot_before_labeled_path = task_dir.joinpath(f"{round_count}_before_labeled.png")
+ draw_bbox_multi(screenshot_path, screenshot_before_labeled_path, elem_list)
+ img_base64 = encode_image(screenshot_before_labeled_path)
+ self.screenshot_before_base64 = img_base64
+ self.screenshot_before_path = screenshot_before_labeled_path
+
+ self_explore_template = screenshot_parse_self_explore_template
+ context = self_explore_template.format(task_description=task_desc, last_act=last_act)
+
+ node = await SCREENSHOT_PARSE_NODE.fill(context=context, llm=self.llm, images=[img_base64])
+ logger.debug(f"fill result:{node}")
+ if "error" in node.content:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+ prompt = node.compile(context=context, schema="json", mode="auto")
+ # Modify WindowsPath to Str
+ OpLogItem(step=round_count, prompt=prompt, image=str(screenshot_before_labeled_path), response=node.content)
+ op_param = screenshot_parse_extract(node.instruct_content.model_dump(), grid_on=False)
+ # TODO Modify Op_param. When op_param.action is FINISH, how to solve this ?
+ if op_param.param_state == RunState.FINISH:
+ return AndroidActionOutput(action_state=RunState.FINISH)
+ if op_param.param_state == RunState.FAIL:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ if isinstance(op_param, TapOpParam):
+ self.ui_area = op_param.area
+ x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
+ action = EnvAction(action_type=EnvActionType.SYSTEM_TAP, coord=(x, y))
+ elif isinstance(op_param, TextOpParam):
+ action = EnvAction(action_type=EnvActionType.USER_INPUT, input_txt=op_param.input_str)
+ elif isinstance(op_param, LongPressOpParam):
+ self.ui_area = op_param.area
+ x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
+ action = EnvAction(action_type=EnvActionType.USER_LONGPRESS, coord=(x, y))
+ elif isinstance(op_param, SwipeOpParam):
+ self.ui_area = op_param.area
+ self.swipe_orient = op_param.swipe_orient
+ x, y = elem_bbox_to_xy(elem_list[op_param.area - 1].bbox)
+ action = EnvAction(
+ action_type=EnvActionType.USER_SWIPE, coord=(x, y), orient=op_param.swipe_orient, dist=op_param.dist
+ )
+
+ obs, _, _, _, info = env.step(action)
+ action_res = info["res"]
+ if action_res == ADB_EXEC_FAIL:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ self.elem_list = elem_list
+ self.act_name = op_param.act_name
+ return AndroidActionOutput()
+
+ async def run_reflect(
+ self, round_count: int, task_desc: str, last_act: str, task_dir: Path, docs_dir: Path, env: AndroidEnv
+ ) -> AndroidActionOutput:
+ screenshot_path: Path = env.observe(
+ EnvObsParams(obs_type=EnvObsType.GET_SCREENSHOT, ss_name=f"{round_count}_after", local_save_dir=task_dir)
+ )
+ if not screenshot_path.exists():
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ screenshot_after_labeled_path = task_dir.joinpath(f"{round_count}_after_labeled.png")
+ draw_bbox_multi(screenshot_path, screenshot_after_labeled_path, elem_list=self.elem_list)
+ img_base64 = encode_image(screenshot_after_labeled_path)
+ if self.act_name == ActionOp.TAP.value:
+ action = "tapping"
+ elif self.act_name == ActionOp.LONG_PRESS.value:
+ action = "long pressing"
+ elif self.act_name == ActionOp.SWIPE.value:
+ action = "swiping"
+ if self.swipe_orient == SwipeOp.UP.value or self.swipe_orient == SwipeOp.DOWN.value:
+ action = "v_swipe"
+ elif self.swipe_orient == SwipeOp.LEFT.value or self.swipe_orient == SwipeOp.RIGHT.value:
+ action = "h_swipe"
+ else:
+ # TODO Test for assignment, This error is eupiped with the next.
+ logger.warning(f"Current action name parse failed, it's `{self.act_name}`")
+ action = None
+ context = reflect_template.format(
+ action=action, ui_element=str(self.ui_area), task_desc=task_desc, last_act=last_act
+ )
+ node = await SELF_LEARN_REFLECT_NODE.fill(
+ context=context, llm=self.llm, images=[self.screenshot_before_base64, img_base64]
+ )
+
+ if "error" in node.content:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ prompt = node.compile(context=context, schema="json", mode="auto")
+ ReflectLogItem(
+ step=round_count,
+ prompt=prompt,
+ image_before=str(self.screenshot_before_path),
+ image_after=str(screenshot_after_labeled_path),
+ response=node.content,
+ )
+
+ op_param = reflect_parse_extarct(node.instruct_content.model_dump())
+ if op_param.param_state == RunState.FINISH:
+ return AndroidActionOutput(action_state=RunState.FINISH)
+ if op_param.param_state == RunState.FAIL:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ logger.info(
+ f"reflect_parse_extarct decision: {op_param.decision}, "
+ f"elem_list size: {len(self.elem_list)}, ui_area: {self.ui_area}"
+ )
+ # TODO here will cause `IndexError: list index out of range`.
+ # Maybe you should clink back to the desktop in the simulator
+ resource_id = self.elem_list[int(self.ui_area) - 1].uid
+ if op_param.decision == Decision.INEFFECTIVE.value:
+ self.useless_list.append(resource_id)
+ last_act = "NONE" # TODO global
+ elif op_param.decision in [Decision.BACK.value, Decision.CONTINUE.value, Decision.SUCCESS.value]:
+ if op_param.decision in [Decision.BACK.value, Decision.CONTINUE.value]:
+ self.useless_list.append(resource_id)
+ last_act = "NONE"
+ if op_param.decision == Decision.BACK.value:
+ action = EnvAction(action_type=EnvActionType.SYSTEM_BACK)
+ obs, _, _, _, info = env.step(action)
+ if info["res"] == ADB_EXEC_FAIL:
+ return AndroidActionOutput(action_state=RunState.FAIL)
+ doc = op_param.documentation
+ doc_path = docs_dir.joinpath(f"{resource_id}.txt")
+ if doc_path.exists():
+ try:
+ doc_content = ast.literal_eval(doc_path.read_text())
+ except Exception as exp:
+ logger.error(f"ast parse doc: {doc_path} failed, exp: {exp}")
+ return AndroidActionOutput(action_state=RunState.FAIL)
+
+ if doc_content[self.act_name]:
+ logger.info(f"Documentation for the element {resource_id} already exists.")
+ return AndroidActionOutput(action_state=RunState.FAIL)
+ else:
+ doc_content = DocContent()
+ setattr(doc_content, self.act_name, doc)
+ doc_path.write_text(str(doc_content))
+ return AndroidActionOutput(data={"last_act": last_act})
diff --git a/metagpt/ext/android_assistant/actions/self_learn_reflect_an.py b/metagpt/ext/android_assistant/actions/self_learn_reflect_an.py
new file mode 100644
index 000000000..305b7376a
--- /dev/null
+++ b/metagpt/ext/android_assistant/actions/self_learn_reflect_an.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : the ActionNode to parse Reflection
+
+from metagpt.actions.action_node import ActionNode
+
+DECISION = ActionNode(
+ key="Decision", expected_type=str, instruction="explain why you made this decision", example="BACK"
+)
+
+
+THOUGHT = ActionNode(key="Thought", expected_type=str, instruction="explain why you made this decision", example="")
+
+
+DOCUMENTATION = ActionNode(
+ key="Documentation", expected_type=str, instruction="describe the function of the UI element", example=""
+)
+
+
+NODES = [DECISION, THOUGHT, DOCUMENTATION]
+SELF_LEARN_REFLECT_NODE = ActionNode.from_children("SelfLearnReflect", NODES)
diff --git a/metagpt/ext/android_assistant/prompts/__init__.py b/metagpt/ext/android_assistant/prompts/__init__.py
new file mode 100644
index 000000000..2bcf8efd0
--- /dev/null
+++ b/metagpt/ext/android_assistant/prompts/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc :
diff --git a/metagpt/ext/android_assistant/prompts/assistant_prompt.py b/metagpt/ext/android_assistant/prompts/assistant_prompt.py
new file mode 100644
index 000000000..34baf5841
--- /dev/null
+++ b/metagpt/ext/android_assistant/prompts/assistant_prompt.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc : the prompt templates of assistant learning and acting
+
+screenshot_parse_template = """You are an agent that is trained to perform some basic tasks on a smartphone. You will be given a
+smartphone screenshot. The interactive UI elements on the screenshot are labeled with numeric tags starting from 1. The
+numeric tag of each interactive element is located in the center of the element.
+
+You can call the following functions to control the smartphone:
+
+1. tap(element: int)
+This function is used to tap an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be tap(5), which taps the UI element labeled with the number 5.
+
+2. text(text_input: str)
+This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
+be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
+"Hello, world!" into the input area on the smartphone screen. This function is usually callable when you see a keyboard
+showing in the lower half of the screen.
+
+3. long_press(element: int)
+This function is used to long press an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be long_press(5), which long presses the UI element labeled with the number 5.
+
+4. swipe(element: int, direction: str, dist: str)
+This function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen. "direction" is a string that
+represents one of the four directions: up, down, left, right. "direction" must be wrapped with double quotation
+marks. "dist" determines the distance of the swipe and can be one of the three options: short, medium, long. You should
+choose the appropriate distance option according to your need.
+A simple use case can be swipe(21, "up", "medium"), which swipes up the UI element labeled with the number 21 for a
+medium distance.
+
+5. grid()
+You should call this function when you find the element you want to interact with is not labeled with a numeric tag and
+other elements with numeric tags cannot help with the task. The function will bring up a grid overlay to divide the
+smartphone screen into small areas and this will give you more freedom to choose any part of the screen to tap, long
+press, or swipe.
+{ui_document}
+The task you need to complete is to: {task_description}. Your past actions to proceed with this task are summarized as
+follows: {last_act}
+Now, given the documentation and the following labeled screenshot, you need to think and call the function needed to
+proceed with the task. Your output should include three parts in the given format:
+
+You can only take one action at a time, so please directly call the function."""
+
+screenshot_parse_with_grid_template = """You are an agent that is trained to perform some basic tasks on a smartphone. You will be given
+a smartphone screenshot overlaid by a grid. The grid divides the screenshot into small square areas. Each area is
+labeled with an integer in the top-left corner.
+
+You can call the following functions to control the smartphone:
+
+1. tap(area: int, subarea: str)
+This function is used to tap a grid area shown on the smartphone screen. "area" is the integer label assigned to a grid
+area shown on the smartphone screen. "subarea" is a string representing the exact location to tap within the grid area.
+It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom, and
+bottom-right.
+A simple use case can be tap(5, "center"), which taps the exact center of the grid area labeled with the number 5.
+
+2. long_press(area: int, subarea: str)
+This function is used to long press a grid area shown on the smartphone screen. "area" is the integer label assigned to
+a grid area shown on the smartphone screen. "subarea" is a string representing the exact location to long press within
+the grid area. It can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left, bottom,
+and bottom-right.
+A simple use case can be long_press(7, "top-left"), which long presses the top left part of the grid area labeled with
+the number 7.
+
+3. swipe(start_area: int, start_subarea: str, end_area: int, end_subarea: str)
+This function is used to perform a swipe action on the smartphone screen, especially when you want to interact with a
+scroll view or a slide bar. "start_area" is the integer label assigned to the grid area which marks the starting
+location of the swipe. "start_subarea" is a string representing the exact location to begin the swipe within the grid
+area. "end_area" is the integer label assigned to the grid area which marks the ending location of the swipe.
+"end_subarea" is a string representing the exact location to end the swipe within the grid area.
+The two subarea parameters can take one of the nine values: center, top-left, top, top-right, left, right, bottom-left,
+bottom, and bottom-right.
+A simple use case can be swipe(21, "center", 25, "right"), which performs a swipe starting from the center of grid area
+21 to the right part of grid area 25.
+
+The task you need to complete is to: {task_description}. Your past actions to proceed with this task are summarized as
+follows: {last_act}
+Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
+Your output should include three parts in the given format:
+
+You can only take one action at a time, so please directly call the function."""
+
+screenshot_parse_self_explore_template = """You are an agent that is trained to complete certain tasks on a smartphone. You will be
+given a screenshot of a smartphone app. The interactive UI elements on the screenshot are labeled with numeric tags
+starting from 1.
+
+You can call the following functions to interact with those labeled elements to control the smartphone:
+
+1. tap(element: int)
+This function is used to tap an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be tap(5), which taps the UI element labeled with the number 5.
+
+2. text(text_input: str)
+This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
+be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
+"Hello, world!" into the input area on the smartphone screen. This function is only callable when you see a keyboard
+showing in the lower half of the screen.
+
+3. long_press(element: int)
+This function is used to long press an UI element shown on the smartphone screen.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen.
+A simple use case can be long_press(5), which long presses the UI element labeled with the number 5.
+
+4. swipe(element: int, direction: str, dist: str)
+This function is used to swipe an UI element shown on the smartphone screen, usually a scroll view or a slide bar.
+"element" is a numeric tag assigned to an UI element shown on the smartphone screen. "direction" is a string that
+represents one of the four directions: up, down, left, right. "direction" must be wrapped with double quotation
+marks. "dist" determines the distance of the swipe and can be one of the three options: short, medium, long. You should
+choose the appropriate distance option according to your need.
+A simple use case can be swipe(21, "up", "medium"), which swipes up the UI element labeled with the number 21 for a
+medium distance.
+
+The task you need to complete is to {task_description}. Your past actions to proceed with this task are summarized as
+follows: {last_act}
+Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
+Your output should include three parts in the given format:
+
+You can only take one action at a time, so please directly call the function."""
+
+screenshot_parse_self_explore_reflect_template = """I will give you screenshots of a mobile app before and after {action} the UI
+element labeled with the number '{ui_element}' on the first screenshot. The numeric tag of each element is located at
+the center of the element. The action of {action} this UI element was described as follows:
+{last_act}
+The action was also an attempt to proceed with a larger task, which is to {task_desc}. Your job is to carefully analyze
+the difference between the two screenshots to determine if the action is in accord with the description above and at
+the same time effectively moved the task forward. Your output should be determined based on the following situations:
+1. BACK
+If you think the action navigated you to a page where you cannot proceed with the given task, you should go back to the
+previous interface. At the same time, describe the functionality of the UI element concisely in one or two sentences by
+observing the difference between the two screenshots. Notice that your description of the UI element should focus on
+the general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as
+"the UI element" to refer to the element. Your output should be in the following format:
+Decision: BACK
+Thought: is the role whose schedule we are editing right now.
+ scratch = role.rc.scratch
+ #