From c2dba151fbe139291d8fd185aea87e15a04a093a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E6=A3=92=E6=A3=92?= Date: Thu, 30 Nov 2023 17:42:55 +0800 Subject: [PATCH] add unit test : write_code_reuse_code_long_for_wine. --- .../actions/test_write_analysis_code.py | 274 +++++++++++------- 1 file changed, 173 insertions(+), 101 deletions(-) diff --git a/tests/metagpt/actions/test_write_analysis_code.py b/tests/metagpt/actions/test_write_analysis_code.py index d4bccb552..1a727a9e4 100644 --- a/tests/metagpt/actions/test_write_analysis_code.py +++ b/tests/metagpt/actions/test_write_analysis_code.py @@ -6,110 +6,110 @@ from metagpt.actions.execute_code import ExecutePyCode from metagpt.schema import Message from metagpt.logs import logger -@pytest.mark.asyncio -async def test_write_code_by_list_plan(): - write_code = WriteCodeByGenerate() - execute_code = ExecutePyCode() - messages = [] - plan = ["随机生成一个pandas DataFrame时间序列", "绘制这个时间序列的直方图", "求均值"] - for task in plan: - print(f"\n任务: {task}\n\n") - messages.append(Message(task, role='assistant')) - code = await write_code.run(messages) - messages.append(Message(code, role='assistant')) - assert len(code) > 0 - output = await execute_code.run(code) - print(f"\n[Output]: 任务{task}的执行结果是: \n{output}\n") - messages.append(output[0]) +# @pytest.mark.asyncio +# async def test_write_code_by_list_plan(): +# write_code = WriteCodeByGenerate() +# execute_code = ExecutePyCode() +# messages = [] +# plan = ["随机生成一个pandas DataFrame时间序列", "绘制这个时间序列的直方图", "求均值"] +# for task in plan: +# print(f"\n任务: {task}\n\n") +# messages.append(Message(task, role='assistant')) +# code = await write_code.run(messages) +# messages.append(Message(code, role='assistant')) +# assert len(code) > 0 +# output = await execute_code.run(code) +# print(f"\n[Output]: 任务{task}的执行结果是: \n{output}\n") +# messages.append(output[0]) -@pytest.mark.asyncio -async def test_write_code_to_correct_error(): +# @pytest.mark.asyncio +# async def test_write_code_to_correct_error(): - structural_context = """ - ## User Requirement - read a dataset test.csv and print its head - ## Current Plan - [ - { - "task_id": "1", - "dependent_task_ids": [], - "instruction": "import pandas and load the dataset from 'test.csv'.", - "task_type": "", - "code": "", - "result": "", - "is_finished": false - }, - { - "task_id": "2", - "dependent_task_ids": [ - "1" - ], - "instruction": "Print the head of the dataset to display the first few rows.", - "task_type": "", - "code": "", - "result": "", - "is_finished": false - } - ] - ## Current Task - {"task_id": "1", "dependent_task_ids": [], "instruction": "import pandas and load the dataset from 'test.csv'.", "task_type": "", "code": "", "result": "", "is_finished": false} - """ - wrong_code = """import pandas as pd\ndata = pd.read_excel('test.csv')\ndata""" # use read_excel to read a csv - error = """ - Traceback (most recent call last): - File "", line 2, in - File "/Users/gary/miniconda3/envs/py39_scratch/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 478, in read_excel - io = ExcelFile(io, storage_options=storage_options, engine=engine) - File "/Users/gary/miniconda3/envs/py39_scratch/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 1500, in __init__ - raise ValueError( - ValueError: Excel file format cannot be determined, you must specify an engine manually. - """ - context = [ - Message(content=structural_context, role="user"), - Message(content=wrong_code, role="assistant"), - Message(content=error, role="user"), - ] - new_code = await WriteCodeByGenerate().run(context=context) - print(new_code) - assert "read_csv" in new_code # should correct read_excel to read_csv +# structural_context = """ +# ## User Requirement +# read a dataset test.csv and print its head +# ## Current Plan +# [ +# { +# "task_id": "1", +# "dependent_task_ids": [], +# "instruction": "import pandas and load the dataset from 'test.csv'.", +# "task_type": "", +# "code": "", +# "result": "", +# "is_finished": false +# }, +# { +# "task_id": "2", +# "dependent_task_ids": [ +# "1" +# ], +# "instruction": "Print the head of the dataset to display the first few rows.", +# "task_type": "", +# "code": "", +# "result": "", +# "is_finished": false +# } +# ] +# ## Current Task +# {"task_id": "1", "dependent_task_ids": [], "instruction": "import pandas and load the dataset from 'test.csv'.", "task_type": "", "code": "", "result": "", "is_finished": false} +# """ +# wrong_code = """import pandas as pd\ndata = pd.read_excel('test.csv')\ndata""" # use read_excel to read a csv +# error = """ +# Traceback (most recent call last): +# File "", line 2, in +# File "/Users/gary/miniconda3/envs/py39_scratch/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 478, in read_excel +# io = ExcelFile(io, storage_options=storage_options, engine=engine) +# File "/Users/gary/miniconda3/envs/py39_scratch/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 1500, in __init__ +# raise ValueError( +# ValueError: Excel file format cannot be determined, you must specify an engine manually. +# """ +# context = [ +# Message(content=structural_context, role="user"), +# Message(content=wrong_code, role="assistant"), +# Message(content=error, role="user"), +# ] +# new_code = await WriteCodeByGenerate().run(context=context) +# print(new_code) +# assert "read_csv" in new_code # should correct read_excel to read_csv -@pytest.mark.asyncio -async def test_write_code_reuse_code_simple(): - structural_context = """ - ## User Requirement - read a dataset test.csv and print its head - ## Current Plan - [ - { - "task_id": "1", - "dependent_task_ids": [], - "instruction": "import pandas and load the dataset from 'test.csv'.", - "task_type": "", - "code": "import pandas as pd\ndata = pd.read_csv('test.csv')", - "result": "", - "is_finished": true - }, - { - "task_id": "2", - "dependent_task_ids": [ - "1" - ], - "instruction": "Print the head of the dataset to display the first few rows.", - "task_type": "", - "code": "", - "result": "", - "is_finished": false - } - ] - ## Current Task - {"task_id": "2", "dependent_task_ids": ["1"], "instruction": "Print the head of the dataset to display the first few rows.", "task_type": "", "code": "", "result": "", "is_finished": false} - """ - context = [ - Message(content=structural_context, role="user"), - ] - code = await WriteCodeByGenerate().run(context=context) - print(code) - assert "pandas" not in code and "read_csv" not in code # should reuse import and read statement from previous one +# @pytest.mark.asyncio +# async def test_write_code_reuse_code_simple(): +# structural_context = """ +# ## User Requirement +# read a dataset test.csv and print its head +# ## Current Plan +# [ +# { +# "task_id": "1", +# "dependent_task_ids": [], +# "instruction": "import pandas and load the dataset from 'test.csv'.", +# "task_type": "", +# "code": "import pandas as pd\ndata = pd.read_csv('test.csv')", +# "result": "", +# "is_finished": true +# }, +# { +# "task_id": "2", +# "dependent_task_ids": [ +# "1" +# ], +# "instruction": "Print the head of the dataset to display the first few rows.", +# "task_type": "", +# "code": "", +# "result": "", +# "is_finished": false +# } +# ] +# ## Current Task +# {"task_id": "2", "dependent_task_ids": ["1"], "instruction": "Print the head of the dataset to display the first few rows.", "task_type": "", "code": "", "result": "", "is_finished": false} +# """ +# context = [ +# Message(content=structural_context, role="user"), +# ] +# code = await WriteCodeByGenerate().run(context=context) +# print(code) +# assert "pandas" not in code and "read_csv" not in code # should reuse import and read statement from previous one @pytest.mark.asyncio async def test_write_code_reuse_code_long(): @@ -167,3 +167,75 @@ async def test_write_code_reuse_code_long(): success_rate = sum(success) / trials_num logger.info(f"success rate: {success_rate :.2f}") assert success_rate >= 0.8 + + +@pytest.mark.asyncio +async def test_write_code_reuse_code_long_for_wine(): + """test code reuse for long context""" + + structural_context = """ + ## User Requirement + Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy + ## Current Plan + [ + { + "task_id": "1", + "dependent_task_ids": [], + "instruction": "Load the sklearn Wine recognition dataset and perform exploratory data analysis." + "task_type": "", + "code": "from sklearn.datasets import load_wine\n# Load the Wine recognition dataset\nwine_data = load_wine()\n# Perform exploratory data analysis\nwine_data.keys()", + "result": "Truncated to show only the last 1000 characters\ndict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])", + "is_finished": true + }, + { + "task_id": "2", + "dependent_task_ids": ["1"], + "instruction": "Create a plot to visualize some aspect of the wine dataset." + "task_type": "", + "code": "", + "result": "", + "is_finished": false + }, + { + "task_id": "3", + "dependent_task_ids": ["1"], + "instruction": "Split the dataset into training and validation sets with a 20% validation size.", + "task_type": "", + "code": "", + "result": "", + "is_finished": false + }, + { + "task_id": "4", + "dependent_task_ids": ["3"], + "instruction": "Train a model on the training set to predict wine class.", + "task_type": "", + "code": "", + "result": "", + "is_finished": false + }, + { + "task_id": "5", + "dependent_task_ids": ["4"], + "instruction": "Evaluate the model on the validation set and report the accuracy.", + "task_type": "", + "code": "", + "result": "", + "is_finished": false + } + ] + ## Current Task + {"task_id": "2", "dependent_task_ids": ["1"], "instruction": "Create a plot to visualize some aspect of the Wine dataset.", "task_type": "", "code": "", "result": "", "is_finished": false} + """ + context = [ + Message(content=structural_context, role="user"), + ] + trials_num = 5 + trials = [WriteCodeByGenerate().run(context=context, temperature=0.0) for _ in range(trials_num)] + trial_results = await asyncio.gather(*trials) + print(*trial_results, sep="\n\n***\n\n") + success = ["load_wine" not in result\ + for result in trial_results] # should reuse iris_data from previous tasks + success_rate = sum(success) / trials_num + logger.info(f"success rate: {success_rate :.2f}") + assert success_rate >= 0.8