Merge branch 'dev_tool_selection' of https://gitlab.deepwisdomai.com/agents/data_agents_opt into dev_tool_selection

Resolve conflicts
This commit is contained in:
stellahsr 2023-12-08 13:51:00 +08:00
commit b9d12f17ae
8 changed files with 249 additions and 136 deletions

View file

@ -1,21 +1,21 @@
import glob
import json
import re
from typing import List
import fire
import pandas as pd
import re
from metagpt.actions import Action
from metagpt.actions.execute_code import ExecutePyCode
from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
from metagpt.actions.write_code_steps import WriteCodeSteps
from metagpt.actions.write_plan import WritePlan
from metagpt.const import DATA_PATH
from metagpt.logs import logger
from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT
from metagpt.roles import Role
from metagpt.schema import Message, Plan
from metagpt.utils.common import CodeParser
from metagpt.actions.write_code_steps import WriteCodeSteps
from metagpt.actions.debug_code import DebugCode
STRUCTURAL_CONTEXT = """
@ -74,32 +74,16 @@ def read_data(file: str) -> pd.DataFrame:
return df
def get_samples(df: pd.DataFrame) -> str:
def get_column_info(df: pd.DataFrame) -> str:
data = []
if len(df) > 5:
df_ = df.sample(5, random_state=0)
else:
df_ = df
for i in list(df_):
for i in df.columns:
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
n_unique = df[i].nunique()
s = df_[i].tolist()
data.append([i, df[i].dtype, nan_freq, n_unique])
if str(df[i].dtype) == "float64":
s = [round(sample, 2) if not pd.isna(sample) else None for sample in s]
data.append([df_[i].name, df[i].dtype, nan_freq, n_unique, s])
samples = pd.DataFrame(
data,
columns=[
"Column_name",
"Data_type",
"NaN_Frequency(%)",
"N_unique",
"Samples",
],
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
)
return samples.to_string(index=False)
@ -128,20 +112,19 @@ class AskReview(Action):
class GenerateDataDesc(Action):
async def run(self, files: list) -> dict:
async def run(self, file: str) -> dict:
data_desc = {}
for file in files:
df = read_data(file)
file_name = file.split("/")[-1]
data_head = df.head().to_dict(orient="list")
data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
rsp = await self._aask(prompt)
rsp = CodeParser.parse_code(block=None, text=rsp)
data_desc[file_name] = {}
data_desc[file_name]["path"] = file
data_desc[file_name]["description"] = rsp
data_desc[file_name]["column_info"] = get_samples(df)
df = read_data(file)
data_head = df.head().to_dict(orient="list")
data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
rsp = await self._aask(prompt)
rsp = CodeParser.parse_code(block=None, text=rsp)
rsp = json.loads(rsp)
data_desc["path"] = file
data_desc["data_desc"] = rsp["data_desc"]
data_desc["column_desc"] = rsp["column_desc"]
data_desc["column_info"] = get_column_info(df)
return data_desc
@ -184,6 +167,8 @@ class MLEngineer(Role):
self.plan.finish_current_task()
self.working_memory.clear()
if "print(df_processed.info())" in code:
self.data_desc["column_info"] = result
else:
# update plan according to user's feedback and to take on changed tasks
await self._update_plan()
@ -198,8 +183,7 @@ class MLEngineer(Role):
print(truncate(result))
async def _generate_data_desc(self):
files = glob.glob(self.data_path + "/*.csv")
data_desc = await GenerateDataDesc().run(files=files)
data_desc = await GenerateDataDesc().run(self.data_path)
return data_desc
async def _write_and_exec_code(self, max_retry: int = 3):
@ -221,14 +205,10 @@ class MLEngineer(Role):
if counter == 0:
context = self.get_useful_memories()
else:
# improve_code = await DebugCode().run(plan=self.plan,
# code= code_context + "\n\n" + code,
# runtime_result=self.working_memory.get())
improve_code = ""
improve_code = await DebugCode().run(plan=self.plan,
code= code_context + "\n\n" + code,
runtime_result=self.working_memory.get())
# breakpoint()
column_names_dict = {key: value["column_info"] for key, value in self.data_desc.items()}
if not self.use_tools or self.plan.current_task.task_type == "other":
logger.info("Write code with pure generation")
@ -246,7 +226,7 @@ class MLEngineer(Role):
cause_by = DebugCode
else:
code = await WriteCodeWithTools().run(
context=context, plan=self.plan, code_steps=code_steps, **{"column_names": column_names_dict}
context=context, plan=self.plan, code_steps=code_steps, **{"column_names": {}}
)
cause_by = WriteCodeWithTools
@ -260,7 +240,7 @@ class MLEngineer(Role):
result, success = await self.execute_code.run(runcode)
# truncated the result
print(truncate(result))
# print(result)
self.working_memory.add(
Message(content=truncate(remove_escape_and_color_codes(result)), role="user", cause_by=ExecutePyCode)
)
@ -330,9 +310,8 @@ if __name__ == "__main__":
# requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
# requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"
from metagpt.const import DATA_PATH
# requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy."
data_path = f"{DATA_PATH}/titanic"
requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."