mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-12 01:02:37 +02:00
Merge branch 'dev_tool_selection' of https://gitlab.deepwisdomai.com/agents/data_agents_opt into dev_tool_selection
Resolve conflicts
This commit is contained in:
commit
b9d12f17ae
8 changed files with 249 additions and 136 deletions
|
|
@ -1,21 +1,21 @@
|
|||
import glob
|
||||
import json
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
import fire
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
from metagpt.actions import Action
|
||||
from metagpt.actions.execute_code import ExecutePyCode
|
||||
from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
|
||||
from metagpt.actions.write_code_steps import WriteCodeSteps
|
||||
from metagpt.actions.write_plan import WritePlan
|
||||
from metagpt.const import DATA_PATH
|
||||
from metagpt.logs import logger
|
||||
from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT
|
||||
from metagpt.roles import Role
|
||||
from metagpt.schema import Message, Plan
|
||||
from metagpt.utils.common import CodeParser
|
||||
from metagpt.actions.write_code_steps import WriteCodeSteps
|
||||
from metagpt.actions.debug_code import DebugCode
|
||||
|
||||
STRUCTURAL_CONTEXT = """
|
||||
|
|
@ -74,32 +74,16 @@ def read_data(file: str) -> pd.DataFrame:
|
|||
return df
|
||||
|
||||
|
||||
def get_samples(df: pd.DataFrame) -> str:
|
||||
def get_column_info(df: pd.DataFrame) -> str:
|
||||
data = []
|
||||
|
||||
if len(df) > 5:
|
||||
df_ = df.sample(5, random_state=0)
|
||||
else:
|
||||
df_ = df
|
||||
|
||||
for i in list(df_):
|
||||
for i in df.columns:
|
||||
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
|
||||
n_unique = df[i].nunique()
|
||||
s = df_[i].tolist()
|
||||
data.append([i, df[i].dtype, nan_freq, n_unique])
|
||||
|
||||
if str(df[i].dtype) == "float64":
|
||||
s = [round(sample, 2) if not pd.isna(sample) else None for sample in s]
|
||||
|
||||
data.append([df_[i].name, df[i].dtype, nan_freq, n_unique, s])
|
||||
samples = pd.DataFrame(
|
||||
data,
|
||||
columns=[
|
||||
"Column_name",
|
||||
"Data_type",
|
||||
"NaN_Frequency(%)",
|
||||
"N_unique",
|
||||
"Samples",
|
||||
],
|
||||
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
|
||||
)
|
||||
return samples.to_string(index=False)
|
||||
|
||||
|
|
@ -128,20 +112,19 @@ class AskReview(Action):
|
|||
|
||||
|
||||
class GenerateDataDesc(Action):
|
||||
async def run(self, files: list) -> dict:
|
||||
async def run(self, file: str) -> dict:
|
||||
data_desc = {}
|
||||
for file in files:
|
||||
df = read_data(file)
|
||||
file_name = file.split("/")[-1]
|
||||
data_head = df.head().to_dict(orient="list")
|
||||
data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
|
||||
prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
|
||||
rsp = await self._aask(prompt)
|
||||
rsp = CodeParser.parse_code(block=None, text=rsp)
|
||||
data_desc[file_name] = {}
|
||||
data_desc[file_name]["path"] = file
|
||||
data_desc[file_name]["description"] = rsp
|
||||
data_desc[file_name]["column_info"] = get_samples(df)
|
||||
df = read_data(file)
|
||||
data_head = df.head().to_dict(orient="list")
|
||||
data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
|
||||
prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
|
||||
rsp = await self._aask(prompt)
|
||||
rsp = CodeParser.parse_code(block=None, text=rsp)
|
||||
rsp = json.loads(rsp)
|
||||
data_desc["path"] = file
|
||||
data_desc["data_desc"] = rsp["data_desc"]
|
||||
data_desc["column_desc"] = rsp["column_desc"]
|
||||
data_desc["column_info"] = get_column_info(df)
|
||||
return data_desc
|
||||
|
||||
|
||||
|
|
@ -184,6 +167,8 @@ class MLEngineer(Role):
|
|||
self.plan.finish_current_task()
|
||||
self.working_memory.clear()
|
||||
|
||||
if "print(df_processed.info())" in code:
|
||||
self.data_desc["column_info"] = result
|
||||
else:
|
||||
# update plan according to user's feedback and to take on changed tasks
|
||||
await self._update_plan()
|
||||
|
|
@ -198,8 +183,7 @@ class MLEngineer(Role):
|
|||
print(truncate(result))
|
||||
|
||||
async def _generate_data_desc(self):
|
||||
files = glob.glob(self.data_path + "/*.csv")
|
||||
data_desc = await GenerateDataDesc().run(files=files)
|
||||
data_desc = await GenerateDataDesc().run(self.data_path)
|
||||
return data_desc
|
||||
|
||||
async def _write_and_exec_code(self, max_retry: int = 3):
|
||||
|
|
@ -221,14 +205,10 @@ class MLEngineer(Role):
|
|||
if counter == 0:
|
||||
context = self.get_useful_memories()
|
||||
else:
|
||||
# improve_code = await DebugCode().run(plan=self.plan,
|
||||
# code= code_context + "\n\n" + code,
|
||||
# runtime_result=self.working_memory.get())
|
||||
improve_code = ""
|
||||
improve_code = await DebugCode().run(plan=self.plan,
|
||||
code= code_context + "\n\n" + code,
|
||||
runtime_result=self.working_memory.get())
|
||||
|
||||
# breakpoint()
|
||||
|
||||
column_names_dict = {key: value["column_info"] for key, value in self.data_desc.items()}
|
||||
|
||||
if not self.use_tools or self.plan.current_task.task_type == "other":
|
||||
logger.info("Write code with pure generation")
|
||||
|
|
@ -246,7 +226,7 @@ class MLEngineer(Role):
|
|||
cause_by = DebugCode
|
||||
else:
|
||||
code = await WriteCodeWithTools().run(
|
||||
context=context, plan=self.plan, code_steps=code_steps, **{"column_names": column_names_dict}
|
||||
context=context, plan=self.plan, code_steps=code_steps, **{"column_names": {}}
|
||||
)
|
||||
|
||||
cause_by = WriteCodeWithTools
|
||||
|
|
@ -260,7 +240,7 @@ class MLEngineer(Role):
|
|||
result, success = await self.execute_code.run(runcode)
|
||||
# truncated the result
|
||||
print(truncate(result))
|
||||
# print(result)
|
||||
|
||||
self.working_memory.add(
|
||||
Message(content=truncate(remove_escape_and_color_codes(result)), role="user", cause_by=ExecutePyCode)
|
||||
)
|
||||
|
|
@ -330,9 +310,8 @@ if __name__ == "__main__":
|
|||
# requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
|
||||
# requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"
|
||||
|
||||
from metagpt.const import DATA_PATH
|
||||
|
||||
# requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy."
|
||||
|
||||
data_path = f"{DATA_PATH}/titanic"
|
||||
requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue