Merge branch 'dev_tool_selection' of https://gitlab.deepwisdomai.com/agents/data_agents_opt into dev_tool_selection

Resolve conflicts
2026-05-12 01:02:37 +02:00 · 2023-12-08 13:51:00 +08:00 · 2023-12-08 13:51:00 +08:00 · b9d12f17ae
commit b9d12f17ae
parent 13e2b05812 fe2b79fedc
8 changed files with 249 additions and 136 deletions
--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@ -1,21 +1,21 @@
-import glob
 import json
+import re
 from typing import List

 import fire
 import pandas as pd
-import re

 from metagpt.actions import Action
 from metagpt.actions.execute_code import ExecutePyCode
 from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
+from metagpt.actions.write_code_steps import WriteCodeSteps
 from metagpt.actions.write_plan import WritePlan
+from metagpt.const import DATA_PATH
 from metagpt.logs import logger
 from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT
 from metagpt.roles import Role
 from metagpt.schema import Message, Plan
 from metagpt.utils.common import CodeParser
-from metagpt.actions.write_code_steps import WriteCodeSteps
 from metagpt.actions.debug_code import DebugCode

 STRUCTURAL_CONTEXT = """
@ -74,32 +74,16 @@ def read_data(file: str) -> pd.DataFrame:
    return df


-def get_samples(df: pd.DataFrame) -> str:
+def get_column_info(df: pd.DataFrame) -> str:
    data = []
-
-    if len(df) > 5:
-        df_ = df.sample(5, random_state=0)
-    else:
-        df_ = df
-
-    for i in list(df_):
+    for i in df.columns:
        nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
        n_unique = df[i].nunique()
-        s = df_[i].tolist()
+        data.append([i, df[i].dtype, nan_freq, n_unique])

-        if str(df[i].dtype) == "float64":
-            s = [round(sample, 2) if not pd.isna(sample) else None for sample in s]
-
-        data.append([df_[i].name, df[i].dtype, nan_freq, n_unique, s])
    samples = pd.DataFrame(
        data,
-        columns=[
-            "Column_name",
-            "Data_type",
-            "NaN_Frequency(%)",
-            "N_unique",
-            "Samples",
-        ],
+        columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
    )
    return samples.to_string(index=False)

@ -128,20 +112,19 @@ class AskReview(Action):


 class GenerateDataDesc(Action):
-    async def run(self, files: list) -> dict:
+    async def run(self, file: str) -> dict:
        data_desc = {}
-        for file in files:
-            df = read_data(file)
-            file_name = file.split("/")[-1]
-            data_head = df.head().to_dict(orient="list")
-            data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
-            prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
-            rsp = await self._aask(prompt)
-            rsp = CodeParser.parse_code(block=None, text=rsp)
-            data_desc[file_name] = {}
-            data_desc[file_name]["path"] = file
-            data_desc[file_name]["description"] = rsp
-            data_desc[file_name]["column_info"] = get_samples(df)
+        df = read_data(file)
+        data_head = df.head().to_dict(orient="list")
+        data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
+        prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
+        rsp = await self._aask(prompt)
+        rsp = CodeParser.parse_code(block=None, text=rsp)
+        rsp = json.loads(rsp)
+        data_desc["path"] = file
+        data_desc["data_desc"] = rsp["data_desc"]
+        data_desc["column_desc"] = rsp["column_desc"]
+        data_desc["column_info"] = get_column_info(df)
        return data_desc


@ -184,6 +167,8 @@ class MLEngineer(Role):
                self.plan.finish_current_task()
                self.working_memory.clear()

+                if "print(df_processed.info())" in code:
+                    self.data_desc["column_info"] = result
            else:
                # update plan according to user's feedback and to take on changed tasks
                await self._update_plan()
@ -198,8 +183,7 @@ class MLEngineer(Role):
            print(truncate(result))

    async def _generate_data_desc(self):
-        files = glob.glob(self.data_path + "/*.csv")
-        data_desc = await GenerateDataDesc().run(files=files)
+        data_desc = await GenerateDataDesc().run(self.data_path)
        return data_desc

    async def _write_and_exec_code(self, max_retry: int = 3):
@ -221,14 +205,10 @@ class MLEngineer(Role):
            if counter == 0:
                context = self.get_useful_memories()
            else:
-                # improve_code = await DebugCode().run(plan=self.plan,
-                #                                      code= code_context + "\n\n" + code,
-                #                                      runtime_result=self.working_memory.get())
-                improve_code = ""
+                improve_code = await DebugCode().run(plan=self.plan,
+                                                     code= code_context + "\n\n" + code,
+                                                     runtime_result=self.working_memory.get())

-            # breakpoint()
-
-            column_names_dict = {key: value["column_info"] for key, value in self.data_desc.items()}

            if not self.use_tools or self.plan.current_task.task_type == "other":
                logger.info("Write code with pure generation")
@ -246,7 +226,7 @@ class MLEngineer(Role):
                    cause_by = DebugCode
                else:
                    code = await WriteCodeWithTools().run(
-                        context=context, plan=self.plan, code_steps=code_steps, **{"column_names": column_names_dict}
+                        context=context, plan=self.plan, code_steps=code_steps, **{"column_names": {}}
                    )

                    cause_by = WriteCodeWithTools
@ -260,7 +240,7 @@ class MLEngineer(Role):
            result, success = await self.execute_code.run(runcode)
            # truncated the result
            print(truncate(result))
-            # print(result)
+            
            self.working_memory.add(
                Message(content=truncate(remove_escape_and_color_codes(result)), role="user", cause_by=ExecutePyCode)
            )
@ -330,9 +310,8 @@ if __name__ == "__main__":
    # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
    # requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"

-    from metagpt.const import DATA_PATH
-
    # requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy."
+
    data_path = f"{DATA_PATH}/titanic"
    requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."