From 4304dd28cae93e3a2c597bf139bcd2d7783b3dad Mon Sep 17 00:00:00 2001 From: wubinhao <15754305168@163.com> Date: Tue, 5 Dec 2023 17:57:56 +0800 Subject: [PATCH 1/5] update write task guide, add code plan --- metagpt/actions/write_task_guide.py | 82 +++++++++++++++++++++++++++++ metagpt/roles/ml_engineer.py | 21 ++++---- metagpt/schema.py | 1 + 3 files changed, 92 insertions(+), 12 deletions(-) create mode 100644 metagpt/actions/write_task_guide.py diff --git a/metagpt/actions/write_task_guide.py b/metagpt/actions/write_task_guide.py new file mode 100644 index 000000000..eff53feef --- /dev/null +++ b/metagpt/actions/write_task_guide.py @@ -0,0 +1,82 @@ + +import json +from typing import Dict, List, Union + +from metagpt.actions import Action +from metagpt.schema import Message, Task, Plan + + +TASK_GUIDE_PROMPT_TEMPLATE = """ +# Context +{context} + +## Format example +1. +2. +3. +... + +----- +Tasks are all code development tasks. +You are a professional engineer, the main goal is to plan out concise solution steps for Current Task before coding. +A planning process can reduce the difficulty and improve the quality of coding. +You may be given some code plans for the tasks ahead, but you don't have to follow the existing plan when planning the current task. +The output plan should following the subsequent principles: +1.The plan is a rough checklist of steps outlining the entire program's structure.Try to keep the number of steps fewer than 5. +2.The steps should be written concisely and at a high level, avoiding overly detailed implementation specifics. +3.The execution of the plan happens sequentially, but the plan can incorporate conditional (if) and looping(loop) keywords for more complex structures. +4.Output carefully referenced "Format example" in format. +""" + +STRUCTURAL_CONTEXT = """ +## User Requirement +{user_requirement} +## Current Plan +{tasks} +## Current Task +{current_task} +""" + + +class WriteTaskGuide(Action): + + async def run(self, plan: Plan) -> str: + """Run of a task guide writing action, used in ml engineer + + Args: + plan (plan): task plan + useful_memories (list): useful_memories + Returns: + str: The dataset_descriptions string. + """ + + context = self.get_context(plan) + task_guide_prompt = TASK_GUIDE_PROMPT_TEMPLATE.format( + context=context, + ) + task_guide = await self._aask(task_guide_prompt) + return task_guide + + def get_context(self, plan: Plan): + user_requirement = plan.goal + task_rename_map = { + 'task_id': 'task_id', + 'instruction': 'instruction', + 'is_finished': 'is_finished', + # 'task_guide': 'code_plan' + } + + def process_task(task): + task_dict = task.dict() + ptask = {task_rename_map[k]: task_dict[k] for k in task_dict if k in task_rename_map} + return ptask + tasks = json.dumps( + [process_task(task) for task in plan.tasks], indent=4, ensure_ascii=False + ) + current_task = json.dumps(process_task(plan.current_task)) if plan.current_task else {} + context = STRUCTURAL_CONTEXT.format( + user_requirement=user_requirement, tasks=tasks, current_task=current_task + ) + # print(context) + return context + diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index 65583638e..d905b7bfd 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -12,6 +12,7 @@ from metagpt.logs import logger from metagpt.actions.write_plan import WritePlan from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools from metagpt.actions.execute_code import ExecutePyCode +from metagpt.actions.write_task_guide import WriteTaskGuide STRUCTURAL_CONTEXT = """ ## User Requirement @@ -66,11 +67,6 @@ class AskReview(Action): return rsp, confirmed -class WriteTaskGuide(Action): - async def run(self, task_instruction: str, data_desc: str = "") -> str: - return "" - - class MLEngineer(Role): def __init__( self, name="ABC", profile="MLEngineer", goal="", auto_run: bool = False @@ -79,7 +75,7 @@ class MLEngineer(Role): self._set_react_mode(react_mode="plan_and_act") self.plan = Plan(goal=goal) self.use_tools = False - self.use_task_guide = False + self.use_task_guide = True self.execute_code = ExecutePyCode() self.auto_run = auto_run @@ -92,7 +88,7 @@ class MLEngineer(Role): logger.info(f"ready to take on task {task}") # take on current task - code, result, success = await self._write_and_exec_code() + code, result, success, task_guide = await self._write_and_exec_code() # ask for acceptance, users can other refuse and change tasks in the plan task_result_confirmed = await self._ask_review() @@ -101,6 +97,7 @@ class MLEngineer(Role): # tick off this task and record progress task.code = code task.result = result + task.task_guide = task_guide self.plan.finish_current_task() self.working_memory.clear() @@ -110,7 +107,7 @@ class MLEngineer(Role): async def _write_and_exec_code(self, max_retry: int = 3): task_guide = ( - await WriteTaskGuide().run(self.plan.current_task.instruction) + await WriteTaskGuide().run(self.plan) if self.use_task_guide else "" ) @@ -156,7 +153,7 @@ class MLEngineer(Role): counter += 1 - return code, result, success + return code, result, success, task_guide async def _ask_review(self): if not self.auto_run: @@ -185,7 +182,7 @@ class MLEngineer(Role): def get_useful_memories(self) -> List[Message]: """find useful memories only to reduce context length and improve performance""" - + # TODO dataset description , code steps user_requirement = self.plan.goal tasks = json.dumps( [task.dict() for task in self.plan.tasks], indent=4, ensure_ascii=False @@ -204,9 +201,9 @@ class MLEngineer(Role): if __name__ == "__main__": - requirement = "Run data analysis on sklearn Iris dataset, include a plot" + # requirement = "Run data analysis on sklearn Iris dataset, include a plot" # requirement = "Run data analysis on sklearn Diabetes dataset, include a plot" - # requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy" + requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy" # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy" # requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv" diff --git a/metagpt/schema.py b/metagpt/schema.py index e39f54a0c..db6861280 100644 --- a/metagpt/schema.py +++ b/metagpt/schema.py @@ -81,6 +81,7 @@ class Task(BaseModel): code: str = "" result: str = "" is_finished: bool = False + task_guide: str = "" class Plan(BaseModel): From 7436150849344945de0d7783538f9e7d7f44fb41 Mon Sep 17 00:00:00 2001 From: wubinhao <15754305168@163.com> Date: Tue, 5 Dec 2023 18:02:02 +0800 Subject: [PATCH 2/5] add code plan --- metagpt/actions/write_task_guide.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagpt/actions/write_task_guide.py b/metagpt/actions/write_task_guide.py index eff53feef..75067d33c 100644 --- a/metagpt/actions/write_task_guide.py +++ b/metagpt/actions/write_task_guide.py @@ -63,7 +63,7 @@ class WriteTaskGuide(Action): 'task_id': 'task_id', 'instruction': 'instruction', 'is_finished': 'is_finished', - # 'task_guide': 'code_plan' + 'task_guide': 'code_plan' } def process_task(task): From 2e7abe7d0342c13f782c878662f065a7a1b829eb Mon Sep 17 00:00:00 2001 From: wubinhao <15754305168@163.com> Date: Wed, 6 Dec 2023 11:24:24 +0800 Subject: [PATCH 3/5] change task_guide to code_steps --- metagpt/actions/write_analysis_code.py | 12 ++++---- ...rite_task_guide.py => write_code_steps.py} | 21 +++++-------- metagpt/llm.py | 2 +- metagpt/roles/ml_engineer.py | 30 +++++++++---------- metagpt/schema.py | 2 +- 5 files changed, 31 insertions(+), 36 deletions(-) rename metagpt/actions/{write_task_guide.py => write_code_steps.py} (80%) diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py index db0df2f90..1127dc78b 100644 --- a/metagpt/actions/write_analysis_code.py +++ b/metagpt/actions/write_analysis_code.py @@ -85,7 +85,7 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode): self, context: [List[Message]], plan: Plan = None, - task_guide: str = "", + code_steps: str = "", system_msg: str = None, **kwargs, ) -> str: @@ -155,7 +155,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): self, context: List[Message], plan: Plan = None, - task_guide: str = "", + code_steps: str = "", data_desc: str = "", ) -> str: task_type = plan.current_task.task_type @@ -165,12 +165,12 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): {k: tool[k] for k in ["name", "description"] if k in tool} for tool in available_tools ] - task_guide = "\n".join( - [f"Step {step.strip()}" for step in task_guide.split("\n")] + code_steps = "\n".join( + [f"Step {step.strip()}" for step in code_steps.split("\n")] ) recommend_tools = await self._tool_recommendation( - task, task_guide, available_tools + task, code_steps, available_tools ) recommend_tools, tool_catalog = self._parse_recommend_tools(task_type, recommend_tools) logger.info(f"Recommended tools for every steps: {recommend_tools}") @@ -194,7 +194,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): completed_code=completed_code, data_desc=data_desc, special_prompt=special_prompt, - code_steps=task_guide, + code_steps=code_steps, module_name=module_name, output_desc=output_desc, available_tools=recommend_tools, diff --git a/metagpt/actions/write_task_guide.py b/metagpt/actions/write_code_steps.py similarity index 80% rename from metagpt/actions/write_task_guide.py rename to metagpt/actions/write_code_steps.py index 75067d33c..47ea0b1df 100644 --- a/metagpt/actions/write_task_guide.py +++ b/metagpt/actions/write_code_steps.py @@ -6,7 +6,7 @@ from metagpt.actions import Action from metagpt.schema import Message, Task, Plan -TASK_GUIDE_PROMPT_TEMPLATE = """ +CODE_STEPS_PROMPT_TEMPLATE = """ # Context {context} @@ -38,7 +38,7 @@ STRUCTURAL_CONTEXT = """ """ -class WriteTaskGuide(Action): +class WriteCodeSteps(Action): async def run(self, plan: Plan) -> str: """Run of a task guide writing action, used in ml engineer @@ -51,24 +51,19 @@ class WriteTaskGuide(Action): """ context = self.get_context(plan) - task_guide_prompt = TASK_GUIDE_PROMPT_TEMPLATE.format( + code_steps_prompt = CODE_STEPS_PROMPT_TEMPLATE.format( context=context, ) - task_guide = await self._aask(task_guide_prompt) - return task_guide + code_steps = await self._aask(code_steps_prompt) + return code_steps def get_context(self, plan: Plan): user_requirement = plan.goal - task_rename_map = { - 'task_id': 'task_id', - 'instruction': 'instruction', - 'is_finished': 'is_finished', - 'task_guide': 'code_plan' - } + select_task_keys = ['task_id', 'instruction', 'is_finished', 'code_steps'] def process_task(task): task_dict = task.dict() - ptask = {task_rename_map[k]: task_dict[k] for k in task_dict if k in task_rename_map} + ptask = {k: task_dict[k] for k in task_dict if k in select_task_keys} return ptask tasks = json.dumps( [process_task(task) for task in plan.tasks], indent=4, ensure_ascii=False @@ -77,6 +72,6 @@ class WriteTaskGuide(Action): context = STRUCTURAL_CONTEXT.format( user_requirement=user_requirement, tasks=tasks, current_task=current_task ) - # print(context) + print(context) return context diff --git a/metagpt/llm.py b/metagpt/llm.py index 4edcd7a83..c8ddf9a26 100644 --- a/metagpt/llm.py +++ b/metagpt/llm.py @@ -11,7 +11,7 @@ from metagpt.config import CONFIG from metagpt.provider.anthropic_api import Claude2 as Claude from metagpt.provider.openai_api import OpenAIGPTAPI from metagpt.provider.zhipuai_api import ZhiPuAIGPTAPI -from metagpt.provider.spark_api import SparkAPI +# from metagpt.provider.spark_api import SparkAPI from metagpt.provider.human_provider import HumanProvider diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index d905b7bfd..e957d66c4 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -12,7 +12,7 @@ from metagpt.logs import logger from metagpt.actions.write_plan import WritePlan from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools from metagpt.actions.execute_code import ExecutePyCode -from metagpt.actions.write_task_guide import WriteTaskGuide +from metagpt.actions.write_code_steps import WriteCodeSteps STRUCTURAL_CONTEXT = """ ## User Requirement @@ -75,7 +75,7 @@ class MLEngineer(Role): self._set_react_mode(react_mode="plan_and_act") self.plan = Plan(goal=goal) self.use_tools = False - self.use_task_guide = True + self.use_code_steps = True self.execute_code = ExecutePyCode() self.auto_run = auto_run @@ -88,7 +88,7 @@ class MLEngineer(Role): logger.info(f"ready to take on task {task}") # take on current task - code, result, success, task_guide = await self._write_and_exec_code() + code, result, success, code_steps = await self._write_and_exec_code() # ask for acceptance, users can other refuse and change tasks in the plan task_result_confirmed = await self._ask_review() @@ -97,7 +97,7 @@ class MLEngineer(Role): # tick off this task and record progress task.code = code task.result = result - task.task_guide = task_guide + task.code_steps = code_steps self.plan.finish_current_task() self.working_memory.clear() @@ -106,9 +106,9 @@ class MLEngineer(Role): await self._update_plan() async def _write_and_exec_code(self, max_retry: int = 3): - task_guide = ( - await WriteTaskGuide().run(self.plan) - if self.use_task_guide + code_steps = ( + await WriteCodeSteps().run(self.plan) + if self.use_code_steps else "" ) @@ -123,14 +123,14 @@ class MLEngineer(Role): # breakpoint() if not self.use_tools or self.plan.current_task.task_type == "other": - # code = "print('abc')" - code = await WriteCodeByGenerate().run( - context=context, plan=self.plan, task_guide=task_guide, temperature=0.0 - ) + code = "print('abc')" + # code = await WriteCodeByGenerate().run( + # context=context, plan=self.plan, code_steps=code_steps, temperature=0.0 + # ) cause_by = WriteCodeByGenerate else: code = await WriteCodeWithTools().run( - context=context, plan=self.plan, task_guide=task_guide, data_desc="" + context=context, plan=self.plan, code_steps=code_steps, data_desc="" ) cause_by = WriteCodeWithTools @@ -153,7 +153,7 @@ class MLEngineer(Role): counter += 1 - return code, result, success, task_guide + return code, result, success, code_steps async def _ask_review(self): if not self.auto_run: @@ -203,9 +203,9 @@ class MLEngineer(Role): if __name__ == "__main__": # requirement = "Run data analysis on sklearn Iris dataset, include a plot" # requirement = "Run data analysis on sklearn Diabetes dataset, include a plot" - requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy" + # requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy" # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy" - # requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv" + requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv" async def main(requirement: str = requirement, auto_run: bool = False): role = MLEngineer(goal=requirement, auto_run=auto_run) diff --git a/metagpt/schema.py b/metagpt/schema.py index db6861280..2e4260096 100644 --- a/metagpt/schema.py +++ b/metagpt/schema.py @@ -81,7 +81,7 @@ class Task(BaseModel): code: str = "" result: str = "" is_finished: bool = False - task_guide: str = "" + code_steps: str = "" class Plan(BaseModel): From 58e8e4c87936d6bf721f91109d4595a864a23203 Mon Sep 17 00:00:00 2001 From: wubinhao <15754305168@163.com> Date: Wed, 6 Dec 2023 15:56:26 +0800 Subject: [PATCH 4/5] fix --- metagpt/actions/write_code_steps.py | 2 +- metagpt/llm.py | 2 +- metagpt/roles/ml_engineer.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/metagpt/actions/write_code_steps.py b/metagpt/actions/write_code_steps.py index 47ea0b1df..d3f6e5553 100644 --- a/metagpt/actions/write_code_steps.py +++ b/metagpt/actions/write_code_steps.py @@ -72,6 +72,6 @@ class WriteCodeSteps(Action): context = STRUCTURAL_CONTEXT.format( user_requirement=user_requirement, tasks=tasks, current_task=current_task ) - print(context) + # print(context) return context diff --git a/metagpt/llm.py b/metagpt/llm.py index c8ddf9a26..4edcd7a83 100644 --- a/metagpt/llm.py +++ b/metagpt/llm.py @@ -11,7 +11,7 @@ from metagpt.config import CONFIG from metagpt.provider.anthropic_api import Claude2 as Claude from metagpt.provider.openai_api import OpenAIGPTAPI from metagpt.provider.zhipuai_api import ZhiPuAIGPTAPI -# from metagpt.provider.spark_api import SparkAPI +from metagpt.provider.spark_api import SparkAPI from metagpt.provider.human_provider import HumanProvider diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index e957d66c4..ce0689497 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -123,10 +123,10 @@ class MLEngineer(Role): # breakpoint() if not self.use_tools or self.plan.current_task.task_type == "other": - code = "print('abc')" - # code = await WriteCodeByGenerate().run( - # context=context, plan=self.plan, code_steps=code_steps, temperature=0.0 - # ) + # code = "print('abc')" + code = await WriteCodeByGenerate().run( + context=context, plan=self.plan, code_steps=code_steps, temperature=0.0 + ) cause_by = WriteCodeByGenerate else: code = await WriteCodeWithTools().run( From 029adbc6d6fcc10e1cd553e2412b2355de36f2e8 Mon Sep 17 00:00:00 2001 From: wubinhao <15754305168@163.com> Date: Wed, 6 Dec 2023 16:48:31 +0800 Subject: [PATCH 5/5] update functions --- .../tools/functions/libs/data_preprocess.py | 123 +++++++++++ metagpt/tools/functions/libs/ml_model.py | 196 ++++++++++++++++++ .../functions/schemas/data_preprocess.py | 62 ++++++ metagpt/tools/functions/schemas/ml_model.py | 55 +++++ 4 files changed, 436 insertions(+) create mode 100644 metagpt/tools/functions/libs/data_preprocess.py create mode 100644 metagpt/tools/functions/libs/ml_model.py create mode 100644 metagpt/tools/functions/schemas/data_preprocess.py create mode 100644 metagpt/tools/functions/schemas/ml_model.py diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py new file mode 100644 index 000000000..68c96bbc9 --- /dev/null +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -0,0 +1,123 @@ + +import pandas as pd +import numpy as np + +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import KBinsDiscretizer +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import MaxAbsScaler +from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import OrdinalEncoder + +from metagpt.tools.functions import registry +from metagpt.tools.functions.schemas.data_preprocess import * + + +@registry.register("data_preprocess", FillMissingValue) +def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,): + df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features]) + return df + + +# @registry.register("data_preprocess", FillMissingValue) +# def label_encode(df: pd.DataFrame, features: list,): +# for col in features: +# df[col] = LabelEncoder().fit_transform(df[col]) +# return df + + +@registry.register("data_preprocess", SplitBins) +def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',): + df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", MinMaxScale) +def min_max_scale(df: pd.DataFrame, features: list, ): + df[features] = MinMaxScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", StandardScale) +def standard_scale(df: pd.DataFrame, features: list, ): + df[features] = StandardScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", LogTransform) +def log_transform(df: pd.DataFrame, features: list, ): + for col in features: + if df[col].min() <= 0: + df[col] = df[col] - df[col].min() + 2 + df[col] = np.log(df[col]) + return df + + +@registry.register("data_preprocess", MaxAbsScale) +def max_abs_scale(df: pd.DataFrame, features: list, ): + df[features] = MaxAbsScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", RobustScale) +def robust_scale(df: pd.DataFrame, features: list, ): + df[features] = RobustScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", OrdinalEncode) +def ordinal_encode(df: pd.DataFrame, features: list,): + df[features] = OrdinalEncoder().fit_transform(df[features]) + return df + + +if __name__ == '__main__': + def run(): + V = { + 'a': [-1, 2, 3, 6, 5, 4], + 'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4], + 'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], + 'd': [1, None, 3, None, 5, 4], + 'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4], + 'f': ['aa', np.NAN, 'cc', None, '', 'ff'], + + } + + df = pd.DataFrame(V) + print(df.dtypes) + + numeric_features = ['a', 'b', 'd', 'e'] + numeric_features_wo_miss = ['a', 'b', ] + categorial_features = ['c', 'f'] + + df_ = fill_missing_value(df.copy(), numeric_features) + print(df_) + df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe') + print(df_) + + df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999) + print(df_) + + # df_ = label_encode(df.copy(), numeric_features + categorial_features, ) + # print(df_) + + df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile') + print(df_) + + df_ = min_max_scale(df.copy(), numeric_features, ) + print(df_) + + df_ = standard_scale(df.copy(), numeric_features, ) + print(df_) + + df_ = log_transform(df.copy(), numeric_features, ) + print(df_) + + df_ = max_abs_scale(df.copy(), numeric_features, ) + print(df_) + + df_ = robust_scale(df.copy(), numeric_features, ) + print(df_) + run() \ No newline at end of file diff --git a/metagpt/tools/functions/libs/ml_model.py b/metagpt/tools/functions/libs/ml_model.py new file mode 100644 index 000000000..b669de2c1 --- /dev/null +++ b/metagpt/tools/functions/libs/ml_model.py @@ -0,0 +1,196 @@ +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder + +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import GradientBoostingClassifier + + +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import GradientBoostingRegressor + +from metagpt.tools.functions import registry +from metagpt.tools.functions.schemas.ml_model import * + + +######### +## 分类 ## +######### + + +@registry.register("classification_model", LogisticRegressionClassification) +def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + + model = LogisticRegression(penalty=penalty, dual=dual) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict_proba(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("classification_model", RandomForestClassification) +def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict_proba(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("classification_model", GradientBoostingClassification) +def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict_proba(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + + +######### +## 回归 ## +######### + + +@registry.register("regression_model", LinearRegressionRegression) +def linear_regression(df, label, test_size=0.2, ): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + + model = LinearRegression() + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("regression_model", RandomForestRegression) +def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("regression_model", GradientBoostingRegression) +def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +if __name__ == '__main__': + def run(): + from sklearn.datasets import load_iris + loader = load_iris(as_frame=True) + df = loader['data'] + df['target'] = loader['target'] + + df[df.columns[0]] = df[df.columns[0]].astype(str) + df[df.columns[1]] = df[df.columns[1]].astype(int) + df['target'] = df['target'].astype(str) + + print(df) + print('####'*5) + res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False) + print(res['te_pred_prob']) + + print('####'*5) + res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini') + print(res['te_pred_prob']) + + print('####'*5) + res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1) + print(res['te_pred_prob']) + + from sklearn.datasets import make_regression + import pandas as pd + loader = make_regression() + df = pd.DataFrame(loader[0]) + df['target'] = loader[1] + + df[df.columns[0]] = df[df.columns[0]].astype(str) + df[df.columns[1]] = df[df.columns[1]].astype(int) + # df['target'] = df['target'].astype(str) + + print(df) + print('####' * 5) + res = linear_regression(df, 'target', test_size=0.25, ) + print(res['te_pred_prob']) + + print('####' * 5) + res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error') + print(res['te_pred_prob']) + + print('####' * 5) + res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1) + print(res['te_pred_prob']) + run() \ No newline at end of file diff --git a/metagpt/tools/functions/schemas/data_preprocess.py b/metagpt/tools/functions/schemas/data_preprocess.py new file mode 100644 index 000000000..40e1d64e0 --- /dev/null +++ b/metagpt/tools/functions/schemas/data_preprocess.py @@ -0,0 +1,62 @@ + +import pandas as pd + +from metagpt.tools.functions.schemas.base import tool_field, ToolSchema + + +class FillMissingValue(ToolSchema): + """Completing missing values with simple strategies""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + strategy: str = tool_field(description="the imputation strategy", default='mean') + fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None) + + +# class LabelEncode(ToolSchema): +# """Completing missing values with simple strategies""" +# df: pd.DataFrame = tool_field(description="input dataframe") +# features: list = tool_field(description="columns to be processed") + + +class SplitBins(ToolSchema): + """Bin continuous data into intervals and return the bin identifier encoded as an integer value""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile') + + +class MinMaxScale(ToolSchema): + """Transform features by scaling each feature to a range, witch is (0, 1)""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class StandardScale(ToolSchema): + """Standardize features by removing the mean and scaling to unit variance""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class LogTransform(ToolSchema): + """Performs a logarithmic transformation on the specified columns""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class MaxAbsScale(ToolSchema): + """Scale each feature by its maximum absolute value""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class RobustScale(ToolSchema): + """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class OrdinalEncode(ToolSchema): + """Encode categorical features as an integer array""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + diff --git a/metagpt/tools/functions/schemas/ml_model.py b/metagpt/tools/functions/schemas/ml_model.py new file mode 100644 index 000000000..9268156af --- /dev/null +++ b/metagpt/tools/functions/schemas/ml_model.py @@ -0,0 +1,55 @@ +import pandas as pd + +from metagpt.tools.functions.schemas.base import tool_field, ToolSchema + + +class LogisticRegressionClassification(ToolSchema): + """Logistic Regression (aka logit, MaxEnt) classifier""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + penalty: str = tool_field(description="Specify the norm of the penalty", default="l2") + dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default="l2") + + +class RandomForestClassification(ToolSchema): + """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of trees in the forest", default=100) + criterion: str = tool_field(description="The function to measure the quality of a split", default="gini") + + +class GradientBoostingClassification(ToolSchema): + """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100) + learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1) + + +class LinearRegressionRegression(ToolSchema): + """Ordinary least squares Linear Regression.""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + + +class RandomForestRegression(ToolSchema): + """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of trees in the forest", default=100) + criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error") + + +class GradientBoostingRegression(ToolSchema): + """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100) + learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)