diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py index 646b4f3f1..c8a28edd1 100644 --- a/metagpt/actions/write_analysis_code.py +++ b/metagpt/actions/write_analysis_code.py @@ -16,10 +16,14 @@ from metagpt.prompts.ml_engineer import ( ML_SPECIFIC_PROMPT, ML_MODULE_MAP, TOOL_OUTPUT_DESC, + TOOL_USAGE_PROMPT, ) from metagpt.schema import Message, Plan from metagpt.tools.functions import registry from metagpt.utils.common import create_func_config +from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT, GENERATE_CODE_PROMPT +from metagpt.utils.common import CodeParser +from metagpt.actions.execute_code import ExecutePyCode class BaseWriteAnalysisCode(Action): @@ -47,13 +51,13 @@ class BaseWriteAnalysisCode(Action): # 添加默认的提示词 if ( - default_system_msg not in messages[0]["content"] - and messages[0]["role"] != "system" + default_system_msg not in messages[0]["content"] + and messages[0]["role"] != "system" ): messages.insert(0, {"role": "system", "content": default_system_msg}) elif ( - default_system_msg not in messages[0]["content"] - and messages[0]["role"] == "system" + default_system_msg not in messages[0]["content"] + and messages[0]["role"] == "system" ): messages[0] = { "role": "system", @@ -62,14 +66,14 @@ class BaseWriteAnalysisCode(Action): return messages async def run( - self, context: List[Message], plan: Plan = None, task_guide: str = "" + self, context: List[Message], plan: Plan = None, code_steps: str = "" ) -> str: """Run of a code writing action, used in data analysis or modeling Args: context (List[Message]): Action output history, source action denoted by Message.cause_by plan (Plan, optional): Overall plan. Defaults to None. - task_guide (str, optional): suggested step breakdown for the current task. Defaults to "". + code_steps (str, optional): suggested step breakdown for the current task. Defaults to "". Returns: str: The code string. @@ -83,12 +87,12 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode): super().__init__(name, context, llm) async def run( - self, - context: [List[Message]], - plan: Plan = None, - task_guide: str = "", - system_msg: str = None, - **kwargs, + self, + context: [List[Message]], + plan: Plan = None, + code_steps: str = "", + system_msg: str = None, + **kwargs, ) -> str: context.append(Message(content=self.REUSE_CODE_INSTRUCTION, role="user")) prompt = self.process_msg(context, system_msg) @@ -98,6 +102,7 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode): class WriteCodeWithTools(BaseWriteAnalysisCode): """Write code with help of local available tools. Choose tools first, then generate code to use the tools""" + execute_code = ExecutePyCode() @staticmethod def _parse_recommend_tools(module: str, recommend_tools: list) -> List[Dict]: @@ -121,10 +126,10 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): return tool_catalog async def _tool_recommendation( - self, - context: [List[Message]], - code_steps: str, - available_tools: list + self, + context: [List[Message]], + code_steps: str, + available_tools: list ) -> list: """ Recommend tools for the specified task. @@ -148,15 +153,28 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): recommend_tools = rsp["recommend_tools"] return recommend_tools + async def run( - self, - context: List[Message], - plan: Plan = None, - task_guide: str = "", + self, + context: List[Message], + plan: Plan = None, + code_steps: str = "", + **kwargs, ) -> str: task_type = plan.current_task.task_type + logger.info(f"task_type is: {task_type}") available_tools = registry.get_all_schema_by_module(task_type) - special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "") + + # special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "") + + finished_tasks = plan.get_finished_tasks() + code_context = [task.code for task in finished_tasks] + + code_context = "\n\n".join(code_context) + + ### add runtime info + result, success = await self.execute_code.run(code_context) + logger.info(result) if len(available_tools) > 0: available_tools = [ @@ -164,25 +182,46 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): for tool in available_tools ] - recommend_tools = await self._tool_recommendation(context, task_guide, available_tools) + final_code = code_context + + recommend_tools = await self._tool_recommendation(context, code_steps, available_tools) tool_catalog = self._parse_recommend_tools(task_type, recommend_tools) logger.info(f"Recommended tools: \n{recommend_tools}") module_name = ML_MODULE_MAP[task_type] output_desc = TOOL_OUTPUT_DESC.get(task_type, "") - prompt = TOO_ORGANIZATION_PROMPT.format( - special_prompt=special_prompt, - code_steps=task_guide, + + hist_info = f"Previous finished code is \n\n ```Python {final_code} ``` \n\n " \ + f"Conde runtime result is {result} \n\n" + + prompt = TOOL_USAGE_PROMPT.format( + goal=plan.current_task.instruction, + context=hist_info, + code_steps=code_steps, module_name=module_name, output_desc=output_desc, function_catalog=tool_catalog, ) - context.append(Message(content=prompt, role="user")) - else: - context.append(Message(content=self.REUSE_CODE_INSTRUCTION, role="user")) - context.append(Message(content=special_prompt, role="user")) - prompt = self.process_msg(context) - tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS) - rsp = await self.llm.aask_code(prompt, **tool_config) - return rsp["code"] + tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS) + + rsp = await self.llm.aask_code(prompt, **tool_config) + logger.info(f"rsp is: {rsp}") + final_code = final_code + "\n\n" + rsp["code"] + + return final_code + + else: + hist_info = f"Previous finished code is \n\n ```Python {code_context} ``` \n\n " \ + f"Conde runtime result is {result} \n\n" + + prompt = GENERATE_CODE_PROMPT.format( + goal=plan.current_task.instruction, + context=hist_info, + ) + + tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS) + logger.info(f"prompt is: {prompt}") + rsp = await self.llm.aask_code(prompt, **tool_config) + logger.info(f"rsp is: {rsp}") + return rsp["code"] diff --git a/metagpt/actions/write_code_steps.py b/metagpt/actions/write_code_steps.py new file mode 100644 index 000000000..0bfb9c225 --- /dev/null +++ b/metagpt/actions/write_code_steps.py @@ -0,0 +1,80 @@ + +import json +from typing import Dict, List, Union + +from metagpt.actions import Action +from metagpt.schema import Message, Task, Plan +from metagpt.utils.common import CodeParser + +CODE_STEPS_PROMPT_TEMPLATE = """ +# Context +{context} + +----- +Tasks are all code development tasks. +You are a professional engineer, the main goal is to plan out concise solution steps for Current Task before coding. +A planning process can reduce the difficulty and improve the quality of coding. +You may be given some code plans for the tasks ahead, but you don't have to follow the existing plan when planning the current task. +The output plan should following the subsequent principles: +1.The plan is a rough checklist of steps outlining the entire program's structure.Try to keep the number of steps fewer than 5. +2.The steps should be written concisely and at a high level, avoiding overly detailed implementation specifics. +3.The execution of the plan happens sequentially, but the plan can incorporate conditional (if) and looping(loop) keywords for more complex structures. + +Output the code steps in a JSON format, as shown in this example: +```json +{ + "Step 1": "", + "Step 2": "", + "Step 3": "", + ... +} +``` +""" + +STRUCTURAL_CONTEXT = """ +## User Requirement +{user_requirement} +## Current Plan +{tasks} +## Current Task +{current_task} +""" + + +class WriteCodeSteps(Action): + + async def run(self, plan: Plan) -> str: + """Run of a task guide writing action, used in ml engineer + + Args: + plan (plan): task plan + useful_memories (list): useful_memories + Returns: + str: The dataset_descriptions string. + """ + + context = self.get_context(plan) + code_steps_prompt = CODE_STEPS_PROMPT_TEMPLATE.replace( + "{context}", context + ) + code_steps = await self._aask(code_steps_prompt) + code_steps = CodeParser.parse_code(block=None, text=code_steps) + return code_steps + + def get_context(self, plan: Plan): + user_requirement = plan.goal + select_task_keys = ['task_id', 'instruction', 'is_finished', 'code_steps'] + + def process_task(task): + task_dict = task.dict() + ptask = {k: task_dict[k] for k in task_dict if k in select_task_keys} + return ptask + tasks = json.dumps( + [process_task(task) for task in plan.tasks], indent=4, ensure_ascii=False + ) + current_task = json.dumps(process_task(plan.current_task)) if plan.current_task else {} + context = STRUCTURAL_CONTEXT.format( + user_requirement=user_requirement, tasks=tasks, current_task=current_task + ) + # print(context) + return context diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py index d568bdd1f..b68dadc9a 100644 --- a/metagpt/prompts/ml_engineer.py +++ b/metagpt/prompts/ml_engineer.py @@ -23,7 +23,6 @@ Output the information in a JSON format, as shown in this example: ``` """ - ASSIGN_TASK_TYPE_PROMPT = """ Please assign a task type to each task in the list below from the given categories: {task_list} @@ -53,7 +52,6 @@ ASSIGN_TASK_TYPE = { }, } - TOOL_RECOMMENDATION_PROMPT = """ Your are a tool recommender, the main goal is to recommend suitable tools for current task before coding. A tool means a function that can be used to help you solve the task. @@ -88,7 +86,6 @@ SELECT_FUNCTION_TOOLS = { }, } - CODE_GENERATOR_WITH_TOOLS = { "name": "add_subtask_code", "description": "Add new code cell of current task to the end of an active Jupyter notebook.", @@ -104,6 +101,54 @@ CODE_GENERATOR_WITH_TOOLS = { }, } +TOOL_USAGE_PROMPT = """ +## Target +{goal} + +## History Info +{context} + +## Available Tools: +Each function is described in JSON format, including the function name and parameters. {output_desc} +{function_catalog} + +When you call a function above, you should import the function from `{module_name}` first, e.g.: +```python +from metagpt.tools.functions.libs.data_preprocess import fill_missing_value +```end + +## Your Output Format: +Generate the complete code for this task: +```python +# Tools used: [function names or 'none'] + +```end + +## Attention: +Make sure use the columns from the dataset columns +Finish your coding tasks as a helpful programmer based on the tools. + +""" +GENERATE_CODE_PROMPT = """ +## Target +{goal} + +## History Info +{context} + +## Your Output Format: +Generate the complete code for this task: +```python +# Tools used: [function names or 'none'] + +```end + +## Attention: +Make sure use the columns from the dataset columns +Finish your coding tasks as a helpful programmer based on the tools. + +""" + TOO_ORGANIZATION_PROMPT = """ The previous conversation has provided all tasks step-by-step for the use goal and their statuses. Now, begin writing code for the current task. This code should writen strictly on the basis of all previous completed tasks code, not a standalone code. And avoid writing duplicate code that has already been written in previous tasks, such as repeated import of packages, reading data, etc. @@ -167,7 +212,6 @@ CLASSIFICATION_MODEL_OUTPUT_DESC = "" REGRESSION_MODEL_OUTPUT_DESC = "" - ML_SPECIFIC_PROMPT = { "data_preprocess": DATA_PREPROCESS_PROMPT, "feature_engineering": FEATURE_ENGINEERING_PROMPT, diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index c088ff104..deb76f0a9 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -10,12 +10,12 @@ from metagpt.actions import Action from metagpt.actions.execute_code import ExecutePyCode from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools from metagpt.actions.write_plan import WritePlan -# from metagpt.actions.write_task_guide import WriteTaskGuide from metagpt.logs import logger from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT from metagpt.roles import Role from metagpt.schema import Message, Plan from metagpt.utils.common import CodeParser +from metagpt.actions.write_code_steps import WriteCodeSteps STRUCTURAL_CONTEXT = """ ## User Requirement @@ -39,7 +39,7 @@ catboost def truncate(result: str, keep_len: int = 1000) -> str: desc = "Truncated to show only the last 1000 characters\n" if result.startswith(desc): - result = result[-len(desc):] + result = result[-len(desc) :] if len(result) > keep_len: result = result[-keep_len:] @@ -110,9 +110,9 @@ class AskReview(Action): logger.info("most recent context:") latest_action = context[-1].cause_by.__name__ if context[-1].cause_by else "" prompt = f"\nPlease review output from {latest_action}:\n" \ - "If you want to change a task in the plan, say 'change task task_id, ... (things to change)'\n" \ - "If you confirm the output and wish to continue with the current process, type CONFIRM\n" \ - "If you want to terminate the process, type exit:\n" + "If you want to change a task in the plan, say 'change task task_id, ... (things to change)'\n" \ + "If you confirm the output and wish to continue with the current process, type CONFIRM\n" \ + "If you want to terminate the process, type exit:\n" rsp = input(prompt) if rsp.lower() in ("exit"): @@ -123,11 +123,6 @@ class AskReview(Action): return rsp, confirmed -# class WriteTaskGuide(Action): -# async def run(self, task_instruction: str, data_desc: dict = None) -> str: -# return "" - - class GenerateDataDesc(Action): async def run(self, files: list) -> dict: data_desc = {} @@ -148,13 +143,13 @@ class GenerateDataDesc(Action): class MLEngineer(Role): def __init__( - self, name="ABC", profile="MLEngineer", goal="", auto_run: bool = False, data_path: str = None + self, name="ABC", profile="MLEngineer", goal="", auto_run: bool = False, data_path: str = None ): super().__init__(name=name, profile=profile, goal=goal) self._set_react_mode(react_mode="plan_and_act") self.plan = Plan(goal=goal) self.use_tools = True - self.use_task_guide = True + self.use_code_steps = True self.execute_code = ExecutePyCode() self.auto_run = auto_run self.data_path = data_path @@ -164,6 +159,7 @@ class MLEngineer(Role): if self.data_path: self.data_desc = await self._generate_data_desc() + # create initial plan and update until confirmation await self._update_plan() @@ -172,7 +168,7 @@ class MLEngineer(Role): logger.info(f"ready to take on task {task}") # take on current task - code, result, success = await self._write_and_exec_code() + code, result, success, code_steps = await self._write_and_exec_code() # ask for acceptance, users can other refuse and change tasks in the plan task_result_confirmed = await self._ask_review() @@ -181,6 +177,7 @@ class MLEngineer(Role): # tick off this task and record progress task.code = code task.result = result + task.code_steps = code_steps self.plan.finish_current_task() self.working_memory.clear() @@ -194,9 +191,9 @@ class MLEngineer(Role): return data_desc async def _write_and_exec_code(self, max_retry: int = 3): - task_guide = ( - await WriteTaskGuide().run(self.plan) - if self.use_task_guide + code_steps = ( + await WriteCodeSteps().run(self.plan) + if self.use_code_steps else "" ) @@ -204,23 +201,22 @@ class MLEngineer(Role): success = False while not success and counter < max_retry: context = self.get_useful_memories() - - # print("*" * 10) - # print(context) - # print("*" * 10) # breakpoint() + column_names_dict = {key: value["column_info"] for key,value in self.data_desc.items()} + if not self.use_tools or self.plan.current_task.task_type == "other": logger.info("Write code with pure generation") # code = "print('abc')" code = await WriteCodeByGenerate().run( - context=context, plan=self.plan, task_guide=task_guide, temperature=0.0 + context=context, plan=self.plan, code_steps=code_steps, temperature=0.0 ) cause_by = WriteCodeByGenerate else: logger.info("Write code with tools") + code = await WriteCodeWithTools().run( - context=context, plan=self.plan, task_guide=task_guide + context=context, plan=self.plan, code_steps=code_steps, **{"column_names": column_names_dict} ) cause_by = WriteCodeWithTools @@ -243,7 +239,7 @@ class MLEngineer(Role): counter += 1 - return code, result, success + return code, result, success, code_steps async def _ask_review(self): if not self.auto_run: @@ -272,7 +268,7 @@ class MLEngineer(Role): def get_useful_memories(self) -> List[Message]: """find useful memories only to reduce context length and improve performance""" - + # TODO dataset description , code steps user_requirement = self.plan.goal tasks = json.dumps( [task.dict() for task in self.plan.tasks], indent=4, ensure_ascii=False @@ -294,7 +290,7 @@ class MLEngineer(Role): if __name__ == "__main__": - # requirement = "Run data analysis on sklearn Iris dataset, include a plot.." + # requirement = "Run data analysis on sklearn Iris dataset, include a plot" # requirement = "Run data analysis on sklearn Diabetes dataset, include a plot" # requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy" # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy" @@ -305,10 +301,8 @@ if __name__ == "__main__": requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy." data_path = f"{DATA_PATH}/titanic" - async def main(requirement: str = requirement, auto_run: bool = True, data_path: str = data_path): role = MLEngineer(goal=requirement, auto_run=auto_run, data_path=data_path) await role.run(requirement) - fire.Fire(main) diff --git a/metagpt/schema.py b/metagpt/schema.py index e39f54a0c..2e4260096 100644 --- a/metagpt/schema.py +++ b/metagpt/schema.py @@ -81,6 +81,7 @@ class Task(BaseModel): code: str = "" result: str = "" is_finished: bool = False + code_steps: str = "" class Plan(BaseModel): diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py new file mode 100644 index 000000000..68c96bbc9 --- /dev/null +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -0,0 +1,123 @@ + +import pandas as pd +import numpy as np + +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import KBinsDiscretizer +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import MaxAbsScaler +from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import OrdinalEncoder + +from metagpt.tools.functions import registry +from metagpt.tools.functions.schemas.data_preprocess import * + + +@registry.register("data_preprocess", FillMissingValue) +def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,): + df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features]) + return df + + +# @registry.register("data_preprocess", FillMissingValue) +# def label_encode(df: pd.DataFrame, features: list,): +# for col in features: +# df[col] = LabelEncoder().fit_transform(df[col]) +# return df + + +@registry.register("data_preprocess", SplitBins) +def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',): + df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", MinMaxScale) +def min_max_scale(df: pd.DataFrame, features: list, ): + df[features] = MinMaxScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", StandardScale) +def standard_scale(df: pd.DataFrame, features: list, ): + df[features] = StandardScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", LogTransform) +def log_transform(df: pd.DataFrame, features: list, ): + for col in features: + if df[col].min() <= 0: + df[col] = df[col] - df[col].min() + 2 + df[col] = np.log(df[col]) + return df + + +@registry.register("data_preprocess", MaxAbsScale) +def max_abs_scale(df: pd.DataFrame, features: list, ): + df[features] = MaxAbsScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", RobustScale) +def robust_scale(df: pd.DataFrame, features: list, ): + df[features] = RobustScaler().fit_transform(df[features]) + return df + + +@registry.register("data_preprocess", OrdinalEncode) +def ordinal_encode(df: pd.DataFrame, features: list,): + df[features] = OrdinalEncoder().fit_transform(df[features]) + return df + + +if __name__ == '__main__': + def run(): + V = { + 'a': [-1, 2, 3, 6, 5, 4], + 'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4], + 'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], + 'd': [1, None, 3, None, 5, 4], + 'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4], + 'f': ['aa', np.NAN, 'cc', None, '', 'ff'], + + } + + df = pd.DataFrame(V) + print(df.dtypes) + + numeric_features = ['a', 'b', 'd', 'e'] + numeric_features_wo_miss = ['a', 'b', ] + categorial_features = ['c', 'f'] + + df_ = fill_missing_value(df.copy(), numeric_features) + print(df_) + df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe') + print(df_) + + df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999) + print(df_) + + # df_ = label_encode(df.copy(), numeric_features + categorial_features, ) + # print(df_) + + df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile') + print(df_) + + df_ = min_max_scale(df.copy(), numeric_features, ) + print(df_) + + df_ = standard_scale(df.copy(), numeric_features, ) + print(df_) + + df_ = log_transform(df.copy(), numeric_features, ) + print(df_) + + df_ = max_abs_scale(df.copy(), numeric_features, ) + print(df_) + + df_ = robust_scale(df.copy(), numeric_features, ) + print(df_) + run() \ No newline at end of file diff --git a/metagpt/tools/functions/libs/ml_model.py b/metagpt/tools/functions/libs/ml_model.py new file mode 100644 index 000000000..b669de2c1 --- /dev/null +++ b/metagpt/tools/functions/libs/ml_model.py @@ -0,0 +1,196 @@ +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder + +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import GradientBoostingClassifier + + +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import GradientBoostingRegressor + +from metagpt.tools.functions import registry +from metagpt.tools.functions.schemas.ml_model import * + + +######### +## 分类 ## +######### + + +@registry.register("classification_model", LogisticRegressionClassification) +def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + + model = LogisticRegression(penalty=penalty, dual=dual) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict_proba(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("classification_model", RandomForestClassification) +def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict_proba(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("classification_model", GradientBoostingClassification) +def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict_proba(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + + +######### +## 回归 ## +######### + + +@registry.register("regression_model", LinearRegressionRegression) +def linear_regression(df, label, test_size=0.2, ): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + + model = LinearRegression() + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("regression_model", RandomForestRegression) +def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +@registry.register("regression_model", GradientBoostingRegression) +def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1): + nonnumeric_columns = [col for col in df if df[col].dtype == 'object'] + for col in nonnumeric_columns: + df[col] = LabelEncoder().fit_transform(df[col]) + df = df.fillna(0) + + features = [col for col in df if col != label] + x, y = df[features], df[label] + tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1) + model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate) + model.fit(tr_x, tr_y, ) + te_pred_prob = model.predict(te_x) + + res = { + 'te_pred_prob': te_pred_prob + } + return res + + +if __name__ == '__main__': + def run(): + from sklearn.datasets import load_iris + loader = load_iris(as_frame=True) + df = loader['data'] + df['target'] = loader['target'] + + df[df.columns[0]] = df[df.columns[0]].astype(str) + df[df.columns[1]] = df[df.columns[1]].astype(int) + df['target'] = df['target'].astype(str) + + print(df) + print('####'*5) + res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False) + print(res['te_pred_prob']) + + print('####'*5) + res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini') + print(res['te_pred_prob']) + + print('####'*5) + res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1) + print(res['te_pred_prob']) + + from sklearn.datasets import make_regression + import pandas as pd + loader = make_regression() + df = pd.DataFrame(loader[0]) + df['target'] = loader[1] + + df[df.columns[0]] = df[df.columns[0]].astype(str) + df[df.columns[1]] = df[df.columns[1]].astype(int) + # df['target'] = df['target'].astype(str) + + print(df) + print('####' * 5) + res = linear_regression(df, 'target', test_size=0.25, ) + print(res['te_pred_prob']) + + print('####' * 5) + res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error') + print(res['te_pred_prob']) + + print('####' * 5) + res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1) + print(res['te_pred_prob']) + run() \ No newline at end of file diff --git a/metagpt/tools/functions/schemas/data_preprocess.py b/metagpt/tools/functions/schemas/data_preprocess.py new file mode 100644 index 000000000..40e1d64e0 --- /dev/null +++ b/metagpt/tools/functions/schemas/data_preprocess.py @@ -0,0 +1,62 @@ + +import pandas as pd + +from metagpt.tools.functions.schemas.base import tool_field, ToolSchema + + +class FillMissingValue(ToolSchema): + """Completing missing values with simple strategies""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + strategy: str = tool_field(description="the imputation strategy", default='mean') + fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None) + + +# class LabelEncode(ToolSchema): +# """Completing missing values with simple strategies""" +# df: pd.DataFrame = tool_field(description="input dataframe") +# features: list = tool_field(description="columns to be processed") + + +class SplitBins(ToolSchema): + """Bin continuous data into intervals and return the bin identifier encoded as an integer value""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile') + + +class MinMaxScale(ToolSchema): + """Transform features by scaling each feature to a range, witch is (0, 1)""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class StandardScale(ToolSchema): + """Standardize features by removing the mean and scaling to unit variance""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class LogTransform(ToolSchema): + """Performs a logarithmic transformation on the specified columns""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class MaxAbsScale(ToolSchema): + """Scale each feature by its maximum absolute value""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class RobustScale(ToolSchema): + """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + + +class OrdinalEncode(ToolSchema): + """Encode categorical features as an integer array""" + df: pd.DataFrame = tool_field(description="input dataframe") + features: list = tool_field(description="columns to be processed") + diff --git a/metagpt/tools/functions/schemas/ml_model.py b/metagpt/tools/functions/schemas/ml_model.py new file mode 100644 index 000000000..9268156af --- /dev/null +++ b/metagpt/tools/functions/schemas/ml_model.py @@ -0,0 +1,55 @@ +import pandas as pd + +from metagpt.tools.functions.schemas.base import tool_field, ToolSchema + + +class LogisticRegressionClassification(ToolSchema): + """Logistic Regression (aka logit, MaxEnt) classifier""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + penalty: str = tool_field(description="Specify the norm of the penalty", default="l2") + dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default="l2") + + +class RandomForestClassification(ToolSchema): + """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of trees in the forest", default=100) + criterion: str = tool_field(description="The function to measure the quality of a split", default="gini") + + +class GradientBoostingClassification(ToolSchema): + """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100) + learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1) + + +class LinearRegressionRegression(ToolSchema): + """Ordinary least squares Linear Regression.""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + + +class RandomForestRegression(ToolSchema): + """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of trees in the forest", default=100) + criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error") + + +class GradientBoostingRegression(ToolSchema): + """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion""" + df: pd.DataFrame = tool_field(description="input dataframe") + label: str = tool_field(description="target name") + test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2) + n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100) + learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)