drop old tool definition

This commit is contained in:
lidanyang 2023-12-13 20:41:32 +08:00
parent 7744815c5f
commit edd6987a1c
8 changed files with 0 additions and 615 deletions

View file

@ -4,6 +4,3 @@
# @Author : lidanyang
# @File : __init__.py
# @Desc :
from metagpt.tools.functions.register.register import registry
import metagpt.tools.functions.libs.feature_engineering
import metagpt.tools.functions.libs.data_preprocess

View file

@ -1,196 +0,0 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.ml_model import *
####################
## Classification ##
####################
@registry.register("classification_model", LogisticRegressionClassification)
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
    """Fit a logistic-regression classifier on a hold-out split of ``df``.

    Object-dtype columns are label-encoded and missing values are replaced
    by 0 before the rows are split with a fixed ``random_state`` of 1.

    Args:
        df: Input DataFrame containing the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split`` as the test share.
        penalty: Forwarded to ``LogisticRegression``.
        dual: Forwarded to ``LogisticRegression``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` holding ``predict_proba`` for the test rows.
    """
    # Work on a copy: the encoding below assigns whole columns, which would
    # otherwise overwrite the caller's DataFrame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, _ = train_test_split(x, y, test_size=test_size, random_state=1)
    model = LogisticRegression(penalty=penalty, dual=dual)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
@registry.register("classification_model", RandomForestClassification)
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
    """Fit a random-forest classifier on a hold-out split of ``df``.

    Object-dtype columns are label-encoded and missing values are replaced
    by 0 before the rows are split with a fixed ``random_state`` of 1.

    Args:
        df: Input DataFrame containing the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split`` as the test share.
        n_estimators: Forwarded to ``RandomForestClassifier``.
        criterion: Forwarded to ``RandomForestClassifier``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` holding ``predict_proba`` for the test rows.
    """
    # Work on a copy: the encoding below assigns whole columns, which would
    # otherwise overwrite the caller's DataFrame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, _ = train_test_split(x, y, test_size=test_size, random_state=1)
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
@registry.register("classification_model", GradientBoostingClassification)
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit a gradient-boosting classifier on a hold-out split of ``df``.

    Object-dtype columns are label-encoded and missing values are replaced
    by 0 before the rows are split with a fixed ``random_state`` of 1.

    Args:
        df: Input DataFrame containing the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split`` as the test share.
        n_estimators: Forwarded to ``GradientBoostingClassifier``.
        learning_rate: Forwarded to ``GradientBoostingClassifier``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` holding ``predict_proba`` for the test rows.
    """
    # Work on a copy: the encoding below assigns whole columns, which would
    # otherwise overwrite the caller's DataFrame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, _ = train_test_split(x, y, test_size=test_size, random_state=1)
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
################
## Regression ##
################
@registry.register("regression_model", LinearRegressionRegression)
def linear_regression(df, label, test_size=0.2, ):
    """Fit an ordinary linear regression on a hold-out split of ``df``.

    Object-dtype columns are label-encoded and missing values are replaced
    by 0 before the rows are split with a fixed ``random_state`` of 1.

    Args:
        df: Input DataFrame containing the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split`` as the test share.

    Returns:
        dict: ``{'te_pred_prob': ...}`` holding ``predict`` output for the test
        rows (the key name is shared with the classification helpers).
    """
    # Work on a copy: the encoding below assigns whole columns, which would
    # otherwise overwrite the caller's DataFrame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, _ = train_test_split(x, y, test_size=test_size, random_state=1)
    model = LinearRegression()
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
@registry.register("regression_model", RandomForestRegression)
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
    """Fit a random-forest regressor on a hold-out split of ``df``.

    Object-dtype columns are label-encoded and missing values are replaced
    by 0 before the rows are split with a fixed ``random_state`` of 1.

    Args:
        df: Input DataFrame containing the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split`` as the test share.
        n_estimators: Forwarded to ``RandomForestRegressor``.
        criterion: Forwarded to ``RandomForestRegressor``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` holding ``predict`` output for the test
        rows (the key name is shared with the classification helpers).
    """
    # Work on a copy: the encoding below assigns whole columns, which would
    # otherwise overwrite the caller's DataFrame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, _ = train_test_split(x, y, test_size=test_size, random_state=1)
    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
@registry.register("regression_model", GradientBoostingRegression)
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit a gradient-boosting regressor on a hold-out split of ``df``.

    Object-dtype columns are label-encoded and missing values are replaced
    by 0 before the rows are split with a fixed ``random_state`` of 1.

    Args:
        df: Input DataFrame containing the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split`` as the test share.
        n_estimators: Forwarded to ``GradientBoostingRegressor``.
        learning_rate: Forwarded to ``GradientBoostingRegressor``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` holding ``predict`` output for the test
        rows (the key name is shared with the classification helpers).
    """
    # Work on a copy: the encoding below assigns whole columns, which would
    # otherwise overwrite the caller's DataFrame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, _ = train_test_split(x, y, test_size=test_size, random_state=1)
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
if __name__ == '__main__':
    def run():
        """Call every model helper once on toy data and print what it returns."""
        import pandas as pd
        from sklearn.datasets import load_iris, make_regression

        banner = '####' * 5

        # Classification demo: iris with the first column and the target cast
        # to str and the second column cast to int.
        iris = load_iris(as_frame=True)
        df = iris['data']
        df['target'] = iris['target']
        first, second = df.columns[0], df.columns[1]
        df[first] = df[first].astype(str)
        df[second] = df[second].astype(int)
        df['target'] = df['target'].astype(str)
        print(df)
        classification_runs = (
            (logistic_regression_classification, {'penalty': 'l2', 'dual': False}),
            (random_forest_classification, {'n_estimators': 100, 'criterion': 'gini'}),
            (gradient_boosting_classification, {'n_estimators': 100, 'learning_rate': 0.1}),
        )
        for fit, options in classification_runs:
            print(banner)
            res = fit(df, 'target', test_size=0.25, **options)
            print(res['te_pred_prob'])

        # Regression demo: synthetic data with the same casts on the first two
        # columns; the target stays numeric.
        samples, targets = make_regression()
        df = pd.DataFrame(samples)
        df['target'] = targets
        first, second = df.columns[0], df.columns[1]
        df[first] = df[first].astype(str)
        df[second] = df[second].astype(int)
        print(df)
        regression_runs = (
            (linear_regression, {}),
            (random_forest_regression, {'n_estimators': 100, 'criterion': 'squared_error'}),
            (gradient_boosting_regression, {'n_estimators': 100, 'learning_rate': 0.1}),
        )
        for fit, options in regression_runs:
            print(banner)
            res = fit(df, 'target', test_size=0.25, **options)
            print(res['te_pred_prob'])

    run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/16 16:37
# @Author : lidanyang
# @File : __init__.py
# @Desc :

View file

@ -1,78 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/16 16:38
# @Author : lidanyang
# @File : register.py
# @Desc :
import inspect
from typing import Type, Optional, Callable, Dict, Union, List
from metagpt.tools.functions.schemas.base import ToolSchema
class FunctionRegistry:
    """Registry of tool functions and their schemas, grouped by module name.

    ``self.functions`` maps ``module -> function name -> entry`` where each
    entry is ``{"func": <callable>, "schema": <schema dict>}``.
    """

    def __init__(self):
        self.functions: Dict[str, Dict[str, Dict]] = {}

    @staticmethod
    def _check_param_consistency(func_params, schema):
        """Raise ``ValueError`` unless parameter names equal the schema property names."""
        param_names = set(func_params.keys())
        schema_names = set(schema["parameters"]["properties"].keys())
        if param_names != schema_names:
            raise ValueError("Function parameters do not match schema properties")

    def register(self, module: str, tool_schema: Type[ToolSchema]) -> Callable:
        """Return a decorator that registers a function under ``module``.

        Args:
            module: Name of the group the function is stored under.
            tool_schema: Schema class; its ``schema()`` output is stored with the
                function after its ``"name"`` is set to the function's name.

        Returns:
            Callable: Decorator that returns the decorated function unchanged.

        Raises:
            ValueError: If the function name is already registered in ``module``
                or its parameters do not match the schema properties.
        """
        def wrapper(func: Callable) -> Callable:
            if func.__name__ in self.functions.get(module, {}):
                raise ValueError(f"Function {func.__name__} is already registered in {module}")
            func_params = inspect.signature(func).parameters
            schema = tool_schema.schema()
            schema["name"] = func.__name__
            self._check_param_consistency(func_params, schema)
            # Create the module bucket only after validation so a rejected
            # registration leaves the registry untouched.
            self.functions.setdefault(module, {})[func.__name__] = {
                "func": func,
                "schema": schema,
            }
            return func
        return wrapper

    def get(self, module: str, name: str) -> Optional[Dict]:
        """Return the ``{"func", "schema"}`` entry for ``name`` in ``module``, or None."""
        module_registry = self.functions.get(module, {})
        return module_registry.get(name)

    def get_by_name(self, name: str) -> Optional[Dict]:
        """Return the first entry called ``name`` across all modules, or None."""
        for module_registry in self.functions.values():
            if name in module_registry:
                return module_registry[name]
        return None

    def get_all_by_module(self, module: str) -> Dict:
        """Return every entry registered under ``module`` (empty dict if unknown)."""
        return self.functions.get(module, {})

    def get_schema(self, module: str, name: str) -> Optional[Dict]:
        """Return the schema for ``name`` in ``module``, or None if not registered."""
        module_registry = self.functions.get(module, {})
        return module_registry.get(name, {}).get("schema")

    def get_schemas(self, module: str, names: List[str]) -> List[Dict]:
        """Return schemas for ``names`` in order; unknown names yield None entries."""
        module_registry = self.functions.get(module, {})
        return [module_registry.get(name, {}).get("schema") for name in names]

    def get_all_schema_by_module(self, module: str) -> List[Dict]:
        """Return the schemas of every function registered under ``module``."""
        module_registry = self.functions.get(module, {})
        return [v.get("schema") for v in module_registry.values()]
# Module-level FunctionRegistry instance; other modules import this object to
# register and look up tool functions.
registry = FunctionRegistry()

View file

@ -1,100 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/16 16:34
# @Author : lidanyang
# @File : base.py
# @Desc : Build base class to generate schema for tool
from typing import Any, List, Optional, get_type_hints
class NoDefault:
    """Marker type whose instances stand for "no default value was given".

    It makes an explicit default of ``None`` distinguishable from a default
    that was never supplied.
    """
def tool_field(
    description: str, default: Any = NoDefault(), enum: Optional[List[Any]] = None, **kwargs
):
    """Build the metadata dictionary describing one tool parameter.

    Args:
        description (str): Text describing the parameter.
        default (Any, optional): Default value of the parameter. When omitted,
            a ``NoDefault`` instance is stored to mark the absence of a default.
        enum (Optional[List[Any]], optional): Allowed values. Defaults to None.
        **kwargs: Extra entries copied into the result unchanged.

    Returns:
        dict: ``description``, ``default`` and ``enum`` followed by the extra
        keyword arguments.
    """
    return {"description": description, "default": default, "enum": enum, **kwargs}
# Base class for tool parameter schemas: subclasses declare annotated class
# attributes whose values are field dictionaries, and ``schema()`` turns them
# into a schema dictionary.
# NOTE: documented with comments rather than a class docstring on purpose --
# ``schema()`` publishes ``cls.__doc__`` as the description, so adding a
# docstring here would change the base class's own schema output.
class ToolSchema:
    @staticmethod
    def format_type(type_hint):
        """
        Format a type hint into a string representation.

        Args:
            type_hint (type): The type hint to format.

        Returns:
            str: The bare name for builtin classes, ``module.name`` for other
            classes, ``origin[args]`` for parameterized generics, and
            ``str(type_hint)`` for anything else.
        """
        if isinstance(type_hint, type):
            # Builtins are shown without the "builtins." prefix.
            if type_hint.__module__ == "builtins":
                return type_hint.__name__
            return f"{type_hint.__module__}.{type_hint.__name__}"
        if hasattr(type_hint, "__origin__") and hasattr(type_hint, "__args__"):
            # Parameterized generics (like List[int]) are formatted recursively.
            origin_type = ToolSchema.format_type(type_hint.__origin__)
            args_type = ", ".join(ToolSchema.format_type(t) for t in type_hint.__args__)
            return f"{origin_type}[{args_type}]"
        return str(type_hint)

    @classmethod
    def schema(cls):
        """
        Generate a schema dictionary for the class.

        Only annotated attributes whose value is a dictionary are reported. A
        field without a default (a ``NoDefault`` marker, or no ``"default"``
        key at all) is listed under ``required``.

        Returns:
            dict: ``name`` (class name), ``description`` (class docstring) and
            ``parameters`` with ``type``, ``properties`` and ``required``.
        """
        properties = {}
        required = []
        for attr, type_hint in get_type_hints(cls).items():
            value = getattr(cls, attr, None)
            if not isinstance(value, dict):
                continue
            # Drop unset (None) entries but keep an explicit ``default=None``.
            prop_info = {k: v for k, v in value.items() if v is not None or k == "default"}
            # A field dictionary lacking "default" is treated as having no
            # default instead of raising KeyError.
            if "default" in prop_info and isinstance(prop_info["default"], NoDefault):
                del prop_info["default"]
            prop_info["type"] = ToolSchema.format_type(type_hint)
            properties[attr] = prop_info
            if "default" not in prop_info:
                required.append(attr)
        return {
            "name": cls.__name__,
            "description": cls.__doc__,
            "parameters": {"type": "object", "properties": properties, "required": required},
        }

View file

@ -1,67 +0,0 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""

    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description.
    df: pd.DataFrame = tool_field("input dataframe")
    features: list = tool_field("columns to be processed")
    strategy: str = tool_field(
        "the imputation strategy",
        default="mean",
        enum=["mean", "median", "most_frequent", "constant"],
    )
    # default=None is an explicit default, so this field is not reported as required.
    fill_value: int = tool_field(
        "fill_value is used to replace all occurrences of missing_values",
        default=None,
    )
class SplitBins(ToolSchema):
    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""

    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description.
    df: pd.DataFrame = tool_field("input dataframe")
    features: list = tool_field("columns to be processed")
    strategy: str = tool_field(
        "Strategy used to define the widths of the bins",
        default="quantile",
    )
class MinMaxScale(ToolSchema):
    """Transform features by scaling each feature to a range, which is (0, 1)"""

    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description, so its spelling matters to consumers.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class StandardScale(ToolSchema):
    """Standardize features by removing the mean and scaling to unit variance"""

    # Both fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("input dataframe")
    features: list = tool_field("columns to be processed")
class LogTransform(ToolSchema):
    """Performs a logarithmic transformation on the specified columns"""

    # Both fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("input dataframe")
    features: list = tool_field("columns to be processed")
class MaxAbsScale(ToolSchema):
    """Scale each feature by its maximum absolute value"""

    # Both fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("input dataframe")
    features: list = tool_field("columns to be processed")
class RobustScale(ToolSchema):
    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""

    # Both fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("input dataframe")
    features: list = tool_field("columns to be processed")
class OrdinalEncode(ToolSchema):
    """Encode categorical features as an integer array"""

    # Both fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("input dataframe")
    features: list = tool_field("columns to be processed")
class OneHotEncoding(ToolSchema):
    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""

    # Both fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("DataFrame to process.")
    cols: list = tool_field("Categorical columns to be one-hot encoded and dropped.")

View file

@ -1,110 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/17 10:34
# @Author : lidanyang
# @File : feature_engineering.py
# @Desc : Schema for feature engineering functions
from typing import List
import pandas as pd
from metagpt.tools.functions.schemas.base import ToolSchema, tool_field
class PolynomialExpansion(ToolSchema):
    """Add polynomial and interaction features from selected numeric columns, excluding the bias column."""

    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description.
    df: pd.DataFrame = tool_field("DataFrame to process.")
    cols: list = tool_field("Columns for polynomial expansion.")
    degree: int = tool_field("Degree of polynomial features.", default=2)
class FrequencyEncoding(ToolSchema):
    """Add value counts of categorical columns as new features."""

    # Both fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("DataFrame to process.")
    cols: list = tool_field("Categorical columns to be frequency encoded.")
class TargetMeanEncoder(ToolSchema):
    """Encodes a categorical column by the mean of the label column, and adds the result as a new feature."""

    # All three fields have no default, so schema() reports them as required.
    df: pd.DataFrame = tool_field("DataFrame to process.")
    col: str = tool_field("Column to be mean encoded.")
    label: str = tool_field("Predicted label column.")
class KFoldTargetMeanEncoder(ToolSchema):
    """Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."""

    # Fields without a default are reported as required by schema().
    df: pd.DataFrame = tool_field("DataFrame to process.")
    col: str = tool_field("Column to be k-fold mean encoded.")
    label: str = tool_field("Predicted label column.")
    n_splits: int = tool_field("Number of splits for K-fold.", default=5)
    random_state: int = tool_field("Random seed.", default=2021)
class CatCross(ToolSchema):
    """Add pairwise crossed features and convert them to numerical features."""

    # Fields without a default are reported as required by schema().
    df: pd.DataFrame = tool_field("DataFrame to process.")
    cols: list = tool_field("Columns to be pairwise crossed.")
    max_cat_num: int = tool_field("Maximum unique categories per crossed feature.", default=100)
class GroupStat(ToolSchema):
    """Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."""
    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description. No field has a default, so all are required.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    group_col: str = tool_field(description="Column used for grouping.")
    agg_col: str = tool_field(description="Column on which aggregation is performed.")
    # The description below is a multi-line literal; its line break is part of
    # the published text.
    agg_funcs: list = tool_field(
        description="""List of aggregation functions to apply, such as ['mean', 'std'].
Each function must be supported by pandas."""
    )
class ExtractTimeComps(ToolSchema):
    """Extract and add specific time components as new features from a designated time column."""
    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description. No field has a default, so all are required.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    time_col: str = tool_field(
        description="The name of the column containing time data."
    )
    # The allowed components are only spelled out in the description text;
    # no ``enum`` is declared for this field.
    time_comps: List[str] = tool_field(
        description="""List of time components to extract.
Each component must be in ['year', 'month', 'day', 'hour', 'dayofweek', 'is_weekend']."""
    )
class FeShiftByTime(ToolSchema):
    """Shift column values based on specified time intervals and add the resulting new features to the DataFrame. New features are named in the format of '<group_col>_<shift_col>_lag_<period>_<freq>'."""

    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description. No field has a default, so all are required.
    df: pd.DataFrame = tool_field("DataFrame to process.")
    time_col: str = tool_field("Column for time-based shifting.")
    group_col: str = tool_field("Column for grouping before shifting.")
    shift_col: str = tool_field("Column to shift.")
    periods: list = tool_field("Time intervals for shifting.")
    freq: str = tool_field(
        "Frequency unit for time intervals (e.g., 'D', 'M').", enum=["D", "M", "Y", "W", "H"]
    )
class FeRollingByTime(ToolSchema):
    """Calculate rolling statistics for a DataFrame column over time intervals."""
    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description. No field has a default, so all are required.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    time_col: str = tool_field(description="Column for time-based rolling.")
    group_col: str = tool_field(description="Column for grouping before rolling.")
    rolling_col: str = tool_field(description="Column for rolling calculations.")
    periods: list = tool_field(description="Window sizes for rolling.")
    freq: str = tool_field(
        description="Frequency unit for time windows (e.g., 'D', 'M').",
        enum=["D", "M", "Y", "W", "H"],
    )
    # The allowed functions are only spelled out in the description text;
    # no ``enum`` is declared for this field.
    agg_funcs: list = tool_field(
        description="""List of aggregation functions for rolling, like ['mean', 'std'].
Each function must be in ['mean', 'std', 'min', 'max', 'median', 'sum', 'count']."""
    )

View file

@ -1,55 +0,0 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
class LogisticRegressionClassification(ToolSchema):
    """Logistic Regression (aka logit, MaxEnt) classifier"""

    # The class docstring is runtime data: ToolSchema.schema() reports it as
    # the tool description.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
    # ``dual`` is a boolean flag: its default is False, not the penalty name.
    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default=False)
class RandomForestClassification(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""

    # Fields without a default are reported as required by schema().
    df: pd.DataFrame = tool_field("input dataframe")
    label: str = tool_field("target name")
    test_size: float = tool_field(
        "The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field("The number of trees in the forest", default=100)
    criterion: str = tool_field(
        "The function to measure the quality of a split",
        default="gini",
    )
class GradientBoostingClassification(ToolSchema):
    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""

    # Fields without a default are reported as required by schema().
    df: pd.DataFrame = tool_field("input dataframe")
    label: str = tool_field("target name")
    test_size: float = tool_field(
        "The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field("The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(
        "Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )
class LinearRegressionRegression(ToolSchema):
    """Ordinary least squares Linear Regression."""

    # Fields without a default are reported as required by schema().
    df: pd.DataFrame = tool_field("input dataframe")
    label: str = tool_field("target name")
    test_size: float = tool_field(
        "The proportion of the test set to all the data",
        default=0.2,
    )
class RandomForestRegression(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""

    # Fields without a default are reported as required by schema().
    df: pd.DataFrame = tool_field("input dataframe")
    label: str = tool_field("target name")
    test_size: float = tool_field(
        "The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field("The number of trees in the forest", default=100)
    criterion: str = tool_field(
        "The function to measure the quality of a split",
        default="squared_error",
    )
class GradientBoostingRegression(ToolSchema):
    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""

    # Fields without a default are reported as required by schema().
    df: pd.DataFrame = tool_field("input dataframe")
    label: str = tool_field("target name")
    test_size: float = tool_field(
        "The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field("The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(
        "Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )