From 07771a769955f900b305334920d2ef5c70eae5bc Mon Sep 17 00:00:00 2001 From: lidanyang Date: Tue, 12 Dec 2023 10:56:51 +0800 Subject: [PATCH] add ml Class tool schema --- .../functions/schemas/data_preprocess.yml | 306 +++++++++++++ .../functions/schemas/feature_engineering.yml | 429 ++++++++++++++++++ 2 files changed, 735 insertions(+) create mode 100644 metagpt/tools/functions/schemas/data_preprocess.yml create mode 100644 metagpt/tools/functions/schemas/feature_engineering.yml diff --git a/metagpt/tools/functions/schemas/data_preprocess.yml b/metagpt/tools/functions/schemas/data_preprocess.yml new file mode 100644 index 000000000..95b0124cc --- /dev/null +++ b/metagpt/tools/functions/schemas/data_preprocess.yml @@ -0,0 +1,306 @@ +FillMissingValue: + type: class + description: "Completing missing values with simple strategies" + methods: + __init__: + description: "Initialize self." + parameters: + properties: + features: + type: list + description: "columns to be processed" + strategy: + type: str + description: "the imputation strategy" + default: mean + enum: + - mean + - median + - most_frequent + - constant + fill_value: + type: int + description: "fill_value is used to replace all occurrences of missing_values" + default: null + required: + - features + fit: + description: "Fit the FillMissingValue model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +MinMaxScale: + type: class + description: "Transform features by scaling each feature to a range, witch is (0, 1)" + methods: + __init__: + description: "Initialize self." + parameters: + properties: + features: + type: list + description: "columns to be processed" + required: + - features + fit: + description: "Fit the MinMaxScale model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +StandardScale: + type: class + description: "Standardize features by removing the mean and scaling to unit variance" + methods: + __init__: + description: "Initialize self." + parameters: + properties: + features: + type: list + description: "columns to be processed" + required: + - features + fit: + description: "Fit the StandardScale model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +MaxAbsScale: + type: class + description: "cale each feature by its maximum absolute value" + methods: + __init__: + description: "Initialize self." + parameters: + properties: + features: + type: list + description: "columns to be processed" + required: + - features + fit: + description: "Fit the MaxAbsScale model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +LabelEncode: + type: class + description: "Apply label encoding to specified categorical columns in-place." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + features: + type: list + description: "Categorical columns to be label encoded" + required: + - features + fit: + description: "Fit the LabelEncode model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +OneHotEncode: + type: class + description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + features: + type: list + description: "Categorical columns to be one-hot encoded and dropped" + required: + - features + fit: + description: "Fit the OneHotEncoding model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." \ No newline at end of file diff --git a/metagpt/tools/functions/schemas/feature_engineering.yml b/metagpt/tools/functions/schemas/feature_engineering.yml new file mode 100644 index 000000000..2cc4ec2fa --- /dev/null +++ b/metagpt/tools/functions/schemas/feature_engineering.yml @@ -0,0 +1,429 @@ +PolynomialExpansion: + type: class + description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + cols: + type: list + description: "Columns for polynomial expansion." + degree: + type: int + description: "The degree of the polynomial features." + default: 2 + required: + - cols + fit: + description: "Fit the PolynomialExpansion model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +CatCount: + type: class + description: "Add value counts of categorical columns as new features." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + cols: + type: list + description: "Columns for value counts." + required: + - cols + fit: + description: "Fit the CatCount model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +TargetMeanEncoder: + type: class + description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + col: + type: str + description: "Column to be mean encoded." + label: + type: str + description: "Predicted label column." + required: + - col + - label + fit: + description: "Fit the TargetMeanEncoder model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +KFoldTargetMeanEncoder: + type: class + description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + col: + type: str + description: "Column to be k-fold mean encoded." + label: + type: str + description: "Predicted label column." + n_splits: + type: int + description: "Number of splits for K-fold." + default: 5 + random_state: + type: int + description: "Random seed." + default: 2021 + required: + - col + - label + fit: + description: "Fit the KFoldTargetMeanEncoder model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +CatCross: + type: class + description: "Add pairwise crossed features and convert them to numerical features." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + cols: + type: list + description: "Columns to be pairwise crossed." + max_cat_num: + type: int + description: "Maximum unique categories per crossed feature." + default: 100 + required: + - cols + fit: + description: "Fit the CatCross model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +GroupStat: + type: class + description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '__by_'." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + group_col: + type: str + description: "Column used for grouping." + agg_col: + type: str + description: "Column on which aggregation is performed." + agg_funcs: + type: list + description: >- + List of aggregation functions to apply, such as ['mean', 'std']. + Each function must be supported by pandas. + required: + - group_col + - agg_col + - agg_funcs + fit: + description: "Fit the GroupStat model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +SplitBins: + type: class + description: "Bin continuous data into intervals and return the bin identifier encoded as an integer value" + methods: + __init__: + description: "Initialize self." + parameters: + properties: + cols: + type: list + description: "Columns to be binned." + strategy: + type: str + description: "Strategy used to define the widths of the bins." + default: quantile + required: + - cols + fit: + description: "Fit the SplitBins model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + +GeneralSelection: + type: class + description: "Drop all nan feats and feats with only one unique value." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + label_col: + type: str + description: "Label column name." + required: + - label_col + fit: + description: "Fit the GeneralSelection model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame." \ No newline at end of file