mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-05 14:55:18 +02:00
Merge branch 'add_test_for_ml_tools' into 'code_intepreter'
Add test for ml tools. See merge request agents/data_agents_opt!44
This commit is contained in:
commit
4eb366cd31
7 changed files with 365 additions and 191 deletions
|
|
@ -37,8 +37,9 @@ class FillMissingValue(MLProcess):
|
|||
def transform(self, df: pd.DataFrame):
|
||||
if len(self.features) == 0:
|
||||
return df
|
||||
df[self.features] = self.si.transform(df[self.features])
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[self.features] = self.si.transform(new_df[self.features])
|
||||
return new_df
|
||||
|
||||
|
||||
class MinMaxScale(MLProcess):
|
||||
|
|
@ -54,8 +55,9 @@ class MinMaxScale(MLProcess):
|
|||
self.mms.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.mms.transform(df[self.features])
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[self.features] = self.mms.transform(new_df[self.features])
|
||||
return new_df
|
||||
|
||||
|
||||
class StandardScale(MLProcess):
|
||||
|
|
@ -71,8 +73,9 @@ class StandardScale(MLProcess):
|
|||
self.ss.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.ss.transform(df[self.features])
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[self.features] = self.ss.transform(new_df[self.features])
|
||||
return new_df
|
||||
|
||||
|
||||
class MaxAbsScale(MLProcess):
|
||||
|
|
@ -88,8 +91,9 @@ class MaxAbsScale(MLProcess):
|
|||
self.mas.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.mas.transform(df[self.features])
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[self.features] = self.mas.transform(new_df[self.features])
|
||||
return new_df
|
||||
|
||||
|
||||
class RobustScale(MLProcess):
|
||||
|
|
@ -105,8 +109,9 @@ class RobustScale(MLProcess):
|
|||
self.rs.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.rs.transform(df[self.features])
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[self.features] = self.rs.transform(new_df[self.features])
|
||||
return new_df
|
||||
|
||||
|
||||
class OrdinalEncode(MLProcess):
|
||||
|
|
@ -122,8 +127,9 @@ class OrdinalEncode(MLProcess):
|
|||
self.oe.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.oe.transform(df[self.features])
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[self.features] = self.oe.transform(new_df[self.features])
|
||||
return new_df
|
||||
|
||||
|
||||
class OneHotEncode(MLProcess):
|
||||
|
|
@ -142,9 +148,9 @@ class OneHotEncode(MLProcess):
|
|||
ts_data = self.ohe.transform(df[self.features])
|
||||
new_columns = self.ohe.get_feature_names_out(self.features)
|
||||
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
|
||||
df.drop(self.features, axis=1, inplace=True)
|
||||
df = pd.concat([df, ts_data], axis=1)
|
||||
return df
|
||||
new_df = df.drop(self.features, axis=1)
|
||||
new_df = pd.concat([new_df, ts_data], axis=1)
|
||||
return new_df
|
||||
|
||||
|
||||
class LabelEncode(MLProcess):
|
||||
|
|
@ -165,13 +171,14 @@ class LabelEncode(MLProcess):
|
|||
def transform(self, df: pd.DataFrame):
|
||||
if len(self.features) == 0:
|
||||
return df
|
||||
new_df = df.copy()
|
||||
for i in range(len(self.features)):
|
||||
data_list = df[self.features[i]].astype(str).tolist()
|
||||
for unique_item in np.unique(df[self.features[i]].astype(str)):
|
||||
if unique_item not in self.le_encoders[i].classes_:
|
||||
data_list = ["unknown" if x == unique_item else x for x in data_list]
|
||||
df[self.features[i]] = self.le_encoders[i].transform(data_list)
|
||||
return df
|
||||
new_df[self.features[i]] = self.le_encoders[i].transform(data_list)
|
||||
return new_df
|
||||
|
||||
|
||||
def get_column_info(df: pd.DataFrame) -> dict:
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/11/17 10:33
|
||||
# @Author : lidanyang
|
||||
# @File : feature_engineering.py
|
||||
# @File : test_feature_engineering.py
|
||||
# @Desc : Feature Engineering Tools
|
||||
import itertools
|
||||
|
||||
|
|
@ -43,9 +43,9 @@ class PolynomialExpansion(MLProcess):
|
|||
ts_data = self.poly.transform(df[self.cols].fillna(0))
|
||||
column_name = self.poly.get_feature_names_out(self.cols)
|
||||
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
|
||||
df.drop(self.cols, axis=1, inplace=True)
|
||||
df = pd.concat([df, ts_data], axis=1)
|
||||
return df
|
||||
new_df = df.drop(self.cols, axis=1)
|
||||
new_df = pd.concat([new_df, ts_data], axis=1)
|
||||
return new_df
|
||||
|
||||
|
||||
class CatCount(MLProcess):
|
||||
|
|
@ -57,8 +57,9 @@ class CatCount(MLProcess):
|
|||
self.encoder_dict = df[self.col].value_counts().to_dict()
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[f"{self.col}_cnt"] = df[self.col].map(self.encoder_dict)
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[f"{self.col}_cnt"] = new_df[self.col].map(self.encoder_dict)
|
||||
return new_df
|
||||
|
||||
|
||||
class TargetMeanEncoder(MLProcess):
|
||||
|
|
@ -71,8 +72,9 @@ class TargetMeanEncoder(MLProcess):
|
|||
self.encoder_dict = df.groupby(self.col)[self.label].mean().to_dict()
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[f"{self.col}_target_mean"] = df[self.col].map(self.encoder_dict)
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[f"{self.col}_target_mean"] = new_df[self.col].map(self.encoder_dict)
|
||||
return new_df
|
||||
|
||||
|
||||
class KFoldTargetMeanEncoder(MLProcess):
|
||||
|
|
@ -96,8 +98,9 @@ class KFoldTargetMeanEncoder(MLProcess):
|
|||
self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict()
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict)
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[f"{self.col}_kf_target_mean"] = new_df[self.col].map(self.encoder_dict)
|
||||
return new_df
|
||||
|
||||
|
||||
class CatCross(MLProcess):
|
||||
|
|
@ -124,14 +127,15 @@ class CatCross(MLProcess):
|
|||
self.combs_map = dict(res)
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
new_df = df.copy()
|
||||
for comb in self.combs:
|
||||
new_col = f"{comb[0]}_{comb[1]}"
|
||||
_map = self.combs_map[new_col]
|
||||
df[new_col] = pd.Series(zip(df[comb[0]], df[comb[1]])).map(_map)
|
||||
new_df[new_col] = pd.Series(zip(new_df[comb[0]], new_df[comb[1]])).map(_map)
|
||||
# set the unknown value to a new number
|
||||
df[new_col].fillna(max(_map.values()) + 1, inplace=True)
|
||||
df[new_col] = df[new_col].astype(int)
|
||||
return df
|
||||
new_df[new_col].fillna(max(_map.values()) + 1, inplace=True)
|
||||
new_df[new_col] = new_df[new_col].astype(int)
|
||||
return new_df
|
||||
|
||||
|
||||
class GroupStat(MLProcess):
|
||||
|
|
@ -149,12 +153,12 @@ class GroupStat(MLProcess):
|
|||
self.group_df = group_df
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df.merge(self.group_df, on=self.group_col, how="left")
|
||||
return df
|
||||
new_df = df.merge(self.group_df, on=self.group_col, how="left")
|
||||
return new_df
|
||||
|
||||
|
||||
class SplitBins(MLProcess):
|
||||
def __init__(self, cols: str, strategy: str = "quantile"):
|
||||
def __init__(self, cols: list, strategy: str = "quantile"):
|
||||
self.cols = cols
|
||||
self.strategy = strategy
|
||||
self.encoder = None
|
||||
|
|
@ -164,8 +168,9 @@ class SplitBins(MLProcess):
|
|||
self.encoder.fit(df[self.cols].fillna(0))
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
|
||||
return df
|
||||
new_df = df.copy()
|
||||
new_df[self.cols] = self.encoder.transform(new_df[self.cols].fillna(0))
|
||||
return new_df
|
||||
|
||||
|
||||
class ExtractTimeComps(MLProcess):
|
||||
|
|
@ -192,91 +197,8 @@ class ExtractTimeComps(MLProcess):
|
|||
time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
|
||||
if "is_weekend" in self.time_comps:
|
||||
time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
|
||||
df = pd.concat([df, time_comps_df], axis=1)
|
||||
return df
|
||||
|
||||
|
||||
# @registry.register("feature_engineering", FeShiftByTime)
|
||||
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
|
||||
# df[time_col] = pd.to_datetime(df[time_col])
|
||||
#
|
||||
# def shift_datetime(date, offset, unit):
|
||||
# if unit in ["year", "y", "Y"]:
|
||||
# return date + relativedelta(years=offset)
|
||||
# elif unit in ["month", "m", "M"]:
|
||||
# return date + relativedelta(months=offset)
|
||||
# elif unit in ["day", "d", "D"]:
|
||||
# return date + relativedelta(days=offset)
|
||||
# elif unit in ["week", "w", "W"]:
|
||||
# return date + relativedelta(weeks=offset)
|
||||
# elif unit in ["hour", "h", "H"]:
|
||||
# return date + relativedelta(hours=offset)
|
||||
# else:
|
||||
# return date
|
||||
#
|
||||
# def shift_by_time_on_key(
|
||||
# inner_df, time_col, group_col, shift_col, offset, unit, col_name
|
||||
# ):
|
||||
# inner_df = inner_df.drop_duplicates()
|
||||
# inner_df[time_col] = inner_df[time_col].map(
|
||||
# lambda x: shift_datetime(x, offset, unit)
|
||||
# )
|
||||
# inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
|
||||
# shift_col
|
||||
# ].mean()
|
||||
# inner_df.rename(columns={shift_col: col_name}, inplace=True)
|
||||
# return inner_df
|
||||
#
|
||||
# shift_df = df[[time_col, group_col, shift_col]].copy()
|
||||
# for period in periods:
|
||||
# new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
|
||||
# tmp = shift_by_time_on_key(
|
||||
# shift_df, time_col, group_col, shift_col, period, freq, new_col_name
|
||||
# )
|
||||
# df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
#
|
||||
# return df
|
||||
#
|
||||
#
|
||||
# @registry.register("feature_engineering", FeRollingByTime)
|
||||
# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
|
||||
# df[time_col] = pd.to_datetime(df[time_col])
|
||||
#
|
||||
# def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
|
||||
# time_freq = {
|
||||
# "Y": [365 * offset, "D"],
|
||||
# "M": [30 * offset, "D"],
|
||||
# "D": [offset, "D"],
|
||||
# "W": [7 * offset, "D"],
|
||||
# "H": [offset, "h"],
|
||||
# }
|
||||
#
|
||||
# if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
|
||||
# raise ValueError(f"Invalid agg function: {agg_func}")
|
||||
#
|
||||
# rolling_feat = inner_df.rolling(
|
||||
# f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
|
||||
# )
|
||||
# rolling_feat = getattr(rolling_feat, agg_func)()
|
||||
# depth = df.columns.nlevels
|
||||
# rolling_feat = rolling_feat.stack(list(range(depth)))
|
||||
# rolling_feat.name = col_name
|
||||
# return rolling_feat
|
||||
#
|
||||
# rolling_df = df[[time_col, group_col, rolling_col]].copy()
|
||||
# for period in periods:
|
||||
# for func in agg_funcs:
|
||||
# new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
|
||||
# tmp = pd.pivot_table(
|
||||
# rolling_df,
|
||||
# index=time_col,
|
||||
# values=rolling_col,
|
||||
# columns=group_col,
|
||||
# )
|
||||
# tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
|
||||
# df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
#
|
||||
# return df
|
||||
new_df = pd.concat([df, time_comps_df], axis=1)
|
||||
return new_df
|
||||
|
||||
|
||||
class GeneralSelection(MLProcess):
|
||||
|
|
@ -302,8 +224,8 @@ class GeneralSelection(MLProcess):
|
|||
self.feats = feats
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df[self.feats + [self.label_col]]
|
||||
return df
|
||||
new_df = df[self.feats + [self.label_col]]
|
||||
return new_df
|
||||
|
||||
|
||||
class TreeBasedSelection(MLProcess):
|
||||
|
|
@ -344,8 +266,8 @@ class TreeBasedSelection(MLProcess):
|
|||
self.feats.append(self.label_col)
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df[self.feats]
|
||||
return df
|
||||
new_df = df[self.feats]
|
||||
return new_df
|
||||
|
||||
|
||||
class VarianceBasedSelection(MLProcess):
|
||||
|
|
@ -364,5 +286,5 @@ class VarianceBasedSelection(MLProcess):
|
|||
self.feats.append(self.label_col)
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df[self.feats]
|
||||
return df
|
||||
new_df = df[self.feats]
|
||||
return new_df
|
||||
|
|
|
|||
|
|
@ -3,8 +3,13 @@ import asyncio
|
|||
import pytest
|
||||
|
||||
from metagpt.actions.execute_code import ExecutePyCode
|
||||
from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
|
||||
from metagpt.actions.write_analysis_code import (
|
||||
WriteCodeByGenerate,
|
||||
WriteCodeWithTools,
|
||||
WriteCodeWithToolsML,
|
||||
)
|
||||
from metagpt.logs import logger
|
||||
from metagpt.plan.planner import STRUCTURAL_CONTEXT
|
||||
from metagpt.schema import Message, Plan, Task
|
||||
|
||||
|
||||
|
|
@ -40,13 +45,15 @@ async def test_tool_recommendation():
|
|||
tools = await write_code._tool_recommendation(task, code_steps, available_tools)
|
||||
|
||||
assert len(tools) == 1
|
||||
assert tools[0] == ["fill_missing_value"]
|
||||
assert tools[0] == "fill_missing_value"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_write_code_with_tools():
|
||||
write_code = WriteCodeWithTools()
|
||||
messages = []
|
||||
write_code_ml = WriteCodeWithToolsML()
|
||||
|
||||
requirement = "构造数据集并进行数据清洗"
|
||||
task_map = {
|
||||
"1": Task(
|
||||
task_id="1",
|
||||
|
|
@ -69,10 +76,6 @@ async def test_write_code_with_tools():
|
|||
instruction="对数据集进行数据清洗",
|
||||
task_type="data_preprocess",
|
||||
dependent_task_ids=["1"],
|
||||
code_steps="""
|
||||
{"Step 1": "对数据集进行去重",
|
||||
"Step 2": "对数据集进行缺失值处理"}
|
||||
""",
|
||||
),
|
||||
}
|
||||
plan = Plan(
|
||||
|
|
@ -83,10 +86,22 @@ async def test_write_code_with_tools():
|
|||
)
|
||||
column_info = ""
|
||||
|
||||
code = await write_code.run(messages, plan, column_info)
|
||||
context = STRUCTURAL_CONTEXT.format(
|
||||
user_requirement=requirement,
|
||||
context=plan.context,
|
||||
tasks=list(task_map.values()),
|
||||
current_task=plan.current_task.model_dump_json(),
|
||||
)
|
||||
context_msg = [Message(content=context, role="user")]
|
||||
|
||||
code = await write_code.run(context_msg, plan)
|
||||
assert len(code) > 0
|
||||
print(code)
|
||||
|
||||
code_with_ml = await write_code_ml.run([], plan, column_info)
|
||||
assert len(code_with_ml) > 0
|
||||
print(code_with_ml)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_write_code_to_correct_error():
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/11/17 10:24
|
||||
# @Time : 2024/1/11 16:14
|
||||
# @Author : lidanyang
|
||||
# @File : __init__.py
|
||||
# @Desc :
|
||||
111
tests/metagpt/tools/functions/libs/test_data_preprocess.py
Normal file
111
tests/metagpt/tools/functions/libs/test_data_preprocess.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import numpy.testing as npt
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from metagpt.tools.functions.libs.data_preprocess import (
|
||||
FillMissingValue,
|
||||
LabelEncode,
|
||||
MaxAbsScale,
|
||||
MinMaxScale,
|
||||
OneHotEncode,
|
||||
OrdinalEncode,
|
||||
RobustScale,
|
||||
StandardScale,
|
||||
get_column_info,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_datasets():
    """Build a five-row frame with a numeric, a categorical and a datetime column.

    ``num1`` and ``cat1`` each hold one missing value (third row) so the
    preprocessing tools are exercised on incomplete data.
    """
    columns = {
        "num1": [1, 2, np.nan, 4, 5],
        "cat1": ["A", "B", np.nan, "D", "A"],
        # five consecutive days starting on 2020-01-01
        "date1": [datetime(2020, 1, day) for day in range(1, 6)],
    }
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
def test_fill_missing_value(mock_datasets):
    """After mean imputation the ``num1`` column must contain no nulls."""
    imputer = FillMissingValue(features=["num1"], strategy="mean")
    result = imputer.fit_transform(mock_datasets.copy())

    assert not result["num1"].isnull().any()
|
||||
|
||||
|
||||
def test_min_max_scale(mock_datasets):
    """Min-max scaling must map ``num1`` onto the closed range [0, 1]."""
    scaler = MinMaxScale(features=["num1"])
    scaled = scaler.fit_transform(mock_datasets.copy())["num1"]

    lowest, highest = scaled.min(), scaled.max()
    npt.assert_allclose(lowest, 0)
    npt.assert_allclose(highest, 1)
|
||||
|
||||
|
||||
def test_standard_scale(mock_datasets):
    """Standard scaling must centre ``num1`` on zero with unit spread.

    The previous assertions truncated with ``int()``, which accepted any mean
    in (-1, 1) and any standard deviation in [1, 2). The checks below compare
    against the expected values with a floating-point tolerance instead.
    """
    ss = StandardScale(features=["num1"])
    transformed = ss.fit_transform(mock_datasets.copy())
    scaled = transformed["num1"]

    # pandas reductions skip the NaN row by default, so only scaled values count.
    npt.assert_allclose(scaled.mean(), 0, atol=1e-9)
    # NOTE(review): assumes StandardScale wraps sklearn's StandardScaler, which
    # normalises by the population standard deviation -- hence ddof=0 here
    # rather than pandas' default sample estimate (ddof=1).
    npt.assert_allclose(scaled.std(ddof=0), 1)
|
||||
|
||||
|
||||
def test_max_abs_scale(mock_datasets):
    """The largest absolute value of the scaled ``num1`` column must be 1."""
    scaler = MaxAbsScale(features=["num1"])
    scaled = scaler.fit_transform(mock_datasets.copy())["num1"]

    peak = scaled.abs().max()
    npt.assert_allclose(peak, 1)
|
||||
|
||||
|
||||
def test_robust_scale(mock_datasets):
    """Robust scaling must centre the median of ``num1`` on zero.

    The previous ``int(median) == 0`` check truncated towards zero and so
    accepted any median in (-1, 1); compare with a tolerance instead.
    """
    rs = RobustScale(features=["num1"])
    transformed = rs.fit_transform(mock_datasets.copy())

    # NOTE(review): assumes RobustScale wraps sklearn's RobustScaler with its
    # default median centring -- confirm against the class definition.
    npt.assert_allclose(transformed["num1"].median(), 0, atol=1e-9)
|
||||
|
||||
|
||||
def test_ordinal_encode(mock_datasets):
    """The highest ordinal code assigned to ``cat1`` must be 2."""
    encoder = OrdinalEncode(features=["cat1"])
    encoded = encoder.fit_transform(mock_datasets.copy())

    top_code = encoded["cat1"].max()
    assert top_code == 2
|
||||
|
||||
|
||||
def test_one_hot_encode(mock_datasets):
    """One-hot encoding ``cat1`` must produce a ``cat1_A`` indicator that reaches 1."""
    encoder = OneHotEncode(features=["cat1"])
    encoded = encoder.fit_transform(mock_datasets.copy())

    indicator = encoded["cat1_A"]
    assert indicator.max() == 1
|
||||
|
||||
|
||||
def test_label_encode(mock_datasets):
    """Check label codes on the fitted data and on data with unseen categories."""
    encoder = LabelEncode(features=["cat1"])
    fitted = encoder.fit_transform(mock_datasets.copy())

    assert fitted["cat1"].max() == 3

    # transform a frame whose cat1 column contains labels absent during fit
    unseen = mock_datasets.copy()
    unseen["cat1"] = ["A", "B", "C", "D", "E"]
    recoded = encoder.transform(unseen)

    assert recoded["cat1"].max() == 4
|
||||
|
||||
|
||||
def test_get_column_info(mock_datasets):
    """``get_column_info`` must bucket each fixture column under its kind."""
    expected = {
        "Category": ["cat1"],
        "Numeric": ["num1"],
        "Datetime": ["date1"],
        "Others": [],
    }

    assert get_column_info(mock_datasets) == expected
|
||||
174
tests/metagpt/tools/functions/libs/test_feature_engineering.py
Normal file
174
tests/metagpt/tools/functions/libs/test_feature_engineering.py
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_iris
|
||||
|
||||
from metagpt.tools.functions.libs.feature_engineering import (
|
||||
CatCount,
|
||||
CatCross,
|
||||
ExtractTimeComps,
|
||||
GeneralSelection,
|
||||
GroupStat,
|
||||
KFoldTargetMeanEncoder,
|
||||
PolynomialExpansion,
|
||||
SplitBins,
|
||||
TargetMeanEncoder,
|
||||
TreeBasedSelection,
|
||||
VarianceBasedSelection,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_dataset():
    """Build an eight-row frame covering the feature-engineering test cases.

    Columns: two numeric columns with one missing value each, an all-NaN
    numeric column, a categorical column with a missing value, a constant
    categorical column, ISO date strings and an alternating binary label.
    """
    row_count = 8
    columns = {
        "num1": [1, 2, np.nan, 4, 5, 6, 7, 3],
        "num2": [1, 3, 2, 1, np.nan, 5, 6, 4],
        "num3": [np.nan] * row_count,
        "cat1": ["A", "B", np.nan, "D", "E", "C", "B", "A"],
        "cat2": ["A"] * row_count,
        "date1": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
            "2020-01-04",
            "2020-01-05",
            "2020-01-06",
            "2020-01-07",
            "2020-01-08",
        ],
        "label": [0, 1] * (row_count // 2),
    }
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
def load_sklearn_data(data_name):
    """Load a scikit-learn dataset into a frame with a trailing ``label`` column.

    Args:
        data_name: One of ``"iris"``, ``"breast_cancer"`` or ``"housing"``.

    Returns:
        A DataFrame whose columns are the dataset's feature names followed by
        ``label`` holding the target values.

    Raises:
        ValueError: If ``data_name`` is not one of the supported names.
    """
    # Reject unknown names up front so the loaders below are only reached
    # for a supported dataset.
    if data_name not in ("iris", "breast_cancer", "housing"):
        raise ValueError("data_name not supported")

    if data_name == "iris":
        bunch = load_iris()
    elif data_name == "breast_cancer":
        bunch = load_breast_cancer()
    else:
        bunch = fetch_california_housing()

    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame["label"] = bunch.target
    return frame
|
||||
|
||||
|
||||
def test_polynomial_expansion(mock_dataset):
    """Check how many columns polynomial expansion adds on a small and a wide frame."""
    small = PolynomialExpansion(cols=["num1", "num2", "label"], degree=2, label_col="label")
    expanded = small.fit_transform(mock_dataset)

    added = len(expanded.columns) - len(mock_dataset.columns)
    assert added == 3

    # when too many columns
    cancer = load_sklearn_data("breast_cancer")
    feature_cols = [name for name in cancer.columns if name != "label"]
    wide = PolynomialExpansion(cols=feature_cols, degree=2, label_col="label")
    expanded = wide.fit_transform(cancer)

    added = len(expanded.columns) - len(cancer.columns)
    assert added == 55
|
||||
|
||||
|
||||
def test_cat_count(mock_dataset):
    """Count-encoding ``cat1`` must add ``cat1_cnt``; the first row's category occurs twice."""
    counter = CatCount(col="cat1")
    result = counter.fit_transform(mock_dataset)

    assert "cat1_cnt" in result.columns
    first_count = result["cat1_cnt"][0]
    assert first_count == 2
|
||||
|
||||
|
||||
def test_target_mean_encoder(mock_dataset):
    """Target-mean encoding ``cat1`` must add ``cat1_target_mean``; first row averages to 0.5."""
    encoder = TargetMeanEncoder(col="cat1", label="label")
    result = encoder.fit_transform(mock_dataset)

    assert "cat1_target_mean" in result.columns
    first_mean = result["cat1_target_mean"][0]
    assert first_mean == 0.5
|
||||
|
||||
|
||||
def test_kfold_target_mean_encoder(mock_dataset):
    """K-fold target-mean encoding ``cat1`` must add a ``cat1_kf_target_mean`` column."""
    encoder = KFoldTargetMeanEncoder(col="cat1", label="label")
    result_columns = encoder.fit_transform(mock_dataset).columns

    assert "cat1_kf_target_mean" in result_columns
|
||||
|
||||
|
||||
def test_cat_cross(mock_dataset):
    """The crossed column appears by default and is absent when ``max_cat_num=3``."""
    default_cross = CatCross(cols=["cat1", "cat2"])
    crossed = default_cross.fit_transform(mock_dataset)

    assert "cat1_cat2" in crossed.columns

    # NOTE(review): presumably cat1 exceeds the max_cat_num limit and is
    # excluded from crossing -- confirm against CatCross.fit.
    limited_cross = CatCross(cols=["cat1", "cat2"], max_cat_num=3)
    crossed = limited_cross.fit_transform(mock_dataset)

    assert "cat1_cat2" not in crossed.columns
|
||||
|
||||
|
||||
def test_group_stat(mock_dataset):
    """Grouped mean and sum of ``num1`` by ``cat1`` must each add a named column."""
    stats = GroupStat(group_col="cat1", agg_col="num1", agg_funcs=["mean", "sum"])
    result_columns = stats.fit_transform(mock_dataset).columns

    assert "num1_mean_by_cat1" in result_columns
    assert "num1_sum_by_cat1" in result_columns
|
||||
|
||||
|
||||
def test_split_bins(mock_dataset):
    """Binning ``num1`` must yield at most five distinct bins, each in the range [0, 5)."""
    binner = SplitBins(cols=["num1"])
    binned = binner.fit_transform(mock_dataset)["num1"]

    assert binned.nunique() <= 5
    assert all(0 <= bin_id < 5 for bin_id in binned)
|
||||
|
||||
|
||||
def test_extract_time_comps(mock_dataset):
    """Every requested time component must be extracted; check the first row's values."""
    # Expected components for the first row, whose date1 is "2020-01-01".
    expected_first_row = {
        "year": 2020,
        "month": 1,
        "day": 1,
        "hour": 0,
        "dayofweek": 3,
        "is_weekend": 0,
    }
    time_comps = list(expected_first_row)
    extractor = ExtractTimeComps(time_col="date1", time_comps=time_comps)
    result = extractor.fit_transform(mock_dataset.copy())

    for comp in time_comps:
        assert comp in result.columns
    for comp, value in expected_first_row.items():
        assert result[comp][0] == value
|
||||
|
||||
|
||||
def test_general_selection(mock_dataset):
    """General selection must drop the all-NaN ``num3`` and the constant ``cat2`` columns."""
    selector = GeneralSelection(label_col="label")
    kept_columns = selector.fit_transform(mock_dataset.copy()).columns

    for dropped in ("num3", "cat2"):
        assert dropped not in kept_columns
|
||||
|
||||
|
||||
def test_tree_based_selection(mock_dataset):
    """Tree-based selection must keep more than one column for every task type.

    The ``mock_dataset`` fixture is requested but not used; each case loads
    its own scikit-learn dataset.
    """
    cases = (
        ("housing", "reg"),  # regression
        ("breast_cancer", "cls"),  # classification
        ("iris", "mcls"),  # multi-classification
    )
    for data_name, task_type in cases:
        frame = load_sklearn_data(data_name)
        selector = TreeBasedSelection(label_col="label", task_type=task_type)
        selected = selector.fit_transform(frame)
        assert len(selected.columns) > 1
|
||||
|
||||
|
||||
def test_variance_based_selection(mock_dataset):
    """Variance-based selection must drop the all-NaN ``num3`` column."""
    selector = VarianceBasedSelection(label_col="label")
    kept_columns = selector.fit_transform(mock_dataset.copy()).columns

    assert "num3" not in kept_columns
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/11/17 10:24
|
||||
# @Author : lidanyang
|
||||
# @File : test_register.py
|
||||
# @Desc :
|
||||
import pytest
|
||||
|
||||
from metagpt.tools.functions.register.register import FunctionRegistry
|
||||
from metagpt.tools.functions.schemas.base import ToolSchema, tool_field
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def registry():
|
||||
return FunctionRegistry()
|
||||
|
||||
|
||||
class AddNumbers(ToolSchema):
|
||||
"""Add two numbers"""
|
||||
|
||||
num1: int = tool_field(description="First number")
|
||||
num2: int = tool_field(description="Second number")
|
||||
|
||||
|
||||
def test_register(registry):
|
||||
@registry.register("module1", AddNumbers)
|
||||
def add_numbers(num1, num2):
|
||||
return num1 + num2
|
||||
|
||||
assert len(registry.functions["module1"]) == 1
|
||||
assert "add_numbers" in registry.functions["module1"]
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
|
||||
@registry.register("module1", AddNumbers)
|
||||
def add_numbers(num1, num2):
|
||||
return num1 + num2
|
||||
|
||||
func = registry.get("module1", "add_numbers")
|
||||
assert func["func"](1, 2) == 3
|
||||
assert func["schema"] == {
|
||||
"name": "add_numbers",
|
||||
"description": "Add two numbers",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"num1": {"description": "First number", "type": "int"},
|
||||
"num2": {"description": "Second number", "type": "int"},
|
||||
},
|
||||
"required": ["num1", "num2"],
|
||||
},
|
||||
}
|
||||
|
||||
module1_funcs = registry.get_all_by_module("module1")
|
||||
assert len(module1_funcs) == 1
|
||||
Loading…
Add table
Add a link
Reference in a new issue