From 30cb70506207d778c1eedd3417399e18ecf22c5f Mon Sep 17 00:00:00 2001 From: abigailt Date: Thu, 5 Oct 2023 13:58:40 +0300 Subject: [PATCH] No default encoder; if none is provided, data is supplied to the model as is. Fix data type of representative values. Fix and add more tests. Signed-off-by: abigailt --- apt/minimization/minimizer.py | 56 ++++++------ tests/test_minimizer.py | 162 ++++++++++++++++++++++------------ 2 files changed, 134 insertions(+), 84 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 29b7eb0..e0760fe 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -57,7 +57,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :param categorical_features: The list of categorical features (if supplied, these featurtes will be one-hot encoded before using them to train the decision tree model). :param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical - features) + features). If not provided, the data will be fed as is directly to the estimator. :type encoder: sklearn OrdinalEncoder or OneHotEncoder :type categorical_features: list of strings, optional :param features_to_minimize: The features to be minimized. 
@@ -256,7 +256,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # Going to fit # (currently not dealing with option to fit with only X and y and no estimator) if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None: - dtype = dataset.get_samples().dtype x = pd.DataFrame(dataset.get_samples(), columns=self._features) if not self.features_to_minimize: self.features_to_minimize = self._features @@ -293,21 +292,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # collect feature data (such as min, max) self._feature_data = self._get_feature_data(x) - # default encoder in case none provided - if self.encoder is None: - numeric_features = [f for f in self._features if f not in self.categorical_features] - numeric_transformer = Pipeline( - steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] - ) - categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) - self.encoder = ColumnTransformer( - transformers=[ - ("num", numeric_transformer, numeric_features), - ("cat", categorical_transformer, self.categorical_features), - ] - ) - self.encoder.fit(x) - self.cells = [] self._categorical_values = {} @@ -341,7 +325,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM generalized = self._generalize_from_generalizations(x_test, self.generalizations) # check accuracy - accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test)) + if self.encoder: + accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test)) + else: + accuracy = self.estimator.score(ArrayDataset(generalized, y_test)) print('Initial accuracy of model on generalized data, relative to original model predictions ' '(base generalization derived from tree, before improvements): %f' % accuracy) @@ -371,8 +358,10 @@ class 
GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM else: generalized = self._generalize_from_generalizations(x_test, self.generalizations) - accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), - y_test)) + if self.encoder: + accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test)) + else: + accuracy = self.estimator.score(ArrayDataset(generalized, y_test)) # if accuracy passed threshold roll back to previous iteration generalizations if accuracy < self.target_accuracy: self.cells = cells_previous_iter @@ -401,8 +390,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._cells_by_id) else: generalized = self._generalize_from_generalizations(x_test, self.generalizations) - accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), - y_test)) + + if self.encoder: + accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test)) + else: + accuracy = self.estimator.score(ArrayDataset(generalized, y_test)) print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy)) # self._cells currently holds the chosen generalization based on target accuracy @@ -893,7 +885,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def _generalize_indexes(self, original_data, cells, all_indexes): # prepared data include one hot encoded categorical data + QI - representatives = pd.DataFrame(columns=self._features) # empty except for columns + dtypes = original_data.dtypes.to_dict() + new_dtypes = {} + for t in dtypes.keys(): + new_dtypes[t] = pd.Series(dtype=dtypes[t].name) + representatives = pd.DataFrame(new_dtypes) # empty except for columns original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True) # iterate over cells (leaves in decision tree) @@ -1000,8 +996,11 @@ class 
GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells, cells_by_id) - accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), - labels)) - current_accuracy + if self.encoder: + accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), + labels)) - current_accuracy + else: + accuracy_gain = self.estimator.score(ArrayDataset(generalized, labels)) - current_accuracy if accuracy_gain < 0: accuracy_gain = 0 if accuracy_gain != 0: @@ -1027,8 +1026,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells, cells_by_id) - accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), - labels)) - current_accuracy + if self.encoder: + accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), + labels)) - current_accuracy + else: + accuracy_gain = self.estimator.score(ArrayDataset(generalized, labels)) - current_accuracy if accuracy_gain < 0: accuracy_gain = 0 diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 135adcf..ca34fbd 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -11,6 +11,8 @@ from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder +from torch import nn, optim + import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Input @@ -24,6 +26,9 @@ from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegresso tf.compat.v1.disable_eager_execution() +ACCURACY_DIFF = 0.05 + 
+ @pytest.fixture def diabetes_dataset(): return load_diabetes() @@ -286,7 +291,7 @@ def test_minimizer_fit(data_two_features): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_minimizer_ncp(data_two_features): @@ -348,12 +353,15 @@ def test_minimizer_ncp_categorical(data_four_features): train_dataset = ArrayDataset(x, predictions, features_names=features) gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, - categorical_features=categorical_features, generalize_using_transform=False) + categorical_features=categorical_features, + generalize_using_transform=False, + encoder=preprocessor) gen1.fit(dataset=train_dataset) ncp1 = gen1.ncp.fit_score ncp2 = gen1.calculate_ncp(ad1) - gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features) + gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features, + encoder=preprocessor) gen2.fit(dataset=train_dataset) ncp3 = gen2.ncp.fit_score gen2.transform(dataset=ad1) @@ -414,7 +422,8 @@ def test_minimizer_fit_pandas(data_four_features): # Now we have a full prediction pipeline. 
target_accuracy = 0.5 gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, - categorical_features=categorical_features) + categorical_features=categorical_features, + encoder=preprocessor) train_dataset = ArrayDataset(x, predictions) gen.fit(dataset=train_dataset) transformed = gen.transform(dataset=ArrayDataset(x)) @@ -428,7 +437,7 @@ def test_minimizer_fit_pandas(data_four_features): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_minimizer_params_categorical(cells_categorical): @@ -450,13 +459,14 @@ def test_minimizer_params_categorical(cells_categorical): # Now we have a full prediction pipeline. target_accuracy = 0.5 gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, - categorical_features=categorical_features, cells=cells) + categorical_features=categorical_features, cells=cells, + encoder=preprocessor) train_dataset = ArrayDataset(x, predictions) gen.fit(dataset=train_dataset) transformed = gen.transform(dataset=ArrayDataset(x)) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_minimizer_fit_qi(data_three_features): @@ -484,7 +494,7 @@ def test_minimizer_fit_qi(data_three_features): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def 
test_minimizer_fit_pandas_qi(data_five_features): @@ -508,7 +518,8 @@ def test_minimizer_fit_pandas_qi(data_five_features): # Now we have a full prediction pipeline. target_accuracy = 0.5 gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, - categorical_features=categorical_features, features_to_minimize=qi) + categorical_features=categorical_features, features_to_minimize=qi, + encoder=preprocessor) train_dataset = ArrayDataset(x, predictions) gen.fit(dataset=train_dataset) transformed = gen.transform(dataset=ArrayDataset(x)) @@ -523,7 +534,7 @@ def test_minimizer_fit_pandas_qi(data_five_features): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_minimize_ndarray_iris(): @@ -552,7 +563,7 @@ def test_minimize_ndarray_iris(): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_minimize_pandas_adult(): @@ -582,7 +593,8 @@ def test_minimize_pandas_adult(): predictions = np.argmax(predictions, axis=1) target_accuracy = 0.7 gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, - categorical_features=categorical_features, features_to_minimize=qi) + categorical_features=categorical_features, features_to_minimize=qi, + encoder=preprocessor) gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features)) transformed = gen.transform(dataset=ArrayDataset(x_train)) gener = gen.generalizations @@ -609,7 +621,7 @@ def test_minimize_pandas_adult(): check_ncp(ncp, expected_generalizations) rel_accuracy = 
model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_german_credit_pandas(): @@ -637,7 +649,8 @@ def test_german_credit_pandas(): predictions = np.argmax(predictions, axis=1) target_accuracy = 0.7 gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, - categorical_features=categorical_features, features_to_minimize=qi) + categorical_features=categorical_features, features_to_minimize=qi, + encoder=preprocessor) gen.fit(dataset=ArrayDataset(x_train, predictions)) transformed = gen.transform(dataset=ArrayDataset(x_train)) gener = gen.generalizations @@ -666,7 +679,7 @@ def test_german_credit_pandas(): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_regression(diabetes_dataset): @@ -726,7 +739,7 @@ def test_regression(diabetes_dataset): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_x_y(): @@ -766,7 +779,7 @@ def test_x_y(): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_x_y_features_names(): @@ -806,7 +819,7 @@ def test_x_y_features_names(): 
check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_BaseEstimator_classification(data_five_features): @@ -828,7 +841,8 @@ def test_BaseEstimator_classification(data_five_features): # Now we have a full prediction pipeline. target_accuracy = 0.5 gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, - categorical_features=categorical_features, features_to_minimize=QI) + categorical_features=categorical_features, features_to_minimize=QI, + encoder=preprocessor) train_dataset = ArrayDataset(x, predictions) gen.fit(dataset=train_dataset) transformed = gen.transform(dataset=ArrayDataset(x)) @@ -844,7 +858,7 @@ def test_BaseEstimator_classification(data_five_features): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(preprocessor.transform(transformed), predictions) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_BaseEstimator_regression(diabetes_dataset): @@ -903,7 +917,7 @@ def test_BaseEstimator_regression(diabetes_dataset): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(transformed, predictions) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_keras_model(): @@ -936,7 +950,39 @@ def test_keras_model(): check_ncp(ncp, gener) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) 
+ + +class PytorchModel(nn.Module): + + def __init__(self, num_classes, num_features): + super(PytorchModel, self).__init__() + + self.fc1 = nn.Sequential( + nn.Linear(num_features, 1024), + nn.Tanh(), ) + + self.fc2 = nn.Sequential( + nn.Linear(1024, 512), + nn.Tanh(), ) + + self.fc3 = nn.Sequential( + nn.Linear(512, 256), + nn.Tanh(), ) + + self.fc4 = nn.Sequential( + nn.Linear(256, 128), + nn.Tanh(), + ) + + self.classifier = nn.Linear(128, num_classes) + + def forward(self, x): + out = self.fc1(x) + out = self.fc2(out) + out = self.fc3(out) + out = self.fc4(out) + return self.classifier(out) def test_minimizer_pytorch(data_three_features): @@ -944,49 +990,17 @@ def test_minimizer_pytorch(data_three_features): x = x.astype(np.float32) qi = ['age', 'weight'] - from torch import nn, optim from apt.utils.datasets.datasets import PytorchData from apt.utils.models.pytorch_model import PyTorchClassifier - class pytorch_model(nn.Module): - - def __init__(self, num_classes, num_features): - super(pytorch_model, self).__init__() - - self.fc1 = nn.Sequential( - nn.Linear(num_features, 1024), - nn.Tanh(), ) - - self.fc2 = nn.Sequential( - nn.Linear(1024, 512), - nn.Tanh(), ) - - self.fc3 = nn.Sequential( - nn.Linear(512, 256), - nn.Tanh(), ) - - self.fc4 = nn.Sequential( - nn.Linear(256, 128), - nn.Tanh(), - ) - - self.classifier = nn.Linear(128, num_classes) - - def forward(self, x): - out = self.fc1(x) - out = self.fc2(out) - out = self.fc3(out) - out = self.fc4(out) - return self.classifier(out) - - base_est = pytorch_model(2, 3) + base_est = PytorchModel(2, 3) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(base_est.parameters(), lr=0.01) model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion, optimizer=optimizer, input_shape=(3,), nb_classes=2) - model.fit(PytorchData(x.astype(np.float32), y), save_entire_model=False, nb_epochs=10) + model.fit(PytorchData(x, y), save_entire_model=False, nb_epochs=10) ad = 
ArrayDataset(x) predictions = model.predict(ad) @@ -1006,7 +1020,41 @@ def test_minimizer_pytorch(data_three_features): check_ncp(ncp, expected_generalizations) rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions)) - assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) + + +def test_minimizer_pytorch_iris(): + features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] + (x_train, y_train), _ = get_iris_dataset_np() + x_train = x_train.astype(np.float32) + qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] + + from apt.utils.datasets.datasets import PytorchData + from apt.utils.models.pytorch_model import PyTorchClassifier + + base_est = PytorchModel(3, 4) + criterion = nn.CrossEntropyLoss() + optimizer = optim.Adam(base_est.parameters(), lr=0.01) + + model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion, + optimizer=optimizer, input_shape=(4,), + nb_classes=3) + model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10) + + predictions = model.predict(ArrayDataset(x_train)) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + target_accuracy = 0.99 + gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi) + transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features)) + gener = gen.generalizations + + check_features(features, gener, transformed, x_train) + ncp = gen.ncp.transform_score + check_ncp(ncp, gener) + + rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions)) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) def test_untouched():