Support categorical features (#14)

* support categorical features

* update the documentation and readme
added a test for the case where cells are supplied as a param.

* add big tests (adult test and iris)
and fixed bugs

* update transform to return numpy if original data is numpy

* added nursery test

* break loop if there is an illegal level

* Stop pruning one step before passing accuracy threshold

* adding asserts and fix DecisionTreeClassifier init

* Fix tests

Co-authored-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
olasaadi 2022-01-11 09:51:04 +02:00 committed by GitHub
parent c1450865b1
commit 2eb626c00c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 666 additions and 104 deletions

View file

@ -1,18 +1,22 @@
"""
This module implements all classes needed to perform data minimization
"""
from typing import Union
import pandas as pd
import numpy as np
import copy
import sys
from scipy.spatial import distance
from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
""" A transformer that generalizes data to representative points.
@ -44,6 +48,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
features : list of str, optional
The feature names, in the order that they appear in the data.
categorical_features: list of str, optional
The list of categorical features should only be supplied when
passing data as a pandas dataframe.
cells : list of object, optional
The cells used to generalize records. Each cell must define a
range or subset of categories for each feature, as well as a
@ -70,11 +78,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
"""
def __init__(self, estimator=None, target_accuracy=0.998, features=None,
cells=None):
cells=None, categorical_features=None):
self.estimator = estimator
self.target_accuracy = target_accuracy
self.features = features
self.cells = cells
self.categorical_features = []
if categorical_features:
self.categorical_features = categorical_features
self.is_numpy = True
def get_params(self, deep=True):
"""Get parameters for this estimator.
@ -121,7 +133,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def generalizations(self):
return self.generalizations_
def fit_transform(self, X=None, y=None):
def fit_transform(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
"""Learns the generalizations based on training data, and applies them to the data.
Parameters
@ -134,13 +146,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
Returns
-------
self : object
Returns self.
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
"""
self.fit(X, y)
return self.transform(X)
def fit(self, X=None, y=None):
def fit(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
"""Learns the generalizations based on training data.
Parameters
@ -153,15 +166,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
Returns
-------
X_transformed : ndarray, shape (n_samples, n_features)
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
"""
# take into account that estimator, X, y, cells, features may be None
if X is not None:
if type(X) == np.ndarray:
self.is_numpy = True
else:
self.is_numpy = False
if X is not None and y is not None:
X, y = check_X_y(X, y, accept_sparse=True)
if self.is_numpy:
X, y = check_X_y(X, y, accept_sparse=True)
self.n_features_ = X.shape[1]
elif self.features:
self.n_features_ = len(self.features)
@ -180,6 +199,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.cells_ = self.cells
else:
self.cells_ = {}
self.categorical_values = {}
if self.is_numpy:
X = pd.DataFrame(X, columns=self._features)
# Going to fit
@ -187,36 +210,67 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if self.estimator and X is not None and y is not None:
# divide dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
test_size = 0.4,
random_state = 18)
test_size=0.4,
random_state=18)
# collect feature data (such as min, max)
train_data = pd.DataFrame(X_train, columns=self._features)
feature_data = {}
for feature in self._features:
if not feature in feature_data.keys():
values = list(train_data.loc[:, feature])
if feature not in feature_data.keys():
fd = {}
fd['min'] = min(values)
fd['max'] = max(values)
values = list(X.loc[:, feature])
if feature not in self.categorical_features:
fd['min'] = min(values)
fd['max'] = max(values)
fd['range'] = max(values) - min(values)
else:
fd['range'] = len(values)
feature_data[feature] = fd
# prepare data for DT
categorical_features = list(self.categorical_features)
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
# numeric_features = list(self._features) - list(self.categorical_features)
numeric_features = [item for item in self._features if item not in self.categorical_features]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
preprocessor.fit(X)
x_prepared = preprocessor.transform(X_train)
self.preprocessor = preprocessor
self.cells_ = {}
self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
self.dt_.fit(X_train, y_train)
self.dt_.fit(x_prepared, y_train)
self._modify_categorical_features(X)
x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns)
self._calculate_cells()
self._modify_cells()
nodes = self._get_nodes_level(0)
self._attach_cells_representatives(X_train, y_train, nodes)
self._attach_cells_representatives(x_prepared, X_train, y_train, nodes)
# self.cells_ currently holds the generalization created from the tree leaves
self._calculate_generalizations()
# apply generalizations to test data
generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
x_prepared_test = preprocessor.transform(X_test)
x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self.categorical_data.columns)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
# check accuracy
accuracy = self.estimator.score(generalized, y_test)
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@ -225,29 +279,44 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
print('Improving generalizations')
level = 1
while accuracy > self.target_accuracy:
nodes = self._get_nodes_level(level)
self._calculate_level_cells(level)
self._attach_cells_representatives(X_train, y_train, nodes)
self._calculate_generalizations()
generalized = self._generalize(X_test, nodes, self.cells_,
self.cells_by_id_)
accuracy = self.estimator.score(generalized, y_test)
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
level+=1
try:
cells_previous_iter = self.cells_
generalization_prev_iter = self.generalizations_
cells_by_id_prev = self.cells_by_id_
nodes = self._get_nodes_level(level)
self._calculate_level_cells(level)
self._attach_cells_representatives(x_prepared, X_train, y_train, nodes)
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
self.cells_by_id_)
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells_ = cells_previous_iter
self.generalizations_ = generalization_prev_iter
self.cells_by_id_ = cells_by_id_prev
break
else:
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
level += 1
except Exception as e:
print(e)
break
# if accuracy below threshold, improve accuracy by removing features from generalization
if accuracy < self.target_accuracy:
elif accuracy < self.target_accuracy:
print('Improving accuracy')
while accuracy < self.target_accuracy:
removed_feature = self._remove_feature_from_generalization(X_test,
removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test,
nodes, y_test,
feature_data, accuracy)
if removed_feature is None:
break
self._calculate_generalizations()
generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
accuracy = self.estimator.score(generalized, y_test)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self.cells_ currently holds the chosen generalization based on target accuracy
@ -258,17 +327,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Return the transformer
return self
def transform(self, X):
def transform(self, X: Union[np.ndarray, pd.DataFrame]):
""" Transforms data records to representative points.
Parameters
----------
X : {array-like, sparse-matrix}, shape (n_samples, n_features)
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
may contain both numeric and categorical data.
The input samples.
Returns
-------
X_transformed : ndarray, shape (n_samples, n_features)
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
"""
@ -279,8 +349,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
'appropriate arguments before using this method.'
check_is_fitted(self, ['cells', 'features'], msg=msg)
# Input validation
X = check_array(X, accept_sparse=True)
if type(X) == np.ndarray:
# Input validation
X = check_array(X, accept_sparse=True)
self.is_numpy = True
X = pd.DataFrame(X, columns=self._features)
else:
self.is_numpy = False
if X.shape[1] != self.n_features_ and self.n_features_ != 0:
raise ValueError('Shape of input is different from what was seen'
'in `fit`')
@ -300,8 +377,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# if feature has a representative value in the cell and should not
# be left untouched, take the representative value
if feature in self.cells_[i]['representative'] and \
( 'untouched' not in self.cells_[i] \
or feature not in self.cells_[i]['untouched'] ):
('untouched' not in self.cells_[i]
or feature not in self.cells_[i]['untouched']):
representatives.loc[i, feature] = self.cells_[i]['representative'][feature]
# else, drop the feature (removes from representatives columns that
# do not have a representative value or should remain untouched)
@ -315,30 +392,57 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# values (leaves others untouched)
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
replace = pd.concat([representatives.loc[i].to_frame().T] * len(indexes)).reset_index(drop=True)
else:
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
return generalized.to_numpy()
if self.is_numpy:
return generalized.to_numpy()
return generalized
def _get_record_indexes_for_cell(self, X, cell, mapped):
return [i for i, x in enumerate(X) if not mapped.item(i) and
self._cell_contains(cell, x, i, mapped)]
indexes = []
for index, row in X.iterrows():
if not mapped.item(index) and self._cell_contains(cell, row, index, mapped):
indexes.append(index)
return indexes
def _cell_contains(self, cell, x, i, mapped):
for f in self._features:
if f in cell['ranges']:
if not self._cell_contains_numeric(f, cell['ranges'][f], x):
return False
elif f in cell['categories']:
if not self._cell_contains_categorical(f, cell['categories'][f], x):
return False
elif f in cell['untouched']:
continue
else:
#TODO: exception - feature not defined
pass
raise TypeError("feature " + f + "not found in cell" + cell['id'])
# Mark as mapped
mapped.itemset(i, 1)
return True
def _modify_categorical_features(self, X):
self.categorical_values = {}
self.oneHotVectorFeaturesToFeatures = {}
features_to_remove = []
for feature in self.categorical_features:
try:
all_values = X.loc[:, feature]
values = list(all_values.unique())
self.categorical_values[feature] = values
X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
ohe = pd.get_dummies(X[feature], prefix=feature)
for oneHotVectorFeature in ohe.columns:
self.oneHotVectorFeaturesToFeatures[oneHotVectorFeature] = feature
X = pd.concat([X, ohe], axis=1)
features_to_remove.append(feature)
except KeyError:
print("feature " + feature + "not found in training data")
self.categorical_data = X.drop(features_to_remove, axis=1)
def _cell_contains_numeric(self, f, range, x):
i = self._features.index(f)
# convert x to ndarray to allow indexing
@ -352,6 +456,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return False
return True
def _cell_contains_categorical(self, f, range, x):
i = self._features.index(f)
# convert x to ndarray to allow indexing
a = np.array(x)
value = a.item(i)
if value in range:
return True
return False
def _calculate_cells(self):
self.cells_by_id_ = {}
self.cells_ = self._calculate_cells_recursive(0)
@ -366,7 +479,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return [cell]
cells = []
feature = self._features[feature_index]
feature = self.categorical_data.columns[feature_index]
threshold = self.dt_.tree_.threshold[node]
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
@ -397,22 +510,38 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def _modify_cells(self):
cells = []
features = self.categorical_data.columns
for cell in self.cells_:
new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {},
'categories': {}, 'hist': cell['hist'], 'representative': None}
for feature in self._features:
if feature in cell['ranges'].keys():
new_cell['ranges'][feature] = cell['ranges'][feature]
new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {}, 'categories': {}, 'hist': cell['hist'],
'representative': None}
for feature in features:
if feature in self.oneHotVectorFeaturesToFeatures.keys():
# feature is categorical and should be mapped
categorical_feature = self.oneHotVectorFeaturesToFeatures[feature]
if categorical_feature not in new_cell['categories'].keys():
new_cell['categories'][categorical_feature] = self.categorical_values[
categorical_feature].copy()
if feature in cell['ranges'].keys():
categorical_value = feature[len(categorical_feature) + 1:]
if cell['ranges'][feature]['start'] is not None:
# categorical feature must have this value
new_cell['categories'][categorical_feature] = [categorical_value]
else:
# categorical feature can not have this value
if categorical_value in new_cell['categories'][categorical_feature]:
new_cell['categories'][categorical_feature].remove(categorical_value)
else:
new_cell['ranges'][feature] = {'start': None, 'end': None}
if feature in cell['ranges'].keys():
new_cell['ranges'][feature] = cell['ranges'][feature]
else:
new_cell['ranges'][feature] = {'start': None, 'end': None}
cells.append(new_cell)
self.cells_by_id_[new_cell['id']] = new_cell
self.cells_ = cells
def _calculate_level_cells(self, level):
if level < 0 or level > self.dt_.get_depth():
#TODO: exception 'Illegal level %d' % level
pass
raise TypeError("Illegal level %d' % level", level)
if level > 0:
new_cells = []
@ -420,7 +549,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
nodes = self._get_nodes_level(level)
if nodes:
for node in nodes:
if self.dt_.tree_.feature[node] == -2: # leaf node
if self.dt_.tree_.feature[node] == -2: # leaf node
new_cell = self.cells_by_id_[node]
else:
left_child = self.dt_.tree_.children_left[node]
@ -474,23 +603,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# return all nodes with depth == level or leaves higher than level
return [i for i, x in enumerate(node_depth) if x == depth or (x < depth and is_leaves[i])]
def _attach_cells_representatives(self, samples, labels, level_nodes):
samples_df = pd.DataFrame(samples, columns=self._features)
labels_df = pd.DataFrame(labels, columns=['label'])
samples_node_ids = self._find_sample_nodes(samples_df, level_nodes)
def _attach_cells_representatives(self, prepared_data, originalTrainFeatures, labelFeature, level_nodes):
# prepared data include one hot encoded categorical data,
# if there is no categorical data prepared data is original data
nodeIds = self._find_sample_nodes(prepared_data, level_nodes)
labels_df = pd.DataFrame(labelFeature, columns=['label'])
for cell in self.cells_:
cell['representative'] = {}
# get all rows in cell
indexes = [i for i, x in enumerate(samples_node_ids) if x == cell['id']]
sample_rows = samples_df.iloc[indexes]
indexes = [i for i, x in enumerate(nodeIds) if x == cell['id']]
original_rows = originalTrainFeatures.iloc[indexes]
sample_rows = prepared_data.iloc[indexes]
sample_labels = labels_df.iloc[indexes]['label'].values.tolist()
# get rows with matching label
indexes = [i for i, label in enumerate(sample_labels) if label == cell['label']]
match_samples = sample_rows.iloc[indexes]
match_rows = original_rows.iloc[indexes]
# find the "middle" of the cluster
array = match_samples.values
# Only works with numpy 1.9.0 and higher!!!
median = np.median(array, axis=0)
# find the record closest to the median
i = 0
min = len(array)
min_dist = float("inf")
@ -500,19 +632,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
min_dist = dist
min = i
i = i + 1
row = match_samples.iloc[min]
# use its values as the representative
row = match_rows.iloc[min]
for feature in cell['ranges'].keys():
cell['representative'][feature] = row[feature].item()
cell['representative'][feature] = row[feature]
for feature in cell['categories'].keys():
cell['representative'][feature] = row[feature]
def _find_sample_nodes(self, samples, nodes):
paths = self.dt_.decision_path(samples).toarray()
nodeSet = set(nodes)
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
def _generalize(self, data, level_nodes, cells, cells_by_id):
representatives = pd.DataFrame(columns=self._features) # empty except for columns
generalized = pd.DataFrame(data, columns=self._features, copy=True) # original data
def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
# prepared data include one hot encoded categorical data + QI
representatives = pd.DataFrame(columns=self._features) # empty except for columns
generalized = pd.DataFrame(prepared_data, columns=self.categorical_data.columns, copy=True)
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
mapping_to_cells = self._map_to_cells(generalized, level_nodes, cells_by_id)
# iterate over cells (leaves in decision tree)
for i in range(len(cells)):
@ -530,32 +665,39 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
indexes = [j for j in range(len(mapping_to_cells)) if mapping_to_cells[j]['id'] == cells[i]['id']]
indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
# replaces the values in the representative columns with the representative values
# (leaves others untouched)
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
replace = pd.concat([representatives.loc[i].to_frame().T] * len(indexes)).reset_index(drop=True)
else:
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
# replace = self.preprocessor.transform(replace)
replace = pd.DataFrame(replace, indexes, columns=self._features)
original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace
return generalized.to_numpy()
return original_data_generalized
def _map_to_cells(self, samples, nodes, cells_by_id):
mapping_to_cells = []
mapping_to_cells = {}
for index, row in samples.iterrows():
cell = self._find_sample_cells([row], nodes, cells_by_id)[0]
mapping_to_cells.append(cell)
mapping_to_cells[index] = cell
return mapping_to_cells
def _find_sample_cells(self, samples, nodes, cells_by_id):
node_ids = self._find_sample_nodes(samples, nodes)
return [cells_by_id[nodeId] for nodeId in node_ids]
def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data, current_accuracy):
feature = self._get_feature_to_remove(samples, nodes, labels, feature_data, current_accuracy)
def _remove_feature_from_generalization(self, original_data, prepared_data, nodes, labels, feature_data,
current_accuracy):
# prepared data include one hot encoded categorical data,
# if there is no categorical data prepared data is original data
feature = self._get_feature_to_remove(original_data, prepared_data, nodes, labels, feature_data,
current_accuracy)
if feature is None:
return None
GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
@ -563,14 +705,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# self.generalizations_['untouched'].append(feature)
return feature
def _get_feature_to_remove(self, samples, nodes, labels, feature_data, current_accuracy):
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
# prepared data include one hot encoded categorical data,
# if there is no categorical data prepared data is original data
# We want to remove features with low iLoss (NCP) and high accuracy gain
# (after removing them)
ranges = self.generalizations_['ranges']
range_counts = self._find_range_count(samples, ranges)
total = samples.size
range_counts = self._find_range_count(original_data, ranges)
total = prepared_data.size
range_min = sys.float_info.max
remove_feature = None
categories = self.generalizations['categories']
category_counts = self._find_categories_count(original_data, categories)
for feature in ranges.keys():
if feature not in self.generalizations_['untouched']:
@ -583,8 +729,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
new_cells = copy.deepcopy(self.cells_)
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(samples, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(generalized, labels) - current_accuracy
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(self.preprocessor.transform(generalized),
labels) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
@ -594,15 +741,39 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
range_min = feature_ncp
remove_feature = feature
for feature in categories.keys():
if feature not in self.generalizations['untouched']:
feature_ncp = self._calc_ncp_categorical(categories[feature],
category_counts[feature],
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy loss
new_cells = copy.deepcopy(self.cells_)
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(self.preprocessor.transform(generalized),
labels) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
if feature_ncp < range_min:
range_min = feature_ncp
remove_feature = feature
print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none'))
return remove_feature
def _calculate_generalizations(self):
self.generalizations_ = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells_),
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells_)}
'categories': GeneralizeToRepresentative._calculate_categories(self.cells_),
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells_)}
def _find_range_count(self, samples, ranges):
samples_df = pd.DataFrame(samples, columns=self._features)
samples_df = pd.DataFrame(samples, columns=self.categorical_data.columns)
range_counts = {}
last_value = None
for r in ranges.keys():
@ -612,22 +783,42 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
range_counts[r].append(samples_df.shape[0])
else:
for value in ranges[r]:
range_counts[r].append(len(samples_df.loc[samples_df[r] <= value]))
counter = [item for item in samples_df[r] if int(item) <= value]
range_counts[r].append(len(counter))
last_value = value
range_counts[r].append(len(samples_df.loc[samples_df[r] > last_value]))
counter = [item for item in samples_df[r] if int(item) <= last_value]
range_counts[r].append(len(counter))
return range_counts
def _find_categories_count(self, samples, categories):
category_counts = {}
for c in categories.keys():
category_counts[c] = []
for value in categories[c]:
category_counts[c].append(len(samples.loc[samples[c].isin(value)]))
return category_counts
def _calculate_ncp(self, samples, generalizations, feature_data):
# supressed features are already taken care of within _calc_ncp_numeric
ranges = generalizations['ranges']
categories = generalizations['categories']
range_counts = self._find_range_count(samples, ranges)
category_counts = self._find_categories_count(samples, categories)
total = samples.shape[0]
total_ncp = 0
total_features = len(generalizations['untouched'])
for feature in ranges.keys():
feature_ncp = GeneralizeToRepresentative._calc_ncp_numeric(ranges[feature], range_counts[feature], feature_data[feature], total)
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
feature_data[feature], total)
total_ncp = total_ncp + feature_ncp
total_features += 1
for feature in categories.keys():
featureNCP = self._calc_ncp_categorical(categories[feature], category_counts[feature],
feature_data[feature],
total)
total_ncp = total_ncp + featureNCP
total_features += 1
if total_features == 0:
return 0
return total_ncp / total_features
@ -649,6 +840,55 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
ranges[feature].sort()
return ranges
@staticmethod
def _calculate_categories(cells):
    """Partition each categorical feature's values into groups that the cells
    cannot tell apart.

    Two values land in the same partition when no cell's category set
    contains one without the other (see ``_are_inseparable``). The result
    maps each feature to a list of such partitions, covering every value
    collected by ``_calculate_categorical_features_values``.
    """
    categories = {}
    categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
    for feature in categorical_features_values.keys():
        partitions = []
        values = categorical_features_values[feature]
        # values already placed in some partition; skipped when revisited
        assigned = []
        for i in range(len(values)):
            value1 = values[i]
            if value1 in assigned:
                continue
            # start a new partition seeded with this value
            partition = [value1]
            assigned.append(value1)
            for j in range(len(values)):
                # only look ahead: pairs (i, j) with j <= i were already tested
                if j <= i:
                    continue
                value2 = values[j]
                if GeneralizeToRepresentative._are_inseparable(cells, feature, value1, value2):
                    partition.append(value2)
                    assigned.append(value2)
            partitions.append(partition)
        categories[feature] = partitions
    return categories
@staticmethod
def _calculate_categorical_features_values(cells):
categorical_features_values = {}
for cell in cells:
for feature in [key for key in cell['categories'].keys() if
'untouched' not in cell or key not in cell['untouched']]:
if feature not in categorical_features_values.keys():
categorical_features_values[feature] = []
for value in cell['categories'][feature]:
if value not in categorical_features_values[feature]:
categorical_features_values[feature].append(value)
return categorical_features_values
@staticmethod
def _are_inseparable(cells, feature, value1, value2):
for cell in cells:
if feature not in cell['categories'].keys():
continue
value1_in = value1 in cell['categories'][feature]
value2_in = value2 in cell['categories'][feature]
if value1_in != value2_in:
return False
return True
@staticmethod
def _calculate_untouched(cells):
untouched_lists = [cell['untouched'] if 'untouched' in cell else [] for cell in cells]
@ -656,6 +896,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
untouched = untouched.intersection(*untouched_lists)
return list(untouched)
@staticmethod
def _calc_ncp_categorical(categories, categoryCount, feature_data, total):
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, categoryCount)]
average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
return average_group_size / feature_data['range'] # number of values in category
@staticmethod
def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
# if there are no ranges, feature is supressed and iLoss is 1
@ -669,7 +916,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
return average_range_size / (feature_data['max'] - feature_data['min'])
@staticmethod
def _remove_feature_from_cells(cells, cells_by_id, feature):
for cell in cells: