mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-26 15:49:37 +02:00
Support pytorch models in data minimization (#85)
* Support pytorch models in data minimization Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
a40484e0c9
commit
26addd192f
2 changed files with 76 additions and 3 deletions
|
|
@ -256,6 +256,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
# Going to fit
|
# Going to fit
|
||||||
# (currently not dealing with option to fit with only X and y and no estimator)
|
# (currently not dealing with option to fit with only X and y and no estimator)
|
||||||
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
|
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
|
||||||
|
dtype = dataset.get_samples().dtype
|
||||||
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
|
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
|
||||||
if not self.features_to_minimize:
|
if not self.features_to_minimize:
|
||||||
self.features_to_minimize = self._features
|
self.features_to_minimize = self._features
|
||||||
|
|
@ -340,7 +341,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||||
|
|
||||||
# check accuracy
|
# check accuracy
|
||||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test))
|
||||||
print('Initial accuracy of model on generalized data, relative to original model predictions '
|
print('Initial accuracy of model on generalized data, relative to original model predictions '
|
||||||
'(base generalization derived from tree, before improvements): %f' % accuracy)
|
'(base generalization derived from tree, before improvements): %f' % accuracy)
|
||||||
|
|
||||||
|
|
@ -370,7 +371,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
else:
|
else:
|
||||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||||
|
|
||||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
|
||||||
|
y_test))
|
||||||
# if accuracy passed threshold roll back to previous iteration generalizations
|
# if accuracy passed threshold roll back to previous iteration generalizations
|
||||||
if accuracy < self.target_accuracy:
|
if accuracy < self.target_accuracy:
|
||||||
self.cells = cells_previous_iter
|
self.cells = cells_previous_iter
|
||||||
|
|
@ -399,7 +401,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._cells_by_id)
|
self._cells_by_id)
|
||||||
else:
|
else:
|
||||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
|
||||||
|
y_test))
|
||||||
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
||||||
|
|
||||||
# self._cells currently holds the chosen generalization based on target accuracy
|
# self._cells currently holds the chosen generalization based on target accuracy
|
||||||
|
|
|
||||||
|
|
@ -939,6 +939,76 @@ def test_keras_model():
|
||||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||||
|
|
||||||
|
|
||||||
|
def test_minimizer_pytorch(data_three_features):
    """Run data minimization end to end against a PyTorch classifier.

    A small fully connected network is trained on the three-feature fixture.
    The model's own predictions are then used as labels when fitting the
    minimizer on the 'age' and 'weight' features, after which the resulting
    generalizations, transformed data, NCP score and relative accuracy are
    checked.
    """
    # torch-dependent imports stay local to this test
    from torch import nn, optim
    from apt.utils.datasets.datasets import PytorchData
    from apt.utils.models.pytorch_model import PyTorchClassifier

    x, y, features = data_three_features
    # the model is fed float32 samples throughout the test
    x = x.astype(np.float32)
    qi = ['age', 'weight']

    class pytorch_model(nn.Module):
        """Four Linear+Tanh stages followed by a linear classification head."""

        def __init__(self, num_classes, num_features):
            super().__init__()
            # attribute names and creation order are kept so parameter names
            # and initialisation order stay the same
            self.fc1 = nn.Sequential(nn.Linear(num_features, 1024), nn.Tanh())
            self.fc2 = nn.Sequential(nn.Linear(1024, 512), nn.Tanh())
            self.fc3 = nn.Sequential(nn.Linear(512, 256), nn.Tanh())
            self.fc4 = nn.Sequential(nn.Linear(256, 128), nn.Tanh())
            self.classifier = nn.Linear(128, num_classes)

        def forward(self, x):
            hidden = x
            for stage in (self.fc1, self.fc2, self.fc3, self.fc4):
                hidden = stage(hidden)
            return self.classifier(hidden)

    network = pytorch_model(2, 3)
    model = PyTorchClassifier(
        model=network,
        output_type=ModelOutputType.CLASSIFIER_LOGITS,
        loss=nn.CrossEntropyLoss(),
        optimizer=optim.Adam(network.parameters(), lr=0.01),
        input_shape=(3,),
        nb_classes=2,
    )
    training_samples = x.astype(np.float32)
    model.fit(PytorchData(training_samples, y), save_entire_model=False, nb_epochs=10)

    # label the data with the model's own predictions, reduced to class
    # indices when the output has more than one column
    ad = ArrayDataset(x)
    predictions = model.predict(ad)
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    target_accuracy = 0.5
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
    gen.fit(dataset=ArrayDataset(x, predictions, features_names=features))
    transformed = gen.transform(dataset=ad)

    expected_generalizations = {'ranges': {'age': [], 'weight': []}, 'categories': {}, 'untouched': ['height']}
    compare_generalizations(gen.generalizations, expected_generalizations)
    check_features(features, expected_generalizations, transformed, x)

    # whatever remains after dropping columns 0 and 2 must be unchanged
    # (presumably the untouched 'height' column - confirm against the fixture)
    kept_after = np.delete(transformed, [0, 2], axis=1)
    kept_before = np.delete(x, [0, 2], axis=1)
    assert (kept_after == kept_before).all()

    check_ncp(gen.ncp.transform_score, expected_generalizations)

    # cast back to float32 before scoring the model on the transformed data
    rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
    assert rel_accuracy >= target_accuracy or (target_accuracy - rel_accuracy) <= 0.05
|
||||||
|
|
||||||
|
|
||||||
def test_untouched():
|
def test_untouched():
|
||||||
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
|
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
|
||||||
'categories': {'gender': ['male']}, "representative": {"age": 26, "height": 149}},
|
'categories': {'gender': ['male']}, "representative": {"age": 26, "height": 149}},
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue