Working example of anonymization with pytorch multi-output binary model

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailt 2024-03-12 13:25:31 +02:00
parent 5e19d4ae27
commit 076503b248
2 changed files with 76 additions and 4 deletions

View file

@ -233,7 +233,7 @@ class ArrayDataset(Dataset):
raise ValueError("The supplied features are not the same as in the data features") raise ValueError("The supplied features are not the same as in the data features")
self.features_names = x.columns.to_list() self.features_names = x.columns.to_list()
if self._y is not None and len(self._x) != len(self._y): if self._y is not None and self._x.shape[0] != self._y.shape[0]:
raise ValueError("Non equivalent lengths of x and y") raise ValueError("Non equivalent lengths of x and y")
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE: def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:

View file

@ -6,11 +6,17 @@ from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OneHotEncoder
from apt.anonymization import Anonymize
from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_nursery_dataset_pd
from sklearn.datasets import load_diabetes from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from torch import nn, optim, sigmoid, where
from torch.nn import functional
from scipy.special import expit
from apt.utils.datasets.datasets import PytorchData
from apt.utils.models import ModelOutputType
from apt.utils.models.pytorch_model import PyTorchClassifier
from apt.anonymization import Anonymize
from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_nursery_dataset_pd
from apt.utils.datasets import ArrayDataset from apt.utils.datasets import ArrayDataset
@ -187,6 +193,72 @@ def test_anonymize_pandas_one_hot():
assert ((np.min(anonymized_slice, axis=1) == 0).all()) assert ((np.min(anonymized_slice, axis=1) == 0).all())
def test_anonymize_pytorch_multi_label_binary():
    """Anonymize data labelled by a PyTorch multi-label binary classifier.

    A small network with one logit per label is trained with a focal loss on
    the dataset returned by ``get_iris_dataset_np`` (labels replicated into
    three binary columns). Its thresholded predictions are then used as the
    labels guiding ``Anonymize``. The test checks that the quasi-identifier
    columns were generalized, that every distinct quasi-identifier row occurs
    at least ``k`` times, and that all other columns are untouched.
    """

    class multi_label_binary_model(nn.Module):
        """One hidden tanh layer followed by a linear head emitting raw logits."""

        def __init__(self, num_labels, num_features):
            super().__init__()
            self.fc1 = nn.Sequential(
                nn.Linear(num_features, 256),
                nn.Tanh(),
            )
            self.classifier1 = nn.Linear(256, num_labels)

        def forward(self, x):
            # No sigmoid here on purpose: the model returns logits. The loss
            # applies the sigmoid itself and the test applies ``expit`` to
            # the predictions below.
            return self.classifier1(self.fc1(x))

    class FocalLoss(nn.Module):
        """Binary focal loss computed element-wise on logits, then averaged.

        ``gamma`` controls how strongly well-classified elements are
        down-weighted; ``alpha`` weights positive targets and ``1 - alpha``
        weights negative targets.
        """

        def __init__(self, gamma=2, alpha=0.5):
            super().__init__()
            self.gamma = gamma
            self.alpha = alpha

        def forward(self, logits, target):
            bce_loss = functional.binary_cross_entropy_with_logits(logits, target, reduction='none')
            # Probability the model assigns to the *true* class of each element.
            p_true = sigmoid(logits)
            p_true = where(target >= 0.5, p_true, 1 - p_true)
            modulating_factor = (1 - p_true) ** self.gamma
            alpha = self.alpha * target + (1 - self.alpha) * (1 - target)
            focal_loss = alpha * modulating_factor * bce_loss
            return focal_loss.mean()

    (x_train, y_train), _ = get_iris_dataset_np()

    # Make the task multi-label binary: replicate the label column three
    # times and collapse every class above 1 into 1.
    y_train = np.column_stack((y_train, y_train, y_train))
    y_train[y_train > 1] = 1

    num_features = x_train.shape[1]
    num_labels = y_train.shape[1]

    model = multi_label_binary_model(num_labels, num_features)
    criterion = FocalLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=0.01)

    art_model = PyTorchClassifier(model=model,
                                  output_type=ModelOutputType.CLASSIFIER_MULTI_OUTPUT_BINARY_LOGITS,
                                  loss=criterion,
                                  optimizer=optimizer,
                                  # Must match the width of the first Linear layer.
                                  input_shape=(num_features,),
                                  nb_classes=num_labels)

    train_data = PytorchData(x_train.astype(np.float32), y_train.astype(np.float32))
    art_model.fit(train_data, save_entire_model=False, nb_epochs=10)

    # Convert logits to hard 0/1 predictions per label.
    pred = art_model.predict(train_data)
    pred = expit(pred)
    pred[pred < 0.5] = 0
    pred[pred >= 0.5] = 1

    k = 10
    QI = [0, 2]
    anonymizer = Anonymize(k, QI, train_only_QI=True)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))

    # Generalization must reduce the number of distinct quasi-identifier rows.
    assert (len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
    # Count whole quasi-identifier rows (axis=0), not individual scalar
    # values, so the check matches the row-wise uniqueness check above.
    _, counts_elements = np.unique(anon[:, QI], axis=0, return_counts=True)
    assert (np.min(counts_elements) >= k)
    # Columns outside the quasi-identifiers must be left unchanged.
    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
def test_errors(): def test_errors():
with pytest.raises(ValueError): with pytest.raises(ValueError):
Anonymize(1, [0, 2]) Anonymize(1, [0, 2])