mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
* Squashed commit of wrappers:
Wrapper minimizer
* apply dataset wrapper on minimizer
* apply changes on minimization notebook
* add black_box_access and unlimited_queries params
Dataset wrapper anonymizer
Add features_names to ArrayDataset
and allow providing features names in QI and Cat features not just indexes
update notebooks
categorical features and QI passed by indexes
dataset include feature names and is_pandas param
add pytorch Dataset
Remove redundant code.
Use data wrappers in model wrapper APIs.
add generic dataset components
Create initial version of wrappers for models
* Fix handling of categorical features
320 lines
12 KiB
Python
320 lines
12 KiB
Python
from sklearn import datasets, model_selection
|
|
import sklearn.preprocessing
|
|
import pandas as pd
|
|
import ssl
|
|
from os import path, mkdir
|
|
from six.moves.urllib.request import urlretrieve
|
|
|
|
|
|
def _load_iris(test_set_size: float = 0.3):
    """Load the scikit-learn Iris dataset and split it into train/test sets.

    :param test_set_size: Fraction of the samples held out for the test split.
    :return: ((x_train, y_train), (x_test, y_test)) as numpy arrays.
    """
    bunch = datasets.load_iris()
    samples = bunch.data
    targets = bunch.target

    # Stratify so both splits keep the same class balance; fixed seed for
    # reproducible splits across runs.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        samples, targets, test_size=test_set_size, random_state=18, stratify=targets)

    return (x_train, y_train), (x_test, y_test)
|
|
|
|
|
|
def get_iris_dataset(test_set: float = 0.3):
    """
    Loads the Iris dataset from scikit-learn.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Entire dataset and labels as numpy array.
    """
    # Thin public wrapper around the private loader.
    return _load_iris(test_set_size=test_set)
|
|
|
|
|
|
def _load_diabetes(test_set_size: float = 0.3):
    """Load the scikit-learn Diabetes dataset and split it into train/test sets.

    :param test_set_size: Fraction of the samples held out for the test split.
    :return: ((x_train, y_train), (x_test, y_test)) as numpy arrays.
    """
    bunch = datasets.load_diabetes()
    samples = bunch.data
    targets = bunch.target

    # No stratification here: the diabetes targets are continuous
    # (regression), unlike the iris labels. Fixed seed for reproducibility.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        samples, targets, test_size=test_set_size, random_state=18)

    return (x_train, y_train), (x_test, y_test)
|
|
|
|
|
|
def get_diabetes_dataset(test_set: float = 0.3):
    """
    Loads the Diabetes dataset from scikit-learn.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
                     Defaults to 0.3, which was the previously hard-coded split.
    :return: Entire dataset and labels as numpy array.
    """
    # Fixed: the docstring previously said "Iris" and documented a `test_set`
    # parameter that did not exist. The parameter is now real (backward
    # compatible: the default matches the old behavior), mirroring
    # get_iris_dataset's interface.
    return _load_diabetes(test_set)
|
|
|
|
|
|
def get_german_credit_dataset(test_set: float = 0.3):
    """
    Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Dataset and labels as pandas dataframes.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
    data_dir = '../datasets/german'
    data_file = '../datasets/german/data'

    if not path.exists(data_dir):
        mkdir(data_dir)

    # NOTE(review): TLS certificate verification is disabled globally for the
    # UCI download. Acceptable for fetching public test fixtures, but should
    # never be used for untrusted data.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(data_file):
        urlretrieve(url, data_file)

    # load data
    features = ["Existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
                "Savings_account", "Present_employment_since", "Installment_rate", "Personal_status_sex", "debtors",
                "Present_residence", "Property", "Age", "Other_installment_plans", "Housing",
                "Number_of_existing_credits", "Job", "N_people_being_liable_provide_maintenance", "Telephone",
                "Foreign_worker", "label"]
    data = pd.read_csv(data_file, sep=" ", names=features, engine="python")
    # remove rows with missing label
    data = data.dropna(subset=["label"])
    _modify_german_dataset(data)

    # Stratified split preserves the label distribution. Fixed: use the
    # `model_selection` alias imported at the top of the file (instead of
    # `sklearn.model_selection` attribute access), use index names that do not
    # shadow the `test_set` parameter, and drop the previously unused
    # `categorical_features` local.
    stratified = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_idx, test_idx = next(stratified.split(data, data["label"]))
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]
    x_train = train.drop(["label"], axis=1)
    y_train = train.loc[:, "label"]
    x_test = test.drop(["label"], axis=1)
    y_test = test.loc[:, "label"]

    x_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    x_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    return (x_train, y_train), (x_test, y_test)
|
|
|
|
|
|
def _modify_german_dataset(data):
|
|
|
|
def modify_Foreign_worker(value):
|
|
if value == 'A201':
|
|
return 1
|
|
elif value == 'A202':
|
|
return 0
|
|
else:
|
|
raise Exception('Bad value')
|
|
|
|
def modify_Telephone(value):
|
|
if value == 'A191':
|
|
return 0
|
|
elif value == 'A192':
|
|
return 1
|
|
else:
|
|
raise Exception('Bad value')
|
|
data['Foreign_worker'] = data['Foreign_worker'].apply(modify_Foreign_worker)
|
|
data['Telephone'] = data['Telephone'].apply(modify_Telephone)
|
|
|
|
|
|
def get_adult_dataset():
    """
    Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it if necessary.

    :return: Dataset and labels as pandas dataframes.
    """
    features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                'label']
    train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    data_dir = '../datasets/adult'
    train_file = '../datasets/adult/train'
    test_file = '../datasets/adult/test'

    if not path.exists(data_dir):
        mkdir(data_dir)

    # Download each split once; TLS verification is disabled for the UCI host.
    ssl._create_default_https_context = ssl._create_unverified_context
    for source_url, target_file in ((train_url, train_file), (test_url, test_file)):
        if not path.exists(target_file):
            urlretrieve(source_url, target_file)

    train = pd.read_csv(train_file, sep=', ', names=features, engine='python')
    test = pd.read_csv(test_file, sep=', ', names=features, engine='python')
    # The first row of adult.test is a non-data banner line; skip it.
    test = test.iloc[1:]

    train = _modify_adult_dataset(train)
    test = _modify_adult_dataset(test)

    def split_label(frame):
        # Separate the feature columns from the 'label' column.
        return frame.drop(['label'], axis=1), frame.loc[:, 'label']

    x_train, y_train = split_label(train)
    x_test, y_test = split_label(test)

    return (x_train, y_train), (x_test, y_test)
|
|
|
|
|
|
def _modify_adult_dataset(data):
|
|
def modify_label(value):
|
|
if value == '<=50K.' or value == '<=50K':
|
|
return 0
|
|
elif value == '>50K.' or value == '>50K':
|
|
return 1
|
|
else:
|
|
raise Exception('Bad label value')
|
|
|
|
def modify_native_country(value):
|
|
Euro_1 = ['Italy', 'Holand-Netherlands', 'Germany', 'France']
|
|
Euro_2 = ['Yugoslavia', 'South', 'Portugal', 'Poland', 'Hungary', 'Greece']
|
|
SE_Asia = ['Vietnam', 'Thailand', 'Philippines', 'Laos', 'Cambodia']
|
|
UnitedStates = ['United-States']
|
|
LatinAmerica = ['Trinadad&Tobago', 'Puerto-Rico', 'Outlying-US(Guam-USVI-etc)', 'Nicaragua', 'Mexico',
|
|
'Jamaica', 'Honduras', 'Haiti', 'Guatemala', 'Dominican-Republic']
|
|
China = ['Taiwan', 'Hong', 'China']
|
|
BritishCommonwealth = ['Scotland', 'Ireland', 'India', 'England', 'Canada']
|
|
SouthAmerica = ['Peru', 'El-Salvador', 'Ecuador', 'Columbia']
|
|
Other = ['Japan', 'Iran', 'Cuba']
|
|
|
|
if value in Euro_1:
|
|
return 'Euro_1'
|
|
elif value in Euro_2:
|
|
return 'Euro_2'
|
|
elif value in SE_Asia:
|
|
return 'SE_Asia'
|
|
elif value in UnitedStates:
|
|
return 'UnitedStates'
|
|
elif value in LatinAmerica:
|
|
return 'LatinAmerica'
|
|
elif value in China:
|
|
return 'China'
|
|
elif value in BritishCommonwealth:
|
|
return 'BritishCommonwealth'
|
|
elif value in SouthAmerica:
|
|
return 'SouthAmerica'
|
|
elif value in Other:
|
|
return 'Other'
|
|
elif value == '?':
|
|
return 'Unknown'
|
|
else:
|
|
raise Exception('Bad native country value')
|
|
|
|
data['label'] = data['label'].apply(modify_label)
|
|
data['native-country'] = data['native-country'].apply(modify_native_country)
|
|
|
|
for col in ('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'):
|
|
try:
|
|
data[col] = data[col].fillna(0)
|
|
except KeyError:
|
|
print('missing column ' + col)
|
|
|
|
for col in ('workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'):
|
|
try:
|
|
data[col] = data[col].fillna('NA')
|
|
except KeyError:
|
|
print('missing column ' + col)
|
|
|
|
return data.drop(['fnlwgt', 'education'], axis=1)
|
|
|
|
|
|
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
    """
    Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it if necessary.

    :param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
                encoded and data is scaled using sklearn's StandardScaler.
    :param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
    :param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
                             inference. This is done by assigning the original value 'problematic' the new value 1, and
                             the other original values are assigned the new value 0.
    :return: Dataset and labels as pandas dataframes.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
    data_dir = '../datasets/nursery'
    data_file = '../datasets/nursery/data'

    if not path.exists(data_dir):
        mkdir(data_dir)

    # NOTE(review): TLS certificate verification is disabled globally for the
    # UCI download; acceptable for public test fixtures only.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(data_file):
        urlretrieve(url, data_file)

    # load data
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health", "label"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]
    data = pd.read_csv(data_file, sep=",", names=features, engine="python")
    # remove rows with missing label or too sparse label
    data = data.dropna(subset=["label"])
    data.drop(data.loc[data["label"] == "recommend"].index, axis=0, inplace=True)

    # fill missing values ('children' is numeric-like, the rest categorical;
    # the fill list is the same set as `categorical_features`, so reuse it)
    data["children"] = data["children"].fillna(0)
    for col in categorical_features:
        data[col] = data[col].fillna("other")

    # make categorical label (4 classes remain after dropping 'recommend')
    def modify_label(value):
        if value == "not_recom":
            return 0
        elif value == "very_recom":
            return 1
        elif value == "priority":
            return 2
        elif value == "spec_prior":
            return 3
        else:
            raise Exception("Bad label value: %s" % value)

    data["label"] = data["label"].apply(modify_label)
    # collapse the open-ended 'more' bucket of 'children' into "4"
    data["children"] = data["children"].apply(lambda x: "4" if x == "more" else x)

    if transform_social:
        # Binarize 'social' so it can serve as an attribute-inference target;
        # once binary it is no longer treated as a categorical feature.
        def modify_social(value):
            if value == "problematic":
                return 1
            else:
                return 0

        data["social"] = data["social"].apply(modify_social)
        categorical_features.remove("social")

    if not raw:
        # one-hot-encode categorical features
        features_to_remove = []
        for feature in categorical_features:
            values = list(data.loc[:, feature].unique())
            data[feature] = pd.Categorical(data.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(data[feature], prefix=feature)
            data = pd.concat([data, one_hot_vector], axis=1)
            features_to_remove.append(feature)
        data = data.drop(features_to_remove, axis=1)

        # normalize data. Fixed: use a distinct name for the feature frame
        # instead of re-binding `features` (the column-name list above).
        label = data.loc[:, "label"]
        feature_frame = data.drop(["label"], axis=1)
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(feature_frame)
        scaled_features = pd.DataFrame(scaler.transform(feature_frame), columns=feature_frame.columns)
        data = pd.concat([label, scaled_features], axis=1, join="inner")

    # Split training and test sets. Fixed: use the `model_selection` alias
    # imported at the top of the file (instead of `sklearn.model_selection`
    # attribute access), and use index names that do not shadow the
    # `test_set` parameter.
    stratified = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_idx, test_idx = next(stratified.split(data, data["label"]))
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]
    x_train = train.drop(["label"], axis=1)
    y_train = train.loc[:, "label"]
    x_test = test.drop(["label"], axis=1)
    y_test = test.loc[:, "label"]

    return (x_train, y_train), (x_test, y_test)
|