mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-05-15 06:52:37 +02:00
Data and Model wrappers (#26)
* Squashed commit of wrappers:
Wrapper minimizer
* apply dataset wrapper on minimizer
* apply changes on minimization notebook
* add black_box_access and unlimited_queries params
Dataset wrapper anonymizer
Add features_names to ArrayDataset
and allow providing features names in QI and Cat features not just indexes
update notebooks
categorical features and QI passed by indexes
dataset include feature names and is_pandas param
add pytorch Dataset
Remove redundant code.
Use data wrappers in model wrapper APIs.
add generic dataset components
Create initial version of wrappers for models
* Fix handling of categorical features
This commit is contained in:
parent
d53818644e
commit
2b2dab6bef
17 changed files with 1340 additions and 752 deletions
0
apt/utils/__init__.py
Normal file
0
apt/utils/__init__.py
Normal file
320
apt/utils/dataset_utils.py
Normal file
320
apt/utils/dataset_utils.py
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
from sklearn import datasets, model_selection
|
||||
import sklearn.preprocessing
|
||||
import pandas as pd
|
||||
import ssl
|
||||
from os import path, mkdir
|
||||
from six.moves.urllib.request import urlretrieve
|
||||
|
||||
|
||||
def _load_iris(test_set_size: float = 0.3):
    """
    Load the scikit-learn Iris dataset and split it into train and test parts.

    The split is stratified by label and uses a fixed random state (18).

    :param test_set_size: Proportion of the data held out for the test part (forwarded as ``test_size``).
    :return: Two tuples: ``(x_train, y_train), (x_test, y_test)``.
    """
    bunch = datasets.load_iris()
    samples, targets = bunch.data, bunch.target

    # train_test_split yields x_train, x_test, y_train, y_test in that order
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        samples, targets, test_size=test_set_size, random_state=18, stratify=targets)

    train_part = (x_train, y_train)
    test_part = (x_test, y_test)
    return train_part, test_part
|
||||
|
||||
|
||||
def get_iris_dataset(test_set: float = 0.3):
    """
    Load the Iris dataset from scikit-learn, split into train and test parts.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Whatever ``_load_iris`` returns for that proportion: ``(x_train, y_train), (x_test, y_test)``.
    """
    split_parts = _load_iris(test_set)
    return split_parts
|
||||
|
||||
|
||||
def _load_diabetes(test_set_size: float = 0.3):
    """
    Load the scikit-learn Diabetes dataset and split it into train and test parts.

    The split uses a fixed random state (18) and is not stratified.

    :param test_set_size: Proportion of the data held out for the test part (forwarded as ``test_size``).
    :return: Two tuples: ``(x_train, y_train), (x_test, y_test)``.
    """
    bunch = datasets.load_diabetes()
    samples, targets = bunch.data, bunch.target

    # train_test_split yields x_train, x_test, y_train, y_test in that order
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        samples, targets, test_size=test_set_size, random_state=18)

    train_part = (x_train, y_train)
    test_part = (x_test, y_test)
    return train_part, test_part
|
||||
|
||||
|
||||
def get_diabetes_dataset(test_set: float = 0.3):
    """
    Loads the Diabetes dataset from scikit-learn.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
                     Defaults to 0.3, the proportion that was used before this parameter existed.
    :return: Train and test parts as ``(x_train, y_train), (x_test, y_test)``.
    """
    return _load_diabetes(test_set)
|
||||
|
||||
|
||||
def get_german_credit_dataset(test_set: float = 0.3):
    """
    Loads the UCI German_credit dataset from `../datasets/german` (relative to the current working
    directory) or downloads it if necessary.

    The `Foreign_worker` and `Telephone` columns are recoded to 0/1 by `_modify_german_dataset`; the
    remaining columns are returned as read from the file.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: ``(x_train, y_train), (x_test, y_test)`` where the x parts are pandas dataframes and the
             y parts are pandas series, all with a fresh 0-based index.
    """
    # local import: this module only imports `path` and `mkdir` from os
    from os import makedirs

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
    data_dir = '../datasets/german'
    data_file = '../datasets/german/data'

    if not path.exists(data_dir):
        # makedirs also creates a missing parent directory; plain mkdir raised FileNotFoundError then
        makedirs(data_dir, exist_ok=True)

    # NOTE(review): this replaces the process-wide default HTTPS context with an unverified one,
    # i.e. certificate checks are off for every later HTTPS request in the process.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(data_file):
        urlretrieve(url, data_file)

    # load data
    features = ["Existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
                "Savings_account", "Present_employment_since", "Installment_rate", "Personal_status_sex", "debtors",
                "Present_residence", "Property", "Age", "Other_installment_plans", "Housing",
                "Number_of_existing_credits", "Job", "N_people_being_liable_provide_maintenance", "Telephone",
                "Foreign_worker", "label"]
    data = pd.read_csv(data_file, sep=" ", names=features, engine="python")
    # remove rows with missing label
    data = data.dropna(subset=["label"])
    _modify_german_dataset(data)

    # Split training and test sets (a single stratified split, so only the first result is needed).
    # Distinct index names keep the `test_set` parameter from being overwritten.
    stratified = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_idx, test_idx = next(stratified.split(data, data["label"]))
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]

    x_train = train.drop(["label"], axis=1).reset_index(drop=True)
    y_train = train.loc[:, "label"].reset_index(drop=True)
    x_test = test.drop(["label"], axis=1).reset_index(drop=True)
    y_test = test.loc[:, "label"].reset_index(drop=True)

    return (x_train, y_train), (x_test, y_test)
|
||||
|
||||
|
||||
def _modify_german_dataset(data):
|
||||
|
||||
def modify_Foreign_worker(value):
|
||||
if value == 'A201':
|
||||
return 1
|
||||
elif value == 'A202':
|
||||
return 0
|
||||
else:
|
||||
raise Exception('Bad value')
|
||||
|
||||
def modify_Telephone(value):
|
||||
if value == 'A191':
|
||||
return 0
|
||||
elif value == 'A192':
|
||||
return 1
|
||||
else:
|
||||
raise Exception('Bad value')
|
||||
data['Foreign_worker'] = data['Foreign_worker'].apply(modify_Foreign_worker)
|
||||
data['Telephone'] = data['Telephone'].apply(modify_Telephone)
|
||||
|
||||
|
||||
def get_adult_dataset():
    """
    Loads the UCI Adult dataset from `../datasets/adult` (relative to the current working directory)
    or downloads it if necessary.

    Both the train and the test file are passed through `_modify_adult_dataset`.

    :return: ``(x_train, y_train), (x_test, y_test)`` where the x parts are pandas dataframes and the
             y parts are pandas series.
    """
    # local import: this module only imports `path` and `mkdir` from os
    from os import makedirs

    features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                'label']
    train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    data_dir = '../datasets/adult'
    train_file = '../datasets/adult/train'
    test_file = '../datasets/adult/test'

    if not path.exists(data_dir):
        # makedirs also creates a missing parent directory; plain mkdir raised FileNotFoundError then
        makedirs(data_dir, exist_ok=True)

    # NOTE(review): this replaces the process-wide default HTTPS context with an unverified one,
    # i.e. certificate checks are off for every later HTTPS request in the process.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(train_file):
        urlretrieve(train_url, train_file)
    if not path.exists(test_file):
        urlretrieve(test_url, test_file)

    train = pd.read_csv(train_file, sep=', ', names=features, engine='python')
    test = pd.read_csv(test_file, sep=', ', names=features, engine='python')
    # the first parsed row of the test file is skipped
    # NOTE(review): presumably a non-data header line in adult.test - confirm against the file
    test = test.iloc[1:]

    train = _modify_adult_dataset(train)
    test = _modify_adult_dataset(test)

    x_train = train.drop(['label'], axis=1)
    y_train = train.loc[:, 'label']
    x_test = test.drop(['label'], axis=1)
    y_test = test.loc[:, 'label']

    return (x_train, y_train), (x_test, y_test)
|
||||
|
||||
|
||||
def _modify_adult_dataset(data):
|
||||
def modify_label(value):
|
||||
if value == '<=50K.' or value == '<=50K':
|
||||
return 0
|
||||
elif value == '>50K.' or value == '>50K':
|
||||
return 1
|
||||
else:
|
||||
raise Exception('Bad label value')
|
||||
|
||||
def modify_native_country(value):
|
||||
Euro_1 = ['Italy', 'Holand-Netherlands', 'Germany', 'France']
|
||||
Euro_2 = ['Yugoslavia', 'South', 'Portugal', 'Poland', 'Hungary', 'Greece']
|
||||
SE_Asia = ['Vietnam', 'Thailand', 'Philippines', 'Laos', 'Cambodia']
|
||||
UnitedStates = ['United-States']
|
||||
LatinAmerica = ['Trinadad&Tobago', 'Puerto-Rico', 'Outlying-US(Guam-USVI-etc)', 'Nicaragua', 'Mexico',
|
||||
'Jamaica', 'Honduras', 'Haiti', 'Guatemala', 'Dominican-Republic']
|
||||
China = ['Taiwan', 'Hong', 'China']
|
||||
BritishCommonwealth = ['Scotland', 'Ireland', 'India', 'England', 'Canada']
|
||||
SouthAmerica = ['Peru', 'El-Salvador', 'Ecuador', 'Columbia']
|
||||
Other = ['Japan', 'Iran', 'Cuba']
|
||||
|
||||
if value in Euro_1:
|
||||
return 'Euro_1'
|
||||
elif value in Euro_2:
|
||||
return 'Euro_2'
|
||||
elif value in SE_Asia:
|
||||
return 'SE_Asia'
|
||||
elif value in UnitedStates:
|
||||
return 'UnitedStates'
|
||||
elif value in LatinAmerica:
|
||||
return 'LatinAmerica'
|
||||
elif value in China:
|
||||
return 'China'
|
||||
elif value in BritishCommonwealth:
|
||||
return 'BritishCommonwealth'
|
||||
elif value in SouthAmerica:
|
||||
return 'SouthAmerica'
|
||||
elif value in Other:
|
||||
return 'Other'
|
||||
elif value == '?':
|
||||
return 'Unknown'
|
||||
else:
|
||||
raise Exception('Bad native country value')
|
||||
|
||||
data['label'] = data['label'].apply(modify_label)
|
||||
data['native-country'] = data['native-country'].apply(modify_native_country)
|
||||
|
||||
for col in ('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'):
|
||||
try:
|
||||
data[col] = data[col].fillna(0)
|
||||
except KeyError:
|
||||
print('missing column ' + col)
|
||||
|
||||
for col in ('workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'):
|
||||
try:
|
||||
data[col] = data[col].fillna('NA')
|
||||
except KeyError:
|
||||
print('missing column ' + col)
|
||||
|
||||
return data.drop(['fnlwgt', 'education'], axis=1)
|
||||
|
||||
|
||||
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
    """
    Loads the UCI Nursery dataset from `../datasets/nursery` (relative to the current working
    directory) or downloads it if necessary.

    Rows labelled "recommend" are dropped and the remaining labels are mapped to the integers 0-3.

    :param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
                encoded and data is scaled using sklearn's StandardScaler.
    :param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
    :param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
                             inference. This is done by assigning the original value 'problematic' the new value 1,
                             and the other original values are assigned the new value 0.
    :return: ``(x_train, y_train), (x_test, y_test)`` where the x parts are pandas dataframes and the
             y parts are pandas series.
    """
    # local import: this module only imports `path` and `mkdir` from os
    from os import makedirs

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
    data_dir = '../datasets/nursery'
    data_file = '../datasets/nursery/data'

    if not path.exists(data_dir):
        # makedirs also creates a missing parent directory; plain mkdir raised FileNotFoundError then
        makedirs(data_dir, exist_ok=True)

    # NOTE(review): this replaces the process-wide default HTTPS context with an unverified one,
    # i.e. certificate checks are off for every later HTTPS request in the process.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(data_file):
        urlretrieve(url, data_file)

    # load data
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health", "label"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]
    data = pd.read_csv(data_file, sep=",", names=features, engine="python")
    # remove rows with missing label or too sparse label
    data = data.dropna(subset=["label"])
    data.drop(data.loc[data["label"] == "recommend"].index, axis=0, inplace=True)

    # fill missing values
    data["children"] = data["children"].fillna(0)

    for col in ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]:
        data[col] = data[col].fillna("other")

    # make categorical label
    def modify_label(value):  # 4 classes ("recommend" rows were dropped above)
        if value == "not_recom":
            return 0
        elif value == "very_recom":
            return 1
        elif value == "priority":
            return 2
        elif value == "spec_prior":
            return 3
        else:
            raise Exception("Bad label value: %s" % value)

    data["label"] = data["label"].apply(modify_label)
    data["children"] = data["children"].apply(lambda x: "4" if x == "more" else x)

    if transform_social:

        def modify_social(value):
            if value == "problematic":
                return 1
            else:
                return 0

        data["social"] = data["social"].apply(modify_social)
        # social is numeric now, so it must not be one-hot encoded below
        categorical_features.remove("social")

    if not raw:
        # one-hot-encode categorical features
        features_to_remove = []
        for feature in categorical_features:
            all_values = data.loc[:, feature]
            values = list(all_values.unique())
            data[feature] = pd.Categorical(data.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(data[feature], prefix=feature)
            data = pd.concat([data, one_hot_vector], axis=1)
            features_to_remove.append(feature)
        data = data.drop(features_to_remove, axis=1)

        # normalize data
        label = data.loc[:, "label"]
        feature_frame = data.drop(["label"], axis=1)
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(feature_frame)
        # Keep the original row index on the scaled frame. The rows dropped above leave gaps in the
        # index, and a fresh 0..n-1 index would make the index-aligned inner join below pair
        # features with the wrong labels and discard rows.
        scaled_features = pd.DataFrame(scaler.transform(feature_frame), columns=feature_frame.columns,
                                       index=feature_frame.index)
        data = pd.concat([label, scaled_features], axis=1, join="inner")

    # Split training and test sets (a single stratified split, so only the first result is needed).
    # Distinct index names keep the `test_set` parameter from being overwritten.
    stratified = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_idx, test_idx = next(stratified.split(data, data["label"]))
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]

    x_train = train.drop(["label"], axis=1)
    y_train = train.loc[:, "label"]
    x_test = test.drop(["label"], axis=1)
    y_test = test.loc[:, "label"]

    return (x_train, y_train), (x_test, y_test)
|
||||
7
apt/utils/datasets/__init__.py
Normal file
7
apt/utils/datasets/__init__.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
"""
|
||||
The AI Privacy Toolbox (datasets).
|
||||
Implementation of datasets utility components for datasets creation, load, and store
|
||||
"""
|
||||
|
||||
from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \
|
||||
OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE
|
||||
320
apt/utils/datasets/datasets.py
Normal file
320
apt/utils/datasets/datasets.py
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
# !/usr/bin/env python
|
||||
"""
|
||||
The AI Privacy Toolbox (datasets).
|
||||
Implementation of utility classes for dataset handling
|
||||
"""
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from typing import Callable, Collection, Any, Union, List, Optional
|
||||
|
||||
import tarfile
|
||||
import os
|
||||
import urllib.request
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import logging
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
|
||||
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
|
||||
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
|
||||
|
||||
|
||||
def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Converts from INPUT_DATA_ARRAY_TYPE to numpy array.

    Subclasses of the supported types (e.g. ``torch.nn.Parameter``) are accepted as well.

    :param self: Object whose ``is_pandas`` attribute is set to True when `arr` is a pandas
                 DataFrame or Series; it is not touched for the other input types.
    :param arr: numpy array (returned as is), pandas DataFrame/Series, list or torch Tensor.
    :return: The data as a numpy array.
    :raises ValueError: if `arr` is of none of the supported types.
    """
    if isinstance(arr, np.ndarray):
        return arr
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return arr.to_numpy()
    if isinstance(arr, list):
        return np.array(arr)
    if isinstance(arr, Tensor):
        # detach from the autograd graph and move to host memory before converting
        return arr.detach().cpu().numpy()

    raise ValueError(f'Non supported type: {type(arr).__name__}')
|
||||
|
||||
|
||||
def array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
    """
    Converts from INPUT_DATA_ARRAY_TYPE to torch tensor array.

    Subclasses of the supported types (e.g. ``torch.nn.Parameter``) are accepted as well.

    :param self: Object whose ``is_pandas`` attribute is set to True when `arr` is a pandas
                 DataFrame or Series; it is not touched for the other input types.
    :param arr: numpy array, pandas DataFrame/Series, list or torch Tensor (returned as is).
    :return: The data as a torch tensor.
    :raises ValueError: if `arr` is of none of the supported types.
    """
    if isinstance(arr, np.ndarray):
        return torch.from_numpy(arr)
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return torch.from_numpy(arr.to_numpy())
    if isinstance(arr, list):
        return torch.tensor(arr)
    if isinstance(arr, Tensor):
        return arr

    raise ValueError(f'Non supported type: {type(arr).__name__}')
|
||||
|
||||
|
||||
class Dataset(metaclass=ABCMeta):
    """Base Abstract Class for Dataset"""

    @abstractmethod
    def __init__(self, **kwargs):
        """Construct the dataset; the base implementation does nothing with `kwargs`."""

    @abstractmethod
    def get_samples(self) -> Collection[Any]:
        """Return data samples"""
        return None

    @abstractmethod
    def get_labels(self) -> Collection[Any]:
        """Return labels"""
        return None
|
||||
|
||||
|
||||
class StoredDataset(Dataset):
    """Abstract Class for Storable Dataset"""

    @abstractmethod
    def load_from_file(self, path: str):
        """Load dataset from file"""
        pass

    @abstractmethod
    def load(self, **kwargs):
        """Load dataset"""
        pass

    @staticmethod
    def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
        """
        Download the dataset from URL

        Nothing is downloaded (and nothing is extracted) when the target file already exists.

        :param url: dataset URL, the dataset will be requested from this URL
        :param dest_path: local dataset destination path
        :param filename: local dataset filename
        :param unzip: flag whether or not perform extraction
        :return: None
        """
        file_path = os.path.join(dest_path, filename)

        if os.path.exists(file_path):
            logger.warning("Files already downloaded, skipping downloading")

        else:
            os.makedirs(dest_path, exist_ok=True)
            logger.info("Downloading the dataset...")
            urllib.request.urlretrieve(url, file_path)
            logger.info('Dataset Downloaded')

            if unzip:
                StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)

    @staticmethod
    def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
        """
        Extract dataset from archived file

        The archive is opened with `tarfile`, so despite the parameter name it must be a tar archive.

        :param zip_path: path to archived file
        :param dest_path: directory path to uncompress the file to
        :param remove_archive: whether remove the archive file after uncompress (default False)
        :return: None
        """
        logger.info("Extracting the dataset...")
        # context manager closes the archive handle even if extraction fails
        with tarfile.open(zip_path) as tar:
            # NOTE(review): extractall trusts the member paths stored in the archive; only use this
            # on archives from a trusted source.
            tar.extractall(path=dest_path)

        logger.info("Dataset was extracted to {}".format(dest_path))
        if remove_archive:
            logger.info("Removing a zip file")
            os.remove(zip_path)
        logger.info("Extracted the dataset")

    @staticmethod
    def split_debug(datafile: str, dest_datafile: str, ratio: float, shuffle=True, delimiter=",", fmt=None) -> None:
        """
        Split the data and take only a part of it

        Does nothing when `dest_datafile` already exists.

        :param datafile: dataset file path
        :param dest_datafile: destination path for the partial dataset file
        :param ratio: part of the dataset to save; the first ``int(len(data) * ratio)`` rows are kept
        :param shuffle: whether to shuffle the data or not (default True)
        :param delimiter: dataset delimiter (default ",")
        :param fmt: format for the correct data saving; when None, numpy's own default format is used
        :return: None
        """
        if os.path.isfile(dest_datafile):
            logger.info(f"The partial debug split already exists {dest_datafile}")
            return

        dest_dir = os.path.dirname(dest_datafile)
        if dest_dir:
            # a bare file name has no directory part, and os.makedirs('') raises
            os.makedirs(dest_dir, exist_ok=True)

        data = np.genfromtxt(datafile, delimiter=delimiter)
        if shuffle:
            logger.info("Shuffling data")
            np.random.shuffle(data)

        debug_data = data[:int(len(data) * ratio)]
        logger.info(f"Saving {ratio} of the data to {dest_datafile}")
        # np.savetxt rejects fmt=None, so the argument is only forwarded when the caller set it
        savetxt_options = {} if fmt is None else {"fmt": fmt}
        np.savetxt(dest_datafile, debug_data, delimiter=delimiter, **savetxt_options)
|
||||
|
||||
|
||||
class ArrayDataset(Dataset):
    """Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""

    def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
                 features_names: Optional[List[str]] = None, **kwargs):
        """
        ArrayDataset constructor.

        :param x: collection of data samples
        :param y: collection of labels (optional)
        :param features_names: list of str, The feature names, in the order that they appear in the data (optional).
                               When `x` is a pandas DataFrame, its column names are used, and supplied
                               names must equal them.
        :param kwargs: dataset parameters
        :raises ValueError: if the supplied feature names differ from the DataFrame columns, or if x and y
                            have different lengths.
        """
        self.is_pandas = False
        self.features_names = features_names
        self._y = array2numpy(self, y) if y is not None else None
        # array2numpy also raises the flag for pandas labels; reset it so that is_pandas describes x
        # only. Otherwise a pandas y with a non-pandas x leads to reading x.columns below.
        self.is_pandas = False
        self._x = array2numpy(self, x)
        # only a DataFrame has columns; a Series x keeps the supplied feature names
        if self.is_pandas and isinstance(x, pd.DataFrame):
            # explicit None/length test: truth-testing a numpy array of names raises
            names_given = features_names is not None and len(features_names) > 0
            if names_given and not np.array_equal(features_names, x.columns):
                raise ValueError("The supplied features are not the same as in the data features")
            self.features_names = x.columns.to_list()

        if y is not None and len(self._x) != len(self._y):
            raise ValueError('Non equivalent lengths of x and y')

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return data samples as numpy array"""
        return self._x

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return labels as numpy array (None when no labels were given)"""
        return self._y
|
||||
|
||||
|
||||
class PytorchData(Dataset):
    """Dataset that stores x and y as torch tensors and supports indexing and len()"""

    def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
        """
        PytorchData constructor.

        :param x: collection of data samples
        :param y: collection of labels (optional)
        :param kwargs: dataset parameters
        :raises ValueError: if x and y have different lengths.
        """
        self.is_pandas = False
        # always defined, so the attribute can be read for non-pandas input too
        self.features_names = None
        self._y = array2torch_tensor(self, y) if y is not None else None
        # the converter also raises the flag for pandas labels; reset it so that is_pandas describes
        # x only. Otherwise a pandas y with a non-pandas x leads to reading x.columns below.
        self.is_pandas = False
        self._x = array2torch_tensor(self, x)
        if self.is_pandas and isinstance(x, pd.DataFrame):
            self.features_names = x.columns

        if y is not None and len(self._x) != len(self._y):
            raise ValueError('Non equivalent lengths of x and y')

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return data samples as numpy array"""
        # array2numpy takes the owning object as its first argument
        return array2numpy(self, self._x)

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return labels as numpy array (None when no labels were given)"""
        return array2numpy(self, self._y) if self._y is not None else None

    def get_sample_item(self, idx) -> Tensor:
        """Return the sample at position `idx`"""
        return self._x[idx]

    def get_item(self, idx) -> tuple:
        """Return the ``(sample, label)`` pair at position `idx`"""
        sample, label = self._x[idx], self._y[idx]
        return sample, label

    def __getitem__(self, idx):
        """
        Return ``(sample, label)`` when labels exist, otherwise only the sample.

        Defined on the class because ``obj[idx]`` looks up ``__getitem__`` on the type and ignores
        an attribute of that name assigned on the instance.
        """
        if self._y is not None:
            return self.get_item(idx)
        return self.get_sample_item(idx)

    def __len__(self):
        """Return the number of samples"""
        return len(self._x)
|
||||
|
||||
|
||||
class DatasetFactory:
    """Factory class for dataset creation"""
    # maps a dataset name to the class registered under it; shared by all users of the factory
    registry = {}

    @classmethod
    def register(cls, name: str) -> Callable:
        """
        Build a class decorator that stores the decorated Dataset class in the registry.

        A name that is already registered is replaced, after logging a warning.

        :param name: dataset name
        :return: decorator that registers the class it receives and returns it unchanged
        """

        def _store(dataset_cls: Dataset) -> Any:
            already_registered = name in cls.registry
            if already_registered:
                logger.warning('Dataset %s already exists. Will replace it', name)
            cls.registry[name] = dataset_cls
            return dataset_cls

        return _store

    @classmethod
    def create_dataset(cls, name: str, **kwargs) -> Dataset:
        """
        Factory command to create dataset instance.

        Looks up the Dataset class registered under `name` and instantiates it with ``kwargs``.

        :param name: The name of the dataset to create.
        :param kwargs: dataset parameters
        :return: An instance of the dataset that is created.
        :raises ValueError: if no dataset is registered under `name` (the message is also logged as an error).
        """
        if name in cls.registry:
            return cls.registry[name](**kwargs)

        msg = f'Dataset {name} does not exist in the registry'
        logger.error(msg)
        raise ValueError(msg)
|
||||
|
||||
|
||||
class Data:
    """Container that holds a train dataset and a test dataset"""

    def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
        """
        Data class constructor.
        The class stores train and test datasets.
        If neither of the datasets was provided,
        both train and test datasets will be created using
        DatasetFactory, called with ``train=True`` / ``train=False`` plus ``kwargs``
        (so ``kwargs`` must then contain the registered dataset ``name``).

        :param train: train dataset (optional)
        :param test: test dataset (optional)
        :param kwargs: parameters forwarded to DatasetFactory.create_dataset when no dataset is given
        """
        # Identity test instead of truthiness: a dataset that defines __len__ would otherwise be
        # truth-tested through it, and an empty one would count as "not provided".
        if train is not None or test is not None:
            self.train = train
            self.test = test
        else:
            self.train = DatasetFactory.create_dataset(train=True, **kwargs)
            self.test = DatasetFactory.create_dataset(train=False, **kwargs)

    def get_train_set(self) -> Dataset:
        """Return train DatasetBase"""
        return self.train

    def get_test_set(self) -> Dataset:
        """Return test DatasetBase"""
        return self.test

    def get_train_samples(self) -> Collection[Any]:
        """Return train set samples"""
        return self.train.get_samples()

    def get_train_labels(self) -> Collection[Any]:
        """Return train set labels"""
        return self.train.get_labels()

    def get_test_samples(self) -> Collection[Any]:
        """Return test set samples"""
        return self.test.get_samples()

    def get_test_labels(self) -> Collection[Any]:
        """Return test set labels"""
        return self.test.get_labels()
|
||||
2
apt/utils/models/__init__.py
Normal file
2
apt/utils/models/__init__.py
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
from apt.utils.models.model import Model, ModelOutputType
|
||||
from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
|
||||
109
apt/utils/models/model.py
Normal file
109
apt/utils/models/model.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
from abc import ABCMeta, abstractmethod
|
||||
from typing import Any, Optional
|
||||
from enum import Enum, auto
|
||||
|
||||
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
|
||||
|
||||
|
||||
class ModelOutputType(Enum):
    """Kinds of output a wrapped model yields."""

    # probabilities or logits
    CLASSIFIER_VECTOR = auto()
    # label only
    CLASSIFIER_SCALAR = auto()
    # value
    REGRESSOR_SCALAR = auto()
|
||||
|
||||
|
||||
class Model(metaclass=ABCMeta):
    """
    Abstract base class for ML model wrappers.
    """

    def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `Model` wrapper object.

        :param model: The original model object (of the underlying ML framework)
        :param output_type: The type of output the model yields (vector/label only for classifiers,
                            value for regressors)
        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                                 Set to True if the model is only available via query (API) access, i.e.,
                                 only the outputs of the model are exposed, and False if the model internals
                                 are also available. Optional, Default is True.
        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                                  unlimited queries to the model API or whether there is a limit to the number of
                                  queries that can be submitted. Optional, Default is True.
        """
        self._model = model
        self._output_type = output_type
        self._black_box_access = black_box_access
        self._unlimited_queries = unlimited_queries

    @abstractmethod
    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data.
        :type train_data: `Dataset`
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :return: Predictions from the model.
        """
        raise NotImplementedError

    @abstractmethod
    def score(self, test_data: Dataset, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        """
        # raise, like fit and predict: returning the exception class handed callers a bogus "score"
        raise NotImplementedError

    @property
    def model(self) -> Any:
        """
        Return the model.

        :return: The model.
        """
        return self._model

    @property
    def output_type(self) -> ModelOutputType:
        """
        Return the model's output type.

        :return: The model's output type.
        """
        return self._output_type

    @property
    def black_box_access(self) -> bool:
        """
        Return True if the model is only available via query (API) access, i.e.,
        only the outputs of the model are exposed, and False if the model internals are also available.

        :return: True if the model is only available via query (API) access, i.e.,
                 only the outputs of the model are exposed, and False if the model internals are also available.
        """
        return self._black_box_access

    @property
    def unlimited_queries(self) -> bool:
        """
        If black_box_access is True, Return whether a user can perform unlimited queries to the model API
        or whether there is a limit to the number of queries that can be submitted.

        :return: If black_box_access is True, Return whether a user can perform unlimited queries to the model API
                 or whether there is a limit to the number of queries that can be submitted.
        """
        return self._unlimited_queries
|
||||
112
apt/utils/models/sklearn_model.py
Normal file
112
apt/utils/models/sklearn_model.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.base import BaseEstimator
|
||||
|
||||
from apt.utils.models import Model, ModelOutputType
|
||||
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
|
||||
|
||||
from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
|
||||
from art.estimators.regression.scikitlearn import ScikitlearnRegressor
|
||||
|
||||
|
||||
class SklearnModel(Model):
    """
    Wrapper class for scikitlearn models.
    """

    def score(self, test_data: Dataset, **kwargs):
        """
        Score the model using test data.

        Delegates to the wrapped model's own ``score`` method with the samples and labels of `test_data`.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :return: Whatever the wrapped model's ``score`` returns.
        """
        score_fn = self.model.score
        samples = test_data.get_samples()
        labels = test_data.get_labels()
        return score_fn(samples, labels, **kwargs)
|
||||
|
||||
|
||||
class SklearnClassifier(SklearnModel):
    """
    Wrapper class for scikitlearn classification models.
    """

    def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `SklearnClassifier` wrapper object.

        :param model: The original sklearn model object.
        :param output_type: The type of output the model yields (vector/label only for classifiers,
                            value for regressors)
        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                                 Set to True if the model is only available via query (API) access, i.e.,
                                 only the outputs of the model are exposed, and False if the model internals
                                 are also available. Optional, Default is True.
        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                                  unlimited queries to the model API or whether there is a limit to the number of
                                  queries that can be submitted. Optional, Default is True.
        """
        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
        # fitting and prediction go through the ART wrapper around the same sklearn model
        self._art_model = ArtSklearnClassifier(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        The labels are one-hot encoded before being handed to the ART wrapper.

        :param train_data: Training data.
        :type train_data: `Dataset`
        """
        # Densify with toarray() instead of OneHotEncoder(sparse=False): that keyword was renamed to
        # `sparse_output` and later removed, so this form works across scikit-learn versions.
        encoder = OneHotEncoder()
        y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1)).toarray()
        self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset` (a raw array is passed through unchanged)
        :return: Predictions from the model (class probabilities, if supported).
        """
        # the ART wrapper needs the sample array, not the Dataset wrapper around it
        samples = x.get_samples() if isinstance(x, Dataset) else x
        return self._art_model.predict(samples, **kwargs)
|
||||
|
||||
|
||||
class SklearnRegressor(SklearnModel):
    """
    Wrapper class for scikitlearn regression models.
    """

    def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `SklearnRegressor` wrapper object.

        The output type is always `ModelOutputType.REGRESSOR_SCALAR`.

        :param model: The original sklearn model object.
        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                                 Set to True if the model is only available via query (API) access, i.e.,
                                 only the outputs of the model are exposed, and False if the model internals
                                 are also available. Optional, Default is True.
        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                                  unlimited queries to the model API or whether there is a limit to the number of
                                  queries that can be submitted. Optional, Default is True.
        """
        super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
        # fitting and prediction go through the ART wrapper around the same sklearn model
        self._art_model = ScikitlearnRegressor(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data.
        :type train_data: `Dataset`
        """
        self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset` (a raw array is passed through unchanged)
        :return: Predictions from the model.
        """
        # the ART wrapper needs the sample array, not the Dataset wrapper around it
        samples = x.get_samples() if isinstance(x, Dataset) else x
        return self._art_model.predict(samples, **kwargs)
|
||||
Loading…
Add table
Add a link
Reference in a new issue