Make data minimization more consistent and performant (#83)

* Update requirements

* Update incompatible scipy version

* Reduce runtime of dataset assessment tests

* ncp is now a class that contains 3 values: fit_score, transform_score and generalizations_score. This way, all calculated ncp scores are stored, regardless of the order in which the different methods are called.
Generalizations can now be applied either from the tree cells or from the global generalizations struct, depending on the value of generalize_using_transform. Representative values can also be computed from the global generalizations.
Removing a feature from the generalization is also supported in either mode.

* Compute generalizations with test data when possible (for computing better representatives).

* Externalize common test code to methods.
This commit is contained in:
abigailgold 2023-08-21 18:39:15 +03:00 committed by GitHub
parent e9a225501f
commit 13a0567183
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 1004 additions and 689 deletions

View file

@ -15,11 +15,12 @@ import pandas as pd
import logging
import torch
from torch import Tensor
from scipy.sparse import csr_matrix
logger = logging.getLogger(__name__)
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor, csr_matrix]
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
@ -29,14 +30,16 @@ def array2numpy(arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
"""
converts from INPUT_DATA_ARRAY_TYPE to numpy array
"""
if type(arr) == np.ndarray:
if isinstance(arr, np.ndarray):
return arr
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
if isinstance(arr, pd.DataFrame) or isinstance(arr, pd.Series):
return arr.to_numpy()
if isinstance(arr, list):
return np.array(arr)
if type(arr) == Tensor:
if isinstance(arr, Tensor):
return arr.detach().cpu().numpy()
if isinstance(arr, csr_matrix):
return arr.toarray()
raise ValueError("Non supported type: ", type(arr).__name__)
@ -45,14 +48,16 @@ def array2torch_tensor(arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
"""
converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
"""
if type(arr) == np.ndarray:
if isinstance(arr, np.ndarray):
return torch.from_numpy(arr)
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
if isinstance(arr, pd.DataFrame) or isinstance(arr, pd.Series):
return torch.from_numpy(arr.to_numpy())
if isinstance(arr, list):
return torch.tensor(arr)
if type(arr) == Tensor:
if isinstance(arr, Tensor):
return arr
if isinstance(arr, csr_matrix):
return torch.from_numpy(arr.toarray())
raise ValueError("Non supported type: ", type(arr).__name__)
@ -217,7 +222,7 @@ class ArrayDataset(Dataset):
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional[list] = None, **kwargs):
self.is_pandas = self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
self.is_pandas = self.is_pandas = isinstance(x, pd.DataFrame) or isinstance(x, pd.Series)
self.features_names = features_names
self._y = array2numpy(y) if y is not None else None
@ -325,7 +330,7 @@ class PytorchData(Dataset):
self._y = array2torch_tensor(y) if y is not None else None
self._x = array2torch_tensor(x)
self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
self.is_pandas = isinstance(x, pd.DataFrame) or isinstance(x, pd.Series)
if self.is_pandas:
self.features_names = x.columns