This commit is contained in:
olasaadi 2022-03-23 17:54:37 +02:00
parent 312469212e
commit 06158c8508
2 changed files with 12 additions and 8 deletions

View file

@ -52,13 +52,13 @@ class Anonymize:
else:
raise ValueError('No data provided')
transformed = self._anonymize_ndarray(dataset.get_samples().copy(), dataset.get_labels())
transformed = self._anonymize(dataset.get_samples().copy(), dataset.get_labels())
if dataset.is_pandas:
return pd.DataFrame(transformed, columns=self._features)
else:
return transformed
def _anonymize_ndarray(self, x, y):
def _anonymize(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
x_anonymizer_train = x[:, self.quasi_identifiers]
@ -75,7 +75,7 @@ class Anonymize:
self.anonymizer.fit(x_prepared, y)
cells_by_id = self._calculate_cells(x, x_prepared)
return self._anonymize_data_numpy(x, x_prepared, cells_by_id)
return self._anonymize_data(x, x_prepared, cells_by_id)
def _calculate_cells(self, x, x_anonymizer_train):
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
@ -129,7 +129,7 @@ class Anonymize:
node_ids = self._find_sample_nodes(samples)
return [cells_by_id[node_id] for node_id in node_ids]
def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
def _anonymize_data(self, x, x_anonymizer_train, cells_by_id):
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
index = 0
for row in x:

View file

@ -41,13 +41,14 @@ def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
raise ValueError('Non supported type: ', type(arr).__name__)
def array2torch_tensor(arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
def array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
"""
converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
"""
if type(arr) == np.ndarray:
return torch.from_numpy(arr)
if type(arr) == pd.DataFrame:
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
self.is_pandas = True
return torch.from_numpy(arr.to_numpy())
if isinstance(arr, list):
return torch.tensor(arr)
@ -198,8 +199,11 @@ class PytorchData(Dataset):
:param y: collection of labels (optional)
:param kwargs: dataset parameters
"""
self._x = array2torch_tensor(x)
self._y = array2torch_tensor(y) if y is not None else None
self.is_pandas = False
self._y = array2torch_tensor(self, y) if y is not None else None
self._x = array2torch_tensor(self, x)
if self.is_pandas:
self.features_names = x.columns
if y is not None and len(self._x) != len(self._y):
raise ValueError('Non equivalent lengths of x and y')