Documentation updates (#29)

* Bump version to 0.1.0 (breaking changes to some APIs) * Update documentation * Update requirements * gitignore
2026-05-11 21:12:37 +02:00 · 2022-05-02 11:46:18 +03:00 · 2022-05-02 11:46:18 +03:00 · fd6be8e778
commit fd6be8e778
parent 014aed9670
12 changed files with 640 additions and 298 deletions
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@ -21,7 +21,8 @@ from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnCl


 class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
-    """ A transformer that generalizes data to representative points.
+    """
+    A transformer that generalizes data to representative points.

    Learns data generalizations based on an original model's predictions
    and a target accuracy. Once the generalizations are learned, can
@ -34,52 +35,33 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    need to supply an existing ``estimator`` to init.
    In summary, either ``estimator`` and ``target_accuracy`` should be
    supplied or ``cells`` should be supplied.
-    Parameters
-    ----------
-    estimator : estimator, optional
-        The original model for which generalization is being performed.
-        Should be pre-fitted.
-    target_accuracy : float, optional
-        The required accuracy when applying the base model to the
-        generalized data. Accuracy is measured relative to the original
-        accuracy of the model.
-    categorical_features: list of str, optional
-        The list of categorical features should only be supplied when
-         passing data as a pandas dataframe.
-    features_to_minimize: List of str or numbers, optional
-        The features that need to be minimized in case of pandas data,
-         and indexes of features in case of numpy data.
-    cells : list of object, optional
-        The cells used to generalize records. Each cell must define a
-        range or subset of categories for each feature, as well as a
-        representative value for each feature.
-        This parameter should be used when instantiating a transformer
-        object without first fitting it.
-    train_only_QI : Bool, optional
-        The required method to train data set for minimizing. Default is
-        to train the tree just on the features that are given as
-        features_to_minimize.
-    is_regression : Bool, optional
-        Whether the model is a regression model or not (if False, assumes
-        a classification model). Default is False.

-    Attributes
-    ----------
-    features_ : list of str
-        The feature names, in the order that they appear in the data.
-    cells_ : list of object
-        The cells used to generalize records, as learned when calling fit.
-    ncp_ : float
-        The NCP (information loss) score of the resulting generalization,
-        as measured on the training data.
-    generalizations_ : object
-        The generalizations that were learned (actual feature ranges).
+    :param estimator: The original model for which generalization is being performed. Should be pre-fitted.
+    :type estimator: sklearn `BaseEstimator` or `Model`
+    :param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
+                            Accuracy is measured relative to the original accuracy of the model.
+    :type target_accuracy: float, optional
+    :param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
+                  each feature, as well as a representative value for each feature. This parameter should be used
+                  when instantiating a transformer object without first fitting it.
+    :type cells: list of objects, optional
+    :param categorical_features: The list of categorical features (if supplied, these features will be one-hot
+                                 encoded before using them to train the decision tree model).
+    :type categorical_features: list of strings, optional
+    :param features_to_minimize: The features to be minimized.
+    :type features_to_minimize: list of strings or int, optional
+    :param train_only_QI: Whether to train the tree just on the ``features_to_minimize`` or on all features. Default
+                          is only on ``features_to_minimize``.
+    :type train_only_QI: boolean, optional
+    :param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
+                          Default is False.
+    :type is_regression: boolean, optional
    """

-    def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
-                 cells: list = None, categorical_features: Union[np.ndarray, list] = None,
-                 features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True,
-                 is_regression: bool = False):
+    def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
+                 cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
+                 features_to_minimize: Optional[Union[np.ndarray, list]] = None, train_only_QI: Optional[bool] = True,
+                 is_regression: Optional[bool] = False):
        if issubclass(estimator.__class__, Model):
            self.estimator = estimator
        else:
@ -97,18 +79,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        self.is_regression = is_regression

    def get_params(self, deep=True):
-        """Get parameters for this estimator.
+        """
+        Get parameters

-        Parameters
-        ----------
-        deep : boolean, optional
-            If True, will return the parameters for this estimator and contained
-            subobjects that are estimators.
-
-        Returns
-        -------
-        params : mapping of string to any
-            Parameter names mapped to their values.
+        :param deep: If True, will return the parameters for this estimator and contained
+                     sub-objects that are estimators.
+        :type deep: boolean, optional
+        :return: Parameter names mapped to their values
        """
        ret = {}
        ret['target_accuracy'] = self.target_accuracy
@ -120,12 +97,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        return ret

    def set_params(self, **params):
-        """Set the parameters of this estimator.
+        """
+        Set parameters

-        Returns
-        -------
-        self : object
-            Returns self.
+        :param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
+                                Accuracy is measured relative to the original accuracy of the model.
+        :type target_accuracy: float, optional
+        :param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
+                      each feature, as well as a representative value for each feature. This parameter should be used
+                      when instantiating a transformer object without first fitting it.
+        :type cells: list of objects, optional
+        :return: self
        """
        if 'target_accuracy' in params:
            self.target_accuracy = params['target_accuracy']
@ -135,29 +117,32 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

    @property
    def generalizations(self):
+        """
+        Return the generalizations derived from the model and test data.
+
+        :return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features,
+                                 'categories' that contains sub-groups of categories for categorical features, and
+                                 'untouched' that contains the features that could not be generalized.
+        """
        return self.generalizations_

    def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
-                      features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
-        """Learns the generalizations based on training data, and applies them to the data.
+                      features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
+        """
+        Learns the generalizations based on training data, and applies them to the data.

-        Parameters
-        ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
-            The training input samples.
-        y : array-like, shape (n_samples,), optional
-            The target values. An array of int.
-            This should contain the predictions of the original model on ``X``.
-        features_names : list of str, The feature names, in the order that they appear in the data,
-                        provided just if X and y were provided (optional).
-        dataset : Data wrapper containing the training input samples and the predictions of the
-                  original model on the training data.
-        Either X,y OR dataset need to be provided, not both.
-        Returns
-        -------
-        X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
-            The array containing the representative values to which each record in
-            ``X`` is mapped.
+        :param X: The training input samples.
+        :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
+        :param y: The target values. This should contain the predictions of the original model on ``X``.
+        :type y: array-like, shape (n_samples,), optional
+        :param features_names: The feature names, in the order that they appear in the data. Can be provided when
+                               passing the data as ``X`` and ``y``
+        :type features_names: list of strings, optional
+        :param dataset: Data wrapper containing the training input samples and the predictions of the original model
+                        on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
+        :type dataset: `ArrayDataset`, optional
+        :return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
+                 pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
        """
        self.fit(X, y, features_names, dataset=dataset)
        return self.transform(X, features_names, dataset=dataset)
@ -166,23 +151,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            features_names: Optional = None, dataset: ArrayDataset = None):
        """Learns the generalizations based on training data.

-        Parameters
-        ----------
-        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
-            The training input samples.
-        y : array-like, shape (n_samples,), optional
-            The target values. An array of int.
-            This should contain the predictions of the original model on ``X``.
-        features_names : list of str, The feature names, in the order that they appear in the data,
-                        provided just if X and y were provided (optional).
-        dataset : Data wrapper containing the training input samples and the predictions of the
-                  original model on the training data.
-        Either X,y OR dataset need to be provided, not both.
-        Returns
-        -------
-        X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
-            The array containing the representative values to which each record in
-            ``X`` is mapped.
+        :param X: The training input samples.
+        :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
+        :param y: The target values. This should contain the predictions of the original model on ``X``.
+        :type y: array-like, shape (n_samples,), optional
+        :param features_names: The feature names, in the order that they appear in the data. Can be provided when
+                               passing the data as ``X`` and ``y``
+        :type features_names: list of strings, optional
+        :param dataset: Data wrapper containing the training input samples and the predictions of the original model
+                        on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
+        :type dataset: `ArrayDataset`, optional
+        :return: self
        """

        # take into account that estimator, X, y, cells, features may be None
@ -242,7 +221,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                used_X_train = X_train_QI

            # collect feature data (such as min, max)
-
            feature_data = {}
            for feature in self._features:
                if feature not in feature_data.keys():
@ -386,23 +364,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        # Return the transformer
        return self

-    def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
+    def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
+                  dataset: Optional[ArrayDataset] = None):
        """ Transforms data records to representative points.

-        Parameters
-        ----------
-        X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
-         may contain both numeric and categorical data.
-            The input samples.
-        features_names : list of str, The feature names, in the order that they appear in the data,
-                        provided just if X was provided (optional).
-        dataset : Data wrapper containing the training input samples.
-        Either X OR dataset need to be provided, not both.
-        Returns
-        -------
-        X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
-            The array containing the representative values to which each record in
-            ``X`` is mapped.
+        :param X: The input samples to transform. If provided as a pandas DataFrame, may contain both numeric and
+                  categorical data.
+        :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
+        :param features_names: The feature names, in the order that they appear in the data. Can be provided when
+                               passing the data as ``X``.
+        :type features_names: list of strings, optional
+        :param dataset: Data wrapper containing the input samples. Either ``X`` OR ``dataset`` needs to be provided,
+                        not both.
+        :type dataset: `ArrayDataset`, optional
+        :return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
+                 pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
        """

        # Check if fit has been called