From e72f0b6c9b3c990369d99d5604eeb6b387e49a15 Mon Sep 17 00:00:00 2001
From: Shen
Date: Fri, 8 Sep 2017 15:10:40 +0200
Subject: [PATCH] Input warping using Kumar warping

---
 GPy/models/__init__.py              |   1 +
 GPy/models/input_warped_gp.py       | 149 ++++++++++++++++
 GPy/testing/model_tests.py          |  60 +++++++
 GPy/util/__init__.py                |   1 +
 GPy/util/input_warping_functions.py | 262 ++++++++++++++++++++++++++++
 5 files changed, 473 insertions(+)
 create mode 100644 GPy/models/input_warped_gp.py
 create mode 100644 GPy/util/input_warping_functions.py

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index ce7ba50b..35d11cd7 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -9,6 +9,7 @@ from .gplvm import GPLVM
 from .bcgplvm import BCGPLVM
 from .sparse_gplvm import SparseGPLVM
 from .warped_gp import WarpedGP
+from .input_warped_gp import InputWarpedGP
 from .bayesian_gplvm import BayesianGPLVM
 from .mrd import MRD
 from .gradient_checker import GradientChecker, HessianChecker, SkewChecker
diff --git a/GPy/models/input_warped_gp.py b/GPy/models/input_warped_gp.py
new file mode 100644
index 00000000..6c31ebcf
--- /dev/null
+++ b/GPy/models/input_warped_gp.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+
+from ..core import GP
+from .. import likelihoods
+from ..util.input_warping_functions import KumarWarping
+from .. import kern
+
+
+class InputWarpedGP(GP):
+    """Input Warped GP
+
+    This defines a GP model that applies a warping function to the input.
+    By default it uses Kumar warping (the CDF of the Kumaraswamy distribution).
+
+    Parameters
+    ----------
+    X : array_like, shape = (n_samples, n_features)
+        Input data
+
+    Y : array_like, shape = (n_samples, 1)
+        Output data
+
+    kernel : object, optional
+        An instance of a kernel function defined in GPy.kern
+        Defaults to Matern 32
+
+    warping_function : object, optional
+        An instance of a warping function defined in GPy.util.input_warping_functions
+        Defaults to KumarWarping
+
+    warping_indices : list of int, optional
+        A list of indices of the features in X that should be warped.
+        It is used in the Kumar warping function
+
+    normalizer : bool, optional
+        Whether to normalize the output
+
+    Xmin : list of float, optional
+        The min value for every feature in X
+        It is used in the Kumar warping function
+
+    Xmax : list of float, optional
+        The max value for every feature in X
+        It is used in the Kumar warping function
+
+    epsilon : float, optional
+        X is normalized to [0+e, 1-e]. If not given, the default value defined
+        in the KumarWarping function is used
+
+    Attributes
+    ----------
+    X_untransformed : array_like, shape = (n_samples, n_features)
+        A copy of the original input X
+
+    X_warped : array_like, shape = (n_samples, n_features)
+        Input data after warping
+
+    warping_function : object, optional
+        An instance of a warping function defined in GPy.util.input_warping_functions
+        Defaults to KumarWarping
+
+    Notes
+    -----
+    Kumar warping uses the CDF of the Kumaraswamy distribution. More on the Kumaraswamy
+    distribution can be found at the wiki page: https://en.wikipedia.org/wiki/Kumaraswamy_distribution
+
+    References
+    ----------
+    Snoek, J.; Swersky, K.; Zemel, R. S. & Adams, R. P.
+    Input Warping for Bayesian Optimization of Non-stationary Functions
+    preprint arXiv:1402.0929, 2014
+    """
+    def __init__(self, X, Y, kernel=None, normalizer=False, warping_function=None, warping_indices=None, Xmin=None, Xmax=None, epsilon=None):
+        if X.ndim == 1:
+            X = X.reshape(-1, 1)
+        self.X_untransformed = X.copy()
+
+        if kernel is None:
+            kernel = kern.sde_Matern32(X.shape[1], variance=1.)
+        self.kernel = kernel
+
+        if warping_function is None:
+            self.warping_function = KumarWarping(self.X_untransformed, warping_indices, epsilon, Xmin, Xmax)
+        else:
+            self.warping_function = warping_function
+
+        self.X_warped = self.transform_data(self.X_untransformed)
+        likelihood = likelihoods.Gaussian()
+        super(InputWarpedGP, self).__init__(self.X_warped, Y, likelihood=likelihood, kernel=kernel, normalizer=normalizer)
+
+        # Add the parameters of the warping function to the model parameter hierarchy
+        self.link_parameter(self.warping_function)
+
+    def parameters_changed(self):
+        """Update the gradients of the parameters of the warping function
+
+        This method is called whenever the warping function parameters, the kernel
+        parameters or the other parameters of a normal GP take new values
+        """
+        # re-warp X with the current warping parameters
+        self.X = self.transform_data(self.X_untransformed)
+        super(InputWarpedGP, self).parameters_changed()
+        # by the chain rule, the gradient of the log likelihood w.r.t. the input AFTER warping is the product of dL_dK and dK_dX
+        dL_dX = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X)
+        self.warping_function.update_grads(self.X_untransformed, dL_dX)
+
+    def transform_data(self, X, test_data=False):
+        """Apply the warping function to some input data
+
+        Parameters
+        ----------
+        X : array_like, shape = (n_samples, n_features)
+
+        test_data : bool, optional
+            Defaults to False; set to True when transforming test data
+        """
+        return self.warping_function.f(X, test_data)
+
+    def log_likelihood(self):
+        """Compute the marginal log likelihood
+
+        For input warping, the normal GP log likelihood is used unchanged
+        """
+        return GP.log_likelihood(self)
+
+    def predict(self, Xnew):
+        """Prediction on new data
+
+        Parameters
+        ----------
+        Xnew : array_like, shape = (n_samples, n_features)
+            The test data.
+
+        Returns
+        -------
+        mean : array_like, shape = (n_samples, output_dim)
+            Posterior mean at the locations in Xnew
+
+        var : array_like, shape = (n_samples, 1)
+            Posterior variance at the locations in Xnew
+        """
+        Xnew_warped = self.transform_data(Xnew, test_data=True)
+        mean, var = super(InputWarpedGP, self).predict(Xnew_warped, kern=self.kernel, full_cov=False)
+        return mean, var
+
+if __name__ == '__main__':
+    X = np.random.randn(100, 1)
+    Y = np.sin(X) + np.random.randn(100, 1)*0.05
+    m = InputWarpedGP(X, Y)
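A minimal usage sketch of the model added above (assuming a GPy checkout with
this patch applied; the toy data here is made up for illustration):

    import numpy as np
    import GPy

    # a 1-D function whose length-scale shrinks as x grows (non-stationary)
    np.random.seed(0)
    X = np.random.uniform(0, 10, (100, 1))
    Y = np.sin(X ** 1.5) + np.random.randn(100, 1) * 0.05

    # by default, every input dimension gets its own Kumar (a, b) pair
    m = GPy.models.InputWarpedGP(X, Y)
    m.optimize()

    # predict() warps the test inputs with the learned parameters first
    Xnew = np.linspace(0, 10, 50).reshape(-1, 1)
    mean, var = m.predict(Xnew)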
+ """ + k = GPy.kern.RBF(1) + m = GPy.models.GPRegression(self.X, self.Y, kernel=k) + m.optimize() + preds = m.predict(self.X) + + warp_k = GPy.kern.RBF(1) + warp_f = GPy.util.input_warping_functions.IdentifyWarping() + warp_m = GPy.models.InputWarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f) + warp_m.optimize() + warp_preds = warp_m.predict(self.X) + + np.testing.assert_almost_equal(preds, warp_preds, decimal=4) + + def test_kumar_warping_gradient(self): + n_X = 100 + np.random.seed(0) + X = np.random.randn(n_X, 2) + Y = np.sum(np.sin(X), 1).reshape(n_X, 1) + + k1 = GPy.kern.Linear(2) + m1 = GPy.models.InputWarpedGP(X, Y, kernel=k1) + m1.randomize() + self.assertEquals(m1.checkgrad(), True) + + k2 = GPy.kern.RBF(2) + m2 = GPy.models.InputWarpedGP(X, Y, kernel=k2) + m2.randomize() + m2.checkgrad() + self.assertEquals(m2.checkgrad(), True) + + k3 = GPy.kern.Matern52(2) + m3 = GPy.models.InputWarpedGP(X, Y, kernel=k3) + m3.randomize() + m3.checkgrad() + self.assertEquals(m3.checkgrad(), True) + + def test_kumar_warping_parameters(self): + np.random.seed(1) + X = np.random.rand(5, 2) + epsilon = 1e-6 + + # testing warping indices + warping_ind_1 = [0, 1, 2] + warping_ind_2 = [-1, 1, 2] + warping_ind_3 = [0, 1.5, 2] + self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_1) + self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_2) + self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_3) + + # testing Xmin and Xmax + Xmin_1, Xmax_1 = None, [1, 1] + Xmin_2, Xmax_2 = [0, 0], None + Xmin_3, Xmax_3 = [0, 0, 0], [1, 1] + self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_1, Xmax_1) + self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_2, Xmax_2) + self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_3, Xmax_3) + def test_warped_gp_identity(self): """ A WarpedGP with the identity warping function should be diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index 4994ddcb..0a4f48d6 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -17,4 +17,4 @@ from . import multioutput from . import parallel from . import functions from . import cluster_with_offset -from . import quad_integrate +from . import input_warping_functions diff --git a/GPy/util/input_warping_functions.py b/GPy/util/input_warping_functions.py new file mode 100644 index 00000000..0f46c3d7 --- /dev/null +++ b/GPy/util/input_warping_functions.py @@ -0,0 +1,262 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from ..core.parameterization import Parameterized, Param +from ..core.parameterization.priors import LogGaussian + + +class InputWarpingFunction(Parameterized): + """Abstract class for input warping functions + """ + + def __init__(self, name): + super(InputWarpingFunction, self).__init__(name=name) + + def f(self, X, test=False): + + raise NotImplementedError + + def fgrad_x(self, X): + raise NotImplementedError + + def update_grads(self, X, dL_dW): + raise NotImplementedError + + +class IdentifyWarping(InputWarpingFunction): + """The identity warping function, for testing""" + def __init__(self): + super(IdentifyWarping, self).__init__(name='input_warp_identity') + + def f(self, X, test_data=False): + return X + + def fgrad_X(self, X): + return np.zeros(X.shape) + + def update_grads(self, X, dL_dW): + pass + + +class InputWarpingTest(InputWarpingFunction): + """The identity warping function, for testing""" + def __init__(self): + super(InputWarpingTest, self).__init__(name='input_warp_test') + self.a = Param('a', 1.0) + self.set_prior(LogGaussian(0.0, 0.75)) + self.link_parameter(self.a) + + def f(self, X, test_data=False): + return X * self.a + + def fgrad_X(self, X): + return self.ones(X.shape) * self.a + + def update_grads(self, X, dL_dW): + self.a.gradient[:] = np.sum(dL_dW * X) + + +class KumarWarping(InputWarpingFunction): + """Kumar Warping for input data + + Parameters + ---------- + X : array_like, shape = (n_samples, n_features) + The input data that is going to be warped + + warping_indices: list of int, optional + The features that are going to be warped + Default to warp all the features + + epsilon: float, optional + Used to normalized input data to [0+e, 1-e] + Default to 1e-6 + + Xmin : list of float, Optional + The min values for each feature defined by users + Default to the train minimum + + Xmax : list of float, Optional + The max values for each feature defined by users + Default to the train maximum + + Attributes + ---------- + warping_indices: list of int + The features that are going to be warped + Default to warp all the features + + warping_dim: int + The number of features to be warped + + Xmin : list of float + The min values for each feature defined by users + Default to the train minimum + + Xmax : list of float + The max values for each feature defined by users + Default to the train maximum + + epsilon: float + Used to normalized input data to [0+e, 1-e] + Default to 1e-6 + + X_normalized : array_like, shape = (n_samples, n_features) + The normalized training X + + scaling : list of float, length = n_features in X + Defined as 1.0 / (self.Xmax - self.Xmin) + + params : list of Param + The list of all the parameters used in Kumar Warping + + num_parameters: int + The number of parameters used in Kumar Warping + """ + + def __init__(self, X, warping_indices=None, epsilon=None, Xmin=None, Xmax=None): + + super(KumarWarping, self).__init__(name='input_warp_kumar') + + if warping_indices is not None and np.max(warping_indices) > X.shape[1] -1: + raise ValueError("Kumar warping indices exceed feature dimension") + + if warping_indices is not None and np.min(warping_indices) < 0: + raise ValueError("Kumar warping indices should be larger than 0") + + if warping_indices is not None and np.any(list(map(lambda x: not isinstance(x, int), warping_indices))): + raise ValueError("Kumar warping indices should be integer") + + if Xmin is None and Xmax is None: + Xmin = X.min(axis=0) + 
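The module added below implements the warping itself. As a quick standalone
illustration (plain numpy, independent of GPy) of the transform it applies,
f(x) = 1 - (1 - x^a)^b on normalized inputs:

    import numpy as np

    def kumar_warp(x, a, b):
        # CDF of the Kumaraswamy(a, b) distribution, monotone on [0, 1]
        return 1 - (1 - x ** a) ** b

    x = np.linspace(1e-6, 1 - 1e-6, 5)
    print(kumar_warp(x, 1.0, 1.0))  # a = b = 1 recovers the identity
    print(kumar_warp(x, 0.5, 0.5))  # steep near 0 and 1: stretches the ends
    print(kumar_warp(x, 2.0, 2.0))  # sigmoid-like: flat near 0 and 1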
diff --git a/GPy/util/input_warping_functions.py b/GPy/util/input_warping_functions.py
new file mode 100644
index 00000000..0f46c3d7
--- /dev/null
+++ b/GPy/util/input_warping_functions.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..core.parameterization import Parameterized, Param
+from ..core.parameterization.priors import LogGaussian
+
+
+class InputWarpingFunction(Parameterized):
+    """Abstract class for input warping functions
+    """
+
+    def __init__(self, name):
+        super(InputWarpingFunction, self).__init__(name=name)
+
+    def f(self, X, test_data=False):
+        raise NotImplementedError
+
+    def fgrad_X(self, X):
+        raise NotImplementedError
+
+    def update_grads(self, X, dL_dW):
+        raise NotImplementedError
+
+
+class IdentityWarping(InputWarpingFunction):
+    """The identity warping function, for testing"""
+    def __init__(self):
+        super(IdentityWarping, self).__init__(name='input_warp_identity')
+
+    def f(self, X, test_data=False):
+        return X
+
+    def fgrad_X(self, X):
+        # the derivative of the identity map is one everywhere
+        return np.ones(X.shape)
+
+    def update_grads(self, X, dL_dW):
+        pass
+
+
+class InputWarpingTest(InputWarpingFunction):
+    """A simple linear scaling warping function, for testing"""
+    def __init__(self):
+        super(InputWarpingTest, self).__init__(name='input_warp_test')
+        self.a = Param('a', 1.0)
+        self.a.set_prior(LogGaussian(0.0, 0.75))
+        self.link_parameter(self.a)
+
+    def f(self, X, test_data=False):
+        return X * self.a
+
+    def fgrad_X(self, X):
+        return np.ones(X.shape) * self.a
+
+    def update_grads(self, X, dL_dW):
+        self.a.gradient[:] = np.sum(dL_dW * X)
+
+
+class KumarWarping(InputWarpingFunction):
+    """Kumar warping for input data
+
+    Parameters
+    ----------
+    X : array_like, shape = (n_samples, n_features)
+        The input data that is going to be warped
+
+    warping_indices : list of int, optional
+        The features that are going to be warped
+        Defaults to warping all the features
+
+    epsilon : float, optional
+        Used to normalize the input data to [0+e, 1-e]
+        Defaults to 1e-6
+
+    Xmin : list of float, optional
+        The min value for each feature, defined by the user
+        Defaults to the training minimum
+
+    Xmax : list of float, optional
+        The max value for each feature, defined by the user
+        Defaults to the training maximum
+
+    Attributes
+    ----------
+    warping_indices : list of int
+        The features that are going to be warped
+        Defaults to warping all the features
+
+    warping_dim : int
+        The number of features to be warped
+
+    Xmin : list of float
+        The min value for each feature, defined by the user
+        Defaults to the training minimum
+
+    Xmax : list of float
+        The max value for each feature, defined by the user
+        Defaults to the training maximum
+
+    epsilon : float
+        Used to normalize the input data to [0+e, 1-e]
+        Defaults to 1e-6
+
+    X_normalized : array_like, shape = (n_samples, n_features)
+        The normalized training X
+
+    scaling : list of float, length = n_features in X
+        Defined as 1.0 / (self.Xmax - self.Xmin)
+
+    params : list of Param
+        The list of all the parameters used in the Kumar warping
+
+    num_parameters : int
+        The number of parameters used in the Kumar warping
+    """
+
+    def __init__(self, X, warping_indices=None, epsilon=None, Xmin=None, Xmax=None):
+        super(KumarWarping, self).__init__(name='input_warp_kumar')
+
+        if warping_indices is not None and np.max(warping_indices) > X.shape[1] - 1:
+            raise ValueError("Kumar warping indices exceed feature dimension")
+
+        if warping_indices is not None and np.min(warping_indices) < 0:
+            raise ValueError("Kumar warping indices should be non-negative")
+
+        if warping_indices is not None and np.any(list(map(lambda x: not isinstance(x, int), warping_indices))):
+            raise ValueError("Kumar warping indices should be integers")
+
+        if Xmin is None and Xmax is None:
+            Xmin = X.min(axis=0)
+            Xmax = X.max(axis=0)
+        else:
+            if Xmin is None or Xmax is None:
+                raise ValueError("Xmin and Xmax need to be provided at the same time!")
+            if len(Xmin) != X.shape[1] or len(Xmax) != X.shape[1]:
+                raise ValueError("Xmin and Xmax should have n_features values!")
+
+        if epsilon is None:
+            epsilon = 1e-6
+        self.epsilon = epsilon
+
+        # np.asarray also accepts Xmin and Xmax given as plain lists
+        self.Xmin = np.asarray(Xmin) - self.epsilon
+        self.Xmax = np.asarray(Xmax) + self.epsilon
+        self.scaling = 1.0 / (self.Xmax - self.Xmin)
+        self.X_normalized = (X - self.Xmin) / (self.Xmax - self.Xmin)
+
+        if warping_indices is None:
+            warping_indices = list(range(X.shape[1]))
+
+        self.warping_indices = warping_indices
+        self.warping_dim = len(self.warping_indices)
+        self.num_parameters = 2 * self.warping_dim
+
+        # create the parameters: one (a, b) pair per warped feature
+        self.params = [[Param('a%d' % i, 1.0), Param('b%d' % i, 1.0)] for i in range(self.warping_dim)]
+
+        # add constraints
+        for i in range(self.warping_dim):
+            self.params[i][0].constrain_bounded(0.0, 10.0)
+            self.params[i][1].constrain_bounded(0.0, 10.0)
+
+        # set priors and add the parameters into the handler
+        for i in range(self.warping_dim):
+            self.params[i][0].set_prior(LogGaussian(0.0, 0.75))
+            self.params[i][1].set_prior(LogGaussian(0.0, 0.75))
+            self.link_parameter(self.params[i][0])
+            self.link_parameter(self.params[i][1])
+
+    def f(self, X, test_data=False):
+        """Apply the warping function to some input data
+
+        Parameters
+        ----------
+        X : array_like, shape = (n_samples, n_features)
+
+        test_data : bool, optional
+            Defaults to False; set to True when transforming test data
+
+        Returns
+        -------
+        X_warped : array_like, shape = (n_samples, n_features)
+            The warped input data
+
+        Math
+        ----
+        f(x) = 1 - (1 - x^a)^b
+        """
+        X_warped = X.copy()
+        if test_data:
+            X_normalized = (X - self.Xmin) / (self.Xmax - self.Xmin)
+        else:
+            X_normalized = self.X_normalized
+
+        for i_seq, i_fea in enumerate(self.warping_indices):
+            a, b = self.params[i_seq][0], self.params[i_seq][1]
+            X_warped[:, i_fea] = 1 - np.power(1 - np.power(X_normalized[:, i_fea], a), b)
+        return X_warped
+
+    def fgrad_X(self, X):
+        """Compute the gradient of the warping function with respect to X
+
+        Parameters
+        ----------
+        X : array_like, shape = (n_samples, n_features)
+            The locations at which to compute the gradient
+
+        Returns
+        -------
+        grad : array_like, shape = (n_samples, n_features)
+            The gradient at every location in X
+
+        Math
+        ----
+        grad = a * b * x^(a-1) * (1 - x^a)^(b-1)
+        """
+        grad = np.zeros(X.shape)
+        for i_seq, i_fea in enumerate(self.warping_indices):
+            a, b = self.params[i_seq][0], self.params[i_seq][1]
+            grad[:, i_fea] = a * b * np.power(self.X_normalized[:, i_fea], a-1) * \
+                             np.power(1 - np.power(self.X_normalized[:, i_fea], a), b-1) * self.scaling[i_fea]
+        return grad
+
+    def update_grads(self, X, dL_dW):
+        """Update the gradients of the marginal log likelihood with respect to the parameters of the warping function
+
+        Parameters
+        ----------
+        X : array_like, shape = (n_samples, n_features)
+            The input BEFORE warping
+
+        dL_dW : array_like, shape = (n_samples, n_features)
+            The gradient of the marginal log likelihood with respect to the warped input
+
+        Math
+        ----
+        Let w = f(x) be the input after warping, then
+        dW_da = b * (1 - x^a)^(b - 1) * x^a * ln(x)
+        dW_db = - (1 - x^a)^b * ln(1 - x^a)
+        dL_da = dL_dW * dW_da
+        dL_db = dL_dW * dW_db
+        """
+        for i_seq, i_fea in enumerate(self.warping_indices):
+            ai, bi = self.params[i_seq][0], self.params[i_seq][1]
+
+            # cache some values to save computation
+            x_pow_a = np.power(self.X_normalized[:, i_fea], ai)
+
+            # compute the gradients for ai and bi at all the points of X
+            dz_dai = bi * np.power(1 - x_pow_a, bi-1) * x_pow_a * np.log(self.X_normalized[:, i_fea])
+            dz_dbi = - np.power(1 - x_pow_a, bi) * np.log(1 - x_pow_a)
+
+            # sum the gradients over all the data
+            dL_dai = np.sum(dL_dW[:, i_fea] * dz_dai)
+            dL_dbi = np.sum(dL_dW[:, i_fea] * dz_dbi)
+            self.params[i_seq][0].gradient[:] = dL_dai
+            self.params[i_seq][1].gradient[:] = dL_dbi
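A quick numerical sanity check of the dW_da and dW_db formulas used in
KumarWarping.update_grads, as a standalone sketch using central finite
differences (plain numpy, independent of GPy):

    import numpy as np

    def warp(x, a, b):
        return 1 - (1 - x ** a) ** b

    a, b, eps = 1.7, 0.6, 1e-6
    x = np.linspace(0.05, 0.95, 5)

    # analytic gradients, as in KumarWarping.update_grads
    dW_da = b * (1 - x ** a) ** (b - 1) * x ** a * np.log(x)
    dW_db = -((1 - x ** a) ** b) * np.log(1 - x ** a)

    # central finite differences
    num_da = (warp(x, a + eps, b) - warp(x, a - eps, b)) / (2 * eps)
    num_db = (warp(x, a, b + eps) - warp(x, a, b - eps)) / (2 * eps)

    print(np.allclose(dW_da, num_da), np.allclose(dW_db, num_db))  # True True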
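And the same kind of end-to-end gradient test the new unit tests run, as a
minimal sketch (again assuming a GPy checkout with this patch applied):

    import numpy as np
    import GPy

    np.random.seed(0)
    X = np.random.randn(50, 2)
    Y = np.sum(np.sin(X), 1).reshape(50, 1)

    m = GPy.models.InputWarpedGP(X, Y)  # default Matern 3/2 kernel, Kumar warping
    m.randomize()
    print(m.checkgrad())                # True if analytic and numeric gradients agree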