diff --git a/GPy/__init__.py b/GPy/__init__.py index 1dc3e6b1..9ee3d935 100644 --- a/GPy/__init__.py +++ b/GPy/__init__.py @@ -14,6 +14,8 @@ from . import testing from . import kern from . import plotting +from .util import normalizer + # backwards compatibility import sys backwards_compatibility = ['lists_and_dicts', 'observable_array', 'ties_and_remappings', 'index_operations'] diff --git a/GPy/__version__.py b/GPy/__version__.py index a955fdae..3e8d9f94 100644 --- a/GPy/__version__.py +++ b/GPy/__version__.py @@ -1 +1 @@ -__version__ = "1.2.1" +__version__ = "1.4.0" diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 9cbe3daf..f0910d9d 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -2,17 +2,17 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from .. import kern -from GPy.core.model import Model -from paramz import ObsAr +from .model import Model +from .parameterization.variational import VariationalPosterior from .mapping import Mapping from .. import likelihoods +from .. import kern from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation -from GPy.core.parameterization.variational import VariationalPosterior +from ..util.normalizer import Standardize +from paramz import ObsAr import logging import warnings -from GPy.util.normalizer import MeanNorm logger = logging.getLogger("GP") class GP(Model): @@ -28,7 +28,7 @@ class GP(Model): :param Norm normalizer: normalize the outputs Y. Prediction will be un-normalized using this normalizer. - If normalizer is None, we will normalize using MeanNorm. + If normalizer is None, we will normalize using Standardize. If normalizer is False, no normalization will be done. .. Note:: Multiple independent outputs are allowed using columns of Y @@ -49,7 +49,7 @@ class GP(Model): logger.info("initializing Y") if normalizer is True: - self.normalizer = MeanNorm() + self.normalizer = Standardize() elif normalizer is False: self.normalizer = None else: @@ -64,6 +64,7 @@ class GP(Model): self.Y_normalized = self.Y else: self.Y = Y + self.Y_normalized = self.Y if Y.shape[0] != self.num_data: #There can be cases where we want inputs than outputs, for example if we have multiple latent @@ -248,14 +249,16 @@ class GP(Model): """ #predict the latent function values mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern) - if self.normalizer is not None: - mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var) if include_likelihood: # now push through likelihood if likelihood is None: likelihood = self.likelihood mu, var = likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata) + + if self.normalizer is not None: + mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var) + return mu, var def predict_noiseless(self, Xnew, full_cov=False, Y_metadata=None, kern=None): @@ -300,11 +303,14 @@ class GP(Model): :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)] """ m, v = self._raw_predict(X, full_cov=False, kern=kern) - if self.normalizer is not None: - m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) if likelihood is None: likelihood = self.likelihood - return likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata) + + quantiles = likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata) + + if self.normalizer is not None: + quantiles = [self.normalizer.inverse_mean(q) for q in quantiles] + return quantiles def predictive_gradients(self, Xnew, kern=None): """ diff --git a/GPy/core/parameterization/priorizable.py b/GPy/core/parameterization/priorizable.py index 1d1153c7..eb82e862 100644 --- a/GPy/core/parameterization/priorizable.py +++ b/GPy/core/parameterization/priorizable.py @@ -16,7 +16,7 @@ class Priorizable(Parameterizable): def __setstate__(self, state): super(Priorizable, self).__setstate__(state) - self._index_operations['priors'] = self.priors + #self._index_operations['priors'] = self.priors #=========================================================================== diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index 11734564..19919f97 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -550,3 +550,34 @@ def parametric_mean_function(max_iters=100, optimize=True, plot=True): return m +def warped_gp_cubic_sine(max_iters=100): + """ + A test replicating the cubic sine regression problem from + Snelson's paper. + """ + X = (2 * np.pi) * np.random.random(151) - np.pi + Y = np.sin(X) + np.random.normal(0,0.2,151) + Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y]) + X = X[:, None] + Y = Y[:, None] + + warp_k = GPy.kern.RBF(1) + warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2) + warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f) + warp_m['.*\.d'].constrain_fixed(1.0) + m = GPy.models.GPRegression(X, Y) + m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters) + warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters) + #m.optimize(max_iters=max_iters) + #warp_m.optimize(max_iters=max_iters) + + print(warp_m) + print(warp_m['.*warp.*']) + + warp_m.predict_in_warped_space = False + warp_m.plot(title="Warped GP - Latent space") + warp_m.predict_in_warped_space = True + warp_m.plot(title="Warped GP - Warped space") + m.plot(title="Standard GP") + warp_m.plot_warping() + pb.show() diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py index 99fe809b..ca20e5a9 100644 --- a/GPy/kern/src/add.py +++ b/GPy/kern/src/add.py @@ -13,15 +13,21 @@ class Add(CombinationKernel): propagates gradients through. This kernel will take over the active dims of it's subkernels passed in. + + NOTE: The subkernels will be copies of the original kernels, to prevent + unexpected behavior. """ def __init__(self, subkerns, name='sum'): - for i, kern in enumerate(subkerns[:]): + _newkerns = [] + for kern in subkerns: if isinstance(kern, Add): - del subkerns[i] - for part in kern.parts[::-1]: + for part in kern.parts: #kern.unlink_parameter(part) - subkerns.insert(i, part.copy()) - super(Add, self).__init__(subkerns, name) + _newkerns.append(part.copy()) + else: + _newkerns.append(kern.copy()) + + super(Add, self).__init__(_newkerns, name) self._exact_psicomp = self._check_exact_psicomp() def _check_exact_psicomp(self): diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py index 5cf4a1c9..0ffd8112 100644 --- a/GPy/kern/src/static.py +++ b/GPy/kern/src/static.py @@ -135,9 +135,7 @@ class Bias(Static): def K(self, X, X2=None): shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0]) - ret = np.empty(shape, dtype=np.float64) - ret[:] = self.variance - return ret + return np.full(shape, self.variance, dtype=np.float64) def update_gradients_full(self, dL_dK, X, X2=None): self.variance.gradient = dL_dK.sum() @@ -146,9 +144,7 @@ class Bias(Static): self.variance.gradient = dL_dKdiag.sum() def psi2(self, Z, variational_posterior): - ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64) - ret[:] = self.variance*self.variance*variational_posterior.shape[0] - return ret + return np.full((Z.shape[0], Z.shape[0]), self.variance*self.variance*variational_posterior.shape[0], dtype=np.float64) def psi2n(self, Z, variational_posterior): ret = np.empty((variational_posterior.mean.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 654b1938..6d19d8a5 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -26,3 +26,5 @@ from .dpgplvm import DPBayesianGPLVM from .state_space_model import StateSpace from .ibp_lfm import IBPLFM + +from .gp_offset_regression import GPOffsetRegression diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py new file mode 100644 index 00000000..b843e08d --- /dev/null +++ b/GPy/models/gp_offset_regression.py @@ -0,0 +1,95 @@ +# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt) +# Licensed under the BSD 3-clause license (see LICENSE.txt) +# Written by Mike Smith. michaeltsmith.org.uk + +import numpy as np +from ..core import GP +from .. import likelihoods +from .. import kern +from ..core import Param + +class GPOffsetRegression(GP): + """ + Gaussian Process model for offset regression + + :param X: input observations, we assume for this class that this has one dimension of actual inputs and the last dimension should be the index of the cluster (so X should be Nx2) + :param Y: observed values (Nx1?) + :param kernel: a GPy kernel, defaults to rbf + :param Norm normalizer: [False] + :param noise_var: the noise variance for Gaussian likelhood, defaults to 1. + + Normalize Y with the norm given. + If normalizer is False, no normalization will be done + If it is None, we use GaussianNorm(alization) + + .. Note:: Multiple independent outputs are allowed using columns of Y + + """ + + def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1., mean_function=None): + + assert X.shape[1]>1, "Need at least two input dimensions, as last dimension is the label of the cluster" + if kernel is None: + kernel = kern.RBF(X.shape[1]-1) + + #self._log_marginal_likelihood = np.nan #todo + + likelihood = likelihoods.Gaussian(variance=noise_var) + self.X_fixed = X[:,:-1] + self.selected = np.array([int(x) for x in X[:,-1]]) + + + super(GPOffsetRegression, self).__init__(X, Y, kernel, likelihood, name='GP offset regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function) + maxcluster = np.max(self.selected) + self.offset = Param('offset', np.zeros(maxcluster)) + #self.offset.set_prior(...) + self.link_parameter(self.offset) + + #def dr_doffset(self, X, sel): #how much r changes wrt the offset hyperparameters + + #def dL_doffset(self, X, sel): + # dL_dr = self.dK_dr_via_X(X, X) * dL_dK + + + def dr_doffset(self,X,sel,delta): + #given an input matrix, X and the offsets (delta) + #finds dr/dDelta + #returns them as a list, one for each offset (delta). + #get the input values + + #a matrix G represents the effect of increasing the offset on the radius passed to the kernel for each input. For example + #what effect will increasing offset 4 have on the kernel output of inputs 5 and 8? Answer: Gs[4][5,8]... (positive or negative) + Gs = [] + for i,d in enumerate(delta): + #X[sel==(i+1)]-=d + G = np.repeat(np.array(sel==(i+1))[:,None]*1,len(X),axis=1) - np.repeat(np.array(sel==(i+1))[None,:]*1,len(X),axis=0) + Gs.append(G) + #does subtracting the two Xs end up positive or negative (if negative we need to flip the sign in G). + w = np.repeat(X,len(X),axis=1) - np.repeat(X.T,len(X),axis=0) + dr_doffsets = [] + for i,d in enumerate(delta): + dr_doffset = np.sign(w * Gs[i]) + #print "dr_doffset %d" % i + #print dr_doffset + #print Gs[i] + #print w + dr_doffsets.append(dr_doffset) + + #lastly we need to divide by the lengthscale: So far we've found d(X_i - X_j)/dOffsets + #we want dr/dOffsets. (X_i - X_j)/lengthscale = r + dr_doffsets /= self.kern.lengthscale + return dr_doffsets + + def parameters_changed(self): + offsets = np.hstack([0.0,self.offset.values])[:,None] + + self.X = self.X_fixed - offsets[self.selected] + super(GPOffsetRegression, self).parameters_changed() + + dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK'] + + dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values) + for i in range(len(dr_doff)): + dL_doff = dL_dr * dr_doff[i] + self.offset.gradient[i] = -np.sum(dL_doff) + diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py index df947d3e..a24401fe 100644 --- a/GPy/models/warped_gp.py +++ b/GPy/models/warped_gp.py @@ -2,65 +2,56 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ..util.warping_functions import * +#from ..util.warping_functions import * from ..core import GP from .. import likelihoods -from GPy.util.warping_functions import TanhWarpingFunction_d +from paramz import ObsAr +#from GPy.util.warping_functions import TanhFunction +from ..util.warping_functions import TanhFunction from GPy import kern class WarpedGP(GP): - def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3): - + """ + This defines a GP Regression model that applies a + warping function to the output. + """ + def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalizer=False): if kernel is None: kernel = kern.RBF(X.shape[1]) - if warping_function == None: - self.warping_function = TanhWarpingFunction_d(warping_terms) + self.warping_function = TanhFunction(warping_terms) self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1) * 1) else: self.warping_function = warping_function - - self.scale_data = False - if self.scale_data: - Y = self._scale_data(Y) - #self.has_uncertain_inputs = False - self.Y_untransformed = Y.copy() - self.predict_in_warped_space = True likelihood = likelihoods.Gaussian() - - GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel) + super(WarpedGP, self).__init__(X, Y.copy(), likelihood=likelihood, kernel=kernel, normalizer=normalizer) + self.Y_normalized = self.Y_normalized.copy() + self.Y_untransformed = self.Y_normalized.copy() + self.predict_in_warped_space = True self.link_parameter(self.warping_function) - def _scale_data(self, Y): - self._Ymax = Y.max() - self._Ymin = Y.min() - return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5 - - def _unscale_data(self, Y): - return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin + def set_XY(self, X=None, Y=None): + super(WarpedGP, self).set_XY(X, Y) + self.Y_untransformed = self.Y_normalized.copy() + self.update_model(True) def parameters_changed(self): - self.Y[:] = self.transform_data() + """ + Notice that we update the warping function gradients here. + """ + self.Y_normalized[:] = self.transform_data() super(WarpedGP, self).parameters_changed() - Kiy = self.posterior.woodbury_vector.flatten() - - grad_y = self.warping_function.fgrad_y(self.Y_untransformed) - grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, - return_covar_chain=True) - djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0) - dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0) - - warping_grads = -dquad_dpsi + djac_dpsi - - self.warping_function.psi.gradient[:] = warping_grads[:, :-1] - self.warping_function.d.gradient[:] = warping_grads[0, -1] + self.warping_function.update_grads(self.Y_untransformed, Kiy) def transform_data(self): Y = self.warping_function.f(self.Y_untransformed.copy()).copy() return Y def log_likelihood(self): + """ + Notice we add the jacobian of the warping function here. + """ ll = GP.log_likelihood(self) jacobian = self.warping_function.fgrad_y(self.Y_untransformed) return ll + np.log(jacobian).sum() @@ -73,36 +64,42 @@ class WarpedGP(GP): arg2 = np.ones(shape=gh_samples.shape).dot(mean.T) return self.warping_function.f_inv(arg1 + arg2, y=pred_init) - def _get_warped_mean(self, mean, std, pred_init=None, deg_gauss_hermite=100): + def _get_warped_mean(self, mean, std, pred_init=None, deg_gauss_hermite=20): """ Calculate the warped mean by using Gauss-Hermite quadrature. """ gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite) - gh_samples = gh_samples[:,None] - gh_weights = gh_weights[None,:] + gh_samples = gh_samples[:, None] + gh_weights = gh_weights[None, :] return gh_weights.dot(self._get_warped_term(mean, std, gh_samples)) / np.sqrt(np.pi) - def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=100): + def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=20): """ Calculate the warped variance by using Gauss-Hermite quadrature. """ gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite) - gh_samples = gh_samples[:,None] - gh_weights = gh_weights[None,:] + gh_samples = gh_samples[:, None] + gh_weights = gh_weights[None, :] arg1 = gh_weights.dot(self._get_warped_term(mean, std, gh_samples, pred_init=pred_init) ** 2) / np.sqrt(np.pi) arg2 = self._get_warped_mean(mean, std, pred_init=pred_init, deg_gauss_hermite=deg_gauss_hermite) return arg1 - (arg2 ** 2) - def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None, - median=False, deg_gauss_hermite=100): - # normalize X values - # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale - mu, var = GP._raw_predict(self, Xnew) - + def predict(self, Xnew, kern=None, pred_init=None, Y_metadata=None, + median=False, deg_gauss_hermite=20, likelihood=None): + """ + Prediction results depend on: + - The value of the self.predict_in_warped_space flag + - The median flag passed as argument + The likelihood keyword is never used, it is just to follow the plotting API. + """ + #mu, var = GP._raw_predict(self, Xnew) # now push through likelihood - mean, var = self.likelihood.predictive_values(mu, var) + #mean, var = self.likelihood.predictive_values(mu, var) + + mean, var = super(WarpedGP, self).predict(Xnew, kern=kern, full_cov=False, likelihood=likelihood) + if self.predict_in_warped_space: std = np.sqrt(var) @@ -116,13 +113,9 @@ class WarpedGP(GP): else: wmean = mean wvar = var - - if self.scale_data: - pred = self._unscale_data(pred) - return wmean, wvar - def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None): + def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, likelihood=None, kern=None): """ Get the predictive quantiles around the prediction at X @@ -133,19 +126,38 @@ class WarpedGP(GP): :returns: list of quantiles for each X and predictive quantiles for interval combination :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)] """ - m, v = self._raw_predict(X, full_cov=False) - if self.normalizer is not None: - m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) - a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata) - #return [a, b] - if not self.predict_in_warped_space: - return [a, b] - #print a.shape - new_a = self.warping_function.f_inv(a) - new_b = self.warping_function.f_inv(b) + qs = super(WarpedGP, self).predict_quantiles(X, quantiles, Y_metadata=Y_metadata, likelihood=likelihood, kern=kern) + if self.predict_in_warped_space: + return [self.warping_function.f_inv(q) for q in qs] + return qs + #m, v = self._raw_predict(X, full_cov=False) + #if self.normalizer is not None: + # m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) + #a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata) + #if not self.predict_in_warped_space: + # return [a, b] + #new_a = self.warping_function.f_inv(a) + #new_b = self.warping_function.f_inv(b) + #return [new_a, new_b] - return [new_a, new_b] - #return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata) + def log_predictive_density(self, x_test, y_test, Y_metadata=None): + """ + Calculation of the log predictive density. Notice we add + the jacobian of the warping function here. + + .. math: + p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*}) + + :param x_test: test locations (x_{*}) + :type x_test: (Nx1) array + :param y_test: test observations (y_{*}) + :type y_test: (Nx1) array + :param Y_metadata: metadata associated with the test points + """ + mu_star, var_star = self._raw_predict(x_test) + fy = self.warping_function.f(y_test) + ll_lpd = self.likelihood.log_predictive_density(fy, mu_star, var_star, Y_metadata=Y_metadata) + return ll_lpd + np.log(self.warping_function.fgrad_y(y_test)) if __name__ == '__main__': diff --git a/GPy/plotting/abstract_plotting_library.py b/GPy/plotting/abstract_plotting_library.py index 64061b99..95377ed7 100644 --- a/GPy/plotting/abstract_plotting_library.py +++ b/GPy/plotting/abstract_plotting_library.py @@ -61,6 +61,8 @@ class AbstractPlottingLibrary(object): """ Get a new figure with nrows and ncolumns subplots. Does not initialize the canvases yet. + + There is individual kwargs for the individual plotting libraries to use. """ raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library") diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py index 0d472d06..7bd1723f 100644 --- a/GPy/plotting/gpy_plot/plot_util.py +++ b/GPy/plotting/gpy_plot/plot_util.py @@ -31,6 +31,7 @@ import numpy as np from scipy import sparse import itertools +from ...models import WarpedGP def in_ipynb(): try: @@ -295,6 +296,10 @@ def get_x_y_var(model): Y = model.Y.values except AttributeError: Y = model.Y + + if isinstance(model, WarpedGP) and not model.predict_in_warped_space: + Y = model.Y_normalized + if sparse.issparse(Y): Y = Y.todense().view(np.ndarray) return X, X_variance, Y @@ -381,4 +386,4 @@ def x_frame2D(X,plot_limits=None,resolution=None): resolution = resolution or 50 xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] Xnew = np.c_[xx.flat, yy.flat] - return Xnew, xx, yy, xmin, xmax \ No newline at end of file + return Xnew, xx, yy, xmin, xmax diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index e4411e23..1e0731ab 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -91,18 +91,32 @@ class MiscTests(unittest.TestCase): k = GPy.kern.RBF(1) m2 = GPy.models.GPRegression(self.X, (Y-mu)/std, kernel=k, normalizer=False) m2[:] = m[:] + mu1, var1 = m.predict(m.X, full_cov=True) mu2, var2 = m2.predict(m2.X, full_cov=True) np.testing.assert_allclose(mu1, (mu2*std)+mu) - np.testing.assert_allclose(var1, var2) + np.testing.assert_allclose(var1, var2*std**2) + mu1, var1 = m.predict(m.X, full_cov=False) mu2, var2 = m2.predict(m2.X, full_cov=False) + np.testing.assert_allclose(mu1, (mu2*std)+mu) - np.testing.assert_allclose(var1, var2) + np.testing.assert_allclose(var1, var2*std**2) q50n = m.predict_quantiles(m.X, (50,)) q50 = m2.predict_quantiles(m2.X, (50,)) + np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu) + + # Test variance component: + qs = np.array([2.5, 97.5]) + # The quantiles get computed before unormalization + # And transformed using the mean transformation: + c = np.random.choice(self.X.shape[0]) + q95 = m2.predict_quantiles(self.X[[c]], qs) + mu, var = m2.predict(self.X[[c]]) + from scipy.stats import norm + np.testing.assert_allclose((mu+(norm.ppf(qs/100.)*np.sqrt(var))).flatten(), np.array(q95).flatten()) def check_jacobian(self): try: @@ -361,51 +375,95 @@ class MiscTests(unittest.TestCase): preds = m.predict(self.X) warp_k = GPy.kern.RBF(1) - warp_f = GPy.util.warping_functions.IdentityFunction() - warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f) + warp_f = GPy.util.warping_functions.IdentityFunction(closed_inverse=False) + warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, + warping_function=warp_f) warp_m.optimize() warp_preds = warp_m.predict(self.X) - np.testing.assert_almost_equal(preds, warp_preds) + warp_k_exact = GPy.kern.RBF(1) + warp_f_exact = GPy.util.warping_functions.IdentityFunction() + warp_m_exact = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k_exact, + warping_function=warp_f_exact) + warp_m_exact.optimize() + warp_preds_exact = warp_m_exact.predict(self.X) + + np.testing.assert_almost_equal(preds, warp_preds, decimal=4) + np.testing.assert_almost_equal(preds, warp_preds_exact, decimal=4) - @unittest.skip('Comment this to plot the modified sine function') - def test_warped_gp_sine(self): + def test_warped_gp_log(self): """ - A test replicating the sine regression problem from - Snelson's paper. + A WarpedGP with the log warping function should be + equal to a standard GP with log labels. + Note that we predict the median here. + """ + k = GPy.kern.RBF(1) + Y = np.abs(self.Y) + logY = np.log(Y) + m = GPy.models.GPRegression(self.X, logY, kernel=k) + m.optimize() + preds = m.predict(self.X)[0] + + warp_k = GPy.kern.RBF(1) + warp_f = GPy.util.warping_functions.LogFunction(closed_inverse=False) + warp_m = GPy.models.WarpedGP(self.X, Y, kernel=warp_k, + warping_function=warp_f) + warp_m.optimize() + warp_preds = warp_m.predict(self.X, median=True)[0] + + warp_k_exact = GPy.kern.RBF(1) + warp_f_exact = GPy.util.warping_functions.LogFunction() + warp_m_exact = GPy.models.WarpedGP(self.X, Y, kernel=warp_k_exact, + warping_function=warp_f_exact) + warp_m_exact.optimize(messages=True) + warp_preds_exact = warp_m_exact.predict(self.X, median=True)[0] + + np.testing.assert_almost_equal(np.exp(preds), warp_preds, decimal=4) + np.testing.assert_almost_equal(np.exp(preds), warp_preds_exact, decimal=4) + + def test_warped_gp_cubic_sine(self, max_iters=100): + """ + A test replicating the cubic sine regression problem from + Snelson's paper. This test doesn't have any assertions, it's + just to ensure coverage of the tanh warping function code. """ X = (2 * np.pi) * np.random.random(151) - np.pi - Y = np.sin(X) + np.random.normal(0,0.1,151) - Y = np.exp(Y) - 5 - #Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y]) + 0 + Y = np.sin(X) + np.random.normal(0,0.2,151) + Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y]) + X = X[:, None] + Y = Y[:, None] - #np.seterr(over='raise') - import matplotlib.pyplot as plt - warp_k = GPy.kern.RBF(1) - warp_f = GPy.util.warping_functions.TanhWarpingFunction_d(n_terms=2) - warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f) - #warp_m['.*variance.*'].constrain_fixed(0.25) - #warp_m['.*lengthscale.*'].constrain_fixed(1) - #warp_m['warp_tanh.d'].constrain_fixed(1) - #warp_m.randomize() - #warp_m['.*warp_tanh.psi*'][:,0:2].constrain_bounded(0,100) - #warp_m['.*warp_tanh.psi*'][:,0:1].constrain_fixed(1) - - #print(warp_m.checkgrad()) - #warp_m.plot() - #plt.show() - - warp_m.optimize_restarts(parallel=True, robust=True) - #print(warp_m.checkgrad()) - print(warp_m) - print(warp_m['.*warp.*']) + warp_m = GPy.models.WarpedGP(X, Y)#, kernel=warp_k)#, warping_function=warp_f) + warp_m['.*\.d'].constrain_fixed(1.0) + warp_m.optimize_restarts(parallel=False, robust=False, num_restarts=5, + max_iters=max_iters) + warp_m.predict(X) + warp_m.predict_quantiles(X) + warp_m.log_predictive_density(X, Y) warp_m.predict_in_warped_space = False warp_m.plot() warp_m.predict_in_warped_space = True warp_m.plot() - warp_f.plot(X.min()-10, X.max()+10) - plt.show() + + def test_offset_regression(self): + #Tests GPy.models.GPOffsetRegression. Using two small time series + #from a sine wave, we confirm the algorithm determines that the + #likelihood is maximised when the offset hyperparameter is approximately + #equal to the actual offset in X between the two time series. + offset = 3 + X1 = np.arange(0,50,5.0)[:,None] + X2 = np.arange(0+offset,50+offset,5.0)[:,None] + X = np.vstack([X1,X2]) + ind = np.vstack([np.zeros([10,1]),np.ones([10,1])]) + X = np.hstack([X,ind]) + Y = np.sin((X[0:10,0])/30.0)[:,None] + Y = np.vstack([Y,Y]) + m = GPy.models.GPOffsetRegression(X,Y) + m.rbf.lengthscale=5.0 #make it something other than one to check our gradients properly! + assert m.checkgrad(), "Gradients of offset parameters don't match numerical approximations." + m.optimize() + assert np.abs(m.offset[0]-offset)<0.1, ("GPOffsetRegression model failing to estimate correct offset (value estimated = %0.2f instead of %0.2f)" % (m.offset[0], offset)) class GradientTests(np.testing.TestCase): diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py index 3c6241f3..ba3d7ddf 100644 --- a/GPy/testing/util_tests.py +++ b/GPy/testing/util_tests.py @@ -29,6 +29,7 @@ #=============================================================================== import unittest, numpy as np +import GPy class TestDebug(unittest.TestCase): def test_checkFinite(self): @@ -107,3 +108,50 @@ class TestDebug(unittest.TestCase): self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5]) self.assertTrue(d[tuple(X[:,1])] == [1]) + def test_offset_cluster(self): + #Tests the GPy.util.cluster_with_offset.cluster utility with a small + #test data set. Not using random noise just in case it occasionally + #causes it not to cluster correctly. + #groundtruth cluster identifiers are: [0,1,1,0] + + #data contains a list of the four sets of time series (3 per data point) + + data = [np.array([[ 2.18094245, 1.96529789, 2.00265523, 2.18218742, 2.06795428], + [ 1.62254829, 1.75748448, 1.83879347, 1.87531326, 1.52503496], + [ 1.54589609, 1.61607914, 2.00463192, 1.48771394, 1.63339218]]), + np.array([[ 2.86766106, 2.97953437, 2.91958876, 2.92510506, 3.03239241], + [ 2.57368423, 2.59954886, 3.10000395, 2.75806125, 2.89865704], + [ 2.58916318, 2.53698259, 2.63858411, 2.63102504, 2.51853901]]), + np.array([[ 2.77834168, 2.9618564 , 2.88482141, 3.24259745, 2.9716821 ], + [ 2.60675576, 2.67095624, 2.94824436, 2.80520631, 2.87247516], + [ 2.49543562, 2.5492281 , 2.6505866 , 2.65015308, 2.59738616]]), + np.array([[ 1.76783086, 2.21666738, 2.07939706, 1.9268263 , 2.23360121], + [ 1.94305547, 1.94648592, 2.1278921 , 2.09481457, 2.08575238], + [ 1.69336013, 1.72285186, 1.6339506 , 1.61212022, 1.39198698]])] + + #inputs contains their associated X values + + inputs = [np.array([[ 0. ], + [ 0.68040097], + [ 1.20316795], + [ 1.798749 ], + [ 2.14891733]]), np.array([[ 0. ], + [ 0.51910637], + [ 0.98259352], + [ 1.57442965], + [ 1.82515098]]), np.array([[ 0. ], + [ 0.66645478], + [ 1.59464591], + [ 1.69769551], + [ 1.80932752]]), np.array([[ 0. ], + [ 0.87512108], + [ 1.71881079], + [ 2.67162871], + [ 3.23761907]])] + + #try doing the clustering + active = GPy.util.cluster_with_offset.cluster(data,inputs) + #check to see that the clustering has correctly clustered the time series. + clusters = set([frozenset(cluster) for cluster in active]) + assert set([1,2]) in clusters, "Offset Clustering algorithm failed" + assert set([0,3]) in clusters, "Offset Clustering algoirthm failed" diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index fb1eb0d6..685551fd 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -16,3 +16,4 @@ from . import initialization from . import multioutput from . import parallel from . import functions +from . import cluster_with_offset diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py new file mode 100644 index 00000000..d1e896f1 --- /dev/null +++ b/GPy/util/cluster_with_offset.py @@ -0,0 +1,184 @@ +# Copyright (c) 2016, Mike Smith +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import GPy +import numpy as np +import sys #so I can print dots + +def get_log_likelihood(inputs,data,clust): + """Get the LL of a combined set of clusters, ignoring time series offsets. + + Get the log likelihood of a cluster without worrying about the fact + different time series are offset. We're using it here really for those + cases in which we only have one cluster to get the loglikelihood of. + + arguments: + inputs -- the 'X's in a list, one item per cluster + data -- the 'Y's in a list, one item per cluster + clust -- list of clusters to use + + returns a tuple: + log likelihood and the offset (which is always zero for this model) + """ + + S = data[0].shape[0] #number of time series + + #build a new dataset from the clusters, by combining all clusters together + X = np.zeros([0,1]) + Y = np.zeros([0,S]) + + #for each person in the cluster, + #add their inputs and data to the new dataset + for p in clust: + X = np.vstack([X,inputs[p]]) + Y = np.vstack([Y,data[p].T]) + + #find the loglikelihood. We just add together the LL for each time series. + #ll=0 + #for s in range(S): + # m = GPy.models.GPRegression(X,Y[:,s][:,None]) + # m.optimize() + # ll+=m.log_likelihood() + + m = GPy.models.GPRegression(X,Y) + m.optimize() + ll=m.log_likelihood() + return ll,0 + +def get_log_likelihood_offset(inputs,data,clust): + """Get the log likelihood of a combined set of clusters, fitting the offsets + + arguments: + inputs -- the 'X's in a list, one item per cluster + data -- the 'Y's in a list, one item per cluster + clust -- list of clusters to use + + returns a tuple: + log likelihood and the offset + """ + + #if we've only got one cluster, the model has an error, so we want to just + #use normal GPRegression. + if len(clust)==1: + return get_log_likelihood(inputs,data,clust) + + S = data[0].shape[0] #number of time series + + X = np.zeros([0,2]) #notice the extra column, this is for the cluster index + Y = np.zeros([0,S]) + + #for each person in the cluster, add their inputs and data to the new + #dataset. Note we add an index identifying which person is which data point. + #This is for the offset model to use, to allow it to know which data points + #to shift. + for i,p in enumerate(clust): + idx = i*np.ones([inputs[p].shape[0],1]) + X = np.vstack([X,np.hstack([inputs[p],idx])]) + Y = np.vstack([Y,data[p].T]) + + m = GPy.models.GPOffsetRegression(X,Y) + #TODO: How to select a sensible prior? + m.offset.set_prior(GPy.priors.Gaussian(0,20)) + #TODO: Set a sensible start value for the length scale, + #make it long to help the offset fit. + + m.optimize() + + ll = m.log_likelihood() + offset = m.offset.values[0] + return ll,offset + +def cluster(data,inputs,verbose=False): + """Clusters data + + Using the new offset model, this method uses a greedy algorithm to cluster + the data. It starts with all the data points in separate clusters and tests + whether combining them increases the overall log-likelihood (LL). It then + iteratively joins pairs of clusters which cause the greatest increase in + the LL, until no join increases the LL. + + arguments: + inputs -- the 'X's in a list, one item per cluster + data -- the 'Y's in a list, one item per cluster + + returns a list of the clusters. + """ + N=len(data) + + + #Define a set of N active cluster + active = [] + for p in range(0,N): + active.append([p]) + + loglikes = np.zeros(len(active)) + loglikes[:] = None + + pairloglikes = np.zeros([len(active),len(active)]) + pairloglikes[:] = None + pairoffset = np.zeros([len(active),len(active)]) + + it = 0 + while True: + + if verbose: + it +=1 + print("Iteration %d" % it) + + #Compute the log-likelihood of each cluster (add them together) + for clusti in range(len(active)): + if verbose: + sys.stdout.write('.') + sys.stdout.flush() + if np.isnan(loglikes[clusti]): + loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti]) + + #try combining with each other cluster... + for clustj in range(clusti): #count from 0 to clustj-1 + temp = [clusti,clustj] + if np.isnan(pairloglikes[clusti,clustj]): + pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp) + + seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1) + loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering + top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape) + + #if loglikeimprovement.shape[0]<3: + # #no more clustering to do - this shouldn't happen really unless + # #we've set the threshold to apply clustering to less than 0 + # break + + #if theres further clustering to be done... + if loglikeimprovement[top[0],top[1]]>0: + active[top[0]].extend(active[top[1]]) + offset=pairoffset[top[0],top[1]] + inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset]) + data[top[0]] = np.hstack([data[top[0]],data[top[1]]]) + del inputs[top[1]] + del data[top[1]] + del active[top[1]] + + #None = message to say we need to recalculate + pairloglikes[:,top[0]] = None + pairloglikes[top[0],:] = None + pairloglikes = np.delete(pairloglikes,top[1],0) + pairloglikes = np.delete(pairloglikes,top[1],1) + loglikes[top[0]] = None + loglikes = np.delete(loglikes,top[1]) + else: + break + + #if loglikeimprovement[top[0],top[1]]>0: + # print "joined" + # print top + # print offset + # print offsets + # print offsets[top[1]]-offsets[top[0]] + + #TODO Add a way to return the offsets applied to all the time series + return active + +#starttime = time.time() +#active = cluster(data,inputs) +#endtime = time.time() +#print "TOTAL TIME %0.4f" % (endtime-starttime) diff --git a/GPy/util/normalizer.py b/GPy/util/normalizer.py index 854726c4..78ad945d 100644 --- a/GPy/util/normalizer.py +++ b/GPy/util/normalizer.py @@ -6,7 +6,7 @@ Created on Aug 27, 2014 import logging import numpy as np -class Norm(object): +class _Norm(object): def __init__(self): pass def scale_by(self, Y): @@ -18,7 +18,8 @@ class Norm(object): """ Project Y into normalized space """ - raise NotImplementedError + if not self.scaled(): + raise AttributeError("Norm object not initialized yet, try calling scale_by(data) first.") def inverse_mean(self, X): """ Project the normalized object X into space of Y @@ -31,15 +32,46 @@ class Norm(object): Whether this Norm object has been initialized. """ raise NotImplementedError -class MeanNorm(Norm): + +class Standardize(_Norm): def __init__(self): self.mean = None def scale_by(self, Y): Y = np.ma.masked_invalid(Y, copy=False) self.mean = Y.mean(0).view(np.ndarray) + self.std = Y.std(0).view(np.ndarray) def normalize(self, Y): - return Y-self.mean + super(Standardize, self).normalize(Y) + return (Y-self.mean)/self.std def inverse_mean(self, X): - return X+self.mean + return (X*self.std)+self.mean + def inverse_variance(self, var): + return (var*(self.std**2)) def scaled(self): return self.mean is not None + +# Inverse variance to be implemented, disabling for now +# If someone in the future want to implement this, +# we need to implement the inverse variance for +# normalization. This means, we need to know the factor +# for the variance to be multiplied to the variance in +# normalized space. This is easy to compute for standardization +# (see above) but gets tricky here. +# class Normalize(_Norm): +# def __init__(self): +# self.ymin = None +# self.ymax = None +# def scale_by(self, Y): +# Y = np.ma.masked_invalid(Y, copy=False) +# self.ymin = Y.min(0).view(np.ndarray) +# self.ymax = Y.max(0).view(np.ndarray) +# def normalize(self, Y): +# super(Normalize, self).normalize(Y) +# return (Y - self.ymin) / (self.ymax - self.ymin) - .5 +# def inverse_mean(self, X): +# return (X + .5) * (self.ymax - self.ymin) + self.ymin +# def inverse_variance(self, var): +# +# return (var*(self.std**2)) +# def scaled(self): +# return (self.ymin is not None) and (self.ymax is not None) \ No newline at end of file diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py index 1617283c..3af28d43 100644 --- a/GPy/util/warping_functions.py +++ b/GPy/util/warping_functions.py @@ -4,6 +4,7 @@ import numpy as np from ..core.parameterization import Parameterized, Param from paramz.transformations import Logexp +import sys class WarpingFunction(Parameterized): @@ -14,6 +15,7 @@ class WarpingFunction(Parameterized): def __init__(self, name): super(WarpingFunction, self).__init__(name=name) + self.rate = 0.1 def f(self, y, psi): """function transformation @@ -29,15 +31,32 @@ class WarpingFunction(Parameterized): """gradient of f w.r.t to y""" raise NotImplementedError - def f_inv(self, z, psi): - """inverse function transformation""" - raise NotImplementedError + def f_inv(self, z, max_iterations=250, y=None): + """ + Calculate the numerical inverse of f. This should be + overwritten for specific warping functions where the + inverse can be found in closed form. - def _get_param_names(self): - raise NotImplementedError + :param max_iterations: maximum number of N.R. iterations + """ + + z = z.copy() + y = np.ones_like(z) + + it = 0 + update = np.inf + while np.abs(update).sum() > 1e-10 and it < max_iterations: + fy = self.f(y) + fgrady = self.fgrad_y(y) + update = (fy - z) / fgrady + y -= self.rate * update + it += 1 + #if it == max_iterations: + # print("WARNING!!! Maximum number of iterations reached in f_inv ") + # print("Sum of roots: %.4f" % np.sum(fy - z)) + return y def plot(self, xmin, xmax): - psi = self.psi y = np.arange(xmin, xmax, 0.01) f_y = self.f(y) from matplotlib import pyplot as plt @@ -49,183 +68,59 @@ class WarpingFunction(Parameterized): plt.show() -class TanhWarpingFunction(WarpingFunction): - - def __init__(self, n_terms=3): - """n_terms specifies the number of tanh terms to be used""" - self.n_terms = n_terms - self.num_parameters = 3 * self.n_terms - super(TanhWarpingFunction, self).__init__(name='warp_tanh') - - def f(self, y, psi): +class TanhFunction(WarpingFunction): + """ + This is the function proposed in Snelson et al.: + A sum of tanh functions with linear trends outside + the range. Notice the term 'd', which scales the + linear trend. + """ + def __init__(self, n_terms=3, initial_y=None): """ - transform y with f using parameter vector psi - psi = [[a,b,c]] - ::math::`f = \\sum_{terms} a * tanh(b*(y+c))` + n_terms specifies the number of tanh terms to be used """ - #1. check that number of params is consistent - assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions' - assert psi.shape[1] == 3, 'inconsistent parameter dimensions' - - #2. exponentiate the a and b (positive!) - mpsi = psi.copy() - - #3. transform data - z = y.copy() - for i in range(len(mpsi)): - a,b,c = mpsi[i] - z += a*np.tanh(b*(y+c)) - return z - - def f_inv(self, y, psi, iterations=10): - """ - calculate the numerical inverse of f - :param iterations: number of N.R. iterations - """ - - y = y.copy() - z = np.ones_like(y) - for i in range(iterations): - z -= (self.f(z, psi) - y)/self.fgrad_y(z,psi) - return z - - def fgrad_y(self, y, psi, return_precalc=False): - """ - gradient of f w.r.t to y ([N x 1]) - returns: Nx1 vector of derivatives, unless return_precalc is true, - then it also returns the precomputed stuff - """ - - mpsi = psi.copy() - - # vectorized version - - # S = (mpsi[:,1]*(y + mpsi[:,2])).T - S = (mpsi[:,1]*(y[:,:,None] + mpsi[:,2])).T - R = np.tanh(S) - D = 1-R**2 - - # GRAD = (1+(mpsi[:,0:1]*mpsi[:,1:2]*D).sum(axis=0))[:,np.newaxis] - GRAD = (1+(mpsi[:,0:1][:,:,None]*mpsi[:,1:2][:,:,None]*D).sum(axis=0)).T - - if return_precalc: - # return GRAD,S.sum(axis=1),R.sum(axis=1),D.sum(axis=1) - return GRAD, S, R, D - - return GRAD - - def fgrad_y_psi(self, y, psi, return_covar_chain=False): - """ - gradient of f w.r.t to y and psi - returns: NxIx3 tensor of partial derivatives - """ - - # 1. exponentiate the a and b (positive!) - mpsi = psi.copy() - w, s, r, d = self.fgrad_y(y, psi, return_precalc = True) - - gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 3)) - for i in range(len(mpsi)): - a,b,c = mpsi[i] - gradients[:,:,i,0] = (b*(1.0/np.cosh(s[i]))**2).T - gradients[:,:,i,1] = a*(d[i] - 2.0*s[i]*r[i]*(1.0/np.cosh(s[i]))**2).T - gradients[:,:,i,2] = (-2.0*a*(b**2)*r[i]*((1.0/np.cosh(s[i]))**2)).T - - if return_covar_chain: - covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 3)) - - for i in range(len(mpsi)): - a,b,c = mpsi[i] - covar_grad_chain[:, :, i, 0] = (r[i]).T - covar_grad_chain[:, :, i, 1] = (a*(y + c) * ((1.0/np.cosh(s[i]))**2).T) - covar_grad_chain[:, :, i, 2] = a*b*((1.0/np.cosh(s[i]))**2).T - - return gradients, covar_grad_chain - - return gradients - - def _get_param_names(self): - variables = ['a', 'b', 'c'] - names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[]) - return names - - -class TanhWarpingFunction_d(WarpingFunction): - - def __init__(self, n_terms=3): - """n_terms specifies the number of tanh terms to be used""" self.n_terms = n_terms self.num_parameters = 3 * self.n_terms + 1 self.psi = np.ones((self.n_terms, 3)) - - super(TanhWarpingFunction_d, self).__init__(name='warp_tanh') + super(TanhFunction, self).__init__(name='warp_tanh') self.psi = Param('psi', self.psi) self.psi[:, :2].constrain_positive() - self.d = Param('%s' % ('d'), 1.0, Logexp()) self.link_parameter(self.psi) self.link_parameter(self.d) + self.initial_y = initial_y def f(self, y): """ Transform y with f using parameter vector psi psi = [[a,b,c]] - :math:`f = \\sum_{terms} a * tanh(b*(y+c))` + :math:`f = (y * d) + \\sum_{terms} a * tanh(b *(y + c))` """ - #1. check that number of params is consistent - # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions' - # assert psi.shape[1] == 4, 'inconsistent parameter dimensions' - d = self.d mpsi = self.psi - - #3. transform data - z = d*y.copy() + z = d * y.copy() for i in range(len(mpsi)): - a,b,c = mpsi[i] - z += a*np.tanh(b*(y+c)) + a, b, c = mpsi[i] + z += a * np.tanh(b * (y + c)) return z - def f_inv(self, z, max_iterations=1000, y=None): - """ - calculate the numerical inverse of f - - :param max_iterations: maximum number of N.R. iterations - """ - - z = z.copy() - if y is None: - y = np.ones_like(z) - - it = 0 - update = np.inf - - while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations): - fy = self.f(y) - fgrady = self.fgrad_y(y) - update = (fy - z)/fgrady - y -= update - it += 1 - if it == max_iterations: - print("WARNING!!! Maximum number of iterations reached in f_inv ") - return y - def fgrad_y(self, y, return_precalc=False): """ gradient of f w.r.t to y ([N x 1]) - :returns: Nx1 vector of derivatives, unless return_precalc is true, then it also returns the precomputed stuff + :returns: Nx1 vector of derivatives, unless return_precalc is true, + then it also returns the precomputed stuff """ d = self.d mpsi = self.psi # vectorized version - S = (mpsi[:,1]*(y[:,:,None] + mpsi[:,2])).T + S = (mpsi[:,1] * (y[:,:,None] + mpsi[:,2])).T R = np.tanh(S) - D = 1-R**2 + D = 1 - (R ** 2) - GRAD = (d + (mpsi[:,0:1][:,:,None]*mpsi[:,1:2][:,:,None]*D).sum(axis=0)).T + GRAD = (d + (mpsi[:,0:1][:,:,None] * mpsi[:,1:2][:,:,None] * D).sum(axis=0)).T if return_precalc: return GRAD, S, R, D @@ -244,45 +139,79 @@ class TanhWarpingFunction_d(WarpingFunction): gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4)) for i in range(len(mpsi)): a,b,c = mpsi[i] - gradients[:,:,i,0] = (b*(1.0/np.cosh(s[i]))**2).T - gradients[:,:,i,1] = a*(d[i] - 2.0*s[i]*r[i]*(1.0/np.cosh(s[i]))**2).T - gradients[:,:,i,2] = (-2.0*a*(b**2)*r[i]*((1.0/np.cosh(s[i]))**2)).T - gradients[:,:,0,3] = 1.0 + gradients[:, :, i, 0] = (b * (1.0/np.cosh(s[i])) ** 2).T + gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T + gradients[:, :, i, 2] = (-2.0 * a * (b ** 2) * r[i] * ((1.0 / np.cosh(s[i])) ** 2)).T + gradients[:, :, 0, 3] = 1.0 if return_covar_chain: covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4)) - for i in range(len(mpsi)): a,b,c = mpsi[i] covar_grad_chain[:, :, i, 0] = (r[i]).T - covar_grad_chain[:, :, i, 1] = (a*(y + c) * ((1.0/np.cosh(s[i]))**2).T) - covar_grad_chain[:, :, i, 2] = a*b*((1.0/np.cosh(s[i]))**2).T + covar_grad_chain[:, :, i, 1] = (a * (y + c) * ((1.0 / np.cosh(s[i])) ** 2).T) + covar_grad_chain[:, :, i, 2] = a * b * ((1.0 / np.cosh(s[i])) ** 2).T covar_grad_chain[:, :, 0, 3] = y - return gradients, covar_grad_chain return gradients - def _get_param_names(self): - variables = ['a', 'b', 'c', 'd'] - names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[]) - names.append('warp_tanh_d') - return names + def update_grads(self, Y_untransformed, Kiy): + grad_y = self.fgrad_y(Y_untransformed) + grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed, + return_covar_chain=True) + djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0) + dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0) + + warping_grads = -dquad_dpsi + djac_dpsi + + self.psi.gradient[:] = warping_grads[:, :-1] + self.d.gradient[:] = warping_grads[0, -1] + + +class LogFunction(WarpingFunction): + """ + Easy wrapper for applying a fixed log warping function to + positive-only values. + The closed_inverse flag should only be set to False for + debugging and testing purposes. + """ + def __init__(self, closed_inverse=True): + self.num_parameters = 0 + super(LogFunction, self).__init__(name='log') + if closed_inverse: + self.f_inv = self._f_inv + + def f(self, y): + return np.log(y) + + def fgrad_y(self, y): + return 1. / y + + def update_grads(self, Y_untransformed, Kiy): + pass + + def fgrad_y_psi(self, y, return_covar_chain=False): + if return_covar_chain: + return 0, 0 + return 0 + + def _f_inv(self, z, y=None): + return np.exp(z) class IdentityFunction(WarpingFunction): """ Identity warping function. This is for testing and sanity check purposes and should not be used in practice. + The closed_inverse flag should only be set to False for + debugging and testing purposes. """ - def __init__(self): - self.num_parameters = 4 - self.psi = Param('psi', np.zeros((1,3))) - self.d = Param('%s' % ('d'), 1.0, Logexp()) + def __init__(self, closed_inverse=True): + self.num_parameters = 0 super(IdentityFunction, self).__init__(name='identity') - self.link_parameter(self.psi) - self.link_parameter(self.d) - + if closed_inverse: + self.f_inv = self._f_inv def f(self, y): return y @@ -290,11 +219,14 @@ class IdentityFunction(WarpingFunction): def fgrad_y(self, y): return np.ones(y.shape) + def update_grads(self, Y_untransformed, Kiy): + pass + def fgrad_y_psi(self, y, return_covar_chain=False): - gradients = np.zeros((y.shape[0], y.shape[1], len(self.psi), 4)) if return_covar_chain: - return gradients, gradients - return gradients + return 0, 0 + return 0 - def f_inv(self, z, y=None): + def _f_inv(self, z, y=None): return z + diff --git a/README.md b/README.md index e0e715a9..513f2704 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The Gaussian processes framework in Python. * Travis-CI [unit-tests](https://travis-ci.org/SheffieldML/GPy) * [![licence](https://img.shields.io/badge/licence-BSD-blue.svg)](http://opensource.org/licenses/BSD-3-Clause) -[![develstat](https://travis-ci.org/SheffieldML/GPy.svg?branch=devel)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel) +[![deploystat](https://travis-ci.org/SheffieldML/GPy.svg?branch=deploy)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel) ## Updated Structure diff --git a/appveyor.yml b/appveyor.yml index f7f7b7ab..ed0ec0d6 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -3,7 +3,7 @@ environment: secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00= COVERALLS_REPO_TOKEN: secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS - gpy_version: 1.2.1 + gpy_version: 1.4.0 matrix: - PYTHON_VERSION: 2.7 MINICONDA: C:\Miniconda-x64 @@ -71,10 +71,10 @@ deploy_script: - echo password:%pip_access% >> %USERPROFILE%\\.pypirc - ps: >- if ($env:APPVEYOR_REPO_BRANCH -eq 'devel') { - twine upload -r test dist/* + twine upload --skip-existing -r test dist/* } elseif ($env:APPVEYOR_REPO_BRANCH -eq 'deploy') { - twine upload dist/* + twine upload --skip-existing dist/* } else { echo not deploying on other branches diff --git a/setup.cfg b/setup.cfg index 57fddfaf..9546c53a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.2.1 +current_version = 1.4.0 tag = False commit = True diff --git a/setup.py b/setup.py index c8d20add..1cf2321d 100644 --- a/setup.py +++ b/setup.py @@ -148,7 +148,7 @@ setup(name = 'GPy', include_package_data = True, py_modules = ['GPy.__init__'], test_suite = 'GPy.testing', - install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.5.2'], + install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.6.6'], extras_require = {'docs':['sphinx'], 'optional':['mpi4py', 'ipython>=4.0.0',