diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/non_gaussian.py similarity index 77% rename from GPy/examples/laplace_approximations.py rename to GPy/examples/non_gaussian.py index f74e4d37..622b3edd 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/non_gaussian.py @@ -2,22 +2,21 @@ import GPy import numpy as np import matplotlib.pyplot as plt from GPy.util import datasets -#np.random.seed(1) -def student_t_approx(): +def student_t_approx(optimize=True, plot=True): """ - Example of regressing with a student t likelihood + Example of regressing with a student t likelihood using Laplace """ real_std = 0.1 #Start a function, any function X = np.linspace(0.0, np.pi*2, 100)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_std + Y = Y/Y.max() Yc = Y.copy() X_full = np.linspace(0.0, np.pi*2, 500)[:, None] Y_full = np.sin(X_full) - - Y = Y/Y.max() + Y_full = Y_full/Y_full.max() #Slightly noisy data Yc[75:80] += 1 @@ -34,94 +33,93 @@ def student_t_approx(): deg_free = 5 print "Real noise: ", real_std initial_var_guess = 0.5 + edited_real_sd = initial_var_guess - #t_rv = t(deg_free, loc=0, scale=real_var) - #noise = t_rvrvs(size=Y.shape) - #Y += noise - - plt.figure(1) - plt.suptitle('Gaussian likelihood') # Kernel object kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() - kernel5 = kernel1.copy() - kernel6 = kernel1.copy() - print "Clean Gaussian" - #A GP should completely break down due to the points as they get a lot of weight - # create simple GP model - m = GPy.models.GPRegression(X, Y, kernel=kernel1) + #Gaussian GP model on clean data + m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1) # optimize - m.ensure_default_constraints() - m.constrain_fixed('white', 1e-4) - m.randomize() - m.optimize() - # plot - ax = plt.subplot(211) - m.plot(ax=ax) - plt.plot(X_full, Y_full) - plt.ylim(-1.5, 1.5) - plt.title('Gaussian clean') - print 
m + m1.ensure_default_constraints() + m1.constrain_fixed('white', 1e-5) + m1.randomize() - #Corrupt - print "Corrupt Gaussian" - m = GPy.models.GPRegression(X, Yc, kernel=kernel2) - m.ensure_default_constraints() - m.constrain_fixed('white', 1e-4) - m.randomize() - m.optimize() - ax = plt.subplot(212) - m.plot(ax=ax) - plt.plot(X_full, Y_full) - plt.ylim(-1.5, 1.5) - plt.title('Gaussian corrupt') - print m + #Gaussian GP model on corrupt data + m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2) + m2.ensure_default_constraints() + m2.constrain_fixed('white', 1e-5) + m2.randomize() - plt.figure(2) - plt.suptitle('Student-t likelihood') - edited_real_sd = initial_var_guess - - print "Clean student t, rasm" + #Student t GP model on clean data t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution) - m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood) - m.ensure_default_constraints() - m.constrain_positive('t_noise') - m.constrain_fixed('white', 1e-4) - m.randomize() - #m.update_likelihood_approximation() - m.optimize() - print(m) - ax = plt.subplot(211) - m.plot(ax=ax) - plt.plot(X_full, Y_full) - plt.ylim(-1.5, 1.5) - plt.title('Student-t rasm clean') + m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood) + m3.ensure_default_constraints() + m3.constrain_bounded('t_noise', 1e-6, 10.) + m3.constrain_fixed('white', 1e-5) + m3.randomize() - print "Corrupt student t, rasm" + #Student t GP model on corrupt data t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution) - m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood) - m.ensure_default_constraints() - m.constrain_bounded('t_noise', 1e-6, 10.) 
- m.constrain_fixed('white', 1e-4) - m.randomize() - for a in range(1): - m.randomize() - m_start = m.copy() - print m - m.optimize('scg', messages=1) - print(m) - ax = plt.subplot(212) - m.plot(ax=ax) - plt.plot(X_full, Y_full) - plt.ylim(-1.5, 1.5) - plt.title('Student-t rasm corrupt') + m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood) + m4.ensure_default_constraints() + m4.constrain_bounded('t_noise', 1e-6, 10.) + m4.constrain_fixed('white', 1e-5) + m4.randomize() - return m + if optimize: + optimizer='scg' + print "Clean Gaussian" + m1.optimize(optimizer, messages=1) + print "Corrupt Gaussian" + m2.optimize(optimizer, messages=1) + print "Clean student t" + m3.optimize(optimizer, messages=1) + print "Corrupt student t" + m4.optimize(optimizer, messages=1) + + if False: + print m1 + print m3 + plt.figure(3) + plt.scatter(X, m1.likelihood.Y, c='g') + plt.scatter(X, m3.likelihood.Y, c='r') + + if plot: + plt.figure(1) + plt.suptitle('Gaussian likelihood') + ax = plt.subplot(211) + m1.plot(ax=ax) + plt.plot(X_full, Y_full) + plt.ylim(-1.5, 1.5) + plt.title('Gaussian clean') + + ax = plt.subplot(212) + m2.plot(ax=ax) + plt.plot(X_full, Y_full) + plt.ylim(-1.5, 1.5) + plt.title('Gaussian corrupt') + + plt.figure(2) + plt.suptitle('Student-t likelihood') + ax = plt.subplot(211) + m3.plot(ax=ax) + plt.plot(X_full, Y_full) + plt.ylim(-1.5, 1.5) + plt.title('Student-t rasm clean') + + ax = plt.subplot(212) + m4.plot(ax=ax) + plt.plot(X_full, Y_full) + plt.ylim(-1.5, 1.5) + plt.title('Student-t rasm corrupt') + + return m1, m2, m3, m4 def boston_example(): import sklearn @@ -294,3 +292,4 @@ def precipitation_example(): for n, (train, test) in enumerate(kf): X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test] print "Fold {}".format(n) + diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py index 57160d64..e5dcdd19 100644 --- a/GPy/likelihoods/laplace.py +++ b/GPy/likelihoods/laplace.py @@ -15,6 
+15,7 @@ import scipy as sp from likelihood import likelihood from ..util.linalg import mdot, jitchol, pddet, dpotrs from functools import partial as partial_func +import warnings class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -64,6 +65,7 @@ class Laplace(likelihood): self.YYT = None self.old_Ki_f = None + self.bad_fhat = False def predictive_values(self,mu,var,full_cov,**noise_args): if full_cov: @@ -198,18 +200,16 @@ class Laplace(likelihood): Y_tilde = Wi*self.Ki_f + self.f_hat self.Wi_K_i = self.W12BiW12 - self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K) - self.lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data) - self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) + ln_det_Wi_K = pddet(self.Sigma_tilde + self.K) + lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data) + y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) - Z_tilde = (+ self.lik + Z_tilde = (+ lik - 0.5*self.ln_B_det - + 0.5*self.ln_det_Wi_K + + 0.5*ln_det_Wi_K - 0.5*self.f_Ki_f - + 0.5*self.y_Wi_Ki_i_y + + 0.5*y_Wi_K_i_y ) - #print "Term, {}, {}, {}, {}, {}".format(self.lik, - 0.5*self.ln_B_det, + 0.5*self.ln_det_Wi_K, - 0.5*self.f_Ki_f, + 0.5*self.y_Wi_Ki_i_y) - #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) self.Y = Y_tilde @@ -247,7 +247,10 @@ class Laplace(likelihood): #At this point get the hessian matrix (or vector as W is diagonal) self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data) - #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though + if not self.noise_model.log_concave: + #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6)) + self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N)) self.Ki_f = self.Ki_f @@ 
-283,11 +286,11 @@ class Laplace(likelihood): except: import ipdb; ipdb.set_trace() - W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0] + W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0] ln_B_det = 2*np.sum(np.log(np.diag(L))) - return W12BiW12, ln_B_det + return W12BiW12a, ln_B_det - def rasm_mode(self, K, MAX_ITER=30): + def rasm_mode(self, K, MAX_ITER=40): """ Rasmussen's numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -302,9 +305,10 @@ class Laplace(likelihood): """ #old_Ki_f = np.zeros((self.N, 1)) - #Start f's at zero originally - if self.old_Ki_f is None: - old_Ki_f = np.zeros((self.N, 1)) + #Start f's at zero originally or if we have gone off track, try restarting + if self.old_Ki_f is None or self.bad_fhat: + old_Ki_f = np.random.rand(self.N, 1)/50.0 + #old_Ki_f = self.Y f = np.dot(K, old_Ki_f) else: #Start at the old best point @@ -318,7 +322,7 @@ class Laplace(likelihood): return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data) difference = np.inf - epsilon = 1e-5 + epsilon = 1e-7 #step_size = 1 #rs = 0 i = 0 @@ -381,14 +385,20 @@ class Laplace(likelihood): #difference = abs(new_obj - old_obj) #old_obj = new_obj.copy() - difference = np.abs(np.sum(f - f_old)) - #difference = np.abs(np.sum(Ki_f - old_Ki_f)) + difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f)) + #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N) old_Ki_f = Ki_f.copy() i += 1 self.old_Ki_f = old_Ki_f.copy() + + #Warn of bad fits if difference > epsilon: - print "Not perfect f_hat fit difference: {}".format(difference) + self.bad_fhat = True + warnings.warn("Not perfect f_hat fit difference: {}".format(difference)) + elif self.bad_fhat: + self.bad_fhat = False + warnings.warn("f_hat now perfect again") self.Ki_f = Ki_f return f