From b663fff622fe325b320c6cb4655ec315cd97dbba Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 13 Sep 2013 14:34:28 +0100 Subject: [PATCH] Now checkgrads for gaussian, and ALMOST for student t --- GPy/examples/laplace_approximations.py | 67 ++++++++++---- GPy/likelihoods/laplace.py | 123 +++++++++++++++---------- 2 files changed, 119 insertions(+), 71 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 50e1858b..e8af74eb 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -1,6 +1,7 @@ import GPy import numpy as np import matplotlib.pyplot as plt +from GPy.util import datasets np.random.seed(1) def timing(): @@ -405,7 +406,7 @@ def student_t_approx(): """ real_std = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 50)[:, None] + X = np.linspace(0.0, 10.0, 100)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_std Yc = Y.copy() @@ -422,7 +423,7 @@ def student_t_approx(): #Yc = Yc/Yc.max() #Add student t random noise to datapoints - deg_free = 8 + deg_free = 5 print "Real noise: ", real_std initial_var_guess = 0.1 @@ -456,11 +457,13 @@ def student_t_approx(): m = GPy.models.GPRegression(X, Y, kernel=kernel1) # optimize m.ensure_default_constraints() + m.randomize() m.optimize() # plot - plt.subplot(211) - m.plot() + ax = plt.subplot(211) + m.plot(ax=ax) plt.plot(X_full, Y_full) + plt.ylim(-1.5, 1.5) plt.title('Gaussian clean') print m @@ -468,16 +471,18 @@ def student_t_approx(): print "Corrupt Gaussian" m = GPy.models.GPRegression(X, Yc, kernel=kernel2) m.ensure_default_constraints() - #m.optimize() - plt.subplot(212) - m.plot() + m.randomize() + m.optimize() + ax = plt.subplot(212) + m.plot(ax=ax) plt.plot(X_full, Y_full) + plt.ylim(-1.5, 1.5) plt.title('Gaussian corrupt') print m plt.figure(2) plt.suptitle('Student-t likelihood') - edited_real_sd = real_std #initial_var_guess + edited_real_sd = initial_var_guess print "Clean student t, rasm" t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd) @@ -486,13 +491,14 @@ def student_t_approx(): m.ensure_default_constraints() m.constrain_positive('t_noise') m.randomize() - m.update_likelihood_approximation() + import ipdb; ipdb.set_trace() # XXX BREAKPOINT + #m.update_likelihood_approximation() m.optimize() print(m) - plt.subplot(222) - m.plot() + ax = plt.subplot(211) + m.plot(ax=ax) plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) + plt.ylim(-1.5, 1.5) plt.title('Student-t rasm clean') print "Corrupt student t, rasm" @@ -502,15 +508,17 @@ def student_t_approx(): m.ensure_default_constraints() m.constrain_positive('t_noise') m.randomize() - m.update_likelihood_approximation() + #m.update_likelihood_approximation() + import ipdb; ipdb.set_trace() # XXX BREAKPOINT m.optimize() print(m) - plt.subplot(224) - m.plot() + ax = plt.subplot(212) + m.plot(ax=ax) plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) + plt.ylim(-1.5, 1.5) plt.title('Student-t rasm corrupt') + import ipdb; ipdb.set_trace() # XXX BREAKPOINT return m #print "Clean student t, ncg" @@ -607,7 +615,6 @@ def gaussian_f_check(): mgp.optimize() print "Gaussian" print mgp - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT kernelg = kernelgp.copy() #kernelst += GPy.kern.bias(X.shape[1]) @@ -615,6 +622,7 @@ def gaussian_f_check(): g_distribution = GPy.likelihoods.functions.Gaussian(variance=0.1, N=N, D=D) g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm') m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood) + m.likelihood.X = X #m['rbf_v'] = mgp._get_params()[0] #m['rbf_l'] = mgp._get_params()[1] + 1 m.ensure_default_constraints() @@ -623,18 +631,37 @@ def gaussian_f_check(): #m.constrain_bounded('t_no', 2*real_std**2, 1e3) #m.constrain_positive('bias') m.constrain_positive('noise_var') + #m['noise_variance'] = 0.1 + #m.likelihood.X = X m.randomize() import ipdb; ipdb.set_trace() # XXX BREAKPOINT - m['noise_variance'] = 0.1 - #m.likelihood.X = X plt.figure() ax = plt.subplot(211) m.plot(ax=ax) - ax = plt.subplot(212) + m.optimize() + ax = plt.subplot(212) m.plot(ax=ax) + print "final optimised gaussian" print m print "real GP" print mgp import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + +def boston_example(): + data = datasets.boston_housing() + X = data['X'].copy() + Y = data['Y'].copy() + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp.randomize() + mgp.optimize() + mgp.plot() + import ipdb; ipdb.set_trace() # XXX BREAKPOINT + +def plot_f_approx(model): + plt.figure() + model.plot(ax=plt.gca()) + plt.plot(model.X, model.likelihood.f_hat, c='g') diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py index 2f98b2ff..2897e1de 100644 --- a/GPy/likelihoods/laplace.py +++ b/GPy/likelihoods/laplace.py @@ -7,6 +7,7 @@ from likelihood import likelihood from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet from scipy.linalg.lapack import dtrtrs import random +from functools import partial #import pylab as plt class Laplace(likelihood): @@ -87,11 +88,15 @@ class Laplace(likelihood): #Implicit impl = mdot(dlp, dL_dfhat, I_KW_i) - expl_a = mdot(self.Ki_f, self.Ki_f.T) + #expl_a = mdot(self.Ki_f, self.Ki_f.T) + expl_a = np.dot(self.Ki_f, self.Ki_f.T) expl_b = self.Wi_K_i #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b) - expl = 0.5*expl_a + 0.5*expl_b # Might need to be -? - dL_dthetaK_exp = dK_dthetaK(expl, X) + #expl = 0.5*expl_a - 0.5*expl_b # Might need to be -? + #dL_dthetaK_exp = dK_dthetaK(expl, X) + dL_dthetaK_exp_a = dK_dthetaK(expl_a, X) + dL_dthetaK_exp_b = dK_dthetaK(expl_b, X) + dL_dthetaK_exp = 0.5*dL_dthetaK_exp_a - 0.5*dL_dthetaK_exp_b dL_dthetaK_imp = dK_dthetaK(impl, X) #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) #print "expl_a: {}, {} expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b)) @@ -116,7 +121,13 @@ class Laplace(likelihood): #b = 0.5*np.dot(np.diag(e).T, d) #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1)) #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i]) - dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + + #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i]) + #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i])))) + + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[thetaL_i]) + ) + import ipdb; ipdb.set_trace() # XXX BREAKPOINT #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) @@ -168,22 +179,31 @@ class Laplace(likelihood): Y_tilde = Wi*self.Ki_f + self.f_hat self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R + #self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below + self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K) + #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6 - self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) + #self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) - self.aA = 0.5*self.ln_det_K_Wi__Bi - self.bB = - 0.5*self.f_Ki_f - self.cC = 0.5*self.y_Wi_Ki_i_y + #self.aA = 0.5*self.ln_det_K_Wi__Bi + #self.bB = - 0.5*self.f_Ki_f + #self.cC = 0.5*self.y_Wi_Ki_i_y Z_tilde = (+ self.lik - + 0.5*self.ln_det_K_Wi__Bi + #+ 0.5*self.ln_det_K_Wi__Bi + - 0.5*self.ln_B_det + + 0.5*self.ln_det_Wi_K - 0.5*self.f_Ki_f + 0.5*self.y_Wi_Ki_i_y ) - print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC) - print self.likelihood_function._get_params() + #self.aA = 0.5*self.ln_det_Wi_K + #self.bB = - 0.5*self.f_Ki_f + #self.cC = 0.5*self.y_Wi_Ki_i_y + #self.dD = -0.5*self.ln_B_det + #print "Ztilde: {} lik: {} a: {} b: {} c: {} d:".format(Z_tilde, self.lik, self.aA, self.bB, self.cC, self.dD) + print "param value: {}".format(self.likelihood_function._get_params()) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -222,7 +242,7 @@ class Laplace(likelihood): #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) - self.Bi, _, _, B_det = pdinv(self.B) + self.Bi, _, _, self.ln_B_det = pdinv(self.B) #Do the computation again at f to get Ki_f which is useful #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) @@ -234,7 +254,7 @@ class Laplace(likelihood): self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K) #For det, |I + KW| == |I + W_12*K*W_12| - self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T) + #self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T) #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W)) #self.ln_z_hat = (- 0.5*self.f_Ki_f @@ -299,7 +319,7 @@ class Laplace(likelihood): def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10): """ - Rasmussens numerically stable mode finding + Rasmussen's numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 :K: Covariance matrix @@ -308,7 +328,7 @@ class Laplace(likelihood): :returns: f_mode """ self.old_before_s = self.likelihood_function._get_params() - print "before: ", self.old_before_s + #print "before: ", self.old_before_s #if self.old_before_s < 1e-5: #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT @@ -351,42 +371,42 @@ class Laplace(likelihood): full_step_a = b - W_12*solve_L da = full_step_a - old_a - #f_old = f.copy() - #def inner_obj(step_size, old_a, da, K): - #a = old_a + step_size*da - #f = np.dot(K, a) - #self.a = a.copy() # This is nasty, need to set something within an optimization though - #self.f = f.copy() - #return -obj(a, f) - - #from functools import partial - #i_o = partial(inner_obj, old_a=old_a, da=da, K=K) - ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20) - #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun - #f = self.f.copy() - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - f_old = f.copy() - update_passed = False - while not update_passed: + def inner_obj(step_size, old_a, da, K): a = old_a + step_size*da f = np.dot(K, a) + self.a = a.copy() # This is nasty, need to set something within an optimization though + self.f = f.copy() + return -obj(a, f) - old_obj = new_obj - new_obj = obj(a, f) - difference = new_obj - old_obj - print "difference: ",difference - if difference < 0: - #print "Objective function rose", np.float(difference) - #If the objective function isn't rising, restart optimization - step_size *= 0.8 - #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) - #objective function isn't increasing, try reducing step size - f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode - old_obj = new_obj - rs += 1 - else: - update_passed = True + i_o = partial(inner_obj, old_a=old_a, da=da, K=K) + #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20) + new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun + f = self.f.copy() + a = self.a.copy() + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + #f_old = f.copy() + #update_passed = False + #while not update_passed: + #a = old_a + step_size*da + #f = np.dot(K, a) + + #old_obj = new_obj + #new_obj = obj(a, f) + #difference = new_obj - old_obj + ##print "difference: ",difference + #if difference < 0: + ##print "Objective function rose", np.float(difference) + ##If the objective function isn't rising, restart optimization + #step_size *= 0.8 + ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + ##objective function isn't increasing, try reducing step size + #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode + #old_obj = new_obj + #rs += 1 + #else: + #update_passed = True #difference = abs(new_obj - old_obj) #old_obj = new_obj.copy() @@ -400,10 +420,11 @@ class Laplace(likelihood): self.old_a = old_a.copy() #print "Positive difference obj: ", np.float(difference) #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) - print "Iterations: {}, Final_difference: {}".format(i, difference) + #print "Iterations: {}, Final_difference: {}".format(i, difference) if difference > 1e-4: - print "FAIL FAIL FAIL FAIL FAIL FAIL" - if False: + #if True: + #print "Not perfect f_hat fit difference: {}".format(difference) + if True: import ipdb; ipdb.set_trace() ### XXX BREAKPOINT if hasattr(self, 'X'): import pylab as pb @@ -449,7 +470,7 @@ class Laplace(likelihood): self.old_ff = f.copy() self.old_K = self.K.copy() self.old_s = self.likelihood_function._get_params() - print "after: ", self.old_s + #print "after: ", self.old_s #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a)) self.a = a #self.B, self.B_chol, self.W_12 = B, L, W_12