From 617d73ca3271f080ed2e58efd9cbd9a49e301ac0 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 26 Jun 2013 15:44:26 +0100 Subject: [PATCH] Now checkgrads a lot more of the time, but still fails in optimisation, seems also odd that when parameter is fixed kernel parameters go to infinity --- GPy/examples/laplace_approximations.py | 17 +++++++++++------ GPy/likelihoods/Laplace.py | 23 ++++++++--------------- GPy/models/GP.py | 7 +++++-- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 61291e71..0fd3efeb 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -98,7 +98,7 @@ def debug_student_t_noise_approx(): Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 10 + deg_free = 100 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -133,20 +133,23 @@ def debug_student_t_noise_approx(): #plt.plot(X_full, Y_full) #print m - edited_real_sd = initial_var_guess #real_sd + real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) + edited_real_sd = real_stu_t_std#initial_var_guess #real_sd #edited_real_sd = real_sd print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernel6) - m['rbf_len'] = 1.5 + #m['rbf_len'] = 1.5 #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 1.8651) - #m.constrain_fixed('t_noise_variance', real_sd) + m.constrain_fixed('t_noise_std', edited_real_sd) #m.constrain_positive('rbf') - #m.constrain_positive('t_noise') - m.constrain_positive('') + #m.constrain_positive('t_noise_std') + #m.constrain_positive('') + m.ensure_default_constraints() #m.constrain_fixed('t_noi', real_sd) m.update_likelihood_approximation() #m.optimize(messages=True) @@ -264,6 +267,7 @@ def student_t_approx(): stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) m.ensure_default_constraints() + m.constrain_positive('t_noise') m.update_likelihood_approximation() m.optimize() print(m) @@ -278,6 +282,7 @@ def student_t_approx(): corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() + m.constrain_positive('t_noise') m.update_likelihood_approximation() m.optimize() print(m) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index b9d74846..1431a7c6 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -109,7 +109,7 @@ class Laplace(likelihood): #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL) - print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) + #print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -147,10 +147,11 @@ class Laplace(likelihood): Li = chol_inv(L) Lt_W = L.T*self.W.T - Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0] + Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0] self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N) Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT self.Sigma_tilde = np.diagflat(1.0/self.W) @@ -166,7 +167,7 @@ class Laplace(likelihood): - 0.5*self.f_Ki_f + 0.5*y_Wi_Ki_i_y ) - print "Ztilde: {}".format(Z_tilde) + #print "Ztilde: {}".format(Z_tilde) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -280,7 +281,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=10): + def rasm_mode(self, K, MAX_ITER=250, MAX_RESTART=10): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -308,7 +309,6 @@ class Laplace(likelihood): rs = 0 i = 0 while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: - #f_old = f.copy() W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur @@ -338,10 +338,10 @@ class Laplace(likelihood): #print "difference: ",difference if difference < -epsilon: #print grad - print "Objective function rose", np.float(difference) + #print "Objective function rose", np.float(difference) #If the objective function isn't rising, restart optimization step_size *= 0.4 - print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) #objective function isn't increasing, try reducing step size #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode #old_obj = tmp_old_obj @@ -351,18 +351,11 @@ class Laplace(likelihood): update_passed = True difference = np.abs(np.sum(f - f_old)) + abs(difference) - #print "Iter difference: ", difference - #print "F: ", f - #print "A: ", a old_a = a - #print "Positive difference obj: ", np.float(difference) - #difference = np.float(abs(difference)) i += 1 #print "Positive difference obj: ", np.float(difference) - print "Iterations: ",i - print "Step size reductions", rs - print "Final difference: ", difference + print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) self.a = a self.B, self.B_chol, self.W_12 = B, L, W_12 self.Bi, _, _, B_det = pdinv(self.B) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 636ebba0..7b6fab27 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -141,10 +141,11 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) + print "dL_dthetaK should be: ", dL_dthetaK if isinstance(self.likelihood, Laplace): + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) dK_dthetaK = self.kern.dK_dtheta dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) @@ -153,6 +154,8 @@ class GP(model): else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + print "dL_dthetaK is: ", dL_dthetaK + return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))