diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6374a5fd..a1c71c71 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -16,6 +16,9 @@ def student_t_approx():
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
     #Y = Y/Y.max()
     Yc[10] += 100
@@ -25,7 +28,7 @@ def student_t_approx():
     #Yc = Yc/Yc.max()

     #Add student t random noise to datapoints
-    deg_free = 20 #100000.5
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rv.rvs(size=Y.shape)
@@ -47,6 +50,8 @@ def student_t_approx():
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()

     print "Clean Gaussian"
     #A Gaussian GP should break down here, since the outlying points get a lot of weight
@@ -58,6 +63,7 @@ def student_t_approx():
     # plot
     plt.subplot(211)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m

     #Corrupt
@@ -67,40 +73,64 @@ def student_t_approx():
     m.optimize()
     plt.subplot(212)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m

     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd

-    # Likelihood object
+    print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution)
-
-    print "Clean student t"
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
-    # optimize
     m.optimize()
     print(m)
-    # plot
-    plt.subplot(211)
+    plt.subplot(221)
     m.plot()
-    plt.ylim(-2.5,2.5)
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)

-    print "Corrupt student t"
+    print "Corrupt student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(222)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(212)
+    plt.subplot(224)
     m.plot()
-    plt.ylim(-2.5,2.5)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)

     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

     ###with a student t distribution, since it has heavy tails it should work well
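The commented-out `t_rv` lines above sketch drawing the corruption from a Student-t directly instead of hand-editing `Yc[10]` (note they pass `scale=real_var`, where the scale parameter should be the standard-deviation-like `real_sd`). A minimal stand-alone sketch of that approach with `scipy.stats`; `deg_free` matches the script, the other values are assumed for illustration:

    import numpy as np
    from scipy.stats import t

    deg_free, real_sd = 10, 0.2        # deg_free as in the script; real_sd assumed
    X = np.linspace(1.0, 10.0, 50)[:, None]
    # A Student-t with few degrees of freedom is heavy-tailed, so it throws
    # occasional large outliers -- the case the robust likelihood should absorb
    noise = t.rvs(deg_free, loc=0.0, scale=real_sd, size=X.shape)
    Y = np.sin(X) + noise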
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 1411c22b..8eb69869 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,16 +1,15 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det
-from functools import partial
+from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv,mdot
+from GPy.util.linalg import pdinv, mdot, jitchol
 #import numpy.testing.assert_array_equal


 class Laplace(likelihood):
     """Laplace approximation to a posterior"""

-    def __init__(self, data, likelihood_function):
+    def __init__(self, data, likelihood_function, rasm=True):
         """
         Laplace Approximation
@@ -30,6 +29,7 @@ class Laplace(likelihood):
         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.rasm = rasm

         #Initial values
         self.N, self.D = self.data.shape
@@ -102,20 +102,16 @@ class Laplace(likelihood):
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
         Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
                    + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
                    - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    )
-        self.Z = Z_tilde
-        self.Y = Y_tilde[:, None]
+        #Convert to float as it's (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        #if not self.likelihood_function.log_concave:
-            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-            ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-            ##To cause the posterior to become less certain than the prior and likelihood,
-            ##This is a property only held by non-log-concave likelihoods
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
@@ -125,32 +121,15 @@ class Laplace(likelihood):
     def fit_full(self, K):
         """
         :K: Covariance matrix
         """
         self.K = K.copy()
-        f = np.zeros((self.N, 1))
-        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
-        LOG_K_CONST = -(0.5 * self.log_Kdet)
-        OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
-        #Find \hat(f) using a newton raphson optimizer for example
-        #TODO: Add newton-raphson as subclass of optimizer class
-
-        #FIXME: Can we get rid of this horrible reshaping?
-        def obj(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
-            return float(res)
-
-        def obj_grad(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
-            return np.squeeze(res)
-
-        def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
-            return np.squeeze(res)
-
-        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        if self.rasm:
+            self.f_hat = self.rasm_mode(K)
+        else:
+            self.f_hat = self.ncg_mode(K)

         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                 #If the likelihood is non-log-concave. We want to say that there is a negative variance
@@ -176,8 +155,92 @@ class Laplace(likelihood):
         #Unsure whether it's log_hess or log_hess_i
         self.ln_z_hat = (- 0.5*self.log_hess_hat_det
                          + 0.5*self.log_Kdet
-                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )

         return self._compute_GP_variables()
+
+    def ncg_mode(self, K):
+        """Find the mode using a standard ncg optimizer and an explicit inversion of K (numerically unstable but intuitive)
+
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.K = K.copy()
+        f = np.zeros((self.N, 1))
+        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
+        LOG_K_CONST = -(0.5 * self.log_Kdet)
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + self.NORMAL_CONST + LOG_K_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K):
+        """
+        Rasmussen's numerically stable mode finding.
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        f = np.zeros((self.N, 1))
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            #Careful of the shape of the data!
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+
+        difference = np.inf
+        epsilon = 1e-16
+        step_size = 1
+        while difference > epsilon:
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            if not self.likelihood_function.log_concave:
+                W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
+                                #if the likelihood is non-log-concave. We want to say that there is a negative variance,
+                                #to cause the posterior to become less certain than the prior and likelihood;
+                                #this is a property only held by non-log-concave likelihoods
+            #W is diagonal, so its matrix square root is just the square root of the diagonal elements
+            W_12 = np.sqrt(W)
+            B = np.eye(self.N) + mdot(W_12, K, W_12)
+            L = jitchol(B)
+            #link_grad returns a squeezed (N,) array; reshape it to a column so that b keeps shape (N, 1)
+            b = np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None]
+            #TODO: Check L is lower
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
+            a = b - mdot(W_12, solve_L)
+            f = np.dot(K, a)
+            old_obj = new_obj
+            new_obj = obj(a, f)
+            difference = new_obj - old_obj
+            if difference < 0:
+                #The objective isn't increasing, so reduce the step size and restart the optimization
+                print "Reducing step-size, restarting"
+                step_size *= 0.9
+                f = np.zeros((self.N, 1))
+                new_obj = -np.inf
+                old_obj = np.inf
+
+            difference = abs(difference)
+
+        return f
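On the `#Unsure whether it's log_hess or log_hess_i` question in `fit_full`: the reference form of the Laplace-approximate log evidence (Rasmussen & Williams 2006, eq. 3.32) is

    log Z ~= log p(y|f_hat) - 0.5 * f_hat' K^-1 f_hat - 0.5 * log|B|,   with B = I + W^(1/2) K W^(1/2)

and since |B| = |K| * |K^-1 + W|, the determinant term can be assembled either from B or from the posterior Hessian K^-1 + W; `ln_z_hat` above can be checked numerically against this form.

`rasm_mode` itself follows Algorithm 3.1 of the same book: each Newton step factorizes B, which is symmetric with every eigenvalue at least 1 (once W is clamped nonnegative), instead of ever inverting K. A stripped-down sketch of one such step, without the step-size restart logic above; `grad` and `hess_diag` are hypothetical stand-ins for `link_grad`/`link_hess`, assumed to return (N, 1) columns:

    import numpy as np
    from scipy.linalg import cholesky, cho_solve

    def newton_step(f, K, grad, hess_diag):
        # One Newton step of GPML Algorithm 3.1 on the unnormalised log posterior
        N = f.shape[0]
        W = -hess_diag(f)                     # positive wherever the likelihood is log-concave
        W_12 = np.sqrt(W)
        B = np.eye(N) + W_12 * K * W_12.T     # I + W^1/2 K W^1/2; K^-1 is never formed
        L = cholesky(B, lower=True)
        b = W * f + grad(f)
        a = b - W_12 * cho_solve((L, True), W_12 * np.dot(K, b))
        return np.dot(K, a)                   # updated f; at the mode f = K grad(f)

At convergence f_hat = K * grad(f_hat), which is the stationarity condition of the objective -0.5 * a'f + log p(y|f) that `obj` tracks.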
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 50f9b620..15859a81 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -36,7 +36,10 @@ class student_t(likelihood_function):

         :returns: float(likelihood evaluated for this point)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -57,10 +61,12 @@ class student_t(likelihood_function):

         :returns: gradient of likelihood evaluated at points
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return grad
+        return np.squeeze(grad)
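As a check on the two derivative methods: with e = y - f, the f-dependent part of the Student-t log density is l(f) = -(v + 1)/2 * log(1 + e^2 / (sigma^2 * v)). Differentiating once gives

    dl/df = ((v + 1) * e) / (v * sigma^2 + e^2)

and differentiating again gives

    d2l/df2 = ((v + 1) * (e^2 - v * sigma^2)) / ((v * sigma^2 + e^2)^2)

which matches `link_grad` above and the corrected `link_hess` below; the commented-out `hess` line removed there had the numerator of the gradient, not of the second derivative.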
     def link_hess(self, y, f):
         """
@@ -75,11 +81,12 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is the diagonal of the covariance matrix (second derivative of likelihood evaluated at points)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return hess
+        return np.squeeze(hess)

     def predictive_values(self, mu, var):
         """
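Since `link_grad` and `link_hess` feed straight into the mode finding, a finite-difference test catches the sign and shape errors these hunks fix. A minimal sketch, assuming a `student_t` instance `lik` with the interface above; the tolerances are illustrative:

    import numpy as np

    def check_link_grad(lik, y, f, eps=1e-6, tol=1e-4):
        # Compare link_grad against central differences of link_function
        g = lik.link_grad(y, f)
        for i in range(len(f)):
            f_hi, f_lo = f.copy(), f.copy()
            f_hi[i] += eps
            f_lo[i] -= eps
            fd = (lik.link_function(y, f_hi) - lik.link_function(y, f_lo)) / (2 * eps)
            assert abs(g[i] - fd) < tol, "grad mismatch at %d: %g vs %g" % (i, g[i], fd)

The same pattern applied to `link_grad` checks `link_hess` entry by entry, since the likelihood factorizes over data points and all cross-derivatives vanish.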