From f641ab54a8b6d32445e7d08cb18902958afcf3e5 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 9 Sep 2013 13:41:58 +0100 Subject: [PATCH] Checked out relavent files --- GPy/examples/laplace_approximations.py | 639 +++++++++++++++++++++++++ GPy/likelihoods/Laplace.py | 453 ++++++++++++++++++ GPy/models/GP.py | 319 ++++++++++++ 3 files changed, 1411 insertions(+) create mode 100644 GPy/examples/laplace_approximations.py create mode 100644 GPy/likelihoods/Laplace.py create mode 100644 GPy/models/GP.py diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py new file mode 100644 index 00000000..8be08a8f --- /dev/null +++ b/GPy/examples/laplace_approximations.py @@ -0,0 +1,639 @@ +import GPy +import numpy as np +import matplotlib.pyplot as plt +np.random.seed(1) + +def timing(): + real_var = 0.1 + times = 1 + deg_free = 10 + real_sd = np.sqrt(real_var) + the_is = np.zeros(times) + X = np.linspace(0.0, 10.0, 300)[:, None] + + for a in xrange(times): + Y = np.sin(X) + np.random.randn(*X.shape)*real_var + Yc = Y.copy() + + Yc[10] += 100 + Yc[25] += 10 + Yc[23] += 10 + Yc[24] += 10 + Yc[250] += 10 + #Yc[4] += 10000 + + edited_real_sd = real_sd + kernel1 = GPy.kern.rbf(X.shape[1]) + + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + the_is[a] = m.likelihood.i + + print the_is + print np.mean(the_is) + +def v_fail_test(): + #plt.close('all') + real_var = 0.1 + X = np.linspace(0.0, 10.0, 50)[:, None] + Y = np.sin(X) + np.random.randn(*X.shape)*real_var + Y = Y/Y.max() + + #Add student t random noise to datapoints + deg_free = 10 + real_sd = np.sqrt(real_var) + print "Real noise std: ", real_sd + + kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1]) + + edited_real_sd = 0.3#real_sd + edited_real_sd = real_sd + + print "Clean student t, rasm" + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernel1) + m.constrain_positive('') + vs = 25 + noises = 30 + checkgrads = np.zeros((vs, noises)) + vs_noises = np.zeros((vs, noises)) + for v_ind, v in enumerate(np.linspace(1, 100, vs)): + m.likelihood.likelihood_function.v = v + print v + for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)): + m['t_noise'] = noise + m.update_likelihood_approximation() + checkgrads[v_ind, noise_ind] = m.checkgrad() + vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2) + + plt.figure() + plt.title('Checkgrads') + plt.imshow(checkgrads, interpolation='nearest') + plt.xlabel('noise') + plt.ylabel('v') + + #plt.figure() + #plt.title('variance change') + #plt.imshow(vs_noises, interpolation='nearest') + #plt.xlabel('noise') + #plt.ylabel('v') + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + print(m) + +def student_t_obj_plane(): + plt.close('all') + X = np.linspace(0, 1, 50)[:, None] + real_std = 0.002 + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + deg_free = 1000 + + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp['noise'] = real_std**2 + print 
"Gaussian" + print mgp + + kernelst = kernelgp.copy() + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2)) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernelst) + m.ensure_default_constraints() + m.constrain_fixed('t_no', real_std**2) + vs = 10 + ls = 10 + objs_t = np.zeros((vs, ls)) + objs_g = np.zeros((vs, ls)) + rbf_vs = np.linspace(1e-6, 8, vs) + rbf_ls = np.linspace(1e-2, 8, ls) + for v_id, rbf_v in enumerate(rbf_vs): + for l_id, rbf_l in enumerate(rbf_ls): + m['rbf_v'] = rbf_v + m['rbf_l'] = rbf_l + mgp['rbf_v'] = rbf_v + mgp['rbf_l'] = rbf_l + objs_t[v_id, l_id] = m.log_likelihood() + objs_g[v_id, l_id] = mgp.log_likelihood() + plt.figure() + plt.subplot(211) + plt.title('Student t') + plt.imshow(objs_t, interpolation='none') + plt.xlabel('variance') + plt.ylabel('lengthscale') + plt.subplot(212) + plt.title('Gaussian') + plt.imshow(objs_g, interpolation='none') + plt.xlabel('variance') + plt.ylabel('lengthscale') + plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return objs_t + +def student_t_f_check(): + plt.close('all') + X = np.linspace(0, 1, 50)[:, None] + real_std = 0.2 + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + deg_free = 1000 + + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp.randomize() + mgp.optimize() + print "Gaussian" + print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + kernelst = kernelgp.copy() + #kernelst += GPy.kern.bias(X.shape[1]) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernelst) + #m['rbf_v'] = mgp._get_params()[0] + #m['rbf_l'] = mgp._get_params()[1] + 1 + m.ensure_default_constraints() + #m.constrain_fixed('rbf_v', mgp._get_params()[0]) + #m.constrain_fixed('rbf_l', mgp._get_params()[1]) + #m.constrain_bounded('t_no', 2*real_std**2, 1e3) + #m.constrain_positive('bias') + m.constrain_positive('t_no') + m.randomize() + m['t_no'] = 0.3 + m.likelihood.X = X + #print m + plt.figure() + plt.subplot(211) + m.plot() + print "OPTIMIZED ONCE" + plt.subplot(212) + m.optimize() + m.plot() + print "final optimised student t" + print m + print "real GP" + print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return m + +def student_t_fix_optimise_check(): + plt.close('all') + real_var = 0.1 + real_std = np.sqrt(real_var) + X = np.random.rand(200)[:, None] + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + X_full = X + Y_full = np.sin(X_full) + Y = Y/Y.max() + Y_full = Y_full/Y_full.max() + deg_free = 1000 + + #GP + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp.randomize() + mgp.optimize() + + kernelst = kernelgp.copy() + real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free)) + + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + + plt.figure(1) + plt.suptitle('Student likelihood') + m = GPy.models.GP(X, stu_t_likelihood, kernelst) + m.constrain_fixed('rbf_var', mgp._get_params()[0]) + 
m.constrain_fixed('rbf_len', mgp._get_params()[1]) + m.constrain_positive('t_noise') + #m.ensure_default_constraints() + + m.update_likelihood_approximation() + print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood()) + plt.subplot(231) + m.plot() + plt.title('Student t original data noise') + + #Fix student t noise variance to same a GP + gp_noise = mgp._get_params()[2] + m['t_noise_std2'] = gp_noise + m.update_likelihood_approximation() + print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood()) + plt.subplot(232) + m.plot() + plt.title('Student t GP noise') + + #Fix student t noise to variance converted from the GP + real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free)) + m['t_noise_std2'] = real_stu_t_std2gp + m.update_likelihood_approximation() + print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood()) + plt.subplot(233) + m.plot() + plt.title('Student t GP noise converted') + + m.constrain_positive('t_noise_std2') + m.randomize() + m.update_likelihood_approximation() + plt.subplot(234) + m.plot() + plt.title('Student t fixed rbf') + m.optimize() + print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood()) + plt.subplot(235) + m.plot() + plt.title('Student t fixed rbf optimised') + + plt.figure(2) + mrbf = m.copy() + mrbf.unconstrain('') + mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2) + gp_var = mgp._get_params()[0] + gp_len = mgp._get_params()[1] + mrbf.constrain_fixed('rbf_var', gp_var) + mrbf.constrain_positive('rbf_len') + mrbf.randomize() + print "Before optimize" + print mrbf + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + mrbf.checkgrad(verbose=1) + plt.subplot(121) + mrbf.plot() + plt.title('Student t fixed noise') + mrbf.optimize() + print "After optimize" + print mrbf + plt.subplot(122) + mrbf.plot() + plt.title('Student t fixed noise optimized') + print mrbf + + plt.figure(3) + print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood()) + plt.suptitle('Gaussian likelihood optimised') + mgp.plot() + print "Real std: {}".format(real_std) + print "Real variance {}".format(real_std**2) + + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + print "Len should be: {}".format(gp_len) + return mrbf + +def debug_student_t_noise_approx(): + plot = False + real_var = 0.1 + #Start a function, any function + #X = np.linspace(0.0, 10.0, 50)[:, None] + X = np.random.rand(100)[:, None] + #X = np.random.rand(100)[:, None] + #X = np.array([0.5, 1])[:, None] + Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1 + #Y = X + np.random.randn(*X.shape)*real_var + #ty = np.array([1., 9.97733584, 4.17841363])[:, None] + #Y = ty + + X_full = X + Y_full = np.sin(X_full) + 1 + + Y = Y/Y.max() + + #Add student t random noise to datapoints + deg_free = 100 + + real_sd = np.sqrt(real_var) + print "Real noise std: ", real_sd + + initial_var_guess = 0.3 + #t_rv = t(deg_free, loc=0, scale=real_var) + #noise = t_rvrvs(size=Y.shape) + #Y += noise + + plt.close('all') + # Kernel object + kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) + #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1]) + kernel2 = kernel1.copy() + kernel3 = kernel1.copy() + kernel4 = kernel1.copy() + kernel5 = kernel1.copy() + kernel6 = kernel1.copy() + + print "Clean Gaussian" + #A GP 
should completely break down due to the points as they get a lot of weight + # create simple GP model + #m = GPy.models.GP_regression(X, Y, kernel=kernel1) + ## optimize + #m.ensure_default_constraints() + #m.optimize() + ## plot + #if plot: + #plt.figure(1) + #plt.suptitle('Gaussian likelihood') + #plt.subplot(131) + #m.plot() + #plt.plot(X_full, Y_full) + #print m + + real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) + edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd + #edited_real_sd = real_sd + + print "Clean student t, rasm" + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + + m = GPy.models.GP(X, stu_t_likelihood, kernel6) + #m['rbf_len'] = 1.5 + #m.constrain_fixed('rbf_v', 1.0898) + #m.constrain_fixed('rbf_l', 0.2651) + #m.constrain_fixed('t_noise_std2', edited_real_sd) + #m.constrain_positive('rbf') + m.constrain_positive('t_noise_std2') + #m.constrain_positive('') + #m.constrain_bounded('t_noi', 0.001, 10) + #m.constrain_fixed('t_noi', real_stu_t_std) + #m.constrain_fixed('white', 0.01) + #m.constrain_fixed('t_no', 0.01) + #m['rbf_var'] = 0.20446332 + #m['rbf_leng'] = 0.85776241 + #m['t_noise'] = 0.667083294421005 + m.ensure_default_constraints() + m.update_likelihood_approximation() + #m.optimize(messages=True) + print(m) + #return m + #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) + if plot: + plt.suptitle('Student-t likelihood') + plt.subplot(132) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + print "Real noise std: ", real_sd + print "or Real noise std: ", real_stu_t_std + return m + + #print "Clean student t, ncg" + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') + #m = GPy.models.GP(X, stu_t_likelihood, kernel3) + #m.ensure_default_constraints() + #m.update_likelihood_approximation() + #m.optimize() + #print(m) + #if plot: + #plt.subplot(133) + #m.plot() + #plt.plot(X_full, Y_full) + #plt.ylim(-2.5, 2.5) + + #plt.show() + +def student_t_approx(): + """ + Example of regressing with a student t likelihood + """ + real_std = 0.1 + #Start a function, any function + X = np.linspace(0.0, 10.0, 50)[:, None] + Y = np.sin(X) + np.random.randn(*X.shape)*real_std + Yc = Y.copy() + + X_full = np.linspace(0.0, 10.0, 500)[:, None] + Y_full = np.sin(X_full) + + Y = Y/Y.max() + + Yc[10] += 100 + Yc[25] += 10 + Yc[23] += 10 + Yc[26] += 1000 + Yc[24] += 10 + #Yc = Yc/Yc.max() + + #Add student t random noise to datapoints + deg_free = 8 + print "Real noise: ", real_std + + initial_var_guess = 0.1 + #t_rv = t(deg_free, loc=0, scale=real_var) + #noise = t_rvrvs(size=Y.shape) + #Y += noise + + #Add some extreme value noise to some of the datapoints + #percent_corrupted = 0.15 + #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + #indices = np.arange(Y.shape[0]) + #np.random.shuffle(indices) + #corrupted_indices = indices[:corrupted_datums] + #print corrupted_indices + #noise = t_rv.rvs(size=(len(corrupted_indices), 1)) + #Y[corrupted_indices] += noise + + plt.figure(1) + plt.suptitle('Gaussian likelihood') + # Kernel object + kernel1 = GPy.kern.rbf(X.shape[1]) + kernel2 = kernel1.copy() + kernel3 = kernel1.copy() + kernel4 = kernel1.copy() + kernel5 = kernel1.copy() + kernel6 = kernel1.copy() + + print "Clean Gaussian" + #A GP should completely break down 
due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X, Y, kernel=kernel1) + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + plt.subplot(211) + m.plot() + plt.plot(X_full, Y_full) + plt.title('Gaussian clean') + print m + + #Corrupt + print "Corrupt Gaussian" + m = GPy.models.GP_regression(X, Yc, kernel=kernel2) + m.ensure_default_constraints() + #m.optimize() + plt.subplot(212) + m.plot() + plt.plot(X_full, Y_full) + plt.title('Gaussian corrupt') + print m + + plt.figure(2) + plt.suptitle('Student-t likelihood') + edited_real_sd = real_std #initial_var_guess + + print "Clean student t, rasm" + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernel6) + m.ensure_default_constraints() + m.constrain_positive('t_noise') + m.randomize() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(222) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + plt.title('Student-t rasm clean') + + print "Corrupt student t, rasm" + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) + m.ensure_default_constraints() + m.constrain_positive('t_noise') + m.randomize() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(224) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + plt.title('Student-t rasm corrupt') + + return m + + #print "Clean student t, ncg" + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') + #m = GPy.models.GP(X, stu_t_likelihood, kernel3) + #m.ensure_default_constraints() + #m.update_likelihood_approximation() + #m.optimize() + #print(m) + #plt.subplot(221) + #m.plot() + #plt.plot(X_full, Y_full) + #plt.ylim(-2.5, 2.5) + #plt.title('Student-t ncg clean') + + #print "Corrupt student t, ncg" + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) + #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg') + #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) + #m.ensure_default_constraints() + #m.update_likelihood_approximation() + #m.optimize() + #print(m) + #plt.subplot(223) + #m.plot() + #plt.plot(X_full, Y_full) + #plt.ylim(-2.5, 2.5) + #plt.title('Student-t ncg corrupt') + + + ###with a student t distribution, since it has heavy tails it should work well + ###likelihood_function = student_t(deg_free, sigma2=real_var) + ###lap = Laplace(Y, likelihood_function) + ###cov = kernel.K(X) + ###lap.fit_full(cov) + + ###test_range = np.arange(0, 10, 0.1) + ###plt.plot(test_range, t_rv.pdf(test_range)) + ###for i in xrange(X.shape[0]): + ###mode = lap.f_hat[i] + ###covariance = lap.hess_hat_i[i,i] + ###scaling = np.exp(lap.ln_z_hat) + ###normalised_approx = norm(loc=mode, scale=covariance) + ###print "Normal with mode %f, and variance %f" % (mode, covariance) + ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + ###plt.show() + + return m + + +def noisy_laplace_approx(): + """ + Example of regressing with a student t likelihood + """ + #Start a function, any function + X = 
np.sort(np.random.uniform(0, 15, 70))[:, None] + Y = np.sin(X) + + #Add some extreme value noise to some of the datapoints + percent_corrupted = 0.05 + corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + indices = np.arange(Y.shape[0]) + np.random.shuffle(indices) + corrupted_indices = indices[:corrupted_datums] + print corrupted_indices + noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1)) + Y[corrupted_indices] += noise + + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X, Y) + + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + m.plot() + print m + + #with a student t distribution, since it has heavy tails it should work well + +def gaussian_f_check(): + plt.close('all') + X = np.linspace(0, 1, 50)[:, None] + real_std = 0.2 + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp.randomize() + mgp.optimize() + print "Gaussian" + print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + kernelg = kernelgp.copy() + #kernelst += GPy.kern.bias(X.shape[1]) + N, D = X.shape + g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D) + g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm') + m = GPy.models.GP(X, g_likelihood, kernelg) + #m['rbf_v'] = mgp._get_params()[0] + #m['rbf_l'] = mgp._get_params()[1] + 1 + m.ensure_default_constraints() + #m.constrain_fixed('rbf_v', mgp._get_params()[0]) + #m.constrain_fixed('rbf_l', mgp._get_params()[1]) + #m.constrain_bounded('t_no', 2*real_std**2, 1e3) + #m.constrain_positive('bias') + m.constrain_positive('noise_var') + m.randomize() + m['noise_variance'] = 0.1 + m.likelihood.X = X + plt.figure() + plt.subplot(211) + m.plot() + plt.subplot(212) + m.optimize() + m.plot() + print "final optimised gaussian" + print m + print "real GP" + print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py new file mode 100644 index 00000000..58304c23 --- /dev/null +++ b/GPy/likelihoods/Laplace.py @@ -0,0 +1,453 @@ +import numpy as np +import scipy as sp +import GPy +from scipy.linalg import inv, cho_solve, det +from numpy.linalg import cond +from likelihood import likelihood +from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet +from scipy.linalg.lapack import dtrtrs +import random +#import pylab as plt + +class Laplace(likelihood): + """Laplace approximation to a posterior""" + + def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'): + """ + Laplace Approximation + + First find the moments \hat{f} and the hessian at this point (using Newton-Raphson) + then find the z^{prime} which allows this to be a normalised gaussian instead of a + non-normalized gaussian + + Finally we must compute the GP variables (i.e. 
generate some Y^{squiggle} and z^{squiggle} which make a Gaussian the same as the Laplace approximation)
+
+        Arguments
+        ---------
+
+        :data: array of data the likelihood function is approximating
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
+        :opt: optimiser to use; 'rasm' is numerically stable, 'ncg' and 'nelder' also work (the latter only works with 1d data)
+
+        """
+        self.data = data
+        self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
+        self.opt = opt
+
+        #Initial values
+        self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
+
+        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
+
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N, 1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:, None]
+        self.Z = 0
+        self.YYT = None
+
+        self.old_a = None
+
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
+        return self.likelihood_function.predictive_values(mu, var)
+
+    def _get_params(self):
+        return np.asarray(self.likelihood_function._get_params())
+
+    def _get_param_names(self):
+        return self.likelihood_function._get_param_names()
+
+    def _set_params(self, p):
+        return self.likelihood_function._set_params(p)
+
+    def _shared_gradients_components(self):
+        #FIXME: Careful of side effects! And make sure W and K are up to date!
+        d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T
+        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
+        return dL_dfhat, I_KW_i
+
+    def _Kgradients(self, dK_dthetaK, X):
+        """
+        Gradients with respect to prior kernel parameters
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
+
+        #Implicit
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
+        expl_a = mdot(self.Ki_f, self.Ki_f.T)
+        expl_b = self.Wi_K_i
+        #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
+        expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
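+        # The marginal-likelihood gradient w.r.t. each kernel parameter has an
+        # explicit part (K appears directly in the Laplace objective) and an
+        # implicit part (changing the kernel also moves the mode f_hat); cf.
+        # Rasmussen & Williams (2006), section 5.5.1. Both are assembled below.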
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        dL_dthetaK_imp = dK_dthetaK(impl, X)
+        #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "expl_a: {}, {} expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b))
+        dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
+        return dL_dthetaK
+
+    def _gradients(self, partial):
+        """
+        Gradients with respect to likelihood parameters
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
+
+        num_params = len(dlik_dthetaL)
+        dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
+        for thetaL_i in range(num_params):
+            #Explicit
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #a = 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #d = dlik_hess_dthetaL[thetaL_i]
+            #e = pdinv(pdinv(self.K)[0] + np.diagflat(self.W))[0]
+            #b = 0.5*np.dot(np.diag(e).T, d)
+            #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1))
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i])
+            dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+
+            #Implicit
+            df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
+            dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
+            #print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+
+        return dL_dthetaL #should be an array of length *number of params being optimised*; for student t, optimising just one parameter, this is (1,)
+
+    def _compute_GP_variables(self):
+        """
+        Generates data Y which would give a normal distribution identical to the Laplace approximation
+
+        GPy expects a likelihood to be Gaussian, so we need to calculate the points Y^{squiggle} and Z^{squiggle}
+        that make the posterior match that found by a Laplace approximation to a non-Gaussian likelihood
+
+        Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
+        then we have a rescaled normal distribution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
+        due to the z rescaling.
+
+        At the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^-1).
+        This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^-1),
+        giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
+
+        $$\tilde{Y} = \tilde{\Sigma} H \hat{f}$$
+        where
+        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
+        i.e.
+        $$\tilde{\Sigma}^{-1} = -diag(\nabla\nabla \log p(y|\hat{f})) = W$$
+        since $-diag(\nabla\nabla \log p(y|\hat{f})) = H - K^{-1}$,
+        and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{T}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
+        """
+        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
+        #dtritri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*w)_i + I)f_hat = y_tilde
+        #L = jitchol(self.K)
+        #Li = chol_inv(L)
+        #Lt_W = L.T*self.W.T
+
+        #Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
+        #self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        #Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
+
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
+
+        Y_tilde = Wi*self.Ki_f + self.f_hat
+
+        self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as Rasmussen's R
+        #self.Wi_K_i[self.Wi_K_i < 1e-6] = 1e-6
+
+        self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
+        self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+
+        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+        self.aA = 0.5*self.ln_det_K_Wi__Bi
+        self.bB = - 0.5*self.f_Ki_f
+        self.cC = 0.5*self.y_Wi_Ki_i_y
+        Z_tilde = (+ self.lik
+                   + 0.5*self.ln_det_K_Wi__Bi
+                   - 0.5*self.f_Ki_f
+                   + 0.5*self.y_Wi_Ki_i_y
+                   )
+        print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
+        print self.likelihood_function._get_params()
+
+        #Convert to float as it's (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
+
+    def fit_full(self, K):
+        """
+        The Laplace approximation algorithm: given the prior covariance K, find the mode f_hat and expand the Hessian around it
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
+        :K: Covariance matrix
+        """
+        self.K = K.copy()
+
+        #Find mode
+        self.f_hat = {
+            'rasm': self.rasm_mode,
+            'ncg': self.ncg_mode,
+            'nelder': self.nelder_mode
+        }[self.opt](self.K)
+
+        #Compute hessian and other variables at mode
+        self._compute_likelihood_variables()
+
+    def _compute_likelihood_variables(self):
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
+
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
+            #if the likelihood is non-log-concave.
+            #We want to say that there is a negative variance
+            #to cause the posterior to become less certain than the prior and likelihood;
+            #this is a property only held by non-log-concave likelihoods.
+
+        #TODO: Could save on computation when using rasm by returning these; means it isn't just a "mode finder" though
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+
+        #Do the computation again at f to get Ki_f which is useful
+        #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        #solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
+        #a = b - self.W_12*solve_chol
+        self.Ki_f = self.a
+
+        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
+
+        #For det, |I + KW| == |I + W_12*K*W_12|
+        self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
+
+        #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
+        #self.ln_z_hat = (- 0.5*self.f_Ki_f
+                         #- self.ln_I_KW_det
+                         #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+                         #)
+
+        return self._compute_GP_variables()
+
+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B,
+        which has positive diagonal elements and can be easily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L, W_12)
+        """
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(self.N) + W_12*K*W_12.T
+        L = jitchol(B)
+        return (B, L, W_12)
+
+    def nelder_mode(self, K):
+        f = np.zeros((self.N, 1))
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
+            return float(res)
+
+        res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
+        f_new = res.x
+        return f_new[:, None]
+
+    def ncg_mode(self, K):
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
+        f = np.zeros((self.N, 1))
+
+        #FIXME: Can we get rid of this horrible reshaping?
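+        # obj below is the negative of the unnormalised log posterior,
+        #   -Psi(f) = -(log p(y|f) - 0.5 * f.T K^{-1} f) (up to a constant),
+        # so minimising it with fmin_ncg drives f to the posterior mode;
+        # obj_grad and obj_hess supply its first two derivatives.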
+        #ONLY WORKS FOR 1D DATA
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
+                        - self.NORMAL_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
+        """
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of Newton-Raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
+        :returns: f_mode
+        """
+        self.old_before_s = self.likelihood_function._get_params()
+        print "before: ", self.old_before_s
+        #if self.old_before_s < 1e-5:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        #old_a = np.zeros((self.N, 1))
+        if self.old_a is None:
+            old_a = np.zeros((self.N, 1))
+            f = np.dot(K, old_a)
+        else:
+            old_a = self.old_a.copy()
+            f = self.f_hat.copy()
+
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
+
+        difference = np.inf
+        epsilon = 1e-4
+        step_size = 1
+        rs = 0
+        i = 0
+
+        while difference > epsilon and i < MAX_ITER: # and rs < MAX_RESTART:
+            W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
+            #W = np.maximum(W, 0)
+            if not self.likelihood_function.log_concave:
+                W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
+                # if the likelihood is non-log-concave. We want to say that there is a negative variance
+                # to cause the posterior to become less certain than the prior and likelihood;
+                # this is a property only held by non-log-concave likelihoods.
+            B, L, W_12 = self._compute_B_statistics(K, W.copy())
+
+            W_f = W*f
+            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
+
+            b = W_f + grad
+            solve_L = cho_solve((L, True), W_12*np.dot(K, b))
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_a = b - W_12*solve_L
+            da = full_step_a - old_a
+
+            #f_old = f.copy()
+            #def inner_obj(step_size, old_a, da, K):
+                #a = old_a + step_size*da
+                #f = np.dot(K, a)
+                #self.a = a.copy() # This is nasty, need to set something within an optimization though
+                #self.f = f.copy()
+                #return -obj(a, f)
+
+            #from functools import partial
+            #i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
+            ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
+            #f = self.f.copy()
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+            f_old = f.copy()
+            update_passed = False
+            while not update_passed:
+                a = old_a + step_size*da
+                f = np.dot(K, a)
+
+                old_obj = new_obj
+                new_obj = obj(a, f)
+                difference = new_obj - old_obj
+                print "difference: ", difference
+                if difference < 0:
+                    #print "Objective function rose", np.float(difference)
+                    #If the objective function isn't rising, restart optimization
+                    step_size *= 0.8
+                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #objective function isn't increasing, try reducing step size
+                    f = f_old.copy() #it's actually faster not to go back to the old location and just zigzag across the mode
+                    old_obj = new_obj
+                    rs += 1
+                else:
+                    update_passed = True
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            #difference = np.abs(np.sum(f - f_old))
+            difference = np.abs(np.sum(a - old_a))
+            #old_a = self.a.copy() #a
+            old_a = a.copy()
+            i += 1
+            #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
+
+        self.old_a = old_a.copy()
+        #print "Positive difference obj: ", np.float(difference)
+        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        print "Iterations: {}, Final_difference: {}".format(i, difference)
+        if difference > 1e-4:
+            print "FAIL FAIL FAIL FAIL FAIL FAIL"
+        if False:
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+            if hasattr(self, 'X'):
+                import pylab as pb
+                pb.figure()
+                pb.subplot(311)
+                pb.title('old f_hat')
+                pb.plot(self.X, self.f_hat)
+                pb.subplot(312)
+                pb.title('old ff')
+                pb.plot(self.X, self.old_ff)
+                pb.subplot(313)
+                pb.title('new f_hat')
+                pb.plot(self.X, f)
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old K')
+                pb.imshow(np.diagflat(self.old_K), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new K')
+                pb.imshow(np.diagflat(K), interpolation='none')
+                pb.colorbar()
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old W')
+                pb.imshow(np.diagflat(self.old_W), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new W')
+                pb.imshow(np.diagflat(W), interpolation='none')
+                pb.colorbar()
+
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+            pb.close('all')
+
+        #FIXME: DELETE THESE
+        self.old_W = W.copy()
+        self.old_grad = grad.copy()
+        self.old_B = B.copy()
+        self.old_W_12 = W_12.copy()
+        self.old_ff = f.copy()
+        self.old_K = self.K.copy()
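+        # Each Newton step above follows Rasmussen & Williams (2006), Algorithm 3.1:
+        # with B = I + W^{1/2} K W^{1/2} and b = W f + dlog p(y|f)/df, the update
+        # direction is a = b - W^{1/2} B^{-1} W^{1/2} K b, avoiding a direct inverse
+        # of K. At convergence a = K^{-1} f_hat (since f = K a), which is why `a`
+        # is stored below and reused as Ki_f in _compute_likelihood_variables.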
+        self.old_s = self.likelihood_function._get_params()
+        print "after: ", self.old_s
+        #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
+        self.a = a
+        #self.B, self.B_chol, self.W_12 = B, L, W_12
+        #self.Bi, _, _, B_det = pdinv(self.B)
+        return f
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
new file mode 100644
index 00000000..77620488
--- /dev/null
+++ b/GPy/models/GP.py
@@ -0,0 +1,319 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import linalg
+import pylab as pb
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv, mdot, tdot
+from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
+from ..likelihoods import EP, Laplace
+
+class GP(model):
+    """
+    Gaussian Process model for regression and EP
+
+    :param X: input observations
+    :param kernel: a GPy kernel, defaults to rbf+white
+    :param likelihood: a GPy likelihood
+    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :rtype: model object
+    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
+    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
+    :type powerep: list
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+    def __init__(self, X, likelihood, kernel, normalize_X=False):
+        self.has_uncertain_inputs = False
+
+        # parse arguments
+        self.X = X
+        assert len(self.X.shape) == 2
+        self.N, self.Q = self.X.shape
+        assert isinstance(kernel, kern.kern)
+        self.kern = kernel
+        self.likelihood = likelihood
+        assert self.X.shape[0] == self.likelihood.data.shape[0]
+        self.N, self.D = self.likelihood.data.shape
+
+        # here's some simple normalization for the inputs
+        if normalize_X:
+            self._Xmean = X.mean(0)[None, :]
+            self._Xstd = X.std(0)[None, :]
+            self.X = (X.copy() - self._Xmean) / self._Xstd
+            if hasattr(self, 'Z'):
+                self.Z = (self.Z - self._Xmean) / self._Xstd
+        else:
+            self._Xmean = np.zeros((1, self.X.shape[1]))
+            self._Xstd = np.ones((1, self.X.shape[1]))
+
+        if not hasattr(self, 'has_uncertain_inputs'):
+            self.has_uncertain_inputs = False
+        model.__init__(self)
+
+    def dL_dZ(self):
+        """
+        TODO: one day we might like to learn Z by gradient methods?
+        """
+        #FIXME: this doesn't live here.
+        return np.zeros_like(self.Z)
+
+    def _set_params(self, p):
+        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        # self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
+        self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas
+
+        if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
+
+        self.K = self.kern.K(self.X)
+        self.K += self.likelihood.covariance_matrix
+
+        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
+
+        # the gradient of the likelihood wrt the covariance matrix
+        if self.likelihood.YYT is None:
+            #alpha = np.dot(self.Ki, self.likelihood.Y)
+            alpha, _ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y, lower=1)
+
+            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
+        else:
+            #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
+            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
+
+    def _get_params(self):
+        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
+
+    def _get_param_names(self):
+        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
+
+    def _update_params_callback(self, p):
+        #parameters will be in transformed space
+        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        #set_params_transformed for likelihood doesn't exist?
+        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
+        #update the likelihood approximation within the optimisation with the current parameters
+        self.update_likelihood_approximation()
+
+    def update_likelihood_approximation(self):
+        """
+        Approximates a non-Gaussian likelihood using Expectation Propagation
+        or the Laplace approximation
+
+        For a Gaussian likelihood, no iteration is required:
+        this function does nothing
+        """
+        self.likelihood.fit_full(self.kern.K(self.X))
+        self._set_params(self._get_params()) # update the GP
+
+    def _model_fit_term(self):
+        """
+        Computes the model fit using YYT if it's available
+        """
+        if self.likelihood.YYT is None:
+            tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
+            return -0.5 * np.sum(np.square(tmp))
+            #return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
+        else:
+            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
+
+    def log_likelihood(self):
+        """
+        The log marginal likelihood of the GP.
+
+        For an EP model, this can be written as the log likelihood of a regression
+        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
+        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
+        """
+        #if isinstance(self.likelihood, Laplace):
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
+        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
+        return l
+
+    def _log_likelihood_gradients(self):
+        """
+        The gradient of all parameters.
+
+        Note, we use the chain rule: dL_dtheta = dL_dK * dK_dtheta
+        """
+        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
+        print "dL_dthetaK should be: ", dL_dthetaK
+        if isinstance(self.likelihood, Laplace):
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
+            dK_dthetaK = self.kern.dK_dtheta
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        else:
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        #print "dL_dthetaK: {} dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
+
+        return np.hstack((dL_dthetaK, dL_dthetaL))
+        #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+
+    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
+        """
+        Internal helper function for making predictions; does not account
+        for normalization or likelihood
+        """
+        Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
+        #KiKx = np.dot(self.Ki, Kx)
+        KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
+        mu = np.dot(KiKx.T, self.likelihood.Y)
+        if full_cov:
+            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
+            var = Kxx - np.dot(KiKx.T, Kx)
+        else:
+            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
+            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
+            var = var[:, None]
+        if stop:
+            debug_this # deliberately undefined: raises NameError as a crude breakpoint
+        return mu, var
+
+    def predict(self, Xnew, which_parts='all', full_cov=False):
+        """
+        Predict the function(s) at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.Q
+        :param which_parts: specifies which of the kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.D
+        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.D
+
+        If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+        This is to allow for different normalizations of the output dimensions.
+        """
+        # normalize X values
+        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
+        mu, var = self._raw_predict(Xnew, which_parts, full_cov)
+
+        # now push through likelihood
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+
+        return mean, var, _025pm, _975pm
+
+    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
+        """
+        Plot the GP's view of the world, where the data is normalized and the
+        likelihood is Gaussian.
+
+        :param samples: the number of a posteriori samples to plot
+        :param which_data: which of the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :param which_parts: which of the kernel functions to plot (additively)
+        :type which_parts: 'all', or list of bools
+        :param resolution: the number of intervals to sample the GP on.
+            Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+
+        Plot the posterior of the GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, we've not implemented this yet !TODO!
+
+        Can plot only part of the data and part of the posterior functions
+        using which_data and which_functions
+        """
+        if which_data == 'all':
+            which_data = slice(None)
+
+        if self.X.shape[1] == 1:
+            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
+            if samples == 0:
+                m, v = self._raw_predict(Xnew, which_parts=which_parts)
+                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
+                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
+            else:
+                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
+                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
+                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
+                for i in range(samples):
+                    pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
+                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
+            pb.xlim(xmin, xmax)
+            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
+            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+            pb.ylim(ymin, ymax)
+            if hasattr(self, 'Z'):
+                pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
+
+        elif self.X.shape[1] == 2:
+            resolution = resolution or 50
+            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
+            m, v = self._raw_predict(Xnew, which_parts=which_parts)
+            m = m.reshape(resolution, resolution).T
+            pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+            pb.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
+            pb.xlim(xmin[0], xmax[0])
+            pb.ylim(xmin[1], xmax[1])
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+
+    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
+        """
+        TODO: Docstrings!
+        :param levels: for 2D plotting, the number of contour levels to use
+        """
+        # TODO include samples
+        if which_data == 'all':
+            which_data = slice(None)
+
+        if self.X.shape[1] == 1:
+
+            Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now
+
+            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
+            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
+            gpplot(Xnew, m, lower, upper)
+            pb.plot(Xu[which_data], self.likelihood.data[which_data], 'kx', mew=1.5)
+            if self.has_uncertain_inputs:
+                pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
+                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
+                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+
+            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+            pb.xlim(xmin, xmax)
+            pb.ylim(ymin, ymax)
+            if hasattr(self, 'Z'):
+                Zu = self.Z * self._Xstd + self._Xmean
+                pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
+                # pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
+
+        elif self.X.shape[1] == 2: # FIXME
+            resolution = resolution or 50
+            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
+            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
+            m = m.reshape(resolution, resolution).T
+            pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+            Yf = self.likelihood.Y.flatten()
+            pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+            pb.xlim(xmin[0], xmax[0])
+            pb.ylim(xmin[1], xmax[1])
+            if hasattr(self, 'Z'):
+                pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')
+
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
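
A minimal end-to-end sketch of the API this patch adds, mirroring the calls in
student_t_approx() from GPy/examples/laplace_approximations.py above (module
paths and Python 2 print syntax are those of this 2013-era GPy layout; the
deg_free and sigma2 values are the illustrative ones used in that example):

    import numpy as np
    import GPy

    X = np.linspace(0.0, 10.0, 50)[:, None]
    Y = np.sin(X) + np.random.randn(*X.shape)*0.1

    # Wrap a heavy-tailed Student-t likelihood in the Laplace approximation
    t_distribution = GPy.likelihoods.likelihood_functions.student_t(8, sigma2=0.1)
    laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')

    m = GPy.models.GP(X, laplace_likelihood, GPy.kern.rbf(X.shape[1]))
    m.ensure_default_constraints()
    m.update_likelihood_approximation()  # refit f_hat, Y_tilde, Z_tilde at the current hyperparameters
    m.optimize()
    print m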