mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-10 20:42:39 +02:00
Checked out relevant files
This commit is contained in:
parent
29358519d9
commit
f641ab54a8
3 changed files with 1411 additions and 0 deletions
639
GPy/examples/laplace_approximations.py
Normal file
639
GPy/examples/laplace_approximations.py
Normal file
|
|
@ -0,0 +1,639 @@
|
||||||
|
import GPy
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
np.random.seed(1)
|
||||||
|
|
||||||
|
def timing():
|
||||||
|
real_var = 0.1
|
||||||
|
times = 1
|
||||||
|
deg_free = 10
|
||||||
|
real_sd = np.sqrt(real_var)
|
||||||
|
the_is = np.zeros(times)
|
||||||
|
X = np.linspace(0.0, 10.0, 300)[:, None]
|
||||||
|
|
||||||
|
for a in xrange(times):
|
||||||
|
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
|
||||||
|
Yc = Y.copy()
|
||||||
|
|
||||||
|
Yc[10] += 100
|
||||||
|
Yc[25] += 10
|
||||||
|
Yc[23] += 10
|
||||||
|
Yc[24] += 10
|
||||||
|
Yc[250] += 10
|
||||||
|
#Yc[4] += 10000
|
||||||
|
|
||||||
|
edited_real_sd = real_sd
|
||||||
|
kernel1 = GPy.kern.rbf(X.shape[1])
|
||||||
|
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
m.optimize()
|
||||||
|
the_is[a] = m.likelihood.i
|
||||||
|
|
||||||
|
print the_is
|
||||||
|
print np.mean(the_is)
|
||||||
|
|
||||||
|
def v_fail_test():
|
||||||
|
#plt.close('all')
|
||||||
|
real_var = 0.1
|
||||||
|
X = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
|
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
|
||||||
|
Y = Y/Y.max()
|
||||||
|
|
||||||
|
#Add student t random noise to datapoints
|
||||||
|
deg_free = 10
|
||||||
|
real_sd = np.sqrt(real_var)
|
||||||
|
print "Real noise std: ", real_sd
|
||||||
|
|
||||||
|
kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1])
|
||||||
|
|
||||||
|
edited_real_sd = 0.3#real_sd
|
||||||
|
edited_real_sd = real_sd
|
||||||
|
|
||||||
|
print "Clean student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernel1)
|
||||||
|
m.constrain_positive('')
|
||||||
|
vs = 25
|
||||||
|
noises = 30
|
||||||
|
checkgrads = np.zeros((vs, noises))
|
||||||
|
vs_noises = np.zeros((vs, noises))
|
||||||
|
for v_ind, v in enumerate(np.linspace(1, 100, vs)):
|
||||||
|
m.likelihood.likelihood_function.v = v
|
||||||
|
print v
|
||||||
|
for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)):
|
||||||
|
m['t_noise'] = noise
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
checkgrads[v_ind, noise_ind] = m.checkgrad()
|
||||||
|
vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
|
||||||
|
|
||||||
|
plt.figure()
|
||||||
|
plt.title('Checkgrads')
|
||||||
|
plt.imshow(checkgrads, interpolation='nearest')
|
||||||
|
plt.xlabel('noise')
|
||||||
|
plt.ylabel('v')
|
||||||
|
|
||||||
|
#plt.figure()
|
||||||
|
#plt.title('variance change')
|
||||||
|
#plt.imshow(vs_noises, interpolation='nearest')
|
||||||
|
#plt.xlabel('noise')
|
||||||
|
#plt.ylabel('v')
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
print(m)
|
||||||
|
|
||||||
|
def student_t_obj_plane():
|
||||||
|
plt.close('all')
|
||||||
|
X = np.linspace(0, 1, 50)[:, None]
|
||||||
|
real_std = 0.002
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
deg_free = 1000
|
||||||
|
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp['noise'] = real_std**2
|
||||||
|
print "Gaussian"
|
||||||
|
print mgp
|
||||||
|
|
||||||
|
kernelst = kernelgp.copy()
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.constrain_fixed('t_no', real_std**2)
|
||||||
|
vs = 10
|
||||||
|
ls = 10
|
||||||
|
objs_t = np.zeros((vs, ls))
|
||||||
|
objs_g = np.zeros((vs, ls))
|
||||||
|
rbf_vs = np.linspace(1e-6, 8, vs)
|
||||||
|
rbf_ls = np.linspace(1e-2, 8, ls)
|
||||||
|
for v_id, rbf_v in enumerate(rbf_vs):
|
||||||
|
for l_id, rbf_l in enumerate(rbf_ls):
|
||||||
|
m['rbf_v'] = rbf_v
|
||||||
|
m['rbf_l'] = rbf_l
|
||||||
|
mgp['rbf_v'] = rbf_v
|
||||||
|
mgp['rbf_l'] = rbf_l
|
||||||
|
objs_t[v_id, l_id] = m.log_likelihood()
|
||||||
|
objs_g[v_id, l_id] = mgp.log_likelihood()
|
||||||
|
plt.figure()
|
||||||
|
plt.subplot(211)
|
||||||
|
plt.title('Student t')
|
||||||
|
plt.imshow(objs_t, interpolation='none')
|
||||||
|
plt.xlabel('variance')
|
||||||
|
plt.ylabel('lengthscale')
|
||||||
|
plt.subplot(212)
|
||||||
|
plt.title('Gaussian')
|
||||||
|
plt.imshow(objs_g, interpolation='none')
|
||||||
|
plt.xlabel('variance')
|
||||||
|
plt.ylabel('lengthscale')
|
||||||
|
plt.show()
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
return objs_t
|
||||||
|
|
||||||
|
def student_t_f_check():
|
||||||
|
plt.close('all')
|
||||||
|
X = np.linspace(0, 1, 50)[:, None]
|
||||||
|
real_std = 0.2
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
deg_free = 1000
|
||||||
|
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp.randomize()
|
||||||
|
mgp.optimize()
|
||||||
|
print "Gaussian"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
kernelst = kernelgp.copy()
|
||||||
|
#kernelst += GPy.kern.bias(X.shape[1])
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||||
|
#m['rbf_v'] = mgp._get_params()[0]
|
||||||
|
#m['rbf_l'] = mgp._get_params()[1] + 1
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
|
||||||
|
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
|
||||||
|
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
|
||||||
|
#m.constrain_positive('bias')
|
||||||
|
m.constrain_positive('t_no')
|
||||||
|
m.randomize()
|
||||||
|
m['t_no'] = 0.3
|
||||||
|
m.likelihood.X = X
|
||||||
|
#print m
|
||||||
|
plt.figure()
|
||||||
|
plt.subplot(211)
|
||||||
|
m.plot()
|
||||||
|
print "OPTIMIZED ONCE"
|
||||||
|
plt.subplot(212)
|
||||||
|
m.optimize()
|
||||||
|
m.plot()
|
||||||
|
print "final optimised student t"
|
||||||
|
print m
|
||||||
|
print "real GP"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
return m
|
||||||
|
|
||||||
|
def student_t_fix_optimise_check():
|
||||||
|
plt.close('all')
|
||||||
|
real_var = 0.1
|
||||||
|
real_std = np.sqrt(real_var)
|
||||||
|
X = np.random.rand(200)[:, None]
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
X_full = X
|
||||||
|
Y_full = np.sin(X_full)
|
||||||
|
Y = Y/Y.max()
|
||||||
|
Y_full = Y_full/Y_full.max()
|
||||||
|
deg_free = 1000
|
||||||
|
|
||||||
|
#GP
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp.randomize()
|
||||||
|
mgp.optimize()
|
||||||
|
|
||||||
|
kernelst = kernelgp.copy()
|
||||||
|
real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
|
||||||
|
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
|
||||||
|
plt.figure(1)
|
||||||
|
plt.suptitle('Student likelihood')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||||
|
m.constrain_fixed('rbf_var', mgp._get_params()[0])
|
||||||
|
m.constrain_fixed('rbf_len', mgp._get_params()[1])
|
||||||
|
m.constrain_positive('t_noise')
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
|
||||||
|
plt.subplot(231)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t original data noise')
|
||||||
|
|
||||||
|
#Fix student t noise variance to same a GP
|
||||||
|
gp_noise = mgp._get_params()[2]
|
||||||
|
m['t_noise_std2'] = gp_noise
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
|
||||||
|
plt.subplot(232)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t GP noise')
|
||||||
|
|
||||||
|
#Fix student t noise to variance converted from the GP
|
||||||
|
real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free))
|
||||||
|
m['t_noise_std2'] = real_stu_t_std2gp
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
|
||||||
|
plt.subplot(233)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t GP noise converted')
|
||||||
|
|
||||||
|
m.constrain_positive('t_noise_std2')
|
||||||
|
m.randomize()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
plt.subplot(234)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t fixed rbf')
|
||||||
|
m.optimize()
|
||||||
|
print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
|
||||||
|
plt.subplot(235)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t fixed rbf optimised')
|
||||||
|
|
||||||
|
plt.figure(2)
|
||||||
|
mrbf = m.copy()
|
||||||
|
mrbf.unconstrain('')
|
||||||
|
mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2)
|
||||||
|
gp_var = mgp._get_params()[0]
|
||||||
|
gp_len = mgp._get_params()[1]
|
||||||
|
mrbf.constrain_fixed('rbf_var', gp_var)
|
||||||
|
mrbf.constrain_positive('rbf_len')
|
||||||
|
mrbf.randomize()
|
||||||
|
print "Before optimize"
|
||||||
|
print mrbf
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
mrbf.checkgrad(verbose=1)
|
||||||
|
plt.subplot(121)
|
||||||
|
mrbf.plot()
|
||||||
|
plt.title('Student t fixed noise')
|
||||||
|
mrbf.optimize()
|
||||||
|
print "After optimize"
|
||||||
|
print mrbf
|
||||||
|
plt.subplot(122)
|
||||||
|
mrbf.plot()
|
||||||
|
plt.title('Student t fixed noise optimized')
|
||||||
|
print mrbf
|
||||||
|
|
||||||
|
plt.figure(3)
|
||||||
|
print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
|
||||||
|
plt.suptitle('Gaussian likelihood optimised')
|
||||||
|
mgp.plot()
|
||||||
|
print "Real std: {}".format(real_std)
|
||||||
|
print "Real variance {}".format(real_std**2)
|
||||||
|
|
||||||
|
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
print "Len should be: {}".format(gp_len)
|
||||||
|
return mrbf
|
||||||
|
|
||||||
|
def debug_student_t_noise_approx():
|
||||||
|
plot = False
|
||||||
|
real_var = 0.1
|
||||||
|
#Start a function, any function
|
||||||
|
#X = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
|
X = np.random.rand(100)[:, None]
|
||||||
|
#X = np.random.rand(100)[:, None]
|
||||||
|
#X = np.array([0.5, 1])[:, None]
|
||||||
|
Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
|
||||||
|
#Y = X + np.random.randn(*X.shape)*real_var
|
||||||
|
#ty = np.array([1., 9.97733584, 4.17841363])[:, None]
|
||||||
|
#Y = ty
|
||||||
|
|
||||||
|
X_full = X
|
||||||
|
Y_full = np.sin(X_full) + 1
|
||||||
|
|
||||||
|
Y = Y/Y.max()
|
||||||
|
|
||||||
|
#Add student t random noise to datapoints
|
||||||
|
deg_free = 100
|
||||||
|
|
||||||
|
real_sd = np.sqrt(real_var)
|
||||||
|
print "Real noise std: ", real_sd
|
||||||
|
|
||||||
|
initial_var_guess = 0.3
|
||||||
|
#t_rv = t(deg_free, loc=0, scale=real_var)
|
||||||
|
#noise = t_rvrvs(size=Y.shape)
|
||||||
|
#Y += noise
|
||||||
|
|
||||||
|
plt.close('all')
|
||||||
|
# Kernel object
|
||||||
|
kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
|
||||||
|
#kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
|
||||||
|
kernel2 = kernel1.copy()
|
||||||
|
kernel3 = kernel1.copy()
|
||||||
|
kernel4 = kernel1.copy()
|
||||||
|
kernel5 = kernel1.copy()
|
||||||
|
kernel6 = kernel1.copy()
|
||||||
|
|
||||||
|
print "Clean Gaussian"
|
||||||
|
#A GP should completely break down due to the points as they get a lot of weight
|
||||||
|
# create simple GP model
|
||||||
|
#m = GPy.models.GP_regression(X, Y, kernel=kernel1)
|
||||||
|
## optimize
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.optimize()
|
||||||
|
## plot
|
||||||
|
#if plot:
|
||||||
|
#plt.figure(1)
|
||||||
|
#plt.suptitle('Gaussian likelihood')
|
||||||
|
#plt.subplot(131)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#print m
|
||||||
|
|
||||||
|
real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
|
||||||
|
edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
|
||||||
|
#edited_real_sd = real_sd
|
||||||
|
|
||||||
|
print "Clean student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
|
||||||
|
#m['rbf_len'] = 1.5
|
||||||
|
#m.constrain_fixed('rbf_v', 1.0898)
|
||||||
|
#m.constrain_fixed('rbf_l', 0.2651)
|
||||||
|
#m.constrain_fixed('t_noise_std2', edited_real_sd)
|
||||||
|
#m.constrain_positive('rbf')
|
||||||
|
m.constrain_positive('t_noise_std2')
|
||||||
|
#m.constrain_positive('')
|
||||||
|
#m.constrain_bounded('t_noi', 0.001, 10)
|
||||||
|
#m.constrain_fixed('t_noi', real_stu_t_std)
|
||||||
|
#m.constrain_fixed('white', 0.01)
|
||||||
|
#m.constrain_fixed('t_no', 0.01)
|
||||||
|
#m['rbf_var'] = 0.20446332
|
||||||
|
#m['rbf_leng'] = 0.85776241
|
||||||
|
#m['t_noise'] = 0.667083294421005
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
#m.optimize(messages=True)
|
||||||
|
print(m)
|
||||||
|
#return m
|
||||||
|
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
|
||||||
|
if plot:
|
||||||
|
plt.suptitle('Student-t likelihood')
|
||||||
|
plt.subplot(132)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.ylim(-2.5, 2.5)
|
||||||
|
print "Real noise std: ", real_sd
|
||||||
|
print "or Real noise std: ", real_stu_t_std
|
||||||
|
return m
|
||||||
|
|
||||||
|
#print "Clean student t, ncg"
|
||||||
|
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
|
||||||
|
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.update_likelihood_approximation()
|
||||||
|
#m.optimize()
|
||||||
|
#print(m)
|
||||||
|
#if plot:
|
||||||
|
#plt.subplot(133)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#plt.ylim(-2.5, 2.5)
|
||||||
|
|
||||||
|
#plt.show()
|
||||||
|
|
||||||
|
def student_t_approx():
|
||||||
|
"""
|
||||||
|
Example of regressing with a student t likelihood
|
||||||
|
"""
|
||||||
|
real_std = 0.1
|
||||||
|
#Start a function, any function
|
||||||
|
X = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
|
Y = np.sin(X) + np.random.randn(*X.shape)*real_std
|
||||||
|
Yc = Y.copy()
|
||||||
|
|
||||||
|
X_full = np.linspace(0.0, 10.0, 500)[:, None]
|
||||||
|
Y_full = np.sin(X_full)
|
||||||
|
|
||||||
|
Y = Y/Y.max()
|
||||||
|
|
||||||
|
Yc[10] += 100
|
||||||
|
Yc[25] += 10
|
||||||
|
Yc[23] += 10
|
||||||
|
Yc[26] += 1000
|
||||||
|
Yc[24] += 10
|
||||||
|
#Yc = Yc/Yc.max()
|
||||||
|
|
||||||
|
#Add student t random noise to datapoints
|
||||||
|
deg_free = 8
|
||||||
|
print "Real noise: ", real_std
|
||||||
|
|
||||||
|
initial_var_guess = 0.1
|
||||||
|
#t_rv = t(deg_free, loc=0, scale=real_var)
|
||||||
|
#noise = t_rvrvs(size=Y.shape)
|
||||||
|
#Y += noise
|
||||||
|
|
||||||
|
#Add some extreme value noise to some of the datapoints
|
||||||
|
#percent_corrupted = 0.15
|
||||||
|
#corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
|
||||||
|
#indices = np.arange(Y.shape[0])
|
||||||
|
#np.random.shuffle(indices)
|
||||||
|
#corrupted_indices = indices[:corrupted_datums]
|
||||||
|
#print corrupted_indices
|
||||||
|
#noise = t_rv.rvs(size=(len(corrupted_indices), 1))
|
||||||
|
#Y[corrupted_indices] += noise
|
||||||
|
|
||||||
|
plt.figure(1)
|
||||||
|
plt.suptitle('Gaussian likelihood')
|
||||||
|
# Kernel object
|
||||||
|
kernel1 = GPy.kern.rbf(X.shape[1])
|
||||||
|
kernel2 = kernel1.copy()
|
||||||
|
kernel3 = kernel1.copy()
|
||||||
|
kernel4 = kernel1.copy()
|
||||||
|
kernel5 = kernel1.copy()
|
||||||
|
kernel6 = kernel1.copy()
|
||||||
|
|
||||||
|
print "Clean Gaussian"
|
||||||
|
#A GP should completely break down due to the points as they get a lot of weight
|
||||||
|
# create simple GP model
|
||||||
|
m = GPy.models.GP_regression(X, Y, kernel=kernel1)
|
||||||
|
# optimize
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.optimize()
|
||||||
|
# plot
|
||||||
|
plt.subplot(211)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.title('Gaussian clean')
|
||||||
|
print m
|
||||||
|
|
||||||
|
#Corrupt
|
||||||
|
print "Corrupt Gaussian"
|
||||||
|
m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
#m.optimize()
|
||||||
|
plt.subplot(212)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.title('Gaussian corrupt')
|
||||||
|
print m
|
||||||
|
|
||||||
|
plt.figure(2)
|
||||||
|
plt.suptitle('Student-t likelihood')
|
||||||
|
edited_real_sd = real_std #initial_var_guess
|
||||||
|
|
||||||
|
print "Clean student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.constrain_positive('t_noise')
|
||||||
|
m.randomize()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
m.optimize()
|
||||||
|
print(m)
|
||||||
|
plt.subplot(222)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.ylim(-2.5, 2.5)
|
||||||
|
plt.title('Student-t rasm clean')
|
||||||
|
|
||||||
|
print "Corrupt student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.constrain_positive('t_noise')
|
||||||
|
m.randomize()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
m.optimize()
|
||||||
|
print(m)
|
||||||
|
plt.subplot(224)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.ylim(-2.5, 2.5)
|
||||||
|
plt.title('Student-t rasm corrupt')
|
||||||
|
|
||||||
|
return m
|
||||||
|
|
||||||
|
#print "Clean student t, ncg"
|
||||||
|
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
|
||||||
|
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.update_likelihood_approximation()
|
||||||
|
#m.optimize()
|
||||||
|
#print(m)
|
||||||
|
#plt.subplot(221)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#plt.ylim(-2.5, 2.5)
|
||||||
|
#plt.title('Student-t ncg clean')
|
||||||
|
|
||||||
|
#print "Corrupt student t, ncg"
|
||||||
|
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
#corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
|
||||||
|
#m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.update_likelihood_approximation()
|
||||||
|
#m.optimize()
|
||||||
|
#print(m)
|
||||||
|
#plt.subplot(223)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#plt.ylim(-2.5, 2.5)
|
||||||
|
#plt.title('Student-t ncg corrupt')
|
||||||
|
|
||||||
|
|
||||||
|
###with a student t distribution, since it has heavy tails it should work well
|
||||||
|
###likelihood_function = student_t(deg_free, sigma2=real_var)
|
||||||
|
###lap = Laplace(Y, likelihood_function)
|
||||||
|
###cov = kernel.K(X)
|
||||||
|
###lap.fit_full(cov)
|
||||||
|
|
||||||
|
###test_range = np.arange(0, 10, 0.1)
|
||||||
|
###plt.plot(test_range, t_rv.pdf(test_range))
|
||||||
|
###for i in xrange(X.shape[0]):
|
||||||
|
###mode = lap.f_hat[i]
|
||||||
|
###covariance = lap.hess_hat_i[i,i]
|
||||||
|
###scaling = np.exp(lap.ln_z_hat)
|
||||||
|
###normalised_approx = norm(loc=mode, scale=covariance)
|
||||||
|
###print "Normal with mode %f, and variance %f" % (mode, covariance)
|
||||||
|
###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
|
||||||
|
###plt.show()
|
||||||
|
|
||||||
|
return m
|
||||||
|
|
||||||
|
|
||||||
|
def noisy_laplace_approx():
|
||||||
|
"""
|
||||||
|
Example of regressing with a student t likelihood
|
||||||
|
"""
|
||||||
|
#Start a function, any function
|
||||||
|
X = np.sort(np.random.uniform(0, 15, 70))[:, None]
|
||||||
|
Y = np.sin(X)
|
||||||
|
|
||||||
|
#Add some extreme value noise to some of the datapoints
|
||||||
|
percent_corrupted = 0.05
|
||||||
|
corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
|
||||||
|
indices = np.arange(Y.shape[0])
|
||||||
|
np.random.shuffle(indices)
|
||||||
|
corrupted_indices = indices[:corrupted_datums]
|
||||||
|
print corrupted_indices
|
||||||
|
noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
|
||||||
|
Y[corrupted_indices] += noise
|
||||||
|
|
||||||
|
#A GP should completely break down due to the points as they get a lot of weight
|
||||||
|
# create simple GP model
|
||||||
|
m = GPy.models.GP_regression(X, Y)
|
||||||
|
|
||||||
|
# optimize
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.optimize()
|
||||||
|
# plot
|
||||||
|
m.plot()
|
||||||
|
print m
|
||||||
|
|
||||||
|
#with a student t distribution, since it has heavy tails it should work well
|
||||||
|
|
||||||
|
def gaussian_f_check():
|
||||||
|
plt.close('all')
|
||||||
|
X = np.linspace(0, 1, 50)[:, None]
|
||||||
|
real_std = 0.2
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp.randomize()
|
||||||
|
mgp.optimize()
|
||||||
|
print "Gaussian"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
kernelg = kernelgp.copy()
|
||||||
|
#kernelst += GPy.kern.bias(X.shape[1])
|
||||||
|
N, D = X.shape
|
||||||
|
g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D)
|
||||||
|
g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, g_likelihood, kernelg)
|
||||||
|
#m['rbf_v'] = mgp._get_params()[0]
|
||||||
|
#m['rbf_l'] = mgp._get_params()[1] + 1
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
|
||||||
|
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
|
||||||
|
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
|
||||||
|
#m.constrain_positive('bias')
|
||||||
|
m.constrain_positive('noise_var')
|
||||||
|
m.randomize()
|
||||||
|
m['noise_variance'] = 0.1
|
||||||
|
m.likelihood.X = X
|
||||||
|
plt.figure()
|
||||||
|
plt.subplot(211)
|
||||||
|
m.plot()
|
||||||
|
plt.subplot(212)
|
||||||
|
m.optimize()
|
||||||
|
m.plot()
|
||||||
|
print "final optimised gaussian"
|
||||||
|
print m
|
||||||
|
print "real GP"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
453
GPy/likelihoods/Laplace.py
Normal file
453
GPy/likelihoods/Laplace.py
Normal file
|
|
@ -0,0 +1,453 @@
|
||||||
|
import numpy as np
|
||||||
|
import scipy as sp
|
||||||
|
import GPy
|
||||||
|
from scipy.linalg import inv, cho_solve, det
|
||||||
|
from numpy.linalg import cond
|
||||||
|
from likelihood import likelihood
|
||||||
|
from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
|
||||||
|
from scipy.linalg.lapack import dtrtrs
|
||||||
|
import random
|
||||||
|
#import pylab as plt
|
||||||
|
|
||||||
|
class Laplace(likelihood):
|
||||||
|
"""Laplace approximation to a posterior"""
|
||||||
|
|
||||||
|
def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
    """
    Laplace approximation to a posterior with a non-Gaussian likelihood.

    The fitting procedure (done elsewhere) finds the mode \hat{f} and the
    Hessian at the mode (via Newton-Raphson), computes the z' that turns
    the unnormalised Gaussian into a normalised one, and finally derives
    pseudo GP observations Y~ and z~ whose Gaussian posterior matches the
    Laplace approximation.

    :param data: array of data the likelihood function is approximating
    :param likelihood_function: likelihood function - subclass of likelihood_function
    :param extra_data: additional data used by some likelihood functions,
                       for example survival likelihoods need censoring data
    :param opt: optimiser to use; 'rasm' is numerically stable, 'ncg' or
                'nelder-mead' also available (latter only work with 1d data)
    """
    self.data = data
    self.likelihood_function = likelihood_function
    self.extra_data = extra_data
    self.opt = opt

    # Shape bookkeeping and flags
    self.N, self.D = self.data.shape
    self.is_heteroscedastic = True
    self.Nparams = 0

    # Precomputed (N/2) * log(2*pi), reused when normalising
    self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))

    # Placeholder GP pseudo-observations until the approximation is fitted
    self.Y = np.zeros((self.N, 1))
    self.covariance_matrix = np.eye(self.N)
    self.precision = np.ones(self.N)[:, None]
    self.Z = 0
    self.YYT = None

    # Cached Newton starting point from a previous fit (None = cold start)
    self.old_a = None
|
def predictive_values(self, mu, var, full_cov):
    """Push GP predictive moments through the likelihood function.

    Full predictive covariances are not supported by this approximation,
    so `full_cov=True` is rejected.
    """
    if not full_cov:
        return self.likelihood_function.predictive_values(mu, var)
    raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
|
def _get_params(self):
    """Return the wrapped likelihood function's parameters as a numpy array."""
    params = self.likelihood_function._get_params()
    return np.asarray(params)
|
def _get_param_names(self):
    """Parameter names, delegated to the wrapped likelihood function."""
    return self.likelihood_function._get_param_names()
|
def _set_params(self, p):
    """Forward new parameter values ``p`` to the wrapped likelihood function."""
    return self.likelihood_function._set_params(p)
|
def _shared_gradients_components(self):
    """Terms shared by the kernel- and likelihood-parameter gradients.

    Returns ``(dL_dfhat, I_KW_i)`` where ``dL_dfhat`` is the derivative of
    the approximate marginal w.r.t. the mode ``f_hat`` and ``I_KW_i`` is
    ``I - K.dot(Wi_K_i)``.

    FIXME: Careful of side effects! And make sure W and K are up to date!
    """
    third_deriv = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
    dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*third_deriv).T
    I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
    return dL_dfhat, I_KW_i
|
def _Kgradients(self, dK_dthetaK, X):
    """
    Gradients with respect to the prior kernel parameters, combining the
    explicit term with the implicit term that flows through the location
    of the mode f_hat.
    """
    dL_dfhat, I_KW_i = self._shared_gradients_components()
    grad_lik = self.likelihood_function.dlik_df(self.data, self.f_hat)

    # Implicit dependence: the mode f_hat shifts when K changes
    implicit = mdot(grad_lik, dL_dfhat, I_KW_i)
    # Explicit dependence of the objective on K
    # NOTE(review): original carried a "might need to be -?" doubt on this sign
    explicit = 0.5*mdot(self.Ki_f, self.Ki_f.T) + 0.5*self.Wi_K_i

    dL_dthetaK_exp = dK_dthetaK(explicit, X)
    dL_dthetaK_imp = dK_dthetaK(implicit, X)
    return dL_dthetaK_exp + dL_dthetaK_imp
|
def _gradients(self, partial):
    """
    Gradients with respect to the likelihood parameters.

    Returns an array with one entry per optimised likelihood parameter
    (e.g. shape (1,) for a student-t optimising only its noise scale),
    each entry the sum of an explicit term and an implicit term that
    accounts for the shift of the mode f_hat.
    """
    dL_dfhat, I_KW_i = self._shared_gradients_components()
    lik_grads = self.likelihood_function._gradients(self.data, self.f_hat)
    dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = lik_grads

    n_params = len(dlik_dthetaL)
    dL_dthetaL = np.zeros(n_params)  # one slot per likelihood parameter
    for i in range(n_params):
        # Explicit: direct dependence of the objective on thetaL
        explicit = np.sum(dlik_dthetaL[i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[i])

        # Implicit: dependence through the resulting shift of f_hat
        df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[i])
        implicit = np.dot(dL_dfhat, df_hat_dthetaL)

        dL_dthetaL[i] = explicit + implicit

    return dL_dthetaL
|
def _compute_GP_variables(self):
    """
    Generates data Y which would give the normal distribution identical
    to the Laplace approximation.

    GPy expects a likelihood to be Gaussian, so we need to calculate the
    points Y_tilde and a constant Z_tilde that make the Gaussian posterior
    match the one found by a Laplace approximation to the non-Gaussian
    likelihood.

    Given we are approximating $p(y|f)p(f)$ with a normal distribution
    (given $p(y|f)$ is not normal) we have a rescaled normal distribution
    z*N(f|f_hat, hess_hat^-1) with the same area as p(y|f)p(f) due to the
    z rescaling.

    This function finds the data D=(Y_tilde, X) that would produce
    z*N(f|f_hat, hess_hat^-1), giving a normal approximation of
    z_tilde*p(Y_tilde|f,X)p(f), where

    $$\tilde{Y} = \tilde{\Sigma} H \hat{f}$$
    $$\tilde{\Sigma}^{-1} = H - K^{-1} = diag(\nabla\nabla \log p(y|f)) = W$$
    $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$

    so $\tilde{\Sigma} = W^{-1}$.

    Side effects: sets self.Sigma_tilde, self.Wi_K_i, self.ln_det_K_Wi__Bi,
    self.lik, self.Z, self.Y, self.YYT, self.covariance_matrix and
    self.precision, which the surrounding GP machinery reads.
    """
    # W is diagonal (stored as a column), so its inverse is elementwise.
    Wi = 1.0/self.W
    self.Sigma_tilde = np.diagflat(Wi)

    # Pseudo-targets that reproduce the Laplace posterior under a
    # Gaussian-likelihood GP: Y_tilde = W^-1 K^-1 f_hat + f_hat.
    Y_tilde = Wi*self.Ki_f + self.f_hat

    # (W^-1 + K)^-1 via the B matrix: W^0.5 B^-1 W^0.5.
    self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R

    # log|K + W^-1| assembled from the cached log|I + W^0.5 K W^0.5|.
    self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
    self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)

    self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
    # aA/bB/cC are kept only for the debug print below — NOTE(review):
    # candidates for removal once the approximation is trusted.
    self.aA = 0.5*self.ln_det_K_Wi__Bi
    self.bB = - 0.5*self.f_Ki_f
    self.cC = 0.5*self.y_Wi_Ki_i_y
    # Normalisation constant of the matched Gaussian pseudo-likelihood.
    Z_tilde = (+ self.lik
               + 0.5*self.ln_det_K_Wi__Bi
               - 0.5*self.f_Ki_f
               + 0.5*self.y_Wi_Ki_i_y
               )
    print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
    print self.likelihood_function._get_params()

    #Convert to float as its (1, 1) and Z must be a scalar
    self.Z = np.float64(Z_tilde)
    self.Y = Y_tilde
    self.YYT = np.dot(self.Y, self.Y.T)
    self.covariance_matrix = self.Sigma_tilde
    self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
|
||||||
|
def fit_full(self, K):
    """
    Run the Laplace approximation: find the posterior mode under
    covariance K, then expand the Hessian around that mode.

    For nomenclature see Rasmussen & Williams 2006 - modified for
    numerical stability.

    :param K: covariance matrix evaluated at the training inputs
    """
    self.K = K.copy()

    # Dispatch to the mode-finding routine selected at construction time.
    mode_finders = {'rasm': self.rasm_mode,
                    'ncg': self.ncg_mode,
                    'nelder': self.nelder_mode}
    find_mode = mode_finders[self.opt]
    self.f_hat = find_mode(self.K)

    # With the mode located, compute the Hessian and derived quantities.
    self._compute_likelihood_variables()
|
||||||
|
def _compute_likelihood_variables(self):
    """
    Compute the Hessian W at the mode and the matrices derived from it
    (B, its Cholesky/inverse, (K^-1 + W)^-1, and log|I + KW|), then hand
    off to _compute_GP_variables to build the matched Gaussian pseudo-data.

    NOTE(review): relies on self.a having been set by the mode finder
    (rasm_mode does this; see self.Ki_f below) — confirm the other mode
    finders establish the same invariant before using them.
    """
    # At this point get the hessian matrix (or vector as W is diagonal)
    self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)

    if not self.likelihood_function.log_concave:
        self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                  # if the likelihood is non-log-concave. We want to say that there is a negative variance
                                  # to cause the posterior to become less certain than the prior and likelihood.
                                  # This is a property only held by non-log-concave likelihoods.

    # TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
    self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
    self.Bi, _, _, B_det = pdinv(self.B)  # B_det unused here; kept for tuple unpacking

    # K^-1 f_hat, reusing the 'a' vector computed by the mode finder
    # instead of inverting K again.
    self.Ki_f = self.a

    self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
    # (K^-1 + W)^-1 via the matrix inversion lemma: K - K (W^0.5 B^-1 W^0.5) K
    self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)

    # For det, |I + KW| == |I + W_12*K*W_12| (the symmetrised, stable form)
    self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)

    return self._compute_GP_variables()
|
||||||
|
def _compute_B_statistics(self, K, W):
    """
    Build Rasmussen & Williams' numerically stable positive definite
    matrix B = I + W^{1/2} K W^{1/2} and its (jittered) Cholesky factor.

    W is diagonal, so its square root is taken elementwise.

    :param K: covariance matrix
    :param W: negative Hessian of the log-likelihood at a point (diagonal)
    :returns: tuple (B, L, W_12) with L = chol(B) and W_12 = sqrt(W)
    """
    root_W = np.sqrt(W)  # elementwise: W is diagonal
    B = np.eye(self.N) + root_W * K * root_W.T
    chol_B = jitchol(B)
    return (B, chol_B, root_W)
|
||||||
|
def nelder_mode(self, K):
    """
    Find the posterior mode with a derivative-free Nelder-Mead simplex
    search on the negative unnormalised log posterior. Numerically naive:
    inverts K directly.

    :param K: covariance matrix
    :returns: f at the mode, shape (N, 1)
    """
    f = np.zeros((self.N, 1))
    self.Ki, _, _, self.ln_K_det = pdinv(K)
    def obj(f):
        # Negative log posterior (up to constants).
        # NOTE(review): uses self.data[:, 0] while rasm_mode passes
        # self.data — appears to assume 1D outputs; confirm.
        res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
        return float(res)

    res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
    f_new = res.x
    # minimize returns a flat vector; restore the (N, 1) column shape.
    return f_new[:, None]
|
||||||
|
def ncg_mode(self, K):
    """
    Find the mode using a Newton-CG optimizer and explicit inversion of K
    (numerically unstable but intuitive).

    :param K: covariance matrix
    :returns: f_mode, shape (N, 1)
    """
    self.Ki, _, _, self.ln_K_det = pdinv(K)

    f = np.zeros((self.N, 1))

    #FIXME: Can we get rid of this horrible reshaping?
    #ONLY WORKS FOR 1D DATA
    def obj(f):
        # Negative log posterior (shifted by NORMAL_CONST).
        res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                    - self.NORMAL_CONST)
        return float(res)

    def obj_grad(f):
        # Negative gradient of the log posterior.
        res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
        return np.squeeze(res)

    def obj_hess(f):
        # Negative Hessian: diagonal likelihood curvature minus K^-1.
        res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
        return np.squeeze(res)

    f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
    # fmin_ncg returns a flat vector; restore the (N, 1) column shape.
    return f_hat[:, None]
|
||||||
|
def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
|
||||||
|
"""
|
||||||
|
Rasmussens numerically stable mode finding
|
||||||
|
For nomenclature see Rasmussen & Williams 2006
|
||||||
|
|
||||||
|
:K: Covariance matrix
|
||||||
|
:MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
|
||||||
|
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
|
||||||
|
:returns: f_mode
|
||||||
|
"""
|
||||||
|
self.old_before_s = self.likelihood_function._get_params()
|
||||||
|
print "before: ", self.old_before_s
|
||||||
|
#if self.old_before_s < 1e-5:
|
||||||
|
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
#old_a = np.zeros((self.N, 1))
|
||||||
|
if self.old_a is None:
|
||||||
|
old_a = np.zeros((self.N, 1))
|
||||||
|
f = np.dot(K, old_a)
|
||||||
|
else:
|
||||||
|
old_a = self.old_a.copy()
|
||||||
|
f = self.f_hat.copy()
|
||||||
|
|
||||||
|
new_obj = -np.inf
|
||||||
|
old_obj = np.inf
|
||||||
|
|
||||||
|
def obj(a, f):
|
||||||
|
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
|
||||||
|
|
||||||
|
difference = np.inf
|
||||||
|
epsilon = 1e-4
|
||||||
|
step_size = 1
|
||||||
|
rs = 0
|
||||||
|
i = 0
|
||||||
|
|
||||||
|
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
|
||||||
|
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
||||||
|
#W = np.maximum(W, 0)
|
||||||
|
if not self.likelihood_function.log_concave:
|
||||||
|
W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||||
|
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
||||||
|
# To cause the posterior to become less certain than the prior and likelihood,
|
||||||
|
# This is a property only held by non-log-concave likelihoods
|
||||||
|
B, L, W_12 = self._compute_B_statistics(K, W.copy())
|
||||||
|
|
||||||
|
W_f = W*f
|
||||||
|
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
|
||||||
|
|
||||||
|
b = W_f + grad
|
||||||
|
solve_L = cho_solve((L, True), W_12*np.dot(K, b))
|
||||||
|
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
|
||||||
|
full_step_a = b - W_12*solve_L
|
||||||
|
da = full_step_a - old_a
|
||||||
|
|
||||||
|
#f_old = f.copy()
|
||||||
|
#def inner_obj(step_size, old_a, da, K):
|
||||||
|
#a = old_a + step_size*da
|
||||||
|
#f = np.dot(K, a)
|
||||||
|
#self.a = a.copy() # This is nasty, need to set something within an optimization though
|
||||||
|
#self.f = f.copy()
|
||||||
|
#return -obj(a, f)
|
||||||
|
|
||||||
|
#from functools import partial
|
||||||
|
#i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
|
||||||
|
##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
|
||||||
|
#new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
|
||||||
|
#f = self.f.copy()
|
||||||
|
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
f_old = f.copy()
|
||||||
|
update_passed = False
|
||||||
|
while not update_passed:
|
||||||
|
a = old_a + step_size*da
|
||||||
|
f = np.dot(K, a)
|
||||||
|
|
||||||
|
old_obj = new_obj
|
||||||
|
new_obj = obj(a, f)
|
||||||
|
difference = new_obj - old_obj
|
||||||
|
print "difference: ",difference
|
||||||
|
if difference < 0:
|
||||||
|
#print "Objective function rose", np.float(difference)
|
||||||
|
#If the objective function isn't rising, restart optimization
|
||||||
|
step_size *= 0.8
|
||||||
|
#print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
|
||||||
|
#objective function isn't increasing, try reducing step size
|
||||||
|
f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
|
||||||
|
old_obj = new_obj
|
||||||
|
rs += 1
|
||||||
|
else:
|
||||||
|
update_passed = True
|
||||||
|
|
||||||
|
#difference = abs(new_obj - old_obj)
|
||||||
|
#old_obj = new_obj.copy()
|
||||||
|
#difference = np.abs(np.sum(f - f_old))
|
||||||
|
difference = np.abs(np.sum(a - old_a))
|
||||||
|
#old_a = self.a.copy() #a
|
||||||
|
old_a = a.copy()
|
||||||
|
i += 1
|
||||||
|
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
||||||
|
|
||||||
|
self.old_a = old_a.copy()
|
||||||
|
#print "Positive difference obj: ", np.float(difference)
|
||||||
|
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
|
||||||
|
print "Iterations: {}, Final_difference: {}".format(i, difference)
|
||||||
|
if difference > 1e-4:
|
||||||
|
print "FAIL FAIL FAIL FAIL FAIL FAIL"
|
||||||
|
if False:
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
if hasattr(self, 'X'):
|
||||||
|
import pylab as pb
|
||||||
|
pb.figure()
|
||||||
|
pb.subplot(311)
|
||||||
|
pb.title('old f_hat')
|
||||||
|
pb.plot(self.X, self.f_hat)
|
||||||
|
pb.subplot(312)
|
||||||
|
pb.title('old ff')
|
||||||
|
pb.plot(self.X, self.old_ff)
|
||||||
|
pb.subplot(313)
|
||||||
|
pb.title('new f_hat')
|
||||||
|
pb.plot(self.X, f)
|
||||||
|
|
||||||
|
pb.figure()
|
||||||
|
pb.subplot(121)
|
||||||
|
pb.title('old K')
|
||||||
|
pb.imshow(np.diagflat(self.old_K), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
pb.subplot(122)
|
||||||
|
pb.title('new K')
|
||||||
|
pb.imshow(np.diagflat(K), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
|
||||||
|
pb.figure()
|
||||||
|
pb.subplot(121)
|
||||||
|
pb.title('old W')
|
||||||
|
pb.imshow(np.diagflat(self.old_W), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
pb.subplot(122)
|
||||||
|
pb.title('new W')
|
||||||
|
pb.imshow(np.diagflat(W), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
pb.close('all')
|
||||||
|
|
||||||
|
#FIXME: DELETE THESE
|
||||||
|
self.old_W = W.copy()
|
||||||
|
self.old_grad = grad.copy()
|
||||||
|
self.old_B = B.copy()
|
||||||
|
self.old_W_12 = W_12.copy()
|
||||||
|
self.old_ff = f.copy()
|
||||||
|
self.old_K = self.K.copy()
|
||||||
|
self.old_s = self.likelihood_function._get_params()
|
||||||
|
print "after: ", self.old_s
|
||||||
|
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
||||||
|
self.a = a
|
||||||
|
#self.B, self.B_chol, self.W_12 = B, L, W_12
|
||||||
|
#self.Bi, _, _, B_det = pdinv(self.B)
|
||||||
|
return f
|
||||||
319
GPy/models/GP.py
Normal file
319
GPy/models/GP.py
Normal file
|
|
@ -0,0 +1,319 @@
|
||||||
|
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from scipy import linalg
|
||||||
|
import pylab as pb
|
||||||
|
from .. import kern
|
||||||
|
from ..core import model
|
||||||
|
from ..util.linalg import pdinv, mdot, tdot
|
||||||
|
from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
|
||||||
|
from ..likelihoods import EP, Laplace
|
||||||
|
|
||||||
|
class GP(model):
    """
    Gaussian Process model for regression and EP.

    :param X: input observations
    :param likelihood: a GPy likelihood (Gaussian, EP or Laplace)
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :rtype: model object

    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        self.has_uncertain_inputs = False

        # parse arguments
        self.X = X
        assert len(self.X.shape) == 2
        self.N, self.Q = self.X.shape
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
        self.likelihood = likelihood
        # inputs and targets must describe the same number of data points
        assert self.X.shape[0] == self.likelihood.data.shape[0]
        self.N, self.D = self.likelihood.data.shape

        # here's some simple normalization for the inputs
        if normalize_X:
            self._Xmean = X.mean(0)[None, :]
            self._Xstd = X.std(0)[None, :]
            self.X = (X.copy() - self._Xmean) / self._Xstd
            if hasattr(self, 'Z'):
                self.Z = (self.Z - self._Xmean) / self._Xstd
        else:
            # identity normalization so later code can always denormalize
            self._Xmean = np.zeros((1, self.X.shape[1]))
            self._Xstd = np.ones((1, self.X.shape[1]))

        if not hasattr(self, 'has_uncertain_inputs'):
            self.has_uncertain_inputs = False
        model.__init__(self)

    def dL_dZ(self):
        """
        TODO: one day we might like to learn Z by gradient methods?
        """
        # FIXME: this doesn't live here.
        return np.zeros_like(self.Z)

    def _set_params(self, p):
        """
        Set kernel and likelihood parameters from the flat (transformed)
        vector p, then recompute K, its inverse/Cholesky and dL_dK.
        For a Laplace likelihood the approximation is re-fitted first.
        """
        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
        # self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
        self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas

        if isinstance(self.likelihood, Laplace):
            # Re-fit the Laplace approximation under the new kernel, then
            # push the (unchanged) likelihood params so its caches refresh.
            self.likelihood.fit_full(self.kern.K(self.X))
            self.likelihood._set_params(self.likelihood._get_params())

        self.K = self.kern.K(self.X)
        # Effective covariance: kernel plus the likelihood's (pseudo-)noise.
        self.K += self.likelihood.covariance_matrix

        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)

        # the gradient of the likelihood wrt the covariance matrix
        if self.likelihood.YYT is None:
            # alpha = np.dot(self.Ki, self.likelihood.Y), via Cholesky solve
            alpha, _ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y, lower=1)
            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
        else:
            # tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki), via two solves
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)

    def _get_params(self):
        # Flat parameter vector: transformed kernel params then likelihood params.
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))

    def _get_param_names(self):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()

    def _update_params_callback(self, p):
        """
        Optimizer callback: install the (transformed) parameter vector p
        and refresh the likelihood approximation.
        """
        # parameters will be in transformed space
        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
        # set_params_transformed for likelihood doesn't exist?
        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
        # update the likelihood approximation within the optimisation with the current parameters
        self.update_likelihood_approximation()

    def update_likelihood_approximation(self):
        """
        Re-fit the likelihood approximation (EP or Laplace) under the
        current kernel, then push parameters through _set_params to
        refresh K, its factorisations and dL_dK.

        For a Gaussian likelihood no iteration is required: fit_full is
        effectively a no-op.
        """
        self.likelihood.fit_full(self.kern.K(self.X))
        self._set_params(self._get_params()) # update the GP

    def _model_fit_term(self):
        """
        Computes the model fit term -0.5 * tr(K^-1 YY^T), using YYT when
        it's available and a triangular solve on Y otherwise.
        """
        if self.likelihood.YYT is None:
            # equivalent to -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
            tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
            return -0.5 * np.sum(np.square(tmp))
        else:
            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))

    def log_likelihood(self):
        """
        The log marginal likelihood of the GP.

        For an EP model, can be written as the log likelihood of a
        regression model for a new variable Y* = v_tilde/tau_tilde, with a
        covariance matrix K* = K + diag(1./tau_tilde) plus a normalization
        term (self.likelihood.Z).
        """
        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
        # NOTE(review): leftover debug print — candidate for removal.
        print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
        return l

    def _log_likelihood_gradients(self):
        """
        The gradient of all parameters.

        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta.
        For a Laplace likelihood the kernel gradients come from the
        likelihood's own _Kgradients (explicit + implicit terms).
        """
        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
        # NOTE(review): leftover debug print — candidate for removal.
        print "dL_dthetaK should be: ", dL_dthetaK
        if isinstance(self.likelihood, Laplace):
            dK_dthetaK = self.kern.dK_dtheta
            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
        else:
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

        return np.hstack((dL_dthetaK, dL_dthetaL))

    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
        """
        Internal helper function for making predictions, does not account
        for normalization or likelihood.

        :param _Xnew: test inputs (already normalized)
        :param full_cov: return the full posterior covariance instead of
            only its diagonal
        :param stop: when True hits a deliberate NameError below (debug trap)
        """
        Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
        # KiKx = np.dot(self.Ki, Kx), via Cholesky solve
        KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
        mu = np.dot(KiKx.T, self.likelihood.Y)
        if full_cov:
            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
            var = Kxx - np.dot(KiKx.T, Kx)
        else:
            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
            var = var[:, None]
        if stop:
            # NOTE(review): undefined name — raises NameError when
            # stop=True; looks like an intentional debug trap. Confirm.
            debug_this
        return mu, var

    def predict(self, Xnew, which_parts='all', full_cov=False):
        """
        Predict the function(s) at the new point(s) Xnew.

        Arguments
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.Q
        :param which_parts: specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean, a Numpy array, Nnew x self.D
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.D

        If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D.
        If self.D == 1, the return shape is Nnew x Nnew.
        This is to allow for different normalizations of the output dimensions.
        """
        # normalize X values
        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
        mu, var = self._raw_predict(Xnew, which_parts, full_cov)

        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)

        return mean, var, _025pm, _975pm

    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
        """
        Plot the GP's view of the world, where the data is normalized and
        the likelihood is Gaussian.

        :param samples: the number of a posteriori samples to plot
        :param which_data: which if the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D

        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimensions, a contour-plot shows the mean predicted function
          - In higher dimensions, we've not implemented this yet !TODO!

        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions.
        """
        if which_data == 'all':
            which_data = slice(None)

        if self.X.shape[1] == 1:
            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict(Xnew, which_parts=which_parts)
                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            else:
                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
                for i in range(samples):
                    pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            pb.xlim(xmin, xmax)
            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.ylim(ymin, ymax)
            if hasattr(self, 'Z'):
                pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)

        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            # NOTE(review): Xorig and Yorig are undefined in this scope —
            # this branch raises NameError as written; confirm intent.
            pb.scatter(Xorig[:, 0], Xorig[:, 1], 40, Yorig, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
        """
        Plot the posterior of the GP in the original (denormalized) data
        space, pushing predictions through the likelihood.

        :param levels: for 2D plotting, the number of contour levels to use
        (other parameters as for plot_f)
        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)

        if self.X.shape[1] == 1:

            Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now

            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            gpplot(Xnew, m, lower, upper)
            pb.plot(Xu[which_data], self.likelihood.data[which_data], 'kx', mew=1.5)
            if self.has_uncertain_inputs:
                pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)

            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.xlim(xmin, xmax)
            pb.ylim(ymin, ymax)
            if hasattr(self, 'Z'):
                Zu = self.Z * self._Xstd + self._Xmean
                pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
                # pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))

        elif self.X.shape[1] == 2: # FIXME
            resolution = resolution or 50
            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            Yf = self.likelihood.Y.flatten()
            pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
            if hasattr(self, 'Z'):
                pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')

        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
Loading…
Add table
Add a link
Reference in a new issue