mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-10 20:42:39 +02:00
Checked out relevant files
This commit is contained in:
parent
29358519d9
commit
f641ab54a8
3 changed files with 1411 additions and 0 deletions
639
GPy/examples/laplace_approximations.py
Normal file
639
GPy/examples/laplace_approximations.py
Normal file
|
|
@ -0,0 +1,639 @@
|
||||||
|
import GPy
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
np.random.seed(1)
|
||||||
|
|
||||||
|
def timing():
|
||||||
|
real_var = 0.1
|
||||||
|
times = 1
|
||||||
|
deg_free = 10
|
||||||
|
real_sd = np.sqrt(real_var)
|
||||||
|
the_is = np.zeros(times)
|
||||||
|
X = np.linspace(0.0, 10.0, 300)[:, None]
|
||||||
|
|
||||||
|
for a in xrange(times):
|
||||||
|
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
|
||||||
|
Yc = Y.copy()
|
||||||
|
|
||||||
|
Yc[10] += 100
|
||||||
|
Yc[25] += 10
|
||||||
|
Yc[23] += 10
|
||||||
|
Yc[24] += 10
|
||||||
|
Yc[250] += 10
|
||||||
|
#Yc[4] += 10000
|
||||||
|
|
||||||
|
edited_real_sd = real_sd
|
||||||
|
kernel1 = GPy.kern.rbf(X.shape[1])
|
||||||
|
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
m.optimize()
|
||||||
|
the_is[a] = m.likelihood.i
|
||||||
|
|
||||||
|
print the_is
|
||||||
|
print np.mean(the_is)
|
||||||
|
|
||||||
|
def v_fail_test():
|
||||||
|
#plt.close('all')
|
||||||
|
real_var = 0.1
|
||||||
|
X = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
|
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
|
||||||
|
Y = Y/Y.max()
|
||||||
|
|
||||||
|
#Add student t random noise to datapoints
|
||||||
|
deg_free = 10
|
||||||
|
real_sd = np.sqrt(real_var)
|
||||||
|
print "Real noise std: ", real_sd
|
||||||
|
|
||||||
|
kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1])
|
||||||
|
|
||||||
|
edited_real_sd = 0.3#real_sd
|
||||||
|
edited_real_sd = real_sd
|
||||||
|
|
||||||
|
print "Clean student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernel1)
|
||||||
|
m.constrain_positive('')
|
||||||
|
vs = 25
|
||||||
|
noises = 30
|
||||||
|
checkgrads = np.zeros((vs, noises))
|
||||||
|
vs_noises = np.zeros((vs, noises))
|
||||||
|
for v_ind, v in enumerate(np.linspace(1, 100, vs)):
|
||||||
|
m.likelihood.likelihood_function.v = v
|
||||||
|
print v
|
||||||
|
for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)):
|
||||||
|
m['t_noise'] = noise
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
checkgrads[v_ind, noise_ind] = m.checkgrad()
|
||||||
|
vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
|
||||||
|
|
||||||
|
plt.figure()
|
||||||
|
plt.title('Checkgrads')
|
||||||
|
plt.imshow(checkgrads, interpolation='nearest')
|
||||||
|
plt.xlabel('noise')
|
||||||
|
plt.ylabel('v')
|
||||||
|
|
||||||
|
#plt.figure()
|
||||||
|
#plt.title('variance change')
|
||||||
|
#plt.imshow(vs_noises, interpolation='nearest')
|
||||||
|
#plt.xlabel('noise')
|
||||||
|
#plt.ylabel('v')
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
print(m)
|
||||||
|
|
||||||
|
def student_t_obj_plane():
|
||||||
|
plt.close('all')
|
||||||
|
X = np.linspace(0, 1, 50)[:, None]
|
||||||
|
real_std = 0.002
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
deg_free = 1000
|
||||||
|
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp['noise'] = real_std**2
|
||||||
|
print "Gaussian"
|
||||||
|
print mgp
|
||||||
|
|
||||||
|
kernelst = kernelgp.copy()
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.constrain_fixed('t_no', real_std**2)
|
||||||
|
vs = 10
|
||||||
|
ls = 10
|
||||||
|
objs_t = np.zeros((vs, ls))
|
||||||
|
objs_g = np.zeros((vs, ls))
|
||||||
|
rbf_vs = np.linspace(1e-6, 8, vs)
|
||||||
|
rbf_ls = np.linspace(1e-2, 8, ls)
|
||||||
|
for v_id, rbf_v in enumerate(rbf_vs):
|
||||||
|
for l_id, rbf_l in enumerate(rbf_ls):
|
||||||
|
m['rbf_v'] = rbf_v
|
||||||
|
m['rbf_l'] = rbf_l
|
||||||
|
mgp['rbf_v'] = rbf_v
|
||||||
|
mgp['rbf_l'] = rbf_l
|
||||||
|
objs_t[v_id, l_id] = m.log_likelihood()
|
||||||
|
objs_g[v_id, l_id] = mgp.log_likelihood()
|
||||||
|
plt.figure()
|
||||||
|
plt.subplot(211)
|
||||||
|
plt.title('Student t')
|
||||||
|
plt.imshow(objs_t, interpolation='none')
|
||||||
|
plt.xlabel('variance')
|
||||||
|
plt.ylabel('lengthscale')
|
||||||
|
plt.subplot(212)
|
||||||
|
plt.title('Gaussian')
|
||||||
|
plt.imshow(objs_g, interpolation='none')
|
||||||
|
plt.xlabel('variance')
|
||||||
|
plt.ylabel('lengthscale')
|
||||||
|
plt.show()
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
return objs_t
|
||||||
|
|
||||||
|
def student_t_f_check():
|
||||||
|
plt.close('all')
|
||||||
|
X = np.linspace(0, 1, 50)[:, None]
|
||||||
|
real_std = 0.2
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
deg_free = 1000
|
||||||
|
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp.randomize()
|
||||||
|
mgp.optimize()
|
||||||
|
print "Gaussian"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
kernelst = kernelgp.copy()
|
||||||
|
#kernelst += GPy.kern.bias(X.shape[1])
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||||
|
#m['rbf_v'] = mgp._get_params()[0]
|
||||||
|
#m['rbf_l'] = mgp._get_params()[1] + 1
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
|
||||||
|
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
|
||||||
|
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
|
||||||
|
#m.constrain_positive('bias')
|
||||||
|
m.constrain_positive('t_no')
|
||||||
|
m.randomize()
|
||||||
|
m['t_no'] = 0.3
|
||||||
|
m.likelihood.X = X
|
||||||
|
#print m
|
||||||
|
plt.figure()
|
||||||
|
plt.subplot(211)
|
||||||
|
m.plot()
|
||||||
|
print "OPTIMIZED ONCE"
|
||||||
|
plt.subplot(212)
|
||||||
|
m.optimize()
|
||||||
|
m.plot()
|
||||||
|
print "final optimised student t"
|
||||||
|
print m
|
||||||
|
print "real GP"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
return m
|
||||||
|
|
||||||
|
def student_t_fix_optimise_check():
|
||||||
|
plt.close('all')
|
||||||
|
real_var = 0.1
|
||||||
|
real_std = np.sqrt(real_var)
|
||||||
|
X = np.random.rand(200)[:, None]
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
X_full = X
|
||||||
|
Y_full = np.sin(X_full)
|
||||||
|
Y = Y/Y.max()
|
||||||
|
Y_full = Y_full/Y_full.max()
|
||||||
|
deg_free = 1000
|
||||||
|
|
||||||
|
#GP
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp.randomize()
|
||||||
|
mgp.optimize()
|
||||||
|
|
||||||
|
kernelst = kernelgp.copy()
|
||||||
|
real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
|
||||||
|
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
|
||||||
|
plt.figure(1)
|
||||||
|
plt.suptitle('Student likelihood')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||||
|
m.constrain_fixed('rbf_var', mgp._get_params()[0])
|
||||||
|
m.constrain_fixed('rbf_len', mgp._get_params()[1])
|
||||||
|
m.constrain_positive('t_noise')
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
|
||||||
|
plt.subplot(231)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t original data noise')
|
||||||
|
|
||||||
|
#Fix student t noise variance to same a GP
|
||||||
|
gp_noise = mgp._get_params()[2]
|
||||||
|
m['t_noise_std2'] = gp_noise
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
|
||||||
|
plt.subplot(232)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t GP noise')
|
||||||
|
|
||||||
|
#Fix student t noise to variance converted from the GP
|
||||||
|
real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free))
|
||||||
|
m['t_noise_std2'] = real_stu_t_std2gp
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
|
||||||
|
plt.subplot(233)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t GP noise converted')
|
||||||
|
|
||||||
|
m.constrain_positive('t_noise_std2')
|
||||||
|
m.randomize()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
plt.subplot(234)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t fixed rbf')
|
||||||
|
m.optimize()
|
||||||
|
print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
|
||||||
|
plt.subplot(235)
|
||||||
|
m.plot()
|
||||||
|
plt.title('Student t fixed rbf optimised')
|
||||||
|
|
||||||
|
plt.figure(2)
|
||||||
|
mrbf = m.copy()
|
||||||
|
mrbf.unconstrain('')
|
||||||
|
mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2)
|
||||||
|
gp_var = mgp._get_params()[0]
|
||||||
|
gp_len = mgp._get_params()[1]
|
||||||
|
mrbf.constrain_fixed('rbf_var', gp_var)
|
||||||
|
mrbf.constrain_positive('rbf_len')
|
||||||
|
mrbf.randomize()
|
||||||
|
print "Before optimize"
|
||||||
|
print mrbf
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
mrbf.checkgrad(verbose=1)
|
||||||
|
plt.subplot(121)
|
||||||
|
mrbf.plot()
|
||||||
|
plt.title('Student t fixed noise')
|
||||||
|
mrbf.optimize()
|
||||||
|
print "After optimize"
|
||||||
|
print mrbf
|
||||||
|
plt.subplot(122)
|
||||||
|
mrbf.plot()
|
||||||
|
plt.title('Student t fixed noise optimized')
|
||||||
|
print mrbf
|
||||||
|
|
||||||
|
plt.figure(3)
|
||||||
|
print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
|
||||||
|
plt.suptitle('Gaussian likelihood optimised')
|
||||||
|
mgp.plot()
|
||||||
|
print "Real std: {}".format(real_std)
|
||||||
|
print "Real variance {}".format(real_std**2)
|
||||||
|
|
||||||
|
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
print "Len should be: {}".format(gp_len)
|
||||||
|
return mrbf
|
||||||
|
|
||||||
|
def debug_student_t_noise_approx():
|
||||||
|
plot = False
|
||||||
|
real_var = 0.1
|
||||||
|
#Start a function, any function
|
||||||
|
#X = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
|
X = np.random.rand(100)[:, None]
|
||||||
|
#X = np.random.rand(100)[:, None]
|
||||||
|
#X = np.array([0.5, 1])[:, None]
|
||||||
|
Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
|
||||||
|
#Y = X + np.random.randn(*X.shape)*real_var
|
||||||
|
#ty = np.array([1., 9.97733584, 4.17841363])[:, None]
|
||||||
|
#Y = ty
|
||||||
|
|
||||||
|
X_full = X
|
||||||
|
Y_full = np.sin(X_full) + 1
|
||||||
|
|
||||||
|
Y = Y/Y.max()
|
||||||
|
|
||||||
|
#Add student t random noise to datapoints
|
||||||
|
deg_free = 100
|
||||||
|
|
||||||
|
real_sd = np.sqrt(real_var)
|
||||||
|
print "Real noise std: ", real_sd
|
||||||
|
|
||||||
|
initial_var_guess = 0.3
|
||||||
|
#t_rv = t(deg_free, loc=0, scale=real_var)
|
||||||
|
#noise = t_rvrvs(size=Y.shape)
|
||||||
|
#Y += noise
|
||||||
|
|
||||||
|
plt.close('all')
|
||||||
|
# Kernel object
|
||||||
|
kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
|
||||||
|
#kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
|
||||||
|
kernel2 = kernel1.copy()
|
||||||
|
kernel3 = kernel1.copy()
|
||||||
|
kernel4 = kernel1.copy()
|
||||||
|
kernel5 = kernel1.copy()
|
||||||
|
kernel6 = kernel1.copy()
|
||||||
|
|
||||||
|
print "Clean Gaussian"
|
||||||
|
#A GP should completely break down due to the points as they get a lot of weight
|
||||||
|
# create simple GP model
|
||||||
|
#m = GPy.models.GP_regression(X, Y, kernel=kernel1)
|
||||||
|
## optimize
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.optimize()
|
||||||
|
## plot
|
||||||
|
#if plot:
|
||||||
|
#plt.figure(1)
|
||||||
|
#plt.suptitle('Gaussian likelihood')
|
||||||
|
#plt.subplot(131)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#print m
|
||||||
|
|
||||||
|
real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
|
||||||
|
edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
|
||||||
|
#edited_real_sd = real_sd
|
||||||
|
|
||||||
|
print "Clean student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
|
||||||
|
#m['rbf_len'] = 1.5
|
||||||
|
#m.constrain_fixed('rbf_v', 1.0898)
|
||||||
|
#m.constrain_fixed('rbf_l', 0.2651)
|
||||||
|
#m.constrain_fixed('t_noise_std2', edited_real_sd)
|
||||||
|
#m.constrain_positive('rbf')
|
||||||
|
m.constrain_positive('t_noise_std2')
|
||||||
|
#m.constrain_positive('')
|
||||||
|
#m.constrain_bounded('t_noi', 0.001, 10)
|
||||||
|
#m.constrain_fixed('t_noi', real_stu_t_std)
|
||||||
|
#m.constrain_fixed('white', 0.01)
|
||||||
|
#m.constrain_fixed('t_no', 0.01)
|
||||||
|
#m['rbf_var'] = 0.20446332
|
||||||
|
#m['rbf_leng'] = 0.85776241
|
||||||
|
#m['t_noise'] = 0.667083294421005
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
#m.optimize(messages=True)
|
||||||
|
print(m)
|
||||||
|
#return m
|
||||||
|
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
|
||||||
|
if plot:
|
||||||
|
plt.suptitle('Student-t likelihood')
|
||||||
|
plt.subplot(132)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.ylim(-2.5, 2.5)
|
||||||
|
print "Real noise std: ", real_sd
|
||||||
|
print "or Real noise std: ", real_stu_t_std
|
||||||
|
return m
|
||||||
|
|
||||||
|
#print "Clean student t, ncg"
|
||||||
|
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
|
||||||
|
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.update_likelihood_approximation()
|
||||||
|
#m.optimize()
|
||||||
|
#print(m)
|
||||||
|
#if plot:
|
||||||
|
#plt.subplot(133)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#plt.ylim(-2.5, 2.5)
|
||||||
|
|
||||||
|
#plt.show()
|
||||||
|
|
||||||
|
def student_t_approx():
|
||||||
|
"""
|
||||||
|
Example of regressing with a student t likelihood
|
||||||
|
"""
|
||||||
|
real_std = 0.1
|
||||||
|
#Start a function, any function
|
||||||
|
X = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
|
Y = np.sin(X) + np.random.randn(*X.shape)*real_std
|
||||||
|
Yc = Y.copy()
|
||||||
|
|
||||||
|
X_full = np.linspace(0.0, 10.0, 500)[:, None]
|
||||||
|
Y_full = np.sin(X_full)
|
||||||
|
|
||||||
|
Y = Y/Y.max()
|
||||||
|
|
||||||
|
Yc[10] += 100
|
||||||
|
Yc[25] += 10
|
||||||
|
Yc[23] += 10
|
||||||
|
Yc[26] += 1000
|
||||||
|
Yc[24] += 10
|
||||||
|
#Yc = Yc/Yc.max()
|
||||||
|
|
||||||
|
#Add student t random noise to datapoints
|
||||||
|
deg_free = 8
|
||||||
|
print "Real noise: ", real_std
|
||||||
|
|
||||||
|
initial_var_guess = 0.1
|
||||||
|
#t_rv = t(deg_free, loc=0, scale=real_var)
|
||||||
|
#noise = t_rvrvs(size=Y.shape)
|
||||||
|
#Y += noise
|
||||||
|
|
||||||
|
#Add some extreme value noise to some of the datapoints
|
||||||
|
#percent_corrupted = 0.15
|
||||||
|
#corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
|
||||||
|
#indices = np.arange(Y.shape[0])
|
||||||
|
#np.random.shuffle(indices)
|
||||||
|
#corrupted_indices = indices[:corrupted_datums]
|
||||||
|
#print corrupted_indices
|
||||||
|
#noise = t_rv.rvs(size=(len(corrupted_indices), 1))
|
||||||
|
#Y[corrupted_indices] += noise
|
||||||
|
|
||||||
|
plt.figure(1)
|
||||||
|
plt.suptitle('Gaussian likelihood')
|
||||||
|
# Kernel object
|
||||||
|
kernel1 = GPy.kern.rbf(X.shape[1])
|
||||||
|
kernel2 = kernel1.copy()
|
||||||
|
kernel3 = kernel1.copy()
|
||||||
|
kernel4 = kernel1.copy()
|
||||||
|
kernel5 = kernel1.copy()
|
||||||
|
kernel6 = kernel1.copy()
|
||||||
|
|
||||||
|
print "Clean Gaussian"
|
||||||
|
#A GP should completely break down due to the points as they get a lot of weight
|
||||||
|
# create simple GP model
|
||||||
|
m = GPy.models.GP_regression(X, Y, kernel=kernel1)
|
||||||
|
# optimize
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.optimize()
|
||||||
|
# plot
|
||||||
|
plt.subplot(211)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.title('Gaussian clean')
|
||||||
|
print m
|
||||||
|
|
||||||
|
#Corrupt
|
||||||
|
print "Corrupt Gaussian"
|
||||||
|
m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
#m.optimize()
|
||||||
|
plt.subplot(212)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.title('Gaussian corrupt')
|
||||||
|
print m
|
||||||
|
|
||||||
|
plt.figure(2)
|
||||||
|
plt.suptitle('Student-t likelihood')
|
||||||
|
edited_real_sd = real_std #initial_var_guess
|
||||||
|
|
||||||
|
print "Clean student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.constrain_positive('t_noise')
|
||||||
|
m.randomize()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
m.optimize()
|
||||||
|
print(m)
|
||||||
|
plt.subplot(222)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.ylim(-2.5, 2.5)
|
||||||
|
plt.title('Student-t rasm clean')
|
||||||
|
|
||||||
|
print "Corrupt student t, rasm"
|
||||||
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.constrain_positive('t_noise')
|
||||||
|
m.randomize()
|
||||||
|
m.update_likelihood_approximation()
|
||||||
|
m.optimize()
|
||||||
|
print(m)
|
||||||
|
plt.subplot(224)
|
||||||
|
m.plot()
|
||||||
|
plt.plot(X_full, Y_full)
|
||||||
|
plt.ylim(-2.5, 2.5)
|
||||||
|
plt.title('Student-t rasm corrupt')
|
||||||
|
|
||||||
|
return m
|
||||||
|
|
||||||
|
#print "Clean student t, ncg"
|
||||||
|
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
|
||||||
|
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.update_likelihood_approximation()
|
||||||
|
#m.optimize()
|
||||||
|
#print(m)
|
||||||
|
#plt.subplot(221)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#plt.ylim(-2.5, 2.5)
|
||||||
|
#plt.title('Student-t ncg clean')
|
||||||
|
|
||||||
|
#print "Corrupt student t, ncg"
|
||||||
|
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
|
||||||
|
#corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
|
||||||
|
#m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
|
||||||
|
#m.ensure_default_constraints()
|
||||||
|
#m.update_likelihood_approximation()
|
||||||
|
#m.optimize()
|
||||||
|
#print(m)
|
||||||
|
#plt.subplot(223)
|
||||||
|
#m.plot()
|
||||||
|
#plt.plot(X_full, Y_full)
|
||||||
|
#plt.ylim(-2.5, 2.5)
|
||||||
|
#plt.title('Student-t ncg corrupt')
|
||||||
|
|
||||||
|
|
||||||
|
###with a student t distribution, since it has heavy tails it should work well
|
||||||
|
###likelihood_function = student_t(deg_free, sigma2=real_var)
|
||||||
|
###lap = Laplace(Y, likelihood_function)
|
||||||
|
###cov = kernel.K(X)
|
||||||
|
###lap.fit_full(cov)
|
||||||
|
|
||||||
|
###test_range = np.arange(0, 10, 0.1)
|
||||||
|
###plt.plot(test_range, t_rv.pdf(test_range))
|
||||||
|
###for i in xrange(X.shape[0]):
|
||||||
|
###mode = lap.f_hat[i]
|
||||||
|
###covariance = lap.hess_hat_i[i,i]
|
||||||
|
###scaling = np.exp(lap.ln_z_hat)
|
||||||
|
###normalised_approx = norm(loc=mode, scale=covariance)
|
||||||
|
###print "Normal with mode %f, and variance %f" % (mode, covariance)
|
||||||
|
###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
|
||||||
|
###plt.show()
|
||||||
|
|
||||||
|
return m
|
||||||
|
|
||||||
|
|
||||||
|
def noisy_laplace_approx():
|
||||||
|
"""
|
||||||
|
Example of regressing with a student t likelihood
|
||||||
|
"""
|
||||||
|
#Start a function, any function
|
||||||
|
X = np.sort(np.random.uniform(0, 15, 70))[:, None]
|
||||||
|
Y = np.sin(X)
|
||||||
|
|
||||||
|
#Add some extreme value noise to some of the datapoints
|
||||||
|
percent_corrupted = 0.05
|
||||||
|
corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
|
||||||
|
indices = np.arange(Y.shape[0])
|
||||||
|
np.random.shuffle(indices)
|
||||||
|
corrupted_indices = indices[:corrupted_datums]
|
||||||
|
print corrupted_indices
|
||||||
|
noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
|
||||||
|
Y[corrupted_indices] += noise
|
||||||
|
|
||||||
|
#A GP should completely break down due to the points as they get a lot of weight
|
||||||
|
# create simple GP model
|
||||||
|
m = GPy.models.GP_regression(X, Y)
|
||||||
|
|
||||||
|
# optimize
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
m.optimize()
|
||||||
|
# plot
|
||||||
|
m.plot()
|
||||||
|
print m
|
||||||
|
|
||||||
|
#with a student t distribution, since it has heavy tails it should work well
|
||||||
|
|
||||||
|
def gaussian_f_check():
|
||||||
|
plt.close('all')
|
||||||
|
X = np.linspace(0, 1, 50)[:, None]
|
||||||
|
real_std = 0.2
|
||||||
|
noise = np.random.randn(*X.shape)*real_std
|
||||||
|
Y = np.sin(X*2*np.pi) + noise
|
||||||
|
|
||||||
|
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||||
|
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||||
|
mgp.ensure_default_constraints()
|
||||||
|
mgp.randomize()
|
||||||
|
mgp.optimize()
|
||||||
|
print "Gaussian"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
kernelg = kernelgp.copy()
|
||||||
|
#kernelst += GPy.kern.bias(X.shape[1])
|
||||||
|
N, D = X.shape
|
||||||
|
g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D)
|
||||||
|
g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
|
||||||
|
m = GPy.models.GP(X, g_likelihood, kernelg)
|
||||||
|
#m['rbf_v'] = mgp._get_params()[0]
|
||||||
|
#m['rbf_l'] = mgp._get_params()[1] + 1
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
|
||||||
|
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
|
||||||
|
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
|
||||||
|
#m.constrain_positive('bias')
|
||||||
|
m.constrain_positive('noise_var')
|
||||||
|
m.randomize()
|
||||||
|
m['noise_variance'] = 0.1
|
||||||
|
m.likelihood.X = X
|
||||||
|
plt.figure()
|
||||||
|
plt.subplot(211)
|
||||||
|
m.plot()
|
||||||
|
plt.subplot(212)
|
||||||
|
m.optimize()
|
||||||
|
m.plot()
|
||||||
|
print "final optimised gaussian"
|
||||||
|
print m
|
||||||
|
print "real GP"
|
||||||
|
print mgp
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
453
GPy/likelihoods/Laplace.py
Normal file
453
GPy/likelihoods/Laplace.py
Normal file
|
|
@ -0,0 +1,453 @@
|
||||||
|
import numpy as np
|
||||||
|
import scipy as sp
|
||||||
|
import GPy
|
||||||
|
from scipy.linalg import inv, cho_solve, det
|
||||||
|
from numpy.linalg import cond
|
||||||
|
from likelihood import likelihood
|
||||||
|
from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
|
||||||
|
from scipy.linalg.lapack import dtrtrs
|
||||||
|
import random
|
||||||
|
#import pylab as plt
|
||||||
|
|
||||||
|
class Laplace(likelihood):
|
||||||
|
"""Laplace approximation to a posterior"""
|
||||||
|
|
||||||
|
def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
    """
    Laplace approximation to a posterior with a non-Gaussian likelihood.

    The fitting procedure (done elsewhere) finds the mode \hat{f} and the
    Hessian at the mode (via Newton-Raphson), computes the z' that turns
    the unnormalised Gaussian into a normalised one, and finally derives
    pseudo GP observations Y~ and z~ whose Gaussian posterior matches the
    Laplace approximation.

    :param data: array of data the likelihood function is approximating
    :param likelihood_function: likelihood function - subclass of likelihood_function
    :param extra_data: additional data used by some likelihood functions,
                       for example survival likelihoods need censoring data
    :param opt: optimiser to use; 'rasm' is numerically stable, 'ncg' or
                'nelder-mead' also available (latter only work with 1d data)
    """
    self.data = data
    self.likelihood_function = likelihood_function
    self.extra_data = extra_data
    self.opt = opt

    # Shape bookkeeping and flags
    self.N, self.D = self.data.shape
    self.is_heteroscedastic = True
    self.Nparams = 0

    # Precomputed (N/2) * log(2*pi), reused when normalising
    self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))

    # Placeholder GP pseudo-observations until the approximation is fitted
    self.Y = np.zeros((self.N, 1))
    self.covariance_matrix = np.eye(self.N)
    self.precision = np.ones(self.N)[:, None]
    self.Z = 0
    self.YYT = None

    # Cached Newton starting point from a previous fit (None = cold start)
    self.old_a = None
|
def predictive_values(self, mu, var, full_cov):
    """Push GP predictive moments through the likelihood function.

    Full predictive covariances are not supported by this approximation,
    so `full_cov=True` is rejected.
    """
    if not full_cov:
        return self.likelihood_function.predictive_values(mu, var)
    raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
|
def _get_params(self):
    """Return the wrapped likelihood function's parameters as a numpy array."""
    params = self.likelihood_function._get_params()
    return np.asarray(params)
|
def _get_param_names(self):
    """Parameter names, delegated to the wrapped likelihood function."""
    return self.likelihood_function._get_param_names()
|
def _set_params(self, p):
    """Forward new parameter values ``p`` to the wrapped likelihood function."""
    return self.likelihood_function._set_params(p)
|
def _shared_gradients_components(self):
    """Terms shared by the kernel- and likelihood-parameter gradients.

    Returns ``(dL_dfhat, I_KW_i)`` where ``dL_dfhat`` is the derivative of
    the approximate marginal w.r.t. the mode ``f_hat`` and ``I_KW_i`` is
    ``I - K.dot(Wi_K_i)``.

    FIXME: Careful of side effects! And make sure W and K are up to date!
    """
    third_deriv = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
    dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*third_deriv).T
    I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
    return dL_dfhat, I_KW_i
|
def _Kgradients(self, dK_dthetaK, X):
    """
    Gradients with respect to the prior kernel parameters, combining the
    explicit term with the implicit term that flows through the location
    of the mode f_hat.
    """
    dL_dfhat, I_KW_i = self._shared_gradients_components()
    grad_lik = self.likelihood_function.dlik_df(self.data, self.f_hat)

    # Implicit dependence: the mode f_hat shifts when K changes
    implicit = mdot(grad_lik, dL_dfhat, I_KW_i)
    # Explicit dependence of the objective on K
    # NOTE(review): original carried a "might need to be -?" doubt on this sign
    explicit = 0.5*mdot(self.Ki_f, self.Ki_f.T) + 0.5*self.Wi_K_i

    dL_dthetaK_exp = dK_dthetaK(explicit, X)
    dL_dthetaK_imp = dK_dthetaK(implicit, X)
    return dL_dthetaK_exp + dL_dthetaK_imp
|
def _gradients(self, partial):
    """
    Gradients with respect to the likelihood parameters.

    Returns an array with one entry per optimised likelihood parameter
    (e.g. shape (1,) for a student-t optimising only its noise scale),
    each entry the sum of an explicit term and an implicit term that
    accounts for the shift of the mode f_hat.
    """
    dL_dfhat, I_KW_i = self._shared_gradients_components()
    lik_grads = self.likelihood_function._gradients(self.data, self.f_hat)
    dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = lik_grads

    n_params = len(dlik_dthetaL)
    dL_dthetaL = np.zeros(n_params)  # one slot per likelihood parameter
    for i in range(n_params):
        # Explicit: direct dependence of the objective on thetaL
        explicit = np.sum(dlik_dthetaL[i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[i])

        # Implicit: dependence through the resulting shift of f_hat
        df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[i])
        implicit = np.dot(dL_dfhat, df_hat_dthetaL)

        dL_dthetaL[i] = explicit + implicit

    return dL_dthetaL
|
def _compute_GP_variables(self):
    """
    Generates data Y which would give the normal distribution identical
    to the Laplace approximation.

    GPy expects a likelihood to be Gaussian, so we need to calculate the
    points Y_tilde and a constant Z_tilde that make the Gaussian posterior
    match the one found by a Laplace approximation to the non-Gaussian
    likelihood.

    Given we are approximating $p(y|f)p(f)$ with a normal distribution
    (given $p(y|f)$ is not normal) we have a rescaled normal distribution
    z*N(f|f_hat, hess_hat^-1) with the same area as p(y|f)p(f) due to the
    z rescaling.

    This function finds the data D=(Y_tilde, X) that would produce
    z*N(f|f_hat, hess_hat^-1), giving a normal approximation of
    z_tilde*p(Y_tilde|f,X)p(f), where

    $$\tilde{Y} = \tilde{\Sigma} H \hat{f}$$
    $$\tilde{\Sigma}^{-1} = H - K^{-1} = diag(\nabla\nabla \log p(y|f)) = W$$
    $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$

    so $\tilde{\Sigma} = W^{-1}$.

    Side effects: sets self.Sigma_tilde, self.Wi_K_i, self.ln_det_K_Wi__Bi,
    self.lik, self.Z, self.Y, self.YYT, self.covariance_matrix and
    self.precision, which the surrounding GP machinery reads.
    """
    # W is diagonal (stored as a column), so its inverse is elementwise.
    Wi = 1.0/self.W
    self.Sigma_tilde = np.diagflat(Wi)

    # Pseudo-targets that reproduce the Laplace posterior under a
    # Gaussian-likelihood GP: Y_tilde = W^-1 K^-1 f_hat + f_hat.
    Y_tilde = Wi*self.Ki_f + self.f_hat

    # (W^-1 + K)^-1 via the B matrix: W^0.5 B^-1 W^0.5.
    self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R

    # log|K + W^-1| assembled from the cached log|I + W^0.5 K W^0.5|.
    self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
    self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)

    self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
    # aA/bB/cC are kept only for the debug print below — NOTE(review):
    # candidates for removal once the approximation is trusted.
    self.aA = 0.5*self.ln_det_K_Wi__Bi
    self.bB = - 0.5*self.f_Ki_f
    self.cC = 0.5*self.y_Wi_Ki_i_y
    # Normalisation constant of the matched Gaussian pseudo-likelihood.
    Z_tilde = (+ self.lik
               + 0.5*self.ln_det_K_Wi__Bi
               - 0.5*self.f_Ki_f
               + 0.5*self.y_Wi_Ki_i_y
               )
    print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
    print self.likelihood_function._get_params()

    #Convert to float as its (1, 1) and Z must be a scalar
    self.Z = np.float64(Z_tilde)
    self.Y = Y_tilde
    self.YYT = np.dot(self.Y, self.Y.T)
    self.covariance_matrix = self.Sigma_tilde
    self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
|
||||||
|
def fit_full(self, K):
    """
    Run the Laplace approximation: find the posterior mode under
    covariance K, then expand the Hessian around that mode.

    For nomenclature see Rasmussen & Williams 2006 - modified for
    numerical stability.

    :param K: covariance matrix evaluated at the training inputs
    """
    self.K = K.copy()

    # Dispatch to the mode-finding routine selected at construction time.
    mode_finders = {'rasm': self.rasm_mode,
                    'ncg': self.ncg_mode,
                    'nelder': self.nelder_mode}
    find_mode = mode_finders[self.opt]
    self.f_hat = find_mode(self.K)

    # With the mode located, compute the Hessian and derived quantities.
    self._compute_likelihood_variables()
|
||||||
|
def _compute_likelihood_variables(self):
    """
    Compute the Hessian W at the mode and the matrices derived from it
    (B, its Cholesky/inverse, (K^-1 + W)^-1, and log|I + KW|), then hand
    off to _compute_GP_variables to build the matched Gaussian pseudo-data.

    NOTE(review): relies on self.a having been set by the mode finder
    (rasm_mode does this; see self.Ki_f below) — confirm the other mode
    finders establish the same invariant before using them.
    """
    # At this point get the hessian matrix (or vector as W is diagonal)
    self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)

    if not self.likelihood_function.log_concave:
        self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                  # if the likelihood is non-log-concave. We want to say that there is a negative variance
                                  # to cause the posterior to become less certain than the prior and likelihood.
                                  # This is a property only held by non-log-concave likelihoods.

    # TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
    self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
    self.Bi, _, _, B_det = pdinv(self.B)  # B_det unused here; kept for tuple unpacking

    # K^-1 f_hat, reusing the 'a' vector computed by the mode finder
    # instead of inverting K again.
    self.Ki_f = self.a

    self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
    # (K^-1 + W)^-1 via the matrix inversion lemma: K - K (W^0.5 B^-1 W^0.5) K
    self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)

    # For det, |I + KW| == |I + W_12*K*W_12| (the symmetrised, stable form)
    self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)

    return self._compute_GP_variables()
|
||||||
|
def _compute_B_statistics(self, K, W):
    """
    Build Rasmussen & Williams' numerically stable positive definite
    matrix B = I + W^{1/2} K W^{1/2} and its (jittered) Cholesky factor.

    W is diagonal, so its square root is taken elementwise.

    :param K: covariance matrix
    :param W: negative Hessian of the log-likelihood at a point (diagonal)
    :returns: tuple (B, L, W_12) with L = chol(B) and W_12 = sqrt(W)
    """
    root_W = np.sqrt(W)  # elementwise: W is diagonal
    B = np.eye(self.N) + root_W * K * root_W.T
    chol_B = jitchol(B)
    return (B, chol_B, root_W)
|
||||||
|
def nelder_mode(self, K):
    """
    Find the posterior mode with a derivative-free Nelder-Mead simplex
    search on the negative unnormalised log posterior. Numerically naive:
    inverts K directly.

    :param K: covariance matrix
    :returns: f at the mode, shape (N, 1)
    """
    f = np.zeros((self.N, 1))
    self.Ki, _, _, self.ln_K_det = pdinv(K)
    def obj(f):
        # Negative log posterior (up to constants).
        # NOTE(review): uses self.data[:, 0] while rasm_mode passes
        # self.data — appears to assume 1D outputs; confirm.
        res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
        return float(res)

    res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
    f_new = res.x
    # minimize returns a flat vector; restore the (N, 1) column shape.
    return f_new[:, None]
|
||||||
|
def ncg_mode(self, K):
    """
    Find the mode using a Newton-CG optimizer and explicit inversion of K
    (numerically unstable but intuitive).

    :param K: covariance matrix
    :returns: f_mode, shape (N, 1)
    """
    self.Ki, _, _, self.ln_K_det = pdinv(K)

    f = np.zeros((self.N, 1))

    #FIXME: Can we get rid of this horrible reshaping?
    #ONLY WORKS FOR 1D DATA
    def obj(f):
        # Negative log posterior (shifted by NORMAL_CONST).
        res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                    - self.NORMAL_CONST)
        return float(res)

    def obj_grad(f):
        # Negative gradient of the log posterior.
        res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
        return np.squeeze(res)

    def obj_hess(f):
        # Negative Hessian: diagonal likelihood curvature minus K^-1.
        res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
        return np.squeeze(res)

    f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
    # fmin_ncg returns a flat vector; restore the (N, 1) column shape.
    return f_hat[:, None]
|
||||||
|
def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
|
||||||
|
"""
|
||||||
|
Rasmussens numerically stable mode finding
|
||||||
|
For nomenclature see Rasmussen & Williams 2006
|
||||||
|
|
||||||
|
:K: Covariance matrix
|
||||||
|
:MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
|
||||||
|
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
|
||||||
|
:returns: f_mode
|
||||||
|
"""
|
||||||
|
self.old_before_s = self.likelihood_function._get_params()
|
||||||
|
print "before: ", self.old_before_s
|
||||||
|
#if self.old_before_s < 1e-5:
|
||||||
|
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
#old_a = np.zeros((self.N, 1))
|
||||||
|
if self.old_a is None:
|
||||||
|
old_a = np.zeros((self.N, 1))
|
||||||
|
f = np.dot(K, old_a)
|
||||||
|
else:
|
||||||
|
old_a = self.old_a.copy()
|
||||||
|
f = self.f_hat.copy()
|
||||||
|
|
||||||
|
new_obj = -np.inf
|
||||||
|
old_obj = np.inf
|
||||||
|
|
||||||
|
def obj(a, f):
|
||||||
|
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
|
||||||
|
|
||||||
|
difference = np.inf
|
||||||
|
epsilon = 1e-4
|
||||||
|
step_size = 1
|
||||||
|
rs = 0
|
||||||
|
i = 0
|
||||||
|
|
||||||
|
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
|
||||||
|
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
||||||
|
#W = np.maximum(W, 0)
|
||||||
|
if not self.likelihood_function.log_concave:
|
||||||
|
W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||||
|
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
||||||
|
# To cause the posterior to become less certain than the prior and likelihood,
|
||||||
|
# This is a property only held by non-log-concave likelihoods
|
||||||
|
B, L, W_12 = self._compute_B_statistics(K, W.copy())
|
||||||
|
|
||||||
|
W_f = W*f
|
||||||
|
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
|
||||||
|
|
||||||
|
b = W_f + grad
|
||||||
|
solve_L = cho_solve((L, True), W_12*np.dot(K, b))
|
||||||
|
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
|
||||||
|
full_step_a = b - W_12*solve_L
|
||||||
|
da = full_step_a - old_a
|
||||||
|
|
||||||
|
#f_old = f.copy()
|
||||||
|
#def inner_obj(step_size, old_a, da, K):
|
||||||
|
#a = old_a + step_size*da
|
||||||
|
#f = np.dot(K, a)
|
||||||
|
#self.a = a.copy() # This is nasty, need to set something within an optimization though
|
||||||
|
#self.f = f.copy()
|
||||||
|
#return -obj(a, f)
|
||||||
|
|
||||||
|
#from functools import partial
|
||||||
|
#i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
|
||||||
|
##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
|
||||||
|
#new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
|
||||||
|
#f = self.f.copy()
|
||||||
|
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
|
||||||
|
f_old = f.copy()
|
||||||
|
update_passed = False
|
||||||
|
while not update_passed:
|
||||||
|
a = old_a + step_size*da
|
||||||
|
f = np.dot(K, a)
|
||||||
|
|
||||||
|
old_obj = new_obj
|
||||||
|
new_obj = obj(a, f)
|
||||||
|
difference = new_obj - old_obj
|
||||||
|
print "difference: ",difference
|
||||||
|
if difference < 0:
|
||||||
|
#print "Objective function rose", np.float(difference)
|
||||||
|
#If the objective function isn't rising, restart optimization
|
||||||
|
step_size *= 0.8
|
||||||
|
#print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
|
||||||
|
#objective function isn't increasing, try reducing step size
|
||||||
|
f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
|
||||||
|
old_obj = new_obj
|
||||||
|
rs += 1
|
||||||
|
else:
|
||||||
|
update_passed = True
|
||||||
|
|
||||||
|
#difference = abs(new_obj - old_obj)
|
||||||
|
#old_obj = new_obj.copy()
|
||||||
|
#difference = np.abs(np.sum(f - f_old))
|
||||||
|
difference = np.abs(np.sum(a - old_a))
|
||||||
|
#old_a = self.a.copy() #a
|
||||||
|
old_a = a.copy()
|
||||||
|
i += 1
|
||||||
|
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
||||||
|
|
||||||
|
self.old_a = old_a.copy()
|
||||||
|
#print "Positive difference obj: ", np.float(difference)
|
||||||
|
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
|
||||||
|
print "Iterations: {}, Final_difference: {}".format(i, difference)
|
||||||
|
if difference > 1e-4:
|
||||||
|
print "FAIL FAIL FAIL FAIL FAIL FAIL"
|
||||||
|
if False:
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
if hasattr(self, 'X'):
|
||||||
|
import pylab as pb
|
||||||
|
pb.figure()
|
||||||
|
pb.subplot(311)
|
||||||
|
pb.title('old f_hat')
|
||||||
|
pb.plot(self.X, self.f_hat)
|
||||||
|
pb.subplot(312)
|
||||||
|
pb.title('old ff')
|
||||||
|
pb.plot(self.X, self.old_ff)
|
||||||
|
pb.subplot(313)
|
||||||
|
pb.title('new f_hat')
|
||||||
|
pb.plot(self.X, f)
|
||||||
|
|
||||||
|
pb.figure()
|
||||||
|
pb.subplot(121)
|
||||||
|
pb.title('old K')
|
||||||
|
pb.imshow(np.diagflat(self.old_K), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
pb.subplot(122)
|
||||||
|
pb.title('new K')
|
||||||
|
pb.imshow(np.diagflat(K), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
|
||||||
|
pb.figure()
|
||||||
|
pb.subplot(121)
|
||||||
|
pb.title('old W')
|
||||||
|
pb.imshow(np.diagflat(self.old_W), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
pb.subplot(122)
|
||||||
|
pb.title('new W')
|
||||||
|
pb.imshow(np.diagflat(W), interpolation='none')
|
||||||
|
pb.colorbar()
|
||||||
|
|
||||||
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||||
|
pb.close('all')
|
||||||
|
|
||||||
|
#FIXME: DELETE THESE
|
||||||
|
self.old_W = W.copy()
|
||||||
|
self.old_grad = grad.copy()
|
||||||
|
self.old_B = B.copy()
|
||||||
|
self.old_W_12 = W_12.copy()
|
||||||
|
self.old_ff = f.copy()
|
||||||
|
self.old_K = self.K.copy()
|
||||||
|
self.old_s = self.likelihood_function._get_params()
|
||||||
|
print "after: ", self.old_s
|
||||||
|
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
||||||
|
self.a = a
|
||||||
|
#self.B, self.B_chol, self.W_12 = B, L, W_12
|
||||||
|
#self.Bi, _, _, B_det = pdinv(self.B)
|
||||||
|
return f
|
||||||
319
GPy/models/GP.py
Normal file
319
GPy/models/GP.py
Normal file
|
|
@ -0,0 +1,319 @@
|
||||||
|
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from scipy import linalg
|
||||||
|
import pylab as pb
|
||||||
|
from .. import kern
|
||||||
|
from ..core import model
|
||||||
|
from ..util.linalg import pdinv, mdot, tdot
|
||||||
|
from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
|
||||||
|
from ..likelihoods import EP, Laplace
|
||||||
|
|
||||||
|
class GP(model):
    """
    Gaussian Process model for regression and EP.

    :param X: input observations
    :param likelihood: a GPy likelihood (Gaussian, EP or Laplace)
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :rtype: model object

    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        self.has_uncertain_inputs = False

        # parse arguments
        self.X = X
        assert len(self.X.shape) == 2
        self.N, self.Q = self.X.shape
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
        self.likelihood = likelihood
        # inputs and targets must describe the same number of data points
        assert self.X.shape[0] == self.likelihood.data.shape[0]
        self.N, self.D = self.likelihood.data.shape

        # here's some simple normalization for the inputs
        if normalize_X:
            self._Xmean = X.mean(0)[None, :]
            self._Xstd = X.std(0)[None, :]
            self.X = (X.copy() - self._Xmean) / self._Xstd
            if hasattr(self, 'Z'):
                self.Z = (self.Z - self._Xmean) / self._Xstd
        else:
            # identity normalization so later code can always denormalize
            self._Xmean = np.zeros((1, self.X.shape[1]))
            self._Xstd = np.ones((1, self.X.shape[1]))

        if not hasattr(self, 'has_uncertain_inputs'):
            self.has_uncertain_inputs = False
        model.__init__(self)

    def dL_dZ(self):
        """
        TODO: one day we might like to learn Z by gradient methods?
        """
        # FIXME: this doesn't live here.
        return np.zeros_like(self.Z)

    def _set_params(self, p):
        """
        Set kernel and likelihood parameters from the flat (transformed)
        vector p, then recompute K, its inverse/Cholesky and dL_dK.
        For a Laplace likelihood the approximation is re-fitted first.
        """
        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
        # self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
        self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas

        if isinstance(self.likelihood, Laplace):
            # Re-fit the Laplace approximation under the new kernel, then
            # push the (unchanged) likelihood params so its caches refresh.
            self.likelihood.fit_full(self.kern.K(self.X))
            self.likelihood._set_params(self.likelihood._get_params())

        self.K = self.kern.K(self.X)
        # Effective covariance: kernel plus the likelihood's (pseudo-)noise.
        self.K += self.likelihood.covariance_matrix

        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)

        # the gradient of the likelihood wrt the covariance matrix
        if self.likelihood.YYT is None:
            # alpha = np.dot(self.Ki, self.likelihood.Y), via Cholesky solve
            alpha, _ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y, lower=1)
            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
        else:
            # tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki), via two solves
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)

    def _get_params(self):
        # Flat parameter vector: transformed kernel params then likelihood params.
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))

    def _get_param_names(self):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()

    def _update_params_callback(self, p):
        """
        Optimizer callback: install the (transformed) parameter vector p
        and refresh the likelihood approximation.
        """
        # parameters will be in transformed space
        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
        # set_params_transformed for likelihood doesn't exist?
        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
        # update the likelihood approximation within the optimisation with the current parameters
        self.update_likelihood_approximation()

    def update_likelihood_approximation(self):
        """
        Re-fit the likelihood approximation (EP or Laplace) under the
        current kernel, then push parameters through _set_params to
        refresh K, its factorisations and dL_dK.

        For a Gaussian likelihood no iteration is required: fit_full is
        effectively a no-op.
        """
        self.likelihood.fit_full(self.kern.K(self.X))
        self._set_params(self._get_params()) # update the GP

    def _model_fit_term(self):
        """
        Computes the model fit term -0.5 * tr(K^-1 YY^T), using YYT when
        it's available and a triangular solve on Y otherwise.
        """
        if self.likelihood.YYT is None:
            # equivalent to -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
            tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
            return -0.5 * np.sum(np.square(tmp))
        else:
            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))

    def log_likelihood(self):
        """
        The log marginal likelihood of the GP.

        For an EP model, can be written as the log likelihood of a
        regression model for a new variable Y* = v_tilde/tau_tilde, with a
        covariance matrix K* = K + diag(1./tau_tilde) plus a normalization
        term (self.likelihood.Z).
        """
        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
        # NOTE(review): leftover debug print — candidate for removal.
        print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
        return l

    def _log_likelihood_gradients(self):
        """
        The gradient of all parameters.

        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta.
        For a Laplace likelihood the kernel gradients come from the
        likelihood's own _Kgradients (explicit + implicit terms).
        """
        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
        # NOTE(review): leftover debug print — candidate for removal.
        print "dL_dthetaK should be: ", dL_dthetaK
        if isinstance(self.likelihood, Laplace):
            dK_dthetaK = self.kern.dK_dtheta
            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
        else:
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

        return np.hstack((dL_dthetaK, dL_dthetaL))

    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
        """
        Internal helper function for making predictions, does not account
        for normalization or likelihood.

        :param _Xnew: test inputs (already normalized)
        :param full_cov: return the full posterior covariance instead of
            only its diagonal
        :param stop: when True hits a deliberate NameError below (debug trap)
        """
        Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
        # KiKx = np.dot(self.Ki, Kx), via Cholesky solve
        KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
        mu = np.dot(KiKx.T, self.likelihood.Y)
        if full_cov:
            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
            var = Kxx - np.dot(KiKx.T, Kx)
        else:
            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
            var = var[:, None]
        if stop:
            # NOTE(review): undefined name — raises NameError when
            # stop=True; looks like an intentional debug trap. Confirm.
            debug_this
        return mu, var

    def predict(self, Xnew, which_parts='all', full_cov=False):
        """
        Predict the function(s) at the new point(s) Xnew.

        Arguments
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.Q
        :param which_parts: specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean, a Numpy array, Nnew x self.D
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.D

        If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D.
        If self.D == 1, the return shape is Nnew x Nnew.
        This is to allow for different normalizations of the output dimensions.
        """
        # normalize X values
        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
        mu, var = self._raw_predict(Xnew, which_parts, full_cov)

        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)

        return mean, var, _025pm, _975pm

    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
        """
        Plot the GP's view of the world, where the data is normalized and
        the likelihood is Gaussian.

        :param samples: the number of a posteriori samples to plot
        :param which_data: which if the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D

        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimensions, a contour-plot shows the mean predicted function
          - In higher dimensions, we've not implemented this yet !TODO!

        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions.
        """
        if which_data == 'all':
            which_data = slice(None)

        if self.X.shape[1] == 1:
            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict(Xnew, which_parts=which_parts)
                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            else:
                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
                for i in range(samples):
                    pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            pb.xlim(xmin, xmax)
            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.ylim(ymin, ymax)
            if hasattr(self, 'Z'):
                pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)

        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            # NOTE(review): Xorig and Yorig are undefined in this scope —
            # this branch raises NameError as written; confirm intent.
            pb.scatter(Xorig[:, 0], Xorig[:, 1], 40, Yorig, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
        """
        Plot the posterior of the GP in the original (denormalized) data
        space, pushing predictions through the likelihood.

        :param levels: for 2D plotting, the number of contour levels to use
        (other parameters as for plot_f)
        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)

        if self.X.shape[1] == 1:

            Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now

            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            gpplot(Xnew, m, lower, upper)
            pb.plot(Xu[which_data], self.likelihood.data[which_data], 'kx', mew=1.5)
            if self.has_uncertain_inputs:
                pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)

            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.xlim(xmin, xmax)
            pb.ylim(ymin, ymax)
            if hasattr(self, 'Z'):
                Zu = self.Z * self._Xstd + self._Xmean
                pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
                # pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))

        elif self.X.shape[1] == 2: # FIXME
            resolution = resolution or 50
            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            Yf = self.likelihood.Y.flatten()
            pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
            if hasattr(self, 'Z'):
                pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')

        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
Loading…
Add table
Add a link
Reference in a new issue