Checked out relevant files

Alan Saul 2013-09-09 13:41:58 +01:00
parent 29358519d9
commit f641ab54a8
3 changed files with 1411 additions and 0 deletions


@@ -0,0 +1,639 @@
import GPy
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(1)
def timing():
real_var = 0.1
times = 1
deg_free = 10
real_sd = np.sqrt(real_var)
the_is = np.zeros(times)
X = np.linspace(0.0, 10.0, 300)[:, None]
for a in xrange(times):
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
Yc = Y.copy()
Yc[10] += 100
Yc[25] += 10
Yc[23] += 10
Yc[24] += 10
Yc[250] += 10
#Yc[4] += 10000
edited_real_sd = real_sd
kernel1 = GPy.kern.rbf(X.shape[1])
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
the_is[a] = m.likelihood.i
print the_is
print np.mean(the_is)
def v_fail_test():
#plt.close('all')
real_var = 0.1
X = np.linspace(0.0, 10.0, 50)[:, None]
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
Y = Y/Y.max()
#Add student t random noise to datapoints
deg_free = 10
real_sd = np.sqrt(real_var)
print "Real noise std: ", real_sd
kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1])
edited_real_sd = 0.3#real_sd
edited_real_sd = real_sd
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernel1)
m.constrain_positive('')
vs = 25
noises = 30
checkgrads = np.zeros((vs, noises))
vs_noises = np.zeros((vs, noises))
for v_ind, v in enumerate(np.linspace(1, 100, vs)):
m.likelihood.likelihood_function.v = v
print v
for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)):
m['t_noise'] = noise
m.update_likelihood_approximation()
checkgrads[v_ind, noise_ind] = m.checkgrad()
vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
plt.figure()
plt.title('Checkgrads')
plt.imshow(checkgrads, interpolation='nearest')
plt.xlabel('noise')
plt.ylabel('v')
#plt.figure()
#plt.title('variance change')
#plt.imshow(vs_noises, interpolation='nearest')
#plt.xlabel('noise')
#plt.ylabel('v')
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
print(m)
def student_t_obj_plane():
plt.close('all')
X = np.linspace(0, 1, 50)[:, None]
real_std = 0.002
noise = np.random.randn(*X.shape)*real_std
Y = np.sin(X*2*np.pi) + noise
deg_free = 1000
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
mgp.ensure_default_constraints()
mgp['noise'] = real_std**2
print "Gaussian"
print mgp
kernelst = kernelgp.copy()
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
m.ensure_default_constraints()
m.constrain_fixed('t_no', real_std**2)
vs = 10
ls = 10
objs_t = np.zeros((vs, ls))
objs_g = np.zeros((vs, ls))
rbf_vs = np.linspace(1e-6, 8, vs)
rbf_ls = np.linspace(1e-2, 8, ls)
for v_id, rbf_v in enumerate(rbf_vs):
for l_id, rbf_l in enumerate(rbf_ls):
m['rbf_v'] = rbf_v
m['rbf_l'] = rbf_l
mgp['rbf_v'] = rbf_v
mgp['rbf_l'] = rbf_l
objs_t[v_id, l_id] = m.log_likelihood()
objs_g[v_id, l_id] = mgp.log_likelihood()
plt.figure()
plt.subplot(211)
plt.title('Student t')
plt.imshow(objs_t, interpolation='none')
plt.xlabel('variance')
plt.ylabel('lengthscale')
plt.subplot(212)
plt.title('Gaussian')
plt.imshow(objs_g, interpolation='none')
plt.xlabel('variance')
plt.ylabel('lengthscale')
plt.show()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
return objs_t
def student_t_f_check():
plt.close('all')
X = np.linspace(0, 1, 50)[:, None]
real_std = 0.2
noise = np.random.randn(*X.shape)*real_std
Y = np.sin(X*2*np.pi) + noise
deg_free = 1000
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
mgp.ensure_default_constraints()
mgp.randomize()
mgp.optimize()
print "Gaussian"
print mgp
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
kernelst = kernelgp.copy()
#kernelst += GPy.kern.bias(X.shape[1])
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
#m['rbf_v'] = mgp._get_params()[0]
#m['rbf_l'] = mgp._get_params()[1] + 1
m.ensure_default_constraints()
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
#m.constrain_positive('bias')
m.constrain_positive('t_no')
m.randomize()
m['t_no'] = 0.3
m.likelihood.X = X
#print m
plt.figure()
plt.subplot(211)
m.plot()
print "OPTIMIZED ONCE"
plt.subplot(212)
m.optimize()
m.plot()
print "final optimised student t"
print m
print "real GP"
print mgp
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
return m
def student_t_fix_optimise_check():
plt.close('all')
real_var = 0.1
real_std = np.sqrt(real_var)
X = np.random.rand(200)[:, None]
noise = np.random.randn(*X.shape)*real_std
Y = np.sin(X*2*np.pi) + noise
X_full = X
Y_full = np.sin(X_full)
Y = Y/Y.max()
Y_full = Y_full/Y_full.max()
deg_free = 1000
#GP
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
mgp.ensure_default_constraints()
mgp.randomize()
mgp.optimize()
kernelst = kernelgp.copy()
real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
plt.figure(1)
plt.suptitle('Student likelihood')
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
m.constrain_fixed('rbf_var', mgp._get_params()[0])
m.constrain_fixed('rbf_len', mgp._get_params()[1])
m.constrain_positive('t_noise')
#m.ensure_default_constraints()
m.update_likelihood_approximation()
print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
plt.subplot(231)
m.plot()
plt.title('Student t original data noise')
#Fix student t noise variance to same a GP
gp_noise = mgp._get_params()[2]
m['t_noise_std2'] = gp_noise
m.update_likelihood_approximation()
print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
plt.subplot(232)
m.plot()
plt.title('Student t GP noise')
#Fix student t noise to variance converted from the GP
real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free))
m['t_noise_std2'] = real_stu_t_std2gp
m.update_likelihood_approximation()
print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
plt.subplot(233)
m.plot()
plt.title('Student t GP noise converted')
m.constrain_positive('t_noise_std2')
m.randomize()
m.update_likelihood_approximation()
plt.subplot(234)
m.plot()
plt.title('Student t fixed rbf')
m.optimize()
print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
plt.subplot(235)
m.plot()
plt.title('Student t fixed rbf optimised')
plt.figure(2)
mrbf = m.copy()
mrbf.unconstrain('')
mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2)
gp_var = mgp._get_params()[0]
gp_len = mgp._get_params()[1]
mrbf.constrain_fixed('rbf_var', gp_var)
mrbf.constrain_positive('rbf_len')
mrbf.randomize()
print "Before optimize"
print mrbf
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
mrbf.checkgrad(verbose=1)
plt.subplot(121)
mrbf.plot()
plt.title('Student t fixed noise')
mrbf.optimize()
print "After optimize"
print mrbf
plt.subplot(122)
mrbf.plot()
plt.title('Student t fixed noise optimized')
print mrbf
plt.figure(3)
print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
plt.suptitle('Gaussian likelihood optimised')
mgp.plot()
print "Real std: {}".format(real_std)
print "Real variance {}".format(real_std**2)
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
print "Len should be: {}".format(gp_len)
return mrbf
def debug_student_t_noise_approx():
plot = False
real_var = 0.1
#Start a function, any function
#X = np.linspace(0.0, 10.0, 50)[:, None]
X = np.random.rand(100)[:, None]
#X = np.random.rand(100)[:, None]
#X = np.array([0.5, 1])[:, None]
Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
#Y = X + np.random.randn(*X.shape)*real_var
#ty = np.array([1., 9.97733584, 4.17841363])[:, None]
#Y = ty
X_full = X
Y_full = np.sin(X_full) + 1
Y = Y/Y.max()
#Add student t random noise to datapoints
deg_free = 100
real_sd = np.sqrt(real_var)
print "Real noise std: ", real_sd
initial_var_guess = 0.3
#t_rv = t(deg_free, loc=0, scale=real_var)
#noise = t_rvrvs(size=Y.shape)
#Y += noise
plt.close('all')
# Kernel object
kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
#kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
kernel2 = kernel1.copy()
kernel3 = kernel1.copy()
kernel4 = kernel1.copy()
kernel5 = kernel1.copy()
kernel6 = kernel1.copy()
print "Clean Gaussian"
#A GP should completely break down due to the points as they get a lot of weight
# create simple GP model
#m = GPy.models.GP_regression(X, Y, kernel=kernel1)
## optimize
#m.ensure_default_constraints()
#m.optimize()
## plot
#if plot:
#plt.figure(1)
#plt.suptitle('Gaussian likelihood')
#plt.subplot(131)
#m.plot()
#plt.plot(X_full, Y_full)
#print m
real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
#edited_real_sd = real_sd
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
#m['rbf_len'] = 1.5
#m.constrain_fixed('rbf_v', 1.0898)
#m.constrain_fixed('rbf_l', 0.2651)
#m.constrain_fixed('t_noise_std2', edited_real_sd)
#m.constrain_positive('rbf')
m.constrain_positive('t_noise_std2')
#m.constrain_positive('')
#m.constrain_bounded('t_noi', 0.001, 10)
#m.constrain_fixed('t_noi', real_stu_t_std)
#m.constrain_fixed('white', 0.01)
#m.constrain_fixed('t_no', 0.01)
#m['rbf_var'] = 0.20446332
#m['rbf_leng'] = 0.85776241
#m['t_noise'] = 0.667083294421005
m.ensure_default_constraints()
m.update_likelihood_approximation()
#m.optimize(messages=True)
print(m)
#return m
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
if plot:
plt.suptitle('Student-t likelihood')
plt.subplot(132)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Real noise std: ", real_sd
print "or Real noise std: ", real_stu_t_std
return m
#print "Clean student t, ncg"
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
#m.ensure_default_constraints()
#m.update_likelihood_approximation()
#m.optimize()
#print(m)
#if plot:
#plt.subplot(133)
#m.plot()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.show()
def student_t_approx():
"""
Example of regressing with a student t likelihood
"""
real_std = 0.1
#Start a function, any function
X = np.linspace(0.0, 10.0, 50)[:, None]
Y = np.sin(X) + np.random.randn(*X.shape)*real_std
Yc = Y.copy()
X_full = np.linspace(0.0, 10.0, 500)[:, None]
Y_full = np.sin(X_full)
Y = Y/Y.max()
Yc[10] += 100
Yc[25] += 10
Yc[23] += 10
Yc[26] += 1000
Yc[24] += 10
#Yc = Yc/Yc.max()
#Add student t random noise to datapoints
deg_free = 8
print "Real noise: ", real_std
initial_var_guess = 0.1
#t_rv = t(deg_free, loc=0, scale=real_var)
#noise = t_rvrvs(size=Y.shape)
#Y += noise
#Add some extreme value noise to some of the datapoints
#percent_corrupted = 0.15
#corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
#indices = np.arange(Y.shape[0])
#np.random.shuffle(indices)
#corrupted_indices = indices[:corrupted_datums]
#print corrupted_indices
#noise = t_rv.rvs(size=(len(corrupted_indices), 1))
#Y[corrupted_indices] += noise
plt.figure(1)
plt.suptitle('Gaussian likelihood')
# Kernel object
kernel1 = GPy.kern.rbf(X.shape[1])
kernel2 = kernel1.copy()
kernel3 = kernel1.copy()
kernel4 = kernel1.copy()
kernel5 = kernel1.copy()
kernel6 = kernel1.copy()
print "Clean Gaussian"
#A GP should completely break down due to the points as they get a lot of weight
# create simple GP model
m = GPy.models.GP_regression(X, Y, kernel=kernel1)
# optimize
m.ensure_default_constraints()
m.optimize()
# plot
plt.subplot(211)
m.plot()
plt.plot(X_full, Y_full)
plt.title('Gaussian clean')
print m
#Corrupt
print "Corrupt Gaussian"
m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
m.ensure_default_constraints()
#m.optimize()
plt.subplot(212)
m.plot()
plt.plot(X_full, Y_full)
plt.title('Gaussian corrupt')
print m
plt.figure(2)
plt.suptitle('Student-t likelihood')
edited_real_sd = real_std #initial_var_guess
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
m.ensure_default_constraints()
m.constrain_positive('t_noise')
m.randomize()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(222)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
plt.title('Student-t rasm clean')
print "Corrupt student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
m.ensure_default_constraints()
m.constrain_positive('t_noise')
m.randomize()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(224)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
plt.title('Student-t rasm corrupt')
return m
#print "Clean student t, ncg"
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
#m.ensure_default_constraints()
#m.update_likelihood_approximation()
#m.optimize()
#print(m)
#plt.subplot(221)
#m.plot()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.title('Student-t ncg clean')
#print "Corrupt student t, ncg"
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
#corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
#m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
#m.ensure_default_constraints()
#m.update_likelihood_approximation()
#m.optimize()
#print(m)
#plt.subplot(223)
#m.plot()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.title('Student-t ncg corrupt')
###with a student t distribution, since it has heavy tails it should work well
###likelihood_function = student_t(deg_free, sigma2=real_var)
###lap = Laplace(Y, likelihood_function)
###cov = kernel.K(X)
###lap.fit_full(cov)
###test_range = np.arange(0, 10, 0.1)
###plt.plot(test_range, t_rv.pdf(test_range))
###for i in xrange(X.shape[0]):
###mode = lap.f_hat[i]
###covariance = lap.hess_hat_i[i,i]
###scaling = np.exp(lap.ln_z_hat)
###normalised_approx = norm(loc=mode, scale=covariance)
###print "Normal with mode %f, and variance %f" % (mode, covariance)
###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
###plt.show()
return m
def noisy_laplace_approx():
"""
Example of regressing with a student t likelihood
"""
#Start a function, any function
X = np.sort(np.random.uniform(0, 15, 70))[:, None]
Y = np.sin(X)
#Add some extreme value noise to some of the datapoints
percent_corrupted = 0.05
corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
indices = np.arange(Y.shape[0])
np.random.shuffle(indices)
corrupted_indices = indices[:corrupted_datums]
print corrupted_indices
noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
Y[corrupted_indices] += noise
#A GP should completely break down due to the points as they get a lot of weight
# create simple GP model
m = GPy.models.GP_regression(X, Y)
# optimize
m.ensure_default_constraints()
m.optimize()
# plot
m.plot()
print m
#with a student t distribution, since it has heavy tails it should work well
def gaussian_f_check():
plt.close('all')
X = np.linspace(0, 1, 50)[:, None]
real_std = 0.2
noise = np.random.randn(*X.shape)*real_std
Y = np.sin(X*2*np.pi) + noise
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
mgp.ensure_default_constraints()
mgp.randomize()
mgp.optimize()
print "Gaussian"
print mgp
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
kernelg = kernelgp.copy()
#kernelst += GPy.kern.bias(X.shape[1])
N, D = X.shape
g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D)
g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
m = GPy.models.GP(X, g_likelihood, kernelg)
#m['rbf_v'] = mgp._get_params()[0]
#m['rbf_l'] = mgp._get_params()[1] + 1
m.ensure_default_constraints()
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
#m.constrain_positive('bias')
m.constrain_positive('noise_var')
m.randomize()
m['noise_variance'] = 0.1
m.likelihood.X = X
plt.figure()
plt.subplot(211)
m.plot()
plt.subplot(212)
m.optimize()
m.plot()
print "final optimised gaussian"
print m
print "real GP"
print mgp
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

GPy/likelihoods/Laplace.py (new file, 453 additions)

@@ -0,0 +1,453 @@
import numpy as np
import scipy as sp
import GPy
from scipy.linalg import inv, cho_solve, det
from numpy.linalg import cond
from likelihood import likelihood
from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
from scipy.linalg.lapack import dtrtrs
import random
#import pylab as plt
class Laplace(likelihood):
"""Laplace approximation to a posterior"""
def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
"""
Laplace approximation.
First find the mode \hat{f} and the Hessian at that point (using Newton-Raphson),
then find the z' which rescales the resulting un-normalised Gaussian into a normalised one.
Finally, compute the GP variables (i.e. generate some \tilde{Y} and \tilde{z})
that make a Gaussian likelihood give the same posterior as the Laplace approximation.
Arguments
---------
:data: array of data the likelihood function is approximating
:likelihood_function: likelihood function, a subclass of likelihood_function
:extra_data: additional data used by some likelihood functions, e.g. survival likelihoods need censoring data
:opt: optimiser to use; 'rasm' is numerically stable, 'ncg' and 'nelder' also work (the latter only with 1D data)
"""
self.data = data
self.likelihood_function = likelihood_function
self.extra_data = extra_data
self.opt = opt
#Initial values
self.N, self.D = self.data.shape
self.is_heteroscedastic = True
self.Nparams = 0
self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
#Initial values for the GP variables
self.Y = np.zeros((self.N, 1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)[:, None]
self.Z = 0
self.YYT = None
self.old_a = None
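# Illustrative sketch (not part of the class; kept commented out like the other
# scratch code in this commit): the mode-finding step the class docstring
# describes, hand-rolled for a single Student-t datum y with prior variance k.
# Newton-Raphson on psi(f) = log p(y|f) - 0.5*f**2/k converges to f_hat, and
# W = -d2(log p(y|f))/df2 at f_hat is the (here 1x1) negative Hessian used below.
# The numbers y, k, v, s2 are made up purely for illustration.
#y, k, v, s2 = 1.0, 1.0, 4.0, 1.0
#f = 0.0
#for _ in range(20):
#    r = y - f
#    dlp = (v + 1.0)*r / (v*s2 + r**2)                    # d log p(y|f) / df
#    d2lp = (v + 1.0)*(r**2 - v*s2) / (v*s2 + r**2)**2    # d2 log p(y|f) / df2
#    f -= (dlp - f/k) / (d2lp - 1.0/k)                    # Newton step on psi
#f_hat, W = f, -d2lp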
def predictive_values(self, mu, var, full_cov):
if full_cov:
raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
return self.likelihood_function.predictive_values(mu, var)
def _get_params(self):
return np.asarray(self.likelihood_function._get_params())
def _get_param_names(self):
return self.likelihood_function._get_param_names()
def _set_params(self, p):
return self.likelihood_function._set_params(p)
def _shared_gradients_components(self):
#FIXME: Careful of side effects! And make sure W and K are up to date!
d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
return dL_dfhat, I_KW_i
def _Kgradients(self, dK_dthetaK, X):
"""
Gradients with respect to prior kernel parameters
"""
dL_dfhat, I_KW_i = self._shared_gradients_components()
dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
#Implicit
impl = mdot(dlp, dL_dfhat, I_KW_i)
expl_a = mdot(self.Ki_f, self.Ki_f.T)
expl_b = self.Wi_K_i
#print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
dL_dthetaK_exp = dK_dthetaK(expl, X)
dL_dthetaK_imp = dK_dthetaK(impl, X)
#print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
#print "expl_a: {}, {} expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b))
dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
return dL_dthetaK
def _gradients(self, partial):
"""
Gradients with respect to likelihood parameters
"""
dL_dfhat, I_KW_i = self._shared_gradients_components()
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
num_params = len(dlik_dthetaL)
dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
for thetaL_i in range(num_params):
#Explicit
#dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
#a = 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
#d = dlik_hess_dthetaL[thetaL_i]
#e = pdinv(pdinv(self.K)[0] + np.diagflat(self.W))[0]
#b = 0.5*np.dot(np.diag(e).T, d)
#g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1))
#dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i])
dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
#Implicit
df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
#print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
def _compute_GP_variables(self):
"""
Generates pseudo-data \tilde{Y} which gives a Gaussian likelihood whose posterior is identical to the Laplace approximation.
GPy expects the likelihood to be Gaussian, so we need to calculate the points \tilde{Y} and \tilde{Z}
that make the posterior match the one found by a Laplace approximation to a non-Gaussian likelihood.
Given we are approximating $p(y|f)p(f)$ with a normal distribution (since $p(y|f)$ is not normal),
we have a rescaled normal distribution $z N(f|\hat{f}, \hat{H}^{-1})$ with the same area as $p(y|f)p(f)$
due to the $z$ rescaling.
This function finds the data $D = (\tilde{Y}, X)$ that would produce the same approximation,
i.e. a normal approximation of $\tilde{z}\, p(\tilde{Y}|f,X)p(f)$, via
$$\tilde{Y} = \tilde{\Sigma} \hat{H} \hat{f}$$
where
$$\tilde{\Sigma}^{-1} = \hat{H} - K^{-1} = -\nabla\nabla \log p(y|\hat{f}) = W$$
so that $\tilde{\Sigma} = W^{-1}$,
and
$$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{\top}\tilde{\Sigma}^{-1}\tilde{Y}$$
"""
#Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
#dtritri -> L -> L_i
#dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
#((L.T*w)_i + I)f_hat = y_tilde
#L = jitchol(self.K)
#Li = chol_inv(L)
#Lt_W = L.T*self.W.T
#Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
#self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
#Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
Wi = 1.0/self.W
self.Sigma_tilde = np.diagflat(Wi)
Y_tilde = Wi*self.Ki_f + self.f_hat
self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
#self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
self.aA = 0.5*self.ln_det_K_Wi__Bi
self.bB = - 0.5*self.f_Ki_f
self.cC = 0.5*self.y_Wi_Ki_i_y
Z_tilde = (+ self.lik
+ 0.5*self.ln_det_K_Wi__Bi
- 0.5*self.f_Ki_f
+ 0.5*self.y_Wi_Ki_i_y
)
print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
print self.likelihood_function._get_params()
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
self.Y = Y_tilde
self.YYT = np.dot(self.Y, self.Y.T)
self.covariance_matrix = self.Sigma_tilde
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
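# Derivation note for the pseudo-data above: with \hat{H} = W + K^{-1} and
# \tilde{\Sigma} = W^{-1},
#   \tilde{Y} = \tilde{\Sigma} \hat{H} \hat{f}
#             = W^{-1} (W + K^{-1}) \hat{f}
#             = \hat{f} + W^{-1} K^{-1} \hat{f}
# which is exactly the `Wi*self.Ki_f + self.f_hat` computed in this method
# (self.Ki_f holds K^{-1} \hat{f}).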
def fit_full(self, K):
"""
The Laplace approximation algorithm: find the mode \hat{f} and expand around it using the Hessian
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
:K: Covariance matrix
"""
self.K = K.copy()
#Find mode
self.f_hat = {
'rasm': self.rasm_mode,
'ncg': self.ncg_mode,
'nelder': self.nelder_mode
}[self.opt](self.K)
#Compute hessian and other variables at mode
self._compute_likelihood_variables()
def _compute_likelihood_variables(self):
#At this point get the hessian matrix (or vector as W is diagonal)
self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
if not self.likelihood_function.log_concave:
self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
#If the likelihood is non-log-concave. We wan't to say that there is a negative variance
#To cause the posterior to become less certain than the prior and likelihood,
#This is a property only held by non-log-concave likelihoods
#TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
self.Bi, _, _, B_det = pdinv(self.B)
#Do the computation again at f to get Ki_f which is useful
#b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
#solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
#a = b - self.W_12*solve_chol
self.Ki_f = self.a
self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
#For det, |I + KW| == |I + W_12*K*W_12|
self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
#self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
#self.ln_z_hat = (- 0.5*self.f_Ki_f
#- self.ln_I_KW_det
#+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
#)
return self._compute_GP_variables()
def _compute_B_statistics(self, K, W):
"""Rasmussen suggests the use of a numerically stable positive definite matrix B
Which has a positive diagonal element and can be easyily inverted
:K: Covariance matrix
:W: Negative hessian at a point (diagonal matrix)
:returns: (B, L)
"""
#W is diagonal so its sqrt is just the sqrt of the diagonal elements
W_12 = np.sqrt(W)
B = np.eye(self.N) + W_12*K*W_12.T
L = jitchol(B)
return (B, L, W_12)
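# Illustrative sketch of the stability claim (standalone, commented out): the
# eigenvalues of B = I + W^{1/2} K W^{1/2} are bounded below by 1, so B stays
# well conditioned even when K is close to singular. The sizes and jitter here
# are made up for illustration.
#import numpy as np
#rng = np.random.RandomState(0)
#A = rng.randn(50, 5)
#K = np.dot(A, A.T) + 1e-10*np.eye(50)        # nearly rank-deficient covariance
#W = np.ones((50, 1))                          # some diagonal negative Hessian
#W_12 = np.sqrt(W)
#B = np.eye(50) + W_12*K*W_12.T                # same broadcasting as in the method above
#print np.linalg.cond(K), np.linalg.cond(B)   # huge vs modest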
def nelder_mode(self, K):
f = np.zeros((self.N, 1))
self.Ki, _, _, self.ln_K_det = pdinv(K)
def obj(f):
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
return float(res)
res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
f_new = res.x
return f_new[:, None]
def ncg_mode(self, K):
"""
Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
:K: Covariance matrix
:returns: f_mode
"""
self.Ki, _, _, self.ln_K_det = pdinv(K)
f = np.zeros((self.N, 1))
#FIXME: Can we get rid of this horrible reshaping?
#ONLY WORKS FOR 1D DATA
def obj(f):
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
- self.NORMAL_CONST)
return float(res)
def obj_grad(f):
res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
return np.squeeze(res)
def obj_hess(f):
res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
return np.squeeze(res)
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
return f_hat[:, None]
def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
"""
Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
:K: Covariance matrix
:MAX_ITER: Maximum number of Newton-Raphson iterations before terminating the optimisation
:MAX_RESTART: Maximum number of step-size reductions before terminating the optimisation
:returns: f_mode
"""
self.old_before_s = self.likelihood_function._get_params()
print "before: ", self.old_before_s
#if self.old_before_s < 1e-5:
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
#old_a = np.zeros((self.N, 1))
if self.old_a is None:
old_a = np.zeros((self.N, 1))
f = np.dot(K, old_a)
else:
old_a = self.old_a.copy()
f = self.f_hat.copy()
new_obj = -np.inf
old_obj = np.inf
def obj(a, f):
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
difference = np.inf
epsilon = 1e-4
step_size = 1
rs = 0
i = 0
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
#W = np.maximum(W, 0)
if not self.likelihood_function.log_concave:
W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
# To cause the posterior to become less certain than the prior and likelihood,
# This is a property only held by non-log-concave likelihoods
B, L, W_12 = self._compute_B_statistics(K, W.copy())
W_f = W*f
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
b = W_f + grad
solve_L = cho_solve((L, True), W_12*np.dot(K, b))
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
full_step_a = b - W_12*solve_L
da = full_step_a - old_a
#f_old = f.copy()
#def inner_obj(step_size, old_a, da, K):
#a = old_a + step_size*da
#f = np.dot(K, a)
#self.a = a.copy() # This is nasty, need to set something within an optimization though
#self.f = f.copy()
#return -obj(a, f)
#from functools import partial
#i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
#new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
#f = self.f.copy()
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
f_old = f.copy()
update_passed = False
while not update_passed:
a = old_a + step_size*da
f = np.dot(K, a)
old_obj = new_obj
new_obj = obj(a, f)
difference = new_obj - old_obj
print "difference: ",difference
if difference < 0:
#print "Objective function rose", np.float(difference)
#If the objective function isn't rising, restart optimization
step_size *= 0.8
#print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
#objective function isn't increasing, try reducing step size
f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
old_obj = new_obj
rs += 1
else:
update_passed = True
#difference = abs(new_obj - old_obj)
#old_obj = new_obj.copy()
#difference = np.abs(np.sum(f - f_old))
difference = np.abs(np.sum(a - old_a))
#old_a = self.a.copy() #a
old_a = a.copy()
i += 1
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.old_a = old_a.copy()
#print "Positive difference obj: ", np.float(difference)
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
print "Iterations: {}, Final_difference: {}".format(i, difference)
if difference > 1e-4:
print "FAIL FAIL FAIL FAIL FAIL FAIL"
if False:
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
if hasattr(self, 'X'):
import pylab as pb
pb.figure()
pb.subplot(311)
pb.title('old f_hat')
pb.plot(self.X, self.f_hat)
pb.subplot(312)
pb.title('old ff')
pb.plot(self.X, self.old_ff)
pb.subplot(313)
pb.title('new f_hat')
pb.plot(self.X, f)
pb.figure()
pb.subplot(121)
pb.title('old K')
pb.imshow(np.diagflat(self.old_K), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new K')
pb.imshow(np.diagflat(K), interpolation='none')
pb.colorbar()
pb.figure()
pb.subplot(121)
pb.title('old W')
pb.imshow(np.diagflat(self.old_W), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new W')
pb.imshow(np.diagflat(W), interpolation='none')
pb.colorbar()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
pb.close('all')
#FIXME: DELETE THESE
self.old_W = W.copy()
self.old_grad = grad.copy()
self.old_B = B.copy()
self.old_W_12 = W_12.copy()
self.old_ff = f.copy()
self.old_K = self.K.copy()
self.old_s = self.likelihood_function._get_params()
print "after: ", self.old_s
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.a = a
#self.B, self.B_chol, self.W_12 = B, L, W_12
#self.Bi, _, _, B_det = pdinv(self.B)
return f

GPy/models/GP.py (new file, 319 additions)

@@ -0,0 +1,319 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import linalg
import pylab as pb
from .. import kern
from ..core import model
from ..util.linalg import pdinv, mdot, tdot
from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
from ..likelihoods import EP, Laplace
class GP(model):
"""
Gaussian Process model for regression and EP
:param X: input observations
:param kernel: a GPy kernel, defaults to rbf+white
:param likelihood: a GPy likelihood
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:rtype: model object
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
:type powerep: list
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, likelihood, kernel, normalize_X=False):
self.has_uncertain_inputs=False
# parse arguments
self.X = X
assert len(self.X.shape) == 2
self.N, self.Q = self.X.shape
assert isinstance(kernel, kern.kern)
self.kern = kernel
self.likelihood = likelihood
assert self.X.shape[0] == self.likelihood.data.shape[0]
self.N, self.D = self.likelihood.data.shape
# here's some simple normalization for the inputs
if normalize_X:
self._Xmean = X.mean(0)[None, :]
self._Xstd = X.std(0)[None, :]
self.X = (X.copy() - self._Xmean) / self._Xstd
if hasattr(self, 'Z'):
self.Z = (self.Z - self._Xmean) / self._Xstd
else:
self._Xmean = np.zeros((1, self.X.shape[1]))
self._Xstd = np.ones((1, self.X.shape[1]))
if not hasattr(self,'has_uncertain_inputs'):
self.has_uncertain_inputs = False
model.__init__(self)
def dL_dZ(self):
"""
TODO: one day we might like to learn Z by gradient methods?
"""
#FIXME: this doesn't live here.
return np.zeros_like(self.Z)
def _set_params(self, p):
self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
# self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas
if isinstance(self.likelihood, Laplace):
self.likelihood.fit_full(self.kern.K(self.X))
self.likelihood._set_params(self.likelihood._get_params())
self.K = self.kern.K(self.X)
self.K += self.likelihood.covariance_matrix
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
# the gradient of the likelihood wrt the covariance matrix
if self.likelihood.YYT is None:
#alpha = np.dot(self.Ki, self.likelihood.Y)
alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1)
self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
else:
#tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
def _get_params(self):
return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def _update_params_callback(self, p):
#parameters will be in transformed space
self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
#set_params_transformed for likelihood doesn't exist?
self.likelihood._set_params(p[self.kern.Nparam_transformed():])
#update the likelihood approximation within the optimisation with the current parameters
self.update_likelihood_approximation()
def update_likelihood_approximation(self):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation
or the Laplace approximation (whichever self.likelihood implements).
For a Gaussian likelihood, no iteration is required: this function does nothing.
"""
self.likelihood.fit_full(self.kern.K(self.X))
self._set_params(self._get_params()) # update the GP
def _model_fit_term(self):
"""
Computes the model fit using YYT if it's available
"""
if self.likelihood.YYT is None:
tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
return -0.5 * np.sum(np.square(tmp))
#return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
else:
return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
def log_likelihood(self):
"""
The log marginal likelihood of the GP.
For an EP model, can be written as the log likelihood of a regression
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
"""
#if isinstance(self.likelihood, Laplace):
#self.likelihood.fit_full(self.kern.K(self.X))
#self.likelihood._set_params(self.likelihood._get_params())
l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
return l
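#In equation form (with Y_tilde and K_tilde = kern.K(X) + covariance_matrix being
#the pseudo-data and pseudo-covariance supplied by the EP/Laplace likelihood, and
#constants folded into the likelihood's Z term):
#  log p(y) ~= -D/2 * log|K_tilde| - 1/2 * Y_tilde^T K_tilde^{-1} Y_tilde + Z
#which is what the three terms above compute (the fit term sums over output
#columns when D > 1).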
def _log_likelihood_gradients(self):
"""
The gradient of all parameters.
Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
"""
dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
print "dL_dthetaK should be: ", dL_dthetaK
if isinstance(self.likelihood, Laplace):
#self.likelihood.fit_full(self.kern.K(self.X))
#self.likelihood._set_params(self.likelihood._get_params())
dK_dthetaK = self.kern.dK_dtheta
dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
else:
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
#print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
#print "dL_dthetaK: {} dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
return np.hstack((dL_dthetaK, dL_dthetaL))
#return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
"""
Internal helper function for making predictions, does not account
for normalization or likelihood
"""
Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
#KiKx = np.dot(self.Ki, Kx)
KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
mu = np.dot(KiKx.T, self.likelihood.Y)
if full_cov:
Kxx = self.kern.K(_Xnew, which_parts=which_parts)
var = Kxx - np.dot(KiKx.T, Kx)
else:
Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
var = var[:, None]
if stop:
debug_this # deliberately undefined name: raises NameError as a crude breakpoint when stop=True
return mu, var
def predict(self, Xnew, which_parts='all', full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.Q
:param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.D
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.D
If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
"""
# normalize X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
mu, var = self._raw_predict(Xnew, which_parts, full_cov)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
return mean, var, _025pm, _975pm
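#Usage sketch (illustrative; `m` is a fitted model and `Xnew` an Nnew x Q array):
#  mean, var, lower, upper = m.predict(Xnew)
#where `lower` and `upper` are the 2.5% and 97.5% predictive quantiles returned
#by the likelihood's predictive_values().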
def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
"""
Plot the GP's view of the world, where the data is normalized and the
likelihood is Gaussian.
:param samples: the number of a posteriori samples to plot
:param which_data: which of the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions
using which_data and which_parts
"""
if which_data == 'all':
which_data = slice(None)
if self.X.shape[1] == 1:
Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
if samples == 0:
m, v = self._raw_predict(Xnew, which_parts=which_parts)
gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
else:
m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
for i in range(samples):
pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
pb.xlim(xmin, xmax)
ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
pb.ylim(ymin, ymax)
if hasattr(self, 'Z'):
pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
elif self.X.shape[1] == 2:
resolution = resolution or 50
Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
m, v = self._raw_predict(Xnew, which_parts=which_parts)
m = m.reshape(resolution, resolution).T
pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
pb.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
pb.xlim(xmin[0], xmax[0])
pb.ylim(xmin[1], xmax[1])
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
"""
TODO: Docstrings!
:param levels: for 2D plotting, the number of contour levels to use
"""
# TODO include samples
if which_data == 'all':
which_data = slice(None)
if self.X.shape[1] == 1:
Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now
Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
gpplot(Xnew, m, lower, upper)
pb.plot(Xu[which_data], self.likelihood.data[which_data], 'kx', mew=1.5)
if self.has_uncertain_inputs:
pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
pb.xlim(xmin, xmax)
pb.ylim(ymin, ymax)
if hasattr(self, 'Z'):
Zu = self.Z * self._Xstd + self._Xmean
pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
# pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
elif self.X.shape[1] == 2: # FIXME
resolution = resolution or 50
Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
m = m.reshape(resolution, resolution).T
pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
Yf = self.likelihood.Y.flatten()
pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
pb.xlim(xmin[0], xmax[0])
pb.ylim(xmin[1], xmax[1])
if hasattr(self, 'Z'):
pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"