Tidied up laplace

This commit is contained in:
Alan Saul 2013-10-03 19:04:00 +01:00
parent 8343615098
commit da67e39e50
4 changed files with 159 additions and 283 deletions

View file

@ -27,7 +27,7 @@ def timing():
kernel1 = GPy.kern.rbf(X.shape[1]) kernel1 = GPy.kern.rbf(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
m = GPy.models.GPRegression(X, Yc.copy(), kernel1, likelihood=corrupt_stu_t_likelihood) m = GPy.models.GPRegression(X, Yc.copy(), kernel1, likelihood=corrupt_stu_t_likelihood)
m.ensure_default_constraints() m.ensure_default_constraints()
m.update_likelihood_approximation() m.update_likelihood_approximation()
@ -56,7 +56,7 @@ def v_fail_test():
print "Clean student t, rasm" print "Clean student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y.copy(), kernel1, likelihood=stu_t_likelihood) m = GPy.models.GPRegression(X, Y.copy(), kernel1, likelihood=stu_t_likelihood)
m.constrain_positive('') m.constrain_positive('')
vs = 25 vs = 25
@ -103,7 +103,7 @@ def student_t_obj_plane():
kernelst = kernelgp.copy() kernelst = kernelgp.copy()
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=(real_std**2)) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=(real_std**2))
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood) m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood)
m.ensure_default_constraints() m.ensure_default_constraints()
m.constrain_fixed('t_no', real_std**2) m.constrain_fixed('t_no', real_std**2)
@ -156,7 +156,7 @@ def student_t_f_check():
kernelst = kernelgp.copy() kernelst = kernelgp.copy()
#kernelst += GPy.kern.bias(X.shape[1]) #kernelst += GPy.kern.bias(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=0.05) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=0.05)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood) m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood)
#m['rbf_v'] = mgp._get_params()[0] #m['rbf_v'] = mgp._get_params()[0]
#m['rbf_l'] = mgp._get_params()[1] + 1 #m['rbf_l'] = mgp._get_params()[1] + 1
@ -208,7 +208,7 @@ def student_t_fix_optimise_check():
real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free)) real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=real_stu_t_std2) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=real_stu_t_std2)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
plt.figure(1) plt.figure(1)
plt.suptitle('Student likelihood') plt.suptitle('Student likelihood')
@ -351,7 +351,7 @@ def debug_student_t_noise_approx():
print "Clean student t, rasm" print "Clean student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y, kernel6, likelihood=stu_t_likelihood) m = GPy.models.GPRegression(X, Y, kernel6, likelihood=stu_t_likelihood)
#m['rbf_len'] = 1.5 #m['rbf_len'] = 1.5
@ -488,7 +488,7 @@ def student_t_approx():
print "Clean student t, rasm" print "Clean student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood) m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
m.ensure_default_constraints() m.ensure_default_constraints()
m.constrain_positive('t_noise') m.constrain_positive('t_noise')
@ -504,7 +504,7 @@ def student_t_approx():
print "Corrupt student t, rasm" print "Corrupt student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood) m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
m.ensure_default_constraints() m.ensure_default_constraints()
m.constrain_positive('t_noise') m.constrain_positive('t_noise')
@ -526,51 +526,22 @@ def student_t_approx():
import ipdb; ipdb.set_trace() # XXX BREAKPOINT import ipdb; ipdb.set_trace() # XXX BREAKPOINT
return m return m
#print "Clean student t, ncg" #with a student t distribution, since it has heavy tails it should work well
#t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) #likelihood_function = student_t(deg_free=deg_free, sigma2=real_var)
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #lap = Laplace(Y, likelihood_function)
#m = GPy.models.GPRegression(X, Y, kernel3, likelihood=stu_t_likelihood) #cov = kernel.K(X)
#m.ensure_default_constraints() #lap.fit_full(cov)
#m.update_likelihood_approximation()
#m.optimize()
#print(m)
#plt.subplot(221)
#m.plot()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.title('Student-t ncg clean')
#print "Corrupt student t, ncg" #test_range = np.arange(0, 10, 0.1)
#t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) #plt.plot(test_range, t_rv.pdf(test_range))
#corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg') #for i in xrange(X.shape[0]):
#m = GPy.models.GPRegression(X, Y, kernel5, likelihood=corrupt_stu_t_likelihood) #mode = lap.f_hat[i]
#m.ensure_default_constraints() #covariance = lap.hess_hat_i[i,i]
#m.update_likelihood_approximation() #scaling = np.exp(lap.ln_z_hat)
#m.optimize() #normalised_approx = norm(loc=mode, scale=covariance)
#print(m) #print "Normal with mode %f, and variance %f" % (mode, covariance)
#plt.subplot(223) #plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
#m.plot() #plt.show()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.title('Student-t ncg corrupt')
###with a student t distribution, since it has heavy tails it should work well
###likelihood_function = student_t(deg_free=deg_free, sigma2=real_var)
###lap = Laplace(Y, likelihood_function)
###cov = kernel.K(X)
###lap.fit_full(cov)
###test_range = np.arange(0, 10, 0.1)
###plt.plot(test_range, t_rv.pdf(test_range))
###for i in xrange(X.shape[0]):
###mode = lap.f_hat[i]
###covariance = lap.hess_hat_i[i,i]
###scaling = np.exp(lap.ln_z_hat)
###normalised_approx = norm(loc=mode, scale=covariance)
###print "Normal with mode %f, and variance %f" % (mode, covariance)
###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
###plt.show()
return m return m
@ -625,7 +596,7 @@ def gaussian_f_check():
#kernelst += GPy.kern.bias(X.shape[1]) #kernelst += GPy.kern.bias(X.shape[1])
N, D = X.shape N, D = X.shape
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=0.1, N=N, D=D) g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=0.1, N=N, D=D)
g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm') g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution)
m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood) m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood)
m.likelihood.X = X m.likelihood.X = X
#m['rbf_v'] = mgp._get_params()[0] #m['rbf_v'] = mgp._get_params()[0]
@ -702,7 +673,7 @@ def boston_example():
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
N, D = Y_train.shape N, D = Y_train.shape
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D) g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution, opt='rasm') g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood) mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
mg.ensure_default_constraints() mg.ensure_default_constraints()
mg.constrain_positive('noise_variance') mg.constrain_positive('noise_variance')
@ -729,7 +700,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free) print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood) mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints() mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5) mstu_t.constrain_fixed('white', 1e-5)
@ -755,7 +726,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free) print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood) mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints() mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5) mstu_t.constrain_fixed('white', 1e-5)
@ -782,7 +753,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free) print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood) mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints() mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5) mstu_t.constrain_fixed('white', 1e-5)
@ -808,7 +779,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free) print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood) mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints() mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5) mstu_t.constrain_fixed('white', 1e-5)

View file

@ -1,42 +1,42 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
import scipy as sp import scipy as sp
import GPy from scipy.linalg import cho_solve
from scipy.linalg import inv, cho_solve, det
from numpy.linalg import cond
from likelihood import likelihood from likelihood import likelihood
from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet, dtrtrs from ..util.linalg import mdot, jitchol, pddet
from scipy.linalg.lapack import dtrtrs from scipy.linalg.lapack import dtrtrs
import random from functools import partial as partial_func
from functools import partial
#import pylab as plt
class Laplace(likelihood): class Laplace(likelihood):
"""Laplace approximation to a posterior""" """Laplace approximation to a posterior"""
def __init__(self, data, noise_model, extra_data=None, opt='rasm'): def __init__(self, data, noise_model, extra_data=None):
""" """
Laplace Approximation Laplace Approximation
First find the moments \hat{f} and the hessian at this point (using Newton-Raphson) Find the moments \hat{f} and the hessian at this point
then find the z^{prime} which allows this to be a normalised gaussian instead of a (using Newton-Raphson) of the unnormalised posterior
non-normalized gaussian
Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle} Compute the GP variables (i.e. generate some Y^{squiggle} and
which makes a gaussian the same as the laplace approximation z^{squiggle} which makes a gaussian the same as the laplace
approximation to the posterior, but normalised
Arguments Arguments
--------- ---------
:data: array of data the likelihood function is approximating :param data: array of data the likelihood function is approximating
:noise_model: likelihood function - subclass of noise_model :type data: NxD
:extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data :param noise_model: likelihood function - subclass of noise_model
:opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data) :type noise_model: noise_model
:param extra_data: additional data used by some likelihood functions,
for example survival likelihoods need censoring data
""" """
self.data = data self.data = data
self.noise_model = noise_model self.noise_model = noise_model
self.extra_data = extra_data self.extra_data = extra_data
self.opt = opt
#Inital values #Inital values
self.N, self.D = self.data.shape self.N, self.D = self.data.shape
@ -48,6 +48,9 @@ class Laplace(likelihood):
likelihood.__init__(self) likelihood.__init__(self)
def restart(self): def restart(self):
"""
Reset likelihood variables to their defaults
"""
#Initial values for the GP variables #Initial values for the GP variables
self.Y = np.zeros((self.N, 1)) self.Y = np.zeros((self.N, 1))
self.covariance_matrix = np.eye(self.N) self.covariance_matrix = np.eye(self.N)
@ -55,11 +58,12 @@ class Laplace(likelihood):
self.Z = 0 self.Z = 0
self.YYT = None self.YYT = None
self.old_a = None self.old_Ki_f = None
def predictive_values(self, mu, var, full_cov): def predictive_values(self, mu, var, full_cov):
if full_cov: if full_cov:
raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood") raise NotImplementedError("Cannot make correlated predictions\
with an Laplace likelihood")
return self.noise_model.predictive_values(mu, var) return self.noise_model.predictive_values(mu, var)
def _get_params(self): def _get_params(self):
@ -79,7 +83,10 @@ class Laplace(likelihood):
def _Kgradients(self): def _Kgradients(self):
""" """
Gradients with respect to prior kernel parameters Gradients with respect to prior kernel parameters dL_dK to be chained
with dK_dthetaK to give dL_dthetaK
:returns: dL_dK matrix
:rtype: Matrix (1 x num_kernel_params)
""" """
dL_dfhat, I_KW_i = self._shared_gradients_components() dL_dfhat, I_KW_i = self._shared_gradients_components()
dlp = self.noise_model.dlik_df(self.data, self.f_hat) dlp = self.noise_model.dlik_df(self.data, self.f_hat)
@ -93,19 +100,25 @@ class Laplace(likelihood):
#Implicit #Implicit
impl = mdot(dlp, dL_dfhat, I_KW_i) impl = mdot(dlp, dL_dfhat, I_KW_i)
#No longer required as we are computing these in the gp already otherwise we would take them away and add them back #No longer required as we are computing these in the gp already
#otherwise we would take them away and add them back
#dL_dthetaK_imp = dK_dthetaK(impl, X) #dL_dthetaK_imp = dK_dthetaK(impl, X)
#dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
#dL_dK = expl + impl #dL_dK = expl + impl
#No need to compute explicit as we are computing dZ_dK to account for the difference #No need to compute explicit as we are computing dZ_dK to account
#Between the K gradients of a normal GP, and the K gradients including the implicit part #for the difference between the K gradients of a normal GP,
#and the K gradients including the implicit part
dL_dK = impl dL_dK = impl
return dL_dK return dL_dK
def _gradients(self, partial): def _gradients(self, partial):
""" """
Gradients with respect to likelihood parameters Gradients with respect to likelihood parameters (dL_dthetaL)
:param partial: Not needed by this likelihood
:type partial: lambda function
:rtype: array of derivatives (1 x num_likelihood_params)
""" """
dL_dfhat, I_KW_i = self._shared_gradients_components() dL_dfhat, I_KW_i = self._shared_gradients_components()
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.data, self.f_hat) dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.data, self.f_hat)
@ -123,62 +136,51 @@ class Laplace(likelihood):
#Implicit #Implicit
dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL) dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
#print "LIK: dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) return dL_dthetaL
def _compute_GP_variables(self): def _compute_GP_variables(self):
""" """
Generates data Y which would give the normal distribution identical to the laplace approximation Generate data Y which would give the normal distribution identical
to the laplace approximation to the posterior, but normalised
GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} GPy expects a likelihood to be gaussian, so need to caluclate
that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood the data Y^{\tilde} that makes the posterior match that found
by a laplace approximation to a non-gaussian likelihood but with
a gaussian likelihood
Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal) Firstly,
then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f) The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
due to the z rescaling. i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
we wish to find the hessian \Sigma^{\tilde}
that has the same curvature but using our new simulated data Y^{\tilde}
i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1) Secondly,
This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1) GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f) So we can suck up any differences between that and our log marginal likelihood approximation
p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
$$\tilde{Y} = \tilde{\Sigma} Hf$$ which we want to optimize instead, by equating them and rearranging, the difference is added onto
where the log p(y) that GPy optimizes by default
$$\tilde{\Sigma}^{-1} = H - K^{-1}$$
i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
$$\tilde{\Sigma} = W^{-1}$$
Thirdly,
Since we have gradients that depend on how we move f^{\hat}, we have implicit components
aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
gp.py code
""" """
#Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
#dtritri -> L -> L_i
#dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
#((L.T*w)_i + I)f_hat = y_tilde
#L = jitchol(self.K)
#Li = chol_inv(L)
#Lt_W = L.T*self.W.T
#Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
#self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
#Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
Wi = 1.0/self.W Wi = 1.0/self.W
self.Sigma_tilde = np.diagflat(Wi) self.Sigma_tilde = np.diagflat(Wi)
Y_tilde = Wi*self.Ki_f + self.f_hat Y_tilde = Wi*self.Ki_f + self.f_hat
#self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
#self.Wi_K_i = self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12))
self.Wi_K_i = self.W12BiW12 self.Wi_K_i = self.W12BiW12
#self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below
self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K) self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
self.lik = self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data) self.lik = self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data)
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
Z_tilde = (+ self.lik Z_tilde = (+ self.lik
- 0.5*self.ln_B_det - 0.5*self.ln_B_det
+ 0.5*self.ln_det_Wi_K + 0.5*self.ln_det_Wi_K
@ -201,54 +203,46 @@ class Laplace(likelihood):
""" """
The laplace approximation algorithm, find K and expand hessian The laplace approximation algorithm, find K and expand hessian
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
:K: Covariance matrix :param K: Covariance matrix evaluated at locations X
:type K: NxD matrix
""" """
self.K = K.copy() self.K = K.copy()
#Find mode #Find mode
self.f_hat = { self.f_hat = self.rasm_mode(self.K)
'rasm': self.rasm_mode,
'ncg': self.ncg_mode,
'nelder': self.nelder_mode
}[self.opt](self.K)
#Compute hessian and other variables at mode #Compute hessian and other variables at mode
self._compute_likelihood_variables() self._compute_likelihood_variables()
#Compute fake variables replicating laplace approximation to posterior
self._compute_GP_variables()
def _compute_likelihood_variables(self): def _compute_likelihood_variables(self):
"""
Compute the variables required to compute gaussian Y variables
"""
#At this point get the hessian matrix (or vector as W is diagonal) #At this point get the hessian matrix (or vector as W is diagonal)
self.W = -self.noise_model.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) self.W = -self.noise_model.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
#TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N)) self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
#Do the computation again at f to get Ki_f which is useful self.Ki_f = self.Ki_f
#b = self.W*self.f_hat + self.noise_model.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
#solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
#a = b - self.W_12*solve_chol
self.Ki_f = self.a
self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f) self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K) self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
#For det, |I + KW| == |I + W_12*K*W_12|
#self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
#self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
#self.ln_z_hat = (- 0.5*self.f_Ki_f
#- self.ln_I_KW_det
#+ self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data)
#)
return self._compute_GP_variables()
def _compute_B_statistics(self, K, W, a): def _compute_B_statistics(self, K, W, a):
"""Rasmussen suggests the use of a numerically stable positive definite matrix B """
Rasmussen suggests the use of a numerically stable positive definite matrix B
Which has a positive diagonal element and can be easyily inverted Which has a positive diagonal element and can be easyily inverted
:K: Covariance matrix :param K: Covariance matrix evaluated at locations X
:W: Negative hessian at a point (diagonal matrix) :type K: NxD matrix
:returns: (B, L) :param W: Negative hessian at a point (diagonal matrix)
:type W: Vector of diagonal values of hessian (1xN)
:param a: Matrix to calculate W12BiW12a
:type a: Matrix NxN
:returns: (W12BiW12, ln_B_det)
""" """
if not self.noise_model.log_concave: if not self.noise_model.log_concave:
#print "Under 1e-10: {}".format(np.sum(W < 1e-10)) #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
@ -265,74 +259,37 @@ class Laplace(likelihood):
W12BiW12= W_12*cho_solve((L, True), W_12*a) W12BiW12= W_12*cho_solve((L, True), W_12*a)
ln_B_det = 2*np.sum(np.log(np.diag(L))) ln_B_det = 2*np.sum(np.log(np.diag(L)))
return (W12BiW12, ln_B_det) return W12BiW12, ln_B_det
def nelder_mode(self, K): def rasm_mode(self, K, MAX_ITER=100):
f = np.zeros((self.N, 1))
self.Ki, _, _, self.ln_K_det = pdinv(K)
def obj(f):
res = -1 * (self.noise_model.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
return float(res)
res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
f_new = res.x
return f_new[:, None]
def ncg_mode(self, K):
"""
Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
:K: Covariance matrix
:returns: f_mode
"""
self.Ki, _, _, self.ln_K_det = pdinv(K)
f = np.zeros((self.N, 1))
#FIXME: Can we get rid of this horrible reshaping?
#ONLY WORKS FOR 1D DATA
def obj(f):
res = -1 * (self.noise_model.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
- self.NORMAL_CONST)
return float(res)
def obj_grad(f):
res = -1 * (self.noise_model.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
return np.squeeze(res)
def obj_hess(f):
res = -1 * (np.diag(self.noise_model.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
return np.squeeze(res)
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
return f_hat[:, None]
def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
""" """
Rasmussen's numerically stable mode finding Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006 For nomenclature see Rasmussen & Williams 2006
Influenced by GPML (BSD) code, all errors are our own
:K: Covariance matrix :param K: Covariance matrix evaluated at locations X
:MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation :type K: NxD matrix
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
:returns: f_mode :type MAX_ITER: scalar
:returns: f_hat, mode on which to make laplace approxmiation
:rtype: NxD matrix
""" """
#self.old_before_s = self.noise_model._get_params() #old_Ki_f = np.zeros((self.N, 1))
#print "before: ", self.old_before_s
#if self.old_before_s < 1e-5:
#old_a = np.zeros((self.N, 1)) #Start f's at zero originally
if self.old_a is None: if self.old_Ki_f is None:
old_a = np.zeros((self.N, 1)) old_Ki_f = np.zeros((self.N, 1))
f = np.dot(K, old_a) f = np.dot(K, old_Ki_f)
else: else:
old_a = self.old_a.copy() #Start at the old best point
old_Ki_f = self.old_Ki_f.copy()
f = self.f_hat.copy() f = self.f_hat.copy()
new_obj = -np.inf new_obj = -np.inf
old_obj = np.inf old_obj = np.inf
def obj(a, f): def obj(Ki_f, f):
return -0.5*np.dot(a.T, f) + self.noise_model.link_function(self.data, f, extra_data=self.extra_data) return -0.5*np.dot(Ki_f.T, f) + self.noise_model.link_function(self.data, f, extra_data=self.extra_data)
difference = np.inf difference = np.inf
epsilon = 1e-6 epsilon = 1e-6
@ -340,42 +297,43 @@ class Laplace(likelihood):
rs = 0 rs = 0
i = 0 i = 0
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART: while difference > epsilon and i < MAX_ITER:
W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data) W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data)
W_f = W*f W_f = W*f
grad = self.noise_model.dlik_df(self.data, f, extra_data=self.extra_data) grad = self.noise_model.dlik_df(self.data, f, extra_data=self.extra_data)
b = W_f + grad b = W_f + grad
#TODO!!!
W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b)) W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
#solve_L = cho_solve((L, True), W_12*np.dot(K, b))
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
full_step_a = b - W12BiW12Kb full_step_Ki_f = b - W12BiW12Kb
da = full_step_a - old_a dKi_f = full_step_Ki_f - old_Ki_f
f_old = f.copy() f_old = f.copy()
def inner_obj(step_size, old_a, da, K): def inner_obj(step_size, old_Ki_f, dKi_f, K):
a = old_a + step_size*da Ki_f = old_Ki_f + step_size*dKi_f
f = np.dot(K, a) f = np.dot(K, Ki_f)
self.a = a.copy() # This is nasty, need to set something within an optimization though # This is nasty, need to set something within an optimization though
self.Ki_f = Ki_f.copy()
self.f = f.copy() self.f = f.copy()
return -obj(a, f) return -obj(Ki_f, f)
i_o = partial(inner_obj, old_a=old_a, da=da, K=K) i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
#new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20) #Find the stepsize that minimizes the objective function using a brent line search
new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':30}).fun new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':30}).fun
f = self.f.copy() f = self.f.copy()
a = self.a.copy() Ki_f = self.Ki_f.copy()
#Optimize without linesearch
#f_old = f.copy() #f_old = f.copy()
#update_passed = False #update_passed = False
#while not update_passed: #while not update_passed:
#a = old_a + step_size*da #Ki_f = old_Ki_f + step_size*dKi_f
#f = np.dot(K, a) #f = np.dot(K, Ki_f)
#old_obj = new_obj #old_obj = new_obj
#new_obj = obj(a, f) #new_obj = obj(Ki_f, f)
#difference = new_obj - old_obj #difference = new_obj - old_obj
##print "difference: ",difference ##print "difference: ",difference
#if difference < 0: #if difference < 0:
@ -390,70 +348,18 @@ class Laplace(likelihood):
#else: #else:
#update_passed = True #update_passed = True
#old_Ki_f = self.Ki_f.copy()
#difference = abs(new_obj - old_obj) #difference = abs(new_obj - old_obj)
#old_obj = new_obj.copy() #old_obj = new_obj.copy()
#difference = np.abs(np.sum(f - f_old)) #difference = np.abs(np.sum(f - f_old))
difference = np.abs(np.sum(a - old_a)) difference = np.abs(np.sum(Ki_f - old_Ki_f))
#old_a = self.a.copy() #a old_Ki_f = Ki_f.copy()
old_a = a.copy()
i += 1 i += 1
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.old_a = old_a.copy() self.old_Ki_f = old_Ki_f.copy()
#print "Positive difference obj: ", np.float(difference)
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
#print "Iterations: {}, Final_difference: {}".format(i, difference)
if difference > epsilon: if difference > epsilon:
print "Not perfect f_hat fit difference: {}".format(difference) print "Not perfect f_hat fit difference: {}".format(difference)
if False:
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
if hasattr(self, 'X'):
import pylab as pb
pb.figure()
pb.subplot(311)
pb.title('old f_hat')
pb.plot(self.X, self.f_hat)
pb.subplot(312)
pb.title('old ff')
pb.plot(self.X, self.old_ff)
pb.subplot(313)
pb.title('new f_hat')
pb.plot(self.X, f)
pb.figure() self.Ki_f = Ki_f
pb.subplot(121)
pb.title('old K')
pb.imshow(np.diagflat(self.old_K), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new K')
pb.imshow(np.diagflat(K), interpolation='none')
pb.colorbar()
pb.figure()
pb.subplot(121)
pb.title('old W')
pb.imshow(np.diagflat(self.old_W), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new W')
pb.imshow(np.diagflat(W), interpolation='none')
pb.colorbar()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
pb.close('all')
#FIXME: DELETE THESE
#self.old_W = W.copy()
#self.old_grad = grad.copy()
#self.old_B = B.copy()
#self.old_W_12 = W_12.copy()
#self.old_ff = f.copy()
#self.old_K = self.K.copy()
#self.old_s = self.noise_model._get_params()
#print "after: ", self.old_s
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.a = a
#self.B, self.B_chol, self.W_12 = B, L, W_12
#self.Bi, _, _, B_det = pdinv(self.B)
return f return f

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from scipy import stats,special from scipy import stats, special
import scipy as sp import scipy as sp
import gp_transformations import gp_transformations
from noise_distributions import NoiseDistribution from noise_distributions import NoiseDistribution
@ -180,7 +180,6 @@ class StudentT(NoiseDistribution):
#However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
true_var = sigma**2 + self.variance true_var = sigma**2 + self.variance
print "True var: {}".format(true_var)
return true_var return true_var
def _predictive_mean_analytical(self, mu, var): def _predictive_mean_analytical(self, mu, var):

View file

@ -218,7 +218,7 @@ class LaplaceTests(unittest.TestCase):
print "\n{}".format(inspect.stack()[0][3]) print "\n{}".format(inspect.stack()[0][3])
self.Y = self.Y/self.Y.max() self.Y = self.Y/self.Y.max()
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1]) kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss, opt='rasm') gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace) m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace)
m.ensure_default_constraints() m.ensure_default_constraints()
m.randomize() m.randomize()
@ -230,7 +230,7 @@ class LaplaceTests(unittest.TestCase):
self.Y = self.Y/self.Y.max() self.Y = self.Y/self.Y.max()
self.stu_t = GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var) self.stu_t = GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var)
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1]) kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm') stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace) m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
m.ensure_default_constraints() m.ensure_default_constraints()
m.constrain_positive('t_noise') m.constrain_positive('t_noise')
@ -244,7 +244,7 @@ class LaplaceTests(unittest.TestCase):
self.Y = self.Y/self.Y.max() self.Y = self.Y/self.Y.max()
white_var = 1 white_var = 1
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1]) kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm') stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace) m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
m.ensure_default_constraints() m.ensure_default_constraints()
m.constrain_positive('t_noise') m.constrain_positive('t_noise')
@ -259,7 +259,7 @@ class LaplaceTests(unittest.TestCase):
self.Y = self.Y/self.Y.max() self.Y = self.Y/self.Y.max()
white_var = 1 white_var = 1
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1]) kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm') stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace) m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
m.ensure_default_constraints() m.ensure_default_constraints()
m.constrain_positive('t_noise') m.constrain_positive('t_noise')