Tidied up laplace

This commit is contained in:
Alan Saul 2013-10-03 19:04:00 +01:00
parent 8343615098
commit da67e39e50
4 changed files with 159 additions and 283 deletions

View file

@ -27,7 +27,7 @@ def timing():
kernel1 = GPy.kern.rbf(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
m = GPy.models.GPRegression(X, Yc.copy(), kernel1, likelihood=corrupt_stu_t_likelihood)
m.ensure_default_constraints()
m.update_likelihood_approximation()
@ -56,7 +56,7 @@ def v_fail_test():
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y.copy(), kernel1, likelihood=stu_t_likelihood)
m.constrain_positive('')
vs = 25
@ -103,7 +103,7 @@ def student_t_obj_plane():
kernelst = kernelgp.copy()
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=(real_std**2))
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood)
m.ensure_default_constraints()
m.constrain_fixed('t_no', real_std**2)
@ -156,7 +156,7 @@ def student_t_f_check():
kernelst = kernelgp.copy()
#kernelst += GPy.kern.bias(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=0.05)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood)
#m['rbf_v'] = mgp._get_params()[0]
#m['rbf_l'] = mgp._get_params()[1] + 1
@ -208,7 +208,7 @@ def student_t_fix_optimise_check():
real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=real_stu_t_std2)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
plt.figure(1)
plt.suptitle('Student likelihood')
@ -351,7 +351,7 @@ def debug_student_t_noise_approx():
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y, kernel6, likelihood=stu_t_likelihood)
#m['rbf_len'] = 1.5
@ -488,7 +488,7 @@ def student_t_approx():
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
m.ensure_default_constraints()
m.constrain_positive('t_noise')
@ -504,7 +504,7 @@ def student_t_approx():
print "Corrupt student t, rasm"
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
m.ensure_default_constraints()
m.constrain_positive('t_noise')
@ -526,51 +526,22 @@ def student_t_approx():
import ipdb; ipdb.set_trace() # XXX BREAKPOINT
return m
#print "Clean student t, ncg"
#t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
#m = GPy.models.GPRegression(X, Y, kernel3, likelihood=stu_t_likelihood)
#m.ensure_default_constraints()
#m.update_likelihood_approximation()
#m.optimize()
#print(m)
#plt.subplot(221)
#m.plot()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.title('Student-t ncg clean')
#with a student t distribution, since it has heavy tails it should work well
#likelihood_function = student_t(deg_free=deg_free, sigma2=real_var)
#lap = Laplace(Y, likelihood_function)
#cov = kernel.K(X)
#lap.fit_full(cov)
#print "Corrupt student t, ncg"
#t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
#corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
#m = GPy.models.GPRegression(X, Y, kernel5, likelihood=corrupt_stu_t_likelihood)
#m.ensure_default_constraints()
#m.update_likelihood_approximation()
#m.optimize()
#print(m)
#plt.subplot(223)
#m.plot()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.title('Student-t ncg corrupt')
###with a student t distribution, since it has heavy tails it should work well
###likelihood_function = student_t(deg_free=deg_free, sigma2=real_var)
###lap = Laplace(Y, likelihood_function)
###cov = kernel.K(X)
###lap.fit_full(cov)
###test_range = np.arange(0, 10, 0.1)
###plt.plot(test_range, t_rv.pdf(test_range))
###for i in xrange(X.shape[0]):
###mode = lap.f_hat[i]
###covariance = lap.hess_hat_i[i,i]
###scaling = np.exp(lap.ln_z_hat)
###normalised_approx = norm(loc=mode, scale=covariance)
###print "Normal with mode %f, and variance %f" % (mode, covariance)
###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
###plt.show()
#test_range = np.arange(0, 10, 0.1)
#plt.plot(test_range, t_rv.pdf(test_range))
#for i in xrange(X.shape[0]):
#mode = lap.f_hat[i]
#covariance = lap.hess_hat_i[i,i]
#scaling = np.exp(lap.ln_z_hat)
#normalised_approx = norm(loc=mode, scale=covariance)
#print "Normal with mode %f, and variance %f" % (mode, covariance)
#plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
#plt.show()
return m
@ -625,7 +596,7 @@ def gaussian_f_check():
#kernelst += GPy.kern.bias(X.shape[1])
N, D = X.shape
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=0.1, N=N, D=D)
g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution)
m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood)
m.likelihood.X = X
#m['rbf_v'] = mgp._get_params()[0]
@ -702,7 +673,7 @@ def boston_example():
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
N, D = Y_train.shape
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution, opt='rasm')
g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
mg.ensure_default_constraints()
mg.constrain_positive('noise_variance')
@ -729,7 +700,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5)
@ -755,7 +726,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5)
@ -782,7 +753,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5)
@ -808,7 +779,7 @@ def boston_example():
print "Student-T GP {}df".format(deg_free)
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints()
mstu_t.constrain_fixed('white', 1e-5)

View file

@ -1,42 +1,42 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import scipy as sp
import GPy
from scipy.linalg import inv, cho_solve, det
from numpy.linalg import cond
from scipy.linalg import cho_solve
from likelihood import likelihood
from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet, dtrtrs
from ..util.linalg import mdot, jitchol, pddet
from scipy.linalg.lapack import dtrtrs
import random
from functools import partial
#import pylab as plt
from functools import partial as partial_func
class Laplace(likelihood):
"""Laplace approximation to a posterior"""
def __init__(self, data, noise_model, extra_data=None, opt='rasm'):
def __init__(self, data, noise_model, extra_data=None):
"""
Laplace Approximation
First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
then find the z^{prime} which allows this to be a normalised gaussian instead of a
non-normalized gaussian
Find the moments \hat{f} and the hessian at this point
(using Newton-Raphson) of the unnormalised posterior
Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
which makes a gaussian the same as the laplace approximation
Compute the GP variables (i.e. generate some Y^{squiggle} and
z^{squiggle} which makes a gaussian the same as the laplace
approximation to the posterior, but normalised
Arguments
---------
:data: array of data the likelihood function is approximating
:noise_model: likelihood function - subclass of noise_model
:extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
:opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data)
:param data: array of data the likelihood function is approximating
:type data: NxD
:param noise_model: likelihood function - subclass of noise_model
:type noise_model: noise_model
:param extra_data: additional data used by some likelihood functions,
for example survival likelihoods need censoring data
"""
self.data = data
self.noise_model = noise_model
self.extra_data = extra_data
self.opt = opt
#Inital values
self.N, self.D = self.data.shape
@ -48,6 +48,9 @@ class Laplace(likelihood):
likelihood.__init__(self)
def restart(self):
"""
Reset likelihood variables to their defaults
"""
#Initial values for the GP variables
self.Y = np.zeros((self.N, 1))
self.covariance_matrix = np.eye(self.N)
@ -55,11 +58,12 @@ class Laplace(likelihood):
self.Z = 0
self.YYT = None
self.old_a = None
self.old_Ki_f = None
def predictive_values(self, mu, var, full_cov):
if full_cov:
raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
raise NotImplementedError("Cannot make correlated predictions\
with an Laplace likelihood")
return self.noise_model.predictive_values(mu, var)
def _get_params(self):
@ -79,7 +83,10 @@ class Laplace(likelihood):
def _Kgradients(self):
"""
Gradients with respect to prior kernel parameters
Gradients with respect to prior kernel parameters dL_dK to be chained
with dK_dthetaK to give dL_dthetaK
:returns: dL_dK matrix
:rtype: Matrix (1 x num_kernel_params)
"""
dL_dfhat, I_KW_i = self._shared_gradients_components()
dlp = self.noise_model.dlik_df(self.data, self.f_hat)
@ -93,19 +100,25 @@ class Laplace(likelihood):
#Implicit
impl = mdot(dlp, dL_dfhat, I_KW_i)
#No longer required as we are computing these in the gp already otherwise we would take them away and add them back
#No longer required as we are computing these in the gp already
#otherwise we would take them away and add them back
#dL_dthetaK_imp = dK_dthetaK(impl, X)
#dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
#dL_dK = expl + impl
#No need to compute explicit as we are computing dZ_dK to account for the difference
#Between the K gradients of a normal GP, and the K gradients including the implicit part
#No need to compute explicit as we are computing dZ_dK to account
#for the difference between the K gradients of a normal GP,
#and the K gradients including the implicit part
dL_dK = impl
return dL_dK
def _gradients(self, partial):
"""
Gradients with respect to likelihood parameters
Gradients with respect to likelihood parameters (dL_dthetaL)
:param partial: Not needed by this likelihood
:type partial: lambda function
:rtype: array of derivatives (1 x num_likelihood_params)
"""
dL_dfhat, I_KW_i = self._shared_gradients_components()
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.data, self.f_hat)
@ -123,62 +136,51 @@ class Laplace(likelihood):
#Implicit
dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
#print "LIK: dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
return dL_dthetaL
def _compute_GP_variables(self):
"""
Generates data Y which would give the normal distribution identical to the laplace approximation
Generate data Y which would give the normal distribution identical
to the laplace approximation to the posterior, but normalised
GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
GPy expects a likelihood to be gaussian, so need to caluclate
the data Y^{\tilde} that makes the posterior match that found
by a laplace approximation to a non-gaussian likelihood but with
a gaussian likelihood
Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
due to the z rescaling.
Firstly,
The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
we wish to find the hessian \Sigma^{\tilde}
that has the same curvature but using our new simulated data Y^{\tilde}
i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
$$\tilde{Y} = \tilde{\Sigma} Hf$$
where
$$\tilde{\Sigma}^{-1} = H - K^{-1}$$
i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
$$\tilde{\Sigma} = W^{-1}$$
Secondly,
GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
So we can suck up any differences between that and our log marginal likelihood approximation
p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
which we want to optimize instead, by equating them and rearranging, the difference is added onto
the log p(y) that GPy optimizes by default
Thirdly,
Since we have gradients that depend on how we move f^{\hat}, we have implicit components
aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
gp.py code
"""
#Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
#dtritri -> L -> L_i
#dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
#((L.T*w)_i + I)f_hat = y_tilde
#L = jitchol(self.K)
#Li = chol_inv(L)
#Lt_W = L.T*self.W.T
#Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
#self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
#Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
Wi = 1.0/self.W
self.Sigma_tilde = np.diagflat(Wi)
Y_tilde = Wi*self.Ki_f + self.f_hat
#self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
#self.Wi_K_i = self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12))
self.Wi_K_i = self.W12BiW12
#self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below
self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
self.lik = self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data)
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
Z_tilde = (+ self.lik
- 0.5*self.ln_B_det
+ 0.5*self.ln_det_Wi_K
@ -201,54 +203,46 @@ class Laplace(likelihood):
"""
The laplace approximation algorithm, find K and expand hessian
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
:K: Covariance matrix
:param K: Covariance matrix evaluated at locations X
:type K: NxD matrix
"""
self.K = K.copy()
#Find mode
self.f_hat = {
'rasm': self.rasm_mode,
'ncg': self.ncg_mode,
'nelder': self.nelder_mode
}[self.opt](self.K)
self.f_hat = self.rasm_mode(self.K)
#Compute hessian and other variables at mode
self._compute_likelihood_variables()
#Compute fake variables replicating laplace approximation to posterior
self._compute_GP_variables()
def _compute_likelihood_variables(self):
"""
Compute the variables required to compute gaussian Y variables
"""
#At this point get the hessian matrix (or vector as W is diagonal)
self.W = -self.noise_model.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
#TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
#Do the computation again at f to get Ki_f which is useful
#b = self.W*self.f_hat + self.noise_model.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
#solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
#a = b - self.W_12*solve_chol
self.Ki_f = self.a
self.Ki_f = self.Ki_f
self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
#For det, |I + KW| == |I + W_12*K*W_12|
#self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
#self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
#self.ln_z_hat = (- 0.5*self.f_Ki_f
#- self.ln_I_KW_det
#+ self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data)
#)
return self._compute_GP_variables()
def _compute_B_statistics(self, K, W, a):
"""Rasmussen suggests the use of a numerically stable positive definite matrix B
"""
Rasmussen suggests the use of a numerically stable positive definite matrix B
Which has a positive diagonal element and can be easyily inverted
:K: Covariance matrix
:W: Negative hessian at a point (diagonal matrix)
:returns: (B, L)
:param K: Covariance matrix evaluated at locations X
:type K: NxD matrix
:param W: Negative hessian at a point (diagonal matrix)
:type W: Vector of diagonal values of hessian (1xN)
:param a: Matrix to calculate W12BiW12a
:type a: Matrix NxN
:returns: (W12BiW12, ln_B_det)
"""
if not self.noise_model.log_concave:
#print "Under 1e-10: {}".format(np.sum(W < 1e-10))
@ -265,74 +259,37 @@ class Laplace(likelihood):
W12BiW12= W_12*cho_solve((L, True), W_12*a)
ln_B_det = 2*np.sum(np.log(np.diag(L)))
return (W12BiW12, ln_B_det)
return W12BiW12, ln_B_det
def nelder_mode(self, K):
f = np.zeros((self.N, 1))
self.Ki, _, _, self.ln_K_det = pdinv(K)
def obj(f):
res = -1 * (self.noise_model.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
return float(res)
res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
f_new = res.x
return f_new[:, None]
def ncg_mode(self, K):
"""
Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
:K: Covariance matrix
:returns: f_mode
"""
self.Ki, _, _, self.ln_K_det = pdinv(K)
f = np.zeros((self.N, 1))
#FIXME: Can we get rid of this horrible reshaping?
#ONLY WORKS FOR 1D DATA
def obj(f):
res = -1 * (self.noise_model.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
- self.NORMAL_CONST)
return float(res)
def obj_grad(f):
res = -1 * (self.noise_model.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
return np.squeeze(res)
def obj_hess(f):
res = -1 * (np.diag(self.noise_model.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
return np.squeeze(res)
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
return f_hat[:, None]
def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
def rasm_mode(self, K, MAX_ITER=100):
"""
Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
Influenced by GPML (BSD) code, all errors are our own
:K: Covariance matrix
:MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
:returns: f_mode
:param K: Covariance matrix evaluated at locations X
:type K: NxD matrix
:param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
:type MAX_ITER: scalar
:returns: f_hat, mode on which to make laplace approxmiation
:rtype: NxD matrix
"""
#self.old_before_s = self.noise_model._get_params()
#print "before: ", self.old_before_s
#if self.old_before_s < 1e-5:
#old_Ki_f = np.zeros((self.N, 1))
#old_a = np.zeros((self.N, 1))
if self.old_a is None:
old_a = np.zeros((self.N, 1))
f = np.dot(K, old_a)
#Start f's at zero originally
if self.old_Ki_f is None:
old_Ki_f = np.zeros((self.N, 1))
f = np.dot(K, old_Ki_f)
else:
old_a = self.old_a.copy()
#Start at the old best point
old_Ki_f = self.old_Ki_f.copy()
f = self.f_hat.copy()
new_obj = -np.inf
old_obj = np.inf
def obj(a, f):
return -0.5*np.dot(a.T, f) + self.noise_model.link_function(self.data, f, extra_data=self.extra_data)
def obj(Ki_f, f):
return -0.5*np.dot(Ki_f.T, f) + self.noise_model.link_function(self.data, f, extra_data=self.extra_data)
difference = np.inf
epsilon = 1e-6
@ -340,42 +297,43 @@ class Laplace(likelihood):
rs = 0
i = 0
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
while difference > epsilon and i < MAX_ITER:
W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data)
W_f = W*f
grad = self.noise_model.dlik_df(self.data, f, extra_data=self.extra_data)
b = W_f + grad
#TODO!!!
W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
#solve_L = cho_solve((L, True), W_12*np.dot(K, b))
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
full_step_a = b - W12BiW12Kb
da = full_step_a - old_a
full_step_Ki_f = b - W12BiW12Kb
dKi_f = full_step_Ki_f - old_Ki_f
f_old = f.copy()
def inner_obj(step_size, old_a, da, K):
a = old_a + step_size*da
f = np.dot(K, a)
self.a = a.copy() # This is nasty, need to set something within an optimization though
def inner_obj(step_size, old_Ki_f, dKi_f, K):
Ki_f = old_Ki_f + step_size*dKi_f
f = np.dot(K, Ki_f)
# This is nasty, need to set something within an optimization though
self.Ki_f = Ki_f.copy()
self.f = f.copy()
return -obj(a, f)
return -obj(Ki_f, f)
i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
#new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
#Find the stepsize that minimizes the objective function using a brent line search
new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':30}).fun
f = self.f.copy()
a = self.a.copy()
Ki_f = self.Ki_f.copy()
#Optimize without linesearch
#f_old = f.copy()
#update_passed = False
#while not update_passed:
#a = old_a + step_size*da
#f = np.dot(K, a)
#Ki_f = old_Ki_f + step_size*dKi_f
#f = np.dot(K, Ki_f)
#old_obj = new_obj
#new_obj = obj(a, f)
#new_obj = obj(Ki_f, f)
#difference = new_obj - old_obj
##print "difference: ",difference
#if difference < 0:
@ -390,70 +348,18 @@ class Laplace(likelihood):
#else:
#update_passed = True
#old_Ki_f = self.Ki_f.copy()
#difference = abs(new_obj - old_obj)
#old_obj = new_obj.copy()
#difference = np.abs(np.sum(f - f_old))
difference = np.abs(np.sum(a - old_a))
#old_a = self.a.copy() #a
old_a = a.copy()
difference = np.abs(np.sum(Ki_f - old_Ki_f))
old_Ki_f = Ki_f.copy()
i += 1
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.old_a = old_a.copy()
#print "Positive difference obj: ", np.float(difference)
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
#print "Iterations: {}, Final_difference: {}".format(i, difference)
self.old_Ki_f = old_Ki_f.copy()
if difference > epsilon:
print "Not perfect f_hat fit difference: {}".format(difference)
if False:
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
if hasattr(self, 'X'):
import pylab as pb
pb.figure()
pb.subplot(311)
pb.title('old f_hat')
pb.plot(self.X, self.f_hat)
pb.subplot(312)
pb.title('old ff')
pb.plot(self.X, self.old_ff)
pb.subplot(313)
pb.title('new f_hat')
pb.plot(self.X, f)
pb.figure()
pb.subplot(121)
pb.title('old K')
pb.imshow(np.diagflat(self.old_K), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new K')
pb.imshow(np.diagflat(K), interpolation='none')
pb.colorbar()
pb.figure()
pb.subplot(121)
pb.title('old W')
pb.imshow(np.diagflat(self.old_W), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new W')
pb.imshow(np.diagflat(W), interpolation='none')
pb.colorbar()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
pb.close('all')
#FIXME: DELETE THESE
#self.old_W = W.copy()
#self.old_grad = grad.copy()
#self.old_B = B.copy()
#self.old_W_12 = W_12.copy()
#self.old_ff = f.copy()
#self.old_K = self.K.copy()
#self.old_s = self.noise_model._get_params()
#print "after: ", self.old_s
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.a = a
#self.B, self.B_chol, self.W_12 = B, L, W_12
#self.Bi, _, _, B_det = pdinv(self.B)
self.Ki_f = Ki_f
return f

View file

@ -180,7 +180,6 @@ class StudentT(NoiseDistribution):
#However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
true_var = sigma**2 + self.variance
print "True var: {}".format(true_var)
return true_var
def _predictive_mean_analytical(self, mu, var):

View file

@ -218,7 +218,7 @@ class LaplaceTests(unittest.TestCase):
print "\n{}".format(inspect.stack()[0][3])
self.Y = self.Y/self.Y.max()
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss, opt='rasm')
gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace)
m.ensure_default_constraints()
m.randomize()
@ -230,7 +230,7 @@ class LaplaceTests(unittest.TestCase):
self.Y = self.Y/self.Y.max()
self.stu_t = GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var)
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
m.ensure_default_constraints()
m.constrain_positive('t_noise')
@ -244,7 +244,7 @@ class LaplaceTests(unittest.TestCase):
self.Y = self.Y/self.Y.max()
white_var = 1
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
m.ensure_default_constraints()
m.constrain_positive('t_noise')
@ -259,7 +259,7 @@ class LaplaceTests(unittest.TestCase):
self.Y = self.Y/self.Y.max()
white_var = 1
kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
m.ensure_default_constraints()
m.constrain_positive('t_noise')