an afternoon's work on the Laplace approximation

James Hensman 2014-02-05 16:23:35 +00:00
parent 2ff7e286ee
commit f653bc430e
6 changed files with 120 additions and 225 deletions

View file

@@ -55,9 +55,9 @@ class GP(Model):
         self.add_parameter(self.kern)
         self.add_parameter(self.likelihood)
         self.parameters_changed()

     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
         self._dL_dK = grad_dict['dL_dK']
@@ -65,9 +65,6 @@ class GP(Model):
     def log_likelihood(self):
         return self._log_marginal_likelihood

-    def dL_dtheta_K(self):
-        return self.kern.dK_dtheta(self.posterior.dL_dK, self.X)
-
     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
         Internal helper function for making predictions, does not account
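After this change `GP.parameters_changed` is a thin shell: the inference object owns the posterior, the log marginal likelihood and a dictionary of gradients. A toy sketch of that three-part contract — `ToyRBF`, `ToyGaussianLikelihood` and `ToyExactInference` are hypothetical stand-ins, not GPy classes:

```python
import numpy as np

class ToyRBF(object):
    def K(self, X):
        # X is (N, 1); unit-lengthscale, unit-variance RBF
        return np.exp(-0.5 * (X - X.T)**2)

class ToyGaussianLikelihood(object):
    variance = 0.1

class ToyExactInference(object):
    def inference(self, kern, X, likelihood, Y):
        N = Y.shape[0]
        Ky = kern.K(X) + likelihood.variance * np.eye(N)
        L = np.linalg.cholesky(Ky)
        alpha = np.linalg.solve(L.T, np.linalg.solve(L, Y))
        # standard Gaussian log marginal (R&W 2006, eq. 2.30)
        log_marginal = (-0.5 * float(Y.T.dot(alpha))
                        - np.log(np.diag(L)).sum()
                        - 0.5 * N * np.log(2 * np.pi))
        Kyi = np.linalg.solve(L.T, np.linalg.solve(L, np.eye(N)))
        dL_dK = 0.5 * (alpha.dot(alpha.T) - Kyi)
        posterior = {'mean_weights': alpha}   # stand-in for the Posterior object
        return posterior, log_marginal, {'dL_dK': dL_dK}

X = np.linspace(0., 1., 5)[:, None]
Y = np.sin(6. * X)
post, lml, grads = ToyExactInference().inference(ToyRBF(), X, ToyGaussianLikelihood(), Y)
```

The `dL_dK` entry is the piece the model chains into kernel gradients, which is exactly what the Laplace file below does with `kern.update_gradients_full(dL_dK, X)`.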

View file

@@ -325,7 +325,6 @@ class Model(Parameterized):
             if self._fail_count >= self._allowed_failures:
                 raise e
             self._fail_count += 1
-        import ipdb;ipdb.set_trace()
         obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
         return obj_grads

View file

@@ -74,6 +74,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         self.size = sum(p.size for p in self._parameters_)
         if not self._has_fixes():
             self._fixes_ = None
+        self._param_slices_ = []
         self._connect_parameters()
         self._added_names_ = set()
         del self._in_init_
@@ -213,7 +214,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
             return
         i = 0
         sizes = [0]
-        self._param_slices_ = []
         for p in self._parameters_:
             p._direct_parent_ = self
             p._highest_parent_ = self
@@ -315,7 +315,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         return n

     def _get_params(self):
         # don't overwrite this anymore!
-        return numpy.hstack([x._get_params() for x in self._parameters_])
+        return numpy.hstack([x._get_params() for x in self._parameters_ if x.size>0])

     def _set_params(self, params, update=True):
         # don't overwrite this anymore!
         [p._set_params(params[s], update=update) for p,s in itertools.izip(self._parameters_,self._param_slices_)]
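Moving `_param_slices_ = []` up into `__init__` reads as a guard: `_connect_parameters` can return early, leaving the attribute undefined for the `izip` in `_set_params`. A hypothetical reduction of that failure mode (not GPy code):

```python
# parameters here are just integer sizes, purely for illustration
class Broken(object):
    def __init__(self, parameters):
        self._parameters_ = parameters
        self._connect_parameters()

    def _connect_parameters(self):
        if not self._parameters_:
            return                       # early exit: slices never created
        self._param_slices_ = [slice(0, n) for n in self._parameters_]

class Fixed(Broken):
    def __init__(self, parameters):
        self._param_slices_ = []         # always defined, as the diff does
        Broken.__init__(self, parameters)

print(hasattr(Broken([]), '_param_slices_'))  # False: downstream izip would crash
print(hasattr(Fixed([]), '_param_slices_'))   # True
```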

View file

@@ -42,8 +42,8 @@ class SparseGP(GP):
             raise NotImplementedError, "what to do what to do?"
             print "defaulting to ", inference_method, "for latent function inference"

-        self.Z = Param('inducing inputs', Z)
+        self.Z = Z
         self.num_inducing = Z.shape[0]

         if not (X_variance is None):
@@ -52,10 +52,8 @@ class SparseGP(GP):
         GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
+        self.Z = Param('inducing inputs', self.Z)
         self.add_parameter(self.Z, index=0)
-        self.add_parameter(self.kern)
-        self.add_parameter(self.likelihood)

     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
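The reshuffle lets `GP.__init__` register the kernel and likelihood itself; `SparseGP` then wraps `Z` as a `Param` afterwards and prepends it with `index=0`, so the inducing inputs occupy the first slice of the flattened parameter vector. A hypothetical reduction of that ordering (not GPy's real `add_parameter`):

```python
class ParamList(object):
    def __init__(self):
        self.params = []

    def add_parameter(self, p, index=None):
        if index is None:
            self.params.append(p)
        else:
            self.params.insert(index, p)   # index=0 puts Z ahead of kern/likelihood

m = ParamList()
m.add_parameter('kern')        # done inside GP.__init__ after this commit
m.add_parameter('likelihood')
m.add_parameter('Z', index=0)  # prepended by SparseGP afterwards
assert m.params == ['Z', 'kern', 'likelihood']
```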

View file

@@ -11,7 +11,8 @@
 #http://gaussianprocess.org/gpml/code.
 import numpy as np
-from ...util.linalg import mdot, jitchol, pddet, dpotrs
+from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs
+from ...util.misc import param_to_array
 from functools import partial as partial_func
 from posterior import Posterior
 import warnings
@@ -27,7 +28,6 @@ class LaplaceInference(object):
         (using Newton-Raphson) of the unnormalised posterior
         """
-        self.NORMAL_CONST = (0.5 * np.log(2 * np.pi))
         self._mode_finding_tolerance = 1e-7
         self._mode_finding_max_iter = 40
@@ -39,58 +39,133 @@ class LaplaceInference(object):
         Returns a Posterior class containing essential quantities of the posterior
         """
+        #make Y a normal array!
+        Y = param_to_array(Y)
+
         # Compute K
         K = kern.K(X)

         #Find mode
         if self.bad_fhat:
-            Ki_f_init = np.random.randn(*Y.shape)/50
+            Ki_f_init = np.zeros_like(Y)
         else:
             Ki_f_init = self._previous_Ki_fhat
-        self.f_hat, self._previous_Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
-        stop
+        f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)

         #Compute hessian and other variables at mode
-        self._compute_likelihood_variables()
-        likelihood.gradient = self.likelihood_gradients()
-        dL_dK = self._Kgradients()
-        kern.update_gradients_full(dL_dK)
-        return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
+        log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, Y_metadata)
+        #likelihood.gradient = self.likelihood_gradients()
+        kern.update_gradients_full(dL_dK, X)
+        self._previous_Ki_fhat = Ki_fhat.copy()
+        return Posterior(woodbury_vector=woodbury_vector, woodbury_inv=K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK}

-    def _shared_gradients_components(self):
-        """
-        A helper function to compute some common quantities
-        """
-        d3lik_d3fhat = likelihood.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
-        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
-        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
-        return dL_dfhat, I_KW_i
-
-    def _Kgradients(self):
-        """
-        Gradients with respect to prior kernel parameters dL_dK, to be chained
-        with dK_dthetaK to give dL_dthetaK
-        :returns: dL_dK matrix
-        :rtype: Matrix (1 x num_kernel_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
-        #Explicit
-        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        expl_b = self.Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
-        dL_dthetaK_exp = dK_dthetaK(expl, X)
-        #Implicit
-        impl = mdot(dlp, dL_dfhat, I_KW_i)
-        dL_dK = expl + impl
-        return dL_dK
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
+        """
+        Rasmussen's numerically stable mode finding.
+        For nomenclature see Rasmussen & Williams 2006.
+        Influenced by the GPML (BSD) code; all errors are our own.
+
+        :param K: covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param Y: the data
+        :type Y: np.ndarray
+        :param likelihood: the likelihood of the latent function value for the given data
+        :type likelihood: a GPy.likelihood object
+        :param Ki_f_init: the initial guess at the mode
+        :type Ki_f_init: np.ndarray
+        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
+        :type Y_metadata: np.ndarray | None
+        :returns: f_hat, the mode on which to make the Laplace approximation
+        :rtype: np.ndarray
+        """
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
+
+        #define the objective function (to be maximised)
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
+
+        difference = np.inf
+        iteration = 0
+        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
+            W_f = W*f
+            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
+            b = W_f + grad # R&W p46, line 6.
+            #W12BiW12Kb, B_logdet = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
+            W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+            W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb # full_step_Ki_f = a in R&W p46, line 6.
+            dKi_f = full_step_Ki_f - Ki_f
+
+            #define an objective for the line search (minimize this one)
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                return -obj(Ki_f_trial, f_trial)
+
+            #use scipy for the line search, then compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
+            iteration += 1
+
+        #Warn of bad fits
+        if difference > self._mode_finding_tolerance:
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+                self.bad_fhat = True
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now fine again")
+
+        return f, Ki_f
+
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, Y_metadata):
+        """
+        At the mode, compute the hessian and effective covariance matrix.
+
+        :returns: logZ, the approximation to the log marginal likelihood,
+                  and the quantities needed for the posterior and its gradients
+        """
+        #At this point get the hessian matrix (or vector, as W is diagonal)
+        W = -likelihood.d2logpdf_df2(f_hat, Y, extra_data=Y_metadata)
+        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        #compute vital matrices
+        C = np.dot(LiW12, K)
+        Ki_W_i = K - C.T.dot(C)
+
+        #compute the log marginal
+        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))
+
+        #compute dL_dK: explicit part
+        explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
+
+        #Implicit part
+        d3lik_d3fhat = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata)
+        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*d3lik_d3fhat) #why isn't this -0.5? s2 in R&W p126, line 9.
+        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
+        implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
+
+        dL_dK = explicit_part + implicit_part
+        return log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector
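A note on what the new rasm_mode is doing: it never forms K^{-1}. The iteration runs in the transformed variable a = K^{-1}f, and the only factorisation is a Cholesky of the well-conditioned B = I + W^{1/2}KW^{1/2}. A self-contained sketch of the same scheme (Rasmussen & Williams 2006, Algorithm 3.1) for a Bernoulli likelihood with a logistic link; this is toy code under those assumptions, not the GPy implementation:

```python
import numpy as np

def laplace_mode(K, y, max_iter=40, tol=1e-7):
    """Mode of p(f|y) for a Bernoulli likelihood with logistic link, y in {0, 1}.

    Follows the numerically stable iteration of R&W 2006, Algorithm 3.1:
    work in a = K^{-1} f and factorise only B = I + W^{1/2} K W^{1/2}.
    """
    N = y.shape[0]
    f = np.zeros(N)
    a = np.zeros(N)                      # a = K^{-1} f
    for _ in range(max_iter):
        pi = 1.0 / (1.0 + np.exp(-f))    # Bernoulli mean
        W = pi * (1.0 - pi)              # -d2 log p(y|f)/df2 (diagonal)
        sqW = np.sqrt(W)
        L = np.linalg.cholesky(np.eye(N) + sqW[:, None] * K * sqW[None, :])
        b = W * f + (y - pi)             # R&W p46, line 6
        # a_new = b - W^{1/2} B^{-1} W^{1/2} K b, via two triangular solves
        v = np.linalg.solve(L, sqW * K.dot(b))
        a_new = b - sqW * np.linalg.solve(L.T, v)
        f_new = K.dot(a_new)
        difference = np.abs(f_new - f).sum() + np.abs(a_new - a).sum()
        f, a = f_new, a_new
        if difference < tol:
            break
    return f, a

# toy problem: 1-D inputs, RBF kernel, step-function labels
X = np.linspace(-3., 3., 30)[:, None]
K = np.exp(-0.5 * (X - X.T)**2) + 1e-8 * np.eye(30)
y = (X[:, 0] > 0).astype(float)
f_hat, a_hat = laplace_mode(K, y)
```

The committed version layers a Brent line search over the raw Newton step (the inner_obj/optimize.brent machinery above) and reuses the Cholesky factor of B: since |B| = |I + W^{1/2}KW^{1/2}|, the -sum(log(diag(L))) term in mode_computations is exactly -0.5*log|B|, the determinant part of the Laplace log marginal.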
     def likelihood_gradients(self):
         """
@@ -118,24 +193,7 @@ class LaplaceInference(object):
         return dL_dthetaL

-    def _compute_likelihood_variables(self):
-        """
-        At the mode, compute the hessian and effective covariance matrix.
-        """
-        #At this point get the hessian matrix (or vector, as W is diagonal)
-        self.W = -likelihood.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
-        if not likelihood.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
-            self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
-        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N), likelihood.log_concave)
-        self.Ki_f = self.Ki_f
-        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
-
-    def _compute_B_statistics(self, K, W, a, log_concave):
+    def _compute_B_statistics(self, K, W, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B,
         which has a positive diagonal and can be easily inverted
@@ -144,9 +202,7 @@ class LaplaceInference(object):
         :type K: NxN matrix
         :param W: Negative hessian at a point (diagonal matrix)
         :type W: Vector of diagonal values of hessian (1xN)
-        :param a: Matrix to calculate W12BiW12a
-        :type a: Matrix NxN
-        :returns: (W12BiW12, ln_B_det)
+        :returns: (W12BiW12, L_B, Li_W12)
         """
         if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
@@ -155,169 +211,13 @@ class LaplaceInference(object):
             # To cause the posterior to become less certain than the prior and likelihood,
             # This is a property only held by non-log-concave likelihoods

         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         B = np.eye(K.shape[0]) + W_12*K*W_12.T
         L = jitchol(B)
-        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
-        ln_B_det = 2.*np.sum(np.log(np.diag(L)))
-        return W12BiW12a, ln_B_det
-
-    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
-        """
-        Rasmussen's numerically stable mode finding.
-        For nomenclature see Rasmussen & Williams 2006.
-        Influenced by the GPML (BSD) code; all errors are our own.
-
-        :param K: covariance matrix evaluated at locations X
-        :type K: NxN matrix
-        :param Y: the data
-        :type Y: np.ndarray
-        :param likelihood: the likelihood of the latent function value for the given data
-        :type likelihood: a GPy.likelihood object
-        :param Ki_f_init: the initial guess at the mode
-        :type Ki_f_init: np.ndarray
-        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
-        :type Y_metadata: np.ndarray | None
-        :returns: f_hat, the mode on which to make the Laplace approximation
-        :rtype: np.ndarray
-        """
-        ##Start f's at zero originally, or if we have gone off track, try restarting
-        #if self.old_Ki_f is None or self.bad_fhat:
-            #old_Ki_f = np.random.rand(self.N, 1)/50.0
-            ##old_Ki_f = self.Y
-            #f = np.dot(K, old_Ki_f)
-        #else:
-            ##Start at the old best point
-            #old_Ki_f = self.old_Ki_f.copy()
-            #f = self.f_hat.copy()
-        Ki_f = Ki_f_init.copy()
-        f = np.dot(K, Ki_f)
-
-        #define the objective function (to be maximised)
-        def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
-
-        difference = np.inf
-        i = 0
-        while difference > self._mode_finding_tolerance and i < self._mode_finding_max_iter:
-            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
-            W_f = W*f
-            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
-            b = W_f + grad
-            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
-            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
-            full_step_Ki_f = b - W12BiW12Kb
-            dKi_f = full_step_Ki_f - Ki_f
-
-            #define an objective for the line search
-            def inner_obj(step_size):
-                Ki_f_trial = Ki_f + step_size*dKi_f
-                f_trial = np.dot(K, Ki_f_trial)
-                print -obj(Ki_f_trial, f_trial),
-                return -obj(Ki_f_trial, f_trial)
-
-            #use scipy for the line search, then compute new values of f, Ki_f
-            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
-            Ki_f_new = Ki_f + step*dKi_f
-            f_new = np.dot(K, Ki_f_new)
-            print ""
-            print obj(Ki_f, f), obj(Ki_f_new, f_new), step
-            print ""
-            #i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
-            #Find the stepsize that minimizes the objective function using a brent line search
-            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
-            #steps than get this exact, then make a step; if B was bigger it might be the other way around though
-            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
-            #f = self.tmp_f.copy()
-            #Ki_f = self.tmp_Ki_f.copy()
-            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
-            Ki_f = Ki_f_new
-            f = f_new
-            i += 1
-
-        #Warn of bad fits
-        if difference > self._mode_finding_tolerance:
-            if not self.bad_fhat:
-                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
-                self.bad_fhat = True
-        elif self.bad_fhat:
-            self.bad_fhat = False
-            warnings.warn("f_hat now fine again")
-
-        return f, Ki_f
-
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the Laplace approximation to the posterior, but normalised.
-
-        GPy expects a likelihood to be gaussian, so we need to calculate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a Laplace approximation to a non-gaussian likelihood, but with
-        a gaussian likelihood.
-
-        Firstly,
-        the curvature of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}), but this assumes a non-gaussian likelihood;
-        we wish to find the covariance \Sigma^{\tilde} that has the same curvature
-        but uses our new simulated data Y^{\tilde},
-        i.e. we set N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and solve for Y^{\tilde} and \Sigma^{\tilde}.
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{\tilde} = W^{-1}.
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K + \Sigma^{\tilde}| - 0.5*Y^{\tilde T}(K + \Sigma^{\tilde})^{-1}Y^{\tilde} + lik.Z,
-        so we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat T}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log(|K||K^{-1} + W|),
-        which we want to optimize instead; by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default.
-
-        Thirdly,
-        since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        as well as the explicit dL_dK; we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code.
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = likelihood.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST)
-
-        #Convert to float as it's (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK, which is how the approximated distribution's gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients' explicit part there's no need to compute this again
+        LiW12, _ = dtrtrs(L, np.diag(W_12[:,0]), lower=1, trans=0)
+        K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p126, eq 5.25
+        return K_Wi_i, L, LiW12
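_compute_B_statistics now hands back reusable factors — K_Wi_i = W^{1/2}B^{-1}W^{1/2}, the Cholesky factor L, and LiW12 = L^{-1}W^{1/2} — instead of a one-shot product with a supplied matrix `a`. The identities behind those returns are easy to check numerically; a sketch with plain numpy standing in for GPy's `jitchol`/`dtrtrs`:

```python
import numpy as np

# Identity 1:  W^{1/2} B^{-1} W^{1/2} = (K + W^{-1})^{-1},  B = I + W^{1/2} K W^{1/2}
rng = np.random.RandomState(0)
A = rng.randn(5, 5)
K = A.dot(A.T) + 0.5 * np.eye(5)       # a positive definite "covariance"
W = rng.rand(5) + 0.1                  # diagonal of the negative hessian

sqW = np.sqrt(W)
B = np.eye(5) + sqW[:, None] * K * sqW[None, :]
L = np.linalg.cholesky(B)
LiW12 = np.linalg.solve(L, np.diag(sqW))   # plays the role of dtrtrs(L, diag(W_12))
K_Wi_i = LiW12.T.dot(LiW12)                # W^{1/2} B^{-1} W^{1/2}
assert np.allclose(K_Wi_i, np.linalg.inv(K + np.diag(1.0 / W)))

# Identity 2: the same pieces give (K^{-1} + W)^{-1} = K - (LiW12 K)^T (LiW12 K),
# exactly how mode_computations builds Ki_W_i from C = np.dot(LiW12, K)
C = LiW12.dot(K)
Ki_W_i = K - C.T.dot(C)
assert np.allclose(Ki_W_i, np.linalg.inv(np.linalg.inv(K) + np.diag(W)))
```

The first identity is why K_Wi_i can be passed straight to the Posterior as woodbury_inv; the second is how mode_computations obtains the effective posterior covariance without ever inverting K.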

View file

@@ -48,6 +48,7 @@ class Posterior(object):
         if ((woodbury_chol is not None) and (woodbury_vector is not None))\
            or ((woodbury_inv is not None) and (woodbury_vector is not None))\
+           or ((woodbury_inv is not None) and (mean is not None))\
            or ((mean is not None) and (cov is not None)):
             pass # we have sufficient to compute the posterior
         else:
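The extra clause accepts a (woodbury_inv, mean) pairing as sufficient to define a posterior, alongside the (woodbury_vector, woodbury_inv) pairing the Laplace code now supplies. The point of standardising on these quantities is that prediction becomes likelihood-agnostic; a sketch of the usual predictive equations in those terms (`predict_from_woodbury` and `kern_K` are illustrative names, not GPy's API):

```python
import numpy as np

def predict_from_woodbury(kern_K, X, Xnew, woodbury_vector, woodbury_inv):
    """Sketch: GP predictive mean/variance from the Woodbury quantities.

    mean = K_{*x} woodbury_vector
    var  = diag(K_{**} - K_{*x} woodbury_inv K_{x*})
    For the Laplace case, woodbury_vector = dlog p(y|f_hat)/df and
    woodbury_inv = (K + W^{-1})^{-1}, matching R&W 2006, eqs 3.21 and 3.24.
    """
    Kxs = kern_K(Xnew, X)                               # (M, N) cross-covariance
    mean = Kxs.dot(woodbury_vector)
    var = (np.diag(kern_K(Xnew, Xnew))
           - np.sum(Kxs.dot(woodbury_inv) * Kxs, axis=1))[:, None]
    return mean, var
```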