from functools import partial

import numpy as np
import scipy as sp
import scipy.optimize  # `import scipy` alone does not load the optimize submodule
from scipy.stats import norm

import GPy
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import jitchol, mdot, pdinv
|
class Laplace(likelihood):
    """Laplace approximation to a posterior with a non-Gaussian likelihood."""

    def __init__(self, data, likelihood_function):
        """
        Laplace Approximation

        First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
        then find the z^{prime} which allows this to be a normalised gaussian instead of a
        non-normalized gaussian

        Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
        which makes a gaussian the same as the laplace approximation

        :param data: observed outputs, array of shape (N, D)
        :param likelihood_function: non-Gaussian likelihood object providing
            link_function, link_grad and link_hess (log-likelihood and its
            first/second derivatives w.r.t. f)
        """
        self.data = data
        self.likelihood_function = likelihood_function

        # Initial values
        self.N, self.D = self.data.shape

    def _compute_GP_variables(self):
        """
        Generate the pseudo-data which would give a Gaussian likelihood whose
        posterior matches the Laplace approximation.

        GPy expects a likelihood to be gaussian, so we calculate the points
        Y_tilde and Z_tilde that make the posterior match that found by a
        laplace approximation to a non-gaussian likelihood.

        Given we are approximating $p(y|f)p(f)$ with a normal distribution
        (given $p(y|f)$ is not normal) then we have a rescaled normal
        distribution z*N(f|f_hat, hess_hat^-1) with the same area as
        p(y|f)p(f) due to the z rescaling.

        This function finds the data D=(Y_tilde, X) that would produce
        z*N(f|f_hat, hess_hat^-1), giving a normal approximation of
        z_tilde*p(Y_tilde|f,X)p(f), via:

        $$\tilde{Y} = \tilde{\Sigma} H \hat{f}$$
        where
        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
        i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log p(y|f))$$
        since $diag(\nabla\nabla \log p(y|f)) = H - K^{-1}$
        and
        $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^T\tilde{\Sigma}^{-1}\tilde{Y}$$
        """
        # Sigma_tilde^{-1} = H - K^{-1}  (the diagonal likelihood curvature)
        Sigma_tilde_i = self.hess_hat - self.Ki
        (self.Sigma_tilde, _, _, _) = pdinv(Sigma_tilde_i)

        # Y_tilde = Sigma_tilde * H * f_hat  (f_hat comes back 1-d from the optimizer)
        f_hat = self.f_hat[:, None]
        self.Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, f_hat)

        # ln z_tilde = ln z + (N/2) ln 2pi + (1/2) Y_tilde^T Sigma_tilde^{-1} Y_tilde
        ln_z_tilde = (np.log(self.z_hat)
                      + 0.5 * self.N * np.log(2 * np.pi)
                      + 0.5 * float(mdot(self.Y_tilde.T, Sigma_tilde_i, self.Y_tilde)))
        self.Z = np.exp(ln_z_tilde)

        # TODO(review): still need to expose the attributes GPy's gaussian
        # likelihood interface expects (Y, YYT, covariance_matrix, precision)
        # once their exact contract is confirmed against the GP code.
        return self.Y_tilde, self.Sigma_tilde, self.Z

    def fit_full(self, K):
        """
        The laplace approximation algorithm.

        For nomenclature see Rasmussen & Williams 2006 (chapter 3).

        :param K: prior covariance matrix, shape (N, N)
        :returns: result of :meth:`_compute_GP_variables`
        """
        f = np.zeros((self.N, 1))
        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
        # Constant part of the log posterior; kept outside the objective so it
        # is not recomputed every optimizer step.
        obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))

        # Find \hat{f} using a newton raphson optimizer for example
        # TODO: Add newton-raphson as subclass of optimizer class
        # FIXME: Can we get rid of this horrible reshaping?
        def obj(f):
            # Negative (unnormalised) log posterior: -[log p(y|f) - 1/2 f^T K^{-1} f + const]
            f = f[:, None]
            res = -1 * (self.likelihood_function.link_function(self.data, f)
                        - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant)
            return float(res)

        def obj_grad(f):
            # Gradient of the negative log posterior.
            f = f[:, None]
            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f))
            return np.squeeze(res)

        def obj_hess(f):
            # Hessian of the negative log posterior (likelihood curvature is diagonal).
            f = f[:, None]
            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki)
            return np.squeeze(res)

        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
        print(self.f_hat)  # debug output of the mode

        # At this point get the hessian matrix (evaluated at the mode)
        self.hess_hat = obj_hess(self.f_hat)

        # Need to add the constant as we previously were trying to avoid
        # computing it (seems like a small overhead though...)
        self.height_unnormalised = -1 * obj(self.f_hat)  # FIXME: Is it - obj constant and *-1?

        # z_hat is how much we need to scale the normal distribution by to get
        # the area of our approximation close to the area of p(f)p(y|f); we do
        # this by matching the height of the distributions at the mode.
        # slogdet avoids the overflow of log(det(H)) for large N; H should be
        # positive definite at a maximum so the sign is ignored.
        _, log_hess_det = np.linalg.slogdet(self.hess_hat)
        self.z_hat = np.exp(-0.5 * log_hess_det + self.height_unnormalised)

        return self._compute_GP_variables()
|