# GPy/GPy/likelihoods/laplace.py

import numpy as np
import scipy as sp
from scipy.linalg import cho_solve
from likelihood import likelihood
from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet, dtrtrs
from functools import partial
#import pylab as plt
class Laplace(likelihood):
"""Laplace approximation to a posterior"""
def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
"""
Laplace Approximation
First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
then find the z^{prime} which allows this to be a normalised gaussian instead of a
non-normalized gaussian
Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
which makes a gaussian the same as the laplace approximation
Arguments
---------
:data: array of data the likelihood function is approximating
:likelihood_function: likelihood function - subclass of likelihood_function
:extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
:opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data)
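
        Example (sketch only - `student_t` here stands for any concrete
        subclass of likelihood_function; its constructor arguments are
        hypothetical)::

            noise_model = student_t(deg_free=5.)
            laplace_lik = Laplace(Y, noise_model, opt='rasm')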
"""
self.data = data
self.likelihood_function = likelihood_function
self.extra_data = extra_data
self.opt = opt
        #Initial values
self.N, self.D = self.data.shape
self.is_heteroscedastic = True
self.Nparams = 0
self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
self.restart()
def restart(self):
#Initial values for the GP variables
self.Y = np.zeros((self.N, 1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)[:, None]
self.Z = 0
self.YYT = None
self.old_a = None
def predictive_values(self, mu, var, full_cov):
if full_cov:
            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
return self.likelihood_function.predictive_values(mu, var)
def _get_params(self):
return np.asarray(self.likelihood_function._get_params())
def _get_param_names(self):
return self.likelihood_function._get_param_names()
def _set_params(self, p):
return self.likelihood_function._set_params(p)
def _shared_gradients_components(self):
d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat, extra_data=self.extra_data)
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
return dL_dfhat, I_KW_i
def _Kgradients(self, dK_dthetaK, X):
"""
Gradients with respect to prior kernel parameters
"""
dL_dfhat, I_KW_i = self._shared_gradients_components()
dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
#Explicit
expl_a = np.dot(self.Ki_f, self.Ki_f.T)
expl_b = self.Wi_K_i
expl = 0.5*expl_a - 0.5*expl_b
dL_dthetaK_exp = dK_dthetaK(expl, X)
#Implicit
impl = mdot(dlp, dL_dfhat, I_KW_i)
dL_dthetaK_imp = dK_dthetaK(impl, X)
#print "K: dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
return dL_dthetaK
def _gradients(self, partial):
"""
Gradients with respect to likelihood parameters
"""
dL_dfhat, I_KW_i = self._shared_gradients_components()
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
num_params = len(dlik_dthetaL)
# make space for one derivative for each likelihood parameter
dL_dthetaL = np.zeros(num_params)
for thetaL_i in range(num_params):
#Explicit
dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
#- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
+ np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[thetaL_i])
)
#Implicit
dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
#print "LIK: dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
        return dL_dthetaL  # array with one entry per likelihood parameter being optimised; for Student-t, one parameter, so shape (1,)
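    # Gradient structure used in the two methods above (cf. Rasmussen & Williams
    # 2006, section 5.5.1): the Laplace marginal likelihood depends on a
    # parameter theta both explicitly and implicitly through the mode f_hat, so
    #   dL/dtheta = dL/dtheta|_explicit + (dL/df_hat)^T df_hat/dtheta
    # where df_hat/dtheta = (I + KW)^{-1} K d(dlog p(y|f_hat)/df)/dtheta.
    # Since (I - K(K + W^{-1})^{-1}) K = (I + KW)^{-1} K, the `I_KW_i` term
    # implements this without forming K^{-1}.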
def _compute_GP_variables(self):
"""
Generates data Y which would give the normal distribution identical to the laplace approximation
GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
due to the z rescaling.
at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
$$\tilde{Y} = \tilde{\Sigma} Hf$$
where
$$\tilde{\Sigma}^{-1} = H - K^{-1}$$
i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
$$\tilde{\Sigma} = W^{-1}$$
"""
#Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
#dtritri -> L -> L_i
#dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
#((L.T*w)_i + I)f_hat = y_tilde
#L = jitchol(self.K)
#Li = chol_inv(L)
#Lt_W = L.T*self.W.T
#Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
#self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
#Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
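        # The live computation below follows the docstring: with H_hat = K^{-1} + W,
        #   Y_tilde = Sigma_tilde H_hat f_hat = W^{-1}(K^{-1} + W) f_hat
        #           = W^{-1} K^{-1} f_hat + f_hat = Wi*Ki_f + f_hat,
        # using Ki_f = K^{-1} f_hat computed by the mode finder.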
Wi = 1.0/self.W
self.Sigma_tilde = np.diagflat(Wi)
Y_tilde = Wi*self.Ki_f + self.f_hat
        #equivalently: self.Wi_K_i = self.W_12*self.Bi*self.W_12.T - Rasmussen's R = (K + W^{-1})^{-1}
self.Wi_K_i = self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12))
#self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below
self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
Z_tilde = (+ self.lik
- 0.5*self.ln_B_det
+ 0.5*self.ln_det_Wi_K
- 0.5*self.f_Ki_f
+ 0.5*self.y_Wi_Ki_i_y
)
        #Convert to float as it's (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
self.Y = Y_tilde
self.YYT = np.dot(self.Y, self.Y.T)
self.covariance_matrix = self.Sigma_tilde
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
def fit_full(self, K):
"""
The laplace approximation algorithm, find K and expand hessian
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
:K: Covariance matrix
"""
self.K = K.copy()
#Find mode
self.f_hat = {
'rasm': self.rasm_mode,
'ncg': self.ncg_mode,
'nelder': self.nelder_mode
}[self.opt](self.K)
#Compute hessian and other variables at mode
self._compute_likelihood_variables()
def _compute_likelihood_variables(self):
        #At this point get the Hessian matrix (or rather a vector, since W is diagonal)
        self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
        if not self.likelihood_function.log_concave:
            #print "Under 1e-6: {}".format(np.sum(self.W < 1e-6))
            self.W[self.W < 1e-6] = 1e-6
            # FIXME-HACK: clipping is a hack, since GPy can't handle the negative
            # variances which can occur when the likelihood is not log-concave.
            # A negative variance would make the posterior less certain than the
            # prior and likelihood - a property held only by non-log-concave likelihoods.
        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
self.Bi, _, _, self.ln_B_det = pdinv(self.B)
        #Do the computation again at f_hat to get Ki_f, which is useful
        #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
        #solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
        #a = b - self.W_12*solve_chol
        self.Ki_f = self.a  # a = K^{-1} f_hat; note that self.a is only set by rasm_mode
self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
self.Ki_W_i = self.K - mdot(self.K, self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12)), self.K)
#For det, |I + KW| == |I + W_12*K*W_12|
#self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
#self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
#self.ln_z_hat = (- 0.5*self.f_Ki_f
#- self.ln_I_KW_det
#+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
#)
return self._compute_GP_variables()
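    # Note on Ki_W_i: by the matrix inversion lemma,
    #   K - K W^{1/2} B^{-1} W^{1/2} K = (K^{-1} + W)^{-1},
    # which is the posterior covariance of the Laplace approximation, computed
    # here without inverting K directly.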
def _compute_B_statistics(self, K, W):
"""Rasmussen suggests the use of a numerically stable positive definite matrix B
Which has a positive diagonal element and can be easyily inverted
:K: Covariance matrix
:W: Negative hessian at a point (diagonal matrix)
:returns: (B, L)
"""
#W is diagonal so its sqrt is just the sqrt of the diagonal elements
W_12 = np.sqrt(W)
B = np.eye(self.N) + W_12*K*W_12.T
L = jitchol(B)
return (B, L, W_12)
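    # B = I + W^{1/2} K W^{1/2} has eigenvalues bounded below by 1 (for W >= 0),
    # so its Cholesky factorisation is well conditioned. Downstream quantities
    # then come via B: (K + W^{-1})^{-1} = W^{1/2} B^{-1} W^{1/2} and
    # |I + KW| = |B|, avoiding a direct inverse of K.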
def nelder_mode(self, K):
f = np.zeros((self.N, 1))
self.Ki, _, _, self.ln_K_det = pdinv(K)
def obj(f):
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
return float(res)
res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
f_new = res.x
return f_new[:, None]
def ncg_mode(self, K):
"""
Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
:K: Covariance matrix
:returns: f_mode
"""
self.Ki, _, _, self.ln_K_det = pdinv(K)
f = np.zeros((self.N, 1))
#FIXME: Can we get rid of this horrible reshaping?
#ONLY WORKS FOR 1D DATA
def obj(f):
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
- self.NORMAL_CONST)
return float(res)
def obj_grad(f):
res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
return np.squeeze(res)
def obj_hess(f):
res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
return np.squeeze(res)
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
return f_hat[:, None]
def rasm_mode(self, K, MAX_ITER=200, MAX_RESTART=10):
"""
Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
:K: Covariance matrix
:MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
:returns: f_mode
"""
#self.old_before_s = self.likelihood_function._get_params()
#print "before: ", self.old_before_s
#if self.old_before_s < 1e-5:
#old_a = np.zeros((self.N, 1))
if self.old_a is None:
old_a = np.zeros((self.N, 1))
f = np.dot(K, old_a)
else:
old_a = self.old_a.copy()
f = self.f_hat.copy()
new_obj = -np.inf
old_obj = np.inf
def obj(a, f):
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
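        # With f = K a we have a^T f = a^T K a = f^T K^{-1} f, so
        #   obj(a, f) = log p(y|f) - 0.5 f^T K^{-1} f,
        # i.e. the log posterior (up to a constant), evaluated without K^{-1}.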
difference = np.inf
epsilon = 1e-10
step_size = 1
rs = 0
i = 0
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
#W = np.maximum(W, 0)
            if not self.likelihood_function.log_concave:
                #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
                W[W < 1e-10] = 1e-10
                # FIXME-HACK: clipping is a hack, since GPy can't handle the negative
                # variances which can occur when the likelihood is not log-concave.
                # A negative variance would make the posterior less certain than the
                # prior and likelihood - a property held only by non-log-concave likelihoods.
B, L, W_12 = self._compute_B_statistics(K, W.copy())
W_f = W*f
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
b = W_f + grad
solve_L = cho_solve((L, True), W_12*np.dot(K, b))
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
full_step_a = b - W_12*solve_L
da = full_step_a - old_a
f_old = f.copy()
def inner_obj(step_size, old_a, da, K):
a = old_a + step_size*da
f = np.dot(K, a)
self.a = a.copy() # This is nasty, need to set something within an optimization though
self.f = f.copy()
return -obj(a, f)
i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
#new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-6, options={'maxiter':20, 'disp':True}).fun
f = self.f.copy()
a = self.a.copy()
#f_old = f.copy()
#update_passed = False
#while not update_passed:
#a = old_a + step_size*da
#f = np.dot(K, a)
#old_obj = new_obj
#new_obj = obj(a, f)
#difference = new_obj - old_obj
##print "difference: ",difference
#if difference < 0:
##print "Objective function rose", np.float(difference)
##If the objective function isn't rising, restart optimization
#step_size *= 0.8
##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
##objective function isn't increasing, try reducing step size
#f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
#old_obj = new_obj
#rs += 1
#else:
#update_passed = True
#difference = abs(new_obj - old_obj)
#old_obj = new_obj.copy()
#difference = np.abs(np.sum(f - f_old))
            difference = np.abs(np.sum(a - old_a))  # note: signed sum over elements, so cancellation can under-report the change
#old_a = self.a.copy() #a
old_a = a.copy()
i += 1
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.old_a = old_a.copy()
#print "Positive difference obj: ", np.float(difference)
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
#print "Iterations: {}, Final_difference: {}".format(i, difference)
        if difference > epsilon:
            print "Warning: f_hat not converged to tolerance, difference: {}".format(difference)
if False:
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
if hasattr(self, 'X'):
import pylab as pb
pb.figure()
pb.subplot(311)
pb.title('old f_hat')
pb.plot(self.X, self.f_hat)
pb.subplot(312)
pb.title('old ff')
pb.plot(self.X, self.old_ff)
pb.subplot(313)
pb.title('new f_hat')
pb.plot(self.X, f)
pb.figure()
pb.subplot(121)
pb.title('old K')
pb.imshow(np.diagflat(self.old_K), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new K')
pb.imshow(np.diagflat(K), interpolation='none')
pb.colorbar()
pb.figure()
pb.subplot(121)
pb.title('old W')
pb.imshow(np.diagflat(self.old_W), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new W')
pb.imshow(np.diagflat(W), interpolation='none')
pb.colorbar()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
pb.close('all')
#FIXME: DELETE THESE
self.old_W = W.copy()
self.old_grad = grad.copy()
self.old_B = B.copy()
self.old_W_12 = W_12.copy()
self.old_ff = f.copy()
self.old_K = self.K.copy()
self.old_s = self.likelihood_function._get_params()
#print "after: ", self.old_s
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.a = a
#self.B, self.B_chol, self.W_12 = B, L, W_12
#self.Bi, _, _, B_det = pdinv(self.B)
return f
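
if __name__ == '__main__':
    # A minimal standalone sketch (not part of GPy's API): the numerically
    # stable Newton step used by rasm_mode, run on a toy Bernoulli-logit
    # problem. All names below (K_demo, y_demo, ...) exist only in this demo.
    N_demo = 5
    X_demo = np.linspace(0., 1., N_demo)[:, None]
    # Squared-exponential prior covariance with a small diagonal jitter
    K_demo = np.exp(-0.5*(X_demo - X_demo.T)**2/0.1) + 1e-8*np.eye(N_demo)
    y_demo = np.array([1., 1., 0., 0., 1.])[:, None]
    f = np.zeros((N_demo, 1))
    for _ in range(20):
        p = 1.0/(1.0 + np.exp(-f))                # Bernoulli-logit link
        grad = y_demo - p                         # d log p(y|f) / df
        W = p*(1.0 - p)                           # -d^2 log p(y|f) / df^2 (diagonal)
        W_12 = np.sqrt(W)
        B = np.eye(N_demo) + W_12*K_demo*W_12.T   # I + W^{1/2} K W^{1/2}
        L = np.linalg.cholesky(B)
        b = W*f + grad
        # full Newton step without inverting K (R&W 2006, eq. 3.18)
        a = b - W_12*cho_solve((L, True), W_12*np.dot(K_demo, b))
        f = np.dot(K_demo, a)
    print "toy f_hat:", f.ravel()                 # mode of the approximate posterior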