mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-01 07:46:22 +02:00
450 lines
19 KiB
Python
450 lines
19 KiB
Python
import numpy as np
|
|
import scipy as sp
|
|
import GPy
|
|
from scipy.linalg import inv, cho_solve, det
|
|
from numpy.linalg import cond
|
|
from likelihood import likelihood
|
|
from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet, dtrtrs
|
|
from scipy.linalg.lapack import dtrtrs
|
|
import random
|
|
from functools import partial
|
|
#import pylab as plt
|
|
|
|
class Laplace(likelihood):
|
|
"""Laplace approximation to a posterior"""
|
|
|
|
def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
|
|
"""
|
|
Laplace Approximation
|
|
|
|
First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
|
|
then find the z^{prime} which allows this to be a normalised gaussian instead of a
|
|
non-normalized gaussian
|
|
|
|
Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
|
|
which makes a gaussian the same as the laplace approximation
|
|
|
|
Arguments
|
|
---------
|
|
|
|
:data: array of data the likelihood function is approximating
|
|
:likelihood_function: likelihood function - subclass of likelihood_function
|
|
:extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
|
|
:opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data)
|
|
|
|
"""
|
|
self.data = data
|
|
self.likelihood_function = likelihood_function
|
|
self.extra_data = extra_data
|
|
self.opt = opt
|
|
|
|
#Inital values
|
|
self.N, self.D = self.data.shape
|
|
self.is_heteroscedastic = True
|
|
self.Nparams = 0
|
|
self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
|
|
|
|
self.restart()
|
|
|
|
def restart(self):
|
|
#Initial values for the GP variables
|
|
self.Y = np.zeros((self.N, 1))
|
|
self.covariance_matrix = np.eye(self.N)
|
|
self.precision = np.ones(self.N)[:, None]
|
|
self.Z = 0
|
|
self.YYT = None
|
|
|
|
self.old_a = None
|
|
|
|
def predictive_values(self, mu, var, full_cov):
|
|
if full_cov:
|
|
raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
|
|
return self.likelihood_function.predictive_values(mu, var)
|
|
|
|
def _get_params(self):
|
|
return np.asarray(self.likelihood_function._get_params())
|
|
|
|
def _get_param_names(self):
|
|
return self.likelihood_function._get_param_names()
|
|
|
|
def _set_params(self, p):
|
|
return self.likelihood_function._set_params(p)
|
|
|
|
def _shared_gradients_components(self):
|
|
d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat, extra_data=self.extra_data)
|
|
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
|
|
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
|
|
return dL_dfhat, I_KW_i
|
|
|
|
def _Kgradients(self, dK_dthetaK, X):
|
|
"""
|
|
Gradients with respect to prior kernel parameters
|
|
"""
|
|
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
|
dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
|
|
|
|
#Explicit
|
|
expl_a = np.dot(self.Ki_f, self.Ki_f.T)
|
|
expl_b = self.Wi_K_i
|
|
expl = 0.5*expl_a - 0.5*expl_b
|
|
dL_dthetaK_exp = dK_dthetaK(expl, X)
|
|
|
|
#Implicit
|
|
impl = mdot(dlp, dL_dfhat, I_KW_i)
|
|
dL_dthetaK_imp = dK_dthetaK(impl, X)
|
|
#print "K: dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
|
|
dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
|
|
return dL_dthetaK
|
|
|
|
def _gradients(self, partial):
|
|
"""
|
|
Gradients with respect to likelihood parameters
|
|
"""
|
|
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
|
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
|
|
|
|
num_params = len(dlik_dthetaL)
|
|
# make space for one derivative for each likelihood parameter
|
|
dL_dthetaL = np.zeros(num_params)
|
|
for thetaL_i in range(num_params):
|
|
#Explicit
|
|
dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
|
|
#- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
|
|
+ np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[thetaL_i])
|
|
)
|
|
|
|
#Implicit
|
|
dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
|
|
dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
|
|
#print "LIK: dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
|
|
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
|
|
|
|
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
|
|
|
|
def _compute_GP_variables(self):
|
|
"""
|
|
Generates data Y which would give the normal distribution identical to the laplace approximation
|
|
|
|
GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
|
|
that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
|
|
|
|
Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
|
|
then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
|
|
due to the z rescaling.
|
|
|
|
at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
|
|
This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
|
|
giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
|
|
|
|
$$\tilde{Y} = \tilde{\Sigma} Hf$$
|
|
where
|
|
$$\tilde{\Sigma}^{-1} = H - K^{-1}$$
|
|
i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
|
|
since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
|
|
and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
|
|
$$\tilde{\Sigma} = W^{-1}$$
|
|
|
|
"""
|
|
#Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
|
|
#dtritri -> L -> L_i
|
|
#dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
|
|
#((L.T*w)_i + I)f_hat = y_tilde
|
|
#L = jitchol(self.K)
|
|
#Li = chol_inv(L)
|
|
#Lt_W = L.T*self.W.T
|
|
|
|
#Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
|
|
#self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
|
|
#Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
|
|
|
|
Wi = 1.0/self.W
|
|
self.Sigma_tilde = np.diagflat(Wi)
|
|
|
|
Y_tilde = Wi*self.Ki_f + self.f_hat
|
|
|
|
#self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
|
|
self.Wi_K_i = self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12))
|
|
#self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below
|
|
|
|
self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
|
|
|
|
self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
|
|
|
|
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
|
|
Z_tilde = (+ self.lik
|
|
- 0.5*self.ln_B_det
|
|
+ 0.5*self.ln_det_Wi_K
|
|
- 0.5*self.f_Ki_f
|
|
+ 0.5*self.y_Wi_Ki_i_y
|
|
)
|
|
|
|
#Convert to float as its (1, 1) and Z must be a scalar
|
|
self.Z = np.float64(Z_tilde)
|
|
self.Y = Y_tilde
|
|
self.YYT = np.dot(self.Y, self.Y.T)
|
|
self.covariance_matrix = self.Sigma_tilde
|
|
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
|
|
|
|
def fit_full(self, K):
|
|
"""
|
|
The laplace approximation algorithm, find K and expand hessian
|
|
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
|
|
:K: Covariance matrix
|
|
"""
|
|
self.K = K.copy()
|
|
|
|
#Find mode
|
|
self.f_hat = {
|
|
'rasm': self.rasm_mode,
|
|
'ncg': self.ncg_mode,
|
|
'nelder': self.nelder_mode
|
|
}[self.opt](self.K)
|
|
|
|
#Compute hessian and other variables at mode
|
|
self._compute_likelihood_variables()
|
|
|
|
def _compute_likelihood_variables(self):
|
|
#At this point get the hessian matrix (or vector as W is diagonal)
|
|
self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
|
|
|
|
if not self.likelihood_function.log_concave:
|
|
#print "Under 1e-6: {}".format(np.sum(self.W < 1e-6))
|
|
self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
|
#If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
|
#To cause the posterior to become less certain than the prior and likelihood,
|
|
#This is a property only held by non-log-concave likelihoods
|
|
|
|
#TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
|
|
self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
|
|
self.Bi, _, _, self.ln_B_det = pdinv(self.B)
|
|
|
|
#Do the computation again at f to get Ki_f which is useful
|
|
#b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
|
|
#solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
|
|
#a = b - self.W_12*solve_chol
|
|
self.Ki_f = self.a
|
|
|
|
self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
|
|
self.Ki_W_i = self.K - mdot(self.K, self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12)), self.K)
|
|
|
|
#For det, |I + KW| == |I + W_12*K*W_12|
|
|
#self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
|
|
|
|
#self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
|
|
#self.ln_z_hat = (- 0.5*self.f_Ki_f
|
|
#- self.ln_I_KW_det
|
|
#+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
|
|
#)
|
|
|
|
return self._compute_GP_variables()
|
|
|
|
def _compute_B_statistics(self, K, W):
|
|
"""Rasmussen suggests the use of a numerically stable positive definite matrix B
|
|
Which has a positive diagonal element and can be easyily inverted
|
|
|
|
:K: Covariance matrix
|
|
:W: Negative hessian at a point (diagonal matrix)
|
|
:returns: (B, L)
|
|
"""
|
|
#W is diagonal so its sqrt is just the sqrt of the diagonal elements
|
|
W_12 = np.sqrt(W)
|
|
B = np.eye(self.N) + W_12*K*W_12.T
|
|
L = jitchol(B)
|
|
return (B, L, W_12)
|
|
|
|
def nelder_mode(self, K):
|
|
f = np.zeros((self.N, 1))
|
|
self.Ki, _, _, self.ln_K_det = pdinv(K)
|
|
def obj(f):
|
|
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
|
|
return float(res)
|
|
|
|
res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
|
|
f_new = res.x
|
|
return f_new[:, None]
|
|
|
|
def ncg_mode(self, K):
|
|
"""
|
|
Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
|
|
:K: Covariance matrix
|
|
:returns: f_mode
|
|
"""
|
|
self.Ki, _, _, self.ln_K_det = pdinv(K)
|
|
|
|
f = np.zeros((self.N, 1))
|
|
|
|
#FIXME: Can we get rid of this horrible reshaping?
|
|
#ONLY WORKS FOR 1D DATA
|
|
def obj(f):
|
|
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
|
|
- self.NORMAL_CONST)
|
|
return float(res)
|
|
|
|
def obj_grad(f):
|
|
res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
|
|
return np.squeeze(res)
|
|
|
|
def obj_hess(f):
|
|
res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
|
|
return np.squeeze(res)
|
|
|
|
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
|
|
return f_hat[:, None]
|
|
|
|
def rasm_mode(self, K, MAX_ITER=200, MAX_RESTART=10):
|
|
"""
|
|
Rasmussen's numerically stable mode finding
|
|
For nomenclature see Rasmussen & Williams 2006
|
|
|
|
:K: Covariance matrix
|
|
:MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
|
|
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
|
|
:returns: f_mode
|
|
"""
|
|
#self.old_before_s = self.likelihood_function._get_params()
|
|
#print "before: ", self.old_before_s
|
|
#if self.old_before_s < 1e-5:
|
|
|
|
#old_a = np.zeros((self.N, 1))
|
|
if self.old_a is None:
|
|
old_a = np.zeros((self.N, 1))
|
|
f = np.dot(K, old_a)
|
|
else:
|
|
old_a = self.old_a.copy()
|
|
f = self.f_hat.copy()
|
|
|
|
new_obj = -np.inf
|
|
old_obj = np.inf
|
|
|
|
def obj(a, f):
|
|
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
|
|
|
|
difference = np.inf
|
|
epsilon = 1e-10
|
|
step_size = 1
|
|
rs = 0
|
|
i = 0
|
|
|
|
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
|
|
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
|
#W = np.maximum(W, 0)
|
|
if not self.likelihood_function.log_concave:
|
|
#print "Under 1e-10: {}".format(np.sum(W < 1e-10))
|
|
W[W < 1e-10] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
|
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
|
# To cause the posterior to become less certain than the prior and likelihood,
|
|
# This is a property only held by non-log-concave likelihoods
|
|
B, L, W_12 = self._compute_B_statistics(K, W.copy())
|
|
|
|
W_f = W*f
|
|
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
|
|
|
|
b = W_f + grad
|
|
solve_L = cho_solve((L, True), W_12*np.dot(K, b))
|
|
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
|
|
full_step_a = b - W_12*solve_L
|
|
da = full_step_a - old_a
|
|
|
|
f_old = f.copy()
|
|
def inner_obj(step_size, old_a, da, K):
|
|
a = old_a + step_size*da
|
|
f = np.dot(K, a)
|
|
self.a = a.copy() # This is nasty, need to set something within an optimization though
|
|
self.f = f.copy()
|
|
return -obj(a, f)
|
|
|
|
i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
|
|
#new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
|
|
new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-6, options={'maxiter':20, 'disp':True}).fun
|
|
f = self.f.copy()
|
|
a = self.a.copy()
|
|
|
|
#f_old = f.copy()
|
|
#update_passed = False
|
|
#while not update_passed:
|
|
#a = old_a + step_size*da
|
|
#f = np.dot(K, a)
|
|
|
|
#old_obj = new_obj
|
|
#new_obj = obj(a, f)
|
|
#difference = new_obj - old_obj
|
|
##print "difference: ",difference
|
|
#if difference < 0:
|
|
##print "Objective function rose", np.float(difference)
|
|
##If the objective function isn't rising, restart optimization
|
|
#step_size *= 0.8
|
|
##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
|
|
##objective function isn't increasing, try reducing step size
|
|
#f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
|
|
#old_obj = new_obj
|
|
#rs += 1
|
|
#else:
|
|
#update_passed = True
|
|
|
|
#difference = abs(new_obj - old_obj)
|
|
#old_obj = new_obj.copy()
|
|
#difference = np.abs(np.sum(f - f_old))
|
|
difference = np.abs(np.sum(a - old_a))
|
|
#old_a = self.a.copy() #a
|
|
old_a = a.copy()
|
|
i += 1
|
|
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
|
|
|
self.old_a = old_a.copy()
|
|
#print "Positive difference obj: ", np.float(difference)
|
|
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
|
|
#print "Iterations: {}, Final_difference: {}".format(i, difference)
|
|
if difference > epsilon:
|
|
print "Not perfect f_hat fit difference: {}".format(difference)
|
|
if False:
|
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
|
if hasattr(self, 'X'):
|
|
import pylab as pb
|
|
pb.figure()
|
|
pb.subplot(311)
|
|
pb.title('old f_hat')
|
|
pb.plot(self.X, self.f_hat)
|
|
pb.subplot(312)
|
|
pb.title('old ff')
|
|
pb.plot(self.X, self.old_ff)
|
|
pb.subplot(313)
|
|
pb.title('new f_hat')
|
|
pb.plot(self.X, f)
|
|
|
|
pb.figure()
|
|
pb.subplot(121)
|
|
pb.title('old K')
|
|
pb.imshow(np.diagflat(self.old_K), interpolation='none')
|
|
pb.colorbar()
|
|
pb.subplot(122)
|
|
pb.title('new K')
|
|
pb.imshow(np.diagflat(K), interpolation='none')
|
|
pb.colorbar()
|
|
|
|
pb.figure()
|
|
pb.subplot(121)
|
|
pb.title('old W')
|
|
pb.imshow(np.diagflat(self.old_W), interpolation='none')
|
|
pb.colorbar()
|
|
pb.subplot(122)
|
|
pb.title('new W')
|
|
pb.imshow(np.diagflat(W), interpolation='none')
|
|
pb.colorbar()
|
|
|
|
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
|
pb.close('all')
|
|
|
|
#FIXME: DELETE THESE
|
|
self.old_W = W.copy()
|
|
self.old_grad = grad.copy()
|
|
self.old_B = B.copy()
|
|
self.old_W_12 = W_12.copy()
|
|
self.old_ff = f.copy()
|
|
self.old_K = self.K.copy()
|
|
self.old_s = self.likelihood_function._get_params()
|
|
#print "after: ", self.old_s
|
|
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
|
self.a = a
|
|
#self.B, self.B_chol, self.W_12 = B, L, W_12
|
|
#self.Bi, _, _, B_det = pdinv(self.B)
|
|
return f
|