Attempted to introduce gradient methods; I doubt they will work yet.

This commit is contained in:
Alan Saul 2013-04-19 12:23:00 +01:00
parent 7b44a4cb53
commit 1420aa532c
5 changed files with 177 additions and 37 deletions

View file

@ -1,6 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import laplace_approximations
import classification
import regression
import dimensionality_reduction

View file

@ -4,28 +4,9 @@ import GPy
from scipy.linalg import cholesky, eig, inv, cho_solve, det
from numpy.linalg import cond
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
from scipy.linalg.lapack import dtrtrs
#TODO: Move this to utils
def det_ln_diag(A):
    """
    Log-determinant of a diagonal matrix.

    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
    """
    diag_entries = np.diagonal(A)
    return np.sum(np.log(diag_entries))
def pddet(A):
    """
    Log-determinant of a positive definite matrix.

    Uses the Cholesky factor L (A = L^T L with scipy's upper-triangular
    default), so
    $$\ln|A| = 2 \sum_i \ln L_{ii}$$

    :param A: positive definite matrix
    :returns: ln|A| (scalar)
    """
    L = cholesky(A)
    # Bug fix (docs/idiom): the docstring claimed this returns the determinant,
    # but it returns the log-determinant; also use np.sum rather than the
    # builtin sum over an ndarray.
    logdetA = 2*np.sum(np.log(np.diag(L)))
    return logdetA
class Laplace(likelihood):
"""Laplace approximation to a posterior"""
@ -75,17 +56,92 @@ class Laplace(likelihood):
return self.likelihood_function.predictive_values(mu, var)
def _get_params(self):
return np.zeros(0)
return np.asarray(self.likelihood_function._get_params())
def _get_param_names(self):
return []
return self.likelihood_function._get_param_names()
def _set_params(self, p):
pass # TODO: Laplace likelihood might want to take some parameters...
return self.likelihood_function._set_params()
def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
    """
    Find the gradients of the marginal likelihood w.r.t both thetaK and thetaL.

    dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
    changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted.
    Similar terms arise when finding the gradients with respect to changes in the likelihood
    parameters.

    :param dL_d_K_Sigma: derivative of the marginal likelihood w.r.t. (K + Sigma_tilde)
    :param dK_dthetaK: derivative of the kernel w.r.t. its hyperparameters
    :returns: tuple (dL_dthetaK, dL_dthetaL) delegating to _Kgradients and _gradients
    """
    return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
def _shared_gradients_components(self):
    # Components reused by both _Kgradients (kernel params) and _gradients
    # (likelihood params).
    # NOTE(review): dL_dytil multiplies by (K + Sigma_tilde) directly;
    # presumably a solve/inverse of (K + Sigma_tilde) is intended here —
    # confirm against the marginal-likelihood derivation.
    dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y)
    dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
    return dL_dytil, dytil_dfhat
def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
    """
    Gradients of the marginal likelihood w.r.t. the kernel hyperparameters thetaK.

                   #explicit             #implicit                   #implicit
    dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)

    :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
    :param dK_dthetaK: explicit derivative of kernel with respect to its hyperparameters
    :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
              (only the implicit terms; the explicit term is added by the caller)
    """
    dL_dytil, dytil_dfhat = self._shared_gradients_components()
    # (I + K W)^-1, via pdinv (returns inverse plus Cholesky factors; only the
    # inverse is used here)
    I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
    #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
    dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
    dytil_dthetaK = dytil_dfhat*dfhat_dthetaK
    #FIXME: Careful dL_dK = dL_d_K_Sigma
    #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
    dL_dSigma = dL_d_K_Sigma
    d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
    # NOTE(review): the implicit dSigmai term is currently zeroed out pending
    # the summation fix described in the FIXME below.
    #explicit #implicit
    dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
    dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
    dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK
    return dL_dthetaK_implicit
def _gradients(self, partial):
    # NOTE(review): this rendering interleaves the pre-change stub with the
    # new implementation (the diff's +/- markers were lost); everything below
    # the first return statement is unreachable as written.
    return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
    raise NotImplementedError
    """
    Gradients with respect to likelihood parameters

    Complicated, it differs for parameters of the kernel \theta_{K}, and
    parameters of the likelihood, \theta_{L}

    dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
    dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)

    dL_dK*dK_dthetaL = 0
    dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
    dytil_dfhat = Sigma*Ki + I
    fhat = K*log_p(y|fhat) from rasm p125
    dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat) from rasm p125
    dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
    dW_dthetaX = d_dthetaX[d2phi_d2fhat]
    d2phi_d2fhat = Hessian function of likelihood
    partial = dL_dK
    """
    dL_dytil, dytil_dfhat = self._shared_gradients_components()
    dfhat_dthetaL = self.likelihood_function.df_dtheta()
    dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
    dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
    dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
    dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
    dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL
    return dL_dthetaL
    #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
def _compute_GP_variables(self):
"""
@ -112,8 +168,9 @@ class Laplace(likelihood):
$$\tilde{\Sigma} = W^{-1}$$
"""
epsilon = 1e-6
epsilon = 1e14
#Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
#dtritri -> L -> L_i
#dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
#((L.T*w)_i + I)f_hat = y_tilde
@ -122,11 +179,12 @@ class Laplace(likelihood):
Lt_W = np.dot(L.T, self.W)
##Check it isn't singular!
if cond(Lt_W) > 1e14:
if cond(Lt_W) > epsilon:
print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
#f.T(Ki + W)f
f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
@ -156,16 +214,16 @@ class Laplace(likelihood):
#)
##Check it isn't singular!
if cond(self.W) > 1e14:
if cond(self.W) > epsilon:
print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
Sigma_tilde = inv(self.W) # Damn
self.Sigma_tilde = inv(self.W) # Damn
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
self.Y = Y_tilde
self.YYT = np.dot(self.Y, self.Y.T)
self.covariance_matrix = Sigma_tilde
self.covariance_matrix = self.Sigma_tilde
self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
def fit_full(self, K):

View file

@ -20,6 +20,16 @@ class likelihood_function:
def __init__(self,location=0,scale=1):
    # Base initialiser for likelihood functions; location/scale are generic
    # distribution parameters stored for use by subclasses.
    self.location = location
    self.scale = scale
    # Flag for approximate-inference code; subclasses with non-log-concave
    # likelihoods (e.g. student_t below) set this to False.
    self.log_concave = True
def _get_params(self):
    # Default: no tunable hyperparameters.
    return np.zeros(0)
def _get_param_names(self):
    # Default: no hyperparameter names.
    return []
def _set_params(self, p):
    # Default: nothing to set; subclasses with hyperparameters override this.
    pass
class probit(likelihood_function):
"""
@ -149,12 +159,22 @@ class student_t(likelihood_function):
d2ln p(yi|fi)_d2fifj
"""
def __init__(self, deg_free, sigma=2):
    super(student_t, self).__init__()
    self.v = deg_free   # degrees of freedom of the Student-t
    self.sigma = sigma  # scale parameter (the variance is v/(v-2) * sigma^2, see `variance`)
    #FIXME: This should be in the superclass
    self.log_concave = False  # Student-t log-likelihood is not log-concave
def _get_params(self):
    # NOTE(review): np.asarray on a scalar yields a 0-d array — confirm
    # callers expect that rather than a length-1 array.
    return np.asarray(self.sigma)
def _get_param_names(self):
    # NOTE(review): named "t_noise_variance" although sigma is a scale, not
    # the variance itself — confirm the intended naming.
    return ["t_noise_variance"]
def _set_params(self, x):
    self.sigma = float(x)
    #self.covariance_matrix = np.eye(self.N)*self._variance
    #self.precision = 1./self._variance
@property
def variance(self, extra_data=None):
    # Variance of a Student-t: v/(v-2) * sigma^2 (finite only for v > 2).
    # NOTE(review): a @property accessor cannot receive arguments when read
    # as an attribute, so extra_data here is dead — confirm and drop it.
    return (self.v / float(self.v - 2)) * (self.sigma**2)
@ -222,6 +242,40 @@ class student_t(likelihood_function):
hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
return np.squeeze(hess)
def d3link(self, y, f, extra_data=None):
    """
    Third-order derivative of the link function (log-likelihood) at y given f, w.r.t. f.

    $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
    """
    y = np.squeeze(y)
    f = np.squeeze(f)
    assert y.shape == f.shape
    # NB: the residual here is f - y, not y - f (the cubed term is odd in e).
    e = f - y
    numerator = -2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e)
    denominator = (e**2 + (self.sigma**2)*self.v)**3
    return numerator / denominator
def link_hess_grad_sigma(self, y, f, extra_data=None):
    """
    Gradient of the hessian w.r.t sigma parameter

    $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}$$

    :param y: observed data
    :param f: latent function values (same shape as y after squeezing)
    :returns: elementwise d/dsigma of the link-function hessian
    """
    y = np.squeeze(y)
    f = np.squeeze(f)
    assert y.shape == f.shape
    # e only ever appears squared here, so (y - f) vs (f - y) is immaterial.
    e = y - f
    # Bug fix: the leading factor is 2*sigma*v*(v+1); the factor v was
    # missing, which disagreed with the documented formula above
    # (differentiate hess = (v+1)(e^2 - v*sigma^2)/((sigma^2 v + e^2)^2)
    # with respect to sigma). Also closed the unterminated $$ in the docstring.
    hess_grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
                        / ((e**2 + (self.sigma**2)*self.v)**3)
                      )
    return hess_grad_sigma
def _gradients(self, y, f, extra_data=None):
    # NOTE(review): this returns the bound method itself rather than its
    # value at (y, f); the Laplace code calls
    # likelihood_function._gradients(data, f_hat, extra_data) and then does
    # matrix arithmetic with the result, which suggests it should be
    # [self.link_hess_grad_sigma(y, f, extra_data)] — confirm.
    return [self.link_hess_grad_sigma] # list as we might learn many parameters
def predictive_values(self, mu, var):
"""
Compute mean, and conficence interval (percentiles 5 and 95) of the prediction

View file

@ -8,7 +8,7 @@ from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
from ..likelihoods import EP
from ..likelihoods import EP, Laplace
class GP(model):
"""
@ -128,7 +128,19 @@ class GP(model):
For the likelihood parameters, pass in alpha = K^-1 y
"""
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
if isinstance(self.likelihood, Laplace):
dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
#Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
fake_dL_dKs = np.ones(self.dL_dK.shape)
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
else:
dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
return np.hstack((dL_dthetaK, dL_dthetaL))
def _raw_predict(self,_Xnew,slices=None, full_cov=False):
"""

View file

@ -14,6 +14,21 @@ import types
#import scipy.lib.lapack.flapack
import scipy as sp
def det_ln_diag(A):
    """
    Log-determinant of a diagonal matrix.

    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
    """
    log_diag = np.log(np.diagonal(A))
    return log_diag.sum()
def pddet(A):
    """
    Log-determinant of a positive definite matrix.

    Uses the Cholesky factor L (A = L^T L with scipy's upper-triangular
    default), so
    $$\ln|A| = 2 \sum_i \ln L_{ii}$$

    :param A: positive definite matrix
    :returns: ln|A| (scalar)
    """
    L = cholesky(A)
    # Bug fix (docs/idiom): the docstring claimed this returns the determinant,
    # but it returns the log-determinant; also use np.sum rather than the
    # builtin sum over an ndarray.
    logdetA = 2*np.sum(np.log(np.diag(L)))
    return logdetA
def trace_dot(a,b):
"""
efficiently compute the trace of the matrix product of a and b
@ -166,8 +181,8 @@ def PCA(Y, Q):
"""
if not np.allclose(Y.mean(axis=0), 0.0):
print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
#Y -= Y.mean(axis=0)
#Y -= Y.mean(axis=0)
Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
[X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]