Mirror of https://github.com/SheffieldML/GPy.git

Commit 1420aa532c (parent 7b44a4cb53)
Attempted to introduce gradient methods; I doubt they will work yet.
5 changed files with 177 additions and 37 deletions

@@ -1,6 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import laplace_approximations
import classification
import regression
import dimensionality_reduction

@@ -4,28 +4,9 @@ import GPy
from scipy.linalg import cholesky, eig, inv, cho_solve, det
from numpy.linalg import cond
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
from scipy.linalg.lapack import dtrtrs

#TODO: Move this to utils

def det_ln_diag(A):
    """
    Log determinant of a diagonal matrix
    $$\ln |A| = \ln \prod_i{A_{ii}} = \sum_i{\ln A_{ii}}$$
    """
    return np.log(np.diagonal(A)).sum()

def pddet(A):
    """
    Log determinant of a positive definite matrix, computed via its Cholesky factor
    """
    L = cholesky(A)
    logdetA = 2*sum(np.log(np.diag(L)))
    return logdetA


class Laplace(likelihood):
    """Laplace approximation to a posterior"""

@@ -75,17 +56,92 @@ class Laplace(likelihood):
        return self.likelihood_function.predictive_values(mu, var)

    def _get_params(self):
        return np.zeros(0)
        return np.asarray(self.likelihood_function._get_params())

    def _get_param_names(self):
        return []
        return self.likelihood_function._get_param_names()

    def _set_params(self, p):
        pass # TODO: Laplace likelihood might want to take some parameters...
        return self.likelihood_function._set_params(p)

    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
        """
        Find the gradients of the marginal likelihood w.r.t. both thetaK and thetaL

        dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
        changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted

        Similar terms arise when finding the gradients with respect to changes in the likelihood
        parameters
        """
        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))

    def _shared_gradients_components(self):
        dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y)
        dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
        return dL_dytil, dytil_dfhat

    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
        """
                       #explicit             #implicit                    #implicit
        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)

        :param dL_d_K_Sigma: derivative of the marginal with respect to K_prior+Sigma_tilde (posterior covariance)
        :param dK_dthetaK: explicit derivative of the kernel with respect to its hyperparameters
        :returns: dL_dthetaK - gradients of the marginal likelihood w.r.t. changes in the K hyperparameters
        """
        dL_dytil, dytil_dfhat = self._shared_gradients_components()

        I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
        #FIXME: Careful, dK_dthetaK is the derivative of the prior K only, not of the marginal!
        dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)

        dytil_dthetaK = dytil_dfhat*dfhat_dthetaK

        #FIXME: Careful, dL_dK = dL_d_K_Sigma
        #FIXME: Careful, the -D*0.5 in dL_d_K_Sigma might need to be -0.5?
        dL_dSigma = dL_d_K_Sigma
        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                          #explicit  #implicit
        dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! It should sum over fhat, not over the thetas
        dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)

        dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK
        return dL_dthetaK_implicit

    def _gradients(self, partial):
        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
        raise NotImplementedError
        """
        Gradients with respect to the likelihood parameters

        Complicated: it differs for parameters of the kernel, \theta_{K}, and
        parameters of the likelihood, \theta_{L}

        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
        dL_dK*dK_dthetaL = 0

        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
        dytil_dfhat = Sigma*Ki + I

        fhat = K * dlog_p(y|fhat)_dfhat                                (Rasmussen & Williams p. 125)
        dfhat_dthetaK = (I + KW)i * dK_dthetaK * dlog_p(y|fhat)_dfhat  (Rasmussen & Williams p. 125)

        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
        d2phi_d2fhat = Hessian of the likelihood (link function) w.r.t. fhat

        partial = dL_dK
        """
        dL_dytil, dytil_dfhat = self._shared_gradients_components()
        dfhat_dthetaL = self.likelihood_function.df_dtheta()

        dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have an implicit component as well?
        dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
        dL_dSigma = partial # partial is dL_dK, but K here is K+Sigma_tilde... which is fine in this case

        dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
        dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL
        return dL_dthetaL
        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
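
For reference, the chain rule that _Kgradients and _gradients are assembling can be written out as follows (a restatement of the docstring above, not part of the diff; \tilde{y} and \tilde{\Sigma} are the pseudo-targets and pseudo-covariance of the Laplace approximation, W the negative Hessian of the log likelihood at \hat{f}):

$$\frac{dL}{d\theta_K} = \underbrace{\sum_{ij}\frac{\partial L}{\partial K_{ij}}\frac{\partial K_{ij}}{\partial \theta_K}}_{\text{explicit}} + \underbrace{\frac{\partial L}{\partial \tilde{y}}\frac{\partial \tilde{y}}{\partial \hat{f}}\frac{\partial \hat{f}}{\partial \theta_K}}_{\text{implicit}} + \underbrace{\frac{\partial L}{\partial \tilde{\Sigma}}\frac{\partial \tilde{\Sigma}}{\partial \theta_K}}_{\text{implicit}}, \qquad \frac{\partial \hat{f}}{\partial \theta_K} = (I + KW)^{-1}\frac{\partial K}{\partial \theta_K}\nabla_{\!f}\log p(y|\hat{f})$$

For the likelihood parameters \theta_L the explicit term drops out (\partial K/\partial \theta_L = 0), leaving only the two implicit terms, which is what the body of _gradients above tries to compute.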

    def _compute_GP_variables(self):
        """

@@ -112,8 +168,9 @@ class Laplace(likelihood):
        $$\tilde{\Sigma} = W^{-1}$$

        """
        epsilon = 1e-6
        epsilon = 1e14

        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
        #dtrtri -> L -> L_i
        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
        #((L.T*W)_i + I)f_hat = y_tilde
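
The comment above compresses a small linear-algebra identity; a standalone sketch that checks it numerically (test values only, nothing here is GPy API beyond the dtrtrs import already used in this file):

import numpy as np
from scipy.linalg import cholesky
from scipy.linalg.lapack import dtrtrs

# With K = L L.T and diagonal W:  W^-1 K^-1 = W^-1 L^-T L^-1 = (L.T W)^-1 L^-1,
# so Wi_Ki (and hence Y_tilde) can be built from triangular solves alone.
N = 5
A = np.random.randn(N, N)
K = np.dot(A, A.T) + N*np.eye(N)                 # positive definite test "kernel"
W = np.diag(np.random.rand(N) + 0.5)             # diagonal W, as for factorising likelihoods
L = cholesky(K, lower=True)
Li = dtrtrs(L, np.eye(N), lower=True)[0]         # L^-1 via a triangular solve
Lt_W = np.dot(L.T, W)                            # upper triangular since W is diagonal
Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]     # (L.T W)^-1 L^-1
direct = np.dot(np.linalg.inv(W), np.linalg.inv(K))
print np.allclose(Lt_W_i_Li, direct)             # expect True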

@@ -122,11 +179,12 @@ class Laplace(likelihood):
        Lt_W = np.dot(L.T, self.W)

        ##Check it isn't singular!
        if cond(Lt_W) > 1e14:
        if cond(Lt_W) > epsilon:
            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"

        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
        self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
        Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)

        #f.T(Ki + W)f
        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))

@@ -156,16 +214,16 @@ class Laplace(likelihood):
        #)

        ##Check it isn't singular!
        if cond(self.W) > 1e14:
        if cond(self.W) > epsilon:
            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"

        Sigma_tilde = inv(self.W) # Damn
        self.Sigma_tilde = inv(self.W) # Damn

        #Convert to float as it's (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
        self.Y = Y_tilde
        self.YYT = np.dot(self.Y, self.Y.T)
        self.covariance_matrix = Sigma_tilde
        self.covariance_matrix = self.Sigma_tilde
        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]

    def fit_full(self, K):

@@ -20,6 +20,16 @@ class likelihood_function:
    def __init__(self,location=0,scale=1):
        self.location = location
        self.scale = scale
        self.log_concave = True

    def _get_params(self):
        return np.zeros(0)

    def _get_param_names(self):
        return []

    def _set_params(self, p):
        pass

class probit(likelihood_function):
    """

@@ -149,12 +159,22 @@ class student_t(likelihood_function):
        d2ln p(yi|fi)_d2fifj
        """
    def __init__(self, deg_free, sigma=2):
        super(student_t, self).__init__()
        self.v = deg_free
        self.sigma = sigma

        #FIXME: This should be in the superclass
        self.log_concave = False

    def _get_params(self):
        return np.asarray(self.sigma)

    def _get_param_names(self):
        return ["t_noise_variance"]

    def _set_params(self, x):
        self.sigma = float(x)
        #self.covariance_matrix = np.eye(self.N)*self._variance
        #self.precision = 1./self._variance

    @property
    def variance(self, extra_data=None):
        return (self.v / float(self.v - 2)) * (self.sigma**2)

@@ -222,6 +242,40 @@ class student_t(likelihood_function):
        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
        return np.squeeze(hess)

    def d3link(self, y, f, extra_data=None):
        """
        Third order derivative of the link function (log-likelihood) at y, given f, w.r.t. f

        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape
        #NB f-y not y-f
        e = f - y
        d3link_d3f = ( (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
                       / ((e**2 + (self.sigma**2)*self.v)**3)
                     )
        return d3link_d3f
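
A quick way to gain confidence in this formula is a finite-difference check against the hessian expression a few lines above; the sketch below is standalone (arbitrary test values, formulas re-implemented inline rather than calling the class):

import numpy as np

v, sigma = 4.0, 1.3        # assumed test values for deg_free and sigma
y = 0.7

def hess(f):
    # d2 ln p(y|f) / df2, matching the hess expression above
    e = y - f
    return ((v + 1)*(e**2 - v*sigma**2)) / (((sigma**2)*v + e**2)**2)

def d3link(f):
    # third derivative, matching d3link above
    e = f - y
    return (-2*(v + 1)*(e**3 - 3*(sigma**2)*v*e)) / ((e**2 + (sigma**2)*v)**3)

f, h = -0.2, 1e-6
fd = (hess(f + h) - hess(f - h)) / (2*h)         # central difference of the hessian
print np.allclose(fd, d3link(f), rtol=1e-4)      # expect True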

    def link_hess_grad_sigma(self, y, f, extra_data=None):
        """
        Gradient of the hessian w.r.t. the sigma parameter

        $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        hess_grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
                            / ((e**2 + (self.sigma**2)*self.v)**3)
                          )
        return hess_grad_sigma
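
And the matching check for the sigma gradient, again standalone and using the docstring's expression (note the factor of v in the numerator, which that formula requires):

import numpy as np

v, y, f = 4.0, 0.7, -0.2       # assumed test values
e = y - f

def hess(sigma):
    return ((v + 1)*(e**2 - v*sigma**2)) / (((sigma**2)*v + e**2)**2)

def hess_grad_sigma(sigma):
    return (2*sigma*v*(v + 1)*((sigma**2)*v - 3*(e**2))) / ((e**2 + (sigma**2)*v)**3)

sigma, h = 1.3, 1e-6
fd = (hess(sigma + h) - hess(sigma - h)) / (2*h)
print np.allclose(fd, hess_grad_sigma(sigma), rtol=1e-4)   # expect True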

    def _gradients(self, y, f, extra_data=None):
        return [self.link_hess_grad_sigma] # list as we might learn many parameters

    def predictive_values(self, mu, var):
        """
        Compute the mean and confidence interval (5th and 95th percentiles) of the prediction

@@ -8,7 +8,7 @@ from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
from ..likelihoods import EP
from ..likelihoods import EP, Laplace

class GP(model):
    """

@@ -128,7 +128,19 @@ class GP(model):

        For the likelihood parameters, pass in alpha = K^-1 y
        """
        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
        if isinstance(self.likelihood, Laplace):
            dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
            #Need to pass in a matrix of ones to get access to the raw dK_dthetaK values without them being chained
            fake_dL_dKs = np.ones(self.dL_dK.shape)
            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)

            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
        else:
            dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
        return np.hstack((dL_dthetaK, dL_dthetaL))
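
Since the commit message doubts these gradients work yet, the natural smoke test is a finite-difference comparison on a small model that exercises the Laplace branch above. The wiring below is a guess (the Laplace and GP constructor signatures and the student_t import path are not shown in this diff); the point is only to run the model's gradient checker over both dL_dthetaK and dL_dthetaL:

import numpy as np
import GPy

X = np.random.randn(20, 1)
Y = np.sin(X) + 0.1*np.random.randn(20, 1)

#Hypothetical wiring -- constructor arguments and import paths are assumptions, not shown in this diff
t_lik = GPy.likelihoods.likelihood_functions.student_t(deg_free=4, sigma=0.1)
laplace_lik = GPy.likelihoods.Laplace(Y, t_lik)
m = GPy.models.GP(X, laplace_lik, kernel=GPy.kern.rbf(1))

print m.checkgrad(verbose=True)   #finite-difference check of the analytic gradients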

    def _raw_predict(self,_Xnew,slices=None, full_cov=False):
        """

@@ -14,6 +14,21 @@ import types
#import scipy.lib.lapack.flapack
import scipy as sp

def det_ln_diag(A):
    """
    Log determinant of a diagonal matrix
    $$\ln |A| = \ln \prod_i{A_{ii}} = \sum_i{\ln A_{ii}}$$
    """
    return np.log(np.diagonal(A)).sum()

def pddet(A):
    """
    Log determinant of a positive definite matrix, computed via its Cholesky factor
    """
    L = cholesky(A)
    logdetA = 2*sum(np.log(np.diag(L)))
    return logdetA
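
A two-line sanity check for these helpers against numpy's slogdet (a sketch; the import path is the one this commit's laplace.py uses):

import numpy as np
from GPy.util.linalg import pddet, det_ln_diag   # import added by this commit

A = np.random.randn(5, 5)
A = np.dot(A, A.T) + 5*np.eye(5)                       # symmetric positive definite test matrix
print np.allclose(pddet(A), np.linalg.slogdet(A)[1])   # expect True

D = np.diag(np.random.rand(5) + 0.1)                   # diagonal matrix with positive entries
print np.allclose(det_ln_diag(D), np.linalg.slogdet(D)[1])   # expect True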

def trace_dot(a,b):
    """
    efficiently compute the trace of the matrix product of a and b

@@ -166,8 +181,8 @@ def PCA(Y, Q):
    """
    if not np.allclose(Y.mean(axis=0), 0.0):
        print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"

    #Y -= Y.mean(axis=0)

    #Y -= Y.mean(axis=0)

    Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
    [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]