From 1420aa532c5df8eaf4e6db5b89e77f4b375ebf1c Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 19 Apr 2013 12:23:00 +0100
Subject: [PATCH] Attempted to introduce gradient methods; I doubt they will
 work yet

---
 GPy/examples/__init__.py                |   1 +
 GPy/likelihoods/Laplace.py              | 120 ++++++++++++++++++------
 GPy/likelihoods/likelihood_functions.py |  58 +++++++++++-
 GPy/models/GP.py                        |  16 +++-
 GPy/util/linalg.py                      |  19 +++-
 5 files changed, 177 insertions(+), 37 deletions(-)

diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 551bff54..68832e77 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
+import laplace_approximations
 import classification
 import regression
 import dimensionality_reduction
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 4d94ba0f..b1b41957 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -4,28 +4,9 @@ import GPy
 from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
 
-#TODO: Move this to utils
-
-
-def det_ln_diag(A):
-    """
-    log determinant of a diagonal matrix
-    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
-    """
-    return np.log(np.diagonal(A)).sum()
-
-
-def pddet(A):
-    """
-    Determinant of a positive definite matrix
-    """
-    L = cholesky(A)
-    logdetA = 2*sum(np.log(np.diag(L)))
-    return logdetA
-
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
@@ -75,17 +56,92 @@ class Laplace(likelihood):
         return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
-        return np.zeros(0)
+        return np.asarray(self.likelihood_function._get_params())
 
     def _get_param_names(self):
-        return []
+        return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        pass # TODO: Laplace likelihood might want to take some parameters...
+        self.likelihood_function._set_params(p)
+
+    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+        Find the gradients of the marginal likelihood w.r.t. both thetaK and thetaL
+
+        dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
+        changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted
+
+        Similar terms arise when finding the gradients with respect to changes in the likelihood
+        parameters
+        """
+        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
+
+    def _shared_gradients_components(self):
+        dL_dytil = -np.dot(pdinv(self.K + self.Sigma_tilde)[0], self.Y) # -(K + Sigma_tilde)^{-1} y_tilde
+        dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        return dL_dytil, dytil_dfhat
+
+    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+                          #explicit               #implicit                     #implicit
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+
+        :param dL_d_K_Sigma: derivative of the marginal with respect to K_prior + Sigma_tilde (the covariance of y_tilde)
+        :param dK_dthetaK: explicit derivative of the kernel with respect to its hyperparameters
+        :returns: dL_dthetaK - gradients of the marginal likelihood w.r.t. changes in the K hyperparameters
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+
+        I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
+        #FIXME: Careful, dK_dthetaK is the derivative of the prior K only, not of the marginal!
+        #Matrix products; this assumes dK_dthetaK is the full NxN matrix for a single hyperparameter
+        dfhat_dthetaK = mdot(I_KW_i, dK_dthetaK, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data))
+
+        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK)
+
+        #FIXME: Careful, dL_dK = dL_d_K_Sigma
+        #FIXME: Careful, the -D*0.5 in dL_d_K_Sigma might need to be -0.5?
+        dL_dSigma = dL_d_K_Sigma
+        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+                           #explicit #implicit
+        dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: careful of this sum, it should sum over fhat, not over the thetas
+        dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
+
+        #Inner products: sum over fhat for the ytil term, over matrix entries for the Sigma term
+        dL_dthetaK_implicit = np.sum(dL_dytil*dytil_dthetaK) + np.sum(dL_dSigma*dSigma_dthetaK)
+        return dL_dthetaK_implicit
 
     def _gradients(self, partial):
-        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
-        raise NotImplementedError
+        """
+        Gradients with respect to the likelihood parameters
+
+        The gradient differs between parameters of the kernel, \theta_{K}, and
+        parameters of the likelihood, \theta_{L}:
+
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
+        dL_dK*dK_dthetaL = 0
+
+        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
+        dytil_dfhat = Sigma*Ki + I
+
+        fhat = K * dlogp(y|fhat)_dfhat                              from Rasmussen & Williams p. 125
+        dfhat_dthetaK = (I + KW)i * dK_dthetaK * dlogp(y|fhat)_dfhat  from Rasmussen & Williams p. 125
+
+        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
+        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
+        d2phi_d2fhat = Hessian function of the likelihood
+
+        partial = dL_dK
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+        dfhat_dthetaL = self.likelihood_function.df_dtheta()
+
+        dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have an implicit component as well?
+        dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
+        dL_dSigma = partial # partial is dL_dK, but K here is K+Sigma_tilde... which is fine in this case
+
+        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+        dL_dthetaL = 0 + np.sum(dL_dytil*dytil_dthetaL) + np.sum(dL_dSigma*dSigma_dthetaL) # explicit dL_dK*dK_dthetaL term is zero
+        return dL_dthetaL
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
 
     def _compute_GP_variables(self):
        """
@@ -112,8 +168,9 @@ class Laplace(likelihood):
 
         $$\tilde{\Sigma} = W^{-1}$$
         """
-        epsilon = 1e-6
+        epsilon = 1e14 # condition number threshold used below to warn of near-singularity
 
+        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*W)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
         Lt_W = np.dot(L.T, self.W)
 
         ##Check it isn't singular!
-        if cond(Lt_W) > 1e14:
+        if cond(Lt_W) > epsilon:
             print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
 
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
-        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
@@ -156,16 +214,16 @@ class Laplace(likelihood):
         #)
 
         ##Check it isn't singular!
-        if cond(self.W) > 1e14:
+        if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        Sigma_tilde = inv(self.W) # Damn
+        self.Sigma_tilde = inv(self.W) # Damn, a dense inverse; kept on self as the gradients need Sigma_tilde
 
         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = Sigma_tilde
+        self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
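Note on the implicit term that _Kgradients is built around: the identity
dfhat_dthetaK = (I + KW)^{-1} * dK_dthetaK * dlogp(y|fhat)_dfhat is cheap to
sanity-check by finite differences before wiring it into the chain rule. The
following is a minimal standalone sketch, not GPy code: the RBF kernel, the
fixed Student-t parameters and all names are illustrative assumptions, and
dK_dthetaK is taken to be the full NxN matrix for a single hyperparameter
(the lengthscale ell).

    import numpy as np

    v, sigma = 4.0, 1.0  # fixed Student-t parameters for this check

    def rbf(X, ell):
        d2 = (X - X.T)**2
        return np.exp(-0.5 * d2 / ell**2) + 1e-8 * np.eye(X.shape[0])

    def drbf_dell(X, ell):
        # derivative of the RBF kernel w.r.t. its lengthscale (jitter drops out)
        d2 = (X - X.T)**2
        return np.exp(-0.5 * d2 / ell**2) * d2 / ell**3

    def dlogp(f, y):
        # gradient of sum_i log p(y_i|f_i) for the Student-t likelihood
        e = f - y
        return -(v + 1) * e / (e**2 + sigma**2 * v)

    def d2logp(f, y):
        # diagonal of the Hessian of the log-likelihood
        e = f - y
        return (v + 1) * (e**2 - sigma**2 * v) / (e**2 + sigma**2 * v)**2

    def find_fhat(K, y, iters=100):
        # Newton iterations for the posterior mode fhat = K dlogp(y|fhat)
        f = np.zeros_like(y)
        for _ in range(iters):
            W = -np.diag(d2logp(f, y))
            f = np.linalg.solve(np.eye(len(y)) + K.dot(W), K.dot(dlogp(f, y) + W.dot(f)))
        return f

    X = np.linspace(0, 5, 8)[:, None]
    y = 0.5 * np.sin(X[:, 0])
    ell = 1.0

    K = rbf(X, ell)
    fhat = find_fhat(K, y)
    W = -np.diag(d2logp(fhat, y))
    implicit = np.linalg.solve(np.eye(8) + K.dot(W), drbf_dell(X, ell).dot(dlogp(fhat, y)))

    # central finite difference of the mode w.r.t. the lengthscale
    eps = 1e-6
    fd = (find_fhat(rbf(X, ell + eps), y) - find_fhat(rbf(X, ell - eps), y)) / (2 * eps)
    print(np.allclose(implicit, fd, atol=1e-5))  # True if the identity holds

The same pattern extends to checking the full dL_dthetaK chain once the shapes
in _Kgradients settle down.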
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c759e15f..6e72b029 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -20,6 +20,16 @@ class likelihood_function:
     def __init__(self,location=0,scale=1):
         self.location = location
         self.scale = scale
+        self.log_concave = True
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self, p):
+        pass
 
 class probit(likelihood_function):
     """
@@ -149,12 +159,22 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
+        likelihood_function.__init__(self) # old-style base class, so super() cannot be used here
         self.v = deg_free
         self.sigma = sigma
-
-        #FIXME: This should be in the superclass
         self.log_concave = False
 
+    def _get_params(self):
+        return np.asarray([self.sigma])
+
+    def _get_param_names(self):
+        return ["t_noise_variance"]
+
+    def _set_params(self, x):
+        self.sigma = float(x)
+        #self.covariance_matrix = np.eye(self.N)*self._variance
+        #self.precision = 1./self._variance
+
     @property
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)
@@ -222,6 +242,40 @@ class student_t(likelihood_function):
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
 
+    def d3link(self, y, f, extra_data=None):
+        """
+        Third order derivative of the link function (log-likelihood) at y given f, w.r.t. f
+
+        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        #NB f-y not y-f
+        e = f - y
+        d3link_d3f = ( (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+                       / ((e**2 + (self.sigma**2)*self.v)**3)
+                     )
+        return d3link_d3f
+
+    def link_hess_grad_sigma(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian w.r.t. the sigma parameter
+
+        $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f # only e**2 appears below, so the sign convention does not matter here
+        hess_grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
+                            / ((e**2 + (self.sigma**2)*self.v)**3)
+                          )
+        return hess_grad_sigma
+
+    def _gradients(self, y, f, extra_data=None):
+        return [self.link_hess_grad_sigma(y, f, extra_data)] # a list, as we might learn many parameters
+
     def predictive_values(self, mu, var):
         """
         Compute mean, and confidence interval (percentiles 5 and 95) of the prediction
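The closed forms above are easy to get wrong (the factor of v in
link_hess_grad_sigma, the sign convention for e in d3link), so a standalone
finite-difference check is cheap insurance. A minimal sketch with plain
functions and made-up test values, not the GPy classes:

    import numpy as np

    v = 4.0  # fixed degrees of freedom

    def hess(f, y, sigma):
        # second derivative of the Student-t log-likelihood in f
        e = f - y
        return (v + 1) * (e**2 - v * sigma**2) / (v * sigma**2 + e**2)**2

    def d3link(f, y, sigma):
        # third derivative in f, as in student_t.d3link
        e = f - y
        return -2 * (v + 1) * (e**3 - 3 * sigma**2 * v * e) / (e**2 + sigma**2 * v)**3

    def hess_grad_sigma(f, y, sigma):
        # gradient of the Hessian in sigma; note the factor of v from the docstring
        e = f - y
        return 2 * sigma * v * (v + 1) * (sigma**2 * v - 3 * e**2) / (e**2 + sigma**2 * v)**3

    f, y, sigma, eps = 0.3, -0.7, 0.9, 1e-6
    fd_f = (hess(f + eps, y, sigma) - hess(f - eps, y, sigma)) / (2 * eps)
    fd_s = (hess(f, y, sigma + eps) - hess(f, y, sigma - eps)) / (2 * eps)
    print(np.allclose(fd_f, d3link(f, y, sigma)))           # True
    print(np.allclose(fd_s, hess_grad_sigma(f, y, sigma)))  # True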
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index cfda0cfe..1024b5ef 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -8,7 +8,7 @@ from .. import kern
 from ..core import model
 from ..util.linalg import pdinv,mdot
 from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
-from ..likelihoods import EP
+from ..likelihoods import EP, Laplace
 
 class GP(model):
     """
@@ -128,7 +128,19 @@ class GP(model):
 
         For the likelihood parameters, pass in alpha = K^-1 y
         """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        if isinstance(self.likelihood, Laplace):
+            dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            #Pass a ones matrix so dK_dthetaK is not chained with dL_dK; NB this still contracts to sum_ij dK_ij/dthetaK per parameter, not the full matrix
+            fake_dL_dKs = np.ones(self.dL_dK.shape)
+            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+
+            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
+            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        else:
+            dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        return np.hstack((dL_dthetaK, dL_dthetaL))
 
     def _raw_predict(self,_Xnew,slices=None, full_cov=False):
         """
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index f88099a4..cb899397 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -14,6 +14,21 @@ import types
 #import scipy.lib.lapack.flapack
 import scipy as sp
 
+def det_ln_diag(A):
+    """
+    Log-determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
+    return np.log(np.diagonal(A)).sum()
+
+def pddet(A):
+    """
+    Log-determinant of a positive definite matrix
+    """
+    L = jitchol(A) # use this module's Cholesky helper; a bare cholesky may not be in scope here
+    logdetA = 2*np.sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a,b):
     """
     efficiently compute the trace of the matrix product of a and b
     """
@@ -166,8 +181,8 @@
     """
     if not np.allclose(Y.mean(axis=0), 0.0):
         print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
-
-    #Y -= Y.mean(axis=0)
+
+    #Y -= Y.mean(axis=0)
     Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
     [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]
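Note that both helpers moved into GPy/util/linalg.py return log-determinants.
A small standalone sketch of the identities they rely on, checked against
numpy's slogdet (the functions are re-implemented here so the check runs
without GPy, with np.linalg.cholesky standing in for jitchol):

    import numpy as np

    def det_ln_diag(A):
        # log|A| for diagonal A is the sum of the logs of the diagonal entries
        return np.log(np.diagonal(A)).sum()

    def pddet(A):
        # log|A| for positive definite A: with A = L L^T, |A| = prod(diag(L))^2
        L = np.linalg.cholesky(A)
        return 2 * np.sum(np.log(np.diag(L)))

    rng = np.random.RandomState(0)
    D = np.diag(rng.rand(5) + 0.1)   # diagonal with positive entries
    B = rng.randn(5, 5)
    A = B.dot(B.T) + 5 * np.eye(5)   # positive definite

    print(np.allclose(det_ln_diag(D), np.linalg.slogdet(D)[1]))  # True
    print(np.allclose(pddet(A), np.linalg.slogdet(A)[1]))        # True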