diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 2af51f2b..ce3f870f 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -96,7 +96,10 @@ class Laplace(likelihood): #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically a = mdot(dWi_dfhat, Ki, self.f_hat) - dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) + b = np.dot(self.Sigma_tilde, Ki) + dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) + #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N) + self.dytil_dfhat = dytil_dfhat return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -330,19 +333,25 @@ class Laplace(likelihood): def fit_full(self, K): """ - The laplace approximation algorithm + The laplace approximation algorithm, find K and expand hessian For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability :K: Covariance matrix """ self.K = K.copy() - #assert np.all(self.K.T == self.K) - #self.K_safe = K.copy() + + #Find mode if self.rasm: self.f_hat = self.rasm_mode(K) else: self.f_hat = self.ncg_mode(K) + #Compute hessian and other variables at mode + self._compute_likelihood_variables() + + def _compute_likelihood_variables(self): #At this point get the hessian matrix + #print "Data: ", self.data + #print "fhat: ", self.f_hat self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data)) if not self.likelihood_function.log_concave: @@ -352,14 +361,14 @@ class Laplace(likelihood): #This is a property only held by non-log-concave likelihoods #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though - self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W) + self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) self.Bi, _, _, B_det = pdinv(self.B) Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i) b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None] - solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b))) + solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b))) a = b - mdot(self.W_12, solve_chol) self.f_Ki_f = np.dot(self.f_hat.T, a) self.ln_K_det = pddet(self.K) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 0d194c01..646293d2 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -10,8 +10,7 @@ from scipy.special import gammaln, gamma from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf class likelihood_function: - """ - Likelihood class for doing Expectation propagation + """ Likelihood class for doing Expectation propagation :param Y: observed output (Nx1 numpy.darray) ..Note:: Y values allowed depend on the likelihood_function used @@ -241,6 +240,7 @@ class student_t(likelihood_function): y = np.squeeze(y) f = np.squeeze(f) assert y.shape == f.shape + e = y - f hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return np.squeeze(hess) diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py new file mode 100644 index 00000000..394950d5 --- /dev/null +++ b/GPy/testing/laplace_approx.tests.py @@ -0,0 +1,123 @@ +import unittest +import numpy as np + +import GPy +from GPy.models import GP +from GPy.util.linalg import pdinv, tdot +from scipy import linalg + +class LikelihoodGradParam(GP): + def __init__(self, X, likelihood_function, kernel, param_name=None, function=None, **kwargs): + super(LikelihoodGradParam, self).__init__(X, likelihood_function, kernel) + self.param_name = param_name + self.func = function + #self.func_params = kwargs + #self.parameter = self.likelihood.__getattribute__(self.param_name) + + def _get_param_names(self): + f_hats = ["f_{}".format(i) for i in range(len(self.likelihood.f_hat))] + return f_hats + + def _get_params(self): + return np.hstack([np.squeeze(self.likelihood.f_hat)]) + #return np.hstack([self.likelihood.__getattribute__(self.param_name)]) + + def hack_dL_dK(self): + self.K = self.kern.K(self.X) + self.K += self.likelihood.covariance_matrix + + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + + # the gradient of the likelihood wrt the covariance matrix + if self.likelihood.YYT is None: + alpha, _ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y, lower=1) + self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki) + else: + tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1) + tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1) + self.dL_dK = 0.5 * (tmp - self.D * self.Ki) + + def _set_params(self, x): + self.likelihood.f_hat = x.reshape(self.N, 1) + self.likelihood._compute_likelihood_variables() + self.hack_dL_dK() + + def log_likelihood(self): + return self.func(self.likelihood)[0, 0] + + def _log_likelihood_gradients(self): + #gradient = self.likelihood.__getattribute__(self.param_name) + self.likelihood._compute_likelihood_variables() + self.likelihood._gradients(partial=np.diag(self.dL_dK)) + gradient = getattr(self.likelihood, self.param_name) + #Need to sum over fhats? For dytil_dfhat... + #gradient = np.flatten(gradient, axis=0) + #return gradient[:, 0] + return gradient[0, :] + + +class LaplaceTests(unittest.TestCase): + def setUp(self): + real_var = 0.1 + #Start a function, any function + #self.X = np.linspace(0.0, 10.0, 30)[:, None] + self.X = np.random.randn(2,1) + #self.X = np.ones((10,1)) + Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var + self.Y = Y/Y.max() + self.kernel = GPy.kern.rbf(self.X.shape[1]) + + deg_free = 10000 + real_sd = np.sqrt(real_var) + initial_sd_guess = 1 + + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=initial_sd_guess) + self.stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) + self.stu_t_likelihood.fit_full(self.kernel.K(self.X)) + self.m = LikelihoodGradParam(self.X, self.stu_t_likelihood, self.kernel, None, None) + self.m.constrain_fixed('rbf_v', 1.0898) + self.m.constrain_fixed('rbf_l', 1.8651) + + def tearDown(self): + self.m = None + + def test_dy_dfhat(self): + def ytil(likelihood): + Sigma_tilde = likelihood.Sigma_tilde + K = likelihood.K + Ki, _, _, _ = pdinv(K) + f_hat = likelihood.f_hat + Sigma, _, _, _ = pdinv(Sigma_tilde) + return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat) + + self.m.func = ytil + self.m.param_name = 'dytil_dfhat' + self.m.randomize() + #try: + self.m.checkgrad(verbose=1) + assert self.m.checkgrad() + #except: + #import ipdb;ipdb.set_trace() + + + #def test_dL_dytil(self): + #def L(likelihood): + ##-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z + #Sigma_tilde = likelihood.Sigma_tilde + #Ki = likelihood.K + #f_hat = likelihood.f_hat + #Sigma, _, _, _ = pdinv(Sigma_tilde) + #return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat) + + #self.m.func = L + #self.m.param_name = 'dL_dytil' + #m.randomize() + ##try: + #m.checkgrad(verbose=1) + #assert m.checkgrad() + #except: + #import ipdb;ipdb.set_trace() + +if __name__ == "__main__": + unittest.main() +