diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 060b617a..031ed16e 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -55,9 +55,9 @@ class GP(Model):
         self.add_parameter(self.kern)
         self.add_parameter(self.likelihood)
 
-        self.parameters_changed()
+    def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
         self._dL_dK = grad_dict['dL_dK']
 
@@ -65,9 +65,6 @@ class GP(Model):
     def log_likelihood(self):
         return self._log_marginal_likelihood
 
-    def dL_dtheta_K(self):
-        return self.kern.dK_dtheta(self.posterior.dL_dK, self.X)
-
     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
         Internal helper function for making predictions, does not account
diff --git a/GPy/core/model.py b/GPy/core/model.py
index f4de0405..dc108641 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -325,7 +325,6 @@ class Model(Parameterized):
             if self._fail_count >= self._allowed_failures:
                 raise e
             self._fail_count += 1
-            import ipdb;ipdb.set_trace()
         obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
         return obj_grads
 
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 5a04fbfd..c2025202 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -74,6 +74,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         self.size = sum(p.size for p in self._parameters_)
         if not self._has_fixes():
             self._fixes_ = None
+        self._param_slices_ = []
         self._connect_parameters()
         self._added_names_ = set()
         del self._in_init_
@@ -213,7 +214,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
             return
         i = 0
         sizes = [0]
-        self._param_slices_ = []
        for p in self._parameters_:
             p._direct_parent_ = self
             p._highest_parent_ = self
@@ -315,7 +315,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         return n
 
     def _get_params(self): # don't overwrite this anymore!
-        return numpy.hstack([x._get_params() for x in self._parameters_])
+        return numpy.hstack([x._get_params() for x in self._parameters_ if x.size>0])
 
     def _set_params(self, params, update=True): # don't overwrite this anymore!
         [p._set_params(params[s], update=update) for p,s in itertools.izip(self._parameters_,self._param_slices_)]
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index ab1f3bf0..089f2b38 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -42,8 +42,8 @@ class SparseGP(GP):
                 raise NotImplementedError, "what to do what to do?"
print "defaulting to ", inference_method, "for latent function inference" + self.Z = Param('inducing inputs', Z) - self.Z = Z self.num_inducing = Z.shape[0] if not (X_variance is None): @@ -52,10 +52,8 @@ class SparseGP(GP): GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name) - self.Z = Param('inducing inputs', self.Z) self.add_parameter(self.Z, index=0) - self.add_parameter(self.kern) - self.add_parameter(self.likelihood) + def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py index a6d3c9b5..9a66d3b6 100644 --- a/GPy/inference/latent_function_inference/laplace.py +++ b/GPy/inference/latent_function_inference/laplace.py @@ -11,7 +11,8 @@ #http://gaussianprocess.org/gpml/code. import numpy as np -from ...util.linalg import mdot, jitchol, pddet, dpotrs +from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs +from ...util.misc import param_to_array from functools import partial as partial_func from posterior import Posterior import warnings @@ -27,7 +28,6 @@ class LaplaceInference(object): (using Newton-Raphson) of the unnormalised posterior """ - self.NORMAL_CONST = (0.5 * np.log(2 * np.pi)) self._mode_finding_tolerance = 1e-7 self._mode_finding_max_iter = 40 @@ -39,58 +39,133 @@ class LaplaceInference(object): Returns a Posterior class containing essential quantities of the posterior """ + #make Y a normal array! + Y = param_to_array(Y) + # Compute K K = kern.K(X) #Find mode if self.bad_fhat: - Ki_f_init = np.random.randn(*Y.shape)/50 + Ki_f_init = np.zeros_like(Y) else: Ki_f_init = self._previous_Ki_fhat - self.f_hat, self._previous_Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata) - - stop + f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata) #Compute hessian and other variables at mode - self._compute_likelihood_variables() + log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, Y_metadata) - likelihood.gradient = self.likelihood_gradients() - dL_dK = self._Kgradients() - kern.update_gradients_full(dL_dK) + #likelihood.gradient = self.likelihood_gradients() + kern.update_gradients_full(dL_dK, X) - return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK} + self._previous_Ki_fhat = Ki_fhat.copy() + return Posterior(woodbury_vector=woodbury_vector, woodbury_inv = K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK} - def _shared_gradients_components(self): + def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None): """ - A helper function to compute some common quantities - """ - d3lik_d3fhat = likelihood.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data) - dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5? 
-        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
-        return dL_dfhat, I_KW_i
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+        Influenced by GPML (BSD) code, all errors are our own
 
-    def _Kgradients(self):
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
+        :param Y: The data
+        :type Y: np.ndarray
+        :param likelihood: the likelihood of the latent function value for the given data
+        :type likelihood: a GPy.likelihood object
+        :param Ki_f_init: the initial guess at the mode
+        :type Ki_f_init: np.ndarray
+        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
+        :type Y_metadata: np.ndarray | None
+        :returns: f_hat, mode on which to make laplace approximation
+        :rtype: np.ndarray
         """
-        Gradients with respect to prior kernel parameters dL_dK to be chained
-        with dK_dthetaK to give dL_dthetaK
-        :returns: dL_dK matrix
-        :rtype: Matrix (1 x num_kernel_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
-        #Explicit
-        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        expl_b = self.Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
-        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
+
+
+        #define the objective function (to be maximised)
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
+
+        difference = np.inf
+        iteration = 0
+        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
+
+            W_f = W*f
+            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
+
+            b = W_f + grad # R+W p46 line 6.
+            #W12BiW12Kb, B_logdet = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
+            W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+            W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb # full_step_Ki_f = a in R&W p46 line 6.
+            dKi_f = full_step_Ki_f - Ki_f
+
+            #define an objective for the line search (minimize this one)
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                return -obj(Ki_f_trial, f_trial)
+
+            #use scipy for the line search, then compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
+            iteration += 1
+
+        #Warn of bad fits
+        if difference > self._mode_finding_tolerance:
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+                self.bad_fhat = True
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now fine again")
+
+        return f, Ki_f
+
+
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, Y_metadata):
+        """
+        At the mode, compute the hessian and effective covariance matrix.
+
+        returns: logZ : approximation to the marginal likelihood
+                 Cov : the approximation to the covariance matrix
+        """
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        W = -likelihood.d2logpdf_df2(f_hat, Y, extra_data=Y_metadata)
+
+        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        #compute vital matrices
+        C = np.dot(LiW12, K)
+        Ki_W_i = K - C.T.dot(C)
+
+        #compute the log marginal
+        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))
+
+        #compute dL_dK
+        explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
 
         #Implicit
-        impl = mdot(dlp, dL_dfhat, I_KW_i)
+        d3lik_d3fhat = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata)
+        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*d3lik_d3fhat) #why isn't this -0.5? s2 in R&W p126 line 9.
+        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
+        implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
 
-        dL_dK = expl + impl
+        dL_dK = explicit_part + implicit_part
+
+        return log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector
 
-        return dL_dK
 
     def likelihood_gradients(self):
         """
@@ -118,24 +193,7 @@ class LaplaceInference(object):
 
         return dL_dthetaL
 
-    def _compute_likelihood_variables(self):
-        """
-        At the mode, compute the hessian and effective covaraince matrix.
-        """
-        #At this point get the hessian matrix (or vector as W is diagonal)
-        self.W = -likelihood.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
-
-        if not likelihood.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
-            self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-
-        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N), likelihood.log_concave)
-
-        self.Ki_f = self.Ki_f
-        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
-
-    def _compute_B_statistics(self, K, W, a, log_concave):
+    def _compute_B_statistics(self, K, W, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal element and can be easyily inverted
@@ -144,9 +202,7 @@ class LaplaceInference(object):
         :type K: NxN matrix
         :param W: Negative hessian at a point (diagonal matrix)
         :type W: Vector of diagonal values of hessian (1xN)
-        :param a: Matrix to calculate W12BiW12a
-        :type a: Matrix NxN
-        :returns: (W12BiW12, ln_B_det)
+        :returns: (W12BiW12, L_B, Li_W12)
         """
         if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
@@ -155,169 +211,13 @@ class LaplaceInference(object):
             # To cause the posterior to become less certain than the prior and likelihood,
             # This is a property only held by non-log-concave likelihoods
 
-        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         B = np.eye(K.shape[0]) + W_12*K*W_12.T
         L = jitchol(B)
 
-        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
-        ln_B_det = 2.*np.sum(np.log(np.diag(L)))
-        return W12BiW12a, ln_B_det
-
-    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
-        """
-        Rasmussen's numerically stable mode finding
-        For nomenclature see Rasmussen & Williams 2006
-        Influenced by GPML (BSD) code, all errors are our own
-
-        :param K: Covariance matrix evaluated at locations X
-        :type K: NxD matrix
-        :param Y: The data
-        :type Y: np.ndarray
-        :param likelihood: the likelihood of the latent function value for the given data
-        :type likelihood: a GPy.likelihood object
-        :param Ki_f_init: the initial guess at the mode
-        :type Ki_f_init: np.ndarray
-        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
-        :type Y_metadata: np.ndarray | None
-        :returns: f_hat, mode on which to make laplace approxmiation
-        :rtype: np.ndarray
-        """
-
-        ##Start f's at zero originally or if we have gone off track, try restarting
-        #if self.old_Ki_f is None or self.bad_fhat:
-            #old_Ki_f = np.random.rand(self.N, 1)/50.0
-            ##old_Ki_f = self.Y
-            #f = np.dot(K, old_Ki_f)
-        #else:
-            ##Start at the old best point
-            #old_Ki_f = self.old_Ki_f.copy()
-            #f = self.f_hat.copy()
-
-        Ki_f = Ki_f_init.copy()
-        f = np.dot(K, Ki_f)
-
-
-        #define the objective function (to be maximised)
-        def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
-
-        difference = np.inf
-        i = 0
-        while difference > self._mode_finding_tolerance and i < self._mode_finding_max_iter:
-            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
-
-            W_f = W*f
-            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
-
-            b = W_f + grad
-            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
-
-            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
-            full_step_Ki_f = b - W12BiW12Kb
-            dKi_f = full_step_Ki_f - Ki_f
-
-            #define an objective for the line search
-            def inner_obj(step_size):
-                Ki_f_trial = Ki_f + step_size*dKi_f
-                f_trial = np.dot(K, Ki_f_trial)
-                print -obj(Ki_f_trial, f_trial),
-                return -obj(Ki_f_trial, f_trial)
-
-            #use scipy for the line search, the compute new values of f, Ki_f
-            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
-            Ki_f_new = Ki_f + step*dKi_f
-            f_new = np.dot(K, Ki_f_new)
-
-            print ""
-            print obj(Ki_f, f), obj(Ki_f_new, f_new), step
-            print ""
-
-            #i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
-            #Find the stepsize that minimizes the objective function using a brent line search
-            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
-            #steps than get this exact then make a step, if B was bigger it might be the other way around though
-            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
-            #f = self.tmp_f.copy()
-            #Ki_f = self.tmp_Ki_f.copy()
-
-            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
-            Ki_f = Ki_f_new
-            f = f_new
-            i += 1
-
-
-        #Warn of bad fits
-        if difference > self._mode_finding_tolerance:
-            if not self.bad_fhat:
-                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
-                self.bad_fhat = True
-        elif self.bad_fhat:
-            self.bad_fhat = False
-            warnings.warn("f_hat now fine again")
-
-        return f, Ki_f
-
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the laplace approximation to the posterior, but normalised
-
-        GPy expects a likelihood to be gaussian, so need to caluclate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a laplace approximation to a non-gaussian likelihood but with
-        a gaussian likelihood
-
-        Firstly,
-        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
-        we wish to find the hessian \Sigma^{\tilde}
-        that has the same curvature but using our new simulated data Y^{\tilde}
-        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
-        So we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
-        which we want to optimize instead, by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default
-
-        Thirdly,
-        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = likelihood.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST
-                   )
-
-        #Convert to float as its (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
+        LiW12, _ = dtrtrs(L, np.diag(W_12[:,0]), lower=1, trans=0)
+        K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25
+        return K_Wi_i, L, LiW12
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index c0974dc5..a0f4104c 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -48,6 +48,7 @@ class Posterior(object):
 
         if ((woodbury_chol is not None) and (woodbury_vector is not None))\
         or ((woodbury_inv is not None) and (woodbury_vector is not None))\
+        or ((woodbury_inv is not None) and (mean is not None))\
         or ((mean is not None) and (cov is not None)):
             pass # we have sufficient to compute the posterior
         else:
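
Reviewer note, not part of the patch: a minimal standalone sketch of the linear algebra the new _compute_B_statistics returns, namely K_Wi_i = W^{1/2} B^{-1} W^{1/2} = (K + W^{-1})^{-1} with B = I + W^{1/2} K W^{1/2} (R&W 2006, p. 126). It uses plain NumPy only; the helper name, the toy RBF covariance and np.linalg.cholesky/np.linalg.solve are illustrative stand-ins for the patch's jitchol and dtrtrs, not GPy API.

# Illustrative sketch only; assumes a PSD kernel matrix K and a strictly
# positive (N, 1) vector W of diagonal negative-Hessian entries.
import numpy as np

def compute_B_statistics(K, W):
    W_12 = np.sqrt(W)                                  # elementwise sqrt of the diagonal of W
    B = np.eye(K.shape[0]) + W_12 * K * W_12.T         # B = I + W^1/2 K W^1/2
    L = np.linalg.cholesky(B)                          # the patch uses jitchol(B)
    LiW12 = np.linalg.solve(L, np.diagflat(W_12))      # L^-1 W^1/2; the patch uses dtrtrs
    K_Wi_i = np.dot(LiW12.T, LiW12)                    # W^1/2 B^-1 W^1/2 = (K + W^-1)^-1
    return K_Wi_i, L, LiW12

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X = rng.randn(6, 1)
    K = np.exp(-0.5 * (X - X.T) ** 2) + 1e-8 * np.eye(6)   # toy RBF covariance
    W = rng.rand(6, 1) + 0.1
    K_Wi_i, L, LiW12 = compute_B_statistics(K, W)
    direct = np.linalg.inv(K + np.diagflat(1.0 / W))        # naive, numerically fragile route
    print(np.allclose(K_Wi_i, direct))                      # expect True

Working through B rather than inverting K + W^{-1} directly is what keeps the computation well conditioned when entries of W are very small or very large, which is why the patch routes both the mode-finding step and mode_computations through this helper.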