diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 3152d4b5..ab1f3bf0 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -53,26 +53,45 @@ class SparseGP(GP): GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name) self.Z = Param('inducing inputs', self.Z) - self.add_parameter(self.Z, gradient=self.dL_dZ, index=0) - self.add_parameter(self.kern, gradient=self.dL_dtheta) - self.add_parameter(self.likelihood, gradient=lambda:self.likelihood._gradients(partial=self.partial_for_likelihood)) + self.add_parameter(self.Z, index=0) + self.add_parameter(self.kern) + self.add_parameter(self.likelihood) def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) - #The derivative of the bound wrt the inducing inputs Z - self.Z.gradient = self.kern.dK_dX(self.dL_dKmm, self.Z) - if self.has_uncertain_inputs: + #The derivative of the bound wrt the inducing inputs Z + self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + if self.X_variance is None: + self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X) + else: self.Z.gradient += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance) self.Z.gradient += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance) - else: - self.Z.gradient += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X) def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): """ Make a prediction for the latent function values """ - #TODO!!! + if X_variance_new is None: + Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) + mu = np.dot(Kx.T, self.posterior.woodbury_vector) + if full_cov: + Kxx = self.kern.K(Xnew, which_parts=which_parts) + var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting + else: + Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) + var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0) + else: + # assert which_parts=='all', "swithching out parts of variational kernels is not implemented" + Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts + mu = np.dot(Kx, self.Cpsi1V) + if full_cov: + raise NotImplementedError, "TODO" + else: + Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new) + psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new) + var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) + return mu, var[:,None] def _getstate(self): diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py index 82de1419..3e5022aa 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs +from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify class Posterior(object): """ @@ -62,6 +62,7 @@ class Posterior(object): #compute this lazily self._precision = None + self._woodbury_inv = None @property def mean(self): @@ -91,6 +92,14 @@ class Posterior(object): _, _, self._woodbury_chol, _ = pdinv(Wi) return self._woodbury_chol + @property + def woodbury_inv(self): + if self._woodbury_inv is None: + self._woodbury_inv, _ = dpotri(self.woodbury_chol) + symmetrify(self._woodbury_inv) + return self._woodbury_inv + + @property def woodbury_vector(self): if self._woodbury_vector is None: diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py index 7ceeff11..b5ba4c2d 100644 --- a/GPy/inference/latent_function_inference/varDTC.py +++ b/GPy/inference/latent_function_inference/varDTC.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) from posterior import Posterior -from ...util.linalg import jitchol, backsub_both_sides, tdot +from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs import numpy as np log_2_pi = np.log(2*np.pi) @@ -30,7 +30,7 @@ class VarDTC(object): if (N>D): return Y else: - #if Y in self.cache, return self.Cache[Y], else stor Y in cache and return L. + #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L. raise NotImplementedError, 'TODO' #TODO def get_VVTfactor(self, Y, prec): @@ -66,9 +66,9 @@ class VarDTC(object): # The rather complex computations of A if uncertain_inputs: if het_noise: - psi2_beta = (psi2 * (likelihood.precision.flatten().reshape(num_data, 1, 1))).sum(0) + psi2_beta = (psi2 * (beta.flatten().reshape(num_data, 1, 1))).sum(0) else: - psi2_beta = psi2.sum(0) * likelihood.precision + psi2_beta = psi2.sum(0) * beta evals, evecs = linalg.eigh(psi2_beta) clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable if not np.array_equal(evals, clipped_evals): @@ -89,6 +89,7 @@ class VarDTC(object): # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! VVT_factor = self.get_VVTfactor(Y, beta) + trYYT = np.sum(np.square(Y)) psi1Vf = np.dot(psi1.T, VVT_factor) # back substutue C into psi1Vf @@ -97,17 +98,6 @@ class VarDTC(object): tmp, info2 = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1) - #compute log marginal likelihood - if het_noise: - A = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y) - B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A)) - else: - A = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * likelihood.trYYT - B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A)) - C = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2)) - D = 0.5 * data_fit - log_marginal = A + B + C + D - # Compute dL_dKmm tmp = tdot(_LBi_Lmi_psi1Vf) @@ -120,17 +110,17 @@ class VarDTC(object): # Compute dL_dpsi dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten() - dL_dpsi1 = np.dot(likelihood.VVT_factor, Cpsi1Vf.T) + dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T) dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi) if het_noise: if uncertain_inputs: dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :] else: - dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * likelihood.precision.reshape(num_data, 1)).T).T + dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T dL_dpsi2 = None else: - dL_dpsi2 = likelihood.precision * dL_dpsi2_beta + dL_dpsi2 = beta * dL_dpsi2_beta if uncertain_inputs: # repeat for each of the N psi_2 matrices dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0) @@ -152,40 +142,55 @@ class VarDTC(object): Lmi_psi1, nil = dtrtrs(Lm, np.asfortranarray(psi1.T), lower=1, trans=0) _LBi_Lmi_psi1, _ = dtrtrs(LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0) - partial_for_likelihood = -0.5 * likelihood.precision + 0.5 * likelihood.V**2 - partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * likelihood.precision**2 + partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2 + partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2 - partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*likelihood.precision**2 + partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2 - partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * likelihood.precision**2 - partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * likelihood.precision**2 + partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2 + partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2 else: # likelihood is not heteroscedatic - partial_for_likelihood = -0.5 * num_data * output_dim * likelihood.precision + 0.5 * likelihood.trYYT * likelihood.precision ** 2 - partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * likelihood.precision ** 2 - np.trace(_A) * likelihood.precision) - partial_for_likelihood += likelihood.precision * (0.5 * np.sum(_A * DBi_plus_BiPBi) - data_fit) + partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2 + partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta) + partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit) + + #compute log marginal likelihood + if het_noise: + lik_1 = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y) + lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A)) + else: + lik_1 = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * trYYT + lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A)) + lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2)) + lik_4 = 0.5 * data_fit + log_marginal = lik_1 + lik_2 + lik_3 + lik_4 #put the gradients in the right places - likelihood.update_gradients(np.diag(partial_for_likelihood)) + likelihood.update_gradients(partial_for_likelihood) if uncertain_inputs: - kern.update_gradients_variational(??) grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2} + kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict) else: - kern.update_gradients_sparse(??) - grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2} + grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1} + kern.update_gradients_sparse(X=X, Z=Z, **grad_dict) #get sufficient things for posterior prediction if VVT_factor.shape[1] == Y.shape[1]: woodbury_vector = Cpsi1Vf # == Cpsi1V - woodbury_chol = ?? else: - raise NotImplementedError #TODO + psi1V = np.dot(Y.T*beta, psi1).T + tmp, _ = dtrtrs(Lm, np.asfortranarray(psi1V), lower=1, trans=0) + tmp, _ = dpotrs(LB, tmp, lower=1) + woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) + #TODO: totally wrong, fix. + woodbury_chol = np.eye(num_inducing) #construct a posterior object post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) - return Posterior, log_marginal, grad_dict + return post, log_marginal, grad_dict diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index de87ff14..68b26e3c 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -284,9 +284,10 @@ class kern(Parameterized): [p.update_gradients_full(dL_dK, X) for p in self._parameters_] def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - raise NotImplementedError + [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_] + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - raise NotImplementedError + [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_] def dK_dtheta(self, dL_dK, X, X2=None): """ diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py index 885fed96..89f6894c 100644 --- a/GPy/kern/parts/rbf.py +++ b/GPy/kern/parts/rbf.py @@ -117,7 +117,7 @@ class RBF(Kernpart): self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z) else: - self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) + self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm) #from Kmm self._K_computations(Z, None) diff --git a/GPy/kern/parts/white.py b/GPy/kern/parts/white.py index a59c9f98..c9677f28 100644 --- a/GPy/kern/parts/white.py +++ b/GPy/kern/parts/white.py @@ -32,7 +32,7 @@ class White(Kernpart): self.variance.gradient = np.trace(dL_dK) def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - raise NotImplementedError + self.variance.gradient = np.trace(dL_dKmm) + np.sum(dL_dKdiag) def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): raise NotImplementedError