diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py
index cc4e81cd..20d58a3f 100644
--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@@ -56,6 +56,7 @@ class SVGP(SparseGP):
         self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)
+        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])

         #update the variational parameter gradients:
         self.m.gradient = self.grad_dict['dL_dm']
         self.chol.gradient = self.grad_dict['dL_dchol']
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
index 7ca43b81..1b0ec19e 100644
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -5,27 +5,9 @@ import numpy as np
 from posterior import Posterior

 class SVGP(LatentFunctionInference):
-    def likelihood_quadrature(self, Y, m, v):
-        Ysign = np.where(Y==1,1,-1).flatten()
-        from scipy import stats
-        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
-
-        #assume probit for now.
-        X = self.gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
-        p = stats.norm.cdf(X)
-        p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
-        N = stats.norm.pdf(X)
-        F = np.log(p).dot(self.gh_w)
-        NoverP = N/p
-        dF_dm = (NoverP*Ysign[:,None]).dot(self.gh_w)
-        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
-        return F, dF_dm, dF_dv
-
-
     def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None):
         assert Y.shape[1]==1, "multi outputs not implemented"
-
         num_inducing = Z.shape[0]

         #expand cholesky representation
         L = choleskies.flat_to_triang(q_u_chol[:,None]).squeeze()
@@ -57,9 +39,7 @@ class SVGP(LatentFunctionInference):
         dKL_dKmm = 0.5*Kmmi - 0.5*Kmmi.dot(S).dot(Kmmi) - 0.5*Kmmim[:,None]*Kmmim[None,:]

         #quadrature for the likelihood
-        #F, dF_dmu, dF_dv = likelihood.variational_expectations(Y, mu, v)
-        F, dF_dmu, dF_dv = self.likelihood_quadrature(Y, mu, v)
-
+        F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v)

         #rescale the F term if working on a batch
         #F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
@@ -82,7 +62,7 @@ class SVGP(LatentFunctionInference):
         dL_dchol = 2.*np.dot(dL_dS, L)
         dL_dchol = choleskies.triang_to_flat(dL_dchol[:,:,None]).squeeze()

-        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol}
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 596b9dc3..ff2ab30a 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -133,7 +133,7 @@ class Bernoulli(Likelihood):
         """
         #objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f)
         p = np.where(y==1, inv_link_f, 1.-inv_link_f)
-        return np.log(np.clip(p, 1e-6 ,np.inf))
+        return np.log(np.clip(p, 1e-9, np.inf))

     def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
         """
@@ -152,7 +152,7 @@ class Bernoulli(Likelihood):
         """
         #grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f)
         #grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f))
-        ff = np.clip(inv_link_f, 1e-6, 1-1e-6)
+        ff = np.clip(inv_link_f, 1e-9, 1-1e-9)
         denom = np.where(y, ff, -(1-ff))
         return 1./denom
@@ -180,7 +180,7 @@ class Bernoulli(Likelihood):
         #d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2)
         #d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f))
         arg = np.where(y, inv_link_f, 1.-inv_link_f)
-        ret = -1./np.square(np.clip(arg, 1e-3, np.inf))
+        ret = -1./np.square(np.clip(arg, 1e-9, 1e9))
         if np.any(np.isinf(ret)):
             stop
         return ret
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 125f306f..b6540c98 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -316,3 +316,11 @@ class Gaussian(Likelihood):
         v = var_star + self.variance
         return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v

+    def variational_expectations(self, Y, m, v, gh_points=None):
+        lik_var = float(self.variance)
+        F = -0.5*np.log(2*np.pi) - 0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2.*m*Y)/lik_var
+        dF_dmu = (Y - m)/lik_var
+        dF_dv = -0.5*np.ones_like(v)/lik_var
+        dF_dlik_var = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2.*m*Y)/lik_var**2
+        dF_dtheta = [dF_dlik_var]
+        return F, dF_dmu, dF_dv, dF_dtheta
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index 203439d6..87b7315e 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -133,7 +133,7 @@ class Likelihood(Parameterized):

     def variational_expectations(self, Y, m, v, gh_points=None):
         """
-        Use Gauss-Hermite Quadrature to compute 
+        Use Gauss-Hermite Quadrature to compute

           E_p(f) [ log p(y|f) ]
           d/dm E_p(f) [ log p(y|f) ]
@@ -143,9 +143,10 @@

         if no gh_points are passed, we construct them using defualt options
         """
+        #May be broken
         if gh_points is None:
-            gh_x, gh_w = np.polynomial.hermite.hermgauss(12)
+            gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
         else:
             gh_x, gh_w = gh_points

@@ -156,15 +157,15 @@ class Likelihood(Parameterized):
         X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]

         #evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
-        # broadcast needs to be handled carefully. 
+        # broadcast needs to be handled carefully.
         logp = self.logpdf(X,Y[:,None])
         dlogp_dx = self.dlogpdf_df(X, Y[:,None])
         d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])

         #clipping for numerical stability
-        logp = np.clip(logp,-1e6,1e6)
-        dlogp_dx = np.clip(dlogp_dx,-1e6,1e6)
-        d2logp_dx2 = np.clip(d2logp_dx2,-1e6,1e6)
+        #logp = np.clip(logp,-1e9,1e9)
+        #dlogp_dx = np.clip(dlogp_dx,-1e9,1e9)
+        #d2logp_dx2 = np.clip(d2logp_dx2,-1e9,1e9)

         #average over the gird to get derivatives of the Gaussian's parameters
         F = np.dot(logp, gh_w)
@@ -176,10 +177,8 @@

         if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
             stop
-        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape)
-
-
-
+        dF_dtheta = None # Not yet implemented
+        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta

     def predictive_mean(self, mu, variance, Y_metadata=None):
         """
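
Note (not part of the patch): the closed form added to Gaussian.variational_expectations can be sanity-checked against Gauss-Hermite quadrature. Because log N(Y|f, sigma^2) is quadratic in f, the quadrature is exact up to rounding, so the two routes should agree. A minimal standalone sketch using NumPy only; all names below are illustrative, not GPy API:

    import numpy as np

    rng = np.random.RandomState(0)
    Y = rng.randn(5, 1)                      # observations
    m = rng.randn(5, 1)                      # means of q(f) = N(m, v)
    v = rng.rand(5, 1) + 0.1                 # variances of q(f)
    lik_var = 0.3                            # likelihood noise variance (thetaL)

    # closed form, as in the patched Gaussian.variational_expectations
    F = -0.5*np.log(2*np.pi) - 0.5*np.log(lik_var) \
        - 0.5*(np.square(Y) + np.square(m) + v - 2.*m*Y)/lik_var

    # Gauss-Hermite estimate of E_{f ~ N(m, v)}[log N(Y | f, lik_var)]
    gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
    X = gh_x[None, :]*np.sqrt(2.*v) + m      # (5, 20) grid over f
    logp = -0.5*np.log(2*np.pi*lik_var) - 0.5*np.square(Y - X)/lik_var
    F_gh = logp.dot(gh_w[:, None])/np.sqrt(np.pi)

    assert np.allclose(F, F_gh)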
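
The new dF_dlik_var term, which reaches the likelihood through the 'dL_dthetaL' entry of grad_dict, can likewise be checked by central finite differences. Again a standalone sketch under the same illustrative setup:

    import numpy as np

    rng = np.random.RandomState(0)
    Y, m = rng.randn(5, 1), rng.randn(5, 1)
    v = rng.rand(5, 1) + 0.1
    lik_var = 0.3

    def F_of(s2):
        # closed-form variational expectation as a function of the noise variance
        return -0.5*np.log(2*np.pi) - 0.5*np.log(s2) \
               - 0.5*(np.square(Y) + np.square(m) + v - 2.*m*Y)/s2

    # analytic gradient, as in the patched Gaussian.variational_expectations
    dF_dlik_var = -0.5/lik_var \
        + 0.5*(np.square(Y) + np.square(m) + v - 2.*m*Y)/lik_var**2

    eps = 1e-6
    fd = (F_of(lik_var + eps) - F_of(lik_var - eps))/(2.*eps)
    assert np.allclose(dF_dlik_var, fd, atol=1e-6)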
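
Finally, replacing the probit-specific likelihood_quadrature with the generic Likelihood.variational_expectations should be value-preserving for the Bernoulli case: the removed method exploited the probit symmetry log p(y|f) = log Phi(y*f) for y in {-1, 1}, while the generic route quadratures log p(y|f) directly, and the two coincide because the Gauss-Hermite nodes are symmetric. A standalone sketch of that equivalence (illustrative names, probit link assumed):

    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(1)
    Y = (rng.rand(6, 1) > 0.5).astype(float)   # binary labels in {0, 1}
    m = rng.randn(6)
    v = rng.rand(6) + 0.1

    gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
    gh_w = gh_w/np.sqrt(np.pi)

    # removed method's trick: shift the grid by y*m and evaluate log Phi
    Ysign = np.where(Y == 1, 1, -1).flatten()
    Xs = gh_x[None, :]*np.sqrt(2.*v[:, None]) + (m*Ysign)[:, None]
    F_probit = np.log(np.clip(stats.norm.cdf(Xs), 1e-9, 1.)).dot(gh_w)

    # generic route: grid centred at m, evaluate log p(y|f) per label
    Xf = gh_x[None, :]*np.sqrt(2.*v[:, None]) + m[:, None]
    p = np.where(Y == 1, stats.norm.cdf(Xf), 1. - stats.norm.cdf(Xf))
    F_generic = np.log(np.clip(p, 1e-9, 1.)).dot(gh_w)

    assert np.allclose(F_probit, F_generic)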