diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index a0ee51da..ebed29bb 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -7,5 +7,6 @@
 from parameterization.param import Param, ParamConcatenation
 from parameterization.observable_array import ObsAr
 from gp import GP
+from svgp import SVGP
 from sparse_gp import SparseGP
 from mapping import *
diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py
new file mode 100644
index 00000000..cc4e81cd
--- /dev/null
+++ b/GPy/core/svgp.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2014, James Hensman, Alex Matthews
+# Distributed under the terms of the GNU General Public License, see LICENSE.txt
+
+import numpy as np
+from ..util import choleskies
+from sparse_gp import SparseGP
+from parameterization.param import Param
+
+class SVGP(SparseGP):
+    def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None):
+        """
+        Stochastic Variational GP.
+
+        For Gaussian likelihoods, this implements
+
+            Gaussian Processes for Big Data, Hensman, Fusi and Lawrence, UAI 2013,
+
+        but without natural gradients. We use the lower-triangular
+        representation of the covariance matrix to ensure
+        positive-definiteness.
+
+        For non-Gaussian likelihoods, this implements
+
+            Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, arXiv 1411.2005
+        """
+
+        #create the SVI inference method
+        from ..inference.latent_function_inference import SVGP as svgp_inf
+        inf_method = svgp_inf()
+
+        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method=inf_method,
+                          name=name, Y_metadata=Y_metadata, normalizer=False)
+
+        #?? self.set_data(X, Y)
+
+        self.m = Param('q_u_mean', np.zeros(self.num_inducing))
+        chol = choleskies.triang_to_flat(np.eye(self.num_inducing)[:,:,None])
+        self.chol = Param('q_u_chol', chol.flatten())
+        self.link_parameter(self.chol)
+        self.link_parameter(self.m)
+
+        #self.batch_scale = 1. # how to rescale the batch likelihood in case of minibatches
+
+    def parameters_changed(self):
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
+
+        #update the kernel gradients
+        self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
+        grad = self.kern.gradient.copy()
+        self.kern.update_gradients_full(self.grad_dict['dL_dKmn'], self.Z, self.X)
+        grad += self.kern.gradient
+        self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
+        self.kern.gradient += grad
+        if not self.Z.is_fixed:  # only compute these expensive gradients if we need them
+            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)
+
+        #update the variational parameter gradients:
+        self.m.gradient = self.grad_dict['dL_dm']
+        self.chol.gradient = self.grad_dict['dL_dchol']
+
+    #def set_data(self, X, Y):
+        #assert X.shape[1]==self.Z.shape[1]
+        #self.X, self.Y = GPy.core.ObsAr(X), Y
+
+    def optimizeWithFreezingZ(self):
+        self.Z.fix()
+        self.kern.fix()
+        self.optimize('bfgs')
+        self.Z.unfix()
+        self.kern.constrain_positive()
+        self.optimize('bfgs')
+
+#class SPGPC_stoch(SPGPC):
+    #def __init__(self, X, Y, Z, kern=None, likelihood=None, batchsize=10):
+        #SPGPC.__init__(self, X[:1], Y[:1], Z, kern, likelihood)
+        #self.X_all, self.Y_all = X, Y
+        #self.batchsize = batchsize
+        #self.batch_scale = float(self.X_all.shape[0])/float(self.batchsize)
+#
+    #def stochastic_grad(self, w):
+        #i = np.random.permutation(self.X_all.shape[0])[:self.batchsize]
+        #self.set_data(self.X_all[i], self.Y_all[i])
+        #return self._grads(w)
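
A minimal usage sketch for the new class (made-up data; it assumes the RBF kernel and Bernoulli likelihood classes available elsewhere in GPy, while the SVGP constructor signature is the one defined above):

    import numpy as np
    import GPy

    # toy 1-d classification problem (hypothetical data)
    X = np.random.rand(500, 1)
    Y = np.where(np.sin(6.*X) > 0., 1., 0.)  # binary labels in {0, 1}
    Z = np.random.rand(20, 1)                # inducing inputs

    kernel = GPy.kern.RBF(1)
    likelihood = GPy.likelihoods.Bernoulli()

    m = GPy.core.SVGP(X, Y, Z, kernel, likelihood)
    m.optimizeWithFreezingZ()  # the two-stage BFGS helper defined above

Note that the inference method (next file) hard-codes probit quadrature, so the likelihood argument is carried along but the bound currently assumes binary labels.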
diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index c507f7e1..67f57638 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, James Hensman
+# Copyright (c) 2012-2014, Max Zwiessele, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 __doc__ = """
@@ -69,6 +69,7 @@
 from expectation_propagation_dtc import EPDTC
 from dtc import DTC
 from fitc import FITC
 from var_dtc_parallel import VarDTC_minibatch
+from svgp import SVGP
 # class FullLatentFunctionData(object):
 #
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
new file mode 100644
index 00000000..7ca43b81
--- /dev/null
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -0,0 +1,88 @@
+from . import LatentFunctionInference
+from ...util import linalg
+from ...util import choleskies
+import numpy as np
+from posterior import Posterior
+
+class SVGP(LatentFunctionInference):
+    def likelihood_quadrature(self, Y, m, v):
+        Ysign = np.where(Y==1, 1, -1).flatten()
+        from scipy import stats
+        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
+
+        #assume probit for now.
+        X = self.gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
+        p = stats.norm.cdf(X)
+        p = np.clip(p, 1e-9, 1.-1e-9)  # for numerical stability
+        N = stats.norm.pdf(X)
+        F = np.log(p).dot(self.gh_w)
+        NoverP = N/p
+        dF_dm = (NoverP*Ysign[:,None]).dot(self.gh_w)
+        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
+        return F, dF_dm, dF_dv
+
+    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None):
+        assert Y.shape[1]==1, "multi outputs not implemented"
+
+        num_inducing = Z.shape[0]
+
+        #expand cholesky representation
+        L = choleskies.flat_to_triang(q_u_chol[:,None]).squeeze()
+        S = L.dot(L.T)
+        Si, _ = linalg.dpotri(np.asfortranarray(L), lower=1)
+        logdetS = 2.*np.sum(np.log(np.abs(np.diag(L))))
+
+        if np.any(np.isinf(Si)):
+            print "warning: Cholesky representation unstable"
+            S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
+            Si, Lnew, _, _ = linalg.pdinv(S)
+
+        #compute kernel related stuff
+        Kmm = kern.K(Z)
+        Knm = kern.K(X, Z)
+        Knn_diag = kern.Kdiag(X)
+        Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
+
+        #compute the marginal means and variances of q(f)
+        A = np.dot(Knm, Kmmi)
+        mu = np.dot(A, q_u_mean)
+        v = Knn_diag - np.sum(A*Knm, 1) + np.sum(A*A.dot(S), 1)
+
+        #compute the KL term
+        Kmmim = np.dot(Kmmi, q_u_mean)
+        KL = -0.5*logdetS - 0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi*S) + 0.5*q_u_mean.dot(Kmmim)
+        dKL_dm = Kmmim
+        dKL_dS = 0.5*(Kmmi - Si)
+        dKL_dKmm = 0.5*Kmmi - 0.5*Kmmi.dot(S).dot(Kmmi) - 0.5*Kmmim[:,None]*Kmmim[None,:]
+
+        #quadrature for the likelihood
+        #F, dF_dmu, dF_dv = likelihood.variational_expectations(Y, mu, v)
+        F, dF_dmu, dF_dv = self.likelihood_quadrature(Y, mu, v)
+
+        #rescale the F term if working on a batch
+        #F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
+
+        #derivatives of quadratured likelihood
+        Adv = A.T*dF_dv  # as if dF_dv is diagonal
+        Admu = A.T.dot(dF_dmu)
+        AdvA = np.dot(Adv, A)
+        tmp = AdvA.dot(S).dot(Kmmi)
+        dF_dKmm = -Admu[:,None].dot(Kmmim[None,:]) + AdvA - tmp - tmp.T
+        dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T)  # necessary? GPy bug?
+        dF_dKmn = 2.*(Kmmi.dot(S) - np.eye(num_inducing)).dot(Adv) + Kmmim[:,None]*dF_dmu[None,:]
+        dF_dm = Admu
+        dF_dS = AdvA
+
+        #sum (gradients of) expected likelihood and KL part
+        log_marginal = F.sum() - KL
+        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS - dKL_dS, dF_dKmm - dKL_dKmm, dF_dKmn
+
+        dL_dchol = 2.*np.dot(dL_dS, L)
+        dL_dchol = choleskies.triang_to_flat(dL_dchol[:,:,None]).squeeze()
+
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag':dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol}
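
For reference, the quantities assembled in inference() are the standard sparse-GP variational moments and KL term. Writing A = K_{nm} K_{mm}^{-1}, with variational mean m, covariance S = L L^T and M inducing points, the code computes

    \mu = A m
    v   = \mathrm{diag}\left( K_{nn} - A K_{mn} + A S A^{\top} \right)
    \mathrm{KL}\left[ q(u) \,\|\, p(u) \right]
        = \tfrac{1}{2}\left( \log|K_{mm}| - \log|S| - M
          + \mathrm{tr}(K_{mm}^{-1} S) + m^{\top} K_{mm}^{-1} m \right)

which matches the KL line term by term (-0.5*logdetS - 0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi*S) + 0.5*q_u_mean.dot(Kmmim)); the bound returned is log_marginal = F.sum() - KL.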
diff --git a/GPy/util/choleskies.py b/GPy/util/choleskies.py
new file mode 100644
index 00000000..3f37fc3f
--- /dev/null
+++ b/GPy/util/choleskies.py
@@ -0,0 +1,130 @@
+# Copyright James Hensman and Max Zwiessele 2014
+# Licensed under the GNU GPL version 3.0
+
+import numpy as np
+from scipy import weave
+import linalg
+
+def safe_root(N):
+    i = np.sqrt(N)
+    j = int(i)
+    if i != j:
+        raise ValueError, "N is not square!"
+    return j
+
+def flat_to_triang(flat):
+    """Take a matrix N x D and return an M x M x D array where
+
+    N = M(M+1)/2
+
+    the lower triangular portion of the d'th slice of the result is filled by the d'th column of flat.
+ """ + N, D = flat.shape + M = (-1 + safe_root(8*N+1))/2 + ret = np.zeros((M, M, D)) + flat = np.ascontiguousarray(flat) + + code = """ + int count = 0; + for(int m=0; milk', Ki, LL) + #self._loglik = np.sum([np.sum(np.log(np.abs(np.diag()))) for i in range(self.L.shape[-1])]) +#