mean functions now working for svgp. with tests

James Hensman 2015-03-26 16:20:17 +00:00
parent 02f2bb5c76
commit 72de607199
5 changed files with 83 additions and 18 deletions

View file

@@ -19,7 +19,7 @@ class SparseGP(GP):
This model allows (approximate) inference using variational DTC or FITC
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
these.
This is not for missing data, as the implementation for missing data involves
some inefficient optimization-routine decisions.
See the missing-data SparseGP implementation in :py:class:`~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch`.
@@ -39,7 +39,7 @@ class SparseGP(GP):
"""
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False):
#pick a sensible inference method
if inference_method is None:
@@ -53,7 +53,7 @@ class SparseGP(GP):
self.Z = Param('inducing inputs', Z)
self.num_inducing = Z.shape[0]
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
logger.info("Adding Z as parameter")
self.link_parameter(self.Z, index=0)
@@ -61,7 +61,7 @@ class SparseGP(GP):
def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior)
def set_Z(self, Z, trigger_update=True):
if trigger_update: self.update_model(False)
self.unlink_parameter(self.Z)
@@ -110,8 +110,8 @@ class SparseGP(GP):
def _raw_predict(self, Xnew, full_cov=False, kern=None):
"""
Make a prediction for the latent function values.
For certain inputs we give back a full_cov of shape NxN;
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off,
we take only the diagonal elements across N.
@@ -136,6 +136,9 @@ class SparseGP(GP):
else:
Kxx = kern.Kdiag(Xnew)
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
#add in the mean function
if self.mean_function is not None:
mu += self.mean_function.f(Xnew)
else:
psi0_star = self.kern.psi0(self.Z, Xnew)
psi1_star = self.kern.psi1(self.Z, Xnew)
@@ -165,4 +168,5 @@ class SparseGP(GP):
var[i] = var_
else:
var[i] = np.diag(var_)+p0-t2
return mu, var

View file

@@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf
class SVGP(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
"""
Stochastic Variational GP.
@@ -38,7 +38,7 @@ class SVGP(SparseGP):
#create the SVI inference method
inf_method = svgp_inf()
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
name=name, Y_metadata=Y_metadata, normalizer=False)
self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@@ -48,7 +48,7 @@ class SVGP(SparseGP):
self.link_parameter(self.m)
def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
#update the kernel gradients
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
@@ -65,6 +65,13 @@ class SVGP(SparseGP):
self.m.gradient = self.grad_dict['dL_dm']
self.chol.gradient = self.grad_dict['dL_dchol']
if self.mean_function is not None:
self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
g = self.mean_function.gradient[:].copy()
self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
self.mean_function.gradient[:] += g
self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
def set_data(self, X, Y):
"""
Set the data without calling parameters_changed to avoid wasted computation
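
Not part of the diff, just a hedged sketch of the gradient bookkeeping above: GPy's update_gradients() overwrites mapping.gradient, so when the same mean function is evaluated at both X and Z the two parameter-gradient contributions have to be accumulated by hand, and Z picks up an extra gradients_X term because the inducing inputs are themselves optimised. The dL_dmfX / dL_dmfZ arrays here are hypothetical stand-ins for the entries of grad_dict.

import numpy as np
import GPy

mf = GPy.mappings.Linear(1, 1)
X = np.random.rand(20, 1)
Z = np.random.rand(5, 1)
dL_dmfX = np.random.randn(20, 1)   # hypothetical dL/d mean_function(X)
dL_dmfZ = np.random.randn(5, 1)    # hypothetical dL/d mean_function(Z)

mf.update_gradients(dL_dmfX, X)    # writes the X contribution into mf.gradient
g = mf.gradient[:].copy()          # stash it: the next call overwrites
mf.update_gradients(dL_dmfZ, Z)    # writes the Z contribution
mf.gradient[:] += g                # total parameter gradient = X part + Z part
dZ = mf.gradients_X(dL_dmfZ, Z)    # the extra term added to Z.gradient above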

View file

@@ -15,7 +15,7 @@ class Posterior(object):
the function at any new point x_* by integrating over this posterior.
"""
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
"""
woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@@ -67,6 +67,7 @@ class Posterior(object):
#option 2:
self._mean = mean
self._covariance = cov
self._prior_mean = prior_mean
#compute this lazily
self._precision = None
@@ -175,7 +176,7 @@ class Posterior(object):
$$
"""
if self._woodbury_vector is None:
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
return self._woodbury_vector
@property
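
As a minimal sketch (plain numpy/scipy stand-ins, not GPy internals) of what this change does: subtracting the prior mean before the solve makes woodbury_vector = Kmm^{-1}(m - m(Z)), so the predictive latent mean assembled in sparse_gp.py above becomes m(X*) + K_{*m} Kmm^{-1}(m - m(Z)).

import numpy as np
from scipy.linalg import cho_factor, cho_solve

def k(A, B, ell=2.0):
    # a toy RBF kernel, standing in for kern.K
    return np.exp(-0.5*np.square(A[:, None, 0] - B[None, :, 0])/ell**2)

def mf(A):
    # a toy linear mean function, standing in for mean_function.f
    return 2.0*A + 1.0

rng = np.random.RandomState(1)
Z = np.linspace(0, 10, 6).reshape(-1, 1)
Xnew = 10*rng.rand(3, 1)
m = rng.randn(6, 1)                          # posterior mean of u

Kmm = k(Z, Z) + 1e-8*np.eye(6)
K_chol = cho_factor(Kmm, lower=True)
wv = cho_solve(K_chol, m - mf(Z))            # analogue of dpotrs(K_chol, mean - prior_mean)
assert np.allclose(Kmm.dot(wv), m - mf(Z))   # i.e. wv = Kmm^{-1}(m - m(Z))

# predictive latent mean, assembled the way SparseGP._raw_predict does above
mu = k(Xnew, Z).dot(wv) + mf(Xnew)           # Kx.T . woodbury_vector, then += mean_function.f(Xnew)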

View file

@@ -7,7 +7,7 @@ from posterior import Posterior
class SVGP(LatentFunctionInference):
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
assert mean_function is None, "inference with a mean function not implemented"
num_inducing = Z.shape[0]
num_data, num_outputs = Y.shape
@@ -23,6 +23,15 @@ class SVGP(LatentFunctionInference):
#S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
#Si, Lnew, _,_ = linalg.pdinv(S)
#compute mean function stuff
if mean_function is not None:
prior_mean_u = mean_function.f(Z)
prior_mean_f = mean_function.f(X)
else:
prior_mean_u = np.zeros((num_inducing, num_outputs))
prior_mean_f = np.zeros((num_data, num_outputs))
#compute kernel related stuff
Kmm = kern.K(Z)
Knm = kern.K(X, Z)
@@ -31,17 +40,31 @@ class SVGP(LatentFunctionInference):
#compute the marginal means and variances of q(f)
A = np.dot(Knm, Kmmi)
mu = np.dot(A, q_u_mean)
mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
#compute the KL term
Kmmim = np.dot(Kmmi, q_u_mean)
KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
KL = KLs.sum()
dKL_dm = Kmmim
#gradient of the KL term (assuming zero mean function)
dKL_dm = Kmmim.copy()
dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
if mean_function is not None:
#adjust KL term for mean function
Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
KL += -np.sum(q_u_mean*Kmmi_mfZ)
KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
#adjust gradient for mean function
dKL_dm -= Kmmi_mfZ
dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
#compute gradients for mean_function
dKL_dmfZ = Kmmi_mfZ - Kmmim
#quadrature for the likelihood
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
@@ -51,11 +74,9 @@ class SVGP(LatentFunctionInference):
if dF_dthetaL is not None:
dF_dthetaL = dF_dthetaL.sum(1)*batch_scale
#derivatives of expected likelihood
#derivatives of expected likelihood, assuming zero mean function
Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
Admu = A.T.dot(dF_dmu)
#AdvA = np.einsum('ijk,jl->ilk', Adv, A)
#AdvA = np.dot(A.T, Adv).swapaxes(0,1)
AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@@ -65,6 +86,14 @@ class SVGP(LatentFunctionInference):
dF_dm = Admu
dF_dS = AdvA
#adjust gradient to account for mean function
if mean_function is not None:
dF_dmfX = dF_dmu.copy()
dF_dmfZ = -Admu
dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
dF_dKmm += Admu.dot(Kmmi_mfZ.T)
#sum (gradients of) expected likelihood and KL part
log_marginal = F.sum() - KL
dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
@@ -72,4 +101,8 @@ class SVGP(LatentFunctionInference):
dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
dL_dchol = choleskies.triang_to_flat(dL_dchol)
return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
if mean_function is not None:
grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
grad_dict['dL_dmfX'] = dF_dmfX
return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
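
A small numpy check, not part of the commit, of the KL bookkeeping above: for a prior p(u) = N(m0, Kmm) with m0 = mean_function.f(Z), the exact KL(q(u) || p(u)) equals the zero-prior-mean KL computed first plus the two correction terms added in the mean-function branch. All variables below are local stand-ins, not GPy objects.

import numpy as np

rng = np.random.RandomState(0)
M = 5
A = rng.randn(M, M); Kmm = A.dot(A.T) + M*np.eye(M)   # stand-in for kern.K(Z)
B = rng.randn(M, M); S = B.dot(B.T) + M*np.eye(M)     # stand-in for the q(u) covariance
m = rng.randn(M, 1)                                   # q_u_mean
m0 = rng.randn(M, 1)                                  # prior_mean_u = mean_function.f(Z)

Kmmi = np.linalg.inv(Kmm)
logdetKmm = np.linalg.slogdet(Kmm)[1]
logdetS = np.linalg.slogdet(S)[1]

# exact KL(N(m, S) || N(m0, Kmm)), written out directly
d = m - m0
KL_direct = 0.5*(np.trace(Kmmi.dot(S)) + float(d.T.dot(Kmmi).dot(d)) - M + logdetKmm - logdetS)

# zero-prior-mean KL (as computed first above) plus the two mean-function corrections
Kmmim = Kmmi.dot(m)
KL_zero = 0.5*(np.trace(Kmmi.dot(S)) + float(m.T.dot(Kmmim)) - M + logdetKmm - logdetS)
Kmmi_mfZ = Kmmi.dot(m0)
KL_adjusted = KL_zero - np.sum(m*Kmmi_mfZ) + 0.5*np.sum(Kmmi_mfZ*m0)

assert np.allclose(KL_direct, KL_adjusted)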

View file

@@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase):
self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
def test_grad(self):
assert self.m.checkgrad(step=1e-4)
class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
"""
Inference in the SVGP with a Poisson likelihood and a mean function
"""
def setUp(self):
X = np.linspace(0,10,100).reshape(-1,1)
Z = np.linspace(0,10,10).reshape(-1,1)
latent_f = np.exp(0.1*X * 0.05*X**2)
Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
mf = GPy.mappings.Linear(1,1)
lik = GPy.likelihoods.Poisson()
k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
def test_grad(self):
assert self.m.checkgrad(step=1e-4)
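
For context, a hedged usage sketch (mirroring the new test above; the optimiser call, iteration count, and data generation are illustrative, not taken from the commit) of fitting an SVGP with a linear mean function:

import numpy as np
import GPy

X = np.linspace(0, 10, 100).reshape(-1, 1)
Z = np.linspace(0, 10, 10).reshape(-1, 1)
latent_f = np.exp(0.1*X * 0.05*X**2)
Y = np.random.poisson(latent_f)

m = GPy.core.SVGP(X, Y, Z=Z,
                  kernel=GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6),
                  likelihood=GPy.likelihoods.Poisson(),
                  mean_function=GPy.mappings.Linear(1, 1))

m.optimize(max_iters=200)     # full-batch optimisation; minibatching via the batchsize argument is also possible
mu, var = m._raw_predict(X)   # the latent mean now includes mean_function.f(X)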