mean functions now working for svgp. with tests

This commit is contained in:
James Hensman 2015-03-26 16:20:17 +00:00
parent 02f2bb5c76
commit 72de607199
5 changed files with 83 additions and 18 deletions

View file

@ -19,7 +19,7 @@ class SparseGP(GP):
This model allows (approximate) inference using variational DTC or FITC This model allows (approximate) inference using variational DTC or FITC
(Gaussian likelihoods) as well as non-conjugate sparse methods based on (Gaussian likelihoods) as well as non-conjugate sparse methods based on
these. these.
This is not for missing data, as the implementation for missing data involves This is not for missing data, as the implementation for missing data involves
some inefficient optimization routine decisions. some inefficient optimization routine decisions.
See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'. See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
@ -39,7 +39,7 @@ class SparseGP(GP):
""" """
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False): name='sparse gp', Y_metadata=None, normalizer=False):
#pick a sensible inference method #pick a sensible inference method
if inference_method is None: if inference_method is None:
@ -53,7 +53,7 @@ class SparseGP(GP):
self.Z = Param('inducing inputs', Z) self.Z = Param('inducing inputs', Z)
self.num_inducing = Z.shape[0] self.num_inducing = Z.shape[0]
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer) GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
logger.info("Adding Z as parameter") logger.info("Adding Z as parameter")
self.link_parameter(self.Z, index=0) self.link_parameter(self.Z, index=0)
@ -61,7 +61,7 @@ class SparseGP(GP):
def has_uncertain_inputs(self): def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior) return isinstance(self.X, VariationalPosterior)
def set_Z(self, Z, trigger_update=True): def set_Z(self, Z, trigger_update=True):
if trigger_update: self.update_model(False) if trigger_update: self.update_model(False)
self.unlink_parameter(self.Z) self.unlink_parameter(self.Z)
@ -110,8 +110,8 @@ class SparseGP(GP):
def _raw_predict(self, Xnew, full_cov=False, kern=None): def _raw_predict(self, Xnew, full_cov=False, kern=None):
""" """
Make a prediction for the latent function values. Make a prediction for the latent function values.
For certain inputs we give back a full_cov of shape NxN, For certain inputs we give back a full_cov of shape NxN,
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of, if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
we take only the diagonal elements across N. we take only the diagonal elements across N.
@ -136,6 +136,9 @@ class SparseGP(GP):
else: else:
Kxx = kern.Kdiag(Xnew) Kxx = kern.Kdiag(Xnew)
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
#add in the mean function
if self.mean_function is not None:
mu += self.mean_function.f(Xnew)
else: else:
psi0_star = self.kern.psi0(self.Z, Xnew) psi0_star = self.kern.psi0(self.Z, Xnew)
psi1_star = self.kern.psi1(self.Z, Xnew) psi1_star = self.kern.psi1(self.Z, Xnew)
@ -165,4 +168,5 @@ class SparseGP(GP):
var[i] = var_ var[i] = var_
else: else:
var[i] = np.diag(var_)+p0-t2 var[i] = np.diag(var_)+p0-t2
return mu, var return mu, var

View file

@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf
class SVGP(SparseGP): class SVGP(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None): def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
""" """
Stochastic Variational GP. Stochastic Variational GP.
@ -38,7 +38,7 @@ class SVGP(SparseGP):
#create the SVI inference method #create the SVI inference method
inf_method = svgp_inf() inf_method = svgp_inf()
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method, SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
name=name, Y_metadata=Y_metadata, normalizer=False) name=name, Y_metadata=Y_metadata, normalizer=False)
self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1]))) self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@ -48,7 +48,7 @@ class SVGP(SparseGP):
self.link_parameter(self.m) self.link_parameter(self.m)
def parameters_changed(self): def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0])) self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
#update the kernel gradients #update the kernel gradients
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z) self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
@ -65,6 +65,13 @@ class SVGP(SparseGP):
self.m.gradient = self.grad_dict['dL_dm'] self.m.gradient = self.grad_dict['dL_dm']
self.chol.gradient = self.grad_dict['dL_dchol'] self.chol.gradient = self.grad_dict['dL_dchol']
if self.mean_function is not None:
self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
g = self.mean_function.gradient[:].copy()
self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
self.mean_function.gradient[:] += g
self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
def set_data(self, X, Y): def set_data(self, X, Y):
""" """
Set the data without calling parameters_changed to avoid wasted computation Set the data without calling parameters_changed to avoid wasted computation

View file

@ -15,7 +15,7 @@ class Posterior(object):
the function at any new point x_* by integrating over this posterior. the function at any new point x_* by integrating over this posterior.
""" """
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None): def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
""" """
woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@ -67,6 +67,7 @@ class Posterior(object):
#option 2: #option 2:
self._mean = mean self._mean = mean
self._covariance = cov self._covariance = cov
self._prior_mean = prior_mean
#compute this lazily #compute this lazily
self._precision = None self._precision = None
@ -175,7 +176,7 @@ class Posterior(object):
$$ $$
""" """
if self._woodbury_vector is None: if self._woodbury_vector is None:
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean) self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
return self._woodbury_vector return self._woodbury_vector
@property @property

View file

@ -7,7 +7,7 @@ from posterior import Posterior
class SVGP(LatentFunctionInference): class SVGP(LatentFunctionInference):
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0): def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
assert mean_function is None, "inference with a mean function not implemented"
num_inducing = Z.shape[0] num_inducing = Z.shape[0]
num_data, num_outputs = Y.shape num_data, num_outputs = Y.shape
@ -23,6 +23,15 @@ class SVGP(LatentFunctionInference):
#S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S)) #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
#Si, Lnew, _,_ = linalg.pdinv(S) #Si, Lnew, _,_ = linalg.pdinv(S)
#compute mean function stuff
if mean_function is not None:
prior_mean_u = mean_function.f(Z)
prior_mean_f = mean_function.f(X)
else:
prior_mean_u = np.zeros((num_inducing, num_outputs))
prior_mean_f = np.zeros((num_data, num_outputs))
#compute kernel related stuff #compute kernel related stuff
Kmm = kern.K(Z) Kmm = kern.K(Z)
Knm = kern.K(X, Z) Knm = kern.K(X, Z)
@ -31,17 +40,31 @@ class SVGP(LatentFunctionInference):
#compute the marginal means and variances of q(f) #compute the marginal means and variances of q(f)
A = np.dot(Knm, Kmmi) A = np.dot(Knm, Kmmi)
mu = np.dot(A, q_u_mean) mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1) v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
#compute the KL term #compute the KL term
Kmmim = np.dot(Kmmi, q_u_mean) Kmmim = np.dot(Kmmi, q_u_mean)
KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0) KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
KL = KLs.sum() KL = KLs.sum()
dKL_dm = Kmmim #gradient of the KL term (assuming zero mean function)
dKL_dm = Kmmim.copy()
dKL_dS = 0.5*(Kmmi[:,:,None] - Si) dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T) dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
if mean_function is not None:
#adjust KL term for mean function
Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
KL += -np.sum(q_u_mean*Kmmi_mfZ)
KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
#adjust gradient for mean fucntion
dKL_dm -= Kmmi_mfZ
dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
#compute gradients for mean_function
dKL_dmfZ = Kmmi_mfZ - Kmmim
#quadrature for the likelihood #quadrature for the likelihood
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata) F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
@ -51,11 +74,9 @@ class SVGP(LatentFunctionInference):
if dF_dthetaL is not None: if dF_dthetaL is not None:
dF_dthetaL = dF_dthetaL.sum(1)*batch_scale dF_dthetaL = dF_dthetaL.sum(1)*batch_scale
#derivatives of expected likelihood #derivatives of expected likelihood, assuming zero mean function
Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
Admu = A.T.dot(dF_dmu) Admu = A.T.dot(dF_dmu)
#AdvA = np.einsum('ijk,jl->ilk', Adv, A)
#AdvA = np.dot(A.T, Adv).swapaxes(0,1)
AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)]) AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi) tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@ -65,6 +86,14 @@ class SVGP(LatentFunctionInference):
dF_dm = Admu dF_dm = Admu
dF_dS = AdvA dF_dS = AdvA
#adjust gradient to account for mean function
if mean_function is not None:
dF_dmfX = dF_dmu.copy()
dF_dmfZ = -Admu
dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
dF_dKmm += Admu.dot(Kmmi_mfZ.T)
#sum (gradients of) expected likelihood and KL part #sum (gradients of) expected likelihood and KL part
log_marginal = F.sum() - KL log_marginal = F.sum() - KL
dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
@ -72,4 +101,8 @@ class SVGP(LatentFunctionInference):
dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)]) dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
dL_dchol = choleskies.triang_to_flat(dL_dchol) dL_dchol = choleskies.triang_to_flat(dL_dchol)
return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL} grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
if mean_function is not None:
grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
grad_dict['dL_dmfX'] = dF_dmfX
return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict

View file

@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase):
self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k) self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
def test_grad(self): def test_grad(self):
assert self.m.checkgrad(step=1e-4) assert self.m.checkgrad(step=1e-4)
class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
"""
Inference in the SVGP with a Bernoulli likelihood
"""
def setUp(self):
X = np.linspace(0,10,100).reshape(-1,1)
Z = np.linspace(0,10,10).reshape(-1,1)
latent_f = np.exp(0.1*X * 0.05*X**2)
Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
mf = GPy.mappings.Linear(1,1)
lik = GPy.likelihoods.Poisson()
k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
def test_grad(self):
assert self.m.checkgrad(step=1e-4)