Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-08 11:32:39 +02:00)

commit 72de607199 (parent 02f2bb5c76)

    mean functions now working for svgp. with tests

5 changed files with 83 additions and 18 deletions
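The hunks below thread a mean_function argument through SparseGP, SVGP, the Posterior object and the SVGP inference code, and add a gradient test. As a rough usage sketch modelled on the new Poisson test in this commit (the toy rate below is illustrative, not taken from the repository):

    import numpy as np
    import GPy

    # toy 1-D count data; the rate function here is made up for illustration
    X = np.linspace(0, 10, 100).reshape(-1, 1)
    Z = np.linspace(0, 10, 10).reshape(-1, 1)
    Y = np.random.poisson(np.exp(0.1 * X)).reshape(-1, 1)

    mf = GPy.mappings.Linear(1, 1)                  # parametric mean function
    lik = GPy.likelihoods.Poisson()
    k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)

    # SVGP now accepts mean_function; checkgrad exercises the new gradients
    m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
    assert m.checkgrad(step=1e-4)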
@@ -19,7 +19,7 @@ class SparseGP(GP):
     This model allows (approximate) inference using variational DTC or FITC
     (Gaussian likelihoods) as well as non-conjugate sparse methods based on
     these.
 
     This is not for missing data, as the implementation for missing data involves
     some inefficient optimization routine decisions.
     See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
@@ -39,7 +39,7 @@ class SparseGP(GP):
 
     """
 
-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False):
         #pick a sensible inference method
         if inference_method is None:
@@ -53,7 +53,7 @@ class SparseGP(GP):
         self.Z = Param('inducing inputs', Z)
         self.num_inducing = Z.shape[0]
 
-        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+        GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
 
         logger.info("Adding Z as parameter")
         self.link_parameter(self.Z, index=0)
@@ -61,7 +61,7 @@ class SparseGP(GP):
 
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)
 
     def set_Z(self, Z, trigger_update=True):
         if trigger_update: self.update_model(False)
         self.unlink_parameter(self.Z)
@@ -110,8 +110,8 @@ class SparseGP(GP):
 
     def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
         Make a prediction for the latent function values.
 
         For certain inputs we give back a full_cov of shape NxN,
         if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
         we take only the diagonal elements across N.
@@ -136,6 +136,9 @@ class SparseGP(GP):
             else:
                 Kxx = kern.Kdiag(Xnew)
                 var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+            #add in the mean function
+            if self.mean_function is not None:
+                mu += self.mean_function.f(Xnew)
         else:
             psi0_star = self.kern.psi0(self.Z, Xnew)
             psi1_star = self.kern.psi1(self.Z, Xnew)
@@ -165,4 +168,5 @@ class SparseGP(GP):
                 var[i] = var_
             else:
                 var[i] = np.diag(var_)+p0-t2
+
         return mu, var
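The only behavioural change to SparseGP is in _raw_predict: when a mean function is attached, the latent predictive mean is shifted by its value at the test inputs. As a sketch of the intended identity for the SVGP posterior defined further below (notation borrowed from the surrounding code, with M the Woodbury vector):

$$
\mu_* = m(X_*) + K_{*u} M,
\qquad
M = K_{uu}^{-1}\big(\mathbb{E}_q[u] - m(Z)\big),
$$

i.e. the prior mean is subtracted where the posterior over the inducing values is formed (see the Posterior change further down) and added back at prediction time.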
@@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf
 
 
 class SVGP(SparseGP):
-    def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
         """
         Stochastic Variational GP.
 
@@ -38,7 +38,7 @@ class SVGP(SparseGP):
         #create the SVI inference method
         inf_method = svgp_inf()
 
-        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
+        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
                 name=name, Y_metadata=Y_metadata, normalizer=False)
 
         self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@@ -48,7 +48,7 @@ class SVGP(SparseGP):
         self.link_parameter(self.m)
 
     def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
 
         #update the kernel gradients
         self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
@@ -65,6 +65,13 @@ class SVGP(SparseGP):
         self.m.gradient = self.grad_dict['dL_dm']
         self.chol.gradient = self.grad_dict['dL_dchol']
 
+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
+            g = self.mean_function.gradient[:].copy()
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
+            self.mean_function.gradient[:] += g
+            self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
+
     def set_data(self, X, Y):
         """
         Set the data without calling parameters_changed to avoid wasted computation
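In parameters_changed the mean function picks up gradients from two places, because it enters the bound both through m(X) in the expected likelihood and through m(Z) in the prior over the inducing values. The copy-and-accumulate pattern above is, as a sketch, just the chain rule for the mean function's parameters θ:

$$
\frac{\partial L}{\partial \theta}
= \frac{\partial L}{\partial m(X)} \frac{\partial m(X)}{\partial \theta}
+ \frac{\partial L}{\partial m(Z)} \frac{\partial m(Z)}{\partial \theta},
$$

and, because m(Z) also depends on the inducing inputs themselves, Z additionally receives mean_function.gradients_X(dL_dmfZ, Z).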
@@ -15,7 +15,7 @@ class Posterior(object):
     the function at any new point x_* by integrating over this posterior.
 
     """
-    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
+    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
         """
         woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
         woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@@ -67,6 +67,7 @@ class Posterior(object):
             #option 2:
             self._mean = mean
             self._covariance = cov
+            self._prior_mean = prior_mean
 
             #compute this lazily
             self._precision = None
@@ -175,7 +176,7 @@ class Posterior(object):
         $$
         """
         if self._woodbury_vector is None:
-            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
+            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
         return self._woodbury_vector
 
     @property
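With the new prior_mean argument (zero by default, so existing callers are unaffected), the lazily computed Woodbury vector encodes the posterior mean relative to the prior mean. In the notation of the class docstring, a sketch of what is now cached:

$$
M = K^{-1}(\mu - m_0), \qquad K M = \mu - m_0,
$$

where μ is the stored mean, m0 the prior mean at the inducing inputs and K the prior covariance there; prediction code such as SparseGP._raw_predict above then adds the mean function back in at the test points.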
@@ -7,7 +7,7 @@ from posterior import Posterior
 class SVGP(LatentFunctionInference):
 
     def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
-        assert mean_function is None, "inference with a mean function not implemented"
         num_inducing = Z.shape[0]
         num_data, num_outputs = Y.shape
 
@@ -23,6 +23,15 @@ class SVGP(LatentFunctionInference):
         #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
         #Si, Lnew, _,_ = linalg.pdinv(S)
 
+        #compute mean function stuff
+        if mean_function is not None:
+            prior_mean_u = mean_function.f(Z)
+            prior_mean_f = mean_function.f(X)
+        else:
+            prior_mean_u = np.zeros((num_inducing, num_outputs))
+            prior_mean_f = np.zeros((num_data, num_outputs))
+
+
         #compute kernel related stuff
         Kmm = kern.K(Z)
         Knm = kern.K(X, Z)
@@ -31,17 +40,31 @@ class SVGP(LatentFunctionInference):
 
         #compute the marginal means and variances of q(f)
         A = np.dot(Knm, Kmmi)
-        mu = np.dot(A, q_u_mean)
+        mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
         v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
 
         #compute the KL term
         Kmmim = np.dot(Kmmi, q_u_mean)
         KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
         KL = KLs.sum()
-        dKL_dm = Kmmim
+        #gradient of the KL term (assuming zero mean function)
+        dKL_dm = Kmmim.copy()
         dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
         dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
 
+        if mean_function is not None:
+            #adjust KL term for mean function
+            Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
+            KL += -np.sum(q_u_mean*Kmmi_mfZ)
+            KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
+
+            #adjust gradient for mean fucntion
+            dKL_dm -= Kmmi_mfZ
+            dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
+            dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
+
+            #compute gradients for mean_function
+            dKL_dmfZ = Kmmi_mfZ - Kmmim
+
 
         #quadrature for the likelihood
         F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
@@ -51,11 +74,9 @@ class SVGP(LatentFunctionInference):
         if dF_dthetaL is not None:
             dF_dthetaL = dF_dthetaL.sum(1)*batch_scale
 
-        #derivatives of expected likelihood
+        #derivatives of expected likelihood, assuming zero mean function
         Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
         Admu = A.T.dot(dF_dmu)
-        #AdvA = np.einsum('ijk,jl->ilk', Adv, A)
-        #AdvA = np.dot(A.T, Adv).swapaxes(0,1)
         AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
         tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
         dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@@ -65,6 +86,14 @@ class SVGP(LatentFunctionInference):
         dF_dm = Admu
         dF_dS = AdvA
 
+        #adjust gradient to account for mean function
+        if mean_function is not None:
+            dF_dmfX = dF_dmu.copy()
+            dF_dmfZ = -Admu
+            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
+            dF_dKmm += Admu.dot(Kmmi_mfZ.T)
+
+
         #sum (gradients of) expected likelihood and KL part
         log_marginal = F.sum() - KL
         dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
@@ -72,4 +101,8 @@ class SVGP(LatentFunctionInference):
         dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
         dL_dchol = choleskies.triang_to_flat(dL_dchol)
 
-        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        if mean_function is not None:
+            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
+            grad_dict['dL_dmfX'] = dF_dmfX
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
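These inference changes follow from replacing the zero-mean prior p(u) = N(0, Kmm) with p(u) = N(m0, Kmm), where m0 = mean_function.f(Z). A sketch of the KL term the code now targets, per output column, with q(u) = N(m, S) and M inducing points:

$$
\mathrm{KL}\big(q(u)\,\|\,p(u)\big)
= \tfrac{1}{2}\Big[\operatorname{tr}\!\big(K_{mm}^{-1}S\big)
+ (m - m_0)^{\top} K_{mm}^{-1} (m - m_0)
- M
+ \log\frac{|K_{mm}|}{|S|}\Big].
$$

Expanding the quadratic gives the zero-mean expression already computed in KLs plus exactly the two corrections added in the code, $-m^{\top}K_{mm}^{-1}m_0$ and $\tfrac{1}{2}m_0^{\top}K_{mm}^{-1}m_0$; differentiating gives dKL_dm = Kmm^{-1}(m - m0) and dKL_dmfZ = Kmm^{-1}(m0 - m), i.e. Kmmim - Kmmi_mfZ and its negation. The expected-likelihood side changes only through the marginal mean mu = m(X) + A(m - m0), which is where dF_dmfX = dF_dmu, dF_dmfZ = -A.T.dot(dF_dmu) and the Kmn/Kmm adjustments come from.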
@@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase):
         self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
     def test_grad(self):
         assert self.m.checkgrad(step=1e-4)
+
+class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
+    """
+    Inference in the SVGP with a Bernoulli likelihood
+    """
+    def setUp(self):
+        X = np.linspace(0,10,100).reshape(-1,1)
+        Z = np.linspace(0,10,10).reshape(-1,1)
+        latent_f = np.exp(0.1*X * 0.05*X**2)
+        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
+
+        mf = GPy.mappings.Linear(1,1)
+
+        lik = GPy.likelihoods.Poisson()
+        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
+    def test_grad(self):
+        assert self.m.checkgrad(step=1e-4)
+
+