sparse GP now checkgrads, optimises sensibly. Predicitno still not working

This commit is contained in:
James Hensman 2014-01-30 12:03:02 +00:00
parent a93312c306
commit e0dbfbe148
6 changed files with 81 additions and 47 deletions

View file

@ -53,26 +53,45 @@ class SparseGP(GP):
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name) GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
self.Z = Param('inducing inputs', self.Z) self.Z = Param('inducing inputs', self.Z)
self.add_parameter(self.Z, gradient=self.dL_dZ, index=0) self.add_parameter(self.Z, index=0)
self.add_parameter(self.kern, gradient=self.dL_dtheta) self.add_parameter(self.kern)
self.add_parameter(self.likelihood, gradient=lambda:self.likelihood._gradients(partial=self.partial_for_likelihood)) self.add_parameter(self.likelihood)
def parameters_changed(self): def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
#The derivative of the bound wrt the inducing inputs Z #The derivative of the bound wrt the inducing inputs Z
self.Z.gradient = self.kern.dK_dX(self.dL_dKmm, self.Z) self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
if self.has_uncertain_inputs: if self.X_variance is None:
self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
else:
self.Z.gradient += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance) self.Z.gradient += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
self.Z.gradient += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance) self.Z.gradient += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
self.Z.gradient += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X)
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
""" """
Make a prediction for the latent function values Make a prediction for the latent function values
""" """
#TODO!!! if X_variance_new is None:
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
if full_cov:
Kxx = self.kern.K(Xnew, which_parts=which_parts)
var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0)
else:
# assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
mu = np.dot(Kx, self.Cpsi1V)
if full_cov:
raise NotImplementedError, "TODO"
else:
Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
return mu, var[:,None]
def _getstate(self): def _getstate(self):

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
class Posterior(object): class Posterior(object):
""" """
@ -62,6 +62,7 @@ class Posterior(object):
#compute this lazily #compute this lazily
self._precision = None self._precision = None
self._woodbury_inv = None
@property @property
def mean(self): def mean(self):
@ -91,6 +92,14 @@ class Posterior(object):
_, _, self._woodbury_chol, _ = pdinv(Wi) _, _, self._woodbury_chol, _ = pdinv(Wi)
return self._woodbury_chol return self._woodbury_chol
@property
def woodbury_inv(self):
if self._woodbury_inv is None:
self._woodbury_inv, _ = dpotri(self.woodbury_chol)
symmetrify(self._woodbury_inv)
return self._woodbury_inv
@property @property
def woodbury_vector(self): def woodbury_vector(self):
if self._woodbury_vector is None: if self._woodbury_vector is None:

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from posterior import Posterior from posterior import Posterior
from ...util.linalg import jitchol, backsub_both_sides, tdot from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
import numpy as np import numpy as np
log_2_pi = np.log(2*np.pi) log_2_pi = np.log(2*np.pi)
@ -30,7 +30,7 @@ class VarDTC(object):
if (N>D): if (N>D):
return Y return Y
else: else:
#if Y in self.cache, return self.Cache[Y], else stor Y in cache and return L. #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
raise NotImplementedError, 'TODO' #TODO raise NotImplementedError, 'TODO' #TODO
def get_VVTfactor(self, Y, prec): def get_VVTfactor(self, Y, prec):
@ -66,9 +66,9 @@ class VarDTC(object):
# The rather complex computations of A # The rather complex computations of A
if uncertain_inputs: if uncertain_inputs:
if het_noise: if het_noise:
psi2_beta = (psi2 * (likelihood.precision.flatten().reshape(num_data, 1, 1))).sum(0) psi2_beta = (psi2 * (beta.flatten().reshape(num_data, 1, 1))).sum(0)
else: else:
psi2_beta = psi2.sum(0) * likelihood.precision psi2_beta = psi2.sum(0) * beta
evals, evecs = linalg.eigh(psi2_beta) evals, evecs = linalg.eigh(psi2_beta)
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
if not np.array_equal(evals, clipped_evals): if not np.array_equal(evals, clipped_evals):
@ -89,6 +89,7 @@ class VarDTC(object):
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
VVT_factor = self.get_VVTfactor(Y, beta) VVT_factor = self.get_VVTfactor(Y, beta)
trYYT = np.sum(np.square(Y))
psi1Vf = np.dot(psi1.T, VVT_factor) psi1Vf = np.dot(psi1.T, VVT_factor)
# back substutue C into psi1Vf # back substutue C into psi1Vf
@ -97,17 +98,6 @@ class VarDTC(object):
tmp, info2 = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) tmp, info2 = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1) Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1)
#compute log marginal likelihood
if het_noise:
A = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
else:
A = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * likelihood.trYYT
B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
C = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
D = 0.5 * data_fit
log_marginal = A + B + C + D
# Compute dL_dKmm # Compute dL_dKmm
tmp = tdot(_LBi_Lmi_psi1Vf) tmp = tdot(_LBi_Lmi_psi1Vf)
@ -120,17 +110,17 @@ class VarDTC(object):
# Compute dL_dpsi # Compute dL_dpsi
dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten() dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
dL_dpsi1 = np.dot(likelihood.VVT_factor, Cpsi1Vf.T) dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi) dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
if het_noise: if het_noise:
if uncertain_inputs: if uncertain_inputs:
dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :] dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
else: else:
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * likelihood.precision.reshape(num_data, 1)).T).T dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
dL_dpsi2 = None dL_dpsi2 = None
else: else:
dL_dpsi2 = likelihood.precision * dL_dpsi2_beta dL_dpsi2 = beta * dL_dpsi2_beta
if uncertain_inputs: if uncertain_inputs:
# repeat for each of the N psi_2 matrices # repeat for each of the N psi_2 matrices
dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0) dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
@ -152,40 +142,55 @@ class VarDTC(object):
Lmi_psi1, nil = dtrtrs(Lm, np.asfortranarray(psi1.T), lower=1, trans=0) Lmi_psi1, nil = dtrtrs(Lm, np.asfortranarray(psi1.T), lower=1, trans=0)
_LBi_Lmi_psi1, _ = dtrtrs(LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0) _LBi_Lmi_psi1, _ = dtrtrs(LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
partial_for_likelihood = -0.5 * likelihood.precision + 0.5 * likelihood.V**2 partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2
partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * likelihood.precision**2 partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*likelihood.precision**2 partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * likelihood.precision**2 partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2
partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * likelihood.precision**2 partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
else: else:
# likelihood is not heteroscedatic # likelihood is not heteroscedatic
partial_for_likelihood = -0.5 * num_data * output_dim * likelihood.precision + 0.5 * likelihood.trYYT * likelihood.precision ** 2 partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * likelihood.precision ** 2 - np.trace(_A) * likelihood.precision) partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
partial_for_likelihood += likelihood.precision * (0.5 * np.sum(_A * DBi_plus_BiPBi) - data_fit) partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
#compute log marginal likelihood
if het_noise:
lik_1 = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
else:
lik_1 = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * trYYT
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
lik_4 = 0.5 * data_fit
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
#put the gradients in the right places #put the gradients in the right places
likelihood.update_gradients(np.diag(partial_for_likelihood)) likelihood.update_gradients(partial_for_likelihood)
if uncertain_inputs: if uncertain_inputs:
kern.update_gradients_variational(??)
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2} grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
else: else:
kern.update_gradients_sparse(??) grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1}
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2} kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
#get sufficient things for posterior prediction #get sufficient things for posterior prediction
if VVT_factor.shape[1] == Y.shape[1]: if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V woodbury_vector = Cpsi1Vf # == Cpsi1V
woodbury_chol = ??
else: else:
raise NotImplementedError #TODO psi1V = np.dot(Y.T*beta, psi1).T
tmp, _ = dtrtrs(Lm, np.asfortranarray(psi1V), lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
#TODO: totally wrong, fix.
woodbury_chol = np.eye(num_inducing)
#construct a posterior object #construct a posterior object
post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
return Posterior, log_marginal, grad_dict return post, log_marginal, grad_dict

View file

@ -284,9 +284,10 @@ class kern(Parameterized):
[p.update_gradients_full(dL_dK, X) for p in self._parameters_] [p.update_gradients_full(dL_dK, X) for p in self._parameters_]
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
raise NotImplementedError [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_]
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
raise NotImplementedError [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_]
def dK_dtheta(self, dL_dK, X, X2=None): def dK_dtheta(self, dL_dK, X, X2=None):
""" """

View file

@ -117,7 +117,7 @@ class RBF(Kernpart):
self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z) self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
else: else:
self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm)
#from Kmm #from Kmm
self._K_computations(Z, None) self._K_computations(Z, None)

View file

@ -32,7 +32,7 @@ class White(Kernpart):
self.variance.gradient = np.trace(dL_dK) self.variance.gradient = np.trace(dL_dK)
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
raise NotImplementedError self.variance.gradient = np.trace(dL_dKmm) + np.sum(dL_dKdiag)
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
raise NotImplementedError raise NotImplementedError