mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-21 14:05:14 +02:00
sparse GP now checkgrads, optimises sensibly. Predicitno still not working
This commit is contained in:
parent
a93312c306
commit
e0dbfbe148
6 changed files with 81 additions and 47 deletions
|
|
@ -53,26 +53,45 @@ class SparseGP(GP):
|
|||
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
|
||||
|
||||
self.Z = Param('inducing inputs', self.Z)
|
||||
self.add_parameter(self.Z, gradient=self.dL_dZ, index=0)
|
||||
self.add_parameter(self.kern, gradient=self.dL_dtheta)
|
||||
self.add_parameter(self.likelihood, gradient=lambda:self.likelihood._gradients(partial=self.partial_for_likelihood))
|
||||
self.add_parameter(self.Z, index=0)
|
||||
self.add_parameter(self.kern)
|
||||
self.add_parameter(self.likelihood)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
|
||||
|
||||
#The derivative of the bound wrt the inducing inputs Z
|
||||
self.Z.gradient = self.kern.dK_dX(self.dL_dKmm, self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
#The derivative of the bound wrt the inducing inputs Z
|
||||
self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
|
||||
if self.X_variance is None:
|
||||
self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
|
||||
else:
|
||||
self.Z.gradient += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
|
||||
self.Z.gradient += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
|
||||
else:
|
||||
self.Z.gradient += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X)
|
||||
|
||||
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
||||
"""
|
||||
Make a prediction for the latent function values
|
||||
"""
|
||||
#TODO!!!
|
||||
if X_variance_new is None:
|
||||
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
|
||||
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
Kxx = self.kern.K(Xnew, which_parts=which_parts)
|
||||
var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting
|
||||
else:
|
||||
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
|
||||
var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0)
|
||||
else:
|
||||
# assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
|
||||
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
|
||||
mu = np.dot(Kx, self.Cpsi1V)
|
||||
if full_cov:
|
||||
raise NotImplementedError, "TODO"
|
||||
else:
|
||||
Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
|
||||
psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
|
||||
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
|
||||
return mu, var[:,None]
|
||||
|
||||
|
||||
def _getstate(self):
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs
|
||||
from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
|
||||
|
||||
class Posterior(object):
|
||||
"""
|
||||
|
|
@ -62,6 +62,7 @@ class Posterior(object):
|
|||
|
||||
#compute this lazily
|
||||
self._precision = None
|
||||
self._woodbury_inv = None
|
||||
|
||||
@property
|
||||
def mean(self):
|
||||
|
|
@ -91,6 +92,14 @@ class Posterior(object):
|
|||
_, _, self._woodbury_chol, _ = pdinv(Wi)
|
||||
return self._woodbury_chol
|
||||
|
||||
@property
|
||||
def woodbury_inv(self):
|
||||
if self._woodbury_inv is None:
|
||||
self._woodbury_inv, _ = dpotri(self.woodbury_chol)
|
||||
symmetrify(self._woodbury_inv)
|
||||
return self._woodbury_inv
|
||||
|
||||
|
||||
@property
|
||||
def woodbury_vector(self):
|
||||
if self._woodbury_vector is None:
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from posterior import Posterior
|
||||
from ...util.linalg import jitchol, backsub_both_sides, tdot
|
||||
from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
|
||||
import numpy as np
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
|
|
@ -30,7 +30,7 @@ class VarDTC(object):
|
|||
if (N>D):
|
||||
return Y
|
||||
else:
|
||||
#if Y in self.cache, return self.Cache[Y], else stor Y in cache and return L.
|
||||
#if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
|
||||
raise NotImplementedError, 'TODO' #TODO
|
||||
|
||||
def get_VVTfactor(self, Y, prec):
|
||||
|
|
@ -66,9 +66,9 @@ class VarDTC(object):
|
|||
# The rather complex computations of A
|
||||
if uncertain_inputs:
|
||||
if het_noise:
|
||||
psi2_beta = (psi2 * (likelihood.precision.flatten().reshape(num_data, 1, 1))).sum(0)
|
||||
psi2_beta = (psi2 * (beta.flatten().reshape(num_data, 1, 1))).sum(0)
|
||||
else:
|
||||
psi2_beta = psi2.sum(0) * likelihood.precision
|
||||
psi2_beta = psi2.sum(0) * beta
|
||||
evals, evecs = linalg.eigh(psi2_beta)
|
||||
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
||||
if not np.array_equal(evals, clipped_evals):
|
||||
|
|
@ -89,6 +89,7 @@ class VarDTC(object):
|
|||
|
||||
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
|
||||
VVT_factor = self.get_VVTfactor(Y, beta)
|
||||
trYYT = np.sum(np.square(Y))
|
||||
psi1Vf = np.dot(psi1.T, VVT_factor)
|
||||
|
||||
# back substutue C into psi1Vf
|
||||
|
|
@ -97,17 +98,6 @@ class VarDTC(object):
|
|||
tmp, info2 = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
|
||||
Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1)
|
||||
|
||||
#compute log marginal likelihood
|
||||
if het_noise:
|
||||
A = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
|
||||
B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
|
||||
else:
|
||||
A = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * likelihood.trYYT
|
||||
B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
|
||||
C = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
|
||||
D = 0.5 * data_fit
|
||||
log_marginal = A + B + C + D
|
||||
|
||||
|
||||
# Compute dL_dKmm
|
||||
tmp = tdot(_LBi_Lmi_psi1Vf)
|
||||
|
|
@ -120,17 +110,17 @@ class VarDTC(object):
|
|||
|
||||
# Compute dL_dpsi
|
||||
dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
|
||||
dL_dpsi1 = np.dot(likelihood.VVT_factor, Cpsi1Vf.T)
|
||||
dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
|
||||
dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
|
||||
|
||||
if het_noise:
|
||||
if uncertain_inputs:
|
||||
dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
|
||||
else:
|
||||
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * likelihood.precision.reshape(num_data, 1)).T).T
|
||||
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
|
||||
dL_dpsi2 = None
|
||||
else:
|
||||
dL_dpsi2 = likelihood.precision * dL_dpsi2_beta
|
||||
dL_dpsi2 = beta * dL_dpsi2_beta
|
||||
if uncertain_inputs:
|
||||
# repeat for each of the N psi_2 matrices
|
||||
dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
|
||||
|
|
@ -152,40 +142,55 @@ class VarDTC(object):
|
|||
Lmi_psi1, nil = dtrtrs(Lm, np.asfortranarray(psi1.T), lower=1, trans=0)
|
||||
_LBi_Lmi_psi1, _ = dtrtrs(LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
|
||||
|
||||
partial_for_likelihood = -0.5 * likelihood.precision + 0.5 * likelihood.V**2
|
||||
partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * likelihood.precision**2
|
||||
partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2
|
||||
partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
|
||||
|
||||
partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*likelihood.precision**2
|
||||
partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
|
||||
|
||||
partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * likelihood.precision**2
|
||||
partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * likelihood.precision**2
|
||||
partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2
|
||||
partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
|
||||
|
||||
else:
|
||||
# likelihood is not heteroscedatic
|
||||
partial_for_likelihood = -0.5 * num_data * output_dim * likelihood.precision + 0.5 * likelihood.trYYT * likelihood.precision ** 2
|
||||
partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * likelihood.precision ** 2 - np.trace(_A) * likelihood.precision)
|
||||
partial_for_likelihood += likelihood.precision * (0.5 * np.sum(_A * DBi_plus_BiPBi) - data_fit)
|
||||
partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
|
||||
partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
|
||||
partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
|
||||
|
||||
#compute log marginal likelihood
|
||||
if het_noise:
|
||||
lik_1 = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
|
||||
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
|
||||
else:
|
||||
lik_1 = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * trYYT
|
||||
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
|
||||
lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
|
||||
lik_4 = 0.5 * data_fit
|
||||
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
|
||||
|
||||
#put the gradients in the right places
|
||||
likelihood.update_gradients(np.diag(partial_for_likelihood))
|
||||
likelihood.update_gradients(partial_for_likelihood)
|
||||
|
||||
if uncertain_inputs:
|
||||
kern.update_gradients_variational(??)
|
||||
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
|
||||
kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
|
||||
else:
|
||||
kern.update_gradients_sparse(??)
|
||||
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
|
||||
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1}
|
||||
kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
|
||||
|
||||
#get sufficient things for posterior prediction
|
||||
if VVT_factor.shape[1] == Y.shape[1]:
|
||||
woodbury_vector = Cpsi1Vf # == Cpsi1V
|
||||
woodbury_chol = ??
|
||||
else:
|
||||
raise NotImplementedError #TODO
|
||||
psi1V = np.dot(Y.T*beta, psi1).T
|
||||
tmp, _ = dtrtrs(Lm, np.asfortranarray(psi1V), lower=1, trans=0)
|
||||
tmp, _ = dpotrs(LB, tmp, lower=1)
|
||||
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
|
||||
#TODO: totally wrong, fix.
|
||||
woodbury_chol = np.eye(num_inducing)
|
||||
|
||||
#construct a posterior object
|
||||
post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
|
||||
|
||||
return Posterior, log_marginal, grad_dict
|
||||
return post, log_marginal, grad_dict
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -284,9 +284,10 @@ class kern(Parameterized):
|
|||
[p.update_gradients_full(dL_dK, X) for p in self._parameters_]
|
||||
|
||||
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
|
||||
raise NotImplementedError
|
||||
[p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_]
|
||||
|
||||
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
|
||||
raise NotImplementedError
|
||||
[p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_]
|
||||
|
||||
def dK_dtheta(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -117,7 +117,7 @@ class RBF(Kernpart):
|
|||
self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
|
||||
|
||||
else:
|
||||
self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
|
||||
self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm)
|
||||
|
||||
#from Kmm
|
||||
self._K_computations(Z, None)
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ class White(Kernpart):
|
|||
self.variance.gradient = np.trace(dL_dK)
|
||||
|
||||
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
|
||||
raise NotImplementedError
|
||||
self.variance.gradient = np.trace(dL_dKmm) + np.sum(dL_dKdiag)
|
||||
|
||||
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
|
||||
raise NotImplementedError
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue