Merge changes for model.py and optimization.py on comments.

This commit is contained in:
Neil Lawrence 2012-12-05 19:22:36 -08:00
commit 60fd1c55dc
12 changed files with 336 additions and 195 deletions

View file

@ -10,6 +10,10 @@ from .. import kern
from ..inference.likelihoods import likelihood
from GP_regression import GP_regression
#Still TODO:
# make use of slices properly (kernel can now do this)
# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array)
class sparse_GP_regression(GP_regression):
"""
Variational sparse GP model (Regression)
@ -22,6 +26,8 @@ class sparse_GP_regression(GP_regression):
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
:type X_uncertainty: np.ndarray (N x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
@ -31,7 +37,7 @@ class sparse_GP_regression(GP_regression):
:type normalize_(X|Y): bool
"""
def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
self.beta = beta
if Z is None:
self.Z = np.random.permutation(X.copy())[:M]
@ -40,10 +46,20 @@ class sparse_GP_regression(GP_regression):
assert Z.shape[1]==X.shape[1]
self.Z = Z
self.M = Z.shape[1]
if X_uncertainty is None:
self.has_uncertain_inputs=False
else:
assert X_uncertainty.shape==X.shape
self.has_uncertain_inputs=False
self.X_uncertainty = X_uncertainty
GP_regression.__init__(self, X, Y, kernel = kernel, normalize_X = normalize_X, normalize_Y = normalize_Y)
GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y)
self.trYYT = np.sum(np.square(self.Y))
#normalise X uncertainty also
if self.has_uncertain_inputs:
self.X_uncertainty /= np.square(self._Xstd)
def set_param(self, p):
self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
self.beta = p[self.M*self.Q]
@ -54,18 +70,23 @@ class sparse_GP_regression(GP_regression):
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
#TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!)
#TODO: slices for psi statistics (easy enough)
self.Kmm = self.kern.K(self.Z)
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
self.psi1 = self.kern.K(self.Z,self.X)
self.psi2 = np.dot(self.psi1,self.psi1.T)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
else:
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
self.psi1 = self.kern.K(self.Z,self.X)
self.psi2 = np.dot(self.psi1,self.psi1.T)
def _computations(self):
# TODO find routine to multiply triangular matrices
self.psi1Y = np.dot(self.psi1, self.Y)
self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T)
self.V = self.beta*self.Y
self.psi1V = np.dot(self.psi1, self.V)
self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
self.Lm = jitchol(self.Kmm)
self.Lmi = chol_inv(self.Lm)
self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
@ -77,25 +98,19 @@ class sparse_GP_regression(GP_regression):
self.LLambdai = np.dot(self.LBi, self.Lmi)
self.trace_K = self.psi0 - np.trace(self.A)
self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
self.C = mdot(self.LLambdai, self.psi1Y)
self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T)
self.C = mdot(self.LLambdai, self.psi1V)
self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)
# Computes dL_dpsi
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane.
tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
self.dL_dpsi1 = self.beta2 * tmp
self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi)
- self.beta**3 * 0.5 * self.G)
dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.V)
self.dL_dpsi1 = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
# Computes dL_dKmm TODO: nicer precomputations
tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
- self.beta*mdot(self.G, self.psi2, self.Kmmi))
self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE
# Compute dL_dKmm
self.dL_dKmm = -0.5 * self.beta * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
def get_param(self):
return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])
@ -104,65 +119,83 @@ class sparse_GP_regression(GP_regression):
return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names()
def log_likelihood(self):
"""
Compute the (lower bound on the) log marginal likelihood
"""
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
B = -0.5*self.beta*self.D*self.trace_K
C = -self.D * np.sum(np.log(np.diag(self.LB)))
D = -0.5*self.beta*self.trYYT
E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv)
E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
return A+B+C+D+E
def dL_dbeta(self):
""" compute the gradient of the log likelihood wrt beta.
TODO: suport heteroscedatic noise"""
"""
Compute the gradient of the log likelihood wrt beta.
TODO: suport heteroscedatic noise
"""
dA_dbeta = 0.5 * self.N*self.D/self.beta
dB_dbeta = - 0.5 * self.D * self.trace_K
dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)
dD_dbeta = - 0.5 * self.trYYT
tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y)
dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2
* np.sum(self.A * np.dot(tmp, tmp.T)))
tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
dE_dbeta = np.sum(np.square(self.C))/self.beta - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T))
return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
def dL_dtheta(self):
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
return dL_dtheta
def dL_dZ(self):
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
"""
The derivative of the bound wrt the inducing inputs Z
"""
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
return dL_dZ
def log_likelihood_gradients(self):
return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
def _raw_predict(self,Xnew,slices):
def _raw_predict(self, Xnew, slices):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.Z, Xnew)
Kxx = self.kern.K(Xnew)
mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y)
mu = mdot(Kx.T, self.LBL_inv, self.psi1V)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
return mu,var
def plot(self,*args,**kwargs):
def plot(self, *args, **kwargs):
"""
Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
"""
GP_regression.plot(self,*args,**kwargs)
if self.Q==1:
pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
if self.has_uncertain_inputs:
pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
if self.Q==2:
pb.plot(self.Z[:,0],self.Z[:,1],'wo')

View file

@ -1,70 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from ..util.plot import gpplot
from .. import kern
from ..inference.likelihoods import likelihood
from sparse_GP_regression import sparse_GP_regression
class uncertain_input_GP_regression(sparse_GP_regression):
"""
Variational sparse GP model (Regression) with uncertainty on the inputs
:param X: inputs
:type X: np.ndarray (N x Q)
:param X_uncertainty: uncertainty on X (Gaussian variances, assumed isotrpic)
:type X_uncertainty: np.ndarray (N x Q)
:param Y: observed data
:type Y: np.ndarray of observations (N x D)
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param beta: noise precision. TODO> ignore beta if doing EP
:type beta: float
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self,X,Y,X_uncertainty,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
self.X_uncertainty = X_uncertainty
sparse_GP_regression.__init__(self, X, Y, kernel = kernel, beta = beta, normalize_X = normalize_X, normalize_Y = normalize_Y)
self.trYYT = np.sum(np.square(self.Y))
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
#TODO: slices for psi statistics (easy enough)
self.Kmm = self.kern.K(self.Z)
self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
def dL_dtheta(self):
#re-cast computations in psi2 back to psi1:
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
return dL_dtheta
def dL_dZ(self):
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
return dL_dZ
def plot(self,*args,**kwargs):
"""
Plot the fitted model: just call the sparse GP_regression plot function and then add
markers to represent uncertainty on the inputs
"""
sparse_GP_regression.plot(self,*args,**kwargs)
if self.Q==1:
pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))

View file

@ -0,0 +1,128 @@
# Copyright (c) 2012 James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from ..util.plot import gpplot
from .. import kern
from ..inference.likelihoods import likelihood
from sparse_GP_regression import sparse_GP_regression
class uncollapsed_sparse_GP(sparse_GP_regression):
"""
Variational sparse GP model (Regression), where the approximating distribution q(u) is represented explicitly
:param X: inputs
:type X: np.ndarray (N x Q)
:param Y: observed data
:type Y: np.ndarray of observations (N x D)
:param q_u: canonical parameters of the distribution squasehd into a 1D array
:type q_u: np.ndarray
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param beta: noise precision. TODO> ignore beta if doing EP
:type beta: float
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, Y, q_u=None, *args, **kwargs)
D = Y.shape[1]
if q_u is None:
if Z is None:
M = Z.shape[0]
else:
M=M
self.set_vb_param(np.hstack((np.ones(M*D)),np.eye(M).flatten()))
sparse_GP_regression.__init__(self, X, Y, *args, **kwargs)
def _computations(self):
self.V = self.beta*self.Y
self.psi1V = np.dot(self.psi1, self.V)
self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
self.Lm = jitchol(self.Kmm)
self.Lmi = chol_inv(self.Lm)
self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
self.B = np.eye(self.M) + self.beta * self.A
self.Lambda = mdot(self.Lmi.T,self.B,sel.Lmi)
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
self.dL_dpsi1 =
self.dL_dpsi2 =
# Compute dL_dKmm
self.dL_dKmm =
self.dL_dKmm +=
self.dL_dKmm +=
def log_likelihood(self):
"""
Compute the (lower bound on the) log marginal likelihood
"""
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
B = -0.5*self.beta*self.D*self.trace_K
C = -self.D *(self.Kmm_hld +0.5*np.sum(self.Lambda * self.mmT_S) + self.M/2.)
E = -0.5*self.beta*self.trYYT
F = np.sum(np.dot(self.V.T,self.projected_mean))
return A+B+C+D+E+F
def dL_dbeta(self):
"""
Compute the gradient of the log likelihood wrt beta.
TODO: suport heteroscedatic noise
"""
dA_dbeta = 0.5 * self.N*self.D/self.beta
dB_dbeta = - 0.5 * self.D * self.trace_K
dC_dbeta = - 0.5 * self.D * #TODO
dD_dbeta = - 0.5 * self.trYYT
return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
def _raw_predict(self, Xnew, slices):
"""Internal helper function for making predictions, does not account for normalisation"""
#TODO
return mu,var
def set_vb_param(self,vb_param):
"""set the distribution q(u) from the canonical parameters"""
self.q_u_prec = -2.*vb_param[self.M*self.D:].reshape(self.M,self.M)
self.q_u_prec_L = jitchol(self.q_u_prec)
self.q_u_cov_L = chol_inv(self.q_u_prec_L)
self.q_u_cov = np.dot(self.q_u_cov_L,self.q_u_cov_L.T)
self.q_u_mean = -2.*np.dot(self.q_u_cov,vb_param[:self.M*self.D].reshape(self.M,self.D))
self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov)
self.q_u_canonical = (np.dot(self.q_u_prec, self.q_u_mean),-0.5*self.q_u_prec)
#TODO: computations now?
def get_vb_param(self):
"""
Return the canonical parameters of the distribution q(u)
"""
return np.hstack([e.flatten() for e in self.q_u_canonical])
def vb_grad_natgrad(self):
"""
Compute the gradients of the lower bound wrt the canonical and
Expectation parameters of u.
Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family)
"""
foobar #TODO
def plot(self, *args, **kwargs):
"""
add the distribution q(u) to the plot from sparse_GP_regression
"""
sparse_GP_regression.plot(self,*args,**kwargs)
#TODO: plot the q(u) dist.