This commit is contained in:
Nicolo Fusi 2012-11-29 16:32:48 +00:00
parent 63321e8409
commit 61984274dd
8 changed files with 991 additions and 0 deletions

61
GPy/models/GPLVM.py Normal file
View file

@@ -0,0 +1,61 @@
import numpy as np
import pylab as pb
import sys, pdb
from .. import kern
from ..core import model
from ..util.linalg import pdinv, PCA
from GP_regression import GP_regression
class GPLVM(GP_regression):
"""
Gaussian Process Latent Variable Model
:param Y: observed data
:type Y: np.ndarray
:param Q: latent dimensionality
:type Q: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, Y, Q, init='PCA', **kwargs):
X = self.initialise_latent(init, Q, Y)
GP_regression.__init__(self, X, Y, **kwargs)
def initialise_latent(self, init, Q, Y):
if init == 'PCA':
return PCA(Y, Q)[0]
else:
return np.random.randn(Y.shape[0], Q)
def get_param_names(self):
return (sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
+ self.kern.extract_param_names())
def get_param(self):
return np.hstack((self.X.flatten(), self.kern.extract_param()))
def set_param(self,x):
self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
GP_regression.set_param(self, x[self.X.size:])
def log_likelihood_gradients(self):
dL_dK = self.dL_dK()
dK_dtheta = self.kern.dK_dtheta(self.X)
dL_dtheta = (dK_dtheta*dL_dK[:,:,None]).sum(0).sum(0)
dK_dX = self.kern.dK_dX(self.X)
dL_dX = 2.*np.sum(dL_dK[:,:,None]*dK_dX,0)
return np.hstack((dL_dX.flatten(),dL_dtheta))
def plot(self):
assert self.Y.shape[1]==2
pb.scatter(self.Y[:,0],self.Y[:,1],40,self.X[:,0].copy(),linewidth=0)
Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
mu, var = self.predict(Xnew)
pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
def plot_latent(self):
raise NotImplementedError
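
A minimal usage sketch, assuming the package layout above and an optimize() method on the model base class (as used by em() in the other models); the data are toy:

import numpy as np
from GPy.models import GPLVM

t = np.linspace(0, 2 * np.pi, 30)[:, None]
Y = np.hstack((np.sin(t), np.cos(t))) + np.random.randn(30, 2) * 0.05  # noisy 1-D manifold in 2-D
m = GPLVM(Y, Q=1, init='PCA')   # 1-D latent space, PCA initialisation
m.optimize()                    # assumed from the model base class
m.plot()                        # plot() above requires Y to have exactly 2 columns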

153
GPy/models/GP_EP.py Normal file
View file

@@ -0,0 +1,153 @@
import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..inference.Expectation_Propagation import EP,Full
from ..inference.likelihoods import likelihood,probit,poisson,gaussian
from ..core import model
from ..util.linalg import pdinv,jitchol
from ..util.plot import gpplot
class GP_EP(model):
def __init__(self,X,likelihood,kernel=None,epsilon_ep=1e-3,epsilon_em=.1,powerep=[1.,1.]):
"""
Simple Gaussian Process with Non-Gaussian likelihood
Arguments
---------
:param X: input observations (NxD numpy.darray)
:param likelihood: a GPy likelihood (likelihood class)
:param kernel: a GPy kernel (kern class)
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 1e-3 (float)
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list)
:rtype: GPy model class.
"""
assert isinstance(kernel,kern.kern), 'kernel is not a kern instance'
self.likelihood = likelihood
self.Y = self.likelihood.Y
self.kernel = kernel
self.X = X
self.N, self.D = self.X.shape
self.eta,self.delta = powerep
self.epsilon_ep = epsilon_ep
self.jitter = 1e-12
self.K = self.kernel.K(self.X)
model.__init__(self)
def set_param(self,p):
self.kernel.expand_param(p)
def get_param(self):
return self.kernel.extract_param()
def get_param_names(self):
return self.kernel.extract_param_names()
def approximate_likelihood(self):
self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
self.ep_approx.fit_EP()
def posterior_param(self):
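# This appears to follow Rasmussen & Williams (2006), Sec. 3.6: with S = diag(tau_tilde),
# B = I + S^(1/2) K S^(1/2), Sigma = K - K S^(1/2) B^(-1) S^(1/2) K and mu = Sigma v_tilde.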
self.K = self.kernel.K(self.X)
self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K
B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)[None,:]*self.Sroot_tilde_K
#self.L = np.linalg.cholesky(B)
self.L = jitchol(B)
V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
self.Sigma = self.K - np.dot(V.T,V)
self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde)
def log_likelihood(self):
"""
Returns
-------
The EP approximation to the log-marginal likelihood
"""
self.posterior_param()
mu_ = self.ep_approx.v_/self.ep_approx.tau_
L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L)))
L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:]))
L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde)
L3 = sum(np.log(self.ep_approx.Z_hat))
return L1 + L2A + L2B + L3
def log_likelihood_gradients(self):
dK_dp = self.kernel.dK_dtheta(self.X)
self.dK_dp = dK_dp
aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
b = self.ep_approx.v_tilde - np.sqrt(self.ep_approx.tau_tilde)*linalg.flapack.dtrtrs(self.L.T,aux1)[0]
U,info_u = linalg.flapack.dtrtrs(self.L,np.diag(np.sqrt(self.ep_approx.tau_tilde)),lower=1)
dL_dK = 0.5*(np.outer(b,b)-np.dot(U.T,U))
self.dL_dK = dL_dK
return np.array([np.sum(dK_dpi*dL_dK) for dK_dpi in dK_dp.T])
def predict(self,X):
#TODO: check output dimensions
self.posterior_param()
K_x = self.kernel.K(self.X,X)
Kxx = self.kernel.K(X)
aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0)
zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2
f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta)
v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1)
variance = Kxx - np.dot(v.T,v)
vdiag = np.diag(variance)
y=self.likelihood.predictive_mean(f,vdiag)
return f,vdiag,y
def plot(self):
"""
Plot the fitted model: training function values, mean estimate and confidence intervals.
"""
if self.X.shape[1]==1:
pb.figure()
xmin,xmax = self.X.min(),self.X.max()
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
Xnew = np.linspace(xmin,xmax,100)[:,None]
mu_f, var_f, mu_phi = self.predict(Xnew)
pb.subplot(211)
self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma))
pb.subplot(212)
self.likelihood.plot1Db(self.X,Xnew,mu_phi)
elif self.X.shape[1]==2:
pb.figure()
x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
axis1 = np.linspace(x1min,x1max,50)
axis2 = np.linspace(x2min,x2max,50)
XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
Xnew = np.c_[XX1.flatten(),XX2.flatten()]
f,v,p = self.predict(Xnew)
self.likelihood.plot2D(self.X,Xnew,p)
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
"""
Fits the EP approximation and optimizes the hyperparameters iteratively until convergence is achieved.
"""
self.epsilon_em = epsilon
log_likelihood_change = self.epsilon_em + 1.
self.parameters_path = [self.kernel.get_param()]
self.approximate_likelihood()
self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
self.log_likelihood_path = [self.log_likelihood()]
iteration = 0
while log_likelihood_change > self.epsilon_em:
print 'EM iteration', iteration
self.optimize(max_f_eval = max_f_eval)
log_likelihood_new = self.log_likelihood()
log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
if log_likelihood_change < 0:
print 'log_likelihood decrement'
self.kernel.expand_param(self.parameters_path[-1])
else:
self.approximate_likelihood()
self.log_likelihood_path.append(self.log_likelihood())
self.parameters_path.append(self.kernel.get_param())
self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
iteration += 1
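
A hedged classification sketch for GP_EP: the probit constructor signature is an assumption (the class only shows that the likelihood carries Y), and optimize() is assumed from the model base class; the ±1 encoding of the toy labels is also an assumption:

import numpy as np
from GPy import kern
from GPy.inference.likelihoods import probit
from GPy.models import GP_EP

X = np.random.randn(40, 1)
Y = np.where(X > 0, 1, -1)                       # toy binary labels
lik = probit(Y)                                  # assumed: probit wraps the observed labels
m = GP_EP(X, lik, kernel=kern.rbf(1) + kern.white(1))
m.approximate_likelihood()                       # run the EP site updates first
m.optimize()                                     # assumed from the model base class
print m.log_likelihood()
m.plot()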

207
GPy/models/GP_regression.py Normal file
View file

@@ -0,0 +1,207 @@
import numpy as np
import pylab as pb
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot, Tango
class GP_regression(model):
"""
Gaussian Process model for regression
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the observed data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])
# parse arguments
self.Xslices = Xslices
assert isinstance(kernel, kern.kern)
self.kern = kernel
self.X = X
self.Y = Y
assert len(self.X.shape)==2
assert len(self.Y.shape)==2
assert self.X.shape[0] == self.Y.shape[0]
self.N, self.D = self.Y.shape
self.N, self.Q = self.X.shape
#here's some simple normalisation
if normalize_X:
self._Xmean = X.mean(0)[None,:]
self._Xstd = X.std(0)[None,:]
self.X = (X.copy()- self._Xmean) / self._Xstd
else:
self._Xmean = np.zeros((1,self.X.shape[1]))
self._Xstd = np.ones((1,self.X.shape[1]))
if normalize_Y:
self._Ymean = Y.mean(0)[None,:]
self._Ystd = Y.std(0)[None,:]
self.Y = (Y.copy()- self._Ymean) / self._Ystd
else:
self._Ymean = np.zeros((1,self.Y.shape[1]))
self._Ystd = np.ones((1,self.Y.shape[1]))
if self.D > self.N:
# then it's more efficient to store Youter
self.Youter = np.dot(self.Y, self.Y.T)
else:
self.Youter = None
model.__init__(self)
def set_param(self,p):
self.kern.expand_param(p)
self.K = self.kern.K(self.X,slices1=self.Xslices)
self.Ki,self.hld = pdinv(self.K)
def get_param(self):
return self.kern.extract_param()
def get_param_names(self):
return self.kern.extract_param_names()
def _model_fit_term(self):
"""
Computes the model fit using Youter if it's available
"""
if self.Youter is None:
return -0.5*np.trace(mdot(self.Y.T,self.Ki,self.Y))
else:
return -0.5*np.sum(np.multiply(self.Ki, self.Youter))
def log_likelihood(self):
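# Marginal likelihood of D independent outputs sharing the covariance K:
# log p(Y|X) = -0.5*N*D*log(2*pi) - 0.5*D*log|K| - 0.5*tr(Ki Y Y.T)
# (hld is taken to be the half log-determinant of K returned by pdinv above).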
complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - self.D*self.hld
return complexity_term + self._model_fit_term()
def dL_dK(self):
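# Gradient of the log marginal likelihood above w.r.t. K: dL/dK = 0.5*(Ki Y Y.T Ki - D*Ki).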
if self.Youter is None:
alpha = np.dot(self.Ki,self.Y)
dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
else:
dL_dK = 0.5*(mdot(self.Ki, self.Youter, self.Ki) - self.D*self.Ki)
return dL_dK
def log_likelihood_gradients(self):
return (self.kern.dK_dtheta(self.X,slices1=self.Xslices)*self.dL_dK()[:,:,None]).sum(0).sum(0)
def predict(self,Xnew, slices=None):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.Q
:param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
:type slices: (None, list of slice objects, list of ints)
:rtype: posterior mean, a Numpy array, Nnew x self.D
:rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)
.. Note:: "slices" specifies how the the points X_new co-vary wich the training points.
- If None, the new points covary throigh every kernel part (default)
- If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
- If a list of booleans, specifying which kernel parts are active
If self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
This is to allow for different normalisations of the output dimensions.
"""
#normalise X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
mu, var = self._raw_predict(Xnew,slices)
#un-normalise
mu = mu*self._Ystd + self._Ymean
if self.D==1:
var *= np.square(self._Ystd)
else:
var = var[:,:,None] * np.square(self._Ystd)
return mu,var
def _raw_predict(self,_Xnew,slices):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
mu = np.dot(np.dot(Kx.T,self.Ki),self.Y)
var = Kxx - np.dot(np.dot(Kx.T,self.Ki),Kx)
return mu, var
def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None):
"""
:param samples: the number of a posteriori samples to plot
:param which_data: which of the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:param which_functions: which of the kernel functions to plot (additively)
:type which_functions: list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, we've not implemented this yet. TODO!
Can plot only part of the data and part of the posterior functions using which_data and which_functions
"""
if which_functions=='all':
which_functions = [True]*self.kern.Nparts
if which_data=='all':
which_data = slice(None)
X = self.X[which_data,:]
Y = self.Y[which_data,:]
Xorig = X*self._Xstd + self._Xmean
Yorig = Y*self._Ystd + self._Ymean
if plot_limits is None:
xmin,xmax = Xorig.min(0),Xorig.max(0)
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
raise ValueError, "Bad limits for plotting"
if self.X.shape[1]==1:
Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
m,v = self.predict(Xnew,slices=which_functions)
gpplot(Xnew,m,v)
if samples:
s = np.random.multivariate_normal(m.flatten(),v,samples)
pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
pb.plot(Xorig,Yorig,'kx',mew=1.5)
pb.xlim(xmin,xmax)
elif self.X.shape[1]==2:
resolution = resolution or 50
xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
Xtest = np.vstack((xx.flatten(),yy.flatten())).T
zz,vv = self.predict(Xtest,slices=which_functions)
zz = zz.reshape(resolution,resolution)
pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
pb.xlim(xmin[0],xmax[0])
pb.ylim(xmin[1],xmax[1])
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
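
A hedged regression sketch (toy data; optimize() is assumed from the model base class, and the kernel constructors are those used for the default kernel above):

import numpy as np
from GPy import kern
from GPy.models import GP_regression

X = np.linspace(0, 10, 50)[:, None]
Y = np.sin(X) + np.random.randn(50, 1) * 0.1
m = GP_regression(X, Y, kernel=kern.rbf(1) + kern.white(1))
m.optimize()                                          # assumed from the model base class
mu, var = m.predict(np.linspace(0, 10, 200)[:, None])
m.plot()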

7
GPy/models/__init__.py Normal file
View file

@@ -0,0 +1,7 @@
from GP_regression import GP_regression
from sparse_GP_regression import sparse_GP_regression
from GPLVM import GPLVM
from warped_GP import warpedGP
from GP_EP import GP_EP
from generalized_FITC import generalized_FITC
from sparse_GPLVM import sparse_GPLVM

239
GPy/models/generalized_FITC.py Normal file
View file

@@ -0,0 +1,239 @@
import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot
from ..inference.Expectation_Propagation import EP,Full,DTC,FITC
from ..inference.likelihoods import likelihood,probit
class generalized_FITC(model):
def __init__(self,X,likelihood,kernel=None,inducing=10,epsilon_ep=1e-3,powerep=[1.,1.]):
"""
Implementation of EP with FITC, following Naish-Guzman, A. and Holden, S. (2008).
Arguments
---------
X : input observations
likelihood : Output's likelihood (likelihood class)
kernel : a GPy kernel
inducing : Either an array specifying the inducing points location or a scalar defining their number.
epsilon_ep : EP convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
powerep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
"""
assert isinstance(kernel,kern.kern)
self.likelihood = likelihood
self.Y = self.likelihood.Y
self.kernel = kernel
self.X = X
self.N, self.D = self.X.shape
assert self.Y.shape[0] == self.N
if type(inducing) == int:
self.M = inducing
self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
elif type(inducing) == np.ndarray:
self.Z = inducing
self.M = self.Z.shape[0]
self.eta,self.delta = powerep
self.epsilon_ep = epsilon_ep
self.jitter = 1e-12
model.__init__(self)
def set_param(self,p):
self.kernel.expand_param(p[0:-self.Z.size])
self.Z = p[-self.Z.size:].reshape(self.M,self.D)
def get_param(self):
return np.hstack([self.kernel.extract_param(),self.Z.flatten()])
def get_param_names(self):
return self.kernel.extract_param_names()+['iip_%i'%i for i in range(self.Z.size)]
def approximate_likelihood(self):
self.Kmm = self.kernel.K(self.Z)
self.Knm = self.kernel.K(self.X,self.Z)
self.Knn_diag = self.kernel.Kdiag(self.X)
self.ep_approx = FITC(self.Kmm,self.likelihood,self.Knm.T,self.Knn_diag,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
self.ep_approx.fit_EP()
def posterior_param(self):
self.Knn_diag = self.kernel.Kdiag(self.X)
self.Kmm = self.kernel.K(self.Z)
self.Kmmi, self.Kmm_hld = pdinv(self.Kmm)
self.Knm = self.kernel.K(self.X,self.Z)
self.KmmiKmn = np.dot(self.Kmmi,self.Knm.T)
self.Qnn = np.dot(self.Knm,self.KmmiKmn)
self.Diag0 = self.Knn_diag - np.diag(self.Qnn)
self.R0 = np.linalg.cholesky(self.Kmmi).T
self.Taut = self.ep_approx.tau_tilde/(1.+ self.ep_approx.tau_tilde*self.Diag0)
self.KmnTaut = self.Knm.T*self.Taut[None,:]
self.KmnTautKnm = np.dot(self.KmnTaut, self.Knm)
self.Woodbury_inv, self.Woodbury_hld = pdinv(self.Kmm + self.KmnTautKnm)
self.Qnn_diag = self.Knn_diag - np.diag(self.Qnn) + 1./self.ep_approx.tau_tilde
self.Qi = -np.dot(self.KmnTaut.T, np.dot(self.Woodbury_inv,self.KmnTaut)) + np.diag(self.Taut)
self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - self.Kmm_hld + self.Woodbury_hld
self.Diag = self.Diag0/(1.+ self.Diag0 * self.ep_approx.tau_tilde)
self.P = (self.Diag / self.Diag0)[:,None] * self.Knm
self.RPT0 = np.dot(self.R0,self.Knm.T)
self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
self.R,info = linalg.flapack.dtrtrs(self.L,self.R0,lower=1)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
self.w = self.Diag * self.ep_approx.v_tilde
self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.ep_approx.v_tilde))
self.mu = self.w + np.dot(self.P,self.gamma)
self.mu_tilde = (self.ep_approx.v_tilde/self.ep_approx.tau_tilde)[:,None]
def log_likelihood(self):
self.posterior_param()
self.Youter = np.dot(self.mu_tilde,self.mu_tilde.T)
A = -self.hld
B = -.5*np.sum(self.Qi*self.Youter)
C = sum(np.log(self.ep_approx.Z_hat))
D = .5*np.sum(np.log(1./self.ep_approx.tau_tilde + 1./self.ep_approx.tau_))
E = .5*np.sum((self.ep_approx.v_/self.ep_approx.tau_ - self.mu_tilde.flatten())**2/(1./self.ep_approx.tau_ + 1./self.ep_approx.tau_tilde))
return A + B + C + D + E
def log_likelihood_gradients(self):
dKmm_dtheta = self.kernel.dK_dtheta(self.Z)
dKnn_dtheta = self.kernel.dK_dtheta(self.X)
dKmn_dtheta = self.kernel.dK_dtheta(self.Z,self.X)
dKmm_dZ = -self.kernel.dK_dX(self.Z)
dKnm_dZ = -self.kernel.dK_dX(self.X,self.Z)
tmp = [np.dot(dKmn_dtheta_i,self.KmmiKmn) for dKmn_dtheta_i in dKmn_dtheta.T]
dQnn_dtheta = [tmp_i + tmp_i.T - np.dot(np.dot(self.KmmiKmn.T,dKmm_dtheta_i),self.KmmiKmn) for tmp_i,dKmm_dtheta_i in zip(tmp,dKmm_dtheta.T)]
dDiag0_dtheta = [np.diag(dKnn_dtheta_i) - np.diag(dQnn_dtheta_i) for dKnn_dtheta_i,dQnn_dtheta_i in zip(dKnn_dtheta.T,dQnn_dtheta)]
dQ_dtheta = [np.diag(dDiag0_dtheta_i) + dQnn_dtheta_i for dDiag0_dtheta_i,dQnn_dtheta_i in zip(dDiag0_dtheta,dQnn_dtheta)]
dW_dtheta = [dKmm_dtheta_i + 2*np.dot(self.KmnTaut,dKmn_dtheta_i) - np.dot(self.KmnTaut*dDiag0_dtheta_i,self.KmnTaut.T) for dKmm_dtheta_i,dDiag0_dtheta_i,dKmn_dtheta_i in zip(dKmm_dtheta.T,dDiag0_dtheta,dKmn_dtheta.T)]
QiY = np.dot(self.Qi, self.mu_tilde)
QiYYQi = np.outer(QiY,QiY)
WiKmnTaut = np.dot(self.Woodbury_inv,self.KmnTaut)
K_Y = np.dot(self.KmmiKmn,QiY)
# gradient - theta
Atheta = [-0.5*np.dot(self.Taut,dDiag0_dtheta_i) + 0.5*np.sum(self.Kmmi*dKmm_dtheta_i) - 0.5*np.sum(self.Woodbury_inv*dW_dtheta_i) for dDiag0_dtheta_i,dKmm_dtheta_i,dW_dtheta_i in zip(dDiag0_dtheta,dKmm_dtheta.T,dW_dtheta)]
Btheta = np.array([0.5*np.sum(QiYYQi*dQ_dtheta_i) for dQ_dtheta_i in dQ_dtheta])
dL_dtheta = Atheta + Btheta
# gradient - Z
# Az
dQnn_dZ_diag_a2 = (np.array([d[:,:,None]*self.KmmiKmn[:,:,None] for d in dKnm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
dQnn_dZ_diag_b2 = (np.array([(self.KmmiKmn*np.sum(d[:,:,None]*self.KmmiKmn,-2))[:,:,None] for d in dKmm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
dQnn_dZ_diag = dQnn_dZ_diag_a2 - dQnn_dZ_diag_b2
d_hld_Diag1_dZ = -np.sum(np.dot(self.KmmiKmn*self.Taut,self.KmmiKmn.T)[:,:,None]*dKmm_dZ,-2) + np.sum((self.KmmiKmn*self.Taut)[:,:,None]*dKnm_dZ,-2)
d_hld_Kmm_dZ = np.sum(self.Kmmi[:,:,None]*dKmm_dZ,-2)
d_hld_W_dZ1 = np.sum(WiKmnTaut[:,:,None]*dKnm_dZ,-2)
d_hld_W_dZ3 = np.sum(self.Woodbury_inv[:,:,None]*dKmm_dZ,-2)
d_hld_W_dZ2 = np.array([np.sum(np.sum(WiKmnTaut.T*d[:,:,None]*self.KmnTaut.T,-2),-1) for d in dQnn_dZ_diag.transpose(2,0,1)]).T
Az = d_hld_Diag1_dZ + d_hld_Kmm_dZ - d_hld_W_dZ1 - d_hld_W_dZ2 - d_hld_W_dZ3
# Bz
Bz2 = np.sum(np.dot(K_Y,QiY.T)[:,:,None]*dKnm_dZ,-2)
Bz3 = - np.sum(np.dot(K_Y,K_Y.T)[:,:,None]*dKmm_dZ,-2)
Bz1 = -np.array([np.sum((QiY**2)*d[:,:,None],-2) for d in dQnn_dZ_diag.transpose(2,0,1)]).reshape(self.D,self.M).T
Bz = Bz1 + Bz2 + Bz3
dL_dZ = (Az + Bz).flatten()
return np.hstack([dL_dtheta, dL_dZ])
def predict(self,X):
"""
Make a prediction with the FITC-EP model
Arguments
---------
X : Input prediction data - Nx1 numpy array (floats)
"""
#TODO: check output dimensions
K_x = self.kernel.K(self.Z,X)
Kxx = self.kernel.K(X)
#K_x = self.kernM.cross.K(X)
# q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
# Ci = I + (RPT0)Di(RPT0).T
# C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
# = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
# = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
# = I - V.T * V
U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
C = np.eye(self.M) - np.dot(V.T,V)
mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
#self.C = C
#self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
#self.mu_u = mu_u
#self.U = U
# q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
mu_H = np.dot(mu_u,self.mu)
self.mu_H = mu_H
Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
KR0T = np.dot(K_x.T,self.R0.T)
mu_star = np.dot(KR0T,mu_H)
sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
vdiag = np.diag(sigma2_star)
# q(y_star|y) = non-gaussian posterior probability of class membership
p = self.likelihood.predictive_mean(mu_star,vdiag)
return mu_star,vdiag,p
def plot(self):
"""
Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals.
"""
if self.X.shape[1]==1:
pb.figure()
xmin,xmax = np.r_[self.X,self.Z].min(),np.r_[self.X,self.Z].max()
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
Xnew = np.linspace(xmin,xmax,100)[:,None]
mu_f, var_f, mu_phi = self.predict(Xnew)
self.mu_inducing,self.var_diag_inducing,self.phi_inducing = self.predict(self.Z)
pb.subplot(211)
self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.Z,Mean_u=self.mu_inducing,Var_u=self.var_diag_inducing)
pb.subplot(212)
self.likelihood.plot1Db(self.X,Xnew,mu_phi,self.Z)
elif self.X.shape[1]==2:
pb.figure()
x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
axis1 = np.linspace(x1min,x1max,50)
axis2 = np.linspace(x2min,x2max,50)
XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
Xnew = np.c_[XX1.flatten(),XX2.flatten()]
f,v,p = self.predict(Xnew)
self.likelihood.plot2D(self.X,Xnew,p,self.Z)
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
"""
Fits the sparse EP approximation and optimizes the hyperparameters iteratively until convergence is achieved.
"""
self.epsilon_em = epsilon
log_likelihood_change = self.epsilon_em + 1.
self.parameters_path = [self.kernel.get_param()]
self.approximate_likelihood()
self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
self.inducing_inputs_path = [self.Z]
self.log_likelihood_path = [self.log_likelihood()]
iteration = 0
while log_likelihood_change > self.epsilon_em:
print 'EM iteration', iteration
self.optimize(max_f_eval = max_f_eval)
log_likelihood_new = self.log_likelihood()
log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
if log_likelihood_change < 0:
print 'log_likelihood decrement'
self.kernel.expand_param(self.parameters_path[-1])
self.Z = self.inducing_inputs_path[-1].copy()
else:
self.approximate_likelihood()
self.log_likelihood_path.append(self.log_likelihood())
self.parameters_path.append(self.kernel.get_param())
self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
self.inducing_inputs_path.append(self.Z)
iteration += 1
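
A hedged sketch of the FITC-EP classifier (the probit constructor signature is an assumption, as in the GP_EP sketch, and optimize() is assumed from the model base class; toy data):

import numpy as np
from GPy import kern
from GPy.inference.likelihoods import probit
from GPy.models import generalized_FITC

X = np.random.randn(100, 1)
Y = np.where(X > 0, 1, -1)                  # toy binary labels
lik = probit(Y)                             # assumed constructor, see the GP_EP sketch
m = generalized_FITC(X, lik, kernel=kern.rbf(1) + kern.white(1), inducing=10)
m.approximate_likelihood()                  # EP site updates for the FITC approximation
m.optimize()                                # assumed from the model base class
m.plot()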

57
GPy/models/sparse_GPLVM.py Normal file
View file

@@ -0,0 +1,57 @@
import numpy as np
import pylab as pb
import sys, pdb
# from .. import kern
# from ..core import model
# from ..util.linalg import pdinv, PCA
from GPLVM import GPLVM
from sparse_GP_regression import sparse_GP_regression
class sparse_GPLVM(sparse_GP_regression, GPLVM):
"""
Sparse Gaussian Process Latent Variable Model
:param Y: observed data
:type Y: np.ndarray
:param Q: latent dimensionality
:type Q: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, Y, Q, init='PCA', **kwargs):
X = self.initialise_latent(init, Q, Y)
sparse_GP_regression.__init__(self, X, Y, **kwargs)
def get_param_names(self):
return (sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
+ sparse_GP_regression.get_param_names(self))
def get_param(self):
return np.hstack((self.X.flatten(), sparse_GP_regression.get_param(self)))
def set_param(self,x):
self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
sparse_GP_regression.set_param(self, x[self.X.size:])
def log_likelihood(self):
return sparse_GP_regression.log_likelihood(self)
def dL_dX(self):
dpsi0_dX = self.kern.dKdiag_dX(self.X)
dpsi1_dX = self.kern.dK_dX(self.X,self.Z)
dpsi2_dX = self.psi1[:,None,:,None]*dpsi1_dX[None,:,:,:]
dL_dX = ((self.dL_dpsi0 * dpsi0_dX).sum(0)
+ (self.dL_dpsi1[:,:,None]*dpsi1_dX).sum(0)
+ 2.0*(self.dL_dpsi2[:, :, None,None] * dpsi2_dX).sum(0).sum(0))
return dL_dX
def log_likelihood_gradients(self):
return np.hstack((self.dL_dX().flatten(), sparse_GP_regression.log_likelihood_gradients(self)))
def plot(self):
GPLVM.plot(self)
mu, var = sparse_GP_regression.predict(self, self.Z+np.random.randn(*self.Z.shape)*0.0001)
pb.plot(mu[:, 0] , mu[:, 1], 'ko')
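
A hedged usage sketch: the keyword M is passed through to sparse_GP_regression below, and optimize() is assumed from the model base class; the data are toy:

import numpy as np
from GPy.models import sparse_GPLVM

Y = np.random.randn(100, 5)                  # toy data: 100 points, 5 output dimensions
m = sparse_GPLVM(Y, Q=2, init='PCA', M=15)   # 2-D latent space, 15 inducing points
m.optimize()                                 # assumed from the model base class
print m.log_likelihood()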

177
GPy/models/sparse_GP_regression.py Normal file
View file

@@ -0,0 +1,177 @@
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from ..util.plot import gpplot
from .. import kern
from ..inference.likelihoods import likelihood
from GP_regression import GP_regression
class sparse_GP_regression(GP_regression):
"""
Variational sparse GP model (Regression)
:param X: inputs
:type X: np.ndarray (N x Q)
:param Y: observed data
:type Y: np.ndarray of observations (N x D)
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param beta: noise precision. TODO: ignore beta if doing EP
:type beta: float
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
self.beta = beta
if Z is None:
self.Z = np.random.permutation(X.copy())[:M]
self.M = M
else:
assert Z.shape[1]==X.shape[1]
self.Z = Z
self.M = Z.shape[0]
GP_regression.__init__(self, X, Y, kernel = kernel, normalize_X = normalize_X, normalize_Y = normalize_Y)
self.trYYT = np.sum(np.square(self.Y))
def set_param(self, p):
self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
self.beta = p[self.M*self.Q]
self.kern.expand_param(p[self.Z.size + 1:])
self.beta2 = self.beta**2
self._compute_kernel_matrices()
self._computations()
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
#TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!)
#TODO: slices for psi statistics (easy enough)
self.Kmm = self.kern.K(self.Z)
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
self.psi1 = self.kern.K(self.Z,self.X)
self.psi2 = np.dot(self.psi1,self.psi1.T)
self.dKmm_dtheta = self.kern.dK_dtheta(self.Z)
self.dpsi0_dtheta = self.kern.dKdiag_dtheta(self.X).sum(0)
self.dpsi1_dtheta = self.kern.dK_dtheta(self.Z,self.X)
tmp = np.dot(self.psi1, self.dpsi1_dtheta)
self.dpsi2_dtheta = tmp + tmp.transpose(1,0,2)
self.dpsi1_dZ = self.kern.dK_dX(self.Z,self.X)
self.dpsi2_dZ = np.tensordot(self.psi1,self.dpsi1_dZ,((1),(0)))*2.0
self.dKmm_dZ = self.kern.dK_dX(self.Z)
def _computations(self):
# TODO find routine to multiply triangular matrices
self.psi1Y = np.dot(self.psi1, self.Y)
self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T)
self.Lm = jitchol(self.Kmm)
self.Lmi = chol_inv(self.Lm)
self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
self.B = np.eye(self.M) + self.beta * self.A
self.LB = jitchol(self.B)
self.LBi = chol_inv(self.LB)
self.Bi = np.dot(self.LBi.T, self.LBi)
self.LLambdai = np.dot(self.LBi, self.Lmi)
self.trace_K = self.psi0 - np.trace(self.A)
self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
self.C = mdot(self.LLambdai, self.psi1Y)
self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T)
# Computes dL_dpsi
self.dL_dpsi0 = - 0.5 * self.D * self.beta
dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane.
tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
self.dL_dpsi1 = self.beta2 * tmp
self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi)
- self.beta**3 * 0.5 * self.G)
# Computes dL_dKmm TODO: nicer precomputations
# tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
# self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
# self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
# tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
# - self.beta*mdot(self.G, self.psi2, self.Kmmi))
# self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G)
tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
- self.beta*mdot(self.G, self.psi2, self.Kmmi))
self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE
def get_param(self):
return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])
def get_param_names(self):
return sum([['iip_%i_%i'%(i,j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[]) + ['noise_precision']+self.kern.extract_param_names()
def log_likelihood(self):
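# The five terms below appear to assemble the variational lower bound of Titsias (2009):
# sum_d [ log N(y_d | 0, Qnn + I/beta) - 0.5*beta*tr(Knn - Qnn) ], with Qnn = psi1.T Kmmi psi1.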
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
B = -0.5*self.beta*self.D*self.trace_K
C = -self.D * np.sum(np.log(np.diag(self.LB)))
D = -0.5*self.beta*self.trYYT
E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv)
return A+B+C+D+E
def dL_dbeta(self):
""" compute the gradient of the log likelihood wrt beta.
TODO: support heteroscedastic noise"""
dA_dbeta = 0.5 * self.N*self.D/self.beta
dB_dbeta = - 0.5 * self.D * self.trace_K
dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)
dD_dbeta = - 0.5 * self.trYYT
tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y)
dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2
* np.sum(self.A * np.dot(tmp, tmp.T)))
return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
def dL_dtheta(self):
dL_dtheta = (self.dL_dpsi0 * self.dpsi0_dtheta + (self.dL_dpsi1[:,:, None] * self.dpsi1_dtheta).sum(0).sum(0)
+ (self.dL_dpsi2[:, :, None] * self.dpsi2_dtheta).sum(0).sum(0)
+ (self.dL_dKmm[:, :, None] * self.dKmm_dtheta).sum(0).sum(0))
return dL_dtheta
def dL_dZ(self):
dL_dZ = ((self.dL_dpsi1[:,:,None]*self.dpsi1_dZ.swapaxes(0,1)).sum(1)
+ (self.dL_dpsi2[:, :, None] * self.dpsi2_dZ).sum(0)
+ 2.0*(self.dL_dKmm[:, :, None] * self.dKmm_dZ).sum(0))
return dL_dZ
def log_likelihood_gradients(self):
return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
def _raw_predict(self,_Xnew,slices):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.Z, _Xnew, self.Xslices, slices)
Kxx = self.kern.K(_Xnew, slices1=slices, slices2=slices)
mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(_Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
return mu,var
def plot(self,*args,**kwargs):
"""
Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
"""
GP_regression.plot(self,*args,**kwargs)
if self.Q==1:
pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
if self.Q==2:
pb.plot(self.Z[:,0],self.Z[:,1],'wo')
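
A hedged sparse-regression sketch (toy data; optimize() is assumed from the model base class; the chosen kernel and M are arbitrary):

import numpy as np
from GPy import kern
from GPy.models import sparse_GP_regression

X = np.linspace(0, 10, 200)[:, None]
Y = np.sin(X) + np.random.randn(200, 1) * 0.1
m = sparse_GP_regression(X, Y, kernel=kern.rbf(1), M=12, beta=100.)
m.optimize()                      # assumed from the model base class
m.plot()                          # GP_regression.plot plus markers for the inducing inputs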

90
GPy/models/warped_GP.py Normal file
View file

@@ -0,0 +1,90 @@
import numpy as np
from .. import kern
from ..core import model
from ..util.linalg import pdinv
from ..util.plot import gpplot
from ..util.warping_functions import *
from GP_regression import GP_regression
class warpedGP(GP_regression):
"""
TODO: proper docstrings!
@nfusi: I've hacked a little on this, but no guarantees. J.
"""
def __init__(self, X, Y, warping_function = None, warping_terms = 3, **kwargs):
if warping_function is None:
self.warping_function = TanhWarpingFunction(warping_terms)
# self.warping_params = np.random.randn(self.warping_function.n_terms, 3)
self.warping_params = np.ones((self.warping_function.n_terms, 3))*1.0 # TODO better init
self.warp_params_shape = (self.warping_function.n_terms, 3) # todo get this from the subclass
self.Z = Y.copy()
self.N, self.D = Y.shape
self.transform_data()
GP_regression.__init__(self, X, self.Y, **kwargs)
def set_param(self, x):
self.warping_params = x[:self.warping_function.num_parameters].reshape(self.warp_params_shape).copy()
self.transform_data()
GP_regression.set_param(self, x[self.warping_function.num_parameters:].copy())
def get_param(self):
return np.hstack((self.warping_params.flatten().copy(), GP_regression.get_param(self).copy()))
def get_param_names(self):
warping_names = self.warping_function.get_param_names()
param_names = GP_regression.get_param_names(self)
return warping_names + param_names
def transform_data(self):
self.Y = self.warping_function.f(self.Z.copy(), self.warping_params).copy()
# this supports the 'smart' behaviour in GP_regression
if self.D > self.N:
self.Youter = np.dot(self.Y, self.Y.T)
else:
self.Youter = None
return self.Y
def log_likelihood(self):
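# Warped GP (Snelson et al., 2004): the GP marginal likelihood of the transformed
# targets Y = f(Z) plus the log-Jacobian of the warping, sum(log f'(Z)).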
ll = GP_regression.log_likelihood(self)
jacobian = self.warping_function.fgrad_y(self.Z, self.warping_params)
return ll + np.log(jacobian).sum()
def log_likelihood_gradients(self):
ll_grads = GP_regression.log_likelihood_gradients(self)
alpha = np.dot(self.Ki, self.Y.flatten())
warping_grads = self.warping_function_gradients(alpha)
return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
def warping_function_gradients(self, Kiy):
grad_y = self.warping_function.fgrad_y(self.Z, self.warping_params)
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Z, self.warping_params,
return_covar_chain = True)
djac_dpsi = ((1.0/grad_y[:,:, None, None])*grad_y_psi).sum(axis=0).sum(axis=0)
dquad_dpsi = (Kiy[:,None,None,None] * grad_psi).sum(axis=0).sum(axis=0)
return -dquad_dpsi + djac_dpsi
def plot_warping(self):
self.warping_function.plot(self.warping_params, self.Z.min(), self.Z.max())
def predict(self, X, in_unwarped_space = False, **kwargs):
mu, var = GP_regression.predict(self, X, **kwargs)
# The plot() function calls set_param() before calling predict()
# this is causing the observations to be plotted in the transformed
# space (where Y lives), making the plot look very wrong
# if the predictions are made in the untransformed space
# (where Z lives). To fix this I included the option below. It's
# just a quick fix until I figure out something smarter.
if in_unwarped_space:
mu = self.warping_function.f_inv(mu, self.warping_params)
var = self.warping_function.f_inv(var, self.warping_params)
return mu, var
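
A hedged usage sketch for the warped GP: the tanh warping is the default built above, optimize() is assumed from the model base class, and the toy targets are exponentiated purely to give them a skewed scale:

import numpy as np
from GPy.models import warpedGP

X = np.linspace(0, 10, 60)[:, None]
f = np.sin(X) + np.random.randn(60, 1) * 0.1
Y = np.exp(f)                                 # toy targets on a warped (positive) scale
m = warpedGP(X, Y, warping_terms=3)
m.optimize()                                  # assumed from the model base class
m.plot_warping()                              # shows the learned tanh warping over the data range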