Mirror of https://github.com/SheffieldML/GPy.git
models
Commit 61984274dd (parent 63321e8409)
8 changed files with 991 additions and 0 deletions
GPy/models/GPLVM.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import numpy as np
import pylab as pb
import sys, pdb
from .. import kern
from ..core import model
from ..util.linalg import pdinv, PCA
from GP_regression import GP_regression

class GPLVM(GP_regression):
    """
    Gaussian Process Latent Variable Model

    :param Y: observed data
    :type Y: np.ndarray
    :param Q: latent dimensionality
    :type Q: int
    :param init: initialisation method for the latent space
    :type init: 'PCA'|'random'

    """
    def __init__(self, Y, Q, init='PCA', **kwargs):
        X = self.initialise_latent(init, Q, Y)
        GP_regression.__init__(self, X, Y, **kwargs)

    def initialise_latent(self, init, Q, Y):
        if init == 'PCA':
            return PCA(Y, Q)[0]
        else:
            return np.random.randn(Y.shape[0], Q)

    def get_param_names(self):
        return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
                + self.kern.extract_param_names())

    def get_param(self):
        return np.hstack((self.X.flatten(), self.kern.extract_param()))

    def set_param(self,x):
        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
        GP_regression.set_param(self, x[self.X.size:])

    def log_likelihood_gradients(self):
        dL_dK = self.dL_dK()

        dK_dtheta = self.kern.dK_dtheta(self.X)
        dL_dtheta = (dK_dtheta*dL_dK[:,:,None]).sum(0).sum(0)

        dK_dX = self.kern.dK_dX(self.X)
        dL_dX = 2.*np.sum(dL_dK[:,:,None]*dK_dX,0)

        return np.hstack((dL_dX.flatten(),dL_dtheta))

    def plot(self):
        assert self.Y.shape[1]==2
        pb.scatter(self.Y[:,0],self.Y[:,1],40,self.X[:,0].copy(),linewidth=0)
        Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
        mu, var = self.predict(Xnew)
        pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)

    def plot_latent(self):
        raise NotImplementedError
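For orientation, a minimal usage sketch of the GPLVM class above (not part of the commit): it assumes the GPy.models and GPy.kern layout of this commit and an optimize() method inherited from the model base class; the toy data, kernel choice and Q=2 are illustrative only.

import numpy as np
from GPy import models, kern

Y = np.random.randn(50, 5)                      # toy observed data: N=50 points, D=5 output dimensions
k = kern.rbf(2) + kern.bias(2) + kern.white(2)  # kernel over the Q=2 dimensional latent space
m = models.GPLVM(Y, 2, init='PCA', kernel=k)    # latent X initialised by PCA, kwargs passed to GP_regression
m.optimize()                                    # assumed: joint optimisation of latent X and kernel parameters
latent = m.X                                    # optimised latent coordinates, shape (50, 2)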
GPy/models/GP_EP.py (new file, 153 lines)
@@ -0,0 +1,153 @@
import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..inference.Expectation_Propagation import EP,Full
from ..inference.likelihoods import likelihood,probit,poisson,gaussian
from ..core import model
from ..util.linalg import pdinv,jitchol
from ..util.plot import gpplot

class GP_EP(model):
    def __init__(self,X,likelihood,kernel=None,epsilon_ep=1e-3,epsilon_em=.1,powerep=[1.,1.]):
        """
        Simple Gaussian Process with Non-Gaussian likelihood

        Arguments
        ---------
        :param X: input observations (NxD numpy.darray)
        :param likelihood: a GPy likelihood (likelihood class)
        :param kernel: a GPy kernel (kern class)
        :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 1e-3 (float)
        :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list)
        :rtype: GPy model class.
        """
        assert isinstance(kernel,kern.kern), 'kernel is not a kern instance'
        self.likelihood = likelihood
        self.Y = self.likelihood.Y
        self.kernel = kernel
        self.X = X
        self.N, self.D = self.X.shape
        self.eta,self.delta = powerep
        self.epsilon_ep = epsilon_ep
        self.jitter = 1e-12
        self.K = self.kernel.K(self.X)
        model.__init__(self)

    def set_param(self,p):
        self.kernel.expand_param(p)

    def get_param(self):
        return self.kernel.extract_param()

    def get_param_names(self):
        return self.kernel.extract_param_names()

    def approximate_likelihood(self):
        self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
        self.ep_approx.fit_EP()

    def posterior_param(self):
        self.K = self.kernel.K(self.X)
        self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K
        B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)[None,:]*self.Sroot_tilde_K
        #self.L = np.linalg.cholesky(B)
        self.L = jitchol(B)
        V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
        self.Sigma = self.K - np.dot(V.T,V)
        self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde)

    def log_likelihood(self):
        """
        Returns
        -------
        The EP approximation to the log-marginal likelihood
        """
        self.posterior_param()
        mu_ = self.ep_approx.v_/self.ep_approx.tau_
        L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L)))
        L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:]))
        L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde)
        L3 = sum(np.log(self.ep_approx.Z_hat))
        return L1 + L2A + L2B + L3

    def log_likelihood_gradients(self):
        dK_dp = self.kernel.dK_dtheta(self.X)
        self.dK_dp = dK_dp
        aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
        b = self.ep_approx.v_tilde - np.sqrt(self.ep_approx.tau_tilde)*linalg.flapack.dtrtrs(self.L.T,aux1)[0]
        U,info_u = linalg.flapack.dtrtrs(self.L,np.diag(np.sqrt(self.ep_approx.tau_tilde)),lower=1)
        dL_dK = 0.5*(np.outer(b,b)-np.dot(U.T,U))
        self.dL_dK = dL_dK
        return np.array([np.sum(dK_dpi*dL_dK) for dK_dpi in dK_dp.T])

    def predict(self,X):
        #TODO: check output dimensions
        self.posterior_param()
        K_x = self.kernel.K(self.X,X)
        Kxx = self.kernel.K(X)
        aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
        aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0)
        zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2
        f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta)
        v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1)
        variance = Kxx - np.dot(v.T,v)
        vdiag = np.diag(variance)
        y=self.likelihood.predictive_mean(f,vdiag)
        return f,vdiag,y

    def plot(self):
        """
        Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals.
        """
        if self.X.shape[1]==1:
            pb.figure()
            xmin,xmax = self.X.min(),self.X.max()
            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
            Xnew = np.linspace(xmin,xmax,100)[:,None]
            mu_f, var_f, mu_phi = self.predict(Xnew)
            pb.subplot(211)
            self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma))
            pb.subplot(212)
            self.likelihood.plot1Db(self.X,Xnew,mu_phi)
        elif self.X.shape[1]==2:
            pb.figure()
            x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
            x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
            x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
            x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
            axis1 = np.linspace(x1min,x1max,50)
            axis2 = np.linspace(x2min,x2max,50)
            XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
            Xnew = np.c_[XX1.flatten(),XX2.flatten()]
            f,v,p = self.predict(Xnew)
            self.likelihood.plot2D(self.X,Xnew,p)
        else:
            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"

    def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
        """
        Fits EP and optimizes the hyperparameters iteratively until convergence is achieved.
        """
        self.epsilon_em = epsilon
        log_likelihood_change = self.epsilon_em + 1.
        self.parameters_path = [self.kernel.get_param()]
        self.approximate_likelihood()
        self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
        self.log_likelihood_path = [self.log_likelihood()]
        iteration = 0
        while log_likelihood_change > self.epsilon_em:
            print 'EM iteration', iteration
            self.optimize(max_f_eval = max_f_eval)
            log_likelihood_new = self.log_likelihood()
            log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
            if log_likelihood_change < 0:
                print 'log_likelihood decrement'
                self.kernel.expand_param(self.parameters_path[-1])
                self.kernM.expand_param(self.parameters_path[-1])
            else:
                self.approximate_likelihood()
                self.log_likelihood_path.append(self.log_likelihood())
                self.parameters_path.append(self.kernel.get_param())
                self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
            iteration += 1
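A hedged sketch of how GP_EP might be driven for binary classification (not part of the commit): the probit(...) construction and import path are assumptions based on the imports at the top of this file, and em() alternates EP site updates with hyperparameter optimisation as described in its docstring.

import numpy as np
from GPy import kern
from GPy.models import GP_EP
from GPy.inference.likelihoods import probit

X = np.sort(np.random.uniform(-3, 3, (40, 1)), 0)
y = np.where(np.sin(2*X) > 0, 1, -1)            # toy binary labels
lik = probit(y)                                 # assumed: probit likelihood wrapping the labels as .Y
m = GP_EP(X, lik, kernel=kern.rbf(1) + kern.white(1))
m.em()                                          # EP fit alternated with kernel hyperparameter optimisation
f, v, p = m.predict(np.linspace(-3, 3, 100)[:, None])   # latent mean, variance, predictive probability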
GPy/models/GP_regression.py (new file, 207 lines)
@@ -0,0 +1,207 @@
import numpy as np
import pylab as pb
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot, Tango

class GP_regression(model):
    """
    Gaussian Process model for regression

    :param X: input observations
    :param Y: observed values
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
    :rtype: model object

    .. Note:: Multiple independent outputs are allowed using columns of Y

    """

    def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
        if kernel is None:
            kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])

        # parse arguments
        self.Xslices = Xslices
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
        self.X = X
        self.Y = Y
        assert len(self.X.shape)==2
        assert len(self.Y.shape)==2
        assert self.X.shape[0] == self.Y.shape[0]
        self.N, self.D = self.Y.shape
        self.N, self.Q = self.X.shape

        #here's some simple normalisation
        if normalize_X:
            self._Xmean = X.mean(0)[None,:]
            self._Xstd = X.std(0)[None,:]
            self.X = (X.copy()- self._Xmean) / self._Xstd
        else:
            self._Xmean = np.zeros((1,self.X.shape[1]))
            self._Xstd = np.ones((1,self.X.shape[1]))

        if normalize_Y:
            self._Ymean = Y.mean(0)[None,:]
            self._Ystd = Y.std(0)[None,:]
            self.Y = (Y.copy()- self._Ymean) / self._Ystd
        else:
            self._Ymean = np.zeros((1,self.Y.shape[1]))
            self._Ystd = np.ones((1,self.Y.shape[1]))

        if self.D > self.N:
            # then it's more efficient to store Youter
            self.Youter = np.dot(self.Y, self.Y.T)
        else:
            self.Youter = None

        model.__init__(self)

    def set_param(self,p):
        self.kern.expand_param(p)
        self.K = self.kern.K(self.X,slices1=self.Xslices)
        self.Ki,self.hld = pdinv(self.K)

    def get_param(self):
        return self.kern.extract_param()

    def get_param_names(self):
        return self.kern.extract_param_names()

    def _model_fit_term(self):
        """
        Computes the model fit using Youter if it's available
        """

        if self.Youter is None:
            return -0.5*np.trace(mdot(self.Y.T,self.Ki,self.Y))
        else:
            return -0.5*np.sum(np.multiply(self.Ki, self.Youter))  # fit term via the stored outer product

    def log_likelihood(self):
        complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - self.D*self.hld
        return complexity_term + self._model_fit_term()

    def dL_dK(self):
        if self.Youter is None:
            alpha = np.dot(self.Ki,self.Y)
            dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
        else:
            dL_dK = 0.5*(mdot(self.Ki, self.Youter, self.Ki) - self.D*self.Ki)

        return dL_dK

    def log_likelihood_gradients(self):
        return (self.kern.dK_dtheta(self.X,slices1=self.Xslices)*self.dL_dK()[:,:,None]).sum(0).sum(0)

    def predict(self,Xnew, slices=None):
        """
        Predict the function(s) at the new point(s) Xnew.

        Arguments
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.Q
        :param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
        :type slices: (None, list of slice objects, list of ints)
        :rtype: posterior mean, a Numpy array, Nnew x self.D
        :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)

        .. Note:: "slices" specifies how the points X_new co-vary with the training points.

            - If None, the new points covary through every kernel part (default)
            - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
            - If a list of booleans, specifying which kernel parts are active

        If self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
        This is to allow for different normalisations of the output dimensions.

        """

        #normalise X values
        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
        mu, var = self._raw_predict(Xnew,slices)

        #un-normalise
        mu = mu*self._Ystd + self._Ymean
        if self.D==1:
            var *= np.square(self._Ystd)
        else:
            var = var[:,:,None] * np.square(self._Ystd)
        return mu,var

    def _raw_predict(self,_Xnew,slices):
        """Internal helper function for making predictions, does not account for normalisation"""
        Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
        Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
        mu = np.dot(np.dot(Kx.T,self.Ki),self.Y)
        var = Kxx - np.dot(np.dot(Kx.T,self.Ki),Kx)
        return mu, var

    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None):
        """
        :param samples: the number of a posteriori samples to plot
        :param which_data: which of the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
        :param which_functions: which of the kernel functions to plot (additively)
        :type which_functions: list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D

        Plot the posterior of the GP.
            - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
            - In two dimensions, a contour-plot shows the mean predicted function
            - In higher dimensions, we've not implemented this yet !TODO!

        Can plot only part of the data and part of the posterior functions using which_data and which_functions
        """
        if which_functions=='all':
            which_functions = [True]*self.kern.Nparts
        if which_data=='all':
            which_data = slice(None)

        X = self.X[which_data,:]
        Y = self.Y[which_data,:]

        Xorig = X*self._Xstd + self._Xmean
        Yorig = Y*self._Ystd + self._Ymean
        if plot_limits is None:
            xmin,xmax = Xorig.min(0),Xorig.max(0)
            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
        elif len(plot_limits)==2:
            xmin, xmax = plot_limits
        else:
            raise ValueError, "Bad limits for plotting"

        if self.X.shape[1]==1:
            Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
            m,v = self.predict(Xnew,slices=which_functions)
            gpplot(Xnew,m,v)
            if samples:
                s = np.random.multivariate_normal(m.flatten(),v,samples)
                pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
            pb.plot(Xorig,Yorig,'kx',mew=1.5)
            pb.xlim(xmin,xmax)

        elif self.X.shape[1]==2:
            resolution = resolution or 50
            xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
            Xtest = np.vstack((xx.flatten(),yy.flatten())).T
            zz,vv = self.predict(Xtest,slices=which_functions)
            zz = zz.reshape(resolution,resolution)
            pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
            pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
            pb.xlim(xmin[0],xmax[0])
            pb.ylim(xmin[1],xmax[1])

        else:
            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
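As a point of reference, a minimal regression run against the class above (not part of the commit; optimize() is assumed from the model base class, the data are toy):

import numpy as np
from GPy import models, kern

X = np.linspace(0, 10, 30)[:, None]
Y = np.sin(X) + 0.1*np.random.randn(30, 1)
m = models.GP_regression(X, Y, kernel=kern.rbf(1) + kern.white(1))
m.optimize()                                    # assumed: maximises the marginal likelihood over kernel parameters
mu, var = m.predict(np.linspace(0, 10, 200)[:, None])
m.plot()                                        # 1D plot with a two-standard-deviation shaded region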
GPy/models/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from GP_regression import GP_regression
from sparse_GP_regression import sparse_GP_regression
from GPLVM import GPLVM
from warped_GP import warpedGP
from simple_GP_EP import GP_EP
from generalized_FITC import generalized_FITC
from sparse_GPLVM import sparse_GPLVM
GPy/models/generalized_FITC.py (new file, 239 lines)
@@ -0,0 +1,239 @@
import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot
from ..inference.Expectation_Propagation import EP,Full,DTC,FITC
from ..inference.likelihoods import likelihood,probit

class generalized_FITC(model):
    def __init__(self,X,likelihood,kernel=None,inducing=10,epsilon_ep=1e-3,powerep=[1.,1.]):
        """
        Implementation of EP with FITC, following Naish-Guzman, A. and Holden, S. (2008).

        Arguments
        ---------
        X : input observations
        likelihood : Output's likelihood (likelihood class)
        kernel : a GPy kernel
        inducing : Either an array specifying the inducing points location or a scalar defining their number.
        epsilon_ep : EP convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
        powerep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
        """
        assert isinstance(kernel,kern.kern)
        self.likelihood = likelihood
        self.Y = self.likelihood.Y
        self.kernel = kernel
        self.X = X
        self.N, self.D = self.X.shape
        assert self.Y.shape[0] == self.N
        if type(inducing) == int:
            self.M = inducing
            self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
        elif type(inducing) == np.ndarray:
            self.Z = inducing
            self.M = self.Z.shape[0]
        self.eta,self.delta = powerep
        self.epsilon_ep = epsilon_ep
        self.jitter = 1e-12
        model.__init__(self)

    def set_param(self,p):
        self.kernel.expand_param(p[0:-self.Z.size])
        self.Z = p[-self.Z.size:].reshape(self.M,self.D)

    def get_param(self):
        return np.hstack([self.kernel.extract_param(),self.Z.flatten()])

    def get_param_names(self):
        return self.kernel.extract_param_names()+['iip_%i'%i for i in range(self.Z.size)]

    def approximate_likelihood(self):
        self.Kmm = self.kernel.K(self.Z)
        self.Knm = self.kernel.K(self.X,self.Z)
        self.Knn_diag = self.kernel.Kdiag(self.X)
        self.ep_approx = FITC(self.Kmm,self.likelihood,self.Knm.T,self.Knn_diag,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
        self.ep_approx.fit_EP()

    def posterior_param(self):
        self.Knn_diag = self.kernel.Kdiag(self.X)
        self.Kmm = self.kernel.K(self.Z)
        self.Kmmi, self.Kmm_hld = pdinv(self.Kmm)
        self.Knm = self.kernel.K(self.X,self.Z)
        self.KmmiKmn = np.dot(self.Kmmi,self.Knm.T)
        self.Qnn = np.dot(self.Knm,self.KmmiKmn)
        self.Diag0 = self.Knn_diag - np.diag(self.Qnn)
        self.R0 = np.linalg.cholesky(self.Kmmi).T

        self.Taut = self.ep_approx.tau_tilde/(1.+ self.ep_approx.tau_tilde*self.Diag0)
        self.KmnTaut = self.Knm.T*self.Taut[None,:]
        self.KmnTautKnm = np.dot(self.KmnTaut, self.Knm)
        self.Woodbury_inv, self.Woodbury_hld = pdinv(self.Kmm + self.KmnTautKnm)
        self.Qnn_diag = self.Knn_diag - np.diag(self.Qnn) + 1./self.ep_approx.tau_tilde
        self.Qi = -np.dot(self.KmnTaut.T, np.dot(self.Woodbury_inv,self.KmnTaut)) + np.diag(self.Taut)
        self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - self.Kmm_hld + self.Woodbury_hld

        self.Diag = self.Diag0/(1.+ self.Diag0 * self.ep_approx.tau_tilde)
        self.P = (self.Diag / self.Diag0)[:,None] * self.Knm
        self.RPT0 = np.dot(self.R0,self.Knm.T)
        self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
        self.R,info = linalg.flapack.dtrtrs(self.L,self.R0,lower=1)
        self.RPT = np.dot(self.R,self.P.T)
        self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
        self.w = self.Diag * self.ep_approx.v_tilde
        self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.ep_approx.v_tilde))
        self.mu = self.w + np.dot(self.P,self.gamma)
        self.mu_tilde = (self.ep_approx.v_tilde/self.ep_approx.tau_tilde)[:,None]

    def log_likelihood(self):
        self.posterior_param()
        self.Youter = np.dot(self.mu_tilde,self.mu_tilde.T)
        A = -self.hld
        B = -.5*np.sum(self.Qi*self.Youter)
        C = sum(np.log(self.ep_approx.Z_hat))
        D = .5*np.sum(np.log(1./self.ep_approx.tau_tilde + 1./self.ep_approx.tau_))
        E = .5*np.sum((self.ep_approx.v_/self.ep_approx.tau_ - self.mu_tilde.flatten())**2/(1./self.ep_approx.tau_ + 1./self.ep_approx.tau_tilde))
        return A + B + C + D + E

    def log_likelihood_gradients(self):
        dKmm_dtheta = self.kernel.dK_dtheta(self.Z)
        dKnn_dtheta = self.kernel.dK_dtheta(self.X)
        dKmn_dtheta = self.kernel.dK_dtheta(self.Z,self.X)
        dKmm_dZ = -self.kernel.dK_dX(self.Z)
        dKnm_dZ = -self.kernel.dK_dX(self.X,self.Z)
        tmp = [np.dot(dKmn_dtheta_i,self.KmmiKmn) for dKmn_dtheta_i in dKmn_dtheta.T]
        dQnn_dtheta = [tmp_i + tmp_i.T - np.dot(np.dot(self.KmmiKmn.T,dKmm_dtheta_i),self.KmmiKmn) for tmp_i,dKmm_dtheta_i in zip(tmp,dKmm_dtheta.T)]
        dDiag0_dtheta = [np.diag(dKnn_dtheta_i) - np.diag(dQnn_dtheta_i) for dKnn_dtheta_i,dQnn_dtheta_i in zip(dKnn_dtheta.T,dQnn_dtheta)]
        dQ_dtheta = [np.diag(dDiag0_dtheta_i) + dQnn_dtheta_i for dDiag0_dtheta_i,dQnn_dtheta_i in zip(dDiag0_dtheta,dQnn_dtheta)]
        dW_dtheta = [dKmm_dtheta_i + 2*np.dot(self.KmnTaut,dKmn_dtheta_i) - np.dot(self.KmnTaut*dDiag0_dtheta_i,self.KmnTaut.T) for dKmm_dtheta_i,dDiag0_dtheta_i,dKmn_dtheta_i in zip(dKmm_dtheta.T,dDiag0_dtheta,dKmn_dtheta.T)]

        QiY = np.dot(self.Qi, self.mu_tilde)
        QiYYQi = np.outer(QiY,QiY)
        WiKmnTaut = np.dot(self.Woodbury_inv,self.KmnTaut)
        K_Y = np.dot(self.KmmiKmn,QiY)
        # gradient - theta
        Atheta = [-0.5*np.dot(self.Taut,dDiag0_dtheta_i) + 0.5*np.sum(self.Kmmi*dKmm_dtheta_i) - 0.5*np.sum(self.Woodbury_inv*dW_dtheta_i) for dDiag0_dtheta_i,dKmm_dtheta_i,dW_dtheta_i in zip(dDiag0_dtheta,dKmm_dtheta.T,dW_dtheta)]
        Btheta = np.array([0.5*np.sum(QiYYQi*dQ_dtheta_i) for dQ_dtheta_i in dQ_dtheta])
        dL_dtheta = Atheta + Btheta
        # gradient - Z
        # Az
        dQnn_dZ_diag_a2 = (np.array([d[:,:,None]*self.KmmiKmn[:,:,None] for d in dKnm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
        dQnn_dZ_diag_b2 = (np.array([(self.KmmiKmn*np.sum(d[:,:,None]*self.KmmiKmn,-2))[:,:,None] for d in dKmm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
        dQnn_dZ_diag = dQnn_dZ_diag_a2 - dQnn_dZ_diag_b2
        d_hld_Diag1_dZ = -np.sum(np.dot(self.KmmiKmn*self.Taut,self.KmmiKmn.T)[:,:,None]*dKmm_dZ,-2) + np.sum((self.KmmiKmn*self.Taut)[:,:,None]*dKnm_dZ,-2)
        d_hld_Kmm_dZ = np.sum(self.Kmmi[:,:,None]*dKmm_dZ,-2)
        d_hld_W_dZ1 = np.sum(WiKmnTaut[:,:,None]*dKnm_dZ,-2)
        d_hld_W_dZ3 = np.sum(self.Woodbury_inv[:,:,None]*dKmm_dZ,-2)
        d_hld_W_dZ2 = np.array([np.sum(np.sum(WiKmnTaut.T*d[:,:,None]*self.KmnTaut.T,-2),-1) for d in dQnn_dZ_diag.transpose(2,0,1)]).T
        Az = d_hld_Diag1_dZ + d_hld_Kmm_dZ - d_hld_W_dZ1 - d_hld_W_dZ2 - d_hld_W_dZ3
        # Bz
        Bz2 = np.sum(np.dot(K_Y,QiY.T)[:,:,None]*dKnm_dZ,-2)
        Bz3 = - np.sum(np.dot(K_Y,K_Y.T)[:,:,None]*dKmm_dZ,-2)
        Bz1 = -np.array([np.sum((QiY**2)*d[:,:,None],-2) for d in dQnn_dZ_diag.transpose(2,0,1)]).reshape(self.D,self.M).T
        Bz = Bz1 + Bz2 + Bz3
        dL_dZ = (Az + Bz).flatten()
        return np.hstack([dL_dtheta, dL_dZ])

    def predict(self,X):
        """
        Make a prediction for the vsGP model

        Arguments
        ---------
        X : Input prediction data - Nx1 numpy array (floats)
        """
        #TODO: check output dimensions
        K_x = self.kernel.K(self.Z,X)
        Kxx = self.kernel.K(X)
        #K_x = self.kernM.cross.K(X)
        # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)

        # Ci = I + (RPT0)Di(RPT0).T
        # C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
        #   = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
        #   = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
        #   = I - V.T * V
        U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
        V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
        C = np.eye(self.M) - np.dot(V.T,V)
        mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
        #self.C = C
        #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
        #self.mu_u = mu_u
        #self.U = U
        # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
        mu_H = np.dot(mu_u,self.mu)
        self.mu_H = mu_H
        Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
        # q(f_star|y) = N(f_star|mu_star,sigma2_star)
        KR0T = np.dot(K_x.T,self.R0.T)
        mu_star = np.dot(KR0T,mu_H)
        sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
        vdiag = np.diag(sigma2_star)
        # q(y_star|y) = non-gaussian posterior probability of class membership
        p = self.likelihood.predictive_mean(mu_star,vdiag)
        return mu_star,vdiag,p

    def plot(self):
        """
        Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals.
        """
        if self.X.shape[1]==1:
            pb.figure()
            xmin,xmax = np.r_[self.X,self.Z].min(),np.r_[self.X,self.Z].max()
            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
            Xnew = np.linspace(xmin,xmax,100)[:,None]
            mu_f, var_f, mu_phi = self.predict(Xnew)
            self.mu_inducing,self.var_diag_inducing,self.phi_inducing = self.predict(self.Z)
            pb.subplot(211)
            self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.Z,Mean_u=self.mu_inducing,Var_u=self.var_diag_inducing)
            pb.subplot(212)
            self.likelihood.plot1Db(self.X,Xnew,mu_phi,self.Z)
        elif self.X.shape[1]==2:
            pb.figure()
            x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
            x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
            x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
            x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
            axis1 = np.linspace(x1min,x1max,50)
            axis2 = np.linspace(x2min,x2max,50)
            XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
            Xnew = np.c_[XX1.flatten(),XX2.flatten()]
            f,v,p = self.predict(Xnew)
            self.likelihood.plot2D(self.X,Xnew,p,self.Z)
        else:
            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"

    def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
        """
        Fits sparse EP and optimizes the hyperparameters iteratively until convergence is achieved.
        """
        self.epsilon_em = epsilon
        log_likelihood_change = self.epsilon_em + 1.
        self.parameters_path = [self.kernel.get_param()]
        self.approximate_likelihood()
        self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
        self.inducing_inputs_path = [self.Z]
        self.log_likelihood_path = [self.log_likelihood()]
        iteration = 0
        while log_likelihood_change > self.epsilon_em:
            print 'EM iteration', iteration
            self.optimize(max_f_eval = max_f_eval)
            log_likelihood_new = self.log_likelihood()
            log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
            if log_likelihood_change < 0:
                print 'log_likelihood decrement'
                self.kernel.expand_param(self.parameters_path[-1])
                self.kernM = self.kernel.copy()
                self.kernM.expand_X(self.inducing_inputs_path[-1])
                self.__init__(self.kernel,self.likelihood,kernM=self.kernM,powerep=[self.eta,self.delta],epsilon_ep = self.epsilon_ep, epsilon_em = self.epsilon_em)

            else:
                self.approximate_likelihood()
                self.log_likelihood_path.append(self.log_likelihood())
                self.parameters_path.append(self.kernel.get_param())
                self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
                self.inducing_inputs_path.append(self.Z)
            iteration += 1
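A sketch of the FITC-EP classifier above, under the same caveats as the GP_EP example (probit construction assumed, data toy); inducing=10 lets the model draw 10 inducing inputs uniformly from the input range.

import numpy as np
from GPy import kern
from GPy.models import generalized_FITC
from GPy.inference.likelihoods import probit

X = np.random.uniform(-3, 3, (200, 1))
y = np.where(np.sin(2*X) > 0, 1, -1)
m = generalized_FITC(X, probit(y), kernel=kern.rbf(1) + kern.white(1), inducing=10)
m.em()                                          # EP site updates alternated with kernel/inducing-point optimisation
mu, v, p = m.predict(np.linspace(-3, 3, 100)[:, None])
m.plot()                                        # shows the fit and the inducing points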
GPy/models/sparse_GPLVM.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import numpy as np
import pylab as pb
import sys, pdb
# from .. import kern
# from ..core import model
# from ..util.linalg import pdinv, PCA
from GPLVM import GPLVM
from sparse_GP_regression import sparse_GP_regression

class sparse_GPLVM(sparse_GP_regression, GPLVM):
    """
    Sparse Gaussian Process Latent Variable Model

    :param Y: observed data
    :type Y: np.ndarray
    :param Q: latent dimensionality
    :type Q: int
    :param init: initialisation method for the latent space
    :type init: 'PCA'|'random'

    """
    def __init__(self, Y, Q, init='PCA', **kwargs):
        X = self.initialise_latent(init, Q, Y)
        sparse_GP_regression.__init__(self, X, Y, **kwargs)

    def get_param_names(self):
        return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
                + sparse_GP_regression.get_param_names(self))

    def get_param(self):
        return np.hstack((self.X.flatten(), sparse_GP_regression.get_param(self)))

    def set_param(self,x):
        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
        sparse_GP_regression.set_param(self, x[self.X.size:])

    def log_likelihood(self):
        return sparse_GP_regression.log_likelihood(self)

    def dL_dX(self):
        dpsi0_dX = self.kern.dKdiag_dX(self.X)
        dpsi1_dX = self.kern.dK_dX(self.X,self.Z)
        dpsi2_dX = self.psi1[:,None,:,None]*dpsi1_dX[None,:,:,:]

        dL_dX = ((self.dL_dpsi0 * dpsi0_dX).sum(0)
                 + (self.dL_dpsi1[:,:,None]*dpsi1_dX).sum(0)
                 + 2.0*(self.dL_dpsi2[:, :, None,None] * dpsi2_dX).sum(0).sum(0))

        return dL_dX

    def log_likelihood_gradients(self):
        return np.hstack((self.dL_dX().flatten(), sparse_GP_regression.log_likelihood_gradients(self)))

    def plot(self):
        GPLVM.plot(self)
        mu, var = sparse_GP_regression.predict(self, self.Z+np.random.randn(*self.Z.shape)*0.0001)
        pb.plot(mu[:, 0] , mu[:, 1], 'ko')
GPy/models/sparse_GP_regression.py (new file, 177 lines)
@@ -0,0 +1,177 @@
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from ..util.plot import gpplot
from .. import kern
from ..inference.likelihoods import likelihood
from GP_regression import GP_regression

class sparse_GP_regression(GP_regression):
    """
    Variational sparse GP model (Regression)

    :param X: inputs
    :type X: np.ndarray (N x Q)
    :param Y: observed data
    :type Y: np.ndarray of observations (N x D)
    :param kernel : the kernel/covariance function. See link kernels
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param beta: noise precision. TODO> ignore beta if doing EP
    :type beta: float
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """

    def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
        self.beta = beta
        if Z is None:
            self.Z = np.random.permutation(X.copy())[:M]
            self.M = M
        else:
            assert Z.shape[1]==X.shape[1]
            self.Z = Z
            self.M = Z.shape[0]  # Z is M x Q

        GP_regression.__init__(self, X, Y, kernel = kernel, normalize_X = normalize_X, normalize_Y = normalize_Y)
        self.trYYT = np.sum(np.square(self.Y))

    def set_param(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        self.beta = p[self.M*self.Q]
        self.kern.set_param(p[self.Z.size + 1:])
        self.beta2 = self.beta**2
        self._compute_kernel_matrices()
        self._computations()


    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        #TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!)
        #TODO: slices for psi statistics (easy enough)

        self.Kmm = self.kern.K(self.Z)
        self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
        self.psi1 = self.kern.K(self.Z,self.X)
        self.psi2 = np.dot(self.psi1,self.psi1.T)

        self.dKmm_dtheta = self.kern.dK_dtheta(self.Z)
        self.dpsi0_dtheta = self.kern.dKdiag_dtheta(self.X).sum(0)
        self.dpsi1_dtheta = self.kern.dK_dtheta(self.Z,self.X)
        tmp = np.dot(self.psi1, self.dpsi1_dtheta)
        self.dpsi2_dtheta = tmp + tmp.transpose(1,0,2)

        self.dpsi1_dZ = self.kern.dK_dX(self.Z,self.X)
        self.dpsi2_dZ = np.tensordot(self.psi1,self.dpsi1_dZ,((1),(0)))*2.0
        self.dKmm_dZ = self.kern.dK_dX(self.Z)

    def _computations(self):
        # TODO find routine to multiply triangular matrices
        self.psi1Y = np.dot(self.psi1, self.Y)
        self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T)
        self.Lm = jitchol(self.Kmm)
        self.Lmi = chol_inv(self.Lm)
        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
        self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
        self.B = np.eye(self.M) + self.beta * self.A
        self.LB = jitchol(self.B)
        self.LBi = chol_inv(self.LB)
        self.Bi = np.dot(self.LBi.T, self.LBi)
        self.LLambdai = np.dot(self.LBi, self.Lmi)
        self.trace_K = self.psi0 - np.trace(self.A)
        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
        self.C = mdot(self.LLambdai, self.psi1Y)
        self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T)

        # Computes dL_dpsi
        self.dL_dpsi0 = - 0.5 * self.D * self.beta
        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane.
        tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
        self.dL_dpsi1 = self.beta2 * tmp
        self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi)
                         - self.beta**3 * 0.5 * self.G)

        # Computes dL_dKmm TODO: nicer precomputations

        # tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
        # self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
        # self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
        # tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
        #        - self.beta*mdot(self.G, self.psi2, self.Kmmi))
        # self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G)

        tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
        self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
        tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
               - self.beta*mdot(self.G, self.psi2, self.Kmmi))
        self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE

    def get_param(self):
        return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])

    def get_param_names(self):
        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names()

    def log_likelihood(self):
        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
        B = -0.5*self.beta*self.D*self.trace_K
        C = -self.D * np.sum(np.log(np.diag(self.LB)))
        D = -0.5*self.beta*self.trYYT
        E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv)

        return A+B+C+D+E

    def dL_dbeta(self):
        """ compute the gradient of the log likelihood wrt beta.
        TODO: support heteroscedastic noise"""

        dA_dbeta = 0.5 * self.N*self.D/self.beta
        dB_dbeta = - 0.5 * self.D * self.trace_K
        dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)
        dD_dbeta = - 0.5 * self.trYYT
        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y)
        dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2
                    * np.sum(self.A * np.dot(tmp, tmp.T)))

        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)

    def dL_dtheta(self):
        dL_dtheta = (self.dL_dpsi0 * self.dpsi0_dtheta + (self.dL_dpsi1[:,:, None] * self.dpsi1_dtheta).sum(0).sum(0)
                     + (self.dL_dpsi2[:, :, None] * self.dpsi2_dtheta).sum(0).sum(0)
                     + (self.dL_dKmm[:, :, None] * self.dKmm_dtheta).sum(0).sum(0))
        return dL_dtheta

    def dL_dZ(self):
        dL_dZ = ((self.dL_dpsi1[:,:,None]*self.dpsi1_dZ.swapaxes(0,1)).sum(1)
                 + (self.dL_dpsi2[:, :, None] * self.dpsi2_dZ).sum(0)
                 + 2.0*(self.dL_dKmm[:, :, None] * self.dKmm_dZ).sum(0))
        return dL_dZ


    def log_likelihood_gradients(self):
        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])

    def _raw_predict(self,_Xnew,slices):
        """Internal helper function for making predictions, does not account for normalisation"""

        Kx = self.kern.K(self.Z, _Xnew, self.Xslices, slices)
        Kxx = self.kern.K(_Xnew, slices1=slices, slices2=slices)

        mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y)
        var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(_Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
        return mu,var

    def plot(self,*args,**kwargs):
        """
        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
        """
        GP_regression.plot(self,*args,**kwargs)
        if self.Q==1:
            pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
        if self.Q==2:
            pb.plot(self.Z[:,0],self.Z[:,1],'wo')
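A sketch of the variational sparse model above on toy data (not part of the commit): when Z is not given, M inducing inputs are taken as a random subset of X, and beta is the initial noise precision; optimize() is assumed from the model base class.

import numpy as np
from GPy import models, kern

X = np.random.uniform(0, 10, (500, 1))
Y = np.sin(X) + 0.1*np.random.randn(500, 1)
m = models.sparse_GP_regression(X, Y, kernel=kern.rbf(1), M=20, beta=100.)
m.optimize()                                    # assumed: optimises Z, beta and kernel parameters jointly
mu, var = m.predict(np.linspace(0, 10, 200)[:, None])
m.plot()                                        # GP_regression plot plus tick marks for the inducing inputs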
GPy/models/warped_GP.py (new file, 90 lines)
@@ -0,0 +1,90 @@
import numpy as np
from .. import kern
from ..core import model
from ..util.linalg import pdinv
from ..util.plot import gpplot
from ..util.warping_functions import *
from GP_regression import GP_regression


class warpedGP(GP_regression):
    """
    TODO: docstrings!

    @nfusi: I've hacked a little on this, but no guarantees. J.
    """
    def __init__(self, X, Y, warping_function = None, warping_terms = 3, **kwargs):

        if warping_function == None:
            self.warping_function = TanhWarpingFunction(warping_terms)
            # self.warping_params = np.random.randn(self.warping_function.n_terms, 3)
            self.warping_params = np.ones((self.warping_function.n_terms, 3))*1.0 # TODO better init
            self.warp_params_shape = (self.warping_function.n_terms, 3) # todo get this from the subclass

        self.Z = Y.copy()
        self.N, self.D = Y.shape
        self.transform_data()
        GP_regression.__init__(self, X, self.Y, **kwargs)

    def set_param(self, x):
        self.warping_params = x[:self.warping_function.num_parameters].reshape(self.warp_params_shape).copy()
        self.transform_data()
        GP_regression.set_param(self, x[self.warping_function.num_parameters:].copy())

    def get_param(self):
        return np.hstack((self.warping_params.flatten().copy(), GP_regression.get_param(self).copy()))

    def get_param_names(self):
        warping_names = self.warping_function.get_param_names()
        param_names = GP_regression.get_param_names(self)
        return warping_names + param_names

    def transform_data(self):
        self.Y = self.warping_function.f(self.Z.copy(), self.warping_params).copy()

        # this supports the 'smart' behaviour in GP_regression
        if self.D > self.N:
            self.Youter = np.dot(self.Y, self.Y.T)
        else:
            self.Youter = None

        return self.Y

    def log_likelihood(self):
        ll = GP_regression.log_likelihood(self)
        jacobian = self.warping_function.fgrad_y(self.Z, self.warping_params)
        return ll + np.log(jacobian).sum()

    def log_likelihood_gradients(self):
        ll_grads = GP_regression.log_likelihood_gradients(self)
        alpha = np.dot(self.Ki, self.Y.flatten())
        warping_grads = self.warping_function_gradients(alpha)
        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))

    def warping_function_gradients(self, Kiy):
        grad_y = self.warping_function.fgrad_y(self.Z, self.warping_params)
        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Z, self.warping_params,
                                                                 return_covar_chain = True)

        djac_dpsi = ((1.0/grad_y[:,:, None, None])*grad_y_psi).sum(axis=0).sum(axis=0)
        dquad_dpsi = (Kiy[:,None,None,None] * grad_psi).sum(axis=0).sum(axis=0)

        return -dquad_dpsi + djac_dpsi

    def plot_warping(self):
        self.warping_function.plot(self.warping_params, self.Z.min(), self.Z.max())

    def predict(self, X, in_unwarped_space = False, **kwargs):
        mu, var = GP_regression.predict(self, X, **kwargs)

        # The plot() function calls set_param() before calling predict(),
        # which causes the observations to be plotted in the transformed
        # space (where Y lives), making the plot look very wrong
        # if the predictions are made in the untransformed space
        # (where Z lives). To fix this I included the option below. It's
        # just a quick fix until I figure out something smarter.
        if in_unwarped_space:
            mu = self.warping_function.f_inv(mu, self.warping_params)
            var = self.warping_function.f_inv(var, self.warping_params)

        return mu, var
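Finally, a sketch of the warped GP above (not part of the commit): TanhWarpingFunction comes from GPy.util.warping_functions as imported at the top of the file, and in_unwarped_space=True is the work-around described in the comments of predict(); data and kernel are illustrative.

import numpy as np
from GPy import models, kern

X = np.linspace(0, 10, 50)[:, None]
Yobs = np.exp(np.sin(X) + 0.1*np.random.randn(50, 1))   # positive-valued targets, a natural case for warping
m = models.warpedGP(X, Yobs, warping_terms=3, kernel=kern.rbf(1) + kern.white(1))
m.optimize()                                    # assumed: fits warping, kernel and noise parameters together
mu, var = m.predict(X, in_unwarped_space=True)  # map predictions back towards the original observation space
m.plot_warping()                                # inspect the learned warping function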