From 61984274ddeb5eee10d1f6e04c699d6550f19941 Mon Sep 17 00:00:00 2001
From: Nicolo Fusi
Date: Thu, 29 Nov 2012 16:32:48 +0000
Subject: [PATCH] models

---
 GPy/models/GPLVM.py                |  61 ++++++++
 GPy/models/GP_EP.py                | 153 ++++++++++++++++++
 GPy/models/GP_regression.py        | 207 +++++++++++++++++++++++++
 GPy/models/__init__.py             |   7 +
 GPy/models/generalized_FITC.py     | 239 +++++++++++++++++++++++++
 GPy/models/sparse_GPLVM.py         |  57 +++++++
 GPy/models/sparse_GP_regression.py | 177 +++++++++++++++++++++
 GPy/models/warped_GP.py            |  90 +++++++++++
 8 files changed, 991 insertions(+)
 create mode 100644 GPy/models/GPLVM.py
 create mode 100644 GPy/models/GP_EP.py
 create mode 100644 GPy/models/GP_regression.py
 create mode 100644 GPy/models/__init__.py
 create mode 100644 GPy/models/generalized_FITC.py
 create mode 100644 GPy/models/sparse_GPLVM.py
 create mode 100644 GPy/models/sparse_GP_regression.py
 create mode 100644 GPy/models/warped_GP.py

diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
new file mode 100644
index 00000000..ac644d9b
--- /dev/null
+++ b/GPy/models/GPLVM.py
@@ -0,0 +1,61 @@
+import numpy as np
+import pylab as pb
+import sys, pdb
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv, PCA
+from GP_regression import GP_regression
+
+class GPLVM(GP_regression):
+    """
+    Gaussian Process Latent Variable Model
+
+    :param Y: observed data
+    :type Y: np.ndarray
+    :param Q: latent dimensionality
+    :type Q: int
+    :param init: initialisation method for the latent space
+    :type init: 'PCA'|'random'
+
+    """
+    def __init__(self, Y, Q, init='PCA', **kwargs):
+        X = self.initialise_latent(init, Q, Y)
+        GP_regression.__init__(self, X, Y, **kwargs)
+
+    def initialise_latent(self, init, Q, Y):
+        if init == 'PCA':
+            return PCA(Y, Q)[0]
+        else:
+            return np.random.randn(Y.shape[0], Q)
+
+    def get_param_names(self):
+        return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
+                + self.kern.extract_param_names())
+
+    def get_param(self):
+        return np.hstack((self.X.flatten(), self.kern.extract_param()))
+
+    def set_param(self,x):
+        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
+        GP_regression.set_param(self, x[self.X.size:])
+
+    def log_likelihood_gradients(self):
+        dL_dK = self.dL_dK()
+
+        dK_dtheta = self.kern.dK_dtheta(self.X)
+        dL_dtheta = (dK_dtheta*dL_dK[:,:,None]).sum(0).sum(0)
+
+        dK_dX = self.kern.dK_dX(self.X)
+        dL_dX = 2.*np.sum(dL_dK[:,:,None]*dK_dX,0)
+
+        return np.hstack((dL_dX.flatten(),dL_dtheta))
+
+    def plot(self):
+        assert self.Y.shape[1]==2
+        pb.scatter(self.Y[:,0],self.Y[:,1],40,self.X[:,0].copy(),linewidth=0)
+        Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
+        mu, var = self.predict(Xnew)
+        pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
+
+    def plot_latent(self):
+        raise NotImplementedError

diff --git a/GPy/models/GP_EP.py b/GPy/models/GP_EP.py
new file mode 100644
index 00000000..134b7c43
--- /dev/null
+++ b/GPy/models/GP_EP.py
@@ -0,0 +1,153 @@
+import numpy as np
+import pylab as pb
+from scipy import stats, linalg
+from .. import kern
+from ..inference.Expectation_Propagation import EP,Full
+from ..inference.likelihoods import likelihood,probit,poisson,gaussian
+from ..core import model
+from ..util.linalg import pdinv,jitchol
+from ..util.plot import gpplot
+
+class GP_EP(model):
+    def __init__(self,X,likelihood,kernel=None,epsilon_ep=1e-3,epsilon_em=.1,powerep=[1.,1.]):
+        """
+        Simple Gaussian Process with Non-Gaussian likelihood
+
+        Arguments
+        ---------
+        :param X: input observations (NxD numpy.ndarray)
+        :param likelihood: a GPy likelihood (likelihood class)
+        :param kernel: a GPy kernel (kern class)
+        :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 1e-3 (float)
+        :param epsilon_em: convergence criterion for the EM algorithm (see em()), defaults to .1 (float)
+        :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list)
+        :rtype: GPy model class.
+        """
+        assert isinstance(kernel,kern.kern), 'kernel is not a kern instance'
+        self.likelihood = likelihood
+        self.Y = self.likelihood.Y
+        self.kernel = kernel
+        self.X = X
+        self.N, self.D = self.X.shape
+        self.eta,self.delta = powerep
+        self.epsilon_ep = epsilon_ep
+        self.jitter = 1e-12
+        self.K = self.kernel.K(self.X)
+        model.__init__(self)
+
+    def set_param(self,p):
+        self.kernel.expand_param(p)
+
+    def get_param(self):
+        return self.kernel.extract_param()
+
+    def get_param_names(self):
+        return self.kernel.extract_param_names()
+
+    def approximate_likelihood(self):
+        self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
+        self.ep_approx.fit_EP()
+
+    def posterior_param(self):
+        self.K = self.kernel.K(self.X)
+        self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K
+        B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)[None,:]*self.Sroot_tilde_K
+        #self.L = np.linalg.cholesky(B)
+        self.L = jitchol(B)
+        V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
+        self.Sigma = self.K - np.dot(V.T,V)
+        self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde)
+
+    def log_likelihood(self):
+        """
+        Returns
+        -------
+        The EP approximation to the log-marginal likelihood
+        """
+        self.posterior_param()
+        mu_ = self.ep_approx.v_/self.ep_approx.tau_
+        L1 = .5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L)))
+        L2A = .5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:]))
+        L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde)
+        L3 = sum(np.log(self.ep_approx.Z_hat))
+        return L1 + L2A + L2B + L3
+
+    def log_likelihood_gradients(self):
+        dK_dp = self.kernel.dK_dtheta(self.X)
+        self.dK_dp = dK_dp
+        aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
+        b = self.ep_approx.v_tilde - np.sqrt(self.ep_approx.tau_tilde)*linalg.flapack.dtrtrs(self.L.T,aux1)[0]
+        U,info_u = linalg.flapack.dtrtrs(self.L,np.diag(np.sqrt(self.ep_approx.tau_tilde)),lower=1)
+        dL_dK = 0.5*(np.outer(b,b)-np.dot(U.T,U))
+        self.dL_dK = dL_dK
+        return np.array([np.sum(dK_dpi*dL_dK) for dK_dpi in dK_dp.T])
+
+    def predict(self,X):
+        #TODO: check output dimensions
+        self.posterior_param()
+        K_x = self.kernel.K(self.X,X)
+        Kxx = self.kernel.K(X)
+        aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
+        aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0)
+        zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2
+        f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta)
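+        # The mean above follows the standard EP prediction (Rasmussen & Williams
+        # 2006, Eq. 3.60): zeta = S~^{1/2} B^{-1} S~^{1/2} K v_tilde, so that
+        # f = K_x^T (v_tilde - zeta). The variance below is Eq. 3.61,
+        # Kxx - K_x^T S~^{1/2} B^{-1} S~^{1/2} K_x, computed with a triangular
+        # solve against L, the Cholesky factor of B.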
+        v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1)
+        variance = Kxx - np.dot(v.T,v)
+        vdiag = np.diag(variance)
+        y = self.likelihood.predictive_mean(f,vdiag)
+        return f,vdiag,y
+
+    def plot(self):
+        """
+        Plot the fitted model: training function values, mean estimate and confidence intervals.
+        """
+        if self.X.shape[1]==1:
+            pb.figure()
+            xmin,xmax = self.X.min(),self.X.max()
+            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+            Xnew = np.linspace(xmin,xmax,100)[:,None]
+            mu_f, var_f, mu_phi = self.predict(Xnew)
+            pb.subplot(211)
+            self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma))
+            pb.subplot(212)
+            self.likelihood.plot1Db(self.X,Xnew,mu_phi)
+        elif self.X.shape[1]==2:
+            pb.figure()
+            x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
+            x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
+            x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
+            x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
+            axis1 = np.linspace(x1min,x1max,50)
+            axis2 = np.linspace(x2min,x2max,50)
+            XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
+            Xnew = np.c_[XX1.flatten(),XX2.flatten()]
+            f,v,p = self.predict(Xnew)
+            self.likelihood.plot2D(self.X,Xnew,p)
+        else:
+            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
+
+    def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
+        """
+        Fits EP and optimizes the hyperparameters iteratively until convergence is achieved.
+        """
+        self.epsilon_em = epsilon
+        log_likelihood_change = self.epsilon_em + 1.
+        self.parameters_path = [self.kernel.get_param()]
+        self.approximate_likelihood()
+        self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
+        self.log_likelihood_path = [self.log_likelihood()]
+        iteration = 0
+        while log_likelihood_change > self.epsilon_em:
+            print 'EM iteration', iteration
+            self.optimize(max_f_eval = max_f_eval)
+            log_likelihood_new = self.log_likelihood()
+            log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
+            if log_likelihood_change < 0:
+                print 'log_likelihood decrement'
+                # revert to the last accepted hyperparameters
+                self.kernel.expand_param(self.parameters_path[-1])
+            else:
+                self.approximate_likelihood()
+                self.log_likelihood_path.append(self.log_likelihood())
+                self.parameters_path.append(self.kernel.get_param())
+                self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
+            iteration += 1

diff --git a/GPy/models/GP_regression.py b/GPy/models/GP_regression.py
new file mode 100644
index 00000000..fc947be8
--- /dev/null
+++ b/GPy/models/GP_regression.py
@@ -0,0 +1,207 @@
+import numpy as np
+import pylab as pb
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv,mdot
+from ..util.plot import gpplot, Tango
+
+class GP_regression(model):
+    """
+    Gaussian Process model for regression
+
+    :param X: input observations
+    :param Y: observed values
+    :param kernel: a GPy kernel, defaults to rbf+white
+    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
+    :type normalize_Y: False|True
+    :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to).
+        See (link:slicing)
+    :rtype: model object
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+
+    def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])
+
+        # parse arguments
+        self.Xslices = Xslices
+        assert isinstance(kernel, kern.kern)
+        self.kern = kernel
+        self.X = X
+        self.Y = Y
+        assert len(self.X.shape)==2
+        assert len(self.Y.shape)==2
+        assert self.X.shape[0] == self.Y.shape[0]
+        self.N, self.D = self.Y.shape
+        self.N, self.Q = self.X.shape
+
+        #here's some simple normalisation
+        if normalize_X:
+            self._Xmean = X.mean(0)[None,:]
+            self._Xstd = X.std(0)[None,:]
+            self.X = (X.copy() - self._Xmean) / self._Xstd
+        else:
+            self._Xmean = np.zeros((1,self.X.shape[1]))
+            self._Xstd = np.ones((1,self.X.shape[1]))
+
+        if normalize_Y:
+            self._Ymean = Y.mean(0)[None,:]
+            self._Ystd = Y.std(0)[None,:]
+            self.Y = (Y.copy() - self._Ymean) / self._Ystd
+        else:
+            self._Ymean = np.zeros((1,self.Y.shape[1]))
+            self._Ystd = np.ones((1,self.Y.shape[1]))
+
+        if self.D > self.N:
+            # then it's more efficient to store Youter
+            self.Youter = np.dot(self.Y, self.Y.T)
+        else:
+            self.Youter = None
+
+        model.__init__(self)
+
+    def set_param(self,p):
+        self.kern.expand_param(p)
+        self.K = self.kern.K(self.X,slices1=self.Xslices)
+        self.Ki,self.hld = pdinv(self.K)
+
+    def get_param(self):
+        return self.kern.extract_param()
+
+    def get_param_names(self):
+        return self.kern.extract_param_names()
+
+    def _model_fit_term(self):
+        """
+        Computes the model fit using Youter if it's available
+        """
+        if self.Youter is None:
+            return -0.5*np.trace(mdot(self.Y.T,self.Ki,self.Y))
+        else:
+            return -0.5*np.sum(np.multiply(self.Ki, self.Youter))
+
+    def log_likelihood(self):
+        complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - self.D*self.hld
+        return complexity_term + self._model_fit_term()
+
+    def dL_dK(self):
+        if self.Youter is None:
+            alpha = np.dot(self.Ki,self.Y)
+            dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
+        else:
+            dL_dK = 0.5*(mdot(self.Ki, self.Youter, self.Ki) - self.D*self.Ki)
+        return dL_dK
+
+    def log_likelihood_gradients(self):
+        return (self.kern.dK_dtheta(self.X,slices1=self.Xslices)*self.dL_dK()[:,:,None]).sum(0).sum(0)
+
+    def predict(self,Xnew, slices=None):
+        """
+        Predict the function(s) at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.Q
+        :param slices: specifies which output kernel(s) the Xnew correspond to (see below)
+        :type slices: (None, list of slice objects, list of ints)
+        :rtype: posterior mean, a Numpy array, Nnew x self.D
+        :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)
+
+        .. Note:: "slices" specifies how the points X_new co-vary with the training points.
+
+            - If None, the new points covary through every kernel part (default)
+            - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
+            - If a list of booleans, specifying which kernel parts are active
+
+        If self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+        This is to allow for different normalisations of the output dimensions.
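+
+        A minimal usage sketch (assuming a fitted model ``m`` of this class)::
+
+            Xnew = np.random.randn(10, m.Q)
+            mu, var = m.predict(Xnew)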
+
+        """
+
+        #normalise X values
+        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
+        mu, var = self._raw_predict(Xnew,slices)
+
+        #un-normalise
+        mu = mu*self._Ystd + self._Ymean
+        if self.D==1:
+            var *= np.square(self._Ystd)
+        else:
+            var = var[:,:,None] * np.square(self._Ystd)
+        return mu,var
+
+    def _raw_predict(self,_Xnew,slices):
+        """Internal helper function for making predictions, does not account for normalisation"""
+        Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
+        Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
+        mu = np.dot(np.dot(Kx.T,self.Ki),self.Y)
+        var = Kxx - np.dot(np.dot(Kx.T,self.Ki),Kx)
+        return mu, var
+
+    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None):
+        """
+        :param samples: the number of a posteriori samples to plot
+        :param which_data: which of the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :param which_functions: which of the kernel functions to plot (additively)
+        :type which_functions: list of bools
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+
+        Plot the posterior of the GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, this is not implemented yet (TODO)
+
+        Can plot only part of the data and part of the posterior functions using which_data and which_functions
+        """
+        if which_functions=='all':
+            which_functions = [True]*self.kern.Nparts
+        if which_data=='all':
+            which_data = slice(None)
+
+        X = self.X[which_data,:]
+        Y = self.Y[which_data,:]
+
+        Xorig = X*self._Xstd + self._Xmean
+        Yorig = Y*self._Ystd + self._Ymean
+        if plot_limits is None:
+            xmin,xmax = Xorig.min(0),Xorig.max(0)
+            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+        elif len(plot_limits)==2:
+            xmin, xmax = plot_limits
+        else:
+            raise ValueError, "Bad limits for plotting"
+
+        if self.X.shape[1]==1:
+            Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
+            m,v = self.predict(Xnew,slices=which_functions)
+            gpplot(Xnew,m,v)
+            if samples:
+                s = np.random.multivariate_normal(m.flatten(),v,samples)
+                pb.plot(Xnew.flatten(),s.T, alpha=0.4, c='#3465a4', linewidth=0.8)
+            pb.plot(Xorig,Yorig,'kx',mew=1.5)
+            pb.xlim(xmin,xmax)
+
+        elif self.X.shape[1]==2:
+            resolution = resolution or 50
+            xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
+            Xtest = np.vstack((xx.flatten(),yy.flatten())).T
+            zz,vv = self.predict(Xtest,slices=which_functions)
+            zz = zz.reshape(resolution,resolution)
+            pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
+            pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
+            pb.xlim(xmin[0],xmax[0])
+            pb.ylim(xmin[1],xmax[1])
+
+        else:
+            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
new file mode 100644
index 00000000..1ec2a4fe
--- /dev/null
+++ b/GPy/models/__init__.py
@@ -0,0 +1,7 @@
+from GP_regression import GP_regression
+from sparse_GP_regression import sparse_GP_regression
+from GPLVM import GPLVM
+from warped_GP import warpedGP
+from GP_EP import GP_EP
+from generalized_FITC import generalized_FITC
+from sparse_GPLVM import sparse_GPLVM

diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py
new file mode 100644
index 00000000..8fb59264
--- /dev/null
+++ b/GPy/models/generalized_FITC.py
@@ -0,0 +1,239 @@
+import numpy as np
+import pylab as pb
+from scipy import stats, linalg
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv,mdot
+from ..util.plot import gpplot
+from ..inference.Expectation_Propagation import EP,Full,DTC,FITC
+from ..inference.likelihoods import likelihood,probit
+
+class generalized_FITC(model):
+    def __init__(self,X,likelihood,kernel=None,inducing=10,epsilon_ep=1e-3,powerep=[1.,1.]):
+        """
+        Implementation of EP with FITC (Naish-Guzman, A. and Holden, S., 2008).
+
+        Arguments
+        ---------
+        X : input observations
+        likelihood : Output's likelihood (likelihood class)
+        kernel : a GPy kernel
+        inducing : Either an array specifying the inducing points location or a scalar defining their number.
+        epsilon_ep : EP convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
+        powerep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
+        """
+        assert isinstance(kernel,kern.kern)
+        self.likelihood = likelihood
+        self.Y = self.likelihood.Y
+        self.kernel = kernel
+        self.X = X
+        self.N, self.D = self.X.shape
+        assert self.Y.shape[0] == self.N
+        if type(inducing) == int:
+            self.M = inducing
+            self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
+        elif type(inducing) == np.ndarray:
+            self.Z = inducing
+            self.M = self.Z.shape[0]
+        self.eta,self.delta = powerep
+        self.epsilon_ep = epsilon_ep
+        self.jitter = 1e-12
+        model.__init__(self)
+
+    def set_param(self,p):
+        self.kernel.expand_param(p[0:-self.Z.size])
+        self.Z = p[-self.Z.size:].reshape(self.M,self.D)
+
+    def get_param(self):
+        return np.hstack([self.kernel.extract_param(),self.Z.flatten()])
+
+    def get_param_names(self):
+        return self.kernel.extract_param_names()+['iip_%i'%i for i in range(self.Z.size)]
+
+    def approximate_likelihood(self):
+        self.Kmm = self.kernel.K(self.Z)
+        self.Knm = self.kernel.K(self.X,self.Z)
+        self.Knn_diag = self.kernel.Kdiag(self.X)
+        self.ep_approx = FITC(self.Kmm,self.likelihood,self.Knm.T,self.Knn_diag,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
+        self.ep_approx.fit_EP()
+
+    def posterior_param(self):
+        self.Knn_diag = self.kernel.Kdiag(self.X)
+        self.Kmm = self.kernel.K(self.Z)
+        self.Kmmi, self.Kmm_hld = pdinv(self.Kmm)
+        self.Knm = self.kernel.K(self.X,self.Z)
+        self.KmmiKmn = np.dot(self.Kmmi,self.Knm.T)
+        self.Qnn = np.dot(self.Knm,self.KmmiKmn)
+        self.Diag0 = self.Knn_diag - np.diag(self.Qnn)
+        self.R0 = np.linalg.cholesky(self.Kmmi).T
+
+        self.Taut = self.ep_approx.tau_tilde/(1.+ self.ep_approx.tau_tilde*self.Diag0)
+        self.KmnTaut = self.Knm.T*self.Taut[None,:]
+        self.KmnTautKnm = np.dot(self.KmnTaut, self.Knm)
+        self.Woodbury_inv, self.Woodbury_hld = pdinv(self.Kmm + self.KmnTautKnm)
+        self.Qnn_diag = self.Knn_diag - np.diag(self.Qnn) + 1./self.ep_approx.tau_tilde
+        self.Qi = -np.dot(self.KmnTaut.T, np.dot(self.Woodbury_inv,self.KmnTaut)) + np.diag(self.Taut)
+        self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - self.Kmm_hld + self.Woodbury_hld
+
+        self.Diag = self.Diag0/(1.+ self.Diag0 * self.ep_approx.tau_tilde)
+        self.P = (self.Diag / self.Diag0)[:,None] * self.Knm
+        self.RPT0 = np.dot(self.R0,self.Knm.T)
+        self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
+        self.R,info = linalg.flapack.dtrtrs(self.L,self.R0,lower=1)
+        self.RPT = np.dot(self.R,self.P.T)
+        self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
+        self.w = self.Diag * self.ep_approx.v_tilde
+        self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.ep_approx.v_tilde))
+        self.mu = self.w + np.dot(self.P,self.gamma)
+        self.mu_tilde = (self.ep_approx.v_tilde/self.ep_approx.tau_tilde)[:,None]
+
+    def log_likelihood(self):
+        self.posterior_param()
+        self.Youter = np.dot(self.mu_tilde,self.mu_tilde.T)
+        A = -self.hld
+        B = -.5*np.sum(self.Qi*self.Youter)
+        C = sum(np.log(self.ep_approx.Z_hat))
+        D = .5*np.sum(np.log(1./self.ep_approx.tau_tilde + 1./self.ep_approx.tau_))
+        E = .5*np.sum((self.ep_approx.v_/self.ep_approx.tau_ - self.mu_tilde.flatten())**2/(1./self.ep_approx.tau_ + 1./self.ep_approx.tau_tilde))
+        return A + B + C + D + E
+
+    def log_likelihood_gradients(self):
+        dKmm_dtheta = self.kernel.dK_dtheta(self.Z)
+        dKnn_dtheta = self.kernel.dK_dtheta(self.X)
+        dKmn_dtheta = self.kernel.dK_dtheta(self.Z,self.X)
+        dKmm_dZ = -self.kernel.dK_dX(self.Z)
+        dKnm_dZ = -self.kernel.dK_dX(self.X,self.Z)
+        tmp = [np.dot(dKmn_dtheta_i,self.KmmiKmn) for dKmn_dtheta_i in dKmn_dtheta.T]
+        dQnn_dtheta = [tmp_i + tmp_i.T - np.dot(np.dot(self.KmmiKmn.T,dKmm_dtheta_i),self.KmmiKmn) for tmp_i,dKmm_dtheta_i in zip(tmp,dKmm_dtheta.T)]
+        dDiag0_dtheta = [np.diag(dKnn_dtheta_i) - np.diag(dQnn_dtheta_i) for dKnn_dtheta_i,dQnn_dtheta_i in zip(dKnn_dtheta.T,dQnn_dtheta)]
+        dQ_dtheta = [np.diag(dDiag0_dtheta_i) + dQnn_dtheta_i for dDiag0_dtheta_i,dQnn_dtheta_i in zip(dDiag0_dtheta,dQnn_dtheta)]
+        dW_dtheta = [dKmm_dtheta_i + 2*np.dot(self.KmnTaut,dKmn_dtheta_i) - np.dot(self.KmnTaut*dDiag0_dtheta_i,self.KmnTaut.T) for dKmm_dtheta_i,dDiag0_dtheta_i,dKmn_dtheta_i in zip(dKmm_dtheta.T,dDiag0_dtheta,dKmn_dtheta.T)]
+
+        QiY = np.dot(self.Qi, self.mu_tilde)
+        QiYYQi = np.outer(QiY,QiY)
+        WiKmnTaut = np.dot(self.Woodbury_inv,self.KmnTaut)
+        K_Y = np.dot(self.KmmiKmn,QiY)
+        # gradient - theta
+        Atheta = [-0.5*np.dot(self.Taut,dDiag0_dtheta_i) + 0.5*np.sum(self.Kmmi*dKmm_dtheta_i) - 0.5*np.sum(self.Woodbury_inv*dW_dtheta_i) for dDiag0_dtheta_i,dKmm_dtheta_i,dW_dtheta_i in zip(dDiag0_dtheta,dKmm_dtheta.T,dW_dtheta)]
+        Btheta = np.array([0.5*np.sum(QiYYQi*dQ_dtheta_i) for dQ_dtheta_i in dQ_dtheta])
+        dL_dtheta = Atheta + Btheta
+        # gradient - Z
+        # Az
+        dQnn_dZ_diag_a2 = (np.array([d[:,:,None]*self.KmmiKmn[:,:,None] for d in dKnm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
+        dQnn_dZ_diag_b2 = (np.array([(self.KmmiKmn*np.sum(d[:,:,None]*self.KmmiKmn,-2))[:,:,None] for d in dKmm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
+        dQnn_dZ_diag = dQnn_dZ_diag_a2 - dQnn_dZ_diag_b2
+        d_hld_Diag1_dZ = -np.sum(np.dot(self.KmmiKmn*self.Taut,self.KmmiKmn.T)[:,:,None]*dKmm_dZ,-2) + np.sum((self.KmmiKmn*self.Taut)[:,:,None]*dKnm_dZ,-2)
+        d_hld_Kmm_dZ = np.sum(self.Kmmi[:,:,None]*dKmm_dZ,-2)
+        d_hld_W_dZ1 = np.sum(WiKmnTaut[:,:,None]*dKnm_dZ,-2)
+        d_hld_W_dZ3 = np.sum(self.Woodbury_inv[:,:,None]*dKmm_dZ,-2)
+        d_hld_W_dZ2 = np.array([np.sum(np.sum(WiKmnTaut.T*d[:,:,None]*self.KmnTaut.T,-2),-1) for d in dQnn_dZ_diag.transpose(2,0,1)]).T
+        Az = d_hld_Diag1_dZ + d_hld_Kmm_dZ - d_hld_W_dZ1 - d_hld_W_dZ2 - d_hld_W_dZ3
+        # Bz
+        Bz2 = np.sum(np.dot(K_Y,QiY.T)[:,:,None]*dKnm_dZ,-2)
+        Bz3 = - np.sum(np.dot(K_Y,K_Y.T)[:,:,None]*dKmm_dZ,-2)
+        Bz1 = -np.array([np.sum((QiY**2)*d[:,:,None],-2) for d in dQnn_dZ_diag.transpose(2,0,1)]).reshape(self.D,self.M).T
+        Bz = Bz1 + Bz2 + Bz3
+        dL_dZ = (Az + Bz).flatten()
+        return np.hstack([dL_dtheta, dL_dZ])
+
+    def predict(self,X):
+        """
+        Make a prediction for the generalized FITC model
+
+        Arguments
+        ---------
+        X : Input prediction data - NxD numpy array (floats)
+        """
+        #TODO: check output dimensions
+        K_x = self.kernel.K(self.Z,X)
+        Kxx = self.kernel.K(X)
+        #K_x = self.kernM.cross.K(X)
+        # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
+
+        # Ci = I + (RPT0)Di(RPT0).T
+        # C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
+        #   = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
+        #   = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
+        #   = I - V.T * V
+        U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
+        V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
+        C = np.eye(self.M) - np.dot(V.T,V)
+        mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
+        #self.C = C
+        #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
+        #self.mu_u = mu_u
+        #self.U = U
+        # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
+        mu_H = np.dot(mu_u,self.mu)
+        self.mu_H = mu_H
+        Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
+        # q(f_star|y) = N(f_star|mu_star,sigma2_star)
+        KR0T = np.dot(K_x.T,self.R0.T)
+        mu_star = np.dot(KR0T,mu_H)
+        sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+        vdiag = np.diag(sigma2_star)
+        # q(y_star|y) = non-gaussian posterior probability of class membership
+        p = self.likelihood.predictive_mean(mu_star,vdiag)
+        return mu_star,vdiag,p
+
+    def plot(self):
+        """
+        Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals.
+        """
+        if self.X.shape[1]==1:
+            pb.figure()
+            xmin,xmax = np.r_[self.X,self.Z].min(),np.r_[self.X,self.Z].max()
+            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+            Xnew = np.linspace(xmin,xmax,100)[:,None]
+            mu_f, var_f, mu_phi = self.predict(Xnew)
+            self.mu_inducing,self.var_diag_inducing,self.phi_inducing = self.predict(self.Z)
+            pb.subplot(211)
+            self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.Z,Mean_u=self.mu_inducing,Var_u=self.var_diag_inducing)
+            pb.subplot(212)
+            self.likelihood.plot1Db(self.X,Xnew,mu_phi,self.Z)
+        elif self.X.shape[1]==2:
+            pb.figure()
+            x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
+            x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
+            x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
+            x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
+            axis1 = np.linspace(x1min,x1max,50)
+            axis2 = np.linspace(x2min,x2max,50)
+            XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
+            Xnew = np.c_[XX1.flatten(),XX2.flatten()]
+            f,v,p = self.predict(Xnew)
+            self.likelihood.plot2D(self.X,Xnew,p,self.Z)
+        else:
+            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
+
+    def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
+        """
+        Fits sparse EP and optimizes the hyperparameters iteratively until convergence is achieved.
+        """
+        self.epsilon_em = epsilon
+        log_likelihood_change = self.epsilon_em + 1.
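+        # Bookkeeping for the EM loop: the parameter, site-approximation and
+        # inducing-input histories are kept so that a step which decreases the
+        # (approximate) marginal likelihood can be rolled back to the last
+        # accepted state.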
+        self.parameters_path = [self.kernel.get_param()]
+        self.approximate_likelihood()
+        self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
+        self.inducing_inputs_path = [self.Z]
+        self.log_likelihood_path = [self.log_likelihood()]
+        iteration = 0
+        while log_likelihood_change > self.epsilon_em:
+            print 'EM iteration', iteration
+            self.optimize(max_f_eval = max_f_eval)
+            log_likelihood_new = self.log_likelihood()
+            log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
+            if log_likelihood_change < 0:
+                print 'log_likelihood decrement'
+                # revert to the last accepted hyperparameters and inducing inputs
+                self.kernel.expand_param(self.parameters_path[-1])
+                self.Z = self.inducing_inputs_path[-1]
+            else:
+                self.approximate_likelihood()
+                self.log_likelihood_path.append(self.log_likelihood())
+                self.parameters_path.append(self.kernel.get_param())
+                self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
+                self.inducing_inputs_path.append(self.Z)
+            iteration += 1

diff --git a/GPy/models/sparse_GPLVM.py b/GPy/models/sparse_GPLVM.py
new file mode 100644
index 00000000..f1888b6c
--- /dev/null
+++ b/GPy/models/sparse_GPLVM.py
@@ -0,0 +1,57 @@
+import numpy as np
+import pylab as pb
+import sys, pdb
+# from .. import kern
+# from ..core import model
+# from ..util.linalg import pdinv, PCA
+from GPLVM import GPLVM
+from sparse_GP_regression import sparse_GP_regression
+
+class sparse_GPLVM(sparse_GP_regression, GPLVM):
+    """
+    Sparse Gaussian Process Latent Variable Model
+
+    :param Y: observed data
+    :type Y: np.ndarray
+    :param Q: latent dimensionality
+    :type Q: int
+    :param init: initialisation method for the latent space
+    :type init: 'PCA'|'random'
+
+    """
+    def __init__(self, Y, Q, init='PCA', **kwargs):
+        X = self.initialise_latent(init, Q, Y)
+        sparse_GP_regression.__init__(self, X, Y, **kwargs)
+
+    def get_param_names(self):
+        return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
+                + sparse_GP_regression.get_param_names(self))
+
+    def get_param(self):
+        return np.hstack((self.X.flatten(), sparse_GP_regression.get_param(self)))
+
+    def set_param(self,x):
+        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
+        sparse_GP_regression.set_param(self, x[self.X.size:])
+
+    def log_likelihood(self):
+        return sparse_GP_regression.log_likelihood(self)
+
+    def dL_dX(self):
+        dpsi0_dX = self.kern.dKdiag_dX(self.X)
+        dpsi1_dX = self.kern.dK_dX(self.X,self.Z)
+        dpsi2_dX = self.psi1[:,None,:,None]*dpsi1_dX[None,:,:,:]
+
+        dL_dX = ((self.dL_dpsi0 * dpsi0_dX).sum(0)
+                 + (self.dL_dpsi1[:,:,None]*dpsi1_dX).sum(0)
+                 + 2.0*(self.dL_dpsi2[:, :, None,None] * dpsi2_dX).sum(0).sum(0))
+
+        return dL_dX
+
+    def log_likelihood_gradients(self):
+        return np.hstack((self.dL_dX().flatten(), sparse_GP_regression.log_likelihood_gradients(self)))
+
+    def plot(self):
+        GPLVM.plot(self)
+        mu, var = sparse_GP_regression.predict(self, self.Z+np.random.randn(*self.Z.shape)*0.0001)
+        pb.plot(mu[:, 0], mu[:, 1], 'ko')

diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py
new file mode 100644
index 00000000..fc434691
--- /dev/null
+++ b/GPy/models/sparse_GP_regression.py
@@ -0,0 +1,177 @@
+import numpy as np
+import pylab as pb
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv
+from ..util.plot import gpplot
+from .. import kern
+from ..inference.likelihoods import likelihood
+from GP_regression import GP_regression
+
+class sparse_GP_regression(GP_regression):
+    """
+    Variational sparse GP model (Regression)
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param Y: observed data
+    :type Y: np.ndarray of observations (N x D)
+    :param kernel: the kernel/covariance function. See link kernels
+    :type kernel: a GPy kernel
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
+    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
+    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type M: int
+    :param beta: noise precision. TODO: ignore beta if doing EP
+    :type beta: float
+    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
+    :type normalize_(X|Y): bool
+    """
+
+    def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
+        self.beta = beta
+        if Z is None:
+            self.Z = np.random.permutation(X.copy())[:M]
+            self.M = M
+        else:
+            assert Z.shape[1]==X.shape[1]
+            self.Z = Z
+            self.M = Z.shape[0]
+
+        GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y)
+        self.trYYT = np.sum(np.square(self.Y))
+
+    def set_param(self, p):
+        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
+        self.beta = p[self.M*self.Q]
+        self.kern.expand_param(p[self.Z.size + 1:])
+        self.beta2 = self.beta**2
+        self._compute_kernel_matrices()
+        self._computations()
+
+    def _compute_kernel_matrices(self):
+        # kernel computations, using BGPLVM notation
+        #TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!)
+        #TODO: slices for psi statistics (easy enough)
+        self.Kmm = self.kern.K(self.Z)
+        self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
+        self.psi1 = self.kern.K(self.Z,self.X)
+        self.psi2 = np.dot(self.psi1,self.psi1.T)
+
+        self.dKmm_dtheta = self.kern.dK_dtheta(self.Z)
+        self.dpsi0_dtheta = self.kern.dKdiag_dtheta(self.X).sum(0)
+        self.dpsi1_dtheta = self.kern.dK_dtheta(self.Z,self.X)
+        tmp = np.dot(self.psi1, self.dpsi1_dtheta)
+        self.dpsi2_dtheta = tmp + tmp.transpose(1,0,2)
+
+        self.dpsi1_dZ = self.kern.dK_dX(self.Z,self.X)
+        self.dpsi2_dZ = np.tensordot(self.psi1,self.dpsi1_dZ,((1),(0)))*2.0
+        self.dKmm_dZ = self.kern.dK_dX(self.Z)
+
+    def _computations(self):
+        # TODO find routine to multiply triangular matrices
+        self.psi1Y = np.dot(self.psi1, self.Y)
+        self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T)
+        self.Lm = jitchol(self.Kmm)
+        self.Lmi = chol_inv(self.Lm)
+        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
+        self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
+        self.B = np.eye(self.M) + self.beta * self.A
+        self.LB = jitchol(self.B)
+        self.LBi = chol_inv(self.LB)
+        self.Bi = np.dot(self.LBi.T, self.LBi)
+        self.LLambdai = np.dot(self.LBi, self.Lmi)
+        self.trace_K = self.psi0 - np.trace(self.A)
+        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
+        self.C = mdot(self.LLambdai, self.psi1Y)
+        self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T)
+
+        # Computes dL_dpsi
+        self.dL_dpsi0 = - 0.5 * self.D * self.beta
+        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane.
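+        # Broadcasting gives dC_dpsi1 shape (M, M, N, D); contracting it with
+        # C (shape M x D) over the second and last axes below collapses it to
+        # the (M, N) gradient dL_dpsi1, matching psi1 = K(Z, X).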
+        tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
+        self.dL_dpsi1 = self.beta2 * tmp
+        self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi)
+                         - self.beta**3 * 0.5 * self.G)
+
+        # Computes dL_dKmm TODO: nicer precomputations
+        tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
+        self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
+        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
+        tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
+               - self.beta*mdot(self.G, self.psi2, self.Kmmi))
+        self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE
+
+    def get_param(self):
+        return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])
+
+    def get_param_names(self):
+        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision'] + self.kern.extract_param_names()
+
+    def log_likelihood(self):
+        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
+        B = -0.5*self.beta*self.D*self.trace_K
+        C = -self.D * np.sum(np.log(np.diag(self.LB)))
+        D = -0.5*self.beta*self.trYYT
+        E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv)
+        return A+B+C+D+E
+
+    def dL_dbeta(self):
+        """
+        Compute the gradient of the log likelihood wrt beta.
+        TODO: support heteroscedastic noise
+        """
+        dA_dbeta = 0.5 * self.N*self.D/self.beta
+        dB_dbeta = - 0.5 * self.D * self.trace_K
+        dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)
+        dD_dbeta = - 0.5 * self.trYYT
+        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y)
+        dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2
+                    * np.sum(self.A * np.dot(tmp, tmp.T)))
+        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
+
+    def dL_dtheta(self):
+        dL_dtheta = (self.dL_dpsi0 * self.dpsi0_dtheta
+                     + (self.dL_dpsi1[:,:, None] * self.dpsi1_dtheta).sum(0).sum(0)
+                     + (self.dL_dpsi2[:, :, None] * self.dpsi2_dtheta).sum(0).sum(0)
+                     + (self.dL_dKmm[:, :, None] * self.dKmm_dtheta).sum(0).sum(0))
+        return dL_dtheta
+
+    def dL_dZ(self):
+        dL_dZ = ((self.dL_dpsi1[:,:,None]*self.dpsi1_dZ.swapaxes(0,1)).sum(1)
+                 + (self.dL_dpsi2[:, :, None] * self.dpsi2_dZ).sum(0)
+                 + 2.0*(self.dL_dKmm[:, :, None] * self.dKmm_dZ).sum(0))
+        return dL_dZ
+
+    def log_likelihood_gradients(self):
+        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
+
+    def _raw_predict(self,_Xnew,slices):
+        """Internal helper function for making predictions, does not account for normalisation"""
+        Kx = self.kern.K(self.Z, _Xnew, self.Xslices, slices)
+        Kxx = self.kern.K(_Xnew, slices1=slices, slices2=slices)
+        mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y)
+        var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(_Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
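+        # Since LBL_inv = (Kmm + beta*psi2)^-1, this is the usual variational
+        # sparse-GP predictive variance: the Nystrom term Kxx - Kx^T Kmm^-1 Kx,
+        # plus the correction Kx^T (Kmm + beta*psi2)^-1 Kx, plus beta^-1
+        # observation noise on the diagonal.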
+        return mu,var
+
+    def plot(self,*args,**kwargs):
+        """
+        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
+        """
+        GP_regression.plot(self,*args,**kwargs)
+        if self.Q==1:
+            pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
+        if self.Q==2:
+            pb.plot(self.Z[:,0],self.Z[:,1],'wo')

diff --git a/GPy/models/warped_GP.py b/GPy/models/warped_GP.py
new file mode 100644
index 00000000..880b8b29
--- /dev/null
+++ b/GPy/models/warped_GP.py
@@ -0,0 +1,90 @@
+import numpy as np
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv
+from ..util.plot import gpplot
+from ..util.warping_functions import *
+from GP_regression import GP_regression
+
+
+class warpedGP(GP_regression):
+    """
+    Warped Gaussian process regression. TODO: write proper docstrings.
+
+    @nfusi: I've hacked a little on this, but no guarantees. J.
+    """
+    def __init__(self, X, Y, warping_function=None, warping_terms=3, **kwargs):
+        if warping_function is None:
+            self.warping_function = TanhWarpingFunction(warping_terms)
+            # self.warping_params = np.random.randn(self.warping_function.n_terms, 3)
+            self.warping_params = np.ones((self.warping_function.n_terms, 3))*1.0 # TODO better init
+            self.warp_params_shape = (self.warping_function.n_terms, 3) # todo get this from the subclass
+
+        self.Z = Y.copy()
+        self.N, self.D = Y.shape
+        self.transform_data()
+        GP_regression.__init__(self, X, self.Y, **kwargs)
+
+    def set_param(self, x):
+        self.warping_params = x[:self.warping_function.num_parameters].reshape(self.warp_params_shape).copy()
+        self.transform_data()
+        GP_regression.set_param(self, x[self.warping_function.num_parameters:].copy())
+
+    def get_param(self):
+        return np.hstack((self.warping_params.flatten().copy(), GP_regression.get_param(self).copy()))
+
+    def get_param_names(self):
+        warping_names = self.warping_function.get_param_names()
+        param_names = GP_regression.get_param_names(self)
+        return warping_names + param_names
+
+    def transform_data(self):
+        self.Y = self.warping_function.f(self.Z.copy(), self.warping_params).copy()
+
+        # this supports the 'smart' behaviour in GP_regression
+        if self.D > self.N:
+            self.Youter = np.dot(self.Y, self.Y.T)
+        else:
+            self.Youter = None
+
+        return self.Y
+
+    def log_likelihood(self):
+        ll = GP_regression.log_likelihood(self)
+        jacobian = self.warping_function.fgrad_y(self.Z, self.warping_params)
+        return ll + np.log(jacobian).sum()
+
+    def log_likelihood_gradients(self):
+        ll_grads = GP_regression.log_likelihood_gradients(self)
+        alpha = np.dot(self.Ki, self.Y.flatten())
+        warping_grads = self.warping_function_gradients(alpha)
+        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
+
+    def warping_function_gradients(self, Kiy):
+        grad_y = self.warping_function.fgrad_y(self.Z, self.warping_params)
+        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Z, self.warping_params,
+                                                                 return_covar_chain=True)
+
+        djac_dpsi = ((1.0/grad_y[:,:, None, None])*grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:,None,None,None] * grad_psi).sum(axis=0).sum(axis=0)
+
+        return -dquad_dpsi + djac_dpsi
+
+    def plot_warping(self):
+        self.warping_function.plot(self.warping_params, self.Z.min(), self.Z.max())
+
+    def predict(self, X, in_unwarped_space=False, **kwargs):
+        mu, var = GP_regression.predict(self, X, **kwargs)
+
+        # The plot() function calls set_param() before calling predict().
+        # This was causing the observations to be plotted in the transformed
+        # space (where Y lives), making the plot look very wrong if the
+        # predictions are made in the untransformed space (where Z lives).
+        # To fix this I included the option below. It's just a quick fix
+        # until I figure out something smarter.
+        if in_unwarped_space:
+            mu = self.warping_function.f_inv(mu, self.warping_params)
+            var = self.warping_function.f_inv(var, self.warping_params)
+
+        return mu, var
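
A minimal usage sketch for the models added above. It is illustrative only and
assumes the 2012-era API exactly as defined in this patch (the GPy.models
namespace from __init__.py, the default rbf+bias+white kernel, and the
optimize() method inherited from GPy.core.model); the toy data are made up:

    import numpy as np
    import GPy

    X = np.linspace(0, 10, 50)[:, None]           # 50 one-dimensional inputs
    Y = np.sin(X) + np.random.randn(50, 1)*0.1    # noisy observations

    m = GPy.models.GP_regression(X, Y)            # full GP, default kernel
    m.optimize()                                  # fit the hyperparameters
    mu, var = m.predict(np.linspace(0, 10, 200)[:, None])

    # sparse variant: 10 inducing inputs drawn at random from X
    sm = GPy.models.sparse_GP_regression(X, Y, M=10)
    sm.optimize()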