diff --git a/GPy/core/gp.py b/GPy/core/gp.py new file mode 100644 index 00000000..19bd84ed --- /dev/null +++ b/GPy/core/gp.py @@ -0,0 +1,150 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from scipy import linalg +import pylab as pb +from .. import kern +from ..util.linalg import pdinv, mdot, tdot +#from ..util.plot import gpplot, Tango +from ..likelihoods import EP +from gp_base import GPBase + +class GP(GPBase): + """ + Gaussian Process model for regression and EP + + :param X: input observations + :param kernel: a GPy kernel, defaults to rbf+white + :parm likelihood: a GPy likelihood + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :rtype: model object + :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 + :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] + :type powerep: list + + .. Note:: Multiple independent outputs are allowed using columns of Y + + """ + def __init__(self, X, likelihood, kernel, normalize_X=False): + GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) + self._set_params(self._get_params()) + + def _set_params(self, p): + self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()]) + self.likelihood._set_params(p[self.kern.Nparam_transformed():]) + + self.K = self.kern.K(self.X) + self.K += self.likelihood.covariance_matrix + + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + + # the gradient of the likelihood wrt the covariance matrix + if self.likelihood.YYT is None: + #alpha = np.dot(self.Ki, self.likelihood.Y) + alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1) + + self.dL_dK = 0.5 * (tdot(alpha) - self.input_dim * self.Ki) + else: + #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki) + tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1) + tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1) + self.dL_dK = 0.5 * (tmp - self.input_dim * self.Ki) + + def _get_params(self): + return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params())) + + + def _get_param_names(self): + return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + + def update_likelihood_approximation(self): + """ + Approximates a non-gaussian likelihood using Expectation Propagation + + For a Gaussian likelihood, no iteration is required: + this function does nothing + """ + self.likelihood.fit_full(self.kern.K(self.X)) + self._set_params(self._get_params()) # update the GP + + def _model_fit_term(self): + """ + Computes the model fit using YYT if it's available + """ + if self.likelihood.YYT is None: + tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1) + return -0.5 * np.sum(np.square(tmp)) + #return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y))) + else: + return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT)) + + def log_likelihood(self): + """ + The log marginal likelihood of the GP. + + For an EP model, can be written as the log likelihood of a regression + model for a new variable Y* = v_tilde/tau_tilde, with a covariance + matrix K* = K + diag(1./tau_tilde) plus a normalization term. 
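+
+        As implemented below (a description of the code, not an extra claim), this is
+        -0.5 * input_dim * log|K| - 0.5 * tr(K^{-1} Y Y^T) + likelihood.Z,
+        where the trace term is computed by ``_model_fit_term`` and
+        ``likelihood.Z`` carries the normalization constant of the
+        (EP) likelihood approximation.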
+ """ + return -0.5 * self.input_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z + + + def _log_likelihood_gradients(self): + """ + The gradient of all parameters. + + Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta + """ + return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) + + def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False): + """ + Internal helper function for making predictions, does not account + for normalization or likelihood + """ + Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T + #KiKx = np.dot(self.Ki, Kx) + KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1) + mu = np.dot(KiKx.T, self.likelihood.Y) + if full_cov: + Kxx = self.kern.K(_Xnew, which_parts=which_parts) + var = Kxx - np.dot(KiKx.T, Kx) + else: + Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts) + var = Kxx - np.sum(np.multiply(KiKx, Kx), 0) + var = var[:, None] + if stop: + debug_this # @UndefinedVariable + return mu, var + + def predict(self, Xnew, which_parts='all', full_cov=False): + """ + Predict the function(s) at the new point(s) Xnew. + Arguments + --------- + :param Xnew: The points at which to make a prediction + :type Xnew: np.ndarray, Nnew x self.input_dim + :param which_parts: specifies which outputs kernel(s) to use in prediction + :type which_parts: ('all', list of bools) + :param full_cov: whether to return the folll covariance matrix, or just the diagonal + :type full_cov: bool + :rtype: posterior mean, a Numpy array, Nnew x self.input_dim + :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise + :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim + + + If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. + This is to allow for different normalizations of the output dimensions. + + """ + # normalize X values + Xnew = (Xnew.copy() - self._Xmean) / self._Xstd + mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts) + + # now push through likelihood + mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov) + + return mean, var, _025pm, _975pm diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py new file mode 100644 index 00000000..26870927 --- /dev/null +++ b/GPy/core/sparse_gp.py @@ -0,0 +1,304 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv +from scipy import linalg +from ..likelihoods import Gaussian +from gp_base import GPBase + +class SparseGP(GPBase): + """ + Variational sparse GP model + + :param X: inputs + :type X: np.ndarray (N x input_dim) + :param likelihood: a likelihood instance, containing the observed data + :type likelihood: GPy.likelihood.(Gaussian | EP | Laplace) + :param kernel : the kernel (covariance function). See link kernels + :type kernel: a GPy.kern.kern instance + :param X_variance: The uncertainty in the measurements of X (Gaussian variance) + :type X_variance: np.ndarray (N x input_dim) | None + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (num_inducing x input_dim) | None + :param num_inducing : Number of inducing points (optional, default 10. 
Ignored if Z is not None) + :type num_inducing: int + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False): + GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) + + self.Z = Z + self.num_inducing = Z.shape[0] + self.likelihood = likelihood + + if X_variance is None: + self.has_uncertain_inputs = False + else: + assert X_variance.shape == X.shape + self.has_uncertain_inputs = True + self.X_variance = X_variance + + if normalize_X: + self.Z = (self.Z.copy() - self._Xmean) / self._Xstd + + # normalize X uncertainty also + if self.has_uncertain_inputs: + self.X_variance /= np.square(self._Xstd) + + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z, self.X, self.X_variance) + self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance).T + self.psi2 = self.kern.psi2(self.Z, self.X, self.X_variance) + else: + self.psi0 = self.kern.Kdiag(self.X) + self.psi1 = self.kern.K(self.Z, self.X) + self.psi2 = None + + def _computations(self): + + # factor Kmm + self.Lm = jitchol(self.Kmm) + + # The rather complex computations of self.A + if self.has_uncertain_inputs: + if self.likelihood.is_heteroscedastic: + psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.N, 1, 1))).sum(0) + else: + psi2_beta = self.psi2.sum(0) * self.likelihood.precision + evals, evecs = linalg.eigh(psi2_beta) + clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable + tmp = evecs * np.sqrt(clipped_evals) + else: + if self.likelihood.is_heteroscedastic: + tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.N))) + else: + tmp = self.psi1 * (np.sqrt(self.likelihood.precision)) + tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1) + self.A = tdot(tmp) + + + # factor B + self.B = np.eye(self.num_inducing) + self.A + self.LB = jitchol(self.B) + + # TODO: make a switch for either first compute psi1V, or VV.T + self.psi1V = np.dot(self.psi1, self.likelihood.V) + + # back substutue C into psi1V + tmp, info1 = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0) + self._LBi_Lmi_psi1V, _ = linalg.lapack.flapack.dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0) + tmp, info2 = linalg.lapack.flapack.dpotrs(self.LB, tmp, lower=1) + self.Cpsi1V, info3 = linalg.lapack.flapack.dtrtrs(self.Lm, tmp, lower=1, trans=1) + + # Compute dL_dKmm + tmp = tdot(self._LBi_Lmi_psi1V) + self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.input_dim * np.eye(self.num_inducing) + tmp) + tmp = -0.5 * self.DBi_plus_BiPBi + tmp += -0.5 * self.B * self.input_dim + tmp += self.input_dim * np.eye(self.num_inducing) + self.dL_dKmm = backsub_both_sides(self.Lm, tmp) + + # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case + self.dL_dpsi0 = -0.5 * self.input_dim * (self.likelihood.precision * np.ones([self.N, 1])).flatten() + self.dL_dpsi1 = np.dot(self.Cpsi1V, self.likelihood.V.T) + dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.input_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi) + + if self.likelihood.is_heteroscedastic: + if self.has_uncertain_inputs: + self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :] + 
else: + self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, self.psi1 * self.likelihood.precision.reshape(1, self.N)) + self.dL_dpsi2 = None + else: + dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta + if self.has_uncertain_inputs: + # repeat for each of the N psi_2 matrices + self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.N, axis=0) + else: + # subsume back into psi1 (==Kmn) + self.dL_dpsi1 += 2.*np.dot(dL_dpsi2, self.psi1) + self.dL_dpsi2 = None + + + # the partial derivative vector for the likelihood + if self.likelihood.Nparams == 0: + # save computation here. + self.partial_for_likelihood = None + elif self.likelihood.is_heteroscedastic: + raise NotImplementedError, "heteroscedatic derivates not implemented" + else: + # likelihood is not heterscedatic + self.partial_for_likelihood = -0.5 * self.N * self.input_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2 + self.partial_for_likelihood += 0.5 * self.input_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision) + self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - np.sum(np.square(self._LBi_Lmi_psi1V))) + + def log_likelihood(self): + """ Compute the (lower bound on the) log marginal likelihood """ + if self.likelihood.is_heteroscedastic: + A = -0.5 * self.N * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y) + B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A)) + else: + A = -0.5 * self.N * self.input_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT + B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A)) + C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2)) + D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V)) + return A + B + C + D + self.likelihood.Z + + def _set_params(self, p): + self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim) + self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.Nparam]) + self.likelihood._set_params(p[self.Z.size + self.kern.Nparam:]) + self._compute_kernel_matrices() + self._computations() + + def _get_params(self): + return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()]) + + def _get_param_names(self): + return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[])\ + + self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + + def update_likelihood_approximation(self): + """ + Approximates a non-gaussian likelihood using Expectation Propagation + + For a Gaussian likelihood, no iteration is required: + this function does nothing + """ + if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood + self.likelihood.restart() # TODO check consistency with pseudo_EP + if self.has_uncertain_inputs: + Lmi = chol_inv(self.Lm) + Kmmi = tdot(Lmi.T) + diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)]) + + self.likelihood.fit_FITC(self.Kmm, self.psi1, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perfomr a FITC-EP.#TODO solve potential confusion + # raise NotImplementedError, "EP approximation not implemented for uncertain inputs" + 
else: + self.likelihood.fit_DTC(self.Kmm, self.psi1) + # self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) + self._set_params(self._get_params()) # update the GP + + def _log_likelihood_gradients(self): + return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood))) + + def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X, self.X_variance) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T, self.Z, self.X, self.X_variance) + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X, self.X_variance) + else: + dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.Z, self.X) + dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) + + return dL_dtheta + + def dL_dZ(self): + """ + The derivative of the bound wrt the inducing inputs Z + """ + dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm, self.Z) # factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ + if self.has_uncertain_inputs: + dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance) + dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance) + else: + dL_dZ += self.kern.dK_dX(self.dL_dpsi1, self.Z, self.X) + return dL_dZ + + def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): + """Internal helper function for making predictions, does not account for normalization""" + + Bi, _ = linalg.lapack.flapack.dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work! + symmetrify(Bi) + Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi) + + if X_variance_new is None: + Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) + mu = np.dot(Kx.T, self.Cpsi1V) + if full_cov: + Kxx = self.kern.K(Xnew, which_parts=which_parts) + var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting + else: + Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) + var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0) + else: + # assert which_parts=='all', "swithching out parts of variational kernels is not implemented" + Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts + mu = np.dot(Kx, self.Cpsi1V) + if full_cov: + raise NotImplementedError, "TODO" + else: + Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new) + psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new) + var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) + + return mu, var[:, None] + + def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): + """ + Predict the function(s) at the new point(s) Xnew. 
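+
+        Xnew is first normalized with the training-data mean and standard
+        deviation, the variational posterior over the inducing outputs is
+        evaluated at the normalized inputs, and the result is then pushed
+        through the likelihood to obtain predictive means, variances and
+        95% intervals in the original scale.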
+ + Arguments + --------- + :param Xnew: The points at which to make a prediction + :type Xnew: np.ndarray, Nnew x self.input_dim + :param X_variance_new: The uncertainty in the prediction points + :type X_variance_new: np.ndarray, Nnew x self.input_dim + :param which_parts: specifies which outputs kernel(s) to use in prediction + :type which_parts: ('all', list of bools) + :param full_cov: whether to return the folll covariance matrix, or just the diagonal + :type full_cov: bool + :rtype: posterior mean, a Numpy array, Nnew x self.input_dim + :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise + :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim + + + If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. + This is to allow for different normalizations of the output dimensions. + + """ + # normalize X values + Xnew = (Xnew.copy() - self._Xmean) / self._Xstd + if X_variance_new is not None: + X_variance_new = X_variance_new / self._Xstd ** 2 + + # here's the actual prediction by the GP model + mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts) + + # now push through likelihood + mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov) + + return mean, var, _025pm, _975pm + + def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None): + if ax is None: + fig = pb.figure(num=fignum) + ax = fig.add_subplot(111) + + if which_data is 'all': + which_data = slice(None) + + GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax) + if self.X.shape[1] == 1: + if self.has_uncertain_inputs: + Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now + ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0], + xerr=2 * np.sqrt(self.X_variance[which_data, 0]), + ecolor='k', fmt=None, elinewidth=.5, alpha=.5) + Zu = self.Z * self._Xstd + self._Xmean + ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12) + + elif self.X.shape[1] == 2: + Zu = self.Z * self._Xstd + self._Xmean + ax.plot(Zu[:, 0], Zu[:, 1], 'wo') diff --git a/GPy/inference/scg.py b/GPy/inference/scg.py new file mode 100644 index 00000000..5753be7f --- /dev/null +++ b/GPy/inference/scg.py @@ -0,0 +1,169 @@ +# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012) + +# Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT +# HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT +# NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +import numpy as np +import sys + + +def print_out(len_maxiters, display, fnow, current_grad, beta, iteration): + if display: + print '\r', + print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', + sys.stdout.flush() + +def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None): + """ + Optimisation through Scaled Conjugate Gradients (SCG) + + f: the objective function + gradf : the gradient function (should return a 1D np.ndarray) + x : the initial condition + + Returns + x the optimal value for x + flog : a list of all the objective values + function_eval number of fn evaluations + status: string describing convergence status + """ + if xtol is None: + xtol = 1e-6 + if ftol is None: + ftol = 1e-6 + if gtol is None: + gtol = 1e-5 + sigma0 = 1.0e-8 + fold = f(x, *optargs) # Initial function value. + function_eval = 1 + fnow = fold + gradnew = gradf(x, *optargs) # Initial gradient. + current_grad = np.dot(gradnew, gradnew) + gradold = gradnew.copy() + d = -gradnew # Initial search direction. + success = True # Force calculation of directional derivs. + nsuccess = 0 # nsuccess counts number of successes. + beta = 1.0 # Initial scale parameter. + betamin = 1.0e-60 # Lower bound on scale. + betamax = 1.0e100 # Upper bound on scale. + status = "Not converged" + + flog = [fold] + + iteration = 0 + + len_maxiters = len(str(maxiters)) + if display: + print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters) + + # Main optimization loop. + while iteration < maxiters: + + # Calculate first and second directional derivatives. + if success: + mu = np.dot(d, gradnew) + if mu >= 0: + d = -gradnew + mu = np.dot(d, gradnew) + kappa = np.dot(d, d) + sigma = sigma0 / np.sqrt(kappa) + xplus = x + sigma * d + gplus = gradf(xplus, *optargs) + theta = np.dot(d, (gplus - gradnew)) / sigma + + # Increase effective curvature and evaluate step size alpha. + delta = theta + beta * kappa + if delta <= 0: + delta = beta * kappa + beta = beta - theta / kappa + + alpha = -mu / delta + + # Calculate the comparison ratio. 
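+        # Delta compares the actual change in the objective with the change
+        # predicted by the local quadratic model (alpha * mu / 2, since
+        # alpha = -mu / delta). Delta >= 0 means the step decreased the
+        # objective and is accepted; beta is increased (more damping) when
+        # Delta < 0.25 and decreased when Delta > 0.75.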
+ xnew = x + alpha * d + fnew = f(xnew, *optargs) + function_eval += 1 + + if function_eval >= max_f_eval: + status = "Maximum number of function evaluations exceeded" + break +# return x, flog, function_eval, status + + Delta = 2.*(fnew - fold) / (alpha * mu) + if Delta >= 0.: + success = True + nsuccess += 1 + x = xnew + fnow = fnew + else: + success = False + fnow = fold + + # Store relevant variables + flog.append(fnow) # Current function value + + iteration += 1 + print_out(len_maxiters, display, fnow, current_grad, beta, iteration) + + if success: + # Test for termination + if (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol): + status = 'converged' + break +# return x, flog, function_eval, status + + else: + # Update variables for new position + gradnew = gradf(x, *optargs) + current_grad = np.dot(gradnew, gradnew) + gradold = gradnew + fold = fnew + # If the gradient is zero then we are done. + if current_grad <= gtol: + status = 'converged' + break + # return x, flog, function_eval, status + + # Adjust beta according to comparison ratio. + if Delta < 0.25: + beta = min(4.0 * beta, betamax) + if Delta > 0.75: + beta = max(0.5 * beta, betamin) + + # Update search direction using Polak-Ribiere formula, or re-start + # in direction of negative gradient after nparams steps. + if nsuccess == x.size: + d = -gradnew +# beta = 1. # TODO: betareset!! + nsuccess = 0 + elif success: + Gamma = np.dot(gradold - gradnew, gradnew) / (mu) + d = Gamma * d - gradnew + else: + # If we get here, then we haven't terminated in the given number of + # iterations. + status = "maxiter exceeded" + + if display: + print_out(len_maxiters, display, fnow, current_grad, beta, iteration) + print "" + return x, flog, function_eval, status diff --git a/GPy/inference/sgd.py b/GPy/inference/sgd.py new file mode 100644 index 00000000..2df09ec8 --- /dev/null +++ b/GPy/inference/sgd.py @@ -0,0 +1,356 @@ +import numpy as np +import scipy as sp +import scipy.sparse +from optimization import Optimizer +from scipy import linalg, optimize +import pylab as plt +import copy, sys, pickle + +class opt_SGD(Optimizer): + """ + Optimize using stochastic gradient descent. 
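+
+    Each update follows the classical momentum rule
+    ``step <- momentum * step + learning_rate * grad``, ``x <- x - step``,
+    where the gradient is evaluated on a mini-batch of ``batch_size``
+    output dimensions (columns of Y) at a time.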
+ + *** Parameters *** + model: reference to the model object + iterations: number of iterations + learning_rate: learning rate + momentum: momentum + + """ + + def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs): + self.opt_name = "Stochastic Gradient Descent" + + self.model = model + self.iterations = iterations + self.momentum = momentum + self.learning_rate = learning_rate + self.x_opt = None + self.f_opt = None + self.messages = messages + self.batch_size = batch_size + self.self_paced = self_paced + self.center = center + self.param_traces = [('noise',[])] + self.iteration_file = iteration_file + self.learning_rate_adaptation = learning_rate_adaptation + self.actual_iter = actual_iter + if self.learning_rate_adaptation != None: + if self.learning_rate_adaptation == 'annealing': + self.learning_rate_0 = self.learning_rate + else: + self.learning_rate_0 = self.learning_rate.mean() + + self.schedule = schedule + # if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1: + # self.param_traces.append(('bias',[])) + # if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1: + # self.param_traces.append(('linear',[])) + # if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1: + # self.param_traces.append(('rbf_var',[])) + + self.param_traces = dict(self.param_traces) + self.fopt_trace = [] + + num_params = len(self.model._get_params()) + if isinstance(self.learning_rate, float): + self.learning_rate = np.ones((num_params,)) * self.learning_rate + + assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter" + + def __str__(self): + status = "\nOptimizer: \t\t\t %s\n" % self.opt_name + status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt + status += "Number of iterations: \t\t %d\n" % self.iterations + status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min()) + status += "Momentum: \t\t\t %.3f\n" % self.momentum + status += "Batch size: \t\t\t %d\n" % self.batch_size + status += "Time elapsed: \t\t\t %s\n" % self.time + return status + + def plot_traces(self): + plt.figure() + plt.subplot(211) + plt.title('Parameters') + for k in self.param_traces.keys(): + plt.plot(self.param_traces[k], label=k) + plt.legend(loc=0) + plt.subplot(212) + plt.title('Objective function') + plt.plot(self.fopt_trace) + + + def non_null_samples(self, data): + return (np.isnan(data).sum(axis=1) == 0) + + def check_for_missing(self, data): + if sp.sparse.issparse(self.model.likelihood.Y): + return True + else: + return np.isnan(data).sum() > 0 + + def subset_parameter_vector(self, x, samples, param_shapes): + subset = np.array([], dtype = int) + x = np.arange(0, len(x)) + i = 0 + + for s in param_shapes: + N, input_dim = s + X = x[i:i+N*input_dim].reshape(N, input_dim) + X = X[samples] + subset = np.append(subset, X.flatten()) + i += N*input_dim + + subset = np.append(subset, x[i:]) + + return subset + + def shift_constraints(self, j): + + constrained_indices = copy.deepcopy(self.model.constrained_indices) + + for c, constraint in enumerate(constrained_indices): + mask = (np.ones_like(constrained_indices[c]) == 1) + for i in range(len(constrained_indices[c])): + pos = np.where(j == constrained_indices[c][i])[0] + if len(pos) == 1: + self.model.constrained_indices[c][i] = pos + else: + 
mask[i] = False + + self.model.constrained_indices[c] = self.model.constrained_indices[c][mask] + return constrained_indices + # back them up + # bounded_i = copy.deepcopy(self.model.constrained_bounded_indices) + # bounded_l = copy.deepcopy(self.model.constrained_bounded_lowers) + # bounded_u = copy.deepcopy(self.model.constrained_bounded_uppers) + + # for b in range(len(bounded_i)): # for each group of constraints + # for bc in range(len(bounded_i[b])): + # pos = np.where(j == bounded_i[b][bc])[0] + # if len(pos) == 1: + # pos2 = np.where(self.model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0] + # self.model.constrained_bounded_indices[b][pos2] = pos[0] + # else: + # if len(self.model.constrained_bounded_indices[b]) == 1: + # # if it's the last index to be removed + # # the logic here is just a mess. If we remove the last one, then all the + # # b-indices change and we have to iterate through everything to find our + # # current index. Can't deal with this right now. + # raise NotImplementedError + + # else: # just remove it from the indices + # mask = self.model.constrained_bounded_indices[b] != bc + # self.model.constrained_bounded_indices[b] = self.model.constrained_bounded_indices[b][mask] + + + # # here we shif the positive constraints. We cycle through each positive + # # constraint + # positive = self.model.constrained_positive_indices.copy() + # mask = (np.ones_like(positive) == 1) + # for p in range(len(positive)): + # # we now check whether the constrained index appears in the j vector + # # (the vector of the "active" indices) + # pos = np.where(j == self.model.constrained_positive_indices[p])[0] + # if len(pos) == 1: + # self.model.constrained_positive_indices[p] = pos + # else: + # mask[p] = False + # self.model.constrained_positive_indices = self.model.constrained_positive_indices[mask] + + # return (bounded_i, bounded_l, bounded_u), positive + + def restore_constraints(self, c):#b, p): + # self.model.constrained_bounded_indices = b[0] + # self.model.constrained_bounded_lowers = b[1] + # self.model.constrained_bounded_uppers = b[2] + # self.model.constrained_positive_indices = p + self.model.constrained_indices = c + + def get_param_shapes(self, N = None, input_dim = None): + model_name = self.model.__class__.__name__ + if model_name == 'GPLVM': + return [(N, input_dim)] + if model_name == 'Bayesian_GPLVM': + return [(N, input_dim), (N, input_dim)] + else: + raise NotImplementedError + + def step_with_missing_data(self, f_fp, X, step, shapes): + N, input_dim = X.shape + + if not sp.sparse.issparse(self.model.likelihood.Y): + Y = self.model.likelihood.Y + samples = self.non_null_samples(self.model.likelihood.Y) + self.model.N = samples.sum() + Y = Y[samples] + else: + samples = self.model.likelihood.Y.nonzero()[0] + self.model.N = len(samples) + Y = np.asarray(self.model.likelihood.Y[samples].todense(), dtype = np.float64) + + if self.model.N == 0 or Y.std() == 0.0: + return 0, step, self.model.N + + self.model.likelihood._offset = Y.mean() + self.model.likelihood._scale = Y.std() + self.model.likelihood.set_data(Y) + # self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision + + sigma = self.model.likelihood._variance + self.model.likelihood._variance = None # invalidate cache + self.model.likelihood._set_params(sigma) + + + j = self.subset_parameter_vector(self.x_opt, samples, shapes) + self.model.X = X[samples] + + model_name = self.model.__class__.__name__ + + if model_name == 'Bayesian_GPLVM': + self.model.likelihood.YYT = 
np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T) + self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT) + + ci = self.shift_constraints(j) + f, fp = f_fp(self.x_opt[j]) + + step[j] = self.momentum * step[j] + self.learning_rate[j] * fp + self.x_opt[j] -= step[j] + self.restore_constraints(ci) + + self.model.grads[j] = fp + # restore likelihood _offset and _scale, otherwise when we call set_data(y) on + # the next feature, it will get normalized with the mean and std of this one. + self.model.likelihood._offset = 0 + self.model.likelihood._scale = 1 + + return f, step, self.model.N + + def adapt_learning_rate(self, t, D): + if self.learning_rate_adaptation == 'adagrad': + if t > 0: + g_k = self.model.grads + self.s_k += np.square(g_k) + t0 = 100.0 + self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k)) + + import pdb; pdb.set_trace() + else: + self.learning_rate = np.zeros_like(self.learning_rate) + self.s_k = np.zeros_like(self.x_opt) + + elif self.learning_rate_adaptation == 'annealing': + #self.learning_rate = self.learning_rate_0/(1+float(t+1)/10) + self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t] + + + elif self.learning_rate_adaptation == 'semi_pesky': + if self.model.__class__.__name__ == 'Bayesian_GPLVM': + g_t = self.model.grads + if t == 0: + self.hbar_t = 0.0 + self.tau_t = 100.0 + self.gbar_t = 0.0 + + self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t + self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t) + self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t) + tau_t = self.tau_t*(1-self.learning_rate) + 1 + + + def opt(self, f_fp=None, f=None, fp=None): + self.x_opt = self.model._get_params_transformed() + self.grads = [] + + X, Y = self.model.X.copy(), self.model.likelihood.Y.copy() + + self.model.likelihood.YYT = 0 + self.model.likelihood.trYYT = 0 + self.model.likelihood._offset = 0.0 + self.model.likelihood._scale = 1.0 + + N, input_dim = self.model.X.shape + D = self.model.likelihood.Y.shape[1] + num_params = self.model._get_params() + self.trace = [] + missing_data = self.check_for_missing(self.model.likelihood.Y) + + step = np.zeros_like(num_params) + for it in range(self.iterations): + if self.actual_iter != None: + it = self.actual_iter + + self.model.grads = np.zeros_like(self.x_opt) # TODO this is ugly + + if it == 0 or self.self_paced is False: + features = np.random.permutation(Y.shape[1]) + else: + features = np.argsort(NLL) + + b = len(features)/self.batch_size + features = [features[i::b] for i in range(b)] + NLL = [] + import pylab as plt + for count, j in enumerate(features): + self.model.input_dim = len(j) + self.model.likelihood.input_dim = len(j) + self.model.likelihood.set_data(Y[:, j]) + # self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision + + sigma = self.model.likelihood._variance + self.model.likelihood._variance = None # invalidate cache + self.model.likelihood._set_params(sigma) + + if missing_data: + shapes = self.get_param_shapes(N, input_dim) + f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes) + else: + self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T) + self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT) + Nj = N + f, fp = f_fp(self.x_opt) + self.model.grads = fp.copy() + step = self.momentum * step + self.learning_rate * fp + self.x_opt -= step + + if self.messages == 2: + noise = self.model.likelihood._variance + 
status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise) + sys.stdout.write(status) + sys.stdout.flush() + self.param_traces['noise'].append(noise) + + self.adapt_learning_rate(it+count, D) + NLL.append(f) + self.fopt_trace.append(NLL[-1]) + # fig = plt.figure('traces') + # plt.clf() + # plt.plot(self.param_traces['noise']) + + # for k in self.param_traces.keys(): + # self.param_traces[k].append(self.model.get(k)[0]) + self.grads.append(self.model.grads.tolist()) + # should really be a sum(), but earlier samples in the iteration will have a very crappy ll + self.f_opt = np.mean(NLL) + self.model.N = N + self.model.X = X + self.model.input_dim = D + self.model.likelihood.N = N + self.model.likelihood.input_dim = D + self.model.likelihood.Y = Y + sigma = self.model.likelihood._variance + self.model.likelihood._variance = None # invalidate cache + self.model.likelihood._set_params(sigma) + + self.trace.append(self.f_opt) + if self.iteration_file is not None: + f = open(self.iteration_file + "iteration%d.pickle" % it, 'w') + data = [self.x_opt, self.fopt_trace, self.param_traces] + pickle.dump(data, f) + f.close() + + if self.messages != 0: + sys.stdout.write('\r' + ' '*len(status)*2 + ' \r') + status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max()) + sys.stdout.write(status) + sys.stdout.flush() diff --git a/GPy/likelihoods/ep.py b/GPy/likelihoods/ep.py new file mode 100644 index 00000000..6409ed58 --- /dev/null +++ b/GPy/likelihoods/ep.py @@ -0,0 +1,336 @@ +import numpy as np +from scipy import stats, linalg +from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot +from likelihood import likelihood + +class EP(likelihood): + def __init__(self,data,LikelihoodFunction,epsilon=1e-3,power_ep=[1.,1.]): + """ + Expectation Propagation + + Arguments + --------- + epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) + LikelihoodFunction : a likelihood function (see likelihood_functions.py) + """ + self.LikelihoodFunction = LikelihoodFunction + self.epsilon = epsilon + self.eta, self.delta = power_ep + self.data = data + self.N, self.input_dim = self.data.shape + self.is_heteroscedastic = True + self.Nparams = 0 + self._transf_data = self.LikelihoodFunction._preprocess_values(data) + + #Initial values - Likelihood approximation parameters: + #p(y|f) = t(f|tau_tilde,v_tilde) + self.tau_tilde = np.zeros(self.N) + self.v_tilde = np.zeros(self.N) + + #initial values for the GP variables + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N)[:,None] + self.Z = 0 + self.YYT = None + self.V = self.precision * self.Y + + def restart(self): + self.tau_tilde = np.zeros(self.N) + self.v_tilde = np.zeros(self.N) + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N)[:,None] + self.Z = 0 + self.YYT = None + self.V = self.precision * self.Y + + def predictive_values(self,mu,var,full_cov): + if full_cov: + raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood" + return self.LikelihoodFunction.predictive_values(mu,var) + + def _get_params(self): + return np.zeros(0) + def _get_param_names(self): + return [] + def _set_params(self,p): + pass # TODO: the EP likelihood might want to take some 
parameters... + def _gradients(self,partial): + return np.zeros(0) # TODO: the EP likelihood might want to take some parameters... + + def _compute_GP_variables(self): + #Variables to be called from GP + mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model + sigma_sum = 1./self.tau_ + 1./self.tau_tilde + mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 + self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep + + self.Y = mu_tilde[:,None] + self.YYT = np.dot(self.Y,self.Y.T) + self.covariance_matrix = np.diag(1./self.tau_tilde) + self.precision = self.tau_tilde[:,None] + self.V = self.precision * self.Y + + def fit_full(self,K): + """ + The expectation-propagation algorithm. + For nomenclature see Rasmussen & Williams 2006. + """ + #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) + mu = np.zeros(self.N) + Sigma = K.copy() + + """ + Initial values - Cavity distribution parameters: + q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = self.epsilon + 1. + epsilon_np2 = self.epsilon + 1. + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) + self.tau_tilde[i] += Delta_tau + self.v_tilde[i] += Delta_v + #Posterior distribution parameters update + DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i]))) + mu = np.dot(Sigma,self.v_tilde) + self.iterations += 1 + #Sigma recomptutation with Cholesky decompositon + Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K + B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K + L = jitchol(B) + V,info = linalg.lapack.flapack.dtrtrs(L,Sroot_tilde_K,lower=1) + Sigma = K - np.dot(V.T,V) + mu = np.dot(Sigma,self.v_tilde) + epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N + self.np1.append(self.tau_tilde.copy()) + self.np2.append(self.v_tilde.copy()) + + return self._compute_GP_variables() + + def fit_DTC(self, Kmm, Kmn): + """ + The expectation-propagation algorithm with sparse pseudo-input. + For nomenclature see ... 2013. + """ + M = Kmm.shape[0] + + #TODO: this doesn't work with uncertain inputs! 
+ + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = Qnn = Knm*Kmmi*Kmn + """ + KmnKnm = np.dot(Kmn,Kmn.T) + Lm = jitchol(Kmm) + Lmi = chol_inv(Lm) + Kmmi = np.dot(Lmi.T,Lmi) + KmmiKmn = np.dot(Kmmi,Kmn) + Qnn_diag = np.sum(Kmn*KmmiKmn,-2) + LLT0 = Kmm.copy() + + #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm) + #KmnKnm = np.dot(Kmn, Kmn.T) + #KmmiKmn = np.dot(Kmmi,Kmn) + #Qnn_diag = np.sum(Kmn*KmmiKmn,-2) + #LLT0 = Kmm.copy() + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*Gamma + """ + mu = np.zeros(self.N) + LLT = Kmm.copy() + Sigma_diag = Qnn_diag.copy() + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + np1 = [self.tau_tilde.copy()] + np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) + self.tau_tilde[i] += Delta_tau + self.v_tilde[i] += Delta_v + #Posterior distribution parameters update + DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau + L = jitchol(LLT) + #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau)) + V,info = linalg.lapack.flapack.dtrtrs(L,Kmn,lower=1) + Sigma_diag = np.sum(V*V,-2) + si = np.sum(V.T*V[:,i],-1) + mu += (Delta_v-Delta_tau*mu[i])*si + self.iterations += 1 + #Sigma recomputation with Cholesky decompositon + LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T) + L = jitchol(LLT) + V,info = linalg.lapack.flapack.dtrtrs(L,Kmn,lower=1) + V2,info = linalg.lapack.flapack.dtrtrs(L.T,V,lower=0) + Sigma_diag = np.sum(V*V,-2) + Knmv_tilde = np.dot(Kmn,self.v_tilde) + mu = np.dot(V2.T,Knmv_tilde) + epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N + np1.append(self.tau_tilde.copy()) + np2.append(self.v_tilde.copy()) + + self._compute_GP_variables() + + def fit_FITC(self, Kmm, Kmn, Knn_diag): + """ + The expectation-propagation algorithm with sparse pseudo-input. + For nomenclature see Naish-Guzman and Holden, 2008. 
+ """ + M = Kmm.shape[0] + + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn + """ + Lm = jitchol(Kmm) + Lmi = chol_inv(Lm) + Kmmi = np.dot(Lmi.T,Lmi) + P0 = Kmn.T + KmnKnm = np.dot(P0.T, P0) + KmmiKmn = np.dot(Kmmi,P0.T) + Qnn_diag = np.sum(P0.T*KmmiKmn,-2) + Diag0 = Knn_diag - Qnn_diag + R0 = jitchol(Kmmi).T + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*Gamma + """ + self.w = np.zeros(self.N) + self.Gamma = np.zeros(M) + mu = np.zeros(self.N) + P = P0.copy() + R = R0.copy() + Diag = Diag0.copy() + Sigma_diag = Knn_diag + RPT0 = np.dot(R0,P0.T) + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) + self.tau_tilde[i] += Delta_tau + self.v_tilde[i] += Delta_v + #Posterior distribution parameters update + dtd1 = Delta_tau*Diag[i] + 1. + dii = Diag[i] + Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 + pi_ = P[i,:].reshape(1,M) + P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ + Rp_i = np.dot(R,pi_.T) + RTR = np.dot(R.T,np.dot(np.eye(M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) + R = jitchol(RTR).T + self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1 + self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + mu = self.w + np.dot(P,self.Gamma) + self.iterations += 1 + #Sigma recomptutation with Cholesky decompositon + Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde) + Diag = Diag0 * Iplus_Dprod_i + P = Iplus_Dprod_i[:,None] * P0 + safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. 
- Iplus_Dprod_i)/Diag0) + L = jitchol(np.eye(M) + np.dot(RPT0,safe_diag[:,None]*RPT0.T)) + R,info = linalg.lapack.flapack.dtrtrs(L,R0,lower=1) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + self.w = Diag * self.v_tilde + self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde)) + mu = self.w + np.dot(P,self.Gamma) + epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N + self.np1.append(self.tau_tilde.copy()) + self.np2.append(self.v_tilde.copy()) + + return self._compute_GP_variables() diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py new file mode 100644 index 00000000..886d8873 --- /dev/null +++ b/GPy/likelihoods/gaussian.py @@ -0,0 +1,93 @@ +import numpy as np +from likelihood import likelihood + +class Gaussian(likelihood): + """ + Likelihood class for doing Expectation propagation + + :param Y: observed output (Nx1 numpy.darray) + ..Note:: Y values allowed depend on the likelihood_function used + :param variance : + :param normalize: whether to normalize the data before computing (predictions will be in original scales) + :type normalize: False|True + """ + def __init__(self, data, variance=1., normalize=False): + self.is_heteroscedastic = False + self.Nparams = 1 + self.Z = 0. # a correction factor which accounts for the approximation made + N, self.input_dim = data.shape + + # normalization + if normalize: + self._offset = data.mean(0)[None, :] + self._scale = data.std(0)[None, :] + # Don't scale outputs which have zero variance to zero. + self._scale[np.nonzero(self._scale == 0.)] = 1.0e-3 + else: + self._offset = np.zeros((1, self.input_dim)) + self._scale = np.ones((1, self.input_dim)) + + self.set_data(data) + + self._variance = np.asarray(variance) + 1. + self._set_params(np.asarray(variance)) + + def set_data(self, data): + self.data = data + self.N, D = data.shape + assert D == self.input_dim + self.Y = (self.data - self._offset) / self._scale + if D > self.N: + self.YYT = np.dot(self.Y, self.Y.T) + self.trYYT = np.trace(self.YYT) + else: + self.YYT = None + self.trYYT = np.sum(np.square(self.Y)) + + def _get_params(self): + return np.asarray(self._variance) + + def _get_param_names(self): + return ["noise_variance"] + + def _set_params(self, x): + x = np.float64(x) + if np.all(self._variance != x): + if x == 0.: + self.precision = None + self.V = None + else: + self.precision = 1. / x + self.V = (self.precision) * self.Y + self.covariance_matrix = np.eye(self.N) * x + self._variance = x + + def predictive_values(self, mu, var, full_cov): + """ + Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval + """ + mean = mu * self._scale + self._offset + if full_cov: + if self.input_dim > 1: + raise NotImplementedError, "TODO" + # Note. for input_dim>1, we need to re-normalise all the outputs independently. + # This will mess up computations of diag(true_var), below. + # note that the upper, lower quantiles should be the same shape as mean + # Augment the output variance with the likelihood variance and rescale. 
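+            # The returned bounds are mean +/- 2 * sqrt(true_var), i.e. roughly
+            # the central 95% interval of the Gaussian predictive density
+            # (despite the _5pc / _95pc variable names below).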
+ true_var = (var + np.eye(var.shape[0]) * self._variance) * self._scale ** 2 + _5pc = mean - 2.*np.sqrt(np.diag(true_var)) + _95pc = mean + 2.*np.sqrt(np.diag(true_var)) + else: + true_var = (var + self._variance) * self._scale ** 2 + _5pc = mean - 2.*np.sqrt(true_var) + _95pc = mean + 2.*np.sqrt(true_var) + return mean, true_var, _5pc, _95pc + + def fit_full(self): + """ + No approximations needed + """ + pass + + def _gradients(self, partial): + return np.sum(partial) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py new file mode 100644 index 00000000..7008e9a3 --- /dev/null +++ b/GPy/models/bayesian_gplvm.py @@ -0,0 +1,583 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from ..core import SparseGP +from ..likelihoods import Gaussian +from .. import kern +import itertools +from matplotlib.colors import colorConverter +from GPy.inference.optimization import SCG +from GPy.util import plot_latent +from GPy.models.gplvm import GPLVM + +class BayesianGPLVM(SparseGP, GPLVM): + """ + Bayesian Gaussian Process Latent Variable Model + + :param Y: observed data (np.ndarray) or GPy.likelihood + :type Y: np.ndarray| GPy.likelihood instance + :param input_dim: latent dimensionality + :type input_dim: int + :param init: initialisation method for the latent space + :type init: 'PCA'|'random' + + """ + def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', M=10, + Z=None, kernel=None, oldpsave=10, _debug=False, + **kwargs): + if type(likelihood_or_Y) is np.ndarray: + likelihood = Gaussian(likelihood_or_Y) + else: + likelihood = likelihood_or_Y + + if X == None: + X = self.initialise_latent(init, input_dim, likelihood.Y) + self.init = init + + if X_variance is None: + X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1) + + if Z is None: + Z = np.random.permutation(X.copy())[:M] + assert Z.shape[1] == X.shape[1] + + if kernel is None: + kernel = kern.rbf(input_dim) + kern.white(input_dim) + + self.oldpsave = oldpsave + self._oldps = [] + self._debug = _debug + + if self._debug: + self.f_call = 0 + self._count = itertools.count() + self._savedklll = [] + self._savedparams = [] + self._savedgradients = [] + self._savederrors = [] + self._savedpsiKmm = [] + self._savedABCD = [] + + SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs) + self._set_params(self._get_params()) + + @property + def oldps(self): + return self._oldps + @oldps.setter + def oldps(self, p): + if len(self._oldps) == (self.oldpsave + 1): + self._oldps.pop() + # if len(self._oldps) == 0 or not np.any([np.any(np.abs(p - op) > 1e-5) for op in self._oldps]): + self._oldps.insert(0, p.copy()) + + def _get_param_names(self): + X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], []) + S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], []) + return (X_names + S_names + SparseGP._get_param_names(self)) + + def _get_params(self): + """ + Horizontally stacks the parameters in order to present them to the optimizer. 
+ The resulting 1-input_dim array has this structure: + + =============================================================== + | mu | S | Z | theta | beta | + =============================================================== + + """ + x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self))) + return x + + def _clipped(self, x): + return x # np.clip(x, -1e300, 1e300) + + def _set_params(self, x, save_old=True, save_count=0): +# try: + x = self._clipped(x) + N, input_dim = self.N, self.input_dim + self.X = x[:self.X.size].reshape(N, input_dim).copy() + self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy() + SparseGP._set_params(self, x[(2 * N * input_dim):]) +# self.oldps = x +# except (LinAlgError, FloatingPointError, ZeroDivisionError): +# print "\rWARNING: Caught LinAlgError, continueing without setting " +# if self._debug: +# self._savederrors.append(self.f_call) +# if save_count > 10: +# raise +# self._set_params(self.oldps[-1], save_old=False, save_count=save_count + 1) + + def dKL_dmuS(self): + dKL_dS = (1. - (1. / (self.X_variance))) * 0.5 + dKL_dmu = self.X + return dKL_dmu, dKL_dS + + def dL_dmuS(self): + dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance) + dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance) + dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance) + dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2 + dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2 + + return dL_dmu, dL_dS + + def KL_divergence(self): + var_mean = np.square(self.X).sum() + var_S = np.sum(self.X_variance - np.log(self.X_variance)) + return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.N + + def log_likelihood(self): + ll = SparseGP.log_likelihood(self) + kl = self.KL_divergence() + +# if ll < -2E4: +# ll = -2E4 + np.random.randn() +# if kl > 5E4: +# kl = 5E4 + np.random.randn() + + if self._debug: + self.f_call = self._count.next() + if self.f_call % 1 == 0: + self._savedklll.append([self.f_call, ll, kl]) + self._savedparams.append([self.f_call, self._get_params()]) + self._savedgradients.append([self.f_call, self._log_likelihood_gradients()]) + self._savedpsiKmm.append([self.f_call, [self.Kmm, self.dL_dKmm]]) +# sf2 = self.scale_factor ** 2 + if self.likelihood.is_heteroscedastic: + A = -0.5 * self.N * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.V * self.likelihood.Y) +# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A) * sf2) + B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A)) + else: + A = -0.5 * self.N * self.input_dim * (np.log(2.*np.pi) + np.log(self.likelihood._variance)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT +# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A) * sf2) + B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A)) + C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2)) + D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V)) + self._savedABCD.append([self.f_call, A, B, C, D]) + + # print "\nkl:", kl, "ll:", ll + return ll - kl + + def _log_likelihood_gradients(self): + dKL_dmu, dKL_dS = self.dKL_dmuS() + dL_dmu, dL_dS = self.dL_dmuS() + # TODO: find way to make faster + + d_dmu = (dL_dmu - 
dKL_dmu).flatten() + d_dS = (dL_dS - dKL_dS).flatten() + # TEST KL: ==================== + # d_dmu = (dKL_dmu).flatten() + # d_dS = (dKL_dS).flatten() + # ======================== + # TEST L: ==================== +# d_dmu = (dL_dmu).flatten() +# d_dS = (dL_dS).flatten() + # ======================== + self.dbound_dmuS = np.hstack((d_dmu, d_dS)) + self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self) + return self._clipped(np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta))) + + def plot_latent(self, *args, **kwargs): + return plot_latent.plot_latent_indices(self, *args, **kwargs) + + def do_test_latents(self, Y): + """ + Compute the latent representation for a set of new points Y + + Notes: + This will only work with a univariate Gaussian likelihood (for now) + """ + assert not self.likelihood.is_heteroscedastic + N_test = Y.shape[0] + input_dim = self.Z.shape[1] + means = np.zeros((N_test, input_dim)) + covars = np.zeros((N_test, input_dim)) + + dpsi0 = -0.5 * self.input_dim * self.likelihood.precision + dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods + V = self.likelihood.precision * Y + dpsi1 = np.dot(self.Cpsi1V, V.T) + + start = np.zeros(self.input_dim * 2) + + for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]): + args = (self.kern, self.Z, dpsi0, dpsi1_n, dpsi2) + xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False) + + mu, log_S = xopt.reshape(2, 1, -1) + means[n] = mu[0].copy() + covars[n] = np.exp(log_S[0]).copy() + + return means, covars + + + def plot_X_1d(self, fignum=None, ax=None, colors=None): + """ + Plot latent space X in 1D: + + -if fig is given, create input_dim subplots in fig and plot in these + -if ax is given plot input_dim 1D latent space plots of X into each `axis` + -if neither fig nor ax is given create a figure with fignum and plot in there + + colors: + colors of different latent space dimensions input_dim + """ + import pylab + if ax is None: + fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1])))) + if colors is None: + colors = pylab.gca()._get_lines.color_cycle + pylab.clf() + else: + colors = iter(colors) + plots = [] + x = np.arange(self.X.shape[0]) + for i in range(self.X.shape[1]): + if ax is None: + a = fig.add_subplot(self.X.shape[1], 1, i + 1) + elif isinstance(ax, (tuple, list)): + a = ax[i] + else: + raise ValueError("Need one ax per latent dimnesion input_dim") + a.plot(self.X, c='k', alpha=.3) + plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i))) + a.fill_between(x, + self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]), + self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]), + facecolor=plots[-1].get_color(), + alpha=.3) + a.legend(borderaxespad=0.) 
+ a.set_xlim(x.min(), x.max()) + if i < self.X.shape[1] - 1: + a.set_xticklabels('') + pylab.draw() + fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95)) + return fig + + def __getstate__(self): + return (self.likelihood, self.input_dim, self.X, self.X_variance, + self.init, self.num_inducing, self.Z, self.kern, + self.oldpsave, self._debug) + + def __setstate__(self, state): + self.__init__(*state) + + def _debug_filter_params(self, x): + start, end = 0, self.X.size, + X = x[start:end].reshape(self.N, self.input_dim) + start, end = end, end + self.X_variance.size + X_v = x[start:end].reshape(self.N, self.input_dim) + start, end = end, end + (self.num_inducing * self.input_dim) + Z = x[start:end].reshape(self.num_inducing, self.input_dim) + start, end = end, end + self.input_dim + theta = x[start:] + return X, X_v, Z, theta + + + def _debug_get_axis(self, figs): + if figs[-1].axes: + ax1 = figs[-1].axes[0] + ax1.cla() + else: + ax1 = figs[-1].add_subplot(111) + return ax1 + + def _debug_plot(self): + assert self._debug, "must enable _debug, to debug-plot" + import pylab +# from mpl_toolkits.mplot3d import Axes3D + figs = [pylab.figure('BGPLVM DEBUG', figsize=(12, 4))] +# fig.clf() + + # log like +# splotshape = (6, 4) +# ax1 = pylab.subplot2grid(splotshape, (0, 0), 1, 4) + ax1 = self._debug_get_axis(figs) + ax1.text(.5, .5, "Optimization", alpha=.3, transform=ax1.transAxes, + ha='center', va='center') + kllls = np.array(self._savedklll) + LL, = ax1.plot(kllls[:, 0], kllls[:, 1] - kllls[:, 2], '-', label=r'$\log p(\mathbf{Y})$', mew=1.5) + KL, = ax1.plot(kllls[:, 0], kllls[:, 2], '-', label=r'$\mathcal{KL}(p||q)$', mew=1.5) + L, = ax1.plot(kllls[:, 0], kllls[:, 1], '-', label=r'$L$', mew=1.5) # \mathds{E}_{q(\mathbf{X})}[p(\mathbf{Y|X})\frac{p(\mathbf{X})}{q(\mathbf{X})}] + + param_dict = dict(self._savedparams) + gradient_dict = dict(self._savedgradients) +# kmm_dict = dict(self._savedpsiKmm) + iters = np.array(param_dict.keys()) + ABCD_dict = np.array(self._savedABCD) + self.showing = 0 + +# ax2 = pylab.subplot2grid(splotshape, (1, 0), 2, 4) + figs.append(pylab.figure("BGPLVM DEBUG X", figsize=(12, 4))) + ax2 = self._debug_get_axis(figs) + ax2.text(.5, .5, r"$\mathbf{X}$", alpha=.5, transform=ax2.transAxes, + ha='center', va='center') + figs[-1].canvas.draw() + figs[-1].tight_layout(rect=(0, 0, 1, .86)) +# ax3 = pylab.subplot2grid(splotshape, (3, 0), 2, 4, sharex=ax2) + figs.append(pylab.figure("BGPLVM DEBUG S", figsize=(12, 4))) + ax3 = self._debug_get_axis(figs) + ax3.text(.5, .5, r"$\mathbf{S}$", alpha=.5, transform=ax3.transAxes, + ha='center', va='center') + figs[-1].canvas.draw() + figs[-1].tight_layout(rect=(0, 0, 1, .86)) +# ax4 = pylab.subplot2grid(splotshape, (5, 0), 2, 2) + figs.append(pylab.figure("BGPLVM DEBUG Z", figsize=(6, 4))) + ax4 = self._debug_get_axis(figs) + ax4.text(.5, .5, r"$\mathbf{Z}$", alpha=.5, transform=ax4.transAxes, + ha='center', va='center') + figs[-1].canvas.draw() + figs[-1].tight_layout(rect=(0, 0, 1, .86)) +# ax5 = pylab.subplot2grid(splotshape, (5, 2), 2, 2) + figs.append(pylab.figure("BGPLVM DEBUG theta", figsize=(6, 4))) + ax5 = self._debug_get_axis(figs) + ax5.text(.5, .5, r"${\theta}$", alpha=.5, transform=ax5.transAxes, + ha='center', va='center') + figs[-1].canvas.draw() + figs[-1].tight_layout(rect=(.15, 0, 1, .86)) +# figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6))) +# fig = figs[-1] +# ax6 = fig.add_subplot(121) +# ax6.text(.5, .5, r"${\mathbf{K}_{mm}}$", color='magenta', alpha=.5, transform=ax6.transAxes, +# ha='center', 
va='center') +# ax7 = fig.add_subplot(122) +# ax7.text(.5, .5, r"${\frac{dL}{dK_{mm}}}$", color='magenta', alpha=.5, transform=ax7.transAxes, +# ha='center', va='center') + figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6))) + fig = figs[-1] + ax8 = fig.add_subplot(121) + ax8.text(.5, .5, r"${\mathbf{A,B,C,input_dim}}$", color='k', alpha=.5, transform=ax8.transAxes, + ha='center', va='center') + ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 1], label='A') + ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 2], label='B') + ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 3], label='C') + ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 4], label='input_dim') + ax8.legend() + figs[-1].canvas.draw() + figs[-1].tight_layout(rect=(.15, 0, 1, .86)) + + X, S, Z, theta = self._debug_filter_params(param_dict[self.showing]) + Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing]) +# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag + + quiver_units = 'xy' + quiver_scale = 1 + quiver_scale_units = 'xy' + Xlatentplts = ax2.plot(X, ls="-", marker="x") + colors = colorConverter.to_rgba_array([p.get_color() for p in Xlatentplts], .4) + Ulatent = np.zeros_like(X) + xlatent = np.tile(np.arange(0, X.shape[0])[:, None], X.shape[1]) + Xlatentgrads = ax2.quiver(xlatent, X, Ulatent, Xg, color=colors, + units=quiver_units, scale_units=quiver_scale_units, + scale=quiver_scale) + + Slatentplts = ax3.plot(S, ls="-", marker="x") + Slatentgrads = ax3.quiver(xlatent, S, Ulatent, Sg, color=colors, + units=quiver_units, scale_units=quiver_scale_units, + scale=quiver_scale) + ax3.set_ylim(0, 1.) + + xZ = np.tile(np.arange(0, Z.shape[0])[:, None], Z.shape[1]) + UZ = np.zeros_like(Z) + Zplts = ax4.plot(Z, ls="-", marker="x") + Zgrads = ax4.quiver(xZ, Z, UZ, Zg, color=colors, + units=quiver_units, scale_units=quiver_scale_units, + scale=quiver_scale) + + xtheta = np.arange(len(theta)) + Utheta = np.zeros_like(theta) + thetaplts = ax5.bar(xtheta - .4, theta, color=colors) + thetagrads = ax5.quiver(xtheta, theta, Utheta, thetag, color=colors, + units=quiver_units, scale_units=quiver_scale_units, + scale=quiver_scale, + edgecolors=('k',), linewidths=[1]) + pylab.setp(thetaplts, zorder=0) + pylab.setp(thetagrads, zorder=10) + ax5.set_xticks(np.arange(len(theta))) + ax5.set_xticklabels(self._get_param_names()[-len(theta):], rotation=17) + +# imkmm = ax6.imshow(kmm_dict[self.showing][0]) +# from mpl_toolkits.axes_grid1 import make_axes_locatable +# divider = make_axes_locatable(ax6) +# caxkmm = divider.append_axes("right", "5%", pad="1%") +# cbarkmm = pylab.colorbar(imkmm, cax=caxkmm) +# +# imkmmdl = ax7.imshow(kmm_dict[self.showing][1]) +# divider = make_axes_locatable(ax7) +# caxkmmdl = divider.append_axes("right", "5%", pad="1%") +# cbarkmmdl = pylab.colorbar(imkmmdl, cax=caxkmmdl) + +# input_dimleg = ax1.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)], +# loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.15, 1, 1.15), +# borderaxespad=0, mode="expand") + ax2.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)], + loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1), + borderaxespad=0, mode="expand") + ax3.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)], + loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1), + borderaxespad=0, mode="expand") + ax4.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)], + loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1), + borderaxespad=0, 
mode="expand") + ax5.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)], + loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1), + borderaxespad=0, mode="expand") + Lleg = ax1.legend() + Lleg.draggable() +# ax1.add_artist(input_dimleg) + + indicatorKL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 2], 'o', c=KL.get_color()) + indicatorLL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1] - kllls[self.showing, 2], 'o', c=LL.get_color()) + indicatorL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1], 'o', c=L.get_color()) +# for err in self._savederrors: +# if err < kllls.shape[0]: +# ax1.scatter(kllls[err, 0], kllls[err, 2], s=50, marker=(5, 2), c=KL.get_color()) +# ax1.scatter(kllls[err, 0], kllls[err, 1] - kllls[err, 2], s=50, marker=(5, 2), c=LL.get_color()) +# ax1.scatter(kllls[err, 0], kllls[err, 1], s=50, marker=(5, 2), c=L.get_color()) + +# try: +# for f in figs: +# f.canvas.draw() +# f.tight_layout(box=(0, .15, 1, .9)) +# # pylab.draw() +# # pylab.tight_layout(box=(0, .1, 1, .9)) +# except: +# pass + + # parameter changes + # ax2 = pylab.subplot2grid((4, 1), (1, 0), 3, 1, projection='3d') + button_options = [0, 0] # [0]: clicked -- [1]: dragged + + def update_plots(event): + if button_options[0] and not button_options[1]: +# event.button, event.x, event.y, event.xdata, event.ydata) + tmp = np.abs(iters - event.xdata) + closest_hit = iters[tmp == tmp.min()][0] + + if closest_hit != self.showing: + self.showing = closest_hit + # print closest_hit, iters, event.xdata + + indicatorLL.set_data(self.showing, kllls[self.showing, 1] - kllls[self.showing, 2]) + indicatorKL.set_data(self.showing, kllls[self.showing, 2]) + indicatorL.set_data(self.showing, kllls[self.showing, 1]) + + X, S, Z, theta = self._debug_filter_params(param_dict[self.showing]) + Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing]) +# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag + + for i, Xlatent in enumerate(Xlatentplts): + Xlatent.set_ydata(X[:, i]) + Xlatentgrads.set_offsets(np.array([xlatent.ravel(), X.ravel()]).T) + Xlatentgrads.set_UVC(Ulatent, Xg) + + for i, Slatent in enumerate(Slatentplts): + Slatent.set_ydata(S[:, i]) + Slatentgrads.set_offsets(np.array([xlatent.ravel(), S.ravel()]).T) + Slatentgrads.set_UVC(Ulatent, Sg) + + for i, Zlatent in enumerate(Zplts): + Zlatent.set_ydata(Z[:, i]) + Zgrads.set_offsets(np.array([xZ.ravel(), Z.ravel()]).T) + Zgrads.set_UVC(UZ, Zg) + + for p, t in zip(thetaplts, theta): + p.set_height(t) + thetagrads.set_offsets(np.array([xtheta.ravel(), theta.ravel()]).T) + thetagrads.set_UVC(Utheta, thetag) + +# imkmm.set_data(kmm_dict[self.showing][0]) +# imkmm.autoscale() +# cbarkmm.update_normal(imkmm) +# +# imkmmdl.set_data(kmm_dict[self.showing][1]) +# imkmmdl.autoscale() +# cbarkmmdl.update_normal(imkmmdl) + + ax2.relim() + # ax3.relim() + ax4.relim() + ax5.relim() + ax2.autoscale() + # ax3.autoscale() + ax4.autoscale() + ax5.autoscale() + + [fig.canvas.draw() for fig in figs] + button_options[0] = 0 + button_options[1] = 0 + + def onclick(event): + if event.inaxes is ax1 and event.button == 1: + button_options[0] = 1 + def motion(event): + if button_options[0]: + button_options[1] = 1 + + cidr = figs[0].canvas.mpl_connect('button_release_event', update_plots) + cidp = figs[0].canvas.mpl_connect('button_press_event', onclick) + cidd = figs[0].canvas.mpl_connect('motion_notify_event', motion) + + return ax1, ax2, ax3, ax4, ax5 # , ax6, ax7 + + + + +def latent_cost_and_grad(mu_S, kern, Z, 
dL_dpsi0, dL_dpsi1, dL_dpsi2): + """ + objective function for fitting the latent variables for test points + (negative log-likelihood: should be minimised!) + """ + mu, log_S = mu_S.reshape(2, 1, -1) + S = np.exp(log_S) + + psi0 = kern.psi0(Z, mu, S) + psi1 = kern.psi1(Z, mu, S) + psi2 = kern.psi2(Z, mu, S) + + lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S) + + mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S) + mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S) + mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S) + + dmu = mu0 + mu1 + mu2 - mu + # dS = S0 + S1 + S2 -0.5 + .5/S + dlnS = S * (S0 + S1 + S2 - 0.5) + .5 + return -lik, -np.hstack((dmu.flatten(), dlnS.flatten())) + +def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2): + """ + objective function for fitting the latent variables (negative log-likelihood: should be minimised!) + This is the same as latent_cost_and_grad but only for the objective + """ + mu, log_S = mu_S.reshape(2, 1, -1) + S = np.exp(log_S) + + psi0 = kern.psi0(Z, mu, S) + psi1 = kern.psi1(Z, mu, S) + psi2 = kern.psi2(Z, mu, S) + + lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S) + return -float(lik) + +def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2): + """ + This is the same as latent_cost_and_grad but only for the gradient + """ + mu, log_S = mu_S.reshape(2, 1, -1) + S = np.exp(log_S) + + mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S) + mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S) + mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S) + + dmu = mu0 + mu1 + mu2 - mu + # dS = S0 + S1 + S2 -0.5 + .5/S + dlnS = S * (S0 + S1 + S2 - 0.5) + .5 + + return -np.hstack((dmu.flatten(), dlnS.flatten())) + + diff --git a/GPy/models/fitc.py b/GPy/models/fitc.py new file mode 100644 index 00000000..785ea46c --- /dev/null +++ b/GPy/models/fitc.py @@ -0,0 +1,252 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify, pdinv +from ..util.plot import gpplot +from .. import kern +from scipy import stats, linalg +from GPy.core.sparse_gp import SparseGP + +def backsub_both_sides(L, X): + """ Return L^-T * X * L^-1, assuming X is symmetrical and L is lower cholesky""" + tmp, _ = linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(X), lower=1, trans=1) + return linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(tmp.T), lower=1, trans=1)[0].T + +class FITC(SparseGP): + + def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False): + super(FITC, self).__init__(X, likelihood, kernel, Z=Z, X_variance=X_variance, normalize_X=normalize_X) + + def update_likelihood_approximation(self): + """ + Approximates a non-Gaussian likelihood using Expectation Propagation + + For a Gaussian (or direct: TODO) likelihood, no iteration is required: + this function does nothing + + Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in SparseGP. + The true precision is now 'true_precision' not 'precision'.
+ """ + if self.has_uncertain_inputs: + raise NotImplementedError, "FITC approximation not implemented for uncertain inputs" + else: + self.likelihood.fit_FITC(self.Kmm, self.psi1, self.psi0) + self._set_params(self._get_params()) # update the GP + + def _computations(self): + + # factor Kmm + self.Lm = jitchol(self.Kmm) + self.Lmi, info = linalg.lapack.flapack.dtrtrs(self.Lm, np.eye(self.num_inducing), lower=1) + Lmipsi1 = np.dot(self.Lmi, self.psi1) + self.Qnn = np.dot(Lmipsi1.T, Lmipsi1).copy() + self.Diag0 = self.psi0 - np.diag(self.Qnn) + self.beta_star = self.likelihood.precision / (1. + self.likelihood.precision * self.Diag0[:, None]) # Includes Diag0 in the precision + self.V_star = self.beta_star * self.likelihood.Y + + # The rather complex computations of self.A + if self.has_uncertain_inputs: + raise NotImplementedError + else: + if self.likelihood.is_heteroscedastic: + assert self.likelihood.input_dim == 1 + tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.N))) + tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1) + self.A = tdot(tmp) + + # factor B + self.B = np.eye(self.num_inducing) + self.A + self.LB = jitchol(self.B) + self.LBi = chol_inv(self.LB) + self.psi1V = np.dot(self.psi1, self.V_star) + + Lmi_psi1V, info = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0) + self._LBi_Lmi_psi1V, _ = linalg.lapack.flapack.dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0) + + Kmmipsi1 = np.dot(self.Lmi.T, Lmipsi1) + b_psi1_Ki = self.beta_star * Kmmipsi1.T + Ki_pbp_Ki = np.dot(Kmmipsi1, b_psi1_Ki) + Kmmi = np.dot(self.Lmi.T, self.Lmi) + LBiLmi = np.dot(self.LBi, self.Lmi) + LBL_inv = np.dot(LBiLmi.T, LBiLmi) + VVT = np.outer(self.V_star, self.V_star) + VV_p_Ki = np.dot(VVT, Kmmipsi1.T) + Ki_pVVp_Ki = np.dot(Kmmipsi1, VV_p_Ki) + psi1beta = self.psi1 * self.beta_star.T + H = self.Kmm + mdot(self.psi1, psi1beta.T) + LH = jitchol(H) + LHi = chol_inv(LH) + Hi = np.dot(LHi.T, LHi) + + betapsi1TLmiLBi = np.dot(psi1beta.T, LBiLmi.T) + alpha = np.array([np.dot(a.T, a) for a in betapsi1TLmiLBi])[:, None] + gamma_1 = mdot(VVT, self.psi1.T, Hi) + pHip = mdot(self.psi1.T, Hi, self.psi1) + gamma_2 = mdot(self.beta_star * pHip, self.V_star) + gamma_3 = self.V_star * gamma_2 + + self._dL_dpsi0 = -0.5 * self.beta_star # dA_dpsi0: logdet(self.beta_star) + self._dL_dpsi0 += .5 * self.V_star ** 2 # dA_psi0: yT*beta_star*y + self._dL_dpsi0 += .5 * alpha # dC_dpsi0 + self._dL_dpsi0 += 0.5 * mdot(self.beta_star * pHip, self.V_star) ** 2 - self.V_star * mdot(self.V_star.T, pHip * self.beta_star).T # dD_dpsi0 + + self._dL_dpsi1 = b_psi1_Ki.copy() # dA_dpsi1: logdet(self.beta_star) + self._dL_dpsi1 += -np.dot(psi1beta.T, LBL_inv) # dC_dpsi1 + self._dL_dpsi1 += gamma_1 - mdot(psi1beta.T, Hi, self.psi1, gamma_1) # dD_dpsi1 + + self._dL_dKmm = -0.5 * np.dot(Kmmipsi1, b_psi1_Ki) # dA_dKmm: logdet(self.beta_star) + self._dL_dKmm += .5 * (LBL_inv - Kmmi) + mdot(LBL_inv, psi1beta, Kmmipsi1.T) # dC_dKmm + self._dL_dKmm += -.5 * mdot(Hi, self.psi1, gamma_1) # dD_dKmm + + self._dpsi1_dtheta = 0 + self._dpsi1_dX = 0 + self._dKmm_dtheta = 0 + self._dKmm_dX = 0 + + self._dpsi1_dX_jkj = 0 + self._dpsi1_dtheta_jkj = 0 + + for i, V_n, alpha_n, gamma_n, gamma_k in zip(range(self.N), self.V_star, alpha, gamma_2, gamma_3): + K_pp_K = np.dot(Kmmipsi1[:, i:(i + 1)], Kmmipsi1[:, i:(i + 1)].T) + + # Diag_dpsi1 = Diag_dA_dpsi1: yT*beta_star*y + Diag_dC_dpsi1 +Diag_dD_dpsi1 + _dpsi1 = (-V_n ** 2 - alpha_n + 2.*gamma_k - gamma_n ** 2) * 
Kmmipsi1.T[i:(i + 1), :] + + # Diag_dKmm = Diag_dA_dKmm: yT*beta_star*y +Diag_dC_dKmm +Diag_dD_dKmm + _dKmm = .5 * (V_n ** 2 + alpha_n + gamma_n ** 2 - 2.*gamma_k) * K_pp_K # Diag_dD_dKmm + + self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1, self.X[i:i + 1, :], self.Z) + self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm, self.Z) + + self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm , self.Z) + self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T, self.Z, self.X[i:i + 1, :]) + + # the partial derivative vector for the likelihood + if self.likelihood.Nparams == 0: + # save computation here. + self.partial_for_likelihood = None + elif self.likelihood.is_heteroscedastic: + raise NotImplementedError, "heteroscedatic derivates not implemented" + else: + # likelihood is not heterscedatic + dbstar_dnoise = self.likelihood.precision * (self.beta_star ** 2 * self.Diag0[:, None] - self.beta_star) + Lmi_psi1 = mdot(self.Lmi, self.psi1) + LBiLmipsi1 = np.dot(self.LBi, Lmi_psi1) + aux_0 = np.dot(self._LBi_Lmi_psi1V.T, LBiLmipsi1) + aux_1 = self.likelihood.Y.T * np.dot(self._LBi_Lmi_psi1V.T, LBiLmipsi1) + aux_2 = np.dot(LBiLmipsi1.T, self._LBi_Lmi_psi1V) + + dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise / self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y ** 2 * dbstar_dnoise) + dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T, self.LBi, Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T) + dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T, self.LBi, Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T) + + dD_dnoise_1 = mdot(self.V_star * LBiLmipsi1.T, LBiLmipsi1 * dbstar_dnoise.T * self.likelihood.Y.T) + alpha = mdot(LBiLmipsi1, self.V_star) + alpha_ = mdot(LBiLmipsi1.T, alpha) + dD_dnoise_2 = -0.5 * self.input_dim * np.sum(alpha_ ** 2 * dbstar_dnoise) + + dD_dnoise_1 = mdot(self.V_star.T, self.psi1.T, self.Lmi.T, self.LBi.T, self.LBi, self.Lmi, self.psi1, dbstar_dnoise * self.likelihood.Y) + dD_dnoise_2 = 0.5 * mdot(self.V_star.T, self.psi1.T, Hi, self.psi1, dbstar_dnoise * self.psi1.T, Hi, self.psi1, self.V_star) + dD_dnoise = dD_dnoise_1 + dD_dnoise_2 + + self.partial_for_likelihood = dA_dnoise + dC_dnoise + dD_dnoise + + def log_likelihood(self): + """ Compute the (lower bound on the) log marginal likelihood """ + A = -0.5 * self.N * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y) + C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) + D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V)) + return A + C + D + + def _log_likelihood_gradients(self): + pass + return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood))) + + def dL_dtheta(self): + if self.has_uncertain_inputs: + raise NotImplementedError, "FITC approximation not implemented for uncertain inputs" + else: + dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0, self.X) + dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1, self.X, self.Z) + dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm, X=self.Z) + dL_dtheta += self._dKmm_dtheta + dL_dtheta += self._dpsi1_dtheta + return dL_dtheta + + def dL_dZ(self): + if self.has_uncertain_inputs: + raise NotImplementedError, "FITC approximation not implemented for uncertain inputs" + else: + dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T, self.Z, self.X) + dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm, X=self.Z) + dL_dZ += self._dpsi1_dX + dL_dZ += self._dKmm_dX + return dL_dZ + + def _raw_predict(self, Xnew, which_parts, full_cov=False): + + if self.likelihood.is_heteroscedastic: + Iplus_Dprod_i = 1. / (1. 
+ self.Diag0 * self.likelihood.precision.flatten()) + self.Diag = self.Diag0 * Iplus_Dprod_i + self.P = Iplus_Dprod_i[:, None] * self.psi1.T + self.RPT0 = np.dot(self.Lmi, self.psi1) + self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0, ((1. - Iplus_Dprod_i) / self.Diag0)[:, None] * self.RPT0.T)) + self.R, info = linalg.flapack.dtrtrs(self.L, self.Lmi, lower=1) + self.RPT = np.dot(self.R, self.P.T) + self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T, self.RPT) + self.w = self.Diag * self.likelihood.v_tilde + self.Gamma = np.dot(self.R.T, np.dot(self.RPT, self.likelihood.v_tilde)) + self.mu = self.w + np.dot(self.P, self.Gamma) + + """ + Make a prediction for the generalized FITC model + + Arguments + --------- + X : Input prediction data - Nx1 numpy array (floats) + """ + # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T) + + # Ci = I + (RPT0)Di(RPT0).T + # C = I - [RPT0] * (input_dim+[RPT0].T*[RPT0])^-1*[RPT0].T + # = I - [RPT0] * (input_dim + self.Qnn)^-1 * [RPT0].T + # = I - [RPT0] * (U*U.T)^-1 * [RPT0].T + # = I - V.T * V + U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn) + V, info = linalg.flapack.dtrtrs(U, self.RPT0.T, lower=1) + C = np.eye(self.num_inducing) - np.dot(V.T, V) + mu_u = np.dot(C, self.RPT0) * (1. / self.Diag0[None, :]) + # self.C = C + # self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T + # self.mu_u = mu_u + # self.U = U + # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T) + mu_H = np.dot(mu_u, self.mu) + self.mu_H = mu_H + Sigma_H = C + np.dot(mu_u, np.dot(self.Sigma, mu_u.T)) + # q(f_star|y) = N(f_star|mu_star,sigma2_star) + Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) + KR0T = np.dot(Kx.T, self.Lmi.T) + mu_star = np.dot(KR0T, mu_H) + if full_cov: + Kxx = self.kern.K(Xnew, which_parts=which_parts) + var = Kxx + np.dot(KR0T, np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T)) + else: + Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) + var = (Kxx + np.sum(KR0T.T * np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T), 0))[:, None] + return mu_star[:, None], var + else: + raise NotImplementedError, "homoscedastic fitc not implemented" + """ + Kx = self.kern.K(self.Z, Xnew) + mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V) + if full_cov: + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting + else: + Kxx = self.kern.Kdiag(Xnew) + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + return mu,var[:,None] + """ diff --git a/GPy/models/generalized_fitc.py b/GPy/models/generalized_fitc.py new file mode 100644 index 00000000..58da609d --- /dev/null +++ b/GPy/models/generalized_fitc.py @@ -0,0 +1,221 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot +from ..util.plot import gpplot +from .. import kern +from scipy import stats, linalg +from ..core import SparseGP + +def backsub_both_sides(L,X): + """ Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky""" + tmp,_ = linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(X),lower=1,trans=1) + return linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(tmp.T),lower=1,trans=1)[0].T + + +class GeneralizedFITC(SparseGP): + """ + Naish-Guzman, A. and Holden, S. (2008) implemantation of EP with FITC. 
+ + :param X: inputs + :type X: np.ndarray (N x input_dim) + :param likelihood: a likelihood instance, containing the observed data + :type likelihood: GPy.likelihood.(Gaussian | EP) + :param kernel : the kernel/covariance function. See link kernels + :type kernel: a GPy kernel + :param X_variance: The variance in the measurements of X (Gaussian variance) + :type X_variance: np.ndarray (N x input_dim) | None + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (num_inducing x input_dim) | None + :param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type num_inducing: int + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False): + self.Z = Z + self.num_inducing = self.Z.shape[0] + self.true_precision = likelihood.precision + + super(GeneralizedFITC, self).__init__(X, likelihood, kernel=kernel, Z=self.Z, X_variance=X_variance, normalize_X=normalize_X) + self._set_params(self._get_params()) + + def _set_params(self, p): + self.Z = p[:self.num_inducing*self.input_dim].reshape(self.num_inducing, self.input_dim) + self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam]) + self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:]) + self._compute_kernel_matrices() + self._computations() + self._FITC_computations() + + def update_likelihood_approximation(self): + """ + Approximates a non-gaussian likelihood using Expectation Propagation + + For a Gaussian (or direct: TODO) likelihood, no iteration is required: + this function does nothing + + Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in SparseGP. + The true precison is now 'true_precision' not 'precision'. + """ + if self.has_uncertain_inputs: + raise NotImplementedError, "FITC approximation not implemented for uncertain inputs" + else: + self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) + self.true_precision = self.likelihood.precision # Save the true precision + self.likelihood.precision = self.true_precision/(1. + self.true_precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation + self._set_params(self._get_params()) # update the GP + + def _FITC_computations(self): + """ + FITC approximation doesn't have the correction term in the log-likelihood bound, + but adds a diagonal term to the covariance matrix: diag(Knn - Qnn). + This function: + - computes the FITC diagonal term + - removes the extra terms computed in the SparseGP approximation + - computes the likelihood gradients wrt the true precision. + """ + #NOTE the true precison is now 'true_precision' not 'precision' + if self.likelihood.is_heteroscedastic: + + # Compute generalized FITC's diagonal term of the covariance + self.Lmi,info = linalg.lapack.flapack.dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1) + Lmipsi1 = np.dot(self.Lmi,self.psi1) + self.Qnn = np.dot(Lmipsi1.T,Lmipsi1) + #self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm) + #self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1) + #a = kj + self.Diag0 = self.psi0 - np.diag(self.Qnn) + Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.true_precision.flatten()) + self.Diag = self.Diag0 * Iplus_Dprod_i + + self.P = Iplus_Dprod_i[:,None] * self.psi1.T + self.RPT0 = np.dot(self.Lmi,self.psi1) + self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0,((1. 
- Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T)) + self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1) + self.RPT = np.dot(self.R,self.P.T) + self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT) + self.w = self.Diag * self.likelihood.v_tilde + self.Gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde)) + self.mu = self.w + np.dot(self.P,self.Gamma) + + # Remove extra term from dL_dpsi1 + self.dL_dpsi1 -= mdot(self.Lmi.T,Lmipsi1*self.likelihood.precision.flatten().reshape(1,self.N)) + #self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm) + #self.dL_dpsi1 -= mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB + + #########333333 + #self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) + #########333333 + + + + else: + raise NotImplementedError, "homoscedastic fitc not implemented" + # Remove extra term from dL_dpsi1 + #self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB + + sf = self.scale_factor + sf2 = sf**2 + + # Remove extra term from dL_dKmm + self.dL_dKmm += 0.5 * self.input_dim * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB + self.dL_dpsi0 = None + + #the partial derivative vector for the likelihood + if self.likelihood.Nparams == 0: + self.partial_for_likelihood = None + elif self.likelihood.is_heteroscedastic: + raise NotImplementedError, "heteroscedastic derivates not implemented" + else: + raise NotImplementedError, "homoscedastic derivatives not implemented" + #likelihood is not heterscedatic + #self.partial_for_likelihood = - 0.5 * self.N*self.input_dim*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2 + #self.partial_for_likelihood += 0.5 * self.input_dim * trace_dot(self.Bi,self.A)*self.likelihood.precision + #self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1)) + #TODO partial derivative vector for the likelihood not implemented + + def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) + if self.has_uncertain_inputs: + raise NotImplementedError, "heteroscedatic derivates not implemented" + else: + #NOTE in SparseGP this would include the gradient wrt psi0 + dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1,self.Z,self.X) + return dL_dtheta + + + def log_likelihood(self): + """ Compute the (lower bound on the) log marginal likelihood """ + sf2 = self.scale_factor**2 + if self.likelihood.is_heteroscedastic: + A = -0.5*self.N*self.input_dim*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y) + else: + A = -0.5*self.N*self.input_dim*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT + C = -self.input_dim * (np.sum(np.log(np.diag(self.LB))) + 0.5*self.num_inducing*np.log(sf2)) + #C = -0.5*self.input_dim * (self.B_logdet + self.num_inducing*np.log(sf2)) + D = 0.5*np.sum(np.square(self._LBi_Lmi_psi1V)) + #self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T) + #D_ = 0.5*np.trace(self.Cpsi1VVpsi1) + return A+C+D + + def _raw_predict(self, Xnew, which_parts, full_cov=False): + if self.likelihood.is_heteroscedastic: + """ + Make a prediction for the generalized FITC model + + Arguments + --------- + X : Input prediction data - Nx1 numpy array (floats) + """ + # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T) + + # Ci = I + (RPT0)Di(RPT0).T + # C = I 
- [RPT0] * (input_dim+[RPT0].T*[RPT0])^-1*[RPT0].T + # = I - [RPT0] * (input_dim + self.Qnn)^-1 * [RPT0].T + # = I - [RPT0] * (U*U.T)^-1 * [RPT0].T + # = I - V.T * V + U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn) + V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1) + C = np.eye(self.num_inducing) - np.dot(V.T,V) + mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:]) + #self.C = C + #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T + #self.mu_u = mu_u + #self.U = U + # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T) + mu_H = np.dot(mu_u,self.mu) + self.mu_H = mu_H + Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T)) + # q(f_star|y) = N(f_star|mu_star,sigma2_star) + Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) + KR0T = np.dot(Kx.T,self.Lmi.T) + mu_star = np.dot(KR0T,mu_H) + if full_cov: + Kxx = self.kern.K(Xnew,which_parts=which_parts) + var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T)) + else: + Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts) + Kxx_ = self.kern.K(Xnew,which_parts=which_parts) # TODO: RA, is this line needed? + var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T)) # TODO: RA, is this line needed? + var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T),0))[:,None] + return mu_star[:,None],var + else: + raise NotImplementedError, "homoscedastic fitc not implemented" + """ + Kx = self.kern.K(self.Z, Xnew) + mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V) + if full_cov: + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting + else: + Kxx = self.kern.Kdiag(Xnew) + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + return mu,var[:,None] + """ diff --git a/GPy/models/gp_classification.py b/GPy/models/gp_classification.py new file mode 100644 index 00000000..74455a2b --- /dev/null +++ b/GPy/models/gp_classification.py @@ -0,0 +1,41 @@ +# Copyright (c) 2013, Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from ..core import GP +from .. import likelihoods +from .. import kern + +class GP_classification(GP): + """ + Gaussian Process classification + + This is a thin wrapper around the models.GP class, with a set of sensible defalts + + :param X: input observations + :param Y: observed values + :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function + :param kernel: a GPy kernel, defaults to rbf + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_Y: False|True + + .. Note:: Multiple independent outputs are allowed using columns of Y + + """ + + def __init__(self,X,Y=None,likelihood=None,kernel=None,normalize_X=False,normalize_Y=False): + if kernel is None: + kernel = kern.rbf(X.shape[1]) + + if likelihood is None: + distribution = likelihoods.likelihood_functions.Binomial() + likelihood = likelihoods.EP(Y, distribution) + elif Y is not None: + if not all(Y.flatten() == likelihood.data.flatten()): + raise Warning, 'likelihood.data and Y are different.' 
+ + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) + self._set_params(self._get_params()) diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py new file mode 100644 index 00000000..526ef5f5 --- /dev/null +++ b/GPy/models/gp_regression.py @@ -0,0 +1,35 @@ +# Copyright (c) 2012, James Hensman +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from ..core import GP +from .. import likelihoods +from .. import kern + +class GPRegression(GP): + """ + Gaussian Process model for regression + + This is a thin wrapper around the models.GP class, with a set of sensible defaults + + :param X: input observations + :param Y: observed values + :param kernel: a GPy kernel, defaults to rbf + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales) + :type normalize_Y: False|True + + .. Note:: Multiple independent outputs are allowed using columns of Y + + """ + + def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False): + if kernel is None: + kernel = kern.rbf(X.shape[1]) + + likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y) + + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) + self._set_params(self._get_params()) diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py new file mode 100644 index 00000000..5589304a --- /dev/null +++ b/GPy/models/gplvm.py @@ -0,0 +1,67 @@ +### Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +import pylab as pb +import sys, pdb +from .. import kern +from ..core import model +from ..util.linalg import pdinv, PCA +from ..core import GP +from ..likelihoods import Gaussian +from ..
import util +from GPy.util import plot_latent + + +class GPLVM(GP): + """ + Gaussian Process Latent Variable Model + + :param Y: observed data + :type Y: np.ndarray + :param input_dim: latent dimensionality + :type input_dim: int + :param init: initialisation method for the latent space + :type init: 'PCA'|'random' + + """ + def __init__(self, Y, input_dim, init='PCA', X = None, kernel=None, normalize_Y=False): + if X is None: + X = self.initialise_latent(init, input_dim, Y) + if kernel is None: + kernel = kern.rbf(input_dim, ARD=input_dim>1) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2)) + likelihood = Gaussian(Y, normalize=normalize_Y) + GP.__init__(self, X, likelihood, kernel, normalize_X=False) + self._set_params(self._get_params()) + + def initialise_latent(self, init, input_dim, Y): + if init == 'PCA': + return PCA(Y, input_dim)[0] + else: + return np.random.randn(Y.shape[0], input_dim) + + def _get_param_names(self): + return sum([['X_%i_%i'%(n,q) for q in range(self.input_dim)] for n in range(self.N)],[]) + GP._get_param_names(self) + + def _get_params(self): + return np.hstack((self.X.flatten(), GP._get_params(self))) + + def _set_params(self,x): + self.X = x[:self.N*self.input_dim].reshape(self.N,self.input_dim).copy() + GP._set_params(self, x[self.X.size:]) + + def _log_likelihood_gradients(self): + dL_dX = 2.*self.kern.dK_dX(self.dL_dK,self.X) + + return np.hstack((dL_dX.flatten(),GP._log_likelihood_gradients(self))) + + def plot(self): + assert self.likelihood.Y.shape[1]==2 + pb.scatter(self.likelihood.Y[:,0],self.likelihood.Y[:,1],40,self.X[:,0].copy(),linewidth=0,cmap=pb.cm.jet) + Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None] + mu, var, upper, lower = self.predict(Xnew) + pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5) + + def plot_latent(self, *args, **kwargs): + return util.plot_latent.plot_latent(self, *args, **kwargs) diff --git a/GPy/models/sparse_gp_classification.py b/GPy/models/sparse_gp_classification.py new file mode 100644 index 00000000..0e4b8147 --- /dev/null +++ b/GPy/models/sparse_gp_classification.py @@ -0,0 +1,50 @@ +# Copyright (c) 2013, Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from ..core import sparse_GP +from .. import likelihoods +from .. import kern +from ..likelihoods import likelihood +from GPRegression import GPRegression + +class sparse_GP_classification(sparse_GP): + """ + sparse Gaussian Process model for classification + + This is a thin wrapper around the sparse_GP class, with a set of sensible defalts + + :param X: input observations + :param Y: observed values + :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function + :param kernel: a GPy kernel, defaults to rbf+white + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_Y: False|True + :rtype: model object + + .. 
Note:: Multiple independent outputs are allowed using columns of Y + + """ + + def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10): + if kernel is None: + kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3) + + if likelihood is None: + distribution = likelihoods.likelihood_functions.Binomial() + likelihood = likelihoods.EP(Y, distribution) + elif Y is not None: + if not all(Y.flatten() == likelihood.data.flatten()): + raise Warning, 'likelihood.data and Y are different.' + + if Z is None: + i = np.random.permutation(X.shape[0])[:M] + Z = X[i].copy() + else: + assert Z.shape[1]==X.shape[1] + + sparse_GP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X) + self._set_params(self._get_params()) diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py new file mode 100644 index 00000000..d7907c5f --- /dev/null +++ b/GPy/models/sparse_gp_regression.py @@ -0,0 +1,45 @@ +# Copyright (c) 2012, James Hensman +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from ..core import SparseGP +from .. import likelihoods +from .. import kern + +class SparseGPRegression(SparseGP): + """ + Gaussian Process model for regression + + This is a thin wrapper around the SparseGP class, with a set of sensible defalts + + :param X: input observations + :param Y: observed values + :param kernel: a GPy kernel, defaults to rbf+white + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_Y: False|True + :rtype: model object + + .. Note:: Multiple independent outputs are allowed using columns of Y + + """ + + def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10, X_variance=None): + # kern defaults to rbf (plus white for stability) + if kernel is None: + kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3) + + # Z defaults to a subset of the data + if Z is None: + i = np.random.permutation(X.shape[0])[:M] + Z = X[i].copy() + else: + assert Z.shape[1] == X.shape[1] + + # likelihood defaults to Gaussian + likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y) + + SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance) + self._set_params(self._get_params()) diff --git a/GPy/models/sparse_gplvm.py b/GPy/models/sparse_gplvm.py new file mode 100644 index 00000000..bec8c2da --- /dev/null +++ b/GPy/models/sparse_gplvm.py @@ -0,0 +1,61 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +import pylab as pb +import sys, pdb +from GPy.models.sparse_gp_regression import SparseGPRegression +from GPy.models.gplvm import GPLVM +# from .. 
import kern +# from ..core import model +# from ..util.linalg import pdinv, PCA + +class SparseGPLVM(SparseGPRegression, GPLVM): + """ + Sparse Gaussian Process Latent Variable Model + + :param Y: observed data + :type Y: np.ndarray + :param input_dim: latent dimensionality + :type input_dim: int + :param init: initialisation method for the latent space + :type init: 'PCA'|'random' + + """ + def __init__(self, Y, input_dim, kernel=None, init='PCA', M=10): + X = self.initialise_latent(init, input_dim, Y) + SparseGPRegression.__init__(self, X, Y, kernel=kernel, M=M) + + def _get_param_names(self): + return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], []) + + SparseGPRegression._get_param_names(self)) + + def _get_params(self): + return np.hstack((self.X.flatten(), SparseGPRegression._get_params(self))) + + def _set_params(self, x): + self.X = x[:self.X.size].reshape(self.N, self.input_dim).copy() + SparseGPRegression._set_params(self, x[self.X.size:]) + + def log_likelihood(self): + return SparseGPRegression.log_likelihood(self) + + def dL_dX(self): + dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X) + dL_dX += self.kern.dK_dX(self.dL_dpsi1.T, self.X, self.Z) + + return dL_dX + + def _log_likelihood_gradients(self): + return np.hstack((self.dL_dX().flatten(), SparseGPRegression._log_likelihood_gradients(self))) + + def plot(self): + GPLVM.plot(self) + # passing Z without a small amout of jitter will induce the white kernel where we don;t want it! + mu, var, upper, lower = SparseGPRegression.predict(self, self.Z + np.random.randn(*self.Z.shape) * 0.0001) + pb.plot(mu[:, 0] , mu[:, 1], 'ko') + + def plot_latent(self, *args, **kwargs): + input_1, input_2 = GPLVM.plot_latent(*args, **kwargs) + pb.plot(m.Z[:, input_1], m.Z[:, input_2], '^w') diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py new file mode 100644 index 00000000..fcef66c6 --- /dev/null +++ b/GPy/models/warped_gp.py @@ -0,0 +1,89 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from ..util.warping_functions import * +from ..core import GP +from .. 
import likelihoods +from GPy.util.warping_functions import TanhWarpingFunction_d +from GPy import kern + +class WarpedGP(GP): + def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False): + + if kernel is None: + kernel = kern.rbf(X.shape[1]) + + if warping_function == None: + self.warping_function = TanhWarpingFunction_d(warping_terms) + self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1) + + Y = self._scale_data(Y) + self.has_uncertain_inputs = False + self.Y_untransformed = Y.copy() + self.predict_in_warped_space = False + likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y) + + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) + self._set_params(self._get_params()) + + def _scale_data(self, Y): + self._Ymax = Y.max() + self._Ymin = Y.min() + return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5 + + def _unscale_data(self, Y): + return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin + + def _set_params(self, x): + self.warping_params = x[:self.warping_function.num_parameters] + Y = self.transform_data() + self.likelihood.set_data(Y) + GP._set_params(self, x[self.warping_function.num_parameters:].copy()) + + def _get_params(self): + return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy())) + + def _get_param_names(self): + warping_names = self.warping_function._get_param_names() + param_names = GP._get_param_names(self) + return warping_names + param_names + + def transform_data(self): + Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy() + return Y + + def log_likelihood(self): + ll = GP.log_likelihood(self) + jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params) + return ll + np.log(jacobian).sum() + + def _log_likelihood_gradients(self): + ll_grads = GP._log_likelihood_gradients(self) + alpha = np.dot(self.Ki, self.likelihood.Y.flatten()) + warping_grads = self.warping_function_gradients(alpha) + + warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1]) + return np.hstack((warping_grads.flatten(), ll_grads.flatten())) + + def warping_function_gradients(self, Kiy): + grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params) + grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params, + return_covar_chain=True) + djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0) + dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0) + + return -dquad_dpsi + djac_dpsi + + def plot_warping(self): + self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max()) + + def _raw_predict(self, *args, **kwargs): + mu, var = GP._raw_predict(self, *args, **kwargs) + + if self.predict_in_warped_space: + mu = self.warping_function.f_inv(mu, self.warping_params) + var = self.warping_function.f_inv(var, self.warping_params) + mu = self._unscale_data(mu) + return mu, var