refactoring files added

Max Zwiessele 2013-06-05 14:11:49 +01:00
parent 802d6e7792
commit 7040b26f41
16 changed files with 2852 additions and 0 deletions

150
GPy/core/gp.py Normal file

@@ -0,0 +1,150 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import linalg
import pylab as pb
from .. import kern
from ..util.linalg import pdinv, mdot, tdot
#from ..util.plot import gpplot, Tango
from ..likelihoods import EP
from gp_base import GPBase
class GP(GPBase):
"""
Gaussian Process model for regression and EP
:param X: input observations
:param kernel: a GPy kernel, defaults to rbf+white
:param likelihood: a GPy likelihood
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:rtype: model object
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
:type powerep: list
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, likelihood, kernel, normalize_X=False):
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())
def _set_params(self, p):
self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
self.likelihood._set_params(p[self.kern.Nparam_transformed():])
self.K = self.kern.K(self.X)
self.K += self.likelihood.covariance_matrix
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
# the gradient of the likelihood wrt the covariance matrix
if self.likelihood.YYT is None:
#alpha = np.dot(self.Ki, self.likelihood.Y)
alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1)
self.dL_dK = 0.5 * (tdot(alpha) - self.input_dim * self.Ki)
else:
#tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
self.dL_dK = 0.5 * (tmp - self.input_dim * self.Ki)
def _get_params(self):
return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation.
For a Gaussian likelihood, no iteration is required:
this function does nothing
"""
self.likelihood.fit_full(self.kern.K(self.X))
self._set_params(self._get_params()) # update the GP
def _model_fit_term(self):
"""
Computes the model fit using YYT if it's available
"""
if self.likelihood.YYT is None:
tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
return -0.5 * np.sum(np.square(tmp))
#return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
else:
return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
def log_likelihood(self):
"""
The log marginal likelihood of the GP.
For an EP model, it can be written as the log likelihood of a regression
model for a new variable Y* = v_tilde/tau_tilde, with covariance
matrix K* = K + diag(1./tau_tilde), plus a normalization term.
"""
return -0.5 * self.input_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z
def _log_likelihood_gradients(self):
"""
The gradient of all parameters.
Note: we use the chain rule, dL_dtheta = dL_dK * dK_dtheta
"""
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
"""
Internal helper function for making predictions, does not account
for normalization or likelihood
"""
Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
#KiKx = np.dot(self.Ki, Kx)
KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
mu = np.dot(KiKx.T, self.likelihood.Y)
if full_cov:
Kxx = self.kern.K(_Xnew, which_parts=which_parts)
var = Kxx - np.dot(KiKx.T, Kx)
else:
Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
var = var[:, None]
if stop:
debug_this # @UndefinedVariable -- intentionally undefined: raises a NameError to halt execution here when stop=True
return mu, var
def predict(self, Xnew, which_parts='all', full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim
:param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.input_dim
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
"""
# normalize X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
return mean, var, _025pm, _975pm
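
A usage sketch (not part of the diff): assuming the refactored class is exposed as GPy.core.GP and the Gaussian noise model added further down in this commit as GPy.likelihoods.Gaussian, a regression model is built and queried along these lines:

import numpy as np
import GPy

X = np.random.uniform(-3., 3., (40, 1))           # training inputs
Y = np.sin(X) + 0.05 * np.random.randn(40, 1)     # noisy observations
kernel = GPy.kern.rbf(1) + GPy.kern.white(1)      # the rbf+white default named in the docstring
likelihood = GPy.likelihoods.Gaussian(Y)          # Gaussian noise model (see GPy/likelihoods below)
m = GPy.core.GP(X, likelihood, kernel)
# predict() returns the posterior mean, variance and the 95% interval bounds
mean, var, lower, upper = m.predict(np.linspace(-3., 3., 100)[:, None])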

304
GPy/core/sparse_gp.py Normal file

@@ -0,0 +1,304 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv
from scipy import linalg
from ..likelihoods import Gaussian
from gp_base import GPBase
class SparseGP(GPBase):
"""
Variational sparse GP model
:param X: inputs
:type X: np.ndarray (N x input_dim)
:param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
:param kernel : the kernel (covariance function). See link kernels
:type kernel: a GPy.kern.kern instance
:param X_variance: The uncertainty in the measurements of X (Gaussian variance)
:type X_variance: np.ndarray (N x input_dim) | None
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (num_inducing x input_dim) | None
:param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type num_inducing: int
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self.Z = Z
self.num_inducing = Z.shape[0]
self.likelihood = likelihood
if X_variance is None:
self.has_uncertain_inputs = False
else:
assert X_variance.shape == X.shape
self.has_uncertain_inputs = True
self.X_variance = X_variance
if normalize_X:
self.Z = (self.Z.copy() - self._Xmean) / self._Xstd
# normalize X uncertainty also
if self.has_uncertain_inputs:
self.X_variance /= np.square(self._Xstd)
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z, self.X, self.X_variance)
self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance).T
self.psi2 = self.kern.psi2(self.Z, self.X, self.X_variance)
else:
self.psi0 = self.kern.Kdiag(self.X)
self.psi1 = self.kern.K(self.Z, self.X)
self.psi2 = None
def _computations(self):
# factor Kmm
self.Lm = jitchol(self.Kmm)
# The rather complex computations of self.A
if self.has_uncertain_inputs:
if self.likelihood.is_heteroscedastic:
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.N, 1, 1))).sum(0)
else:
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
evals, evecs = linalg.eigh(psi2_beta)
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
tmp = evecs * np.sqrt(clipped_evals)
else:
if self.likelihood.is_heteroscedastic:
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.N)))
else:
tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
self.A = tdot(tmp)
# factor B
self.B = np.eye(self.num_inducing) + self.A
self.LB = jitchol(self.B)
# TODO: make a switch for either first compute psi1V, or VV.T
self.psi1V = np.dot(self.psi1, self.likelihood.V)
# back-substitute C into psi1V
tmp, info1 = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
self._LBi_Lmi_psi1V, _ = linalg.lapack.flapack.dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
tmp, info2 = linalg.lapack.flapack.dpotrs(self.LB, tmp, lower=1)
self.Cpsi1V, info3 = linalg.lapack.flapack.dtrtrs(self.Lm, tmp, lower=1, trans=1)
# Compute dL_dKmm
tmp = tdot(self._LBi_Lmi_psi1V)
self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.input_dim * np.eye(self.num_inducing) + tmp)
tmp = -0.5 * self.DBi_plus_BiPBi
tmp += -0.5 * self.B * self.input_dim
tmp += self.input_dim * np.eye(self.num_inducing)
self.dL_dKmm = backsub_both_sides(self.Lm, tmp)
# Compute dL_dpsi # FIXME: this is untested for the heteroscedastic + uncertain inputs case
self.dL_dpsi0 = -0.5 * self.input_dim * (self.likelihood.precision * np.ones([self.N, 1])).flatten()
self.dL_dpsi1 = np.dot(self.Cpsi1V, self.likelihood.V.T)
dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.input_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
if self.likelihood.is_heteroscedastic:
if self.has_uncertain_inputs:
self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
else:
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, self.psi1 * self.likelihood.precision.reshape(1, self.N))
self.dL_dpsi2 = None
else:
dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta
if self.has_uncertain_inputs:
# repeat for each of the N psi_2 matrices
self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.N, axis=0)
else:
# subsume back into psi1 (==Kmn)
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2, self.psi1)
self.dL_dpsi2 = None
# the partial derivative vector for the likelihood
if self.likelihood.Nparams == 0:
# save computation here.
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
raise NotImplementedError, "heteroscedastic derivatives not implemented"
else:
# likelihood is not heteroscedastic
self.partial_for_likelihood = -0.5 * self.N * self.input_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
self.partial_for_likelihood += 0.5 * self.input_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision)
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - np.sum(np.square(self._LBi_Lmi_psi1V)))
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
if self.likelihood.is_heteroscedastic:
A = -0.5 * self.N * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
else:
A = -0.5 * self.N * self.input_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
return A + B + C + D + self.likelihood.Z
def _set_params(self, p):
self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim)
self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.Nparam])
self.likelihood._set_params(p[self.Z.size + self.kern.Nparam:])
self._compute_kernel_matrices()
self._computations()
def _get_params(self):
return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
def _get_param_names(self):
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[])\
+ self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation.
For a Gaussian likelihood, no iteration is required:
this function does nothing
"""
if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
self.likelihood.restart() # TODO check consistency with pseudo_EP
if self.has_uncertain_inputs:
Lmi = chol_inv(self.Lm)
Kmmi = tdot(Lmi.T)
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
self.likelihood.fit_FITC(self.Kmm, self.psi1, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perform a FITC-EP. # TODO: resolve potential confusion
# raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
else:
self.likelihood.fit_DTC(self.Kmm, self.psi1)
# self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
self._set_params(self._get_params()) # update the GP
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
def dL_dtheta(self):
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X, self.X_variance)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T, self.Z, self.X, self.X_variance)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.Z, self.X)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
return dL_dtheta
def dL_dZ(self):
"""
The derivative of the bound wrt the inducing inputs Z
"""
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm, self.Z) # factor of two because of vertical and horizontal 'stripes' in dKmm_dZ
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
dL_dZ += self.kern.dK_dX(self.dL_dpsi1, self.Z, self.X)
return dL_dZ
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
"""Internal helper function for making predictions, does not account for normalization"""
Bi, _ = linalg.lapack.flapack.dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
symmetrify(Bi)
Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi)
if X_variance_new is None:
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
mu = np.dot(Kx.T, self.Cpsi1V)
if full_cov:
Kxx = self.kern.K(Xnew, which_parts=which_parts)
var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
else:
# assert which_parts=='all', "switching out parts of variational kernels is not implemented"
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
mu = np.dot(Kx, self.Cpsi1V)
if full_cov:
raise NotImplementedError, "TODO"
else:
Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
return mu, var[:, None]
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim
:param X_variance_new: The uncertainty in the prediction points
:type X_variance_new: np.ndarray, Nnew x self.input_dim
:param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.input_dim
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
"""
# normalize X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
if X_variance_new is not None:
X_variance_new = X_variance_new / self._Xstd ** 2
# here's the actual prediction by the GP model
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
return mean, var, _025pm, _975pm
def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
if which_data == 'all':
which_data = slice(None)
GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax)
if self.X.shape[1] == 1:
if self.has_uncertain_inputs:
Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now
ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
Zu = self.Z * self._Xstd + self._Xmean
ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
elif self.X.shape[1] == 2:
Zu = self.Z * self._Xstd + self._Xmean
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
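
A corresponding sketch for the sparse model, assuming it is exposed as GPy.core.SparseGP (bayesian_gplvm.py further down imports it via `from ..core import SparseGP`). Z holds the inducing inputs; passing an X_variance array instead of None would switch on the uncertain-inputs (psi-statistics) branch:

import numpy as np
import GPy

X = np.random.uniform(-3., 3., (200, 1))
Y = np.sin(X) + 0.05 * np.random.randn(200, 1)
Z = np.random.permutation(X.copy())[:10]            # 10 inducing inputs, picked as in BayesianGPLVM below
kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
likelihood = GPy.likelihoods.Gaussian(Y)
m = GPy.core.SparseGP(X, likelihood, kernel, Z=Z)   # X_variance=None: inputs treated as certain
mean, var, lower, upper = m.predict(np.linspace(-3., 3., 50)[:, None])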

169
GPy/inference/scg.py Normal file

@@ -0,0 +1,169 @@
# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
# Scaled Conjugate Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to Python by N. Lawrence and given a pythonic interface by James Hensman
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
# HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import numpy as np
import sys
def print_out(len_maxiters, display, fnow, current_grad, beta, iteration):
if display:
print '\r',
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
sys.stdout.flush()
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None):
"""
Optimisation through Scaled Conjugate Gradients (SCG)
f: the objective function
gradf : the gradient function (should return a 1D np.ndarray)
x : the initial condition
Returns
x : the optimal value of x
flog : a list of all the objective values
function_eval : number of function evaluations
status : string describing convergence status
"""
if xtol is None:
xtol = 1e-6
if ftol is None:
ftol = 1e-6
if gtol is None:
gtol = 1e-5
sigma0 = 1.0e-8
fold = f(x, *optargs) # Initial function value.
function_eval = 1
fnow = fold
gradnew = gradf(x, *optargs) # Initial gradient.
current_grad = np.dot(gradnew, gradnew)
gradold = gradnew.copy()
d = -gradnew # Initial search direction.
success = True # Force calculation of directional derivs.
nsuccess = 0 # nsuccess counts number of successes.
beta = 1.0 # Initial scale parameter.
betamin = 1.0e-60 # Lower bound on scale.
betamax = 1.0e100 # Upper bound on scale.
status = "Not converged"
flog = [fold]
iteration = 0
len_maxiters = len(str(maxiters))
if display:
print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
# Main optimization loop.
while iteration < maxiters:
# Calculate first and second directional derivatives.
if success:
mu = np.dot(d, gradnew)
if mu >= 0:
d = -gradnew
mu = np.dot(d, gradnew)
kappa = np.dot(d, d)
sigma = sigma0 / np.sqrt(kappa)
xplus = x + sigma * d
gplus = gradf(xplus, *optargs)
theta = np.dot(d, (gplus - gradnew)) / sigma
# Increase effective curvature and evaluate step size alpha.
delta = theta + beta * kappa
if delta <= 0:
delta = beta * kappa
beta = beta - theta / kappa
alpha = -mu / delta
# Calculate the comparison ratio.
xnew = x + alpha * d
fnew = f(xnew, *optargs)
function_eval += 1
if function_eval >= max_f_eval:
status = "Maximum number of function evaluations exceeded"
break
# return x, flog, function_eval, status
Delta = 2.*(fnew - fold) / (alpha * mu)
if Delta >= 0.:
success = True
nsuccess += 1
x = xnew
fnow = fnew
else:
success = False
fnow = fold
# Store relevant variables
flog.append(fnow) # Current function value
iteration += 1
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
if success:
# Test for termination
if (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol):
status = 'converged'
break
# return x, flog, function_eval, status
else:
# Update variables for new position
gradnew = gradf(x, *optargs)
current_grad = np.dot(gradnew, gradnew)
gradold = gradnew
fold = fnew
# If the gradient is zero then we are done.
if current_grad <= gtol:
status = 'converged'
break
# return x, flog, function_eval, status
# Adjust beta according to comparison ratio.
if Delta < 0.25:
beta = min(4.0 * beta, betamax)
if Delta > 0.75:
beta = max(0.5 * beta, betamin)
# Update search direction using Polak-Ribiere formula, or re-start
# in direction of negative gradient after nparams steps.
if nsuccess == x.size:
d = -gradnew
# beta = 1. # TODO: betareset!!
nsuccess = 0
elif success:
Gamma = np.dot(gradold - gradnew, gradnew) / (mu)
d = Gamma * d - gradnew
else:
# If we get here, then we haven't terminated in the given number of
# iterations.
status = "maxiter exceeded"
if display:
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
print ""
return x, flog, function_eval, status
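
Since the optimiser is self-contained, it can be exercised on a toy quadratic. A sketch, assuming the function is importable from this new module (elsewhere in the commit it is imported via GPy.inference.optimization):

import numpy as np
from GPy.inference.scg import SCG

def f(x):
    return float(np.sum(np.square(x - 3.)))         # objective with its minimum at x = 3

def gradf(x):
    return 2. * (x - 3.)                             # gradient, a 1D np.ndarray as SCG expects

x_opt, flog, n_evals, status = SCG(f, gradf, np.zeros(5), display=False, maxiters=200)
# x_opt is close to [3, 3, 3, 3, 3]; flog holds the objective value at every iteration;
# status reports 'converged', 'maxiter exceeded' or the max_f_eval message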

356
GPy/inference/sgd.py Normal file

@@ -0,0 +1,356 @@
import numpy as np
import scipy as sp
import scipy.sparse
from optimization import Optimizer
from scipy import linalg, optimize
import pylab as plt
import copy, sys, pickle
class opt_SGD(Optimizer):
"""
Optimize using stochastic gradient descent.
*** Parameters ***
model: reference to the model object
iterations: number of iterations
learning_rate: learning rate
momentum: momentum
"""
def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
self.opt_name = "Stochastic Gradient Descent"
self.model = model
self.iterations = iterations
self.momentum = momentum
self.learning_rate = learning_rate
self.x_opt = None
self.f_opt = None
self.messages = messages
self.batch_size = batch_size
self.self_paced = self_paced
self.center = center
self.param_traces = [('noise',[])]
self.iteration_file = iteration_file
self.learning_rate_adaptation = learning_rate_adaptation
self.actual_iter = actual_iter
if self.learning_rate_adaptation is not None:
if self.learning_rate_adaptation == 'annealing':
self.learning_rate_0 = self.learning_rate
else:
self.learning_rate_0 = self.learning_rate.mean()
self.schedule = schedule
# if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
# self.param_traces.append(('bias',[]))
# if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
# self.param_traces.append(('linear',[]))
# if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
# self.param_traces.append(('rbf_var',[]))
self.param_traces = dict(self.param_traces)
self.fopt_trace = []
num_params = len(self.model._get_params())
if isinstance(self.learning_rate, float):
self.learning_rate = np.ones((num_params,)) * self.learning_rate
assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
def __str__(self):
status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
status += "Number of iterations: \t\t %d\n" % self.iterations
status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
status += "Momentum: \t\t\t %.3f\n" % self.momentum
status += "Batch size: \t\t\t %d\n" % self.batch_size
status += "Time elapsed: \t\t\t %s\n" % self.time
return status
def plot_traces(self):
plt.figure()
plt.subplot(211)
plt.title('Parameters')
for k in self.param_traces.keys():
plt.plot(self.param_traces[k], label=k)
plt.legend(loc=0)
plt.subplot(212)
plt.title('Objective function')
plt.plot(self.fopt_trace)
def non_null_samples(self, data):
return (np.isnan(data).sum(axis=1) == 0)
def check_for_missing(self, data):
if sp.sparse.issparse(self.model.likelihood.Y):
return True
else:
return np.isnan(data).sum() > 0
def subset_parameter_vector(self, x, samples, param_shapes):
subset = np.array([], dtype = int)
x = np.arange(0, len(x))
i = 0
for s in param_shapes:
N, input_dim = s
X = x[i:i+N*input_dim].reshape(N, input_dim)
X = X[samples]
subset = np.append(subset, X.flatten())
i += N*input_dim
subset = np.append(subset, x[i:])
return subset
def shift_constraints(self, j):
constrained_indices = copy.deepcopy(self.model.constrained_indices)
for c, constraint in enumerate(constrained_indices):
mask = (np.ones_like(constrained_indices[c]) == 1)
for i in range(len(constrained_indices[c])):
pos = np.where(j == constrained_indices[c][i])[0]
if len(pos) == 1:
self.model.constrained_indices[c][i] = pos
else:
mask[i] = False
self.model.constrained_indices[c] = self.model.constrained_indices[c][mask]
return constrained_indices
# back them up
# bounded_i = copy.deepcopy(self.model.constrained_bounded_indices)
# bounded_l = copy.deepcopy(self.model.constrained_bounded_lowers)
# bounded_u = copy.deepcopy(self.model.constrained_bounded_uppers)
# for b in range(len(bounded_i)): # for each group of constraints
# for bc in range(len(bounded_i[b])):
# pos = np.where(j == bounded_i[b][bc])[0]
# if len(pos) == 1:
# pos2 = np.where(self.model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
# self.model.constrained_bounded_indices[b][pos2] = pos[0]
# else:
# if len(self.model.constrained_bounded_indices[b]) == 1:
# # if it's the last index to be removed
# # the logic here is just a mess. If we remove the last one, then all the
# # b-indices change and we have to iterate through everything to find our
# # current index. Can't deal with this right now.
# raise NotImplementedError
# else: # just remove it from the indices
# mask = self.model.constrained_bounded_indices[b] != bc
# self.model.constrained_bounded_indices[b] = self.model.constrained_bounded_indices[b][mask]
# # here we shif the positive constraints. We cycle through each positive
# # constraint
# positive = self.model.constrained_positive_indices.copy()
# mask = (np.ones_like(positive) == 1)
# for p in range(len(positive)):
# # we now check whether the constrained index appears in the j vector
# # (the vector of the "active" indices)
# pos = np.where(j == self.model.constrained_positive_indices[p])[0]
# if len(pos) == 1:
# self.model.constrained_positive_indices[p] = pos
# else:
# mask[p] = False
# self.model.constrained_positive_indices = self.model.constrained_positive_indices[mask]
# return (bounded_i, bounded_l, bounded_u), positive
def restore_constraints(self, c):#b, p):
# self.model.constrained_bounded_indices = b[0]
# self.model.constrained_bounded_lowers = b[1]
# self.model.constrained_bounded_uppers = b[2]
# self.model.constrained_positive_indices = p
self.model.constrained_indices = c
def get_param_shapes(self, N = None, input_dim = None):
model_name = self.model.__class__.__name__
if model_name == 'GPLVM':
return [(N, input_dim)]
if model_name == 'Bayesian_GPLVM':
return [(N, input_dim), (N, input_dim)]
else:
raise NotImplementedError
def step_with_missing_data(self, f_fp, X, step, shapes):
N, input_dim = X.shape
if not sp.sparse.issparse(self.model.likelihood.Y):
Y = self.model.likelihood.Y
samples = self.non_null_samples(self.model.likelihood.Y)
self.model.N = samples.sum()
Y = Y[samples]
else:
samples = self.model.likelihood.Y.nonzero()[0]
self.model.N = len(samples)
Y = np.asarray(self.model.likelihood.Y[samples].todense(), dtype = np.float64)
if self.model.N == 0 or Y.std() == 0.0:
return 0, step, self.model.N
self.model.likelihood._offset = Y.mean()
self.model.likelihood._scale = Y.std()
self.model.likelihood.set_data(Y)
# self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision
sigma = self.model.likelihood._variance
self.model.likelihood._variance = None # invalidate cache
self.model.likelihood._set_params(sigma)
j = self.subset_parameter_vector(self.x_opt, samples, shapes)
self.model.X = X[samples]
model_name = self.model.__class__.__name__
if model_name == 'Bayesian_GPLVM':
self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
ci = self.shift_constraints(j)
f, fp = f_fp(self.x_opt[j])
step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
self.x_opt[j] -= step[j]
self.restore_constraints(ci)
self.model.grads[j] = fp
# restore likelihood _offset and _scale, otherwise when we call set_data(y) on
# the next feature, it will get normalized with the mean and std of this one.
self.model.likelihood._offset = 0
self.model.likelihood._scale = 1
return f, step, self.model.N
def adapt_learning_rate(self, t, D):
if self.learning_rate_adaptation == 'adagrad':
if t > 0:
g_k = self.model.grads
self.s_k += np.square(g_k)
t0 = 100.0
self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
import pdb; pdb.set_trace()
else:
self.learning_rate = np.zeros_like(self.learning_rate)
self.s_k = np.zeros_like(self.x_opt)
elif self.learning_rate_adaptation == 'annealing':
#self.learning_rate = self.learning_rate_0/(1+float(t+1)/10)
self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t]
elif self.learning_rate_adaptation == 'semi_pesky':
if self.model.__class__.__name__ == 'Bayesian_GPLVM':
g_t = self.model.grads
if t == 0:
self.hbar_t = 0.0
self.tau_t = 100.0
self.gbar_t = 0.0
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t)
self.tau_t = self.tau_t*(1-self.learning_rate) + 1 # update the memory size of the running averages
def opt(self, f_fp=None, f=None, fp=None):
self.x_opt = self.model._get_params_transformed()
self.grads = []
X, Y = self.model.X.copy(), self.model.likelihood.Y.copy()
self.model.likelihood.YYT = 0
self.model.likelihood.trYYT = 0
self.model.likelihood._offset = 0.0
self.model.likelihood._scale = 1.0
N, input_dim = self.model.X.shape
D = self.model.likelihood.Y.shape[1]
num_params = self.model._get_params()
self.trace = []
missing_data = self.check_for_missing(self.model.likelihood.Y)
step = np.zeros_like(num_params)
for it in range(self.iterations):
if self.actual_iter is not None:
it = self.actual_iter
self.model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
if it == 0 or self.self_paced is False:
features = np.random.permutation(Y.shape[1])
else:
features = np.argsort(NLL)
b = len(features)/self.batch_size
features = [features[i::b] for i in range(b)]
NLL = []
import pylab as plt
for count, j in enumerate(features):
self.model.input_dim = len(j)
self.model.likelihood.input_dim = len(j)
self.model.likelihood.set_data(Y[:, j])
# self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision
sigma = self.model.likelihood._variance
self.model.likelihood._variance = None # invalidate cache
self.model.likelihood._set_params(sigma)
if missing_data:
shapes = self.get_param_shapes(N, input_dim)
f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
else:
self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
Nj = N
f, fp = f_fp(self.x_opt)
self.model.grads = fp.copy()
step = self.momentum * step + self.learning_rate * fp
self.x_opt -= step
if self.messages == 2:
noise = self.model.likelihood._variance
status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
sys.stdout.write(status)
sys.stdout.flush()
self.param_traces['noise'].append(noise)
self.adapt_learning_rate(it+count, D)
NLL.append(f)
self.fopt_trace.append(NLL[-1])
# fig = plt.figure('traces')
# plt.clf()
# plt.plot(self.param_traces['noise'])
# for k in self.param_traces.keys():
# self.param_traces[k].append(self.model.get(k)[0])
self.grads.append(self.model.grads.tolist())
# should really be a sum(), but earlier samples in the iteration will have a very poor ll
self.f_opt = np.mean(NLL)
self.model.N = N
self.model.X = X
self.model.input_dim = D
self.model.likelihood.N = N
self.model.likelihood.input_dim = D
self.model.likelihood.Y = Y
sigma = self.model.likelihood._variance
self.model.likelihood._variance = None # invalidate cache
self.model.likelihood._set_params(sigma)
self.trace.append(self.f_opt)
if self.iteration_file is not None:
f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
data = [self.x_opt, self.fopt_trace, self.param_traces]
pickle.dump(data, f)
f.close()
if self.messages != 0:
sys.stdout.write('\r' + ' '*len(status)*2 + ' \r')
status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max())
sys.stdout.write(status)
sys.stdout.flush()
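
For reference, the 'adagrad' branch of adapt_learning_rate reduces to the per-parameter rule below (a standalone restatement; the helper name adagrad_rate and the argument eta are chosen here, while s_k, t0 = 100.0 and the 0.1 numerator come from the code above):

import numpy as np

def adagrad_rate(s_k, grad, eta=0.1, t0=100.0):
    # accumulate squared gradients, then shrink each parameter's learning rate accordingly
    s_k = s_k + np.square(grad)
    return eta / (t0 + np.sqrt(s_k)), s_k

s = np.zeros(3)
rate, s = adagrad_rate(s, np.array([1.0, -2.0, 0.5]))  # per-parameter learning rates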

336
GPy/likelihoods/ep.py Normal file

@@ -0,0 +1,336 @@
import numpy as np
from scipy import stats, linalg
from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot
from likelihood import likelihood
class EP(likelihood):
def __init__(self,data,LikelihoodFunction,epsilon=1e-3,power_ep=[1.,1.]):
"""
Expectation Propagation
Arguments
---------
epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
LikelihoodFunction : a likelihood function (see likelihood_functions.py)
"""
self.LikelihoodFunction = LikelihoodFunction
self.epsilon = epsilon
self.eta, self.delta = power_ep
self.data = data
self.N, self.input_dim = self.data.shape
self.is_heteroscedastic = True
self.Nparams = 0
self._transf_data = self.LikelihoodFunction._preprocess_values(data)
#Initial values - Likelihood approximation parameters:
#p(y|f) = t(f|tau_tilde,v_tilde)
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
#initial values for the GP variables
self.Y = np.zeros((self.N,1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)[:,None]
self.Z = 0
self.YYT = None
self.V = self.precision * self.Y
def restart(self):
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
self.Y = np.zeros((self.N,1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)[:,None]
self.Z = 0
self.YYT = None
self.V = self.precision * self.Y
def predictive_values(self,mu,var,full_cov):
if full_cov:
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
return self.LikelihoodFunction.predictive_values(mu,var)
def _get_params(self):
return np.zeros(0)
def _get_param_names(self):
return []
def _set_params(self,p):
pass # TODO: the EP likelihood might want to take some parameters...
def _gradients(self,partial):
return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
def _compute_GP_variables(self):
#Variables to be called from GP
mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
sigma_sum = 1./self.tau_ + 1./self.tau_tilde
mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
self.Y = mu_tilde[:,None]
self.YYT = np.dot(self.Y,self.Y.T)
self.covariance_matrix = np.diag(1./self.tau_tilde)
self.precision = self.tau_tilde[:,None]
self.V = self.precision * self.Y
def fit_full(self,K):
"""
The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006.
"""
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
mu = np.zeros(self.N)
Sigma = K.copy()
"""
Initial values - Cavity distribution parameters:
q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = self.epsilon + 1.
epsilon_np2 = self.epsilon + 1.
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i])))
mu = np.dot(Sigma,self.v_tilde)
self.iterations += 1
#Sigma recomputation with Cholesky decomposition
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
L = jitchol(B)
V,info = linalg.lapack.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
Sigma = K - np.dot(V.T,V)
mu = np.dot(Sigma,self.v_tilde)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables()
def fit_DTC(self, Kmm, Kmn):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see ... 2013.
"""
M = Kmm.shape[0]
#TODO: this doesn't work with uncertain inputs!
"""
Prior approximation parameters:
q(f|X) = int delta(f - Kfu*Kuu_inv*u) * N(u|0,Kuu) du = N(f|0,Sigma0)
Sigma0 = Qnn = Knm*Kmmi*Kmn
"""
KmnKnm = np.dot(Kmn,Kmn.T)
Lm = jitchol(Kmm)
Lmi = chol_inv(Lm)
Kmmi = np.dot(Lmi.T,Lmi)
KmmiKmn = np.dot(Kmmi,Kmn)
Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
LLT0 = Kmm.copy()
#Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
#KmnKnm = np.dot(Kmn, Kmn.T)
#KmmiKmn = np.dot(Kmmi,Kmn)
#Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
#LLT0 = Kmm.copy()
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*Gamma
"""
mu = np.zeros(self.N)
LLT = Kmm.copy()
Sigma_diag = Qnn_diag.copy()
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
np1 = [self.tau_tilde.copy()]
np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
L = jitchol(LLT)
#cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
V,info = linalg.lapack.flapack.dtrtrs(L,Kmn,lower=1)
Sigma_diag = np.sum(V*V,-2)
si = np.sum(V.T*V[:,i],-1)
mu += (Delta_v-Delta_tau*mu[i])*si
self.iterations += 1
#Sigma recomputation with Cholesky decomposition
LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
L = jitchol(LLT)
V,info = linalg.lapack.flapack.dtrtrs(L,Kmn,lower=1)
V2,info = linalg.lapack.flapack.dtrtrs(L.T,V,lower=0)
Sigma_diag = np.sum(V*V,-2)
Knmv_tilde = np.dot(Kmn,self.v_tilde)
mu = np.dot(V2.T,Knmv_tilde)
epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
np1.append(self.tau_tilde.copy())
np2.append(self.v_tilde.copy())
self._compute_GP_variables()
def fit_FITC(self, Kmm, Kmn, Knn_diag):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see Naish-Guzman and Holden, 2008.
"""
M = Kmm.shape[0]
"""
Prior approximation parameters:
q(f|X) = int N(f|Kfu*Kuu_inv*u, diag(Kff-Qff)) * N(u|0,Kuu) du = N(f|0,Sigma0)
Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
"""
Lm = jitchol(Kmm)
Lmi = chol_inv(Lm)
Kmmi = np.dot(Lmi.T,Lmi)
P0 = Kmn.T
KmnKnm = np.dot(P0.T, P0)
KmmiKmn = np.dot(Kmmi,P0.T)
Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
Diag0 = Knn_diag - Qnn_diag
R0 = jitchol(Kmmi).T
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*Gamma
"""
self.w = np.zeros(self.N)
self.Gamma = np.zeros(M)
mu = np.zeros(self.N)
P = P0.copy()
R = R0.copy()
Diag = Diag0.copy()
Sigma_diag = Knn_diag
RPT0 = np.dot(R0,P0.T)
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
dtd1 = Delta_tau*Diag[i] + 1.
dii = Diag[i]
Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
pi_ = P[i,:].reshape(1,M)
P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
Rp_i = np.dot(R,pi_.T)
RTR = np.dot(R.T,np.dot(np.eye(M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
R = jitchol(RTR).T
self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
mu = self.w + np.dot(P,self.Gamma)
self.iterations += 1
#Sigma recomputation with Cholesky decomposition
Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
Diag = Diag0 * Iplus_Dprod_i
P = Iplus_Dprod_i[:,None] * P0
safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
L = jitchol(np.eye(M) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
R,info = linalg.lapack.flapack.dtrtrs(L,R0,lower=1)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
self.w = Diag * self.v_tilde
self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
mu = self.w + np.dot(P,self.Gamma)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables()
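
The site updates above rely on DSYR from GPy.util.linalg; judging from the inline comment in fit_DTC, DSYR(A, x, alpha) performs the in-place symmetric rank-1 update A <- A + alpha * x * x^T. A plain-numpy reference version of that assumption:

import numpy as np

def dsyr_reference(A, x, alpha):
    # in-place symmetric rank-1 update: A <- A + alpha * outer(x, x)
    A += alpha * np.outer(x, x)

# e.g. the posterior update in fit_full,
#   DSYR(Sigma, Sigma[:, i].copy(), -Delta_tau / (1. + Delta_tau * Sigma[i, i]))
# amounts to
#   Sigma -= Delta_tau / (1. + Delta_tau * Sigma[i, i]) * np.outer(Sigma[:, i], Sigma[:, i])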


@@ -0,0 +1,93 @@
import numpy as np
from likelihood import likelihood
class Gaussian(likelihood):
"""
Gaussian likelihood (exact inference for regression; no EP approximation is needed)
:param data: observed output (N x D numpy.ndarray)
:param variance: noise variance, defaults to 1.
:param normalize: whether to normalize the data before computing (predictions will be in original scales)
:type normalize: False|True
"""
def __init__(self, data, variance=1., normalize=False):
self.is_heteroscedastic = False
self.Nparams = 1
self.Z = 0. # a correction factor which accounts for the approximation made
N, self.input_dim = data.shape
# normalization
if normalize:
self._offset = data.mean(0)[None, :]
self._scale = data.std(0)[None, :]
# Don't scale outputs which have zero variance to zero.
self._scale[np.nonzero(self._scale == 0.)] = 1.0e-3
else:
self._offset = np.zeros((1, self.input_dim))
self._scale = np.ones((1, self.input_dim))
self.set_data(data)
self._variance = np.asarray(variance) + 1. # offset so that the _set_params call below registers a change
self._set_params(np.asarray(variance))
def set_data(self, data):
self.data = data
self.N, D = data.shape
assert D == self.input_dim
self.Y = (self.data - self._offset) / self._scale
if D > self.N:
self.YYT = np.dot(self.Y, self.Y.T)
self.trYYT = np.trace(self.YYT)
else:
self.YYT = None
self.trYYT = np.sum(np.square(self.Y))
def _get_params(self):
return np.asarray(self._variance)
def _get_param_names(self):
return ["noise_variance"]
def _set_params(self, x):
x = np.float64(x)
if np.all(self._variance != x):
if x == 0.:
self.precision = None
self.V = None
else:
self.precision = 1. / x
self.V = (self.precision) * self.Y
self.covariance_matrix = np.eye(self.N) * x
self._variance = x
def predictive_values(self, mu, var, full_cov):
"""
Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval
"""
mean = mu * self._scale + self._offset
if full_cov:
if self.input_dim > 1:
raise NotImplementedError, "TODO"
# Note. for input_dim>1, we need to re-normalise all the outputs independently.
# This will mess up computations of diag(true_var), below.
# note that the upper, lower quantiles should be the same shape as mean
# Augment the output variance with the likelihood variance and rescale.
true_var = (var + np.eye(var.shape[0]) * self._variance) * self._scale ** 2
_5pc = mean - 2.*np.sqrt(np.diag(true_var))
_95pc = mean + 2.*np.sqrt(np.diag(true_var))
else:
true_var = (var + self._variance) * self._scale ** 2
_5pc = mean - 2.*np.sqrt(true_var)
_95pc = mean + 2.*np.sqrt(true_var)
return mean, true_var, _5pc, _95pc
def fit_full(self):
"""
No approximations needed
"""
pass
def _gradients(self, partial):
return np.sum(partial)
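
A minimal sketch of the Gaussian likelihood on its own; the import path is assumed from `from ..likelihoods import Gaussian` in sparse_gp.py above:

import numpy as np
from GPy.likelihoods import Gaussian

Y = np.random.randn(50, 2)
lik = Gaussian(Y, variance=0.1, normalize=True)
print lik._get_param_names()        # ['noise_variance']
lik._set_params(np.array(0.5))      # updating the variance recomputes precision, V and covariance_matrix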


@@ -0,0 +1,583 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import SparseGP
from ..likelihoods import Gaussian
from .. import kern
import itertools
from matplotlib.colors import colorConverter
from GPy.inference.optimization import SCG
from GPy.util import plot_latent
from GPy.models.gplvm import GPLVM
class BayesianGPLVM(SparseGP, GPLVM):
"""
Bayesian Gaussian Process Latent Variable Model
:param Y: observed data (np.ndarray) or GPy.likelihood
:type Y: np.ndarray| GPy.likelihood instance
:param input_dim: latent dimensionality
:type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', M=10,
Z=None, kernel=None, oldpsave=10, _debug=False,
**kwargs):
if type(likelihood_or_Y) is np.ndarray:
likelihood = Gaussian(likelihood_or_Y)
else:
likelihood = likelihood_or_Y
if X is None:
X = self.initialise_latent(init, input_dim, likelihood.Y)
self.init = init
if X_variance is None:
X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)
if Z is None:
Z = np.random.permutation(X.copy())[:M]
assert Z.shape[1] == X.shape[1]
if kernel is None:
kernel = kern.rbf(input_dim) + kern.white(input_dim)
self.oldpsave = oldpsave
self._oldps = []
self._debug = _debug
if self._debug:
self.f_call = 0
self._count = itertools.count()
self._savedklll = []
self._savedparams = []
self._savedgradients = []
self._savederrors = []
self._savedpsiKmm = []
self._savedABCD = []
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
self._set_params(self._get_params())
@property
def oldps(self):
return self._oldps
@oldps.setter
def oldps(self, p):
if len(self._oldps) == (self.oldpsave + 1):
self._oldps.pop()
# if len(self._oldps) == 0 or not np.any([np.any(np.abs(p - op) > 1e-5) for op in self._oldps]):
self._oldps.insert(0, p.copy())
def _get_param_names(self):
X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], [])
S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], [])
return (X_names + S_names + SparseGP._get_param_names(self))
def _get_params(self):
"""
Horizontally stacks the parameters in order to present them to the optimizer.
The resulting 1-D array has this structure:
===============================================================
| mu | S | Z | theta | beta |
===============================================================
"""
x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
return x
def _clipped(self, x):
return x # np.clip(x, -1e300, 1e300)
def _set_params(self, x, save_old=True, save_count=0):
# try:
x = self._clipped(x)
N, input_dim = self.N, self.input_dim
self.X = x[:self.X.size].reshape(N, input_dim).copy()
self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
SparseGP._set_params(self, x[(2 * N * input_dim):])
# self.oldps = x
# except (LinAlgError, FloatingPointError, ZeroDivisionError):
# print "\rWARNING: Caught LinAlgError, continueing without setting "
# if self._debug:
# self._savederrors.append(self.f_call)
# if save_count > 10:
# raise
# self._set_params(self.oldps[-1], save_old=False, save_count=save_count + 1)
def dKL_dmuS(self):
dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
dKL_dmu = self.X
return dKL_dmu, dKL_dS
def dL_dmuS(self):
dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance)
dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance)
dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
return dL_dmu, dL_dS
def KL_divergence(self):
var_mean = np.square(self.X).sum()
var_S = np.sum(self.X_variance - np.log(self.X_variance))
return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.N
def log_likelihood(self):
ll = SparseGP.log_likelihood(self)
kl = self.KL_divergence()
# if ll < -2E4:
# ll = -2E4 + np.random.randn()
# if kl > 5E4:
# kl = 5E4 + np.random.randn()
if self._debug:
self.f_call = self._count.next()
if self.f_call % 1 == 0:
self._savedklll.append([self.f_call, ll, kl])
self._savedparams.append([self.f_call, self._get_params()])
self._savedgradients.append([self.f_call, self._log_likelihood_gradients()])
self._savedpsiKmm.append([self.f_call, [self.Kmm, self.dL_dKmm]])
# sf2 = self.scale_factor ** 2
if self.likelihood.is_heteroscedastic:
A = -0.5 * self.N * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.V * self.likelihood.Y)
# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A) * sf2)
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
else:
A = -0.5 * self.N * self.input_dim * (np.log(2.*np.pi) + np.log(self.likelihood._variance)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A) * sf2)
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
self._savedABCD.append([self.f_call, A, B, C, D])
# print "\nkl:", kl, "ll:", ll
return ll - kl
def _log_likelihood_gradients(self):
dKL_dmu, dKL_dS = self.dKL_dmuS()
dL_dmu, dL_dS = self.dL_dmuS()
# TODO: find way to make faster
d_dmu = (dL_dmu - dKL_dmu).flatten()
d_dS = (dL_dS - dKL_dS).flatten()
# TEST KL: ====================
# d_dmu = (dKL_dmu).flatten()
# d_dS = (dKL_dS).flatten()
# ========================
# TEST L: ====================
# d_dmu = (dL_dmu).flatten()
# d_dS = (dL_dS).flatten()
# ========================
self.dbound_dmuS = np.hstack((d_dmu, d_dS))
self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
return self._clipped(np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta)))
def plot_latent(self, *args, **kwargs):
return plot_latent.plot_latent_indices(self, *args, **kwargs)
def do_test_latents(self, Y):
"""
Compute the latent representation for a set of new points Y
Notes:
This will only work with a univariate Gaussian likelihood (for now)
"""
assert not self.likelihood.is_heteroscedastic
N_test = Y.shape[0]
input_dim = self.Z.shape[1]
means = np.zeros((N_test, input_dim))
covars = np.zeros((N_test, input_dim))
dpsi0 = -0.5 * self.input_dim * self.likelihood.precision
dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
V = self.likelihood.precision * Y
dpsi1 = np.dot(self.Cpsi1V, V.T)
start = np.zeros(self.input_dim * 2)
for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
args = (self.kern, self.Z, dpsi0, dpsi1_n, dpsi2)
xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
mu, log_S = xopt.reshape(2, 1, -1)
means[n] = mu[0].copy()
covars[n] = np.exp(log_S[0]).copy()
return means, covars
def plot_X_1d(self, fignum=None, ax=None, colors=None):
"""
Plot latent space X in 1D:
-if fig is given, create input_dim subplots in fig and plot in these
-if ax is given plot input_dim 1D latent space plots of X into each `axis`
-if neither fig nor ax is given create a figure with fignum and plot in there
colors:
colors for each of the input_dim latent space dimensions
"""
import pylab
if ax is None:
fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1]))))
if colors is None:
colors = pylab.gca()._get_lines.color_cycle
pylab.clf()
else:
colors = iter(colors)
plots = []
x = np.arange(self.X.shape[0])
for i in range(self.X.shape[1]):
if ax is None:
a = fig.add_subplot(self.X.shape[1], 1, i + 1)
elif isinstance(ax, (tuple, list)):
a = ax[i]
else:
raise ValueError("Need one axis per latent dimension")
a.plot(self.X, c='k', alpha=.3)
plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
a.fill_between(x,
self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
facecolor=plots[-1].get_color(),
alpha=.3)
a.legend(borderaxespad=0.)
a.set_xlim(x.min(), x.max())
if i < self.X.shape[1] - 1:
a.set_xticklabels('')
pylab.draw()
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
return fig
def __getstate__(self):
return (self.likelihood, self.input_dim, self.X, self.X_variance,
self.init, self.num_inducing, self.Z, self.kern,
self.oldpsave, self._debug)
def __setstate__(self, state):
self.__init__(*state)
def _debug_filter_params(self, x):
start, end = 0, self.X.size,
X = x[start:end].reshape(self.N, self.input_dim)
start, end = end, end + self.X_variance.size
X_v = x[start:end].reshape(self.N, self.input_dim)
start, end = end, end + (self.num_inducing * self.input_dim)
Z = x[start:end].reshape(self.num_inducing, self.input_dim)
start, end = end, end + self.input_dim
theta = x[start:]
return X, X_v, Z, theta
def _debug_get_axis(self, figs):
if figs[-1].axes:
ax1 = figs[-1].axes[0]
ax1.cla()
else:
ax1 = figs[-1].add_subplot(111)
return ax1
def _debug_plot(self):
assert self._debug, "must enable _debug, to debug-plot"
import pylab
# from mpl_toolkits.mplot3d import Axes3D
figs = [pylab.figure('BGPLVM DEBUG', figsize=(12, 4))]
# fig.clf()
# log like
# splotshape = (6, 4)
# ax1 = pylab.subplot2grid(splotshape, (0, 0), 1, 4)
ax1 = self._debug_get_axis(figs)
ax1.text(.5, .5, "Optimization", alpha=.3, transform=ax1.transAxes,
ha='center', va='center')
kllls = np.array(self._savedklll)
LL, = ax1.plot(kllls[:, 0], kllls[:, 1] - kllls[:, 2], '-', label=r'$\log p(\mathbf{Y})$', mew=1.5)
KL, = ax1.plot(kllls[:, 0], kllls[:, 2], '-', label=r'$\mathcal{KL}(p||q)$', mew=1.5)
L, = ax1.plot(kllls[:, 0], kllls[:, 1], '-', label=r'$L$', mew=1.5) # \mathds{E}_{q(\mathbf{X})}[p(\mathbf{Y|X})\frac{p(\mathbf{X})}{q(\mathbf{X})}]
param_dict = dict(self._savedparams)
gradient_dict = dict(self._savedgradients)
# kmm_dict = dict(self._savedpsiKmm)
iters = np.array(param_dict.keys())
ABCD_dict = np.array(self._savedABCD)
self.showing = 0
# ax2 = pylab.subplot2grid(splotshape, (1, 0), 2, 4)
figs.append(pylab.figure("BGPLVM DEBUG X", figsize=(12, 4)))
ax2 = self._debug_get_axis(figs)
ax2.text(.5, .5, r"$\mathbf{X}$", alpha=.5, transform=ax2.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(0, 0, 1, .86))
# ax3 = pylab.subplot2grid(splotshape, (3, 0), 2, 4, sharex=ax2)
figs.append(pylab.figure("BGPLVM DEBUG S", figsize=(12, 4)))
ax3 = self._debug_get_axis(figs)
ax3.text(.5, .5, r"$\mathbf{S}$", alpha=.5, transform=ax3.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(0, 0, 1, .86))
# ax4 = pylab.subplot2grid(splotshape, (5, 0), 2, 2)
figs.append(pylab.figure("BGPLVM DEBUG Z", figsize=(6, 4)))
ax4 = self._debug_get_axis(figs)
ax4.text(.5, .5, r"$\mathbf{Z}$", alpha=.5, transform=ax4.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(0, 0, 1, .86))
# ax5 = pylab.subplot2grid(splotshape, (5, 2), 2, 2)
figs.append(pylab.figure("BGPLVM DEBUG theta", figsize=(6, 4)))
ax5 = self._debug_get_axis(figs)
ax5.text(.5, .5, r"${\theta}$", alpha=.5, transform=ax5.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(.15, 0, 1, .86))
# figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
# fig = figs[-1]
# ax6 = fig.add_subplot(121)
# ax6.text(.5, .5, r"${\mathbf{K}_{mm}}$", color='magenta', alpha=.5, transform=ax6.transAxes,
# ha='center', va='center')
# ax7 = fig.add_subplot(122)
# ax7.text(.5, .5, r"${\frac{dL}{dK_{mm}}}$", color='magenta', alpha=.5, transform=ax7.transAxes,
# ha='center', va='center')
figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
fig = figs[-1]
ax8 = fig.add_subplot(121)
ax8.text(.5, .5, r"${\mathbf{A,B,C,input_dim}}$", color='k', alpha=.5, transform=ax8.transAxes,
ha='center', va='center')
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 1], label='A')
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 2], label='B')
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 3], label='C')
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 4], label='D')
ax8.legend()
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(.15, 0, 1, .86))
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
quiver_units = 'xy'
quiver_scale = 1
quiver_scale_units = 'xy'
Xlatentplts = ax2.plot(X, ls="-", marker="x")
colors = colorConverter.to_rgba_array([p.get_color() for p in Xlatentplts], .4)
Ulatent = np.zeros_like(X)
xlatent = np.tile(np.arange(0, X.shape[0])[:, None], X.shape[1])
Xlatentgrads = ax2.quiver(xlatent, X, Ulatent, Xg, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale)
Slatentplts = ax3.plot(S, ls="-", marker="x")
Slatentgrads = ax3.quiver(xlatent, S, Ulatent, Sg, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale)
ax3.set_ylim(0, 1.)
xZ = np.tile(np.arange(0, Z.shape[0])[:, None], Z.shape[1])
UZ = np.zeros_like(Z)
Zplts = ax4.plot(Z, ls="-", marker="x")
Zgrads = ax4.quiver(xZ, Z, UZ, Zg, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale)
xtheta = np.arange(len(theta))
Utheta = np.zeros_like(theta)
thetaplts = ax5.bar(xtheta - .4, theta, color=colors)
thetagrads = ax5.quiver(xtheta, theta, Utheta, thetag, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale,
edgecolors=('k',), linewidths=[1])
pylab.setp(thetaplts, zorder=0)
pylab.setp(thetagrads, zorder=10)
ax5.set_xticks(np.arange(len(theta)))
ax5.set_xticklabels(self._get_param_names()[-len(theta):], rotation=17)
# imkmm = ax6.imshow(kmm_dict[self.showing][0])
# from mpl_toolkits.axes_grid1 import make_axes_locatable
# divider = make_axes_locatable(ax6)
# caxkmm = divider.append_axes("right", "5%", pad="1%")
# cbarkmm = pylab.colorbar(imkmm, cax=caxkmm)
#
# imkmmdl = ax7.imshow(kmm_dict[self.showing][1])
# divider = make_axes_locatable(ax7)
# caxkmmdl = divider.append_axes("right", "5%", pad="1%")
# cbarkmmdl = pylab.colorbar(imkmmdl, cax=caxkmmdl)
# input_dimleg = ax1.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
# loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.15, 1, 1.15),
# borderaxespad=0, mode="expand")
ax2.legend(Xlatentplts, [r"$X_{}$".format(i + 1) for i in range(self.input_dim)],
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
ax3.legend(Xlatentplts, [r"$X_{}$".format(i + 1) for i in range(self.input_dim)],
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
ax4.legend(Xlatentplts, [r"$X_{}$".format(i + 1) for i in range(self.input_dim)],
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
ax5.legend(Xlatentplts, [r"$X_{}$".format(i + 1) for i in range(self.input_dim)],
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
Lleg = ax1.legend()
Lleg.draggable()
# ax1.add_artist(input_dimleg)
indicatorKL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 2], 'o', c=KL.get_color())
indicatorLL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1] - kllls[self.showing, 2], 'o', c=LL.get_color())
indicatorL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1], 'o', c=L.get_color())
# for err in self._savederrors:
# if err < kllls.shape[0]:
# ax1.scatter(kllls[err, 0], kllls[err, 2], s=50, marker=(5, 2), c=KL.get_color())
# ax1.scatter(kllls[err, 0], kllls[err, 1] - kllls[err, 2], s=50, marker=(5, 2), c=LL.get_color())
# ax1.scatter(kllls[err, 0], kllls[err, 1], s=50, marker=(5, 2), c=L.get_color())
# try:
# for f in figs:
# f.canvas.draw()
# f.tight_layout(box=(0, .15, 1, .9))
# # pylab.draw()
# # pylab.tight_layout(box=(0, .1, 1, .9))
# except:
# pass
# parameter changes
# ax2 = pylab.subplot2grid((4, 1), (1, 0), 3, 1, projection='3d')
button_options = [0, 0] # [0]: clicked -- [1]: dragged
def update_plots(event):
if button_options[0] and not button_options[1]:
# event.button, event.x, event.y, event.xdata, event.ydata)
tmp = np.abs(iters - event.xdata)
closest_hit = iters[tmp == tmp.min()][0]
if closest_hit != self.showing:
self.showing = closest_hit
# print closest_hit, iters, event.xdata
indicatorLL.set_data(self.showing, kllls[self.showing, 1] - kllls[self.showing, 2])
indicatorKL.set_data(self.showing, kllls[self.showing, 2])
indicatorL.set_data(self.showing, kllls[self.showing, 1])
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
for i, Xlatent in enumerate(Xlatentplts):
Xlatent.set_ydata(X[:, i])
Xlatentgrads.set_offsets(np.array([xlatent.ravel(), X.ravel()]).T)
Xlatentgrads.set_UVC(Ulatent, Xg)
for i, Slatent in enumerate(Slatentplts):
Slatent.set_ydata(S[:, i])
Slatentgrads.set_offsets(np.array([xlatent.ravel(), S.ravel()]).T)
Slatentgrads.set_UVC(Ulatent, Sg)
for i, Zlatent in enumerate(Zplts):
Zlatent.set_ydata(Z[:, i])
Zgrads.set_offsets(np.array([xZ.ravel(), Z.ravel()]).T)
Zgrads.set_UVC(UZ, Zg)
for p, t in zip(thetaplts, theta):
p.set_height(t)
thetagrads.set_offsets(np.array([xtheta.ravel(), theta.ravel()]).T)
thetagrads.set_UVC(Utheta, thetag)
# imkmm.set_data(kmm_dict[self.showing][0])
# imkmm.autoscale()
# cbarkmm.update_normal(imkmm)
#
# imkmmdl.set_data(kmm_dict[self.showing][1])
# imkmmdl.autoscale()
# cbarkmmdl.update_normal(imkmmdl)
ax2.relim()
# ax3.relim()
ax4.relim()
ax5.relim()
ax2.autoscale()
# ax3.autoscale()
ax4.autoscale()
ax5.autoscale()
[fig.canvas.draw() for fig in figs]
button_options[0] = 0
button_options[1] = 0
def onclick(event):
if event.inaxes is ax1 and event.button == 1:
button_options[0] = 1
def motion(event):
if button_options[0]:
button_options[1] = 1
cidr = figs[0].canvas.mpl_connect('button_release_event', update_plots)
cidp = figs[0].canvas.mpl_connect('button_press_event', onclick)
cidd = figs[0].canvas.mpl_connect('motion_notify_event', motion)
return ax1, ax2, ax3, ax4, ax5 # , ax6, ax7
def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
objective function for fitting the latent variables for test points
(negative log-likelihood: should be minimised!)
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
psi0 = kern.psi0(Z, mu, S)
psi1 = kern.psi1(Z, mu, S)
psi2 = kern.psi2(Z, mu, S)
lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
dmu = mu0 + mu1 + mu2 - mu
# dS = S0 + S1 + S2 -0.5 + .5/S
dlnS = S * (S0 + S1 + S2 - 0.5) + .5
return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
This is the same as latent_cost_and_grad but only for the objective
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
psi0 = kern.psi0(Z, mu, S)
psi1 = kern.psi1(Z, mu, S)
psi2 = kern.psi2(Z, mu, S)
lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
return -float(lik)
def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
This is the same as latent_cost_and_grad but only for the grad
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
dmu = mu0 + mu1 + mu2 - mu
# dS = S0 + S1 + S2 -0.5 + .5/S
dlnS = S * (S0 + S1 + S2 - 0.5) + .5
return -np.hstack((dmu.flatten(), dlnS.flatten()))
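# --- Illustrative check (not part of the original commit) ---------------------
# A minimal sketch of how latent_cost and latent_grad above could be verified
# against each other with a finite-difference gradient check. The kernel `kern`,
# inducing inputs `Z` and the dL_dpsi* arrays are assumed to come from a fitted
# model (e.g. self.kern, self.Z and the quantities built in do_test_latents);
# `input_dim` is the latent dimensionality. This helper is hypothetical.
def _check_latent_gradients(kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2, input_dim):
    from scipy.optimize import check_grad
    x0 = np.random.randn(2 * input_dim) * 0.1  # stacked [mu, log_S], as used by SCG above
    err = check_grad(latent_cost, latent_grad, x0,
                     kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2)
    return err  # should be close to zero if cost and gradient are consistent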

252
GPy/models/fitc.py Normal file
View file

@ -0,0 +1,252 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify, pdinv
from ..util.plot import gpplot
from .. import kern
from scipy import stats, linalg
from GPy.core.sparse_gp import SparseGP
def backsub_both_sides(L, X):
""" Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
tmp, _ = linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(X), lower=1, trans=1)
return linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(tmp.T), lower=1, trans=1)[0].T
class FITC(SparseGP):
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
super(FITC, self).__init__(X, likelihood, kernel, Z=Z, X_variance=X_variance, normalize_X=normalize_X)
def update_likelihood_approximation(self):
"""
Approximates a non-gaussian likelihood using Expectation Propagation
For a Gaussian (or direct: TODO) likelihood, no iteration is required:
this function does nothing
Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in SparseGP.
The true precision is now 'true_precision' not 'precision'.
"""
if self.has_uncertain_inputs:
raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
else:
self.likelihood.fit_FITC(self.Kmm, self.psi1, self.psi0)
self._set_params(self._get_params()) # update the GP
def _computations(self):
# factor Kmm
self.Lm = jitchol(self.Kmm)
self.Lmi, info = linalg.lapack.flapack.dtrtrs(self.Lm, np.eye(self.num_inducing), lower=1)
Lmipsi1 = np.dot(self.Lmi, self.psi1)
self.Qnn = np.dot(Lmipsi1.T, Lmipsi1).copy()
self.Diag0 = self.psi0 - np.diag(self.Qnn)
self.beta_star = self.likelihood.precision / (1. + self.likelihood.precision * self.Diag0[:, None]) # Includes Diag0 in the precision
self.V_star = self.beta_star * self.likelihood.Y
# The rather complex computations of self.A
if self.has_uncertain_inputs:
raise NotImplementedError
else:
if self.likelihood.is_heteroscedastic:
assert self.likelihood.input_dim == 1
tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.N)))
tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
self.A = tdot(tmp)
# factor B
self.B = np.eye(self.num_inducing) + self.A
self.LB = jitchol(self.B)
self.LBi = chol_inv(self.LB)
self.psi1V = np.dot(self.psi1, self.V_star)
Lmi_psi1V, info = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
self._LBi_Lmi_psi1V, _ = linalg.lapack.flapack.dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0)
Kmmipsi1 = np.dot(self.Lmi.T, Lmipsi1)
b_psi1_Ki = self.beta_star * Kmmipsi1.T
Ki_pbp_Ki = np.dot(Kmmipsi1, b_psi1_Ki)
Kmmi = np.dot(self.Lmi.T, self.Lmi)
LBiLmi = np.dot(self.LBi, self.Lmi)
LBL_inv = np.dot(LBiLmi.T, LBiLmi)
VVT = np.outer(self.V_star, self.V_star)
VV_p_Ki = np.dot(VVT, Kmmipsi1.T)
Ki_pVVp_Ki = np.dot(Kmmipsi1, VV_p_Ki)
psi1beta = self.psi1 * self.beta_star.T
H = self.Kmm + mdot(self.psi1, psi1beta.T)
LH = jitchol(H)
LHi = chol_inv(LH)
Hi = np.dot(LHi.T, LHi)
betapsi1TLmiLBi = np.dot(psi1beta.T, LBiLmi.T)
alpha = np.array([np.dot(a.T, a) for a in betapsi1TLmiLBi])[:, None]
gamma_1 = mdot(VVT, self.psi1.T, Hi)
pHip = mdot(self.psi1.T, Hi, self.psi1)
gamma_2 = mdot(self.beta_star * pHip, self.V_star)
gamma_3 = self.V_star * gamma_2
self._dL_dpsi0 = -0.5 * self.beta_star # dA_dpsi0: logdet(self.beta_star)
self._dL_dpsi0 += .5 * self.V_star ** 2 # dA_psi0: yT*beta_star*y
self._dL_dpsi0 += .5 * alpha # dC_dpsi0
self._dL_dpsi0 += 0.5 * mdot(self.beta_star * pHip, self.V_star) ** 2 - self.V_star * mdot(self.V_star.T, pHip * self.beta_star).T # dD_dpsi0
self._dL_dpsi1 = b_psi1_Ki.copy() # dA_dpsi1: logdet(self.beta_star)
self._dL_dpsi1 += -np.dot(psi1beta.T, LBL_inv) # dC_dpsi1
self._dL_dpsi1 += gamma_1 - mdot(psi1beta.T, Hi, self.psi1, gamma_1) # dD_dpsi1
self._dL_dKmm = -0.5 * np.dot(Kmmipsi1, b_psi1_Ki) # dA_dKmm: logdet(self.beta_star)
self._dL_dKmm += .5 * (LBL_inv - Kmmi) + mdot(LBL_inv, psi1beta, Kmmipsi1.T) # dC_dKmm
self._dL_dKmm += -.5 * mdot(Hi, self.psi1, gamma_1) # dD_dKmm
self._dpsi1_dtheta = 0
self._dpsi1_dX = 0
self._dKmm_dtheta = 0
self._dKmm_dX = 0
self._dpsi1_dX_jkj = 0
self._dpsi1_dtheta_jkj = 0
for i, V_n, alpha_n, gamma_n, gamma_k in zip(range(self.N), self.V_star, alpha, gamma_2, gamma_3):
K_pp_K = np.dot(Kmmipsi1[:, i:(i + 1)], Kmmipsi1[:, i:(i + 1)].T)
# Diag_dpsi1 = Diag_dA_dpsi1: yT*beta_star*y + Diag_dC_dpsi1 +Diag_dD_dpsi1
_dpsi1 = (-V_n ** 2 - alpha_n + 2.*gamma_k - gamma_n ** 2) * Kmmipsi1.T[i:(i + 1), :]
# Diag_dKmm = Diag_dA_dKmm: yT*beta_star*y +Diag_dC_dKmm +Diag_dD_dKmm
_dKmm = .5 * (V_n ** 2 + alpha_n + gamma_n ** 2 - 2.*gamma_k) * K_pp_K # Diag_dD_dKmm
self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1, self.X[i:i + 1, :], self.Z)
self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm, self.Z)
self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm , self.Z)
self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T, self.Z, self.X[i:i + 1, :])
# the partial derivative vector for the likelihood
if self.likelihood.Nparams == 0:
# save computation here.
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
raise NotImplementedError, "heteroscedatic derivates not implemented"
else:
# likelihood is not heteroscedastic
dbstar_dnoise = self.likelihood.precision * (self.beta_star ** 2 * self.Diag0[:, None] - self.beta_star)
Lmi_psi1 = mdot(self.Lmi, self.psi1)
LBiLmipsi1 = np.dot(self.LBi, Lmi_psi1)
aux_0 = np.dot(self._LBi_Lmi_psi1V.T, LBiLmipsi1)
aux_1 = self.likelihood.Y.T * np.dot(self._LBi_Lmi_psi1V.T, LBiLmipsi1)
aux_2 = np.dot(LBiLmipsi1.T, self._LBi_Lmi_psi1V)
dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise / self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y ** 2 * dbstar_dnoise)
dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T, self.LBi, Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
dD_dnoise_1 = mdot(self.V_star * LBiLmipsi1.T, LBiLmipsi1 * dbstar_dnoise.T * self.likelihood.Y.T)
alpha = mdot(LBiLmipsi1, self.V_star)
alpha_ = mdot(LBiLmipsi1.T, alpha)
dD_dnoise_2 = -0.5 * self.input_dim * np.sum(alpha_ ** 2 * dbstar_dnoise)
dD_dnoise_1 = mdot(self.V_star.T, self.psi1.T, self.Lmi.T, self.LBi.T, self.LBi, self.Lmi, self.psi1, dbstar_dnoise * self.likelihood.Y)
dD_dnoise_2 = 0.5 * mdot(self.V_star.T, self.psi1.T, Hi, self.psi1, dbstar_dnoise * self.psi1.T, Hi, self.psi1, self.V_star)
dD_dnoise = dD_dnoise_1 + dD_dnoise_2
self.partial_for_likelihood = dA_dnoise + dC_dnoise + dD_dnoise
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
A = -0.5 * self.N * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
C = -self.input_dim * (np.sum(np.log(np.diag(self.LB))))
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
return A + C + D
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
def dL_dtheta(self):
if self.has_uncertain_inputs:
raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
else:
dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0, self.X)
dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1, self.X, self.Z)
dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm, X=self.Z)
dL_dtheta += self._dKmm_dtheta
dL_dtheta += self._dpsi1_dtheta
return dL_dtheta
def dL_dZ(self):
if self.has_uncertain_inputs:
raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
else:
dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T, self.Z, self.X)
dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm, X=self.Z)
dL_dZ += self._dpsi1_dX
dL_dZ += self._dKmm_dX
return dL_dZ
def _raw_predict(self, Xnew, which_parts, full_cov=False):
if self.likelihood.is_heteroscedastic:
Iplus_Dprod_i = 1. / (1. + self.Diag0 * self.likelihood.precision.flatten())
self.Diag = self.Diag0 * Iplus_Dprod_i
self.P = Iplus_Dprod_i[:, None] * self.psi1.T
self.RPT0 = np.dot(self.Lmi, self.psi1)
self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0, ((1. - Iplus_Dprod_i) / self.Diag0)[:, None] * self.RPT0.T))
self.R, info = linalg.flapack.dtrtrs(self.L, self.Lmi, lower=1)
self.RPT = np.dot(self.R, self.P.T)
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T, self.RPT)
self.w = self.Diag * self.likelihood.v_tilde
self.Gamma = np.dot(self.R.T, np.dot(self.RPT, self.likelihood.v_tilde))
self.mu = self.w + np.dot(self.P, self.Gamma)
"""
Make a prediction for the generalized FITC model
Arguments
---------
X : Input prediction data - Nx1 numpy array (floats)
"""
# q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
# Ci = I + (RPT0)Di(RPT0).T
# C = I - [RPT0] * (D + [RPT0].T*[RPT0])^-1 * [RPT0].T    (D = diag(Diag0))
# = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
# = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
# = I - V.T * V
U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
V, info = linalg.flapack.dtrtrs(U, self.RPT0.T, lower=1)
C = np.eye(self.num_inducing) - np.dot(V.T, V)
mu_u = np.dot(C, self.RPT0) * (1. / self.Diag0[None, :])
# self.C = C
# self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
# self.mu_u = mu_u
# self.U = U
# q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
mu_H = np.dot(mu_u, self.mu)
self.mu_H = mu_H
Sigma_H = C + np.dot(mu_u, np.dot(self.Sigma, mu_u.T))
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
KR0T = np.dot(Kx.T, self.Lmi.T)
mu_star = np.dot(KR0T, mu_H)
if full_cov:
Kxx = self.kern.K(Xnew, which_parts=which_parts)
var = Kxx + np.dot(KR0T, np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T))
else:
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
var = (Kxx + np.sum(KR0T.T * np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T), 0))[:, None]
return mu_star[:, None], var
else:
raise NotImplementedError, "homoscedastic fitc not implemented"
"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
return mu,var[:,None]
"""

View file

@ -0,0 +1,221 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
from ..util.plot import gpplot
from .. import kern
from scipy import stats, linalg
from ..core import SparseGP
def backsub_both_sides(L,X):
""" Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
tmp,_ = linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(X),lower=1,trans=1)
return linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(tmp.T),lower=1,trans=1)[0].T
class GeneralizedFITC(SparseGP):
"""
Implementation of EP with the FITC approximation, following Naish-Guzman, A. and Holden, S. (2008).
:param X: inputs
:type X: np.ndarray (N x input_dim)
:param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP)
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param X_variance: The variance in the measurements of X (Gaussian variance)
:type X_variance: np.ndarray (N x input_dim) | None
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (num_inducing x input_dim) | None
:param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type num_inducing: int
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
self.Z = Z
self.num_inducing = self.Z.shape[0]
self.true_precision = likelihood.precision
super(GeneralizedFITC, self).__init__(X, likelihood, kernel=kernel, Z=self.Z, X_variance=X_variance, normalize_X=normalize_X)
self._set_params(self._get_params())
def _set_params(self, p):
self.Z = p[:self.num_inducing*self.input_dim].reshape(self.num_inducing, self.input_dim)
self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
self._compute_kernel_matrices()
self._computations()
self._FITC_computations()
def update_likelihood_approximation(self):
"""
Approximates a non-gaussian likelihood using Expectation Propagation
For a Gaussian (or direct: TODO) likelihood, no iteration is required:
this function does nothing
Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in SparseGP.
The true precision is now 'true_precision' not 'precision'.
"""
if self.has_uncertain_inputs:
raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
else:
self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
self.true_precision = self.likelihood.precision # Save the true precision
self.likelihood.precision = self.true_precision/(1. + self.true_precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
self._set_params(self._get_params()) # update the GP
def _FITC_computations(self):
"""
FITC approximation doesn't have the correction term in the log-likelihood bound,
but adds a diagonal term to the covariance matrix: diag(Knn - Qnn).
This function:
- computes the FITC diagonal term
- removes the extra terms computed in the SparseGP approximation
- computes the likelihood gradients wrt the true precision.
"""
#NOTE the true precision is now 'true_precision' not 'precision'
if self.likelihood.is_heteroscedastic:
# Compute generalized FITC's diagonal term of the covariance
self.Lmi,info = linalg.lapack.flapack.dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1)
Lmipsi1 = np.dot(self.Lmi,self.psi1)
self.Qnn = np.dot(Lmipsi1.T,Lmipsi1)
#self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm)
#self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
self.Diag0 = self.psi0 - np.diag(self.Qnn)
Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.true_precision.flatten())
self.Diag = self.Diag0 * Iplus_Dprod_i
self.P = Iplus_Dprod_i[:,None] * self.psi1.T
self.RPT0 = np.dot(self.Lmi,self.psi1)
self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
self.w = self.Diag * self.likelihood.v_tilde
self.Gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
self.mu = self.w + np.dot(self.P,self.Gamma)
# Remove extra term from dL_dpsi1
self.dL_dpsi1 -= mdot(self.Lmi.T,Lmipsi1*self.likelihood.precision.flatten().reshape(1,self.N))
#self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm)
#self.dL_dpsi1 -= mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
#self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
else:
raise NotImplementedError, "homoscedastic fitc not implemented"
# Remove extra term from dL_dpsi1
#self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
sf = self.scale_factor
sf2 = sf**2
# Remove extra term from dL_dKmm
self.dL_dKmm += 0.5 * self.input_dim * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
self.dL_dpsi0 = None
#the partial derivative vector for the likelihood
if self.likelihood.Nparams == 0:
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
raise NotImplementedError, "heteroscedastic derivates not implemented"
else:
raise NotImplementedError, "homoscedastic derivatives not implemented"
#likelihood is not heteroscedastic
#self.partial_for_likelihood = - 0.5 * self.N*self.input_dim*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
#self.partial_for_likelihood += 0.5 * self.input_dim * trace_dot(self.Bi,self.A)*self.likelihood.precision
#self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
#TODO partial derivative vector for the likelihood not implemented
def dL_dtheta(self):
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
if self.has_uncertain_inputs:
raise NotImplementedError, "heteroscedatic derivates not implemented"
else:
#NOTE in SparseGP this would include the gradient wrt psi0
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1,self.Z,self.X)
return dL_dtheta
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
sf2 = self.scale_factor**2
if self.likelihood.is_heteroscedastic:
A = -0.5*self.N*self.input_dim*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
else:
A = -0.5*self.N*self.input_dim*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
C = -self.input_dim * (np.sum(np.log(np.diag(self.LB))) + 0.5*self.num_inducing*np.log(sf2))
#C = -0.5*self.input_dim * (self.B_logdet + self.num_inducing*np.log(sf2))
D = 0.5*np.sum(np.square(self._LBi_Lmi_psi1V))
#self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
#D_ = 0.5*np.trace(self.Cpsi1VVpsi1)
return A+C+D
def _raw_predict(self, Xnew, which_parts, full_cov=False):
if self.likelihood.is_heteroscedastic:
"""
Make a prediction for the generalized FITC model
Arguments
---------
X : Input prediction data - Nx1 numpy array (floats)
"""
# q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
# Ci = I + (RPT0)Di(RPT0).T
# C = I - [RPT0] * (D + [RPT0].T*[RPT0])^-1 * [RPT0].T    (D = diag(Diag0))
# = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
# = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
# = I - V.T * V
U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
C = np.eye(self.num_inducing) - np.dot(V.T,V)
mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
#self.C = C
#self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
#self.mu_u = mu_u
#self.U = U
# q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
mu_H = np.dot(mu_u,self.mu)
self.mu_H = mu_H
Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
KR0T = np.dot(Kx.T,self.Lmi.T)
mu_star = np.dot(KR0T,mu_H)
if full_cov:
Kxx = self.kern.K(Xnew,which_parts=which_parts)
var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T))
else:
Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
Kxx_ = self.kern.K(Xnew,which_parts=which_parts) # TODO: RA, is this line needed?
var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T)) # TODO: RA, is this line needed?
var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T),0))[:,None]
return mu_star[:,None],var
else:
raise NotImplementedError, "homoscedastic fitc not implemented"
"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
return mu,var[:,None]
"""

View file

@ -0,0 +1,41 @@
# Copyright (c) 2013, Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import GP
from .. import likelihoods
from .. import kern
class GP_classification(GP):
"""
Gaussian Process classification
This is a thin wrapper around the models.GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
:param kernel: a GPy kernel, defaults to rbf
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the observed values before computing (predictions will be in original scales)
:type normalize_Y: False|True
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self,X,Y=None,likelihood=None,kernel=None,normalize_X=False,normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
if likelihood is None:
distribution = likelihoods.likelihood_functions.Binomial()
likelihood = likelihoods.EP(Y, distribution)
elif Y is not None:
if not all(Y.flatten() == likelihood.data.flatten()):
raise Warning, 'likelihood.data and Y are different.'
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())

View file

@ -0,0 +1,35 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import GP
from .. import likelihoods
from .. import kern
class GPRegression(GP):
"""
Gaussian Process model for regression
This is a thin wrapper around the models.GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the observed values before computing (predictions will be in original scales)
:type normalize_Y: False|True
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())
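# --- Illustrative usage (not part of the original file) -----------------------
# A minimal sketch of fitting GPRegression on noisy 1D data and predicting at
# new inputs. optimize() is assumed to come from the model base class; predict()
# returning (mean, variance, upper, lower) follows its use in GPLVM.plot
# elsewhere in this commit.
def _example_regression():
    X = np.linspace(0, 1, 50)[:, None]
    Y = np.sin(6 * X) + 0.1 * np.random.randn(50, 1)
    m = GPRegression(X, Y)
    m.optimize()                              # assumed: hyperparameter optimization (base model method)
    Xnew = np.linspace(0, 1, 200)[:, None]
    mu, var, upper, lower = m.predict(Xnew)   # predictive mean, variance and quantiles
    return m, mu, var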

67
GPy/models/gplvm.py Normal file
View file

@ -0,0 +1,67 @@
### Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
import sys, pdb
from .. import kern
from ..core import model
from ..util.linalg import pdinv, PCA
from ..core import GP
from ..likelihoods import Gaussian
from .. import util
from GPy.util import plot_latent
class GPLVM(GP):
"""
Gaussian Process Latent Variable Model
:param Y: observed data
:type Y: np.ndarray
:param input_dim: latent dimensionality
:type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, Y, input_dim, init='PCA', X = None, kernel=None, normalize_Y=False):
if X is None:
X = self.initialise_latent(init, input_dim, Y)
if kernel is None:
kernel = kern.rbf(input_dim, ARD=input_dim>1) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2))
likelihood = Gaussian(Y, normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=False)
self._set_params(self._get_params())
def initialise_latent(self, init, input_dim, Y):
if init == 'PCA':
return PCA(Y, input_dim)[0]
else:
return np.random.randn(Y.shape[0], input_dim)
def _get_param_names(self):
return sum([['X_%i_%i'%(n,q) for q in range(self.input_dim)] for n in range(self.N)],[]) + GP._get_param_names(self)
def _get_params(self):
return np.hstack((self.X.flatten(), GP._get_params(self)))
def _set_params(self,x):
self.X = x[:self.N*self.input_dim].reshape(self.N,self.input_dim).copy()
GP._set_params(self, x[self.X.size:])
def _log_likelihood_gradients(self):
dL_dX = 2.*self.kern.dK_dX(self.dL_dK,self.X)
return np.hstack((dL_dX.flatten(),GP._log_likelihood_gradients(self)))
def plot(self):
assert self.likelihood.Y.shape[1]==2
pb.scatter(self.likelihood.Y[:,0],self.likelihood.Y[:,1],40,self.X[:,0].copy(),linewidth=0,cmap=pb.cm.jet)
Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
mu, var, upper, lower = self.predict(Xnew)
pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
def plot_latent(self, *args, **kwargs):
return util.plot_latent.plot_latent(self, *args, **kwargs)
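# --- Illustrative usage (not part of the original file) -----------------------
# A minimal sketch of fitting a 2-dimensional GPLVM to observed data Y and
# reading back the learned latent coordinates. optimize() is assumed to be
# provided by the model base class.
def _example_gplvm(Y):
    m = GPLVM(Y, input_dim=2, init='PCA')
    m.optimize()    # assumed: jointly optimizes the latent X and the kernel parameters
    return m.X      # learned latent positions, shape (N, 2)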

View file

@ -0,0 +1,50 @@
# Copyright (c) 2013, Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import sparse_GP
from .. import likelihoods
from .. import kern
from ..likelihoods import likelihood
from GPRegression import GPRegression
class sparse_GP_classification(sparse_GP):
"""
sparse Gaussian Process model for classification
This is a thin wrapper around the sparse_GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the observed values before computing (predictions will be in original scales)
:type normalize_Y: False|True
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10):
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
if likelihood is None:
distribution = likelihoods.likelihood_functions.Binomial()
likelihood = likelihoods.EP(Y, distribution)
elif Y is not None:
if not all(Y.flatten() == likelihood.data.flatten()):
raise Warning, 'likelihood.data and Y are different.'
if Z is None:
i = np.random.permutation(X.shape[0])[:M]
Z = X[i].copy()
else:
assert Z.shape[1]==X.shape[1]
sparse_GP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
self._set_params(self._get_params())

View file

@ -0,0 +1,45 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import SparseGP
from .. import likelihoods
from .. import kern
class SparseGPRegression(SparseGP):
"""
Sparse Gaussian Process model for regression
This is a thin wrapper around the SparseGP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the observed values before computing (predictions will be in original scales)
:type normalize_Y: False|True
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10, X_variance=None):
# kern defaults to rbf (plus white for stability)
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)
# Z defaults to a subset of the data
if Z is None:
i = np.random.permutation(X.shape[0])[:M]
Z = X[i].copy()
else:
assert Z.shape[1] == X.shape[1]
# likelihood defaults to Gaussian
likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
self._set_params(self._get_params())
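# --- Illustrative usage (not part of the original file) -----------------------
# A minimal sketch of sparse GP regression with M inducing inputs chosen from
# the training data (the default behaviour above). optimize() is assumed to
# come from the model base class.
def _example_sparse_regression(X, Y):
    m = SparseGPRegression(X, Y, M=20)   # Z defaults to 20 randomly chosen training inputs
    m.optimize()                         # assumed: optimizes Z and the hyperparameters
    return m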

View file

@ -0,0 +1,61 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
import sys, pdb
from GPy.models.sparse_gp_regression import SparseGPRegression
from GPy.models.gplvm import GPLVM
# from .. import kern
# from ..core import model
# from ..util.linalg import pdinv, PCA
class SparseGPLVM(SparseGPRegression, GPLVM):
"""
Sparse Gaussian Process Latent Variable Model
:param Y: observed data
:type Y: np.ndarray
:param input_dim: latent dimensionality
:type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, Y, input_dim, kernel=None, init='PCA', M=10):
X = self.initialise_latent(init, input_dim, Y)
SparseGPRegression.__init__(self, X, Y, kernel=kernel, M=M)
def _get_param_names(self):
return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], [])
+ SparseGPRegression._get_param_names(self))
def _get_params(self):
return np.hstack((self.X.flatten(), SparseGPRegression._get_params(self)))
def _set_params(self, x):
self.X = x[:self.X.size].reshape(self.N, self.input_dim).copy()
SparseGPRegression._set_params(self, x[self.X.size:])
def log_likelihood(self):
return SparseGPRegression.log_likelihood(self)
def dL_dX(self):
dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X)
dL_dX += self.kern.dK_dX(self.dL_dpsi1.T, self.X, self.Z)
return dL_dX
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dX().flatten(), SparseGPRegression._log_likelihood_gradients(self)))
def plot(self):
GPLVM.plot(self)
# passing Z without a small amount of jitter will induce the white kernel where we don't want it!
mu, var, upper, lower = SparseGPRegression.predict(self, self.Z + np.random.randn(*self.Z.shape) * 0.0001)
pb.plot(mu[:, 0] , mu[:, 1], 'ko')
def plot_latent(self, *args, **kwargs):
input_1, input_2 = GPLVM.plot_latent(self, *args, **kwargs)
pb.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
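# --- Illustrative usage (not part of the original file) -----------------------
# A minimal sketch of fitting a sparse GPLVM with 10 inducing points and
# plotting the latent space via the plot_latent method above. optimize() is
# assumed to come from the model base class.
def _example_sparse_gplvm(Y):
    m = SparseGPLVM(Y, input_dim=2, M=10)
    m.optimize()        # assumed: optimizes X, Z and kernel parameters jointly
    m.plot_latent()     # latent space with inducing inputs overlaid
    return m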

89
GPy/models/warped_gp.py Normal file
View file

@ -0,0 +1,89 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..util.warping_functions import *
from ..core import GP
from .. import likelihoods
from GPy.util.warping_functions import TanhWarpingFunction_d
from GPy import kern
class WarpedGP(GP):
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
if warping_function == None:
self.warping_function = TanhWarpingFunction_d(warping_terms)
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
Y = self._scale_data(Y)
self.has_uncertain_inputs = False
self.Y_untransformed = Y.copy()
self.predict_in_warped_space = False
likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())
def _scale_data(self, Y):
self._Ymax = Y.max()
self._Ymin = Y.min()
return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5
def _unscale_data(self, Y):
return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
def _set_params(self, x):
self.warping_params = x[:self.warping_function.num_parameters]
Y = self.transform_data()
self.likelihood.set_data(Y)
GP._set_params(self, x[self.warping_function.num_parameters:].copy())
def _get_params(self):
return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
def _get_param_names(self):
warping_names = self.warping_function._get_param_names()
param_names = GP._get_param_names(self)
return warping_names + param_names
def transform_data(self):
Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
return Y
def log_likelihood(self):
ll = GP.log_likelihood(self)
jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
return ll + np.log(jacobian).sum()
def _log_likelihood_gradients(self):
ll_grads = GP._log_likelihood_gradients(self)
alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
warping_grads = self.warping_function_gradients(alpha)
warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
def warping_function_gradients(self, Kiy):
grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
return_covar_chain=True)
djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
return -dquad_dpsi + djac_dpsi
def plot_warping(self):
self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
def _raw_predict(self, *args, **kwargs):
mu, var = GP._raw_predict(self, *args, **kwargs)
if self.predict_in_warped_space:
mu = self.warping_function.f_inv(mu, self.warping_params)
var = self.warping_function.f_inv(var, self.warping_params)
mu = self._unscale_data(mu)
return mu, var
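# --- Illustrative usage (not part of the original file) -----------------------
# A minimal sketch of fitting a WarpedGP: the targets are passed through a
# tanh warping function whose parameters are learned jointly with the kernel.
# optimize() is assumed to come from the model base class.
def _example_warped_gp(X, Y):
    m = WarpedGP(X, Y, warping_terms=3)   # tanh warping with 3 terms (the default above)
    m.optimize()                          # assumed: learns warping and kernel parameters together
    m.plot_warping()                      # inspect the learned warping function
    return m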