Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-07 19:12:40 +02:00)

Commit daaad3c30c, parent b502efb1c5: "first commit in new structure"
16 changed files with 424 additions and 1750 deletions
@@ -19,7 +19,6 @@ class EP(likelihood):
         self.num_data, self.output_dim = self.data.shape
         self.is_heteroscedastic = True
         self.num_params = 0
-        self._transf_data = self.noise_model._preprocess_values(data)

         #Initial values - Likelihood approximation parameters:
         #p(y|f) = t(f|tau_tilde,v_tilde)
@@ -50,10 +49,26 @@ class EP(likelihood):
         self.VVT_factor = self.V
         self.trYYT = 0.

-    def predictive_values(self,mu,var,full_cov):
+    def predictive_values(self,mu,var,full_cov,**noise_args):
         if full_cov:
             raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var)
+        return self.noise_model.predictive_values(mu,var,**noise_args)
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)

     def _get_params(self):
         #return np.zeros(0)
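The new log_predictive_density hook simply defers to the noise model. For a Gaussian noise model the deferred quantity has a closed form, since the predictive and noise variances add; a minimal sketch, not part of this commit, assuming a known noise variance sigma2:

import numpy as np

def gaussian_log_predictive_density(y_test, mu_star, var_star, sigma2=1.0):
    # log N(y* | mu*, var* + sigma2), evaluated pointwise on (Nx1) arrays
    v = var_star + sigma2
    return -0.5*np.log(2*np.pi*v) - 0.5*(y_test - mu_star)**2/v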
@@ -134,7 +149,7 @@ class EP(likelihood):
                 self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                 #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
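This hunk, and the two identical ones below for the sparse cases, only changes which targets are fed to moments_match: the raw self.data instead of the preprocessed _transf_data, matching the removal of _preprocess_values in the first hunk. The surrounding arithmetic is the standard power-EP site update; a minimal standalone sketch of one site's update, with the moment matching abstracted away (a hypothetical helper, not the commit's code):

def ep_site_update(mu_i, Sigma_ii, tau_tilde_i, v_tilde_i, mu_hat, sigma2_hat,
                   eta=1.0, delta=1.0):
    # Cavity: divide a power eta of site i out of the posterior marginal
    tau_cav = 1./Sigma_ii - eta*tau_tilde_i
    v_cav = mu_i/Sigma_ii - eta*v_tilde_i
    # Damped (delta) natural-parameter step towards the tilted moments
    Delta_tau = delta/eta*(1./sigma2_hat - 1./Sigma_ii)
    Delta_v = delta/eta*(mu_hat/sigma2_hat - mu_i/Sigma_ii)
    return tau_tilde_i + Delta_tau, v_tilde_i + Delta_v, tau_cav, v_cav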
@@ -233,7 +248,7 @@ class EP(likelihood):
                 self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                 #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@@ -336,7 +351,7 @@ class EP(likelihood):
                 self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                 #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])

GPy/inference/laplace.py (new file, 403 lines)
@@ -0,0 +1,403 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
#
#Parts of this file were influenced by the Matlab GPML framework written by
#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
#
#The GPML code is released under the FreeBSD License.
#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
#
#The code and associated documentation is available from
#http://gaussianprocess.org/gpml/code.

import numpy as np
import scipy as sp
from likelihood import likelihood
from ..util.linalg import mdot, jitchol, pddet, dpotrs
from functools import partial as partial_func
import warnings

class Laplace(likelihood):
    """Laplace approximation to a posterior"""

    def __init__(self, data, noise_model, extra_data=None):
        """
        Laplace Approximation

        Find the moments \hat{f} and the hessian at this point
        (using Newton-Raphson) of the unnormalised posterior

        Compute the GP variables (i.e. generate some Y^{squiggle} and
        z^{squiggle} which makes a gaussian the same as the laplace
        approximation to the posterior, but normalised

        Arguments
        ---------

        :param data: array of data the likelihood function is approximating
        :type data: NxD
        :param noise_model: likelihood function - subclass of noise_model
        :type noise_model: noise_model
        :param extra_data: additional data used by some likelihood functions,
        """
        self.data = data
        self.noise_model = noise_model
        self.extra_data = extra_data

        #Initial values
        self.N, self.D = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0
        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))

        self.restart()
        likelihood.__init__(self)

    def restart(self):
        """
        Reset likelihood variables to their defaults
        """
        #Initial values for the GP variables
        self.Y = np.zeros((self.N, 1))
        self.covariance_matrix = np.eye(self.N)
        self.precision = np.ones(self.N)[:, None]
        self.Z = 0
        self.YYT = None

        self.old_Ki_f = None
        self.bad_fhat = False

    def predictive_values(self,mu,var,full_cov,**noise_args):
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
        return self.noise_model.predictive_values(mu,var,**noise_args)

    def log_predictive_density(self, y_test, mu_star, var_star):
        """
        Calculation of the log predictive density

        .. math:
            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})

        :param y_test: test observations (y_{*})
        :type y_test: (Nx1) array
        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
        :type mu_star: (Nx1) array
        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
        :type var_star: (Nx1) array
        """
        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)

    def _get_params(self):
        return np.asarray(self.noise_model._get_params())

    def _get_param_names(self):
        return self.noise_model._get_param_names()

    def _set_params(self, p):
        return self.noise_model._set_params(p)

    def _shared_gradients_components(self):
        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
        return dL_dfhat, I_KW_i

    def _Kgradients(self):
        """
        Gradients with respect to prior kernel parameters dL_dK to be chained
        with dK_dthetaK to give dL_dthetaK
        :returns: dL_dK matrix
        :rtype: Matrix (1 x num_kernel_params)
        """
        dL_dfhat, I_KW_i = self._shared_gradients_components()
        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)

        #Explicit
        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
        #expl_b = self.Wi_K_i
        #expl = 0.5*expl_a - 0.5*expl_b
        #dL_dthetaK_exp = dK_dthetaK(expl, X)

        #Implicit
        impl = mdot(dlp, dL_dfhat, I_KW_i)

        #No longer required as we are computing these in the gp already
        #otherwise we would take them away and add them back
        #dL_dthetaK_imp = dK_dthetaK(impl, X)
        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
        #dL_dK = expl + impl

        #No need to compute explicit as we are computing dZ_dK to account
        #for the difference between the K gradients of a normal GP,
        #and the K gradients including the implicit part
        dL_dK = impl
        return dL_dK

    def _gradients(self, partial):
        """
        Gradients with respect to likelihood parameters (dL_dthetaL)

        :param partial: Not needed by this likelihood
        :type partial: lambda function
        :rtype: array of derivatives (1 x num_likelihood_params)
        """
        dL_dfhat, I_KW_i = self._shared_gradients_components()
        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)

        #len(dlik_dthetaL)
        num_params = len(self._get_param_names())
        # make space for one derivative for each likelihood parameter
        dL_dthetaL = np.zeros(num_params)
        for thetaL_i in range(num_params):
            #Explicit
            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
                    #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
                    + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
                    )

            #Implicit
            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp

        return dL_dthetaL

    def _compute_GP_variables(self):
        """
        Generate data Y which would give the normal distribution identical
        to the laplace approximation to the posterior, but normalised

        GPy expects a likelihood to be gaussian, so need to calculate
        the data Y^{\tilde} that makes the posterior match that found
        by a laplace approximation to a non-gaussian likelihood but with
        a gaussian likelihood

        Firstly,
        The hessian of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
        we wish to find the hessian \Sigma^{\tilde}
        that has the same curvature but using our new simulated data Y^{\tilde}
        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}

        Secondly,
        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
        So we can suck up any differences between that and our log marginal likelihood approximation
        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
        which we want to optimize instead, by equating them and rearranging, the difference is added onto
        the log p(y) that GPy optimizes by default

        Thirdly,
        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
        as well as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
        gp.py code
        """
        Wi = 1.0/self.W
        self.Sigma_tilde = np.diagflat(Wi)

        Y_tilde = Wi*self.Ki_f + self.f_hat

        self.Wi_K_i = self.W12BiW12
        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)

        Z_tilde = (+ lik
                   - 0.5*self.ln_B_det
                   + 0.5*ln_det_Wi_K
                   - 0.5*self.f_Ki_f
                   + 0.5*y_Wi_K_i_y
                   + self.NORMAL_CONST
                   )

        #Convert to float as its (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
        self.Y = Y_tilde
        self.YYT = np.dot(self.Y, self.Y.T)
        self.covariance_matrix = self.Sigma_tilde
        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]

        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
        self.dZ_dK = self._Kgradients()
        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again

    def fit_full(self, K):
        """
        The laplace approximation algorithm, find K and expand hessian
        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability

        :param K: Prior covariance matrix evaluated at locations X
        :type K: NxN matrix
        """
        self.K = K.copy()

        #Find mode
        self.f_hat = self.rasm_mode(self.K)

        #Compute hessian and other variables at mode
        self._compute_likelihood_variables()

        #Compute fake variables replicating laplace approximation to posterior
        self._compute_GP_variables()

    def _compute_likelihood_variables(self):
        """
        Compute the variables required to compute gaussian Y variables
        """
        #At this point get the hessian matrix (or vector as W is diagonal)
        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)

        if not self.noise_model.log_concave:
            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
            self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur

        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))

        self.Ki_f = self.Ki_f
        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)

    def _compute_B_statistics(self, K, W, a):
        """
        Rasmussen suggests the use of a numerically stable positive definite matrix B
        which has a positive diagonal element and can be easily inverted

        :param K: Prior Covariance matrix evaluated at locations X
        :type K: NxN matrix
        :param W: Negative hessian at a point (diagonal matrix)
        :type W: Vector of diagonal values of hessian (1xN)
        :param a: Matrix to calculate W12BiW12a
        :type a: Matrix NxN
        :returns: (W12BiW12, ln_B_det)
        """
        if not self.noise_model.log_concave:
            #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
            W[W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                               # If the likelihood is non-log-concave we want to say that there is a negative variance
                               # to cause the posterior to become less certain than the prior and likelihood,
                               # this is a property only held by non-log-concave likelihoods

        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
        W_12 = np.sqrt(W)
        B = np.eye(self.N) + W_12*K*W_12.T
        L = jitchol(B)

        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
        ln_B_det = 2*np.sum(np.log(np.diag(L)))
        return W12BiW12a, ln_B_det
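    # Annotation (not part of the file): with B = I + W^{1/2} K W^{1/2}, the value
    # returned above for a = I satisfies W^{1/2} B^{-1} W^{1/2} = (K + W^{-1})^{-1},
    # so downstream code can use it wherever (K^{-1} + W)^{-1}-style terms appear
    # without ever factorising K itself. A quick numpy check on a toy positive
    # definite K (NxN) and positive W (Nx1):
    #     W_12 = np.sqrt(W)
    #     B = np.eye(N) + W_12*K*W_12.T
    #     lhs = W_12*np.linalg.solve(B, W_12*np.eye(N))
    #     assert np.allclose(lhs, np.linalg.inv(K + np.diagflat(1./W)))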
    def rasm_mode(self, K, MAX_ITER=40):
        """
        Rasmussen's numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006
        Influenced by GPML (BSD) code, all errors are our own

        :param K: Covariance matrix evaluated at locations X
        :type K: NxD matrix
        :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
        :type MAX_ITER: scalar
        :returns: f_hat, mode on which to make laplace approximation
        :rtype: NxD matrix
        """
        #old_Ki_f = np.zeros((self.N, 1))

        #Start f's at zero originally, or if we have gone off track, try restarting
        if self.old_Ki_f is None or self.bad_fhat:
            old_Ki_f = np.random.rand(self.N, 1)/50.0
            #old_Ki_f = self.Y
            f = np.dot(K, old_Ki_f)
        else:
            #Start at the old best point
            old_Ki_f = self.old_Ki_f.copy()
            f = self.f_hat.copy()

        new_obj = -np.inf
        old_obj = np.inf

        def obj(Ki_f, f):
            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)

        difference = np.inf
        epsilon = 1e-7
        #step_size = 1
        #rs = 0
        i = 0

        while difference > epsilon and i < MAX_ITER:
            W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)

            W_f = W*f
            grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)

            b = W_f + grad
            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))

            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
            full_step_Ki_f = b - W12BiW12Kb
            dKi_f = full_step_Ki_f - old_Ki_f

            f_old = f.copy()
            def inner_obj(step_size, old_Ki_f, dKi_f, K):
                Ki_f = old_Ki_f + step_size*dKi_f
                f = np.dot(K, Ki_f)
                # This is nasty, need to set something within an optimization though
                self.tmp_Ki_f = Ki_f.copy()
                self.tmp_f = f.copy()
                return -obj(Ki_f, f)

            i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
            #Find the stepsize that minimizes the objective function using a brent line search
            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
            #steps than get this exact then make a step, if B was bigger it might be the other way around though
            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
            f = self.tmp_f.copy()
            Ki_f = self.tmp_Ki_f.copy()

            #Optimize without linesearch
            #f_old = f.copy()
            #update_passed = False
            #while not update_passed:
                #Ki_f = old_Ki_f + step_size*dKi_f
                #f = np.dot(K, Ki_f)

                #old_obj = new_obj
                #new_obj = obj(Ki_f, f)
                #difference = new_obj - old_obj
                ##print "difference: ",difference
                #if difference < 0:
                    ##print "Objective function rose", np.float(difference)
                    ##If the objective function isn't rising, restart optimization
                    #step_size *= 0.8
                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                    ##objective function isn't increasing, try reducing step size
                    #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
                    #old_obj = new_obj
                    #rs += 1
                #else:
                    #update_passed = True

            #old_Ki_f = self.Ki_f.copy()

            #difference = abs(new_obj - old_obj)
            #old_obj = new_obj.copy()
            difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
            #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
            old_Ki_f = Ki_f.copy()
            i += 1

        self.old_Ki_f = old_Ki_f.copy()

        #Warn of bad fits
        if difference > epsilon:
            self.bad_fhat = True
            warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
        elif self.bad_fhat:
            self.bad_fhat = False
            warnings.warn("f_hat now perfect again")

        self.Ki_f = Ki_f
        return f
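rasm_mode is a line-searched variant of the standard Newton iteration for the posterior mode (Rasmussen & Williams 2006, Algorithm 3.1). A minimal undamped sketch for a Bernoulli likelihood with logistic link, y in {0,1} (an illustration only; the commit's version works through a noise_model object and a Brent line search):

import numpy as np

def laplace_mode(K, y, max_iter=40, tol=1e-7):
    # Newton-Raphson for f_hat = argmax_f log p(y|f) - 0.5 f^T K^{-1} f
    f = np.zeros(len(y))
    for _ in range(max_iter):
        pi = 1./(1. + np.exp(-f))                  # Bernoulli-logit mean
        W = pi*(1. - pi)                           # -d2 log p(y|f)/df2 (diagonal)
        b = W*f + (y - pi)                         # W f + gradient
        W12 = np.sqrt(W)
        B = np.eye(len(y)) + W12[:, None]*K*W12[None, :]
        # full_step_Ki_f = b - W^{1/2} B^{-1} W^{1/2} K b, as in rasm_mode
        Ki_f = b - W12*np.linalg.solve(B, W12*np.dot(K, b))
        f_new = np.dot(K, Ki_f)
        if np.max(np.abs(f_new - f)) < tol:
            return f_new
        f = f_new
    return f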
@@ -1,7 +0,0 @@
from ep import EP
from ep_mixed_noise import EP_Mixed_Noise
from gaussian import Gaussian
from gaussian_mixed_noise import Gaussian_Mixed_Noise
from noise_model_constructors import *
# TODO: from Laplace import Laplace
@@ -1,385 +0,0 @@
# Copyright (c) 2013, Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats
from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
from likelihood import likelihood

class EP_Mixed_Noise(likelihood):
    def __init__(self,data_list,noise_model_list,epsilon=1e-3,power_ep=[1.,1.]):
        """
        Expectation Propagation

        Arguments
        ---------
        :param data_list: list of outputs
        :param noise_model_list: a list of noise models
        :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations
        :type epsilon: float
        :param power_ep: list of power ep parameters
        """
        assert len(data_list) == len(noise_model_list)
        self.noise_model_list = noise_model_list
        n_list = [data.size for data in data_list]
        self.n_models = len(data_list)
        self.n_params = [noise_model._get_params().size for noise_model in noise_model_list]
        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.n_models),n_list)])
        self.epsilon = epsilon
        self.eta, self.delta = power_ep
        self.data = np.vstack(data_list)
        self.N, self.output_dim = self.data.shape
        self.is_heteroscedastic = True
        self.num_params = 0 #FIXME
        self._transf_data = np.vstack([noise_model._preprocess_values(data) for noise_model,data in zip(noise_model_list,data_list)])
        #TODO non-gaussian index

        #Initial values - Likelihood approximation parameters:
        #p(y|f) = t(f|tau_tilde,v_tilde)
        self.tau_tilde = np.zeros(self.N)
        self.v_tilde = np.zeros(self.N)

        #initial values for the GP variables
        self.Y = np.zeros((self.N,1))
        self.covariance_matrix = np.eye(self.N)
        self.precision = np.ones(self.N)[:,None]
        self.Z = 0
        self.YYT = None
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = 0.

    def restart(self):
        self.tau_tilde = np.zeros(self.N)
        self.v_tilde = np.zeros(self.N)
        self.Y = np.zeros((self.N,1))
        self.covariance_matrix = np.eye(self.N)
        self.precision = np.ones(self.N)[:,None]
        self.Z = 0
        self.YYT = None
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = 0.

    def predictive_values(self,mu,var,full_cov,noise_model):
        """
        Predicts the output given the GP

        :param mu: GP's mean
        :param var: GP's variance
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: False|True
        :param noise_model: noise model to use
        :type noise_model: integer
        """
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
        #_mu = []
        #_var = []
        #_q1 = []
        #_q2 = []
        #for m,v,o in zip(mu,var,output.flatten()):
        #    a,b,c,d = self.noise_model_list[int(o)].predictive_values(m,v)
        #    _mu.append(a)
        #    _var.append(b)
        #    _q1.append(c)
        #    _q2.append(d)
        #return np.vstack(_mu),np.vstack(_var),np.vstack(_q1),np.vstack(_q2)
        return self.noise_model_list[noise_model].predictive_values(mu,var)

    def _get_params(self):
        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])

    def _get_param_names(self):
        names = []
        for noise_model in self.noise_model_list:
            names += noise_model._get_param_names()
        return names

    def _set_params(self,p):
        cs_params = np.cumsum([0]+self.n_params)
        for i in range(len(self.n_params)):
            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])

    def _gradients(self,partial):
        #NOTE this is not tested
        return np.hstack([noise_model._gradients(partial) for noise_model in self.noise_model_list])

    def _compute_GP_variables(self):
        #Variables to be called from GP
        mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
        mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
        self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep

        self.Y = mu_tilde[:,None]
        self.YYT = np.dot(self.Y,self.Y.T)
        self.covariance_matrix = np.diag(1./self.tau_tilde)
        self.precision = self.tau_tilde[:,None]
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = np.trace(self.YYT)

    def fit_full(self,K):
        """
        The expectation-propagation algorithm.
        For nomenclature see Rasmussen & Williams 2006.
        """
        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
        mu = np.zeros(self.N)
        Sigma = K.copy()

        """
        Initial values - Cavity distribution parameters:
        q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)

        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)

        #Approximation
        epsilon_np1 = self.epsilon + 1.
        epsilon_np2 = self.epsilon + 1.
        self.iterations = 0
        self.np1 = [self.tau_tilde.copy()]
        self.np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                #Marginal moments
                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
                self.tau_tilde[i] += Delta_tau
                self.v_tilde[i] += Delta_v
                #Posterior distribution parameters update
                DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i])))
                mu = np.dot(Sigma,self.v_tilde)
                self.iterations += 1
            #Sigma recomputation with Cholesky decomposition
            Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
            B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
            L = jitchol(B)
            V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
            Sigma = K - np.dot(V.T,V)
            mu = np.dot(Sigma,self.v_tilde)
            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
            self.np1.append(self.tau_tilde.copy())
            self.np2.append(self.v_tilde.copy())

        return self._compute_GP_variables()

    def fit_DTC(self, Kmm, Kmn):
        """
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see ... 2013.
        """
        num_inducing = Kmm.shape[0]

        #TODO: this doesn't work with uncertain inputs!

        """
        Prior approximation parameters:
        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
        Sigma0 = Qnn = Knm*Kmmi*Kmn
        """
        KmnKnm = np.dot(Kmn,Kmn.T)
        Lm = jitchol(Kmm)
        Lmi = chol_inv(Lm)
        Kmmi = np.dot(Lmi.T,Lmi)
        KmmiKmn = np.dot(Kmmi,Kmn)
        Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
        LLT0 = Kmm.copy()

        #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
        #KmnKnm = np.dot(Kmn, Kmn.T)
        #KmmiKmn = np.dot(Kmmi,Kmn)
        #Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
        #LLT0 = Kmm.copy()

        """
        Posterior approximation: q(f|y) = N(f| mu, Sigma)
        Sigma = Diag + P*R.T*R*P.T + K
        mu = w + P*Gamma
        """
        mu = np.zeros(self.N)
        LLT = Kmm.copy()
        Sigma_diag = Qnn_diag.copy()

        """
        Initial values - Cavity distribution parameters:
        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)

        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)

        #Approximation
        epsilon_np1 = 1
        epsilon_np2 = 1
        self.iterations = 0
        np1 = [self.tau_tilde.copy()]
        np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
                self.tau_tilde[i] += Delta_tau
                self.v_tilde[i] += Delta_v
                #Posterior distribution parameters update
                DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
                L = jitchol(LLT)
                #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
                V,info = dtrtrs(L,Kmn,lower=1)
                Sigma_diag = np.sum(V*V,-2)
                si = np.sum(V.T*V[:,i],-1)
                mu += (Delta_v-Delta_tau*mu[i])*si
                self.iterations += 1
            #Sigma recomputation with Cholesky decomposition
            LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
            L = jitchol(LLT)
            V,info = dtrtrs(L,Kmn,lower=1)
            V2,info = dtrtrs(L.T,V,lower=0)
            Sigma_diag = np.sum(V*V,-2)
            Knmv_tilde = np.dot(Kmn,self.v_tilde)
            mu = np.dot(V2.T,Knmv_tilde)
            epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
            epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
            np1.append(self.tau_tilde.copy())
            np2.append(self.v_tilde.copy())

        self._compute_GP_variables()

    def fit_FITC(self, Kmm, Kmn, Knn_diag):
        """
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see Naish-Guzman and Holden, 2008.
        """
        num_inducing = Kmm.shape[0]

        """
        Prior approximation parameters:
        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
        Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
        """
        Lm = jitchol(Kmm)
        Lmi = chol_inv(Lm)
        Kmmi = np.dot(Lmi.T,Lmi)
        P0 = Kmn.T
        KmnKnm = np.dot(P0.T, P0)
        KmmiKmn = np.dot(Kmmi,P0.T)
        Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
        Diag0 = Knn_diag - Qnn_diag
        R0 = jitchol(Kmmi).T

        """
        Posterior approximation: q(f|y) = N(f| mu, Sigma)
        Sigma = Diag + P*R.T*R*P.T + K
        mu = w + P*Gamma
        """
        self.w = np.zeros(self.N)
        self.Gamma = np.zeros(num_inducing)
        mu = np.zeros(self.N)
        P = P0.copy()
        R = R0.copy()
        Diag = Diag0.copy()
        Sigma_diag = Knn_diag
        RPT0 = np.dot(R0,P0.T)

        """
        Initial values - Cavity distribution parameters:
        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)

        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)

        #Approximation
        epsilon_np1 = 1
        epsilon_np2 = 1
        self.iterations = 0
        self.np1 = [self.tau_tilde.copy()]
        self.np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
                self.tau_tilde[i] += Delta_tau
                self.v_tilde[i] += Delta_v
                #Posterior distribution parameters update
                dtd1 = Delta_tau*Diag[i] + 1.
                dii = Diag[i]
                Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
                pi_ = P[i,:].reshape(1,num_inducing)
                P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
                Rp_i = np.dot(R,pi_.T)
                RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
                R = jitchol(RTR).T
                self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
                self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
                RPT = np.dot(R,P.T)
                Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
                mu = self.w + np.dot(P,self.Gamma)
                self.iterations += 1
            #Sigma recomputation with Cholesky decomposition
            Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
            Diag = Diag0 * Iplus_Dprod_i
            P = Iplus_Dprod_i[:,None] * P0
            safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
            L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
            R,info = dtrtrs(L,R0,lower=1)
            RPT = np.dot(R,P.T)
            Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
            self.w = Diag * self.v_tilde
            self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
            mu = self.w + np.dot(P,self.Gamma)
            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
            self.np1.append(self.tau_tilde.copy())
            self.np2.append(self.v_tilde.copy())

        return self._compute_GP_variables()
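Both fitting loops above lean on DSYR from GPy's linalg utilities, a BLAS symmetric rank-1 update A += alpha*outer(x, x), as the inline comment next to the fit_DTC call says. The fit_full posterior update is therefore a Sherman-Morrison downdate; a plain numpy rendering (a sketch, not the deleted code):

import numpy as np

def sigma_site_downdate(Sigma, i, Delta_tau):
    # Equivalent of DSYR(Sigma, Sigma[:,i], -Delta_tau/(1 + Delta_tau*Sigma[i,i]))
    si = Sigma[:, i].copy()
    return Sigma - (Delta_tau/(1. + Delta_tau*Sigma[i, i]))*np.outer(si, si)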
@@ -1,110 +0,0 @@
import numpy as np
from likelihood import likelihood
from ..util.linalg import jitchol
from ..core.parameter import Param


class Gaussian(likelihood):
    """
    Likelihood class for doing Expectation propagation

    :param data: observed output
    :type data: Nx1 numpy.ndarray
    :param variance: noise parameter
    :param normalize: whether to normalize the data before computing (predictions will be in original scales)
    :type normalize: False|True
    """
    def __init__(self, data, variance=1., normalize=False):
        super(Gaussian, self).__init__('gaussian')
        self.is_heteroscedastic = False
        #self.num_params = 1
        self.Z = 0. # a correction factor which accounts for the approximation made
        N, self.output_dim = data.shape

        # normalization
        if normalize:
            self._offset = data.mean(0)[None, :]
            self._scale = data.std(0)[None, :]
            # Don't scale outputs which have zero variance to zero.
            self._scale[np.nonzero(self._scale == 0.)] = 1.0e-3
        else:
            self._offset = np.zeros((1, self.output_dim))
            self._scale = np.ones((1, self.output_dim))

        self.set_data(data)

        self.variance = Param('variance', variance)
        self.variance.add_observer(self, self.update_variance)
        self.add_parameter(self.variance)

        #self.parameters_changed()
        # self._set_params(np.asarray(variance))

    def set_data(self, data):
        self.data = data
        self.N, D = data.shape
        assert D == self.output_dim
        self.Y = (self.data - self._offset) / self._scale
        if D > self.N:
            self.YYT = np.dot(self.Y, self.Y.T)
            self.trYYT = np.trace(self.YYT)
            self.YYT_factor = jitchol(self.YYT)
        else:
            self.YYT = None
            self.trYYT = np.sum(np.square(self.Y))
            self.YYT_factor = self.Y

    # def _get_params(self):
    #     return np.asarray(self._variance)
    #
    # def _get_param_names(self):
    #     return ["noise_variance"]
    #
    # def _set_params(self, x):
    #     self.variance = x[0]

    def update_variance(self, v):
        if np.all(self.variance == 0.): #special case of zero noise
            self.precision = np.inf
            self.V = None
        else:
            self.precision = (1. / self.variance).squeeze()
            self.V = (self.precision) * self.Y
        self.VVT_factor = self.precision * self.YYT_factor
        self.covariance_matrix = np.eye(self.N) * self.variance
        #self._variance = self.variance.copy()

    # def parameters_changed(self):
    #     if np.any(self._variance != self.variance):
    #         self.update_variance()

    def predictive_values(self, mu, var, full_cov):
        """
        Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval
        """
        mean = mu * self._scale + self._offset
        if full_cov:
            if self.output_dim > 1:
                raise NotImplementedError, "TODO"
                # Note. for output_dim>1, we need to re-normalise all the outputs independently.
                # This will mess up computations of diag(true_var), below.
                # note that the upper, lower quantiles should be the same shape as mean
            # Augment the output variance with the likelihood variance and rescale.
            true_var = (var + np.eye(var.shape[0]) * self.variance) * self._scale ** 2
            _5pc = mean - 2.*np.sqrt(np.diag(true_var))
            _95pc = mean + 2.*np.sqrt(np.diag(true_var))
        else:
            true_var = (var + self.variance) * self._scale ** 2
            _5pc = mean - 2.*np.sqrt(true_var)
            _95pc = mean + 2.*np.sqrt(true_var)
        return mean, true_var, _5pc, _95pc

    def fit_full(self):
        """
        No approximations needed
        """
        pass

    def _gradients(self, partial):
        return np.sum(partial)
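predictive_values above folds the noise variance into the rescaled predictive variance and quotes mean +/- 2 standard deviations for its interval (roughly the central 95% band). The diagonal branch in isolation, as a sketch with scalar inputs (a hypothetical helper, not the deleted code):

import numpy as np

def unnormalised_interval(mu, var, scale, offset, noise_var):
    mean = mu*scale + offset
    true_var = (var + noise_var)*scale**2        # add likelihood noise, undo scaling
    half = 2.*np.sqrt(true_var)                  # ~95% central interval half-width
    return mean, true_var, mean - half, mean + half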
@@ -1,108 +0,0 @@
# Copyright (c) 2013, Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats
from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
from likelihood import likelihood
from . import Gaussian


class Gaussian_Mixed_Noise(likelihood):
    """
    Gaussian Likelihood for multiple outputs

    This is a wrapper around the likelihood.Gaussian class

    :param data_list: data observations
    :type data_list: list of numpy arrays (num_data_output_i x 1), one array per output
    :param noise_params: noise parameters of each output
    :type noise_params: list of floats, one per output
    :param normalize: whether to normalize the data before computing (predictions will be in original scales)
    :type normalize: False|True
    """
    def __init__(self, data_list, noise_params=None, normalize=True):
        self.num_params = len(data_list)
        self.n_list = [data.size for data in data_list]
        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.num_params),self.n_list)])

        if noise_params is None:
            noise_params = [1.] * self.num_params
        else:
            assert self.num_params == len(noise_params), 'Number of noise parameters does not match the number of noise models.'

        self.noise_model_list = [Gaussian(Y,variance=v,normalize = normalize) for Y,v in zip(data_list,noise_params)]
        self.n_params = [noise_model._get_params().size for noise_model in self.noise_model_list]
        self.data = np.vstack(data_list)
        self.N, self.output_dim = self.data.shape
        self._offset = np.zeros((1, self.output_dim))
        self._scale = np.ones((1, self.output_dim))

        self.is_heteroscedastic = True
        self.Z = 0. # a correction factor which accounts for the approximation made

        self.set_data(data_list)
        self._set_params(np.asarray(noise_params))

        super(Gaussian_Mixed_Noise, self).__init__()

    def set_data(self, data_list):
        self.data = np.vstack(data_list)
        self.N, D = self.data.shape
        assert D == self.output_dim
        self.Y = (self.data - self._offset) / self._scale
        if D > self.N:
            raise NotImplementedError
            #self.YYT = np.dot(self.Y, self.Y.T)
            #self.trYYT = np.trace(self.YYT)
            #self.YYT_factor = jitchol(self.YYT)
        else:
            self.YYT = None
            self.trYYT = np.sum(np.square(self.Y))
            self.YYT_factor = self.Y

    def predictive_values(self,mu,var,full_cov,noise_model):
        """
        Predicts the output given the GP

        :param mu: GP's mean
        :param var: GP's variance
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: False|True
        :param noise_model: noise model to use
        :type noise_model: integer
        """
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
        return self.noise_model_list[noise_model].predictive_values(mu,var,full_cov)

    def _get_params(self):
        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])

    def _get_param_names(self):
        if len(self.noise_model_list) == 1:
            names = self.noise_model_list[0]._get_param_names()
        else:
            names = []
            for noise_model,i in zip(self.noise_model_list,range(len(self.n_list))):
                names.append(''.join(noise_model._get_param_names() + ['_%s' %i]))
        return names

    def _set_params(self,p):
        cs_params = np.cumsum([0]+self.n_params)

        for i in range(len(self.n_params)):
            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
        self.precision = np.hstack([np.repeat(noise_model.precision,n) for noise_model,n in zip(self.noise_model_list,self.n_list)])[:,None]

        self.V = self.precision * self.Y
        self.VVT_factor = self.precision * self.YYT_factor
        self.covariance_matrix = np.eye(self.N) * 1./self.precision

    def _gradients(self,partial):
        gradients = []
        aux = np.cumsum([0]+self.n_list)
        for ai,af,noise_model in zip(aux[:-1],aux[1:],self.noise_model_list):
            gradients += [noise_model._gradients(partial[ai:af])]
        return np.hstack(gradients)
@@ -1,43 +0,0 @@
import numpy as np
import copy
from ..core.parameterized import Parameterized

class likelihood(Parameterized):
    """
    The atom for a likelihood class

    This object interfaces the GP and the data. The most basic likelihood
    (Gaussian) inherits directly from this, as does the EP algorithm

    Some things must be defined for this to work properly:

    - self.Y : the effective Gaussian target of the GP
    - self.N, self.D : Y.shape
    - self.covariance_matrix : the effective (noise) covariance of the GP targets
    - self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
    - self.is_heteroscedastic : enables significant computational savings in GP
    - self.precision : a scalar or vector representation of the effective target precision
    - self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
    - self.V : self.precision * self.Y
    """
    def __init__(self, name=None):
        Parameterized.__init__(self, name)

    # def _get_params(self):
    #     raise NotImplementedError
    #
    # def _get_param_names(self):
    #     raise NotImplementedError
    #
    # def _set_params(self, x):
    #     raise NotImplementedError

    def fit(self):
        raise NotImplementedError

    def _gradients(self, partial):
        raise NotImplementedError

    def predictive_values(self, mu, var):
        raise NotImplementedError
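The attribute contract in the docstring is easiest to see in a minimal concrete subclass; a sketch against the base class above (illustrative only, with fixed homoscedastic noise; not part of the commit):

import numpy as np

class MinimalGaussian(likelihood):
    def __init__(self, Y, variance=1.):
        self.Y = Y                                    # effective Gaussian target
        self.N, self.D = Y.shape
        self.is_heteroscedastic = False
        self.precision = 1./variance                  # scalar target precision
        self.covariance_matrix = variance*np.eye(self.N)
        self.Z = 0.                                   # no correction for a Gaussian
        self.YYT = None                               # optional D>N saving, unused
        self.V = self.precision*self.Y
        likelihood.__init__(self)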
@@ -1,88 +0,0 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
import noise_models

def binomial(gp_link=None):
    """
    Construct a binomial likelihood

    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Probit()
    #else:
    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'

    if isinstance(gp_link,noise_models.gp_transformations.Probit):
        analytical_mean = True
        analytical_variance = False

    elif isinstance(gp_link,noise_models.gp_transformations.Heaviside):
        analytical_mean = True
        analytical_variance = True

    else:
        analytical_mean = False
        analytical_variance = False

    return noise_models.binomial_noise.Binomial(gp_link,analytical_mean,analytical_variance)

def exponential(gp_link=None):
    """
    Construct an exponential likelihood

    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Identity()

    analytical_mean = False
    analytical_variance = False
    return noise_models.exponential_noise.Exponential(gp_link,analytical_mean,analytical_variance)

def gaussian_ep(gp_link=None,variance=1.):
    """
    Construct a gaussian likelihood

    :param gp_link: a GPy gp_link function
    :param variance: scalar
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Identity()
    #else:
    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'

    analytical_mean = False
    analytical_variance = False
    return noise_models.gaussian_noise.Gaussian(gp_link,analytical_mean,analytical_variance,variance)

def poisson(gp_link=None):
    """
    Construct a Poisson likelihood

    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Log_ex_1()
    #else:
    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
    analytical_mean = False
    analytical_variance = False
    return noise_models.poisson_noise.Poisson(gp_link,analytical_mean,analytical_variance)

def gamma(gp_link=None,beta=1.):
    """
    Construct a Gamma likelihood

    :param gp_link: a GPy gp_link function
    :param beta: scalar
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Log_ex_1()
    analytical_mean = False
    analytical_variance = False
    return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta)
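Typical use of these constructors, assuming the module is importable directly (a sketch; the binomial case with the default Probit link gets analytical EP moments per the flags above):

import noise_model_constructors as constructors

binomial_noise = constructors.binomial()     # Probit link: analytical mean
poisson_noise = constructors.poisson()       # Log_ex_1 link: numerical moments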
@@ -1,7 +0,0 @@
import noise_distributions
import binomial_noise
import exponential_noise
import gaussian_noise
import gamma_noise
import poisson_noise
import gp_transformations
@@ -1,132 +0,0 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats,special
import scipy as sp
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
import gp_transformations
from noise_distributions import NoiseDistribution

class Binomial(NoiseDistribution):
    """
    Probit likelihood
    Y is expected to take values in {-1,1}
    -----
    $$
    L(x) = \\Phi (Y_i*f_i)
    $$
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        super(Binomial, self).__init__(gp_link,analytical_mean,analytical_variance)

    def _preprocess_values(self,Y):
        """
        Check if the values of the observations correspond to the values
        assumed by the likelihood function.

        ..Note:: Binary classification algorithm works better with classes {-1,1}
        """
        Y_prep = Y.copy()
        Y1 = Y[Y.flatten()==1].size
        Y2 = Y[Y.flatten()==0].size
        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
        Y_prep[Y.flatten() == 0] = -1
        return Y_prep

    def _moments_match_analytical(self,data_i,tau_i,v_i):
        """
        Moments match of the marginal approximation in EP algorithm

        :param i: number of observation (int)
        :param tau_i: precision of the cavity distribution (float)
        :param v_i: mean/variance of the cavity distribution (float)
        """
        if isinstance(self.gp_link,gp_transformations.Probit):
            z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
            Z_hat = std_norm_cdf(z)
            phi = std_norm_pdf(z)
            mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)

        elif isinstance(self.gp_link,gp_transformations.Heaviside):
            a = data_i*v_i/np.sqrt(tau_i)
            Z_hat = std_norm_cdf(a)
            N = std_norm_pdf(a)
            mu_hat = v_i/tau_i + data_i*N/Z_hat/np.sqrt(tau_i)
            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
            if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
                stop

        return Z_hat, mu_hat, sigma2_hat
||||||
|
|
||||||
def _predictive_mean_analytical(self,mu,sigma):
|
|
||||||
if isinstance(self.gp_link,gp_transformations.Probit):
|
|
||||||
return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
|
|
||||||
elif isinstance(self.gp_link,gp_transformations.Heaviside):
|
|
||||||
return stats.norm.cdf(mu/sigma)
|
|
||||||
else:
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def _predictive_variance_analytical(self,mu,sigma, pred_mean):
|
|
||||||
if isinstance(self.gp_link,gp_transformations.Heaviside):
|
|
||||||
return 0.
|
|
||||||
else:
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def _mass(self,gp,obs):
|
|
||||||
#NOTE obs must be in {0,1}
|
|
||||||
p = self.gp_link.transf(gp)
|
|
||||||
return p**obs * (1.-p)**(1.-obs)
|
|
||||||
|
|
||||||
def _nlog_mass(self,gp,obs):
|
|
||||||
p = self.gp_link.transf(gp)
|
|
||||||
return obs*np.log(p) + (1.-obs)*np.log(1-p)
|
|
||||||
|
|
||||||
def _dnlog_mass_dgp(self,gp,obs):
|
|
||||||
p = self.gp_link.transf(gp)
|
|
||||||
dp = self.gp_link.dtransf_df(gp)
|
|
||||||
return obs/p * dp - (1.-obs)/(1.-p) * dp
|
|
||||||
|
|
||||||
def _d2nlog_mass_dgp2(self,gp,obs):
|
|
||||||
p = self.gp_link.transf(gp)
|
|
||||||
return (obs/p + (1.-obs)/(1.-p))*self.gp_link.d2transf_df2(gp) + ((1.-obs)/(1.-p)**2-obs/p**2)*self.gp_link.dtransf_df(gp)
|
|
||||||
|
|
||||||
def _mean(self,gp):
|
|
||||||
"""
|
|
||||||
Mass (or density) function
|
|
||||||
"""
|
|
||||||
return self.gp_link.transf(gp)
|
|
||||||
|
|
||||||
def _dmean_dgp(self,gp):
|
|
||||||
return self.gp_link.dtransf_df(gp)
|
|
||||||
|
|
||||||
def _d2mean_dgp2(self,gp):
|
|
||||||
return self.gp_link.d2transf_df2(gp)
|
|
||||||
|
|
||||||
def _variance(self,gp):
|
|
||||||
"""
|
|
||||||
Mass (or density) function
|
|
||||||
"""
|
|
||||||
p = self.gp_link.transf(gp)
|
|
||||||
return p*(1.-p)
|
|
||||||
|
|
||||||
def _dvariance_dgp(self,gp):
|
|
||||||
return self.gp_link.dtransf_df(gp)*(1. - 2.*self.gp_link.transf(gp))
|
|
||||||
|
|
||||||
def _d2variance_dgp2(self,gp):
|
|
||||||
return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
|
|
||||||
|
|
||||||
|
|
||||||
def samples(self, gp):
|
|
||||||
"""
|
|
||||||
Returns a set of samples of observations based on a given value of the latent variable.
|
|
||||||
|
|
||||||
:param size: number of samples to compute
|
|
||||||
:param gp: latent variable
|
|
||||||
"""
|
|
||||||
orig_shape = gp.shape
|
|
||||||
gp = gp.flatten()
|
|
||||||
Ysim = np.array([np.random.binomial(1,self.gp_link.transf(gpj),size=1) for gpj in gp])
|
|
||||||
return Ysim.reshape(orig_shape)
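
The probit branch of _moments_match_analytical uses the closed-form moments of the tilted distribution Z = int Phi(y*f) N(f|mu,s2) df. A standalone check (not GPy code) that those formulas agree with numerical integration, with the cavity written in natural parameters tau = 1/s2, v = mu*tau:

import numpy as np
from scipy import stats
from scipy.integrate import quad

y, mu, s2 = 1., 0.3, 2.0                       # observation and cavity moments
tau, v = 1./s2, mu/s2                          # natural parameters of the cavity
tilted = lambda f: stats.norm.cdf(y*f) * stats.norm.pdf(f, mu, np.sqrt(s2))
Z, _  = quad(tilted, -20, 20)                  # zeroth moment
m1, _ = quad(lambda f: f*tilted(f), -20, 20)   # first moment
m2, _ = quad(lambda f: f*f*tilted(f), -20, 20)

z = y*v/np.sqrt(tau**2 + tau)                  # closed forms, as in the code above
Z_hat = stats.norm.cdf(z)
phi = stats.norm.pdf(z)
mu_hat = v/tau + y*phi/(Z_hat*np.sqrt(tau**2 + tau))
s2_hat = 1./tau - (phi/((tau**2 + tau)*Z_hat))*(z + phi/Z_hat)
print(np.allclose([Z, m1/Z, m2/Z - (m1/Z)**2], [Z_hat, mu_hat, s2_hat], atol=1e-6))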
@@ -1,68 +0,0 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats,special
import scipy as sp
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
import gp_transformations
from noise_distributions import NoiseDistribution

class Exponential(NoiseDistribution):
    """
    Exponential likelihood

    Y is expected to take values in (0,+inf)
    -----
    $$
    L(x) = \\exp(-Y_i/\\lambda(f_i))/\\lambda(f_i)
    $$
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        super(Exponential, self).__init__(gp_link,analytical_mean,analytical_variance)

    def _preprocess_values(self,Y):
        return Y

    def _mass(self,gp,obs):
        """
        Mass (or density) function
        """
        return np.exp(-obs/self.gp_link.transf(gp))/self.gp_link.transf(gp)

    def _nlog_mass(self,gp,obs):
        """
        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
        """
        return obs/self.gp_link.transf(gp) + np.log(self.gp_link.transf(gp))

    def _dnlog_mass_dgp(self,gp,obs):
        return ( 1./self.gp_link.transf(gp) - obs/self.gp_link.transf(gp)**2) * self.gp_link.dtransf_df(gp)

    def _d2nlog_mass_dgp2(self,gp,obs):
        fgp = self.gp_link.transf(gp)
        return (2*obs/fgp**3 - 1./fgp**2) * self.gp_link.dtransf_df(gp)**2 + ( 1./fgp - obs/fgp**2) * self.gp_link.d2transf_df2(gp)

    def _mean(self,gp):
        """
        Mean of the output given the latent function
        """
        return self.gp_link.transf(gp)

    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)

    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)

    def _variance(self,gp):
        """
        Variance of the output given the latent function
        """
        return self.gp_link.transf(gp)**2

    def _dvariance_dgp(self,gp):
        return 2*self.gp_link.transf(gp)*self.gp_link.dtransf_df(gp)

    def _d2variance_dgp2(self,gp):
        return 2 * (self.gp_link.dtransf_df(gp)**2 + self.gp_link.transf(gp)*self.gp_link.d2transf_df2(gp))
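
A standalone sanity check (not GPy code) that _mean and _variance above are the exponential moments E[y] = lambda and V[y] = lambda**2 for the density p(y|lambda) = exp(-y/lambda)/lambda used in _mass:

import numpy as np
from scipy.integrate import quad

lam = 1.7
mass = lambda y: np.exp(-y/lam)/lam
m1, _ = quad(lambda y: y*mass(y), 0, np.inf)
m2, _ = quad(lambda y: y*y*mass(y), 0, np.inf)
print(np.allclose([m1, m2 - m1**2], [lam, lam**2]))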
@@ -1,71 +0,0 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats,special
import scipy as sp
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
import gp_transformations
from noise_distributions import NoiseDistribution

class Gamma(NoiseDistribution):
    """
    Gamma likelihood, with shape alpha = g(f)*beta and rate beta

    Y is expected to take values in (0,+inf)
    -----
    $$
    L(x) = \\beta^{\\alpha} Y_i^{\\alpha-1} \\exp(-\\beta Y_i) / \\Gamma(\\alpha)
    $$
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
        self.beta = beta
        super(Gamma, self).__init__(gp_link,analytical_mean,analytical_variance)

    def _preprocess_values(self,Y):
        return Y

    def _mass(self,gp,obs):
        """
        Mass (or density) function
        """
        #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
        alpha = self.gp_link.transf(gp)*self.beta
        return obs**(alpha - 1.) * np.exp(-self.beta*obs) * self.beta**alpha / special.gamma(alpha)

    def _nlog_mass(self,gp,obs):
        """
        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
        """
        alpha = self.gp_link.transf(gp)*self.beta
        return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))

    def _dnlog_mass_dgp(self,gp,obs):
        #d/df of -alpha*log(obs) - alpha*log(beta) + log Gamma(alpha), with alpha = transf(gp)*beta
        dalpha = self.gp_link.dtransf_df(gp)*self.beta
        return dalpha*(special.psi(self.gp_link.transf(gp)*self.beta) - np.log(obs) - np.log(self.beta))

    def _d2nlog_mass_dgp2(self,gp,obs):
        alpha = self.gp_link.transf(gp)*self.beta
        d2alpha = self.gp_link.d2transf_df2(gp)*self.beta
        return d2alpha*(special.psi(alpha) - np.log(obs) - np.log(self.beta)) + special.polygamma(1,alpha)*(self.gp_link.dtransf_df(gp)*self.beta)**2

    def _mean(self,gp):
        """
        Mean of the output given the latent function
        """
        return self.gp_link.transf(gp)

    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)

    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)

    def _variance(self,gp):
        """
        Variance of the output given the latent function
        """
        return self.gp_link.transf(gp)/self.beta

    def _dvariance_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)/self.beta

    def _d2variance_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)/self.beta
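
The gradient above carries a -log(beta) term coming from the -alpha*log(beta) factor in _nlog_mass. A standalone finite-difference check (not GPy code), written with the identity link so that alpha = f*beta:

import numpy as np
from scipy import special

beta, obs, f = 2.0, 1.3, 0.9
nlm = lambda x: (1. - x*beta)*np.log(obs) + beta*obs - x*beta*np.log(beta) \
                + np.log(special.gamma(x*beta))
grad = beta*(special.psi(f*beta) - np.log(obs) - np.log(beta))
eps = 1e-6
print(np.allclose(grad, (nlm(f + eps) - nlm(f - eps))/(2.*eps)))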
@@ -1,98 +0,0 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
from scipy import stats,special
import scipy as sp
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
import gp_transformations
from noise_distributions import NoiseDistribution

class Gaussian(NoiseDistribution):
    """
    Gaussian likelihood

    :param variance: variance of the Gaussian distribution
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1.):
        self.variance = variance
        super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)

    def _get_params(self):
        return np.array([self.variance])

    def _get_param_names(self):
        return ['noise_model_variance']

    def _set_params(self,p):
        self.variance = p

    def _gradients(self,partial):
        return np.zeros(1)
        #return np.sum(partial)

    def _preprocess_values(self,Y):
        """
        Check if the values of the observations correspond to the values
        assumed by the likelihood function.
        """
        return Y

    def _moments_match_analytical(self,data_i,tau_i,v_i):
        """
        Moments match of the marginal approximation in EP algorithm

        :param data_i: observed output value (float)
        :param tau_i: precision of the cavity distribution (float)
        :param v_i: mean/variance of the cavity distribution (float)
        """
        sigma2_hat = 1./(1./self.variance + tau_i)
        mu_hat = sigma2_hat*(data_i/self.variance + v_i)
        sum_var = self.variance + 1./tau_i
        Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var)
        return Z_hat, mu_hat, sigma2_hat

    def _predictive_mean_analytical(self,mu,sigma):
        new_sigma2 = self.predictive_variance(mu,sigma)
        return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)

    def _predictive_variance_analytical(self,mu,sigma):
        return 1./(1./self.variance + 1./sigma**2)

    def _mass(self,gp,obs):
        #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
        return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance))

    def _nlog_mass(self,gp,obs):
        return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance))

    def _dnlog_mass_dgp(self,gp,obs):
        return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)

    def _d2nlog_mass_dgp2(self,gp,obs):
        return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance

    def _mean(self,gp):
        """
        Mean of the output given the latent function
        """
        return self.gp_link.transf(gp)

    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)

    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)

    def _variance(self,gp):
        """
        Variance of the output given the latent function
        """
        return self.variance

    def _dvariance_dgp(self,gp):
        return 0

    def _d2variance_dgp2(self,gp):
        return 0
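
The Gaussian EP update above is the classic product of two Gaussians: N(f|v/tau, 1/tau) * N(y|f, var) is proportional to N(f|mu_hat, sigma2_hat), with normalizer N(y|v/tau, var + 1/tau). A standalone check (not GPy code):

import numpy as np
from scipy import stats
from scipy.integrate import quad

var, y, tau, v = 0.5, 1.2, 2.0, 0.8
tilted = lambda f: stats.norm.pdf(y, f, np.sqrt(var)) * stats.norm.pdf(f, v/tau, np.sqrt(1./tau))
Z, _  = quad(tilted, -20, 20)
m1, _ = quad(lambda f: f*tilted(f), -20, 20)

s2_hat = 1./(1./var + tau)
mu_hat = s2_hat*(y/var + v)
Z_hat = stats.norm.pdf(y, v/tau, np.sqrt(var + 1./tau))
print(np.allclose([Z, m1/Z], [Z_hat, mu_hat]))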
@@ -1,133 +0,0 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats
import scipy as sp
import pylab as pb
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf

class GPTransformation(object):
    """
    Link function class for non-Gaussian likelihood approximations: maps the
    latent function f to the space of the output's expected value.

    .. note:: The output values allowed depend on the likelihood_function used
    """
    def __init__(self):
        pass

    def transf(self,f):
        """
        Gaussian process transformation function, latent space -> output space
        """
        pass

    def dtransf_df(self,f):
        """
        derivative of transf(f) w.r.t. f
        """
        pass

    def d2transf_df2(self,f):
        """
        second derivative of transf(f) w.r.t. f
        """
        pass

class Identity(GPTransformation):
    """
    .. math::

        g(f) = f

    """
    def transf(self,f):
        return f

    def dtransf_df(self,f):
        return 1.

    def d2transf_df2(self,f):
        return 0


class Probit(GPTransformation):
    """
    Probit link: the transformation is the standard normal cdf,

    .. math::

        g(f) = \\Phi(f)

    """
    def transf(self,f):
        return std_norm_cdf(f)

    def dtransf_df(self,f):
        return std_norm_pdf(f)

    def d2transf_df2(self,f):
        return -f * std_norm_pdf(f)

class Log(GPTransformation):
    """
    Log link (f = \\log\\mu), so the transformation is

    .. math::

        g(f) = \\exp(f)

    """
    def transf(self,f):
        return np.exp(f)

    def dtransf_df(self,f):
        return np.exp(f)

    def d2transf_df2(self,f):
        return np.exp(f)

class Log_ex_1(GPTransformation):
    """
    Softplus link (f = \\log(\\exp(\\mu) - 1)), so the transformation is

    .. math::

        g(f) = \\log(1 + \\exp(f))

    """
    def transf(self,f):
        return np.log(1.+np.exp(f))

    def dtransf_df(self,f):
        return np.exp(f)/(1.+np.exp(f))

    def d2transf_df2(self,f):
        aux = np.exp(f)/(1.+np.exp(f))
        return aux*(1.-aux)

class Reciprocal(GPTransformation):
    def transf(self,f):
        return 1./f

    def dtransf_df(self,f):
        return -1./f**2

    def d2transf_df2(self,f):
        return 2./f**3

class Heaviside(GPTransformation):
    """

    .. math::

        g(f) = I_{\\{f > 0\\}}

    """
    def transf(self,f):
        return np.where(f>0, 1, 0)

    def dtransf_df(self,f):
        raise NotImplementedError, "This function is not differentiable!"

    def d2transf_df2(self,f):
        raise NotImplementedError, "This function is not differentiable!"
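
A standalone finite-difference check (not GPy code) of the Log_ex_1 (softplus) derivatives above: dtransf_df is the logistic sigmoid and d2transf_df2 is sig*(1-sig):

import numpy as np

transf  = lambda f: np.log(1. + np.exp(f))
dtransf = lambda f: np.exp(f)/(1. + np.exp(f))
f, eps = 0.4, 1e-6
print(np.allclose(dtransf(f), (transf(f + eps) - transf(f - eps))/(2.*eps)))
sig = dtransf(f)
print(np.allclose(sig*(1. - sig), (dtransf(f + eps) - dtransf(f - eps))/(2.*eps)))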
@@ -1,425 +0,0 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats,special
import scipy as sp
import pylab as pb
from GPy.util.plot import gpplot
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
import gp_transformations


class NoiseDistribution(object):
    """
    Likelihood class for doing Expectation Propagation

    :param Y: observed output (Nx1 numpy.darray)

    .. note:: Y values allowed depend on the LikelihoodFunction used
    """
    def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
        assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
        self.gp_link = gp_link
        self.analytical_mean = analytical_mean
        self.analytical_variance = analytical_variance
        if self.analytical_mean:
            self.moments_match = self._moments_match_analytical
            self.predictive_mean = self._predictive_mean_analytical
        else:
            self.moments_match = self._moments_match_numerical
            self.predictive_mean = self._predictive_mean_numerical
        if self.analytical_variance:
            self.predictive_variance = self._predictive_variance_analytical
        else:
            self.predictive_variance = self._predictive_variance_numerical

    def _get_params(self):
        return np.zeros(0)

    def _get_param_names(self):
        return []

    def _set_params(self,p):
        pass

    def _gradients(self,partial):
        return np.zeros(0)

    def _preprocess_values(self,Y):
        """
        In case it is needed, this function assesses the output values or makes any pertinent transformation on them.

        :param Y: observed output
        :type Y: Nx1 numpy.darray
        """
        return Y

    def _product(self,gp,obs,mu,sigma):
        """
        Product between the cavity distribution and a likelihood factor.

        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)

    def _nlog_product_scaled(self,gp,obs,mu,sigma):
        """
        Negative log-product between the cavity distribution and a likelihood factor.

        .. note:: The constant term in the Gaussian distribution is ignored.

        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)

    def _dnlog_product_dgp(self,gp,obs,mu,sigma):
        """
        Derivative wrt the latent variable of the negative log-product between the cavity distribution and a likelihood factor.

        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)

    def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
        """
        Second derivative wrt the latent variable of the negative log-product between the cavity distribution and a likelihood factor.

        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)

    def _product_mode(self,obs,mu,sigma):
        """
        Newton-CG search for the mode of _product (cavity x likelihood factor).

        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False)

    def _moments_match_analytical(self,obs,tau,v):
        """
        If available, this function computes the moments analytically.
        """
        pass

    def _moments_match_numerical(self,obs,tau,v):
        """
        Laplace approximation to calculate the moments.

        :param obs: observed output
        :param tau: cavity distribution 1st natural parameter (precision)
        :param v: cavity distribution 2nd natural parameter (mu*precision)
        """
        mu = v/tau
        mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
        sigma2_hat = 1./(tau + self._d2nlog_mass_dgp2(mu_hat,obs))
        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self._mass(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
        return Z_hat,mu_hat,sigma2_hat
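
A standalone sketch (not GPy code) of the Laplace moment matching above, using a Poisson likelihood with exp link; a hand-rolled Newton loop stands in for the fmin_ncg call in _product_mode:

import numpy as np
from scipy import stats
from scipy.integrate import quad

obs, tau, v = 3., 0.8, 0.2
mu, sigma2 = v/tau, 1./tau                 # cavity mean and variance
d1 = lambda f: np.exp(f) - obs             # d/df of -log p(obs|f) with rate exp(f)
d2 = lambda f: np.exp(f)

f_hat = mu                                 # Newton's method on the tilted -log density
for _ in range(50):
    f_hat -= ((f_hat - mu)/sigma2 + d1(f_hat)) / (1./sigma2 + d2(f_hat))

mass = lambda f: stats.poisson.pmf(obs, np.exp(f))
s2_hat = 1./(tau + d2(f_hat))
Z_hat = np.exp(-.5*tau*(f_hat - mu)**2) * mass(f_hat) * np.sqrt(tau*s2_hat)

Z_num, _ = quad(lambda f: mass(f)*stats.norm.pdf(f, mu, np.sqrt(sigma2)), -10., 10.)
print(Z_hat, Z_num)                        # close but not equal: Laplace is approximate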
    def _nlog_conditional_mean_scaled(self,gp,mu,sigma):
        """
        Negative logarithm of the latent variable's predictive distribution times the output's mean given the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation

        .. note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
        """
        return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))

    def _dnlog_conditional_mean_dgp(self,gp,mu,sigma):
        """
        Derivative of _nlog_conditional_mean_scaled wrt the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)

    def _d2nlog_conditional_mean_dgp2(self,gp,mu,sigma):
        """
        Second derivative of _nlog_conditional_mean_scaled wrt the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2

    def _nlog_exp_conditional_variance_scaled(self,gp,mu,sigma):
        """
        Negative logarithm of the latent variable's predictive distribution times the output's variance given the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation

        .. note:: This function helps computing E(V(Y_star|f_star))
        """
        return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp))

    def _dnlog_exp_conditional_variance_dgp(self,gp,mu,sigma):
        """
        Derivative of _nlog_exp_conditional_variance_scaled wrt the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp)

    def _d2nlog_exp_conditional_variance_dgp2(self,gp,mu,sigma):
        """
        Second derivative of _nlog_exp_conditional_variance_scaled wrt the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2

    def _nlog_exp_conditional_mean_sq_scaled(self,gp,mu,sigma):
        """
        Negative logarithm of the latent variable's predictive distribution times the output's squared mean given the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation

        .. note:: This function helps computing E( E(Y_star|f_star)**2 )
        """
        return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))

    def _dnlog_exp_conditional_mean_sq_dgp(self,gp,mu,sigma):
        """
        Derivative of _nlog_exp_conditional_mean_sq_scaled wrt the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)

    def _d2nlog_exp_conditional_mean_sq_dgp2(self,gp,mu,sigma):
        """
        Second derivative of _nlog_exp_conditional_mean_sq_scaled wrt the latent variable.

        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )

    def _predictive_mean_analytical(self,mu,sigma):
        """
        If available, this function computes the predictive mean analytically.
        """
        pass

    def _predictive_variance_analytical(self,mu,sigma):
        """
        If available, this function computes the predictive variance analytically.
        """
        pass

    def _predictive_mean_numerical(self,mu,sigma):
        """
        Laplace approximation to the predictive mean: E(Y_star) = E( E(Y_star|f_star) )

        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False)
        mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma)
        #debug plotting, left disabled as a string literal:
        """
        pb.figure()
        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
        f = np.array([np.exp(-self._nlog_conditional_mean_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
        pb.plot(x,f,'b-')
        sigma2 = 1./self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma)
        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
        k = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
        pb.plot(x,f2*mean,'r-')
        pb.vlines(maximum,0,f.max())
        """
        return mean

    def _predictive_mean_sq(self,mu,sigma):
        """
        Laplace approximation to the predictive mean squared: E(Y_star**2) = E( E(Y_star|f_star)**2 )

        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False)
        mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma)
        return mean_squared

    def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None):
        """
        Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )

        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        :param predictive_mean: output's predictive mean; if None, the predictive_mean function will be called.
        """
        # E( V(Y_star|f_star) )
        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False)
        exp_var = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma))*sigma)

        #debug plotting, left disabled as a string literal:
        """
        pb.figure()
        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
        f = np.array([np.exp(-self._nlog_exp_conditional_variance_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
        pb.plot(x,f,'b-')
        sigma2 = 1./self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma)
        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
        k = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
        pb.plot(x,f2*exp_var,'r--')
        pb.vlines(maximum,0,f.max())
        """

        #V( E(Y_star|f_star) ) = E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
        exp_exp2 = self._predictive_mean_sq(mu,sigma)
        if predictive_mean is None:
            predictive_mean = self.predictive_mean(mu,sigma)
        var_exp = exp_exp2 - predictive_mean**2
        return exp_var + var_exp

    def _predictive_percentiles(self,p,mu,sigma):
        """
        Percentiles of the predictive distribution

        :param p: lower tail probability
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        qf = stats.norm.ppf(p,mu,sigma)
        return self.gp_link.transf(qf)
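
No integration is needed in _predictive_percentiles: for a monotone non-decreasing link, quantiles of transf(f) are transf of the latent quantiles, which is what the code exploits. For example (standalone, not GPy code), with the softplus link:

import numpy as np
from scipy import stats

mu, sigma, p = 0.5, 0.8, 0.975
qf = stats.norm.ppf(p, mu, sigma)     # latent 97.5th percentile
qy = np.log(1. + np.exp(qf))          # softplus of it = 97.5th percentile of transf(f)
print(qf, qy)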
    def _nlog_joint_predictive_scaled(self,x,mu,sigma):
        """
        Negative logarithm of the joint predictive distribution (latent variable and output).

        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
        """
        return self._nlog_product_scaled(x[0],x[1],mu,sigma)

    def _gradient_nlog_joint_predictive(self,x,mu,sigma):
        """
        Gradient of _nlog_joint_predictive_scaled.

        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation

        .. note:: Only available when the output is continuous
        """
        assert not self.discrete, "Gradient not available for discrete outputs."
        return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0])))

    def _hessian_nlog_joint_predictive(self,x,mu,sigma):
        """
        Hessian of _nlog_joint_predictive_scaled.

        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation

        .. note:: Only available when the output is continuous
        """
        assert not self.discrete, "Hessian not available for discrete outputs."
        cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1])
        return np.array((self._d2nlog_product_dgp2(gp=x[0],obs=x[1],mu=mu,sigma=sigma),cross_derivative,cross_derivative,self._d2nlog_mass_dobs2(obs=x[1],gp=x[0]))).reshape(2,2)

    def _joint_predictive_mode(self,mu,sigma):
        """
        Mode of the joint predictive distribution (latent variable and output), found by Newton-CG.

        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
        """
        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)

    def predictive_values(self,mu,var):
        """
        Compute the mean, variance and confidence interval (percentiles 2.5 and 97.5) of the prediction.

        :param mu: mean of the latent variable
        :param var: variance of the latent variable
        """
        if isinstance(mu,float) or isinstance(mu,int):
            mu = [mu]
            var = [var]
        pred_mean = []
        pred_var = []
        q1 = []
        q3 = []
        for m,s in zip(mu,np.sqrt(var)):
            pred_mean.append(self.predictive_mean(m,s))
            pred_var.append(self.predictive_variance(m,s,pred_mean[-1]))
            q1.append(self._predictive_percentiles(.025,m,s))
            q3.append(self._predictive_percentiles(.975,m,s))
        pred_mean = np.vstack(pred_mean)
        pred_var = np.vstack(pred_var)
        q1 = np.vstack(q1)
        q3 = np.vstack(q3)
        return pred_mean, pred_var, q1, q3


    def samples(self, gp):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

        :param gp: latent variable
        """
        pass
@@ -1,69 +0,0 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats,special
import scipy as sp
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
import gp_transformations
from noise_distributions import NoiseDistribution

class Poisson(NoiseDistribution):
    """
    Poisson likelihood

    .. math::
        L(x) = \\exp(-\\lambda) * \\frac{\\lambda^{Y_i}}{Y_i!}

    .. note:: Y is expected to take values in {0,1,2,...}
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)

    def _preprocess_values(self,Y): #TODO
        return Y

    def _mass(self,gp,obs):
        """
        Mass (or density) function
        """
        return stats.poisson.pmf(obs,self.gp_link.transf(gp))

    def _nlog_mass(self,gp,obs):
        """
        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
        """
        return self.gp_link.transf(gp) - obs * np.log(self.gp_link.transf(gp)) + np.log(special.gamma(obs+1))

    def _dnlog_mass_dgp(self,gp,obs):
        return self.gp_link.dtransf_df(gp) * (1. - obs/self.gp_link.transf(gp))

    def _d2nlog_mass_dgp2(self,gp,obs):
        d2_df = self.gp_link.d2transf_df2(gp)
        transf = self.gp_link.transf(gp)
        return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df

    def _mean(self,gp):
        """
        Mean of the output given the latent function
        """
        return self.gp_link.transf(gp)

    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)

    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)

    def _variance(self,gp):
        """
        Variance of the output given the latent function
        """
        return self.gp_link.transf(gp)

    def _dvariance_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)

    def _d2variance_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)
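
A standalone finite-difference check (not GPy code) of _dnlog_mass_dgp above, with the softplus link g(f) = log(1 + exp(f)):

import numpy as np
from scipy import special

obs, f, eps = 4., 0.7, 1e-6
g  = lambda x: np.log(1. + np.exp(x))
dg = lambda x: np.exp(x)/(1. + np.exp(x))
nlm = lambda x: g(x) - obs*np.log(g(x)) + np.log(special.gamma(obs + 1.))
grad = dg(f)*(1. - obs/g(f))
print(np.allclose(grad, (nlm(f + eps) - nlm(f - eps))/(2.*eps)))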