diff --git a/GPy/likelihoods/ep.py b/GPy/inference/ep.py
similarity index 94%
rename from GPy/likelihoods/ep.py
rename to GPy/inference/ep.py
index 4fedd66b..aa106067 100644
--- a/GPy/likelihoods/ep.py
+++ b/GPy/inference/ep.py
@@ -19,7 +19,6 @@ class EP(likelihood):
         self.num_data, self.output_dim = self.data.shape
         self.is_heteroscedastic = True
         self.num_params = 0
-        self._transf_data = self.noise_model._preprocess_values(data)
 
         #Initial values - Likelihood approximation parameters:
         #p(y|f) = t(f|tau_tilde,v_tilde)
@@ -50,10 +49,26 @@ class EP(likelihood):
         self.VVT_factor = self.V
         self.trYYT = 0.
 
-    def predictive_values(self,mu,var,full_cov):
+    def predictive_values(self,mu,var,full_cov,**noise_args):
         if full_cov:
             raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var)
+        return self.noise_model.predictive_values(mu,var,**noise_args)
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math::
+            p(y_{*}|D) = \int p(y_{*}|f_{*})p(f_{*}|\mu_{*},\sigma^{2}_{*}) df_{*}
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
 
     def _get_params(self):
         #return np.zeros(0)
@@ -134,7 +149,7 @@ class EP(likelihood):
             self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
             self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
             #Marginal moments
-            self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+            self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
             #Site parameters update
             Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
             Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
@@ -233,7 +248,7 @@ class EP(likelihood):
             self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
             self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
             #Marginal moments
-            self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+            self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
             #Site parameters update
             Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
             Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@@ -336,7 +351,7 @@ class EP(likelihood):
             self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
             self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
             #Marginal moments
-            self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+            self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
             #Site parameters update
             Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
             Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
diff --git a/GPy/inference/laplace.py b/GPy/inference/laplace.py
new file mode 100644
index 00000000..0def0c8b
--- /dev/null
+++ b/GPy/inference/laplace.py
@@ -0,0 +1,403 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+#
+#Parts of this file were influenced by the Matlab GPML framework written by
+#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
+#
+#The GPML code is released under the FreeBSD License.
+#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
+#
+#The code and associated documentation is available from
+#http://gaussianprocess.org/gpml/code.
+
+import numpy as np
+import scipy as sp
+from likelihood import likelihood
+from ..util.linalg import mdot, jitchol, pddet, dpotrs
+from functools import partial as partial_func
+import warnings
+
+class Laplace(likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self, data, noise_model, extra_data=None):
+        """
+        Laplace Approximation
+
+        Find the mode \hat{f} of the unnormalised posterior, and the hessian
+        at this point (using Newton-Raphson).
+
+        Compute the GP variables (i.e. generate some Y^{\\tilde} and
+        Z^{\\tilde} which make a gaussian the same as the laplace
+        approximation to the posterior, but normalised).
+
+        Arguments
+        ---------
+
+        :param data: array of data the likelihood function is approximating
+        :type data: NxD
+        :param noise_model: likelihood function - subclass of noise_model
+        :type noise_model: noise_model
+        :param extra_data: additional data used by some likelihood functions
+        """
+        self.data = data
+        self.noise_model = noise_model
+        self.extra_data = extra_data
+
+        #Initial values
+        self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
+        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
+
+        self.restart()
+        likelihood.__init__(self)
+
+    def restart(self):
+        """
+        Reset likelihood variables to their defaults
+        """
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N, 1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:, None]
+        self.Z = 0
+        self.YYT = None
+
+        self.old_Ki_f = None
+        self.bad_fhat = False
+
+    def predictive_values(self,mu,var,full_cov,**noise_args):
+        if full_cov:
+            raise NotImplementedError, "Cannot make correlated predictions with a Laplace likelihood"
+        return self.noise_model.predictive_values(mu,var,**noise_args)
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math::
+            p(y_{*}|D) = \int p(y_{*}|f_{*})p(f_{*}|\mu_{*},\sigma^{2}_{*}) df_{*}
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
+
+    def _get_params(self):
+        return np.asarray(self.noise_model._get_params())
+
+    def _get_param_names(self):
+        return self.noise_model._get_param_names()
+
+    def _set_params(self, p):
+        return self.noise_model._set_params(p)
+
+    def _shared_gradients_components(self):
+        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
+        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
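+        #(the -0.5 from differentiating the log determinant meets a second minus
+        # from dW/df = -d3logpdf_df3, since W = -d2logpdf_df2, which appears to give
+        # +0.5*diag((K^{-1} + W)^{-1})*d3logpdf_df3; cf. Rasmussen & Williams 2006, eq. 5.23)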
+        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
+        return dL_dfhat, I_KW_i
+
+    def _Kgradients(self):
+        """
+        Gradients with respect to prior kernel parameters dL_dK to be chained
+        with dK_dthetaK to give dL_dthetaK
+        :returns: dL_dK matrix
+        :rtype: NxN matrix
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #Explicit
+        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
+        #expl_b = self.Wi_K_i
+        #expl = 0.5*expl_a - 0.5*expl_b
+        #dL_dthetaK_exp = dK_dthetaK(expl, X)
+
+        #Implicit
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
+
+        #No longer required as we are computing these in the gp already
+        #otherwise we would take them away and add them back
+        #dL_dthetaK_imp = dK_dthetaK(impl, X)
+        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
+        #dL_dK = expl + impl
+
+        #No need to compute the explicit part as we are computing dZ_dK to account
+        #for the difference between the K gradients of a normal GP,
+        #and the K gradients including the implicit part
+        dL_dK = impl
+        return dL_dK
+
+    def _gradients(self, partial):
+        """
+        Gradients with respect to likelihood parameters (dL_dthetaL)
+
+        :param partial: Not needed by this likelihood
+        :type partial: lambda function
+        :rtype: array of derivatives (1 x num_likelihood_params)
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #len(dlik_dthetaL)
+        num_params = len(self._get_param_names())
+        # make space for one derivative for each likelihood parameter
+        dL_dthetaL = np.zeros(num_params)
+        for thetaL_i in range(num_params):
+            #Explicit
+            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
+                              #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
+                              + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
+                             )
+
+            #Implicit
+            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
+            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+
+        return dL_dthetaL
+
+    def _compute_GP_variables(self):
+        """
+        Generate data Y which would give the normal distribution identical
+        to the laplace approximation to the posterior, but normalised
+
+        GPy expects a likelihood to be gaussian, so we need to calculate
+        the data Y^{\\tilde} that makes the posterior match that found
+        by a laplace approximation to a non-gaussian likelihood, but with
+        a gaussian likelihood
+
+        Firstly,
+        the hessian of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
+        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}), but this assumes a non-gaussian likelihood;
+        we wish to find the covariance \Sigma^{\\tilde}
+        that has the same curvature but uses our new simulated data Y^{\\tilde},
+        i.e. we do N(Y^{\\tilde}|f^{\hat}, \Sigma^{\\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
+        and we wish to find what Y^{\\tilde} and \Sigma^{\\tilde} are.
+        We find that Y^{\\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{\\tilde} = W^{-1}
+
+        Secondly,
+        GPy optimizes the log marginal likelihood
+        log p(y) = -0.5*ln|K + \Sigma^{\\tilde}| - 0.5*Y^{\\tilde T}(K + \Sigma^{\\tilde})^{-1}Y^{\\tilde} + lik.Z
+        So we can suck up any differences between that and our log marginal likelihood approximation
+        p^{\\tilde}(y) = -0.5*f^{\hat T}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log(|K||K^{-1} + W|)
+        which we want to optimize instead: by equating them and rearranging, the difference is added onto
+        the log p(y) that GPy optimizes by default
+
+        Thirdly,
+        since we have gradients that depend on how we move f^{\hat}, we have implicit components
+        as well as the explicit dL_dK; we hold these differences in dZ_dK and add them to dL_dK in the
+        gp.py code
+        """
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
+
+        Y_tilde = Wi*self.Ki_f + self.f_hat
+
+        self.Wi_K_i = self.W12BiW12
+        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+
+        Z_tilde = (+ lik
+                   - 0.5*self.ln_B_det
+                   + 0.5*ln_det_Wi_K
+                   - 0.5*self.f_Ki_f
+                   + 0.5*y_Wi_K_i_y
+                   + self.NORMAL_CONST
+                  )
+
+        #Convert to float as it's (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
+
+        #Compute dZ_dK, which is how the approximated distribution's gradients differ from the dL_dK computed for other likelihoods
+        self.dZ_dK = self._Kgradients()
+        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients' explicit part there's no need to compute this again
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm: find the mode and expand the hessian around it.
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
+
+        :param K: Prior covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        """
+        self.K = K.copy()
+
+        #Find the mode
+        self.f_hat = self.rasm_mode(self.K)
+
+        #Compute the hessian and other variables at the mode
+        self._compute_likelihood_variables()
+
+        #Compute fake variables replicating the laplace approximation to the posterior
+        self._compute_GP_variables()
+
+    def _compute_likelihood_variables(self):
+        """
+        Compute the variables required to compute gaussian Y variables
+        """
+        #At this point get the hessian matrix (or vector, as W is diagonal)
+        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
+
+        if not self.noise_model.log_concave:
+            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
+            self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+
+        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
+
+        self.Ki_f = self.Ki_f
+        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
+        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
+
+    def _compute_B_statistics(self, K, W, a):
+        """
+        Rasmussen suggests the use of a numerically stable positive definite matrix B,
+        which has positive diagonal elements and can be easily inverted
+
+        :param K: Prior covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param W: Negative hessian at a point (diagonal matrix)
+        :type W: Vector of diagonal values of hessian (1xN)
+        :param a: Matrix to calculate W12BiW12a
+        :type a: Matrix NxN
+        :returns: (W12BiW12a, ln_B_det)
+        """
+        if not self.noise_model.log_concave:
+            #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
+            W[W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
+                               # if the likelihood is non-log-concave. We want to allow a negative variance,
+                               # which causes the posterior to become less certain than the prior and likelihood;
+                               # this is a property only held by non-log-concave likelihoods
+
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(self.N) + W_12*K*W_12.T
+        L = jitchol(B)
+
+        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
+        ln_B_det = 2*np.sum(np.log(np.diag(L)))
+        return W12BiW12a, ln_B_det
+
+    def rasm_mode(self, K, MAX_ITER=40):
+        """
+        Rasmussen's numerically stable mode finding.
+        For nomenclature see Rasmussen & Williams 2006.
+        Influenced by GPML (BSD) code, all errors are our own
+
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :type MAX_ITER: scalar
+        :returns: f_hat, mode on which to make the laplace approximation
+        :rtype: NxD matrix
+        """
+        #old_Ki_f = np.zeros((self.N, 1))
+
+        #Start f's at zero originally, or if we have gone off track, try restarting
+        if self.old_Ki_f is None or self.bad_fhat:
+            old_Ki_f = np.random.rand(self.N, 1)/50.0
+            #old_Ki_f = self.Y
+            f = np.dot(K, old_Ki_f)
+        else:
+            #Start at the old best point
+            old_Ki_f = self.old_Ki_f.copy()
+            f = self.f_hat.copy()
+
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
+
+        difference = np.inf
+        epsilon = 1e-7
+        #step_size = 1
+        #rs = 0
+        i = 0
+
+        while difference > epsilon and i < MAX_ITER:
+            W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)
+
+            W_f = W*f
+            grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)
+
+            b = W_f + grad
+            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb
+            dKi_f = full_step_Ki_f - old_Ki_f
+
+            f_old = f.copy()
+            def inner_obj(step_size, old_Ki_f, dKi_f, K):
+                Ki_f = old_Ki_f + step_size*dKi_f
+                f = np.dot(K, Ki_f)
+                # This is nasty, need to set something within an optimization though
+                self.tmp_Ki_f = Ki_f.copy()
+                self.tmp_f = f.copy()
+                return -obj(Ki_f, f)
+
+            i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
+            #Find the stepsize that minimizes the objective function using a brent line search.
+            #The tolerance and maxiter matter for speed! It seems best to keep them low and take more full
+            #steps than to get this exact before making a step; if B were bigger it might be the other way around though
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
+            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
+            f = self.tmp_f.copy()
+            Ki_f = self.tmp_Ki_f.copy()
+
+            #Optimize without linesearch
+            #f_old = f.copy()
+            #update_passed = False
+            #while not update_passed:
+                #Ki_f = old_Ki_f + step_size*dKi_f
+                #f = np.dot(K, Ki_f)
+
+                #old_obj = new_obj
+                #new_obj = obj(Ki_f, f)
+                #difference = new_obj - old_obj
+                ##print "difference: ",difference
+                #if difference < 0:
+                    ##print "Objective function rose", np.float(difference)
+                    ##If the objective function isn't rising, restart optimization
+                    #step_size *= 0.8
+                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    ##objective function isn't increasing, try reducing step size
+                    #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    #old_obj = new_obj
+                    #rs += 1
+                #else:
+                    #update_passed = True
+
+            #old_Ki_f = self.Ki_f.copy()
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
+            #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
+            old_Ki_f = Ki_f.copy()
+            i += 1
+
+        self.old_Ki_f = old_Ki_f.copy()
+
+        #Warn of bad fits
+        if difference > epsilon:
+            self.bad_fhat = True
+            warnings.warn("f_hat fit is not perfect; difference: {}".format(difference))
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat fit has recovered")
+
+        self.Ki_f = Ki_f
+        return f
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
deleted file mode 100644
index 0cb62eb0..00000000
--- a/GPy/likelihoods/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from ep import EP
-from ep_mixed_noise import EP_Mixed_Noise
-from gaussian import Gaussian
-from gaussian_mixed_noise import Gaussian_Mixed_Noise
-from noise_model_constructors import *
-# TODO: from Laplace import Laplace
-
diff --git a/GPy/likelihoods/ep_mixed_noise.py b/GPy/likelihoods/ep_mixed_noise.py
deleted file mode 100644
index f5452512..00000000
--- a/GPy/likelihoods/ep_mixed_noise.py
+++ /dev/null
@@ -1,385 +0,0 @@
-# Copyright (c) 2013, Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats
-from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
-from likelihood import likelihood
-
-class EP_Mixed_Noise(likelihood):
-    def __init__(self,data_list,noise_model_list,epsilon=1e-3,power_ep=[1.,1.]):
-        """
-        Expectation Propagation
-
-        Arguments
-        ---------
-        :param data_list: list of outputs
-        :param noise_model_list: a list of noise models
-        :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations
-        :type epsilon: float
-        :param power_ep: list of power ep parameters
-        """
-        assert len(data_list) == len(noise_model_list)
-        self.noise_model_list = noise_model_list
-        n_list = [data.size for data in data_list]
-        self.n_models = len(data_list)
-        self.n_params = [noise_model._get_params().size for noise_model in noise_model_list]
-        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.n_models),n_list)])
-        self.epsilon = epsilon
-        self.eta, self.delta = power_ep
-        self.data = np.vstack(data_list)
-        self.N, self.output_dim = self.data.shape
-        self.is_heteroscedastic = True
-        self.num_params = 0 #FIXME
-        self._transf_data = np.vstack([noise_model._preprocess_values(data) for noise_model,data in zip(noise_model_list,data_list)])
-        #TODO non-gaussian index
-
-        #Initial values - Likelihood approximation parameters:
-        #p(y|f) = t(f|tau_tilde,v_tilde)
-        self.tau_tilde = np.zeros(self.N)
-        self.v_tilde = np.zeros(self.N)
-
-        #initial values for the GP variables
-        self.Y = np.zeros((self.N,1))
-        self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
-        self.Z = 0
-        self.YYT = None
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = 0.
-
-    def restart(self):
-        self.tau_tilde = np.zeros(self.N)
-        self.v_tilde = np.zeros(self.N)
-        self.Y = np.zeros((self.N,1))
-        self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
-        self.Z = 0
-        self.YYT = None
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = 0.
-
-    def predictive_values(self,mu,var,full_cov,noise_model):
-        """
-        Predicts the output given the GP
-
-        :param mu: GP's mean
-        :param var: GP's variance
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: False|True
-        :param noise_model: noise model to use
-        :type noise_model: integer
-        """
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        #_mu = []
-        #_var = []
-        #_q1 = []
-        #_q2 = []
-        #for m,v,o in zip(mu,var,output.flatten()):
-        #    a,b,c,d = self.noise_model_list[int(o)].predictive_values(m,v)
-        #    _mu.append(a)
-        #    _var.append(b)
-        #    _q1.append(c)
-        #    _q2.append(d)
-        #return np.vstack(_mu),np.vstack(_var),np.vstack(_q1),np.vstack(_q2)
-        return self.noise_model_list[noise_model].predictive_values(mu,var)
-
-    def _get_params(self):
-        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
-
-    def _get_param_names(self):
-        names = []
-        for noise_model in self.noise_model_list:
-            names += noise_model._get_param_names()
-        return names
-
-    def _set_params(self,p):
-        cs_params = np.cumsum([0]+self.n_params)
-        for i in range(len(self.n_params)):
-            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
-
-    def _gradients(self,partial):
-        #NOTE this is not tested
-        return np.hstack([noise_model._gradients(partial) for noise_model in self.noise_model_list])
-
-    def _compute_GP_variables(self):
-        #Variables to be called from GP
-        mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
-        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
-        mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
-        self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
-
-        self.Y = mu_tilde[:,None]
-        self.YYT = np.dot(self.Y,self.Y.T)
-        self.covariance_matrix = np.diag(1./self.tau_tilde)
-        self.precision = self.tau_tilde[:,None]
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = np.trace(self.YYT)
-
-    def fit_full(self,K):
-        """
-        The expectation-propagation algorithm.
-        For nomenclature see Rasmussen & Williams 2006.
- """ - #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) - mu = np.zeros(self.N) - Sigma = K.copy() - - """ - Initial values - Cavity distribution parameters: - q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.N,dtype=float) - self.v_ = np.empty(self.N,dtype=float) - - #Initial values - Marginal moments - z = np.empty(self.N,dtype=float) - self.Z_hat = np.empty(self.N,dtype=float) - phi = np.empty(self.N,dtype=float) - mu_hat = np.empty(self.N,dtype=float) - sigma2_hat = np.empty(self.N,dtype=float) - - #Approximation - epsilon_np1 = self.epsilon + 1. - epsilon_np2 = self.epsilon + 1. - self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.random.permutation(self.N) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] - self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) - self.tau_tilde[i] += Delta_tau - self.v_tilde[i] += Delta_v - #Posterior distribution parameters update - DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i]))) - mu = np.dot(Sigma,self.v_tilde) - self.iterations += 1 - #Sigma recomptutation with Cholesky decompositon - Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K - B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K - L = jitchol(B) - V,info = dtrtrs(L,Sroot_tilde_K,lower=1) - Sigma = K - np.dot(V.T,V) - mu = np.dot(Sigma,self.v_tilde) - epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N - epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N - self.np1.append(self.tau_tilde.copy()) - self.np2.append(self.v_tilde.copy()) - - return self._compute_GP_variables() - - def fit_DTC(self, Kmm, Kmn): - """ - The expectation-propagation algorithm with sparse pseudo-input. - For nomenclature see ... 2013. - """ - num_inducing = Kmm.shape[0] - - #TODO: this doesn't work with uncertain inputs! 
- - """ - Prior approximation parameters: - q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) - Sigma0 = Qnn = Knm*Kmmi*Kmn - """ - KmnKnm = np.dot(Kmn,Kmn.T) - Lm = jitchol(Kmm) - Lmi = chol_inv(Lm) - Kmmi = np.dot(Lmi.T,Lmi) - KmmiKmn = np.dot(Kmmi,Kmn) - Qnn_diag = np.sum(Kmn*KmmiKmn,-2) - LLT0 = Kmm.copy() - - #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm) - #KmnKnm = np.dot(Kmn, Kmn.T) - #KmmiKmn = np.dot(Kmmi,Kmn) - #Qnn_diag = np.sum(Kmn*KmmiKmn,-2) - #LLT0 = Kmm.copy() - - """ - Posterior approximation: q(f|y) = N(f| mu, Sigma) - Sigma = Diag + P*R.T*R*P.T + K - mu = w + P*Gamma - """ - mu = np.zeros(self.N) - LLT = Kmm.copy() - Sigma_diag = Qnn_diag.copy() - - """ - Initial values - Cavity distribution parameters: - q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.N,dtype=float) - self.v_ = np.empty(self.N,dtype=float) - - #Initial values - Marginal moments - z = np.empty(self.N,dtype=float) - self.Z_hat = np.empty(self.N,dtype=float) - phi = np.empty(self.N,dtype=float) - mu_hat = np.empty(self.N,dtype=float) - sigma2_hat = np.empty(self.N,dtype=float) - - #Approximation - epsilon_np1 = 1 - epsilon_np2 = 1 - self.iterations = 0 - np1 = [self.tau_tilde.copy()] - np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.random.permutation(self.N) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) - self.tau_tilde[i] += Delta_tau - self.v_tilde[i] += Delta_v - #Posterior distribution parameters update - DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau - L = jitchol(LLT) - #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau)) - V,info = dtrtrs(L,Kmn,lower=1) - Sigma_diag = np.sum(V*V,-2) - si = np.sum(V.T*V[:,i],-1) - mu += (Delta_v-Delta_tau*mu[i])*si - self.iterations += 1 - #Sigma recomputation with Cholesky decompositon - LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T) - L = jitchol(LLT) - V,info = dtrtrs(L,Kmn,lower=1) - V2,info = dtrtrs(L.T,V,lower=0) - Sigma_diag = np.sum(V*V,-2) - Knmv_tilde = np.dot(Kmn,self.v_tilde) - mu = np.dot(V2.T,Knmv_tilde) - epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N - epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N - np1.append(self.tau_tilde.copy()) - np2.append(self.v_tilde.copy()) - - self._compute_GP_variables() - - def fit_FITC(self, Kmm, Kmn, Knn_diag): - """ - The expectation-propagation algorithm with sparse pseudo-input. - For nomenclature see Naish-Guzman and Holden, 2008. 
- """ - num_inducing = Kmm.shape[0] - - """ - Prior approximation parameters: - q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) - Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn - """ - Lm = jitchol(Kmm) - Lmi = chol_inv(Lm) - Kmmi = np.dot(Lmi.T,Lmi) - P0 = Kmn.T - KmnKnm = np.dot(P0.T, P0) - KmmiKmn = np.dot(Kmmi,P0.T) - Qnn_diag = np.sum(P0.T*KmmiKmn,-2) - Diag0 = Knn_diag - Qnn_diag - R0 = jitchol(Kmmi).T - - """ - Posterior approximation: q(f|y) = N(f| mu, Sigma) - Sigma = Diag + P*R.T*R*P.T + K - mu = w + P*Gamma - """ - self.w = np.zeros(self.N) - self.Gamma = np.zeros(num_inducing) - mu = np.zeros(self.N) - P = P0.copy() - R = R0.copy() - Diag = Diag0.copy() - Sigma_diag = Knn_diag - RPT0 = np.dot(R0,P0.T) - - """ - Initial values - Cavity distribution parameters: - q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.N,dtype=float) - self.v_ = np.empty(self.N,dtype=float) - - #Initial values - Marginal moments - z = np.empty(self.N,dtype=float) - self.Z_hat = np.empty(self.N,dtype=float) - phi = np.empty(self.N,dtype=float) - mu_hat = np.empty(self.N,dtype=float) - sigma2_hat = np.empty(self.N,dtype=float) - - #Approximation - epsilon_np1 = 1 - epsilon_np2 = 1 - self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.random.permutation(self.N) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) - self.tau_tilde[i] += Delta_tau - self.v_tilde[i] += Delta_v - #Posterior distribution parameters update - dtd1 = Delta_tau*Diag[i] + 1. - dii = Diag[i] - Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 - pi_ = P[i,:].reshape(1,num_inducing) - P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ - Rp_i = np.dot(R,pi_.T) - RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) - R = jitchol(RTR).T - self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1 - self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T) - RPT = np.dot(R,P.T) - Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) - mu = self.w + np.dot(P,self.Gamma) - self.iterations += 1 - #Sigma recomptutation with Cholesky decompositon - Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde) - Diag = Diag0 * Iplus_Dprod_i - P = Iplus_Dprod_i[:,None] * P0 - safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. 
- Iplus_Dprod_i)/Diag0) - L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T)) - R,info = dtrtrs(L,R0,lower=1) - RPT = np.dot(R,P.T) - Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) - self.w = Diag * self.v_tilde - self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde)) - mu = self.w + np.dot(P,self.Gamma) - epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N - epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N - self.np1.append(self.tau_tilde.copy()) - self.np2.append(self.v_tilde.copy()) - - return self._compute_GP_variables() diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py deleted file mode 100644 index e68bfafa..00000000 --- a/GPy/likelihoods/gaussian.py +++ /dev/null @@ -1,110 +0,0 @@ -import numpy as np -from likelihood import likelihood -from ..util.linalg import jitchol -from ..core.parameter import Param - - -class Gaussian(likelihood): - """ - Likelihood class for doing Expectation propagation - - :param data: observed output - :type data: Nx1 numpy.darray - :param variance: noise parameter - :param normalize: whether to normalize the data before computing (predictions will be in original scales) - :type normalize: False|True - """ - def __init__(self, data, variance=1., normalize=False): - super(Gaussian, self).__init__('gaussian') - self.is_heteroscedastic = False - #self.num_params = 1 - self.Z = 0. # a correction factor which accounts for the approximation made - N, self.output_dim = data.shape - - # normalization - if normalize: - self._offset = data.mean(0)[None, :] - self._scale = data.std(0)[None, :] - # Don't scale outputs which have zero variance to zero. - self._scale[np.nonzero(self._scale == 0.)] = 1.0e-3 - else: - self._offset = np.zeros((1, self.output_dim)) - self._scale = np.ones((1, self.output_dim)) - - self.set_data(data) - - self.variance = Param('variance', variance) - self.variance.add_observer(self, self.update_variance) - self.add_parameter(self.variance) - - #self.parameters_changed() -# self._set_params(np.asarray(variance)) - - - def set_data(self, data): - self.data = data - self.N, D = data.shape - assert D == self.output_dim - self.Y = (self.data - self._offset) / self._scale - if D > self.N: - self.YYT = np.dot(self.Y, self.Y.T) - self.trYYT = np.trace(self.YYT) - self.YYT_factor = jitchol(self.YYT) - else: - self.YYT = None - self.trYYT = np.sum(np.square(self.Y)) - self.YYT_factor = self.Y - -# def _get_params(self): -# return np.asarray(self._variance) -# -# def _get_param_names(self): -# return ["noise_variance"] -# -# def _set_params(self, x): -# self.variance = x[0] - - def update_variance(self, v): - if np.all(self.variance == 0.): #special case of zero noise - self.precision = np.inf - self.V = None - else: - self.precision = (1. / self.variance).squeeze() - self.V = (self.precision) * self.Y - self.VVT_factor = self.precision * self.YYT_factor - self.covariance_matrix = np.eye(self.N) * self.variance - #self._variance = self.variance.copy() - -# def parameters_changed(self): -# if np.any(self._variance != self.variance): -# self.update_variance() - - def predictive_values(self, mu, var, full_cov): - """ - Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval - """ - mean = mu * self._scale + self._offset - if full_cov: - if self.output_dim > 1: - raise NotImplementedError, "TODO" - # Note. for output_dim>1, we need to re-normalise all the outputs independently. - # This will mess up computations of diag(true_var), below. 
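The normalization bookkeeping in `Gaussian` above (fit on `(data - offset) / scale`, invert at prediction time and add the likelihood noise) is easy to exercise standalone. A minimal sketch with toy numbers, mirroring the `mean`/`true_var` formulas in this method (names here are local, not GPy API):

```python
import numpy as np

# Normalize targets for fitting, then un-normalize predictions.
Y = np.array([[2.0], [4.0], [6.0]])
offset, scale = Y.mean(0)[None, :], Y.std(0)[None, :]
Yn = (Y - offset) / scale                          # what the GP is fit to

# A latent GP prediction in normalized space, plus the noise variance:
mu_n, var_n, noise_var = np.array([[0.1]]), np.array([[0.2]]), 0.05
mean = mu_n * scale + offset                       # back to original units
true_var = (var_n + noise_var) * scale ** 2        # augment and rescale
lo, hi = mean - 2 * np.sqrt(true_var), mean + 2 * np.sqrt(true_var)
print(mean, true_var, lo, hi)                      # the +/- 2 sigma interval returned above
```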
-                # note that the upper, lower quantiles should be the same shape as mean
-            # Augment the output variance with the likelihood variance and rescale.
-            true_var = (var + np.eye(var.shape[0]) * self.variance) * self._scale ** 2
-            _5pc = mean - 2.*np.sqrt(np.diag(true_var))
-            _95pc = mean + 2.*np.sqrt(np.diag(true_var))
-        else:
-            true_var = (var + self.variance) * self._scale ** 2
-            _5pc = mean - 2.*np.sqrt(true_var)
-            _95pc = mean + 2.*np.sqrt(true_var)
-        return mean, true_var, _5pc, _95pc
-
-    def fit_full(self):
-        """
-        No approximations needed
-        """
-        pass
-
-    def _gradients(self, partial):
-        return np.sum(partial)
diff --git a/GPy/likelihoods/gaussian_mixed_noise.py b/GPy/likelihoods/gaussian_mixed_noise.py
deleted file mode 100644
index 696867c0..00000000
--- a/GPy/likelihoods/gaussian_mixed_noise.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright (c) 2013, Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats
-from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
-from likelihood import likelihood
-from . import Gaussian
-
-
-class Gaussian_Mixed_Noise(likelihood):
-    """
-    Gaussian Likelihood for multiple outputs
-
-    This is a wrapper around likelihood.Gaussian class
-
-    :param data_list: data observations
-    :type data_list: list of numpy arrays (num_data_output_i x 1), one array per output
-    :param noise_params: noise parameters of each output
-    :type noise_params: list of floats, one per output
-    :param normalize: whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize: False|True
-    """
-    def __init__(self, data_list, noise_params=None, normalize=True):
-        self.num_params = len(data_list)
-        self.n_list = [data.size for data in data_list]
-        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.num_params),self.n_list)])
-
-        if noise_params is None:
-            noise_params = [1.] * self.num_params
-        else:
-            assert self.num_params == len(noise_params), 'Number of noise parameters does not match the number of noise models.'
-
-        self.noise_model_list = [Gaussian(Y,variance=v,normalize = normalize) for Y,v in zip(data_list,noise_params)]
-        self.n_params = [noise_model._get_params().size for noise_model in self.noise_model_list]
-        self.data = np.vstack(data_list)
-        self.N, self.output_dim = self.data.shape
-        self._offset = np.zeros((1, self.output_dim))
-        self._scale = np.ones((1, self.output_dim))
-
-        self.is_heteroscedastic = True
-        self.Z = 0. # a correction factor which accounts for the approximation made
-
-        self.set_data(data_list)
-        self._set_params(np.asarray(noise_params))
-
-        super(Gaussian_Mixed_Noise, self).__init__()
-
-    def set_data(self, data_list):
-        self.data = np.vstack(data_list)
-        self.N, D = self.data.shape
-        assert D == self.output_dim
-        self.Y = (self.data - self._offset) / self._scale
-        if D > self.N:
-            raise NotImplementedError
-            #self.YYT = np.dot(self.Y, self.Y.T)
-            #self.trYYT = np.trace(self.YYT)
-            #self.YYT_factor = jitchol(self.YYT)
-        else:
-            self.YYT = None
-            self.trYYT = np.sum(np.square(self.Y))
-            self.YYT_factor = self.Y
-
-    def predictive_values(self,mu,var,full_cov,noise_model):
-        """
-        Predicts the output given the GP
-
-        :param mu: GP's mean
-        :param var: GP's variance
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: False|True
-        :param noise_model: noise model to use
-        :type noise_model: integer
-        """
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model_list[noise_model].predictive_values(mu,var,full_cov)
-
-    def _get_params(self):
-        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
-
-    def _get_param_names(self):
-        if len(self.noise_model_list) == 1:
-            names = self.noise_model_list[0]._get_param_names()
-        else:
-            names = []
-            for noise_model,i in zip(self.noise_model_list,range(len(self.n_list))):
-                names.append(''.join(noise_model._get_param_names() + ['_%s' %i]))
-        return names
-
-    def _set_params(self,p):
-        cs_params = np.cumsum([0]+self.n_params)
-
-        for i in range(len(self.n_params)):
-            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
-        self.precision = np.hstack([np.repeat(noise_model.precision,n) for noise_model,n in zip(self.noise_model_list,self.n_list)])[:,None]
-
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.precision * self.YYT_factor
-        self.covariance_matrix = np.eye(self.N) * 1./self.precision
-
-    def _gradients(self,partial):
-        gradients = []
-        aux = np.cumsum([0]+self.n_list)
-        for ai,af,noise_model in zip(aux[:-1],aux[1:],self.noise_model_list):
-            gradients += [noise_model._gradients(partial[ai:af])]
-        return np.hstack(gradients)
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
deleted file mode 100644
index d94af758..00000000
--- a/GPy/likelihoods/likelihood.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import numpy as np
-import copy
-from ..core.parameterized import Parameterized
-
-class likelihood(Parameterized):
-    """
-    The atom for a likelihood class
-
-    This object interfaces the GP and the data. The most basic likelihood
-    (Gaussian) inherits directly from this, as does the EP algorithm
-
-    Some things must be defined for this to work properly:
-
-    - self.Y : the effective Gaussian target of the GP
-    - self.N, self.D : Y.shape
-    - self.covariance_matrix : the effective (noise) covariance of the GP targets
-    - self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
-    - self.is_heteroscedastic : enables significant computational savings in GP
-    - self.precision : a scalar or vector representation of the effective target precision
-    - self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
-    - self.V : self.precision * self.Y
-
-    """
-    def __init__(self, name=None):
-        Parameterized.__init__(self, name)
-
-#    def _get_params(self):
-#        raise NotImplementedError
-#
-#    def _get_param_names(self):
-#        raise NotImplementedError
-#
-#    def _set_params(self, x):
-#        raise NotImplementedError
-
-    def fit(self):
-        raise NotImplementedError
-
-    def _gradients(self, partial):
-        raise NotImplementedError
-
-    def predictive_values(self, mu, var):
-        raise NotImplementedError
diff --git a/GPy/likelihoods/noise_model_constructors.py b/GPy/likelihoods/noise_model_constructors.py
deleted file mode 100644
index ec971e04..00000000
--- a/GPy/likelihoods/noise_model_constructors.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import noise_models
-
-def binomial(gp_link=None):
-    """
-    Construct a binomial likelihood
-
-    :param gp_link: a GPy gp_link function
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Probit()
-    #else:
-    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
-
-    if isinstance(gp_link,noise_models.gp_transformations.Probit):
-        analytical_mean = True
-        analytical_variance = False
-
-    elif isinstance(gp_link,noise_models.gp_transformations.Heaviside):
-        analytical_mean = True
-        analytical_variance = True
-
-    else:
-        analytical_mean = False
-        analytical_variance = False
-
-    return noise_models.binomial_noise.Binomial(gp_link,analytical_mean,analytical_variance)
-
-def exponential(gp_link=None):
-    """
-    Construct an exponential likelihood
-
-    :param gp_link: a GPy gp_link function
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
-
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.exponential_noise.Exponential(gp_link,analytical_mean,analytical_variance)
-
-def gaussian_ep(gp_link=None,variance=1.):
-    """
-    Construct a gaussian likelihood
-
-    :param gp_link: a GPy gp_link function
-    :param variance: scalar
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
-    #else:
-    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
-
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.gaussian_noise.Gaussian(gp_link,analytical_mean,analytical_variance,variance)
-
-def poisson(gp_link=None):
-    """
-    Construct a Poisson likelihood
-
-    :param gp_link: a GPy gp_link function
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Log_ex_1()
-    #else:
-    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
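The constructors in this deleted module all follow one pattern: default the link when none is given, then record which predictive quantities are analytic for that link. A minimal standalone sketch of that pattern (`Probit`/`Heaviside` here are hypothetical stand-ins for the `gp_transformations` classes, and the tuple stands in for the `noise_models.*` instance actually returned):

```python
# Sketch of the constructor pattern, under assumed stand-in classes.
class Probit(object): pass
class Heaviside(object): pass

def binomial(gp_link=None):
    if gp_link is None:
        gp_link = Probit()                                  # default link
    # Which predictive quantities have closed forms depends on the link:
    analytical_mean = isinstance(gp_link, (Probit, Heaviside))
    analytical_variance = isinstance(gp_link, Heaviside)
    return (gp_link, analytical_mean, analytical_variance)

print(binomial())   # probit: analytic mean, numerical variance
```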
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.poisson_noise.Poisson(gp_link,analytical_mean,analytical_variance)
-
-def gamma(gp_link=None,beta=1.):
-    """
-    Construct a Gamma likelihood
-
-    :param gp_link: a GPy gp_link function
-    :param beta: scalar
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Log_ex_1()
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta)
-
-
diff --git a/GPy/likelihoods/noise_models/__init__.py b/GPy/likelihoods/noise_models/__init__.py
deleted file mode 100644
index b47702a7..00000000
--- a/GPy/likelihoods/noise_models/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import noise_distributions
-import binomial_noise
-import exponential_noise
-import gaussian_noise
-import gamma_noise
-import poisson_noise
-import gp_transformations
diff --git a/GPy/likelihoods/noise_models/binomial_noise.py b/GPy/likelihoods/noise_models/binomial_noise.py
deleted file mode 100644
index c0bb8be4..00000000
--- a/GPy/likelihoods/noise_models/binomial_noise.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Binomial(NoiseDistribution):
-    """
-    Probit likelihood
-    Y is expected to take values in {-1,1}
-    -----
-    $$
-    L(x) = \\Phi (Y_i*f_i)
-    $$
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Binomial, self).__init__(gp_link,analytical_mean,analytical_variance)
-
-    def _preprocess_values(self,Y):
-        """
-        Check if the values of the observations correspond to the values
-        assumed by the likelihood function.
-
-        ..Note:: Binary classification algorithm works better with classes {-1,1}
-        """
-        Y_prep = Y.copy()
-        Y1 = Y[Y.flatten()==1].size
-        Y2 = Y[Y.flatten()==0].size
-        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
-        Y_prep[Y.flatten() == 0] = -1
-        return Y_prep
-
-    def _moments_match_analytical(self,data_i,tau_i,v_i):
-        """
-        Moments match of the marginal approximation in EP algorithm
-
-        :param i: number of observation (int)
-        :param tau_i: precision of the cavity distribution (float)
-        :param v_i: mean/variance of the cavity distribution (float)
-        """
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
-            Z_hat = std_norm_cdf(z)
-            phi = std_norm_pdf(z)
-            mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
-            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
-
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            a = data_i*v_i/np.sqrt(tau_i)
-            Z_hat = std_norm_cdf(a)
-            N = std_norm_pdf(a)
-            mu_hat = v_i/tau_i + data_i*N/Z_hat/np.sqrt(tau_i)
-            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
-            if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
-                stop
-
-        return Z_hat, mu_hat, sigma2_hat
-
-    def _predictive_mean_analytical(self,mu,sigma):
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            return stats.norm.cdf(mu/sigma)
-        else:
-            raise NotImplementedError
-
-    def _predictive_variance_analytical(self,mu,sigma, pred_mean):
-        if isinstance(self.gp_link,gp_transformations.Heaviside):
-            return 0.
-        else:
-            raise NotImplementedError
-
-    def _mass(self,gp,obs):
-        #NOTE obs must be in {0,1}
-        p = self.gp_link.transf(gp)
-        return p**obs * (1.-p)**(1.-obs)
-
-    def _nlog_mass(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return obs*np.log(p) + (1.-obs)*np.log(1-p)
-
-    def _dnlog_mass_dgp(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        dp = self.gp_link.dtransf_df(gp)
-        return obs/p * dp - (1.-obs)/(1.-p) * dp
-
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return (obs/p + (1.-obs)/(1.-p))*self.gp_link.d2transf_df2(gp) + ((1.-obs)/(1.-p)**2-obs/p**2)*self.gp_link.dtransf_df(gp)
-
-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        p = self.gp_link.transf(gp)
-        return p*(1.-p)
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)*(1. - 2.*self.gp_link.transf(gp))
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
-
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param size: number of samples to compute
-        :param gp: latent variable
-        """
-        orig_shape = gp.shape
-        gp = gp.flatten()
-        Ysim = np.array([np.random.binomial(1,self.gp_link.transf(gpj),size=1) for gpj in gp])
-        return Ysim.reshape(orig_shape)
diff --git a/GPy/likelihoods/noise_models/exponential_noise.py b/GPy/likelihoods/noise_models/exponential_noise.py
deleted file mode 100644
index 56e63c75..00000000
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Exponential(NoiseDistribution):
-    """
-    Exponential likelihood
-    Y is expected to take non-negative real values
-    -----
-    $$
-    L(x) = \exp(-Y_i/\lambda)/\lambda
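The closed-form probit moments in `Binomial._moments_match_analytical` above can be spot-checked against one-dimensional quadrature over the tilted distribution. A hedged standalone sketch (arbitrary cavity values; not GPy API):

```python
import numpy as np
from scipy import stats, integrate

# Cavity natural parameters and a label in {-1, 1}
y, tau, v = 1.0, 2.0, 0.5
mu, s2 = v / tau, 1. / tau

# Closed form, as in the Probit branch above
z = y * v / np.sqrt(tau ** 2 + tau)
Z_hat = stats.norm.cdf(z)
mu_hat = v / tau + y * stats.norm.pdf(z) / (Z_hat * np.sqrt(tau ** 2 + tau))

# Quadrature over the tilted distribution Phi(y*f) * N(f | mu, s2)
tilted = lambda f: stats.norm.cdf(y * f) * stats.norm.pdf(f, mu, np.sqrt(s2))
Z_num, _ = integrate.quad(tilted, -10, 10)
m1, _ = integrate.quad(lambda f: f * tilted(f), -10, 10)
print(Z_hat, Z_num)        # ~0.581 from both
print(mu_hat, m1 / Z_num)  # first moments agree
```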
-    $$
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Exponential, self).__init__(gp_link,analytical_mean,analytical_variance)
-
-    def _preprocess_values(self,Y):
-        return Y
-
-    def _mass(self,gp,obs):
-        """
-        Mass (or density) function
-        """
-        return np.exp(-obs/self.gp_link.transf(gp))/self.gp_link.transf(gp)
-
-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        return obs/self.gp_link.transf(gp) + np.log(self.gp_link.transf(gp))
-
-    def _dnlog_mass_dgp(self,gp,obs):
-        return ( 1./self.gp_link.transf(gp) - obs/self.gp_link.transf(gp)**2) * self.gp_link.dtransf_df(gp)
-
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        fgp = self.gp_link.transf(gp)
-        return (2*obs/fgp**3 - 1./fgp**2) * self.gp_link.dtransf_df(gp)**2 + ( 1./fgp - obs/fgp**2) * self.gp_link.d2transf_df2(gp)
-
-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)**2
-
-    def _dvariance_dgp(self,gp):
-        return 2*self.gp_link.transf(gp)*self.gp_link.dtransf_df(gp)
-
-    def _d2variance_dgp2(self,gp):
-        return 2 * (self.gp_link.dtransf_df(gp)**2 + self.gp_link.transf(gp)*self.gp_link.d2transf_df2(gp))
diff --git a/GPy/likelihoods/noise_models/gamma_noise.py b/GPy/likelihoods/noise_models/gamma_noise.py
deleted file mode 100644
index 6bf0dd7b..00000000
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Gamma(NoiseDistribution):
-    """
-    Gamma likelihood
-    Y is expected to take positive real values
-    -----
-    $$
-    L(x) = \\beta^{\\alpha} Y_i^{\\alpha-1} e^{-\\beta Y_i} / \Gamma(\\alpha)
-    $$
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
-        self.beta = beta
-        super(Gamma, self).__init__(gp_link,analytical_mean,analytical_variance)
-
-    def _preprocess_values(self,Y):
-        return Y
-
-    def _mass(self,gp,obs):
-        """
-        Mass (or density) function
-        """
-        #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
-        alpha = self.gp_link.transf(gp)*self.beta
-        return obs**(alpha - 1.) * np.exp(-self.beta*obs) * self.beta**alpha / special.gamma(alpha)
-
-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        alpha = self.gp_link.transf(gp)*self.beta
-        return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
-
-    def _dnlog_mass_dgp(self,gp,obs):
-        return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
-
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
-
-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)/self.beta
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)/self.beta
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)/self.beta
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
deleted file mode 100644
index 93ac9acd..00000000
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Gaussian(NoiseDistribution):
-    """
-    Gaussian likelihood
-
-    :param mean: mean value of the Gaussian distribution
-    :param variance: variance of the Gaussian distribution
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1.):
-        self.variance = variance
-        super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)
-
-    def _get_params(self):
-        return np.array([self.variance])
-
-    def _get_param_names(self):
-        return ['noise_model_variance']
-
-    def _set_params(self,p):
-        self.variance = p
-
-    def _gradients(self,partial):
-        return np.zeros(1)
-        #return np.sum(partial)
-
-    def _preprocess_values(self,Y):
-        """
-        Check if the values of the observations correspond to the values
-        assumed by the likelihood function.
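The `_dnlog_mass_dgp` derivatives defined by these noise models (Exponential and Gamma above, Gaussian below) are easy to spot-check by finite differences. A standalone sketch for the Gaussian case with an identity link assumed (so `dtransf_df = 1`), not using GPy:

```python
import numpy as np

# -log N(obs | f, var) up to constants, and its gradient (cf. _nlog_mass below)
var, obs = 0.1, 1.3
nlog_mass = lambda f: .5 * ((f - obs) ** 2 / var + np.log(2. * np.pi * var))
dnlog_mass = lambda f: (f - obs) / var

f, eps = 0.7, 1e-6
numeric = (nlog_mass(f + eps) - nlog_mass(f - eps)) / (2 * eps)
print(dnlog_mass(f), numeric)   # should agree to ~1e-9
```

The same central-difference check applies to `_d2nlog_mass_dgp2` by differencing the analytic gradient instead of the objective.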
- """ - return Y - - def _moments_match_analytical(self,data_i,tau_i,v_i): - """ - Moments match of the marginal approximation in EP algorithm - - :param i: number of observation (int) - :param tau_i: precision of the cavity distribution (float) - :param v_i: mean/variance of the cavity distribution (float) - """ - sigma2_hat = 1./(1./self.variance + tau_i) - mu_hat = sigma2_hat*(data_i/self.variance + v_i) - sum_var = self.variance + 1./tau_i - Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var) - return Z_hat, mu_hat, sigma2_hat - - def _predictive_mean_analytical(self,mu,sigma): - new_sigma2 = self.predictive_variance(mu,sigma) - return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance) - - def _predictive_variance_analytical(self,mu,sigma): - return 1./(1./self.variance + 1./sigma**2) - - def _mass(self,gp,obs): - #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) ) - return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)) - - def _nlog_mass(self,gp,obs): - return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance)) - - def _dnlog_mass_dgp(self,gp,obs): - return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp) - - def _d2nlog_mass_dgp2(self,gp,obs): - return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance - - def _mean(self,gp): - """ - Mass (or density) function - """ - return self.gp_link.transf(gp) - - def _dmean_dgp(self,gp): - return self.gp_link.dtransf_df(gp) - - def _d2mean_dgp2(self,gp): - return self.gp_link.d2transf_df2(gp) - - def _variance(self,gp): - """ - Mass (or density) function - """ - return self.variance - - def _dvariance_dgp(self,gp): - return 0 - - def _d2variance_dgp2(self,gp): - return 0 diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py deleted file mode 100644 index e95e9df7..00000000 --- a/GPy/likelihoods/noise_models/gp_transformations.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2012, 2013 Ricardo Andrade -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -from scipy import stats -import scipy as sp -import pylab as pb -from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf - -class GPTransformation(object): - """ - Link function class for doing non-Gaussian likelihoods approximation - - :param Y: observed output (Nx1 numpy.darray) - - .. note:: Y values allowed depend on the likelihood_function used - - """ - def __init__(self): - pass - - def transf(self,f): - """ - Gaussian process tranformation function, latent space -> output space - """ - pass - - def dtransf_df(self,f): - """ - derivative of transf(f) w.r.t. f - """ - pass - - def d2transf_df2(self,f): - """ - second derivative of transf(f) w.r.t. f - """ - pass - -class Identity(GPTransformation): - """ - .. math:: - - g(f) = f - - """ - def transf(self,f): - return f - - def dtransf_df(self,f): - return 1. - - def d2transf_df2(self,f): - return 0 - - -class Probit(GPTransformation): - """ - .. math:: - - g(f) = \\Phi^{-1} (mu) - - """ - def transf(self,f): - return std_norm_cdf(f) - - def dtransf_df(self,f): - return std_norm_pdf(f) - - def d2transf_df2(self,f): - return -f * std_norm_pdf(f) - -class Log(GPTransformation): - """ - .. 
diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py
deleted file mode 100644
index e95e9df7..00000000
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats
-import scipy as sp
-import pylab as pb
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf
-
-class GPTransformation(object):
-    """
-    Link function class for non-Gaussian likelihood approximations
-
-    :param Y: observed output (Nx1 numpy.darray)
-
-    .. note:: the Y values allowed depend on the likelihood_function used
-
-    """
-    def __init__(self):
-        pass
-
-    def transf(self,f):
-        """
-        Gaussian process transformation function, latent space -> output space
-        """
-        pass
-
-    def dtransf_df(self,f):
-        """
-        derivative of transf(f) w.r.t. f
-        """
-        pass
-
-    def d2transf_df2(self,f):
-        """
-        second derivative of transf(f) w.r.t. f
-        """
-        pass
-
-class Identity(GPTransformation):
-    """
-    .. math::
-
-        g(f) = f
-
-    """
-    def transf(self,f):
-        return f
-
-    def dtransf_df(self,f):
-        return 1.
-
-    def d2transf_df2(self,f):
-        return 0
-
-
-class Probit(GPTransformation):
-    """
-    .. math::
-
-        g(f) = \\Phi^{-1} (mu)
-
-    """
-    def transf(self,f):
-        return std_norm_cdf(f)
-
-    def dtransf_df(self,f):
-        return std_norm_pdf(f)
-
-    def d2transf_df2(self,f):
-        return -f * std_norm_pdf(f)
-
-class Log(GPTransformation):
-    """
-    .. math::
-
-        g(f) = \\log(\\mu)
-
-    """
-    def transf(self,f):
-        return np.exp(f)
-
-    def dtransf_df(self,f):
-        return np.exp(f)
-
-    def d2transf_df2(self,f):
-        return np.exp(f)
-
-class Log_ex_1(GPTransformation):
-    """
-    .. math::
-
-        g(f) = \\log(\\exp(\\mu) - 1)
-
-    """
-    def transf(self,f):
-        return np.log(1.+np.exp(f))
-
-    def dtransf_df(self,f):
-        return np.exp(f)/(1.+np.exp(f))
-
-    def d2transf_df2(self,f):
-        aux = np.exp(f)/(1.+np.exp(f))
-        return aux*(1.-aux)
-
-class Reciprocal(GPTransformation):
-    def transf(self,f):
-        return 1./f
-
-    def dtransf_df(self,f):
-        return -1./f**2
-
-    def d2transf_df2(self,f):
-        return 2./f**3
-
-class Heaviside(GPTransformation):
-    """
-
-    .. math::
-
-        g(f) = I_{x \\in A}
-
-    """
-    def transf(self,f):
-        return np.where(f>0, 1, 0)
-
-    def dtransf_df(self,f):
-        raise NotImplementedError, "This function is not differentiable!"
-
-    def d2transf_df2(self,f):
-        raise NotImplementedError, "This function is not differentiable!"
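Each transformation supplies its own first and second derivatives, which both the EP and Laplace machinery rely on, so a finite-difference check is a cheap safeguard. A standalone sketch using the softplus form of Log_ex_1 above (local names, not the GPy API):

import numpy as np

transf = lambda f: np.log(1. + np.exp(f))            # Log_ex_1.transf
dtransf_df = lambda f: np.exp(f) / (1. + np.exp(f))  # Log_ex_1.dtransf_df

f = np.linspace(-3., 3., 7)
eps = 1e-6
numeric = (transf(f + eps) - transf(f - eps)) / (2 * eps)
print(np.max(np.abs(numeric - dtransf_df(f))))       # ~1e-10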
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
deleted file mode 100644
index 913c2b9d..00000000
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ /dev/null
@@ -1,425 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-import pylab as pb
-from GPy.util.plot import gpplot
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-
-
-class NoiseDistribution(object):
-    """
-    Likelihood class for doing Expectation Propagation
-
-    :param Y: observed output (Nx1 numpy.darray)
-
-    .. note:: the Y values allowed depend on the LikelihoodFunction used
-    """
-    def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
-        assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
-        self.gp_link = gp_link
-        self.analytical_mean = analytical_mean
-        self.analytical_variance = analytical_variance
-        if self.analytical_mean:
-            self.moments_match = self._moments_match_analytical
-            self.predictive_mean = self._predictive_mean_analytical
-        else:
-            self.moments_match = self._moments_match_numerical
-            self.predictive_mean = self._predictive_mean_numerical
-        if self.analytical_variance:
-            self.predictive_variance = self._predictive_variance_analytical
-        else:
-            self.predictive_variance = self._predictive_variance_numerical
-
-    def _get_params(self):
-        return np.zeros(0)
-
-    def _get_param_names(self):
-        return []
-
-    def _set_params(self,p):
-        pass
-
-    def _gradients(self,partial):
-        return np.zeros(0)
-
-    def _preprocess_values(self,Y):
-        """
-        In case it is needed, this function assesses the output values or makes any pertinent transformation on them.
-
-        :param Y: observed output
-        :type Y: Nx1 numpy.darray
-
-        """
-        return Y
-
-    def _product(self,gp,obs,mu,sigma):
-        """
-        Product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)
-
-    def _nlog_product_scaled(self,gp,obs,mu,sigma):
-        """
-        Negative log-product between the cavity distribution and a likelihood factor.
-
-        .. note:: The constant term in the Gaussian distribution is ignored.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)
-
-    def _dnlog_product_dgp(self,gp,obs,mu,sigma):
-        """
-        Derivative wrt the latent variable of the negative log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)
-
-    def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
-        """
-        Second derivative wrt the latent variable of the negative log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)
-
-    def _product_mode(self,obs,mu,sigma):
-        """
-        Newton-CG method to find the mode of _product (cavity x likelihood factor).
-
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False)
-
-    def _moments_match_analytical(self,obs,tau,v):
-        """
-        If available, this function computes the moments analytically.
-        """
-        pass
-
-    def _moments_match_numerical(self,obs,tau,v):
-        """
-        Laplace approximation to calculate the moments.
-
-        :param obs: observed output
-        :param tau: cavity distribution 1st natural parameter (precision)
-        :param v: cavity distribution 2nd natural parameter (mu*precision)
-
-        """
-        mu = v/tau
-        mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
-        sigma2_hat = 1./(tau + self._d2nlog_mass_dgp2(mu_hat,obs))
-        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self._mass(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
-        return Z_hat,mu_hat,sigma2_hat
-
-    def _nlog_conditional_mean_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the latent variable's predictive distribution times the output's mean given the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
-
-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))
-
-    def _dnlog_conditional_mean_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_conditional_mean_scaled wrt the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_conditional_mean_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_conditional_mean_scaled wrt the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2
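`_moments_match_numerical` above approximates the tilted distribution N(f|v/tau, 1/tau) * p(obs|f) by a Gaussian at its mode, and Z_hat is the standard Laplace estimate of its mass. A standalone sketch for a Poisson factor with a Log link (illustrative values; scipy's minimize_scalar stands in for the Newton-CG call):

import numpy as np
from scipy import stats, optimize
from scipy.integrate import quad

y, tau, v = 3, 1.5, 0.6            # illustrative observation and cavity parameters
mu = v / tau

nlog_mass = lambda f: np.exp(f) - y * f   # -log Poisson pmf with Log link, up to constants
d2nlog_mass = lambda f: np.exp(f)

h = lambda f: .5 * tau * (f - mu) ** 2 + nlog_mass(f)
f_hat = optimize.minimize_scalar(h).x
sigma2_hat = 1. / (tau + d2nlog_mass(f_hat))

# Laplace mass vs quadrature, mirroring the Z_hat expression above
mass = lambda f: stats.poisson.pmf(y, np.exp(f))
Z_lap = np.exp(-.5 * tau * (f_hat - mu) ** 2) * mass(f_hat) * np.sqrt(tau * sigma2_hat)
Z_num, _ = quad(lambda f: stats.norm.pdf(f, mu, np.sqrt(1. / tau)) * mass(f), -10., 10.)
print(Z_lap, Z_num)   # close, though Laplace is only an approximation here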
-    def _nlog_exp_conditional_variance_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the latent variable's predictive distribution times the output's variance given the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E(V(Y_star|f_star))
-
-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp))
-
-    def _dnlog_exp_conditional_variance_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_variance_scaled wrt the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp)
-
-    def _d2nlog_exp_conditional_variance_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_variance_scaled wrt the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2
-
-    def _nlog_exp_conditional_mean_sq_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the latent variable's predictive distribution times the output's squared mean given the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E( E(Y_star|f_star)**2 )
-
-        """
-        return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))
-
-    def _dnlog_exp_conditional_mean_sq_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_mean_sq_scaled wrt the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_exp_conditional_mean_sq_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_mean_sq_scaled wrt the latent variable.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )
-
-    def _predictive_mean_analytical(self,mu,sigma):
-        """
-        If available, this function computes the predictive mean analytically.
-        """
-        pass
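When no closed form exists, the predictive mean is E(Y_star) = E(E(Y_star|f_star)), an expectation of the transformed latent variable under its Gaussian predictive. For the Log link that expectation is lognormal, which makes a convenient sanity check for any numerical scheme (standalone sketch, illustrative values):

import numpy as np

np.random.seed(0)
mu, sigma = 0.4, 0.3                      # illustrative latent predictive moments
f = np.random.randn(200000) * sigma + mu

mc = np.exp(f).mean()                     # Monte-Carlo estimate of E(transf(f_star))
closed = np.exp(mu + .5 * sigma ** 2)     # lognormal mean
print(mc, closed)                         # agree to Monte-Carlo error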
- """ - pass - - def _predictive_mean_numerical(self,mu,sigma): - """ - Laplace approximation to the predictive mean: E(Y_star) = E( E(Y_star|f_star) ) - - :param mu: cavity distribution mean - :param sigma: cavity distribution standard deviation - - """ - maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False) - mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma) - """ - - pb.figure() - x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)]) - f = np.array([np.exp(-self._nlog_conditional_mean_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x]) - pb.plot(x,f,'b-') - sigma2 = 1./self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma) - f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2) - k = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2) - pb.plot(x,f2*mean,'r-') - pb.vlines(maximum,0,f.max()) - """ - return mean - - def _predictive_mean_sq(self,mu,sigma): - """ - Laplace approximation to the predictive mean squared: E(Y_star**2) = E( E(Y_star|f_star)**2 ) - - :param mu: cavity distribution mean - :param sigma: cavity distribution standard deviation - - """ - maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False) - mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma) - return mean_squared - - def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None): - """ - Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) ) - - :param mu: cavity distribution mean - :param sigma: cavity distribution standard deviation - :predictive_mean: output's predictive mean, if None _predictive_mean function will be called. 
- - """ - # E( V(Y_star|f_star) ) - maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False) - exp_var = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma))*sigma) - - """ - pb.figure() - x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)]) - f = np.array([np.exp(-self._nlog_exp_conditional_variance_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x]) - pb.plot(x,f,'b-') - sigma2 = 1./self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma) - f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2) - k = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2) - pb.plot(x,f2*exp_var,'r--') - pb.vlines(maximum,0,f.max()) - """ - - #V( E(Y_star|f_star) ) = E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star)**2 ) - exp_exp2 = self._predictive_mean_sq(mu,sigma) - if predictive_mean is None: - predictive_mean = self.predictive_mean(mu,sigma) - var_exp = exp_exp2 - predictive_mean**2 - return exp_var + var_exp - - def _predictive_percentiles(self,p,mu,sigma): - """ - Percentiles of the predictive distribution - - :parm p: lower tail probability - :param mu: cavity distribution mean - :param sigma: cavity distribution standard deviation - :predictive_mean: output's predictive mean, if None _predictive_mean function will be called. - - """ - qf = stats.norm.ppf(p,mu,sigma) - return self.gp_link.transf(qf) - - def _nlog_joint_predictive_scaled(self,x,mu,sigma): - """ - Negative logarithm of the joint predictive distribution (latent variable and output). - - :param x: tuple (latent variable,output) - :param mu: latent variable's predictive mean - :param sigma: latent variable's predictive standard deviation - - """ - return self._nlog_product_scaled(x[0],x[1],mu,sigma) - - def _gradient_nlog_joint_predictive(self,x,mu,sigma): - """ - Gradient of _nlog_joint_predictive_scaled. - - :param x: tuple (latent variable,output) - :param mu: latent variable's predictive mean - :param sigma: latent variable's predictive standard deviation - - .. note: Only available when the output is continuous - - """ - assert not self.discrete, "Gradient not available for discrete outputs." - return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0]))) - - def _hessian_nlog_joint_predictive(self,x,mu,sigma): - """ - Hessian of _nlog_joint_predictive_scaled. - - :param x: tuple (latent variable,output) - :param mu: latent variable's predictive mean - :param sigma: latent variable's predictive standard deviation - - .. note: Only available when the output is continuous - - """ - assert not self.discrete, "Hessian not available for discrete outputs." - cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1]) - return np.array((self._d2nlog_product_dgp2(gp=x[0],obs=x[1],mu=mu,sigma=sigma),cross_derivative,cross_derivative,self._d2nlog_mass_dobs2(obs=x[1],gp=x[0]))).reshape(2,2) - - def _joint_predictive_mode(self,mu,sigma): - """ - Negative logarithm of the joint predictive distribution (latent variable and output). 
-    def _joint_predictive_mode(self,mu,sigma):
-        """
-        Mode of the joint predictive distribution (latent variable and output), found with Newton-CG.
-
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        """
-        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)
-
-    def predictive_values(self,mu,var):
-        """
-        Compute mean, variance and confidence interval (percentiles 2.5 and 97.5) of the prediction.
-
-        :param mu: mean of the latent variable
-        :param var: variance of the latent variable
-
-        """
-        if isinstance(mu,float) or isinstance(mu,int):
-            mu = [mu]
-            var = [var]
-        pred_mean = []
-        pred_var = []
-        q1 = []
-        q3 = []
-        for m,s in zip(mu,np.sqrt(var)):
-            pred_mean.append(self.predictive_mean(m,s))
-            pred_var.append(self.predictive_variance(m,s,pred_mean[-1]))
-            q1.append(self._predictive_percentiles(.025,m,s))
-            q3.append(self._predictive_percentiles(.975,m,s))
-        pred_mean = np.vstack(pred_mean)
-        pred_var = np.vstack(pred_var)
-        q1 = np.vstack(q1)
-        q3 = np.vstack(q3)
-        return pred_mean, pred_var, q1, q3
-
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param gp: latent variable
-        """
-        pass
-
diff --git a/GPy/likelihoods/noise_models/poisson_noise.py b/GPy/likelihoods/noise_models/poisson_noise.py
deleted file mode 100644
index 33de84cd..00000000
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Poisson(NoiseDistribution):
-    """
-    Poisson likelihood
-
-    .. math::
-        L(x) = \\exp(-\\lambda) * \\frac{\\lambda^{Y_i}}{Y_i!}
-
-    .. note:: Y is expected to take values in {0,1,2,...}
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
-
-    def _preprocess_values(self,Y): #TODO
-        return Y
-
-    def _mass(self,gp,obs):
-        """
-        Mass (or density) function
-        """
-        return stats.poisson.pmf(obs,self.gp_link.transf(gp))
-
-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the mass function
-        """
-        return self.gp_link.transf(gp) - obs * np.log(self.gp_link.transf(gp)) + np.log(special.gamma(obs+1))
-
-    def _dnlog_mass_dgp(self,gp,obs):
-        return self.gp_link.dtransf_df(gp) * (1. - obs/self.gp_link.transf(gp))
-
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        d2_df = self.gp_link.d2transf_df2(gp)
-        transf = self.gp_link.transf(gp)
-        return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
-
-    def _mean(self,gp):
-        """
-        Mean function
-        """
-        return self.gp_link.transf(gp)
-
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
-    def _variance(self,gp):
-        """
-        Variance function
-        """
-        return self.gp_link.transf(gp)
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
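The Poisson derivatives above can be verified the same way as the other noise models. A standalone gradient check with a Log link (local names, not the GPy API; gammaln replaces log(gamma(.)) purely for numerical stability):

import numpy as np
from scipy import special

transf = dtransf_df = np.exp                       # Log link: both are exp(f)

def nlog_mass(f, y):                               # mirrors _nlog_mass above
    lam = transf(f)
    return lam - y * np.log(lam) + special.gammaln(y + 1)

def dnlog_mass_df(f, y):                           # mirrors _dnlog_mass_dgp above
    return dtransf_df(f) * (1. - y / transf(f))

f0, y, eps = 0.5, 4, 1e-6
numeric = (nlog_mass(f0 + eps, y) - nlog_mass(f0 - eps, y)) / (2 * eps)
print(abs(numeric - dnlog_mass_df(f0, y)))         # ~1e-9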