[huge merge] trying to merge old master and master

2026-05-18 13:55:14 +02:00 · 2014-11-21 16:17:03 +00:00 · 2014-11-21 16:17:03 +00:00 · 180650ec85
commit 180650ec85
parent 0f8dbba56d 4fd05439fc
308 changed files with 27071 additions and 24550 deletions
--- a/GPy/likelihoods/init.py
+++ b/GPy/likelihoods/init.py
@ -1,7 +1,8 @@
-from ep import EP
-from laplace import Laplace
-from ep_mixed_noise import EP_Mixed_Noise
+from bernoulli import Bernoulli
+from exponential import Exponential
 from gaussian import Gaussian
-from gaussian_mixed_noise import Gaussian_Mixed_Noise
-import noise_models
-from noise_model_constructors import *
+from gamma import Gamma
+from poisson import Poisson
+from student_t import StudentT
+from likelihood import Likelihood
+from mixed_noise import MixedNoise
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@ -0,0 +1,224 @@
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
+import link_functions
+from likelihood import Likelihood
+from scipy import stats
+
+class Bernoulli(Likelihood):
+    """
+    Bernoulli likelihood
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+    .. Note::
+        Y takes values in either {-1, 1} or {0, 1}.
+        link function should have the domain [0, 1], e.g. probit (default) or Heaviside
+
+    .. See also::
+        likelihood.py, for the parent class
+    """
+    def __init__(self, gp_link=None):
+        if gp_link is None:
+            gp_link = link_functions.Probit()
+
+        super(Bernoulli, self).__init__(gp_link, 'Bernoulli')
+
+        if isinstance(gp_link , (link_functions.Heaviside, link_functions.Probit)):
+            self.log_concave = True
+
+    def _preprocess_values(self, Y):
+        """
+        Check if the values of the observations correspond to the values
+        assumed by the likelihood function.
+
+        ..Note:: Binary classification algorithm works better with classes {-1, 1}
+        """
+        Y_prep = Y.copy()
+        Y1 = Y[Y.flatten()==1].size
+        Y2 = Y[Y.flatten()==0].size
+        assert Y1 + Y2 == Y.size, 'Bernoulli likelihood is meant to be used only with outputs in {0, 1}.'
+        Y_prep[Y.flatten() == 0] = -1
+        return Y_prep
+
+    def moments_match_ep(self, Y_i, tau_i, v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param i: number of observation (int)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        if Y_i == 1:
+            sign = 1.
+        elif Y_i == 0 or Y_i == -1:
+            sign = -1
+        else:
+            raise ValueError("bad value for Bernoulli observation (0, 1)")
+        if isinstance(self.gp_link, link_functions.Probit):
+            z = sign*v_i/np.sqrt(tau_i**2 + tau_i)
+            Z_hat = std_norm_cdf(z)
+            phi = std_norm_pdf(z)
+            mu_hat = v_i/tau_i + sign*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
+            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
+
+        elif isinstance(self.gp_link, link_functions.Heaviside):
+            a = sign*v_i/np.sqrt(tau_i)
+            Z_hat = std_norm_cdf(a)
+            N = std_norm_pdf(a)
+            mu_hat = v_i/tau_i + sign*N/Z_hat/np.sqrt(tau_i)
+            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
+        else:
+            #TODO: do we want to revert to numerical quadrature here?
+            raise ValueError("Exact moment matching not available for link {}".format(self.gp_link.__name__))
+
+        return Z_hat, mu_hat, sigma2_hat
+
+    def predictive_mean(self, mu, variance, Y_metadata=None):
+
+        if isinstance(self.gp_link, link_functions.Probit):
+            return stats.norm.cdf(mu/np.sqrt(1+variance))
+
+        elif isinstance(self.gp_link, link_functions.Heaviside):
+            return stats.norm.cdf(mu/np.sqrt(variance))
+
+        else:
+            raise NotImplementedError
+
+    def predictive_variance(self, mu, variance, pred_mean, Y_metadata=None):
+
+        if isinstance(self.gp_link, link_functions.Heaviside):
+            return 0.
+        else:
+            return np.nan
+
+    def pdf_link(self, inv_link_f, y, Y_metadata=None):
+        """
+        Likelihood function given inverse link of f.
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in bernoulli
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        .. Note:
+            Each y_i must be in {0, 1}
+        """
+        #objective = (inv_link_f**y) * ((1.-inv_link_f)**(1.-y))
+        return np.where(y, inv_link_f, 1.-inv_link_f)
+
+    def logpdf_link(self, inv_link_f, y, Y_metadata=None):
+        """
+        Log Likelihood function given inverse link of f.
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-f_{i})
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in bernoulli
+        :returns: log likelihood evaluated at points inverse link of f.
+        :rtype: float
+        """
+        #objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f)
+        p = np.where(y==1, inv_link_f, 1.-inv_link_f)
+        return np.log(np.clip(p, 1e-6 ,np.inf))
+
+    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
+        """
+        Gradient of the pdf at y, given inverse link of f w.r.t inverse link of f.
+
+        .. math::
+            \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))}
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in bernoulli
+        :returns: gradient of log likelihood evaluated at points inverse link of f.
+        :rtype: Nx1 array
+        """
+        #grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f)
+        #grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f))
+        ff = np.clip(inv_link_f, 1e-6, 1-1e-6)
+        denom = np.where(y, ff, -(1-ff))
+        return 1./denom
+
+    def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
+        """
+        Hessian at y, given inv_link_f, w.r.t inv_link_f the hessian will be 0 unless i == j
+        i.e. second derivative logpdf at y given inverse link of f_i and inverse link of f_j  w.r.t inverse link of f_i and inverse link of f_j.
+
+
+        .. math::
+            \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}}
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in bernoulli
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points inverse link of f.
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on inverse link of f_i not on inverse link of f_(j!=i)
+        """
+        #d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2)
+        #d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f))
+        arg = np.where(y, inv_link_f, 1.-inv_link_f)
+        ret =  -1./np.square(np.clip(arg, 1e-3, np.inf))
+        if np.any(np.isinf(ret)):
+            stop
+        return ret
+
+    def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
+        """
+        Third order derivative log-likelihood function at y given inverse link of f w.r.t inverse link of f
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i}}{(1-\\lambda(f))^{3}}
+
+        :param inv_link_f: latent variables passed through inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in bernoulli
+        :returns: third derivative of log likelihood evaluated at points inverse_link(f)
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        #d3logpdf_dlink3 = 2*(y/(inv_link_f**3) - (1-y)/((1-inv_link_f)**3))
+        state = np.seterr(divide='ignore')
+        # TODO check y \in {0, 1} or {-1, 1}
+        d3logpdf_dlink3 = np.where(y, 2./(inv_link_f**3), -2./((1.-inv_link_f)**3))
+        np.seterr(**state)
+        return d3logpdf_dlink3
+
+    def samples(self, gp, Y_metadata=None):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        ns = np.ones_like(gp, dtype=int)
+        Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
+
+    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
+        pass
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@ -1,392 +0,0 @@
-import numpy as np
-from scipy import stats
-from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
-from likelihood import likelihood
-
-class EP(likelihood):
-    def __init__(self,data,noise_model):
-        """
-        Expectation Propagation
-
-        :param data: data to model
-        :type data: numpy array
-        :param noise_model: noise distribution
-        :type noise_model: A GPy noise model
-
-        """
-        self.noise_model = noise_model
-        self.data = data
-        self.num_data, self.output_dim = self.data.shape
-        self.is_heteroscedastic = True
-        self.num_params = 0
-
-        #Initial values - Likelihood approximation parameters:
-        #p(y|f) = t(f|tau_tilde,v_tilde)
-        self.tau_tilde = np.zeros(self.num_data)
-        self.v_tilde = np.zeros(self.num_data)
-
-        #initial values for the GP variables
-        self.Y = np.zeros((self.num_data,1))
-        self.covariance_matrix = np.eye(self.num_data)
-        self.precision = np.ones(self.num_data)[:,None]
-        self.Z = 0
-        self.YYT = None
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = 0.
-
-        super(EP, self).__init__()
-
-    def restart(self):
-        self.tau_tilde = np.zeros(self.num_data)
-        self.v_tilde = np.zeros(self.num_data)
-        self.Y = np.zeros((self.num_data,1))
-        self.covariance_matrix = np.eye(self.num_data)
-        self.precision = np.ones(self.num_data)[:,None]
-        self.Z = 0
-        self.YYT = None
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = 0.
-
-    def predictive_values(self,mu,var,full_cov,**noise_args):
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var,**noise_args)
-
-    def log_predictive_density(self, y_test, mu_star, var_star):
-        """
-        Calculation of the log predictive density
-
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
-
-        :param y_test: test observations (y_{*})
-        :type y_test: (Nx1) array
-        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type mu_star: (Nx1) array
-        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type var_star: (Nx1) array
-        """
-        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
-
-    def _get_params(self):
-        #return np.zeros(0)
-        return self.noise_model._get_params()
-
-    def _get_param_names(self):
-        #return []
-        return self.noise_model._get_param_names()
-
-    def _set_params(self,p):
-        #pass # TODO: the EP likelihood might want to take some parameters...
-        self.noise_model._set_params(p)
-
-    def _gradients(self,partial):
-        #return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
-        return self.noise_model._gradients(partial)
-
-    def _compute_GP_variables(self):
-        #Variables to be called from GP
-        mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
-        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
-        mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
-        self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
-        self.Z += 0.5*self.num_data*np.log(2*np.pi)
-
-        self.Y =  mu_tilde[:,None]
-        self.YYT = np.dot(self.Y,self.Y.T)
-        self.covariance_matrix = np.diag(1./self.tau_tilde)
-        self.precision = self.tau_tilde[:,None]
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = np.trace(self.YYT)
-
-    def fit_full(self, K, epsilon=1e-3,power_ep=[1.,1.]):
-        """
-        The expectation-propagation algorithm.
-        For nomenclature see Rasmussen & Williams 2006.
-
-        :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
-        :type epsilon: float
-        :param power_ep: Power EP parameters
-        :type power_ep: list of floats
-
-        """
-        self.epsilon = epsilon
-        self.eta, self.delta = power_ep
-
-        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
-        mu = np.zeros(self.num_data)
-        Sigma = K.copy()
-
-        """
-        Initial values - Cavity distribution parameters:
-        q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
-        sigma_ = 1./tau_
-        mu_ = v_/tau_
-        """
-        self.tau_ = np.empty(self.num_data,dtype=float)
-        self.v_ = np.empty(self.num_data,dtype=float)
-
-        #Initial values - Marginal moments
-        z = np.empty(self.num_data,dtype=float)
-        self.Z_hat = np.empty(self.num_data,dtype=float)
-        phi = np.empty(self.num_data,dtype=float)
-        mu_hat = np.empty(self.num_data,dtype=float)
-        sigma2_hat = np.empty(self.num_data,dtype=float)
-
-        #Approximation
-        epsilon_np1 = self.epsilon + 1.
-        epsilon_np2 = self.epsilon + 1.
-       	self.iterations = 0
-        self.np1 = [self.tau_tilde.copy()]
-        self.np2 = [self.v_tilde.copy()]
-        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.random.permutation(self.num_data)
-            for i in update_order:
-                #Cavity distribution parameters
-                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
-                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
-                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
-                #Site parameters update
-                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
-                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
-                self.tau_tilde[i] += Delta_tau
-                self.v_tilde[i] += Delta_v
-                #Posterior distribution parameters update
-                DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i])))
-                mu = np.dot(Sigma,self.v_tilde)
-                self.iterations += 1
-            #Sigma recomptutation with Cholesky decompositon
-            Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
-            B = np.eye(self.num_data) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
-            L = jitchol(B)
-            V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
-            Sigma = K - np.dot(V.T,V)
-            mu = np.dot(Sigma,self.v_tilde)
-            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
-            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
-            self.np1.append(self.tau_tilde.copy())
-            self.np2.append(self.v_tilde.copy())
-
-        return self._compute_GP_variables()
-
-    def fit_DTC(self, Kmm, Kmn, epsilon=1e-3,power_ep=[1.,1.]):
-        """
-        The expectation-propagation algorithm with sparse pseudo-input.
-        For nomenclature see ... 2013.
-
-        :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
-        :type epsilon: float
-        :param power_ep: Power EP parameters
-        :type power_ep: list of floats
-
-        """
-        self.epsilon = epsilon
-        self.eta, self.delta = power_ep
-
-        num_inducing = Kmm.shape[0]
-
-        #TODO: this doesn't work with uncertain inputs!
-
-        """
-        Prior approximation parameters:
-        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
-        Sigma0 = Qnn = Knm*Kmmi*Kmn
-        """
-        KmnKnm = np.dot(Kmn,Kmn.T)
-        Lm = jitchol(Kmm)
-        Lmi = chol_inv(Lm)
-        Kmmi = np.dot(Lmi.T,Lmi)
-        KmmiKmn = np.dot(Kmmi,Kmn)
-        Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
-        LLT0 = Kmm.copy()
-
-        #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
-        #KmnKnm = np.dot(Kmn, Kmn.T)
-        #KmmiKmn = np.dot(Kmmi,Kmn)
-        #Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
-        #LLT0 = Kmm.copy()
-
-        """
-        Posterior approximation: q(f|y) = N(f| mu, Sigma)
-        Sigma = Diag + P*R.T*R*P.T + K
-        mu = w + P*Gamma
-        """
-        mu = np.zeros(self.num_data)
-        LLT = Kmm.copy()
-        Sigma_diag = Qnn_diag.copy()
-
-        """
-        Initial values - Cavity distribution parameters:
-        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
-        sigma_ = 1./tau_
-        mu_ = v_/tau_
-        """
-        self.tau_ = np.empty(self.num_data,dtype=float)
-        self.v_ = np.empty(self.num_data,dtype=float)
-
-        #Initial values - Marginal moments
-        z = np.empty(self.num_data,dtype=float)
-        self.Z_hat = np.empty(self.num_data,dtype=float)
-        phi = np.empty(self.num_data,dtype=float)
-        mu_hat = np.empty(self.num_data,dtype=float)
-        sigma2_hat = np.empty(self.num_data,dtype=float)
-
-        #Approximation
-        epsilon_np1 = 1
-        epsilon_np2 = 1
-       	self.iterations = 0
-        np1 = [self.tau_tilde.copy()]
-        np2 = [self.v_tilde.copy()]
-        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.random.permutation(self.num_data)
-            for i in update_order:
-                #Cavity distribution parameters
-                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
-                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
-                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
-                #Site parameters update
-                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
-                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
-                self.tau_tilde[i] += Delta_tau
-                self.v_tilde[i] += Delta_v
-                #Posterior distribution parameters update
-                DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
-                L = jitchol(LLT)
-                #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
-                V,info = dtrtrs(L,Kmn,lower=1)
-                Sigma_diag = np.sum(V*V,-2)
-                si = np.sum(V.T*V[:,i],-1)
-                mu += (Delta_v-Delta_tau*mu[i])*si
-                self.iterations += 1
-            #Sigma recomputation with Cholesky decompositon
-            LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
-            L = jitchol(LLT)
-            V,info = dtrtrs(L,Kmn,lower=1)
-            V2,info = dtrtrs(L.T,V,lower=0)
-            Sigma_diag = np.sum(V*V,-2)
-            Knmv_tilde = np.dot(Kmn,self.v_tilde)
-            mu = np.dot(V2.T,Knmv_tilde)
-            epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.num_data
-            epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.num_data
-            np1.append(self.tau_tilde.copy())
-            np2.append(self.v_tilde.copy())
-
-        self._compute_GP_variables()
-
-    def fit_FITC(self, Kmm, Kmn, Knn_diag, epsilon=1e-3,power_ep=[1.,1.]):
-        """
-        The expectation-propagation algorithm with sparse pseudo-input.
-        For nomenclature see Naish-Guzman and Holden, 2008.
-
-        :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
-        :type epsilon: float
-        :param power_ep: Power EP parameters
-        :type power_ep: list of floats
-        """
-        self.epsilon = epsilon
-        self.eta, self.delta = power_ep
-
-        num_inducing = Kmm.shape[0]
-
-        """
-        Prior approximation parameters:
-        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
-        Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
-        """
-        Lm = jitchol(Kmm)
-        Lmi = chol_inv(Lm)
-        Kmmi = np.dot(Lmi.T,Lmi)
-        P0 = Kmn.T
-        KmnKnm = np.dot(P0.T, P0)
-        KmmiKmn = np.dot(Kmmi,P0.T)
-        Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
-        Diag0 = Knn_diag - Qnn_diag
-        R0 = jitchol(Kmmi).T
-
-        """
-        Posterior approximation: q(f|y) = N(f| mu, Sigma)
-        Sigma = Diag + P*R.T*R*P.T + K
-        mu = w + P*Gamma
-        """
-        self.w = np.zeros(self.num_data)
-        self.Gamma = np.zeros(num_inducing)
-        mu = np.zeros(self.num_data)
-        P = P0.copy()
-        R = R0.copy()
-        Diag = Diag0.copy()
-        Sigma_diag = Knn_diag
-        RPT0 = np.dot(R0,P0.T)
-
-        """
-        Initial values - Cavity distribution parameters:
-        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
-        sigma_ = 1./tau_
-        mu_ = v_/tau_
-        """
-        self.tau_ = np.empty(self.num_data,dtype=float)
-        self.v_ = np.empty(self.num_data,dtype=float)
-
-        #Initial values - Marginal moments
-        z = np.empty(self.num_data,dtype=float)
-        self.Z_hat = np.empty(self.num_data,dtype=float)
-        phi = np.empty(self.num_data,dtype=float)
-        mu_hat = np.empty(self.num_data,dtype=float)
-        sigma2_hat = np.empty(self.num_data,dtype=float)
-
-        #Approximation
-        epsilon_np1 = 1
-        epsilon_np2 = 1
-       	self.iterations = 0
-        self.np1 = [self.tau_tilde.copy()]
-        self.np2 = [self.v_tilde.copy()]
-        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.random.permutation(self.num_data)
-            for i in update_order:
-                #Cavity distribution parameters
-                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
-                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
-                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
-                #Site parameters update
-                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
-                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
-                self.tau_tilde[i] += Delta_tau
-                self.v_tilde[i] += Delta_v
-                #Posterior distribution parameters update
-                dtd1 = Delta_tau*Diag[i] + 1.
-                dii = Diag[i]
-                Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
-                pi_ = P[i,:].reshape(1,num_inducing)
-                P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
-                Rp_i = np.dot(R,pi_.T)
-                RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
-                R = jitchol(RTR).T
-                self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
-                self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
-                RPT = np.dot(R,P.T)
-                Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
-                mu = self.w + np.dot(P,self.Gamma)
-                self.iterations += 1
-            #Sigma recomptutation with Cholesky decompositon
-            Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
-            Diag = Diag0 * Iplus_Dprod_i
-            P = Iplus_Dprod_i[:,None] * P0
-            safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
-            L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
-            R,info = dtrtrs(L,R0,lower=1)
-            RPT = np.dot(R,P.T)
-            Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
-            self.w = Diag * self.v_tilde
-            self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
-            mu = self.w + np.dot(P,self.Gamma)
-            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
-            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
-            self.np1.append(self.tau_tilde.copy())
-            self.np2.append(self.v_tilde.copy())
-
-        return self._compute_GP_variables()
--- a/GPy/likelihoods/ep_mixed_noise.py
+++ b/GPy/likelihoods/ep_mixed_noise.py
@ -1,385 +0,0 @@
-# Copyright (c) 2013, Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats
-from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
-from likelihood import likelihood
-
-class EP_Mixed_Noise(likelihood):
-    def __init__(self,data_list,noise_model_list,epsilon=1e-3,power_ep=[1.,1.]):
-        """
-        Expectation Propagation
-
-        Arguments
-        ---------
-        :param data_list: list of outputs
-        :param noise_model_list: a list of noise models
-        :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations
-        :type epsilon: float
-        :param power_ep: list of power ep parameters
-        """
-        assert len(data_list) == len(noise_model_list)
-        self.noise_model_list = noise_model_list
-        n_list = [data.size for data in data_list]
-        self.n_models = len(data_list)
-        self.n_params = [noise_model._get_params().size for noise_model in noise_model_list]
-        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.n_models),n_list)])
-        self.epsilon = epsilon
-        self.eta, self.delta = power_ep
-        self.data = np.vstack(data_list)
-        self.N, self.output_dim = self.data.shape
-        self.is_heteroscedastic = True
-        self.num_params = 0#FIXME
-        self._transf_data = np.vstack([noise_model._preprocess_values(data) for noise_model,data in zip(noise_model_list,data_list)])
-        #TODO non-gaussian index
-
-        #Initial values - Likelihood approximation parameters:
-        #p(y|f) = t(f|tau_tilde,v_tilde)
-        self.tau_tilde = np.zeros(self.N)
-        self.v_tilde = np.zeros(self.N)
-
-        #initial values for the GP variables
-        self.Y = np.zeros((self.N,1))
-        self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
-        self.Z = 0
-        self.YYT = None
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = 0.
-
-    def restart(self):
-        self.tau_tilde = np.zeros(self.N)
-        self.v_tilde = np.zeros(self.N)
-        self.Y = np.zeros((self.N,1))
-        self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
-        self.Z = 0
-        self.YYT = None
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = 0.
-
-    def predictive_values(self,mu,var,full_cov,noise_model):
-        """
-        Predicts the output given the GP
-
-        :param mu: GP's mean
-        :param var: GP's variance
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: False|True
-        :param noise_model: noise model to use
-        :type noise_model: integer
-        """
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        #_mu = []
-        #_var = []
-        #_q1 = []
-        #_q2 = []
-        #for m,v,o in zip(mu,var,output.flatten()):
-        #    a,b,c,d = self.noise_model_list[int(o)].predictive_values(m,v)
-        #    _mu.append(a)
-        #    _var.append(b)
-        #    _q1.append(c)
-        #    _q2.append(d)
-        #return np.vstack(_mu),np.vstack(_var),np.vstack(_q1),np.vstack(_q2)
-        return self.noise_model_list[noise_model].predictive_values(mu,var)
-
-    def _get_params(self):
-        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
-
-    def _get_param_names(self):
-        names = []
-        for noise_model in self.noise_model_list:
-           names += noise_model._get_param_names()
-        return names
-
-    def _set_params(self,p):
-        cs_params = np.cumsum([0]+self.n_params)
-        for i in range(len(self.n_params)):
-            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
-
-    def _gradients(self,partial):
-        #NOTE this is not tested
-        return np.hstack([noise_model._gradients(partial) for noise_model in self.noise_model_list])
-
-    def _compute_GP_variables(self):
-        #Variables to be called from GP
-        mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
-        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
-        mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
-        self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
-
-        self.Y =  mu_tilde[:,None]
-        self.YYT = np.dot(self.Y,self.Y.T)
-        self.covariance_matrix = np.diag(1./self.tau_tilde)
-        self.precision = self.tau_tilde[:,None]
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.V
-        self.trYYT = np.trace(self.YYT)
-
-    def fit_full(self,K):
-        """
-        The expectation-propagation algorithm.
-        For nomenclature see Rasmussen & Williams 2006.
-        """
-        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
-        mu = np.zeros(self.N)
-        Sigma = K.copy()
-
-        """
-        Initial values - Cavity distribution parameters:
-        q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
-        sigma_ = 1./tau_
-        mu_ = v_/tau_
-        """
-        self.tau_ = np.empty(self.N,dtype=float)
-        self.v_ = np.empty(self.N,dtype=float)
-
-        #Initial values - Marginal moments
-        z = np.empty(self.N,dtype=float)
-        self.Z_hat = np.empty(self.N,dtype=float)
-        phi = np.empty(self.N,dtype=float)
-        mu_hat = np.empty(self.N,dtype=float)
-        sigma2_hat = np.empty(self.N,dtype=float)
-
-        #Approximation
-        epsilon_np1 = self.epsilon + 1.
-        epsilon_np2 = self.epsilon + 1.
-       	self.iterations = 0
-        self.np1 = [self.tau_tilde.copy()]
-        self.np2 = [self.v_tilde.copy()]
-        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.random.permutation(self.N)
-            for i in update_order:
-                #Cavity distribution parameters
-                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
-                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
-                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
-                #Site parameters update
-                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
-                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
-                self.tau_tilde[i] += Delta_tau
-                self.v_tilde[i] += Delta_v
-                #Posterior distribution parameters update
-                DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i])))
-                mu = np.dot(Sigma,self.v_tilde)
-                self.iterations += 1
-            #Sigma recomptutation with Cholesky decompositon
-            Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
-            B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
-            L = jitchol(B)
-            V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
-            Sigma = K - np.dot(V.T,V)
-            mu = np.dot(Sigma,self.v_tilde)
-            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
-            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
-            self.np1.append(self.tau_tilde.copy())
-            self.np2.append(self.v_tilde.copy())
-
-        return self._compute_GP_variables()
-
-    def fit_DTC(self, Kmm, Kmn):
-        """
-        The expectation-propagation algorithm with sparse pseudo-input.
-        For nomenclature see ... 2013.
-        """
-        num_inducing = Kmm.shape[0]
-
-        #TODO: this doesn't work with uncertain inputs!
-
-        """
-        Prior approximation parameters:
-        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
-        Sigma0 = Qnn = Knm*Kmmi*Kmn
-        """
-        KmnKnm = np.dot(Kmn,Kmn.T)
-        Lm = jitchol(Kmm)
-        Lmi = chol_inv(Lm)
-        Kmmi = np.dot(Lmi.T,Lmi)
-        KmmiKmn = np.dot(Kmmi,Kmn)
-        Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
-        LLT0 = Kmm.copy()
-
-        #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
-        #KmnKnm = np.dot(Kmn, Kmn.T)
-        #KmmiKmn = np.dot(Kmmi,Kmn)
-        #Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
-        #LLT0 = Kmm.copy()
-
-        """
-        Posterior approximation: q(f|y) = N(f| mu, Sigma)
-        Sigma = Diag + P*R.T*R*P.T + K
-        mu = w + P*Gamma
-        """
-        mu = np.zeros(self.N)
-        LLT = Kmm.copy()
-        Sigma_diag = Qnn_diag.copy()
-
-        """
-        Initial values - Cavity distribution parameters:
-        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
-        sigma_ = 1./tau_
-        mu_ = v_/tau_
-        """
-        self.tau_ = np.empty(self.N,dtype=float)
-        self.v_ = np.empty(self.N,dtype=float)
-
-        #Initial values - Marginal moments
-        z = np.empty(self.N,dtype=float)
-        self.Z_hat = np.empty(self.N,dtype=float)
-        phi = np.empty(self.N,dtype=float)
-        mu_hat = np.empty(self.N,dtype=float)
-        sigma2_hat = np.empty(self.N,dtype=float)
-
-        #Approximation
-        epsilon_np1 = 1
-        epsilon_np2 = 1
-       	self.iterations = 0
-        np1 = [self.tau_tilde.copy()]
-        np2 = [self.v_tilde.copy()]
-        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.random.permutation(self.N)
-            for i in update_order:
-                #Cavity distribution parameters
-                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
-                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
-                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
-                #Site parameters update
-                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
-                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
-                self.tau_tilde[i] += Delta_tau
-                self.v_tilde[i] += Delta_v
-                #Posterior distribution parameters update
-                DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
-                L = jitchol(LLT)
-                #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
-                V,info = dtrtrs(L,Kmn,lower=1)
-                Sigma_diag = np.sum(V*V,-2)
-                si = np.sum(V.T*V[:,i],-1)
-                mu += (Delta_v-Delta_tau*mu[i])*si
-                self.iterations += 1
-            #Sigma recomputation with Cholesky decompositon
-            LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
-            L = jitchol(LLT)
-            V,info = dtrtrs(L,Kmn,lower=1)
-            V2,info = dtrtrs(L.T,V,lower=0)
-            Sigma_diag = np.sum(V*V,-2)
-            Knmv_tilde = np.dot(Kmn,self.v_tilde)
-            mu = np.dot(V2.T,Knmv_tilde)
-            epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
-            epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
-            np1.append(self.tau_tilde.copy())
-            np2.append(self.v_tilde.copy())
-
-        self._compute_GP_variables()
-
-    def fit_FITC(self, Kmm, Kmn, Knn_diag):
-        """
-        The expectation-propagation algorithm with sparse pseudo-input.
-        For nomenclature see Naish-Guzman and Holden, 2008.
-        """
-        num_inducing = Kmm.shape[0]
-
-        """
-        Prior approximation parameters:
-        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
-        Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
-        """
-        Lm = jitchol(Kmm)
-        Lmi = chol_inv(Lm)
-        Kmmi = np.dot(Lmi.T,Lmi)
-        P0 = Kmn.T
-        KmnKnm = np.dot(P0.T, P0)
-        KmmiKmn = np.dot(Kmmi,P0.T)
-        Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
-        Diag0 = Knn_diag - Qnn_diag
-        R0 = jitchol(Kmmi).T
-
-        """
-        Posterior approximation: q(f|y) = N(f| mu, Sigma)
-        Sigma = Diag + P*R.T*R*P.T + K
-        mu = w + P*Gamma
-        """
-        self.w = np.zeros(self.N)
-        self.Gamma = np.zeros(num_inducing)
-        mu = np.zeros(self.N)
-        P = P0.copy()
-        R = R0.copy()
-        Diag = Diag0.copy()
-        Sigma_diag = Knn_diag
-        RPT0 = np.dot(R0,P0.T)
-
-        """
-        Initial values - Cavity distribution parameters:
-        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
-        sigma_ = 1./tau_
-        mu_ = v_/tau_
-        """
-        self.tau_ = np.empty(self.N,dtype=float)
-        self.v_ = np.empty(self.N,dtype=float)
-
-        #Initial values - Marginal moments
-        z = np.empty(self.N,dtype=float)
-        self.Z_hat = np.empty(self.N,dtype=float)
-        phi = np.empty(self.N,dtype=float)
-        mu_hat = np.empty(self.N,dtype=float)
-        sigma2_hat = np.empty(self.N,dtype=float)
-
-        #Approximation
-        epsilon_np1 = 1
-        epsilon_np2 = 1
-       	self.iterations = 0
-        self.np1 = [self.tau_tilde.copy()]
-        self.np2 = [self.v_tilde.copy()]
-        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.random.permutation(self.N)
-            for i in update_order:
-                #Cavity distribution parameters
-                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
-                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
-                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
-                #Site parameters update
-                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
-                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
-                self.tau_tilde[i] += Delta_tau
-                self.v_tilde[i] += Delta_v
-                #Posterior distribution parameters update
-                dtd1 = Delta_tau*Diag[i] + 1.
-                dii = Diag[i]
-                Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
-                pi_ = P[i,:].reshape(1,num_inducing)
-                P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
-                Rp_i = np.dot(R,pi_.T)
-                RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
-                R = jitchol(RTR).T
-                self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
-                self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
-                RPT = np.dot(R,P.T)
-                Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
-                mu = self.w + np.dot(P,self.Gamma)
-                self.iterations += 1
-            #Sigma recomptutation with Cholesky decompositon
-            Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
-            Diag = Diag0 * Iplus_Dprod_i
-            P = Iplus_Dprod_i[:,None] * P0
-            safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
-            L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
-            R,info = dtrtrs(L,R0,lower=1)
-            RPT = np.dot(R,P.T)
-            Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
-            self.w = Diag * self.v_tilde
-            self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
-            mu = self.w + np.dot(P,self.Gamma)
-            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
-            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
-            self.np1.append(self.tau_tilde.copy())
-            self.np2.append(self.v_tilde.copy())
-
-        return self._compute_GP_variables()
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@ -1,15 +1,14 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
+# Copyright (c) 2012-2014 GPy Authors
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


 import numpy as np
 from scipy import stats,special
 import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
+import link_functions
+from likelihood import Likelihood

-class Exponential(NoiseDistribution):
+class Exponential(Likelihood):
    """
    Expoential likelihood
    Y is expected to take values in {0,1,2,...}
@ -18,13 +17,12 @@ class Exponential(NoiseDistribution):
    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
    $$
    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Exponential, self).__init__(gp_link,analytical_mean,analytical_variance)
+    def __init__(self,gp_link=None):
+        if gp_link is None:
+            gp_link = link_functions.Log()
+        super(Exponential, self).__init__(gp_link, 'ExpLikelihood')

-    def _preprocess_values(self,Y):
-        return Y
-
-    def pdf_link(self, link_f, y, extra_data=None):
+    def pdf_link(self, link_f, y, Y_metadata=None):
        """
        Likelihood function given link(f)

@ -35,16 +33,15 @@ class Exponential(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in exponential distribution
+        :param Y_metadata: Y_metadata which is not used in exponential distribution
        :returns: likelihood evaluated for this point
        :rtype: float
        """
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        log_objective = link_f*np.exp(-y*link_f)
        return np.exp(np.sum(np.log(log_objective)))
-        #return np.exp(np.sum(-y/link_f - np.log(link_f) ))

-    def logpdf_link(self, link_f, y, extra_data=None):
+    def logpdf_link(self, link_f, y, Y_metadata=None):
        """
        Log Likelihood Function given link(f)

@ -55,17 +52,16 @@ class Exponential(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in exponential distribution
+        :param Y_metadata: Y_metadata which is not used in exponential distribution
        :returns: likelihood evaluated for this point
        :rtype: float

        """
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        log_objective = np.log(link_f) - y*link_f
-        #logpdf_link = np.sum(-np.log(link_f) - y/link_f)
        return np.sum(log_objective)

-    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)

@ -76,7 +72,7 @@ class Exponential(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in exponential distribution
+        :param Y_metadata: Y_metadata which is not used in exponential distribution
        :returns: gradient of likelihood evaluated at points
        :rtype: Nx1 array

@ -86,7 +82,7 @@ class Exponential(NoiseDistribution):
        #grad = y/(link_f**2) - 1./link_f
        return grad

-    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
        """
        Hessian at y, given link(f), w.r.t link(f)
        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
@ -99,7 +95,7 @@ class Exponential(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in exponential distribution
+        :param Y_metadata: Y_metadata which is not used in exponential distribution
        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
        :rtype: Nx1 array

@ -112,7 +108,7 @@ class Exponential(NoiseDistribution):
        #hess = -2*y/(link_f**3) + 1/(link_f**2)
        return hess

-    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
        """
        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)

@ -123,7 +119,7 @@ class Exponential(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in exponential distribution
+        :param Y_metadata: Y_metadata which is not used in exponential distribution
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
@ -132,18 +128,6 @@ class Exponential(NoiseDistribution):
        #d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
        return d3lik_dlink3

-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)**2
-
    def samples(self, gp):
        """
        Returns a set of samples of observations based on a given value of the latent variable.
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@ -1,15 +1,15 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
+# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


 import numpy as np
 from scipy import stats,special
 import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
+from ..core.parameterization import Param
+import link_functions
+from likelihood import Likelihood

-class Gamma(NoiseDistribution):
+class Gamma(Likelihood):
    """
    Gamma likelihood

@ -18,14 +18,16 @@ class Gamma(NoiseDistribution):
        \\alpha_{i} = \\beta y_{i}

    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
-        self.beta = beta
-        super(Gamma, self).__init__(gp_link,analytical_mean,analytical_variance)
+    def __init__(self,gp_link=None,beta=1.):
+        if gp_link is None:
+            gp_link = link_functions.Log()
+        super(Gamma, self).__init__(gp_link, 'Gamma')

-    def _preprocess_values(self,Y):
-        return Y
+        self.beta = Param('beta', beta)
+        self.link_parameter(self.beta)
+        self.beta.fix()#TODO: gradients!

-    def pdf_link(self, link_f, y, extra_data=None):
+    def pdf_link(self, link_f, y, Y_metadata=None):
        """
        Likelihood function given link(f)

@ -37,7 +39,7 @@ class Gamma(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in poisson distribution
+        :param Y_metadata: Y_metadata which is not used in poisson distribution
        :returns: likelihood evaluated for this point
        :rtype: float
        """
@ -47,7 +49,7 @@ class Gamma(NoiseDistribution):
        objective = (y**(alpha - 1.) * np.exp(-self.beta*y) * self.beta**alpha)/ special.gamma(alpha)
        return np.exp(np.sum(np.log(objective)))

-    def logpdf_link(self, link_f, y, extra_data=None):
+    def logpdf_link(self, link_f, y, Y_metadata=None):
        """
        Log Likelihood Function given link(f)

@ -59,7 +61,7 @@ class Gamma(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in poisson distribution
+        :param Y_metadata: Y_metadata which is not used in poisson distribution
        :returns: likelihood evaluated for this point
        :rtype: float

@ -71,7 +73,7 @@ class Gamma(NoiseDistribution):
        log_objective = alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
        return np.sum(log_objective)

-    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)

@ -83,7 +85,7 @@ class Gamma(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in gamma distribution
+        :param Y_metadata: Y_metadata which is not used in gamma distribution
        :returns: gradient of likelihood evaluated at points
        :rtype: Nx1 array

@ -94,7 +96,7 @@ class Gamma(NoiseDistribution):
        #return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
        return grad

-    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
        """
        Hessian at y, given link(f), w.r.t link(f)
        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
@ -108,7 +110,7 @@ class Gamma(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in gamma distribution
+        :param Y_metadata: Y_metadata which is not used in gamma distribution
        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
        :rtype: Nx1 array

@ -122,7 +124,7 @@ class Gamma(NoiseDistribution):
        #return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
        return hess

-    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
        """
        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)

@ -134,22 +136,10 @@ class Gamma(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in gamma distribution
+        :param Y_metadata: Y_metadata which is not used in gamma distribution
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        d3lik_dlink3 = -special.polygamma(2, self.beta*link_f)*(self.beta**3)
        return d3lik_dlink3
-
-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)/self.beta
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -1,114 +1,318 @@
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+#TODO
+"""
+A lot of this code assumes that the link function is the identity.
+
+I think laplace code is okay, but I'm quite sure that the EP moments will only work if the link is identity.
+
+Furthermore, exact Guassian inference can only be done for the identity link, so we should be asserting so for all calls which relate to that.
+
+James 11/12/13
+"""
+
 import numpy as np
-from likelihood import likelihood
-from ..util.linalg import jitchol
+from scipy import stats, special
+import link_functions
+from likelihood import Likelihood
+from ..core.parameterization import Param
+from ..core.parameterization.transformations import Logexp
+from scipy import stats

-
-class Gaussian(likelihood):
+class Gaussian(Likelihood):
    """
-    Likelihood class for doing Expectation propagation
+    Gaussian likelihood

-    :param data: observed output
-    :type data: Nx1 numpy.darray
-    :param variance: noise parameter
-    :param normalize:  whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize: False|True
+    .. math::
+        \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+    :param variance: variance value of the Gaussian distribution
+    :param N: Number of data points
+    :type N: int
    """
-    def __init__(self, data, variance=1., normalize=False):
-        self.is_heteroscedastic = False
-        self.num_params = 1
-        self.Z = 0. # a correction factor which accounts for the approximation made
-        N, self.output_dim = data.shape
+    def __init__(self, gp_link=None, variance=1., name='Gaussian_noise'):
+        if gp_link is None:
+            gp_link = link_functions.Identity()

-        # normalization
-        if normalize:
-            self._offset = data.mean(0)[None, :]
-            self._scale = data.std(0)[None, :]
-            # Don't scale outputs which have zero variance to zero.
-            self._scale[np.nonzero(self._scale == 0.)] = 1.0e-3
-        else:
-            self._offset = np.zeros((1, self.output_dim))
-            self._scale = np.ones((1, self.output_dim))
+        assert isinstance(gp_link, link_functions.Identity), "the likelihood only implemented for the identity link"

-        self.set_data(data)
+        super(Gaussian, self).__init__(gp_link, name=name)

-        self._variance = np.asarray(variance) + 1.
-        self._set_params(np.asarray(variance))
+        self.variance = Param('variance', variance, Logexp())
+        self.link_parameter(self.variance)

-        super(Gaussian, self).__init__()
+        if isinstance(gp_link, link_functions.Identity):
+            self.log_concave = True

-    def set_data(self, data):
-        self.data = data
-        self.N, D = data.shape
-        assert D == self.output_dim
-        self.Y = (self.data - self._offset) / self._scale
-        if D > self.N:
-            self.YYT = np.dot(self.Y, self.Y.T)
-            self.trYYT = np.trace(self.YYT)
-            self.YYT_factor = jitchol(self.YYT)
-        else:
-            self.YYT = None
-            self.trYYT = np.sum(np.square(self.Y))
-            self.YYT_factor = self.Y
+    def betaY(self,Y,Y_metadata=None):
+        #TODO: ~Ricardo this does not live here
+        return Y/self.gaussian_variance(Y_metadata)

-    def _get_params(self):
-        return np.asarray(self._variance)
+    def gaussian_variance(self, Y_metadata=None):
+        return self.variance

-    def _get_param_names(self):
-        return ["noise_variance"]
+    def update_gradients(self, grad):
+        self.variance.gradient = grad

-    def _set_params(self, x):
-        x = np.float64(x)
-        if np.all(self._variance != x):
-            if x == 0.:#special case of zero noise
-                self.precision = np.inf
-                self.V = None
-            else:
-                self.precision = 1. / x
-                self.V = (self.precision) * self.Y
-                self.VVT_factor = self.precision * self.YYT_factor
-            self.covariance_matrix = np.eye(self.N) * x
-            self._variance = x
+    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
+        return dL_dKdiag.sum()

-    def predictive_values(self, mu, var, full_cov, **likelihood_args):
+    def _preprocess_values(self, Y):
        """
-        Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval
+        Check if the values of the observations correspond to the values
+        assumed by the likelihood function.
        """
-        mean = mu * self._scale + self._offset
+        return Y
+
+    def _moments_match_ep(self, data_i, tau_i, v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param i: number of observation (int)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        sigma2_hat = 1./(1./self.variance + tau_i)
+        mu_hat = sigma2_hat*(data_i/self.variance + v_i)
+        sum_var = self.variance + 1./tau_i
+        Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var)
+        return Z_hat, mu_hat, sigma2_hat
+
+    def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
        if full_cov:
-            if self.output_dim > 1:
-                raise NotImplementedError, "TODO"
-                # Note. for output_dim>1, we need to re-normalise all the outputs independently.
-                # This will mess up computations of diag(true_var), below.
-                # note that the upper, lower quantiles should be the same shape as mean
-            # Augment the output variance with the likelihood variance and rescale.
-            true_var = (var + np.eye(var.shape[0]) * self._variance) * self._scale ** 2
-            _5pc = mean - 2.*np.sqrt(np.diag(true_var))
-            _95pc = mean + 2.*np.sqrt(np.diag(true_var))
+            if var.ndim == 2:
+                var += np.eye(var.shape[0])*self.variance
+            if var.ndim == 3:
+                var += np.atleast_3d(np.eye(var.shape[0])*self.variance)
        else:
-            true_var = (var + self._variance) * self._scale ** 2
-            _5pc = mean - 2.*np.sqrt(true_var)
-            _95pc = mean + 2.*np.sqrt(true_var)
-        return mean, true_var, _5pc, _95pc
+            var += self.variance
+        return mu, var
+
+    def predictive_mean(self, mu, sigma):
+        return mu
+
+    def predictive_variance(self, mu, sigma, predictive_mean=None):
+        return self.variance + sigma**2
+
+    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
+        return  [stats.norm.ppf(q/100.)*np.sqrt(var + self.variance) + mu for q in quantiles]
+
+    def pdf_link(self, link_f, y, Y_metadata=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        #Assumes no covariance, exp, sum, log for numerical stability
+        return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
+
+    def logpdf_link(self, link_f, y, Y_metadata=None):
+        """
+        Log likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: log likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        N = y.shape[0]
+        ln_det_cov = N*np.log(self.variance)
+
+        return -0.5*(np.sum((y-link_f)**2/self.variance) + ln_det_cov + N*np.log(2.*np.pi))
+
+    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
+        """
+        Gradient of the pdf at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: gradient of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s2_i = (1.0/self.variance)
+        grad = s2_i*y - s2_i*link_f
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
+        """
+        Hessian at y, given link_f, w.r.t link_f.
+        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
+
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        N = y.shape[0]
+        hess = -(1.0/self.variance)*np.ones((N, 1))
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = 0
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: third derivative of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        N = y.shape[0]
+        d3logpdf_dlink3 = np.zeros((N,1))
+        return d3logpdf_dlink3
+
+    def dlogpdf_link_dvar(self, link_f, y, Y_metadata=None):
+        """
+        Gradient of the log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = -\\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
+        :rtype: float
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        e = y - link_f
+        s_4 = 1.0/(self.variance**2)
+        N = y.shape[0]
+        dlik_dsigma = -0.5*N/self.variance + 0.5*s_4*np.sum(np.square(e))
+        return np.sum(dlik_dsigma) # Sure about this sum?
+
+    def dlogpdf_dlink_dvar(self, link_f, y, Y_metadata=None):
+        """
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)}) = \\frac{1}{\\sigma^{4}}(-y_{i} + \\lambda(f_{i}))
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s_4 = 1.0/(self.variance**2)
+        dlik_grad_dsigma = -s_4*y + s_4*link_f
+        return dlik_grad_dsigma
+
+    def d2logpdf_dlink2_dvar(self, link_f, y, Y_metadata=None):
+        """
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)}) = \\frac{1}{\\sigma^{4}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata not used in gaussian
+        :returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s_4 = 1.0/(self.variance**2)
+        N = y.shape[0]
+        d2logpdf_dlink2_dvar = np.ones((N,1))*s_4
+        return d2logpdf_dlink2_dvar
+
+    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
+        return np.asarray([[dlogpdf_dvar]])
+
+    def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
+        return d2logpdf_dlink2_dvar
+
+    def _mean(self, gp):
+        """
+        Expected value of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            E_{p(y|f)}[y]
+        """
+        return self.gp_link.transf(gp)
+
+    def _variance(self, gp):
+        """
+        Variance of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            Var_{p(y|f)}[y]
+        """
+        return self.variance
+
+    def samples(self, gp, Y_metadata=None):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        #orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
+        return Ysim.reshape(orig_shape)

    def log_predictive_density(self, y_test, mu_star, var_star):
        """
-        Calculation of the log predictive density
-
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
-
-        :param y_test: test observations (y_{*})
-        :type y_test: (Nx1) array
-        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type mu_star: (Nx1) array
-        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type var_star: (Nx1) array
-
-        .. Note:
-            Works as if each test point was provided individually, i.e. not full_cov
+        assumes independence
        """
-        y_rescaled = (y_test - self._offset)/self._scale
-        return -0.5*np.log(2*np.pi) -0.5*np.log(var_star + self._variance) -0.5*(np.square(y_rescaled - mu_star))/(var_star + self._variance)
+        v = var_star + self.variance
+        return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v

-    def _gradients(self, partial):
-        return np.sum(partial)
--- a/GPy/likelihoods/gaussian_mixed_noise.py
+++ b/GPy/likelihoods/gaussian_mixed_noise.py
@ -1,108 +0,0 @@
-# Copyright (c) 2013, Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats
-from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
-from likelihood import likelihood
-from . import Gaussian
-
-
-class Gaussian_Mixed_Noise(likelihood):
-    """
-    Gaussian Likelihood for multiple outputs
-
-    This is a wrapper around likelihood.Gaussian class
-
-    :param data_list: data observations
-    :type data_list: list of numpy arrays (num_data_output_i x 1), one array per output
-    :param noise_params: noise parameters of each output
-    :type noise_params: list of floats, one per output
-    :param normalize:  whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize: False|True
-    """
-    def __init__(self, data_list, noise_params=None, normalize=True):
-        self.num_params = len(data_list)
-        self.n_list = [data.size for data in data_list]
-        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.num_params),self.n_list)])
-
-        if noise_params is None:
-            noise_params = [1.] * self.num_params
-        else:
-            assert self.num_params == len(noise_params), 'Number of noise parameters does not match the number of noise models.'
-
-        self.noise_model_list = [Gaussian(Y,variance=v,normalize = normalize) for Y,v in zip(data_list,noise_params)]
-        self.n_params = [noise_model._get_params().size for noise_model in self.noise_model_list]
-        self.data = np.vstack(data_list)
-        self.N, self.output_dim = self.data.shape
-        self._offset = np.zeros((1, self.output_dim))
-        self._scale = np.ones((1, self.output_dim))
-
-        self.is_heteroscedastic = True
-        self.Z = 0. # a correction factor which accounts for the approximation made
-
-        self.set_data(data_list)
-        self._set_params(np.asarray(noise_params))
-
-        super(Gaussian_Mixed_Noise, self).__init__()
-
-    def set_data(self, data_list):
-        self.data = np.vstack(data_list)
-        self.N, D = self.data.shape
-        assert D == self.output_dim
-        self.Y = (self.data - self._offset) / self._scale
-        if D > self.N:
-            raise NotImplementedError
-            #self.YYT = np.dot(self.Y, self.Y.T)
-            #self.trYYT = np.trace(self.YYT)
-            #self.YYT_factor = jitchol(self.YYT)
-        else:
-            self.YYT = None
-            self.trYYT = np.sum(np.square(self.Y))
-            self.YYT_factor = self.Y
-
-    def predictive_values(self,mu,var,full_cov,noise_model):
-        """
-        Predicts the output given the GP
-
-        :param mu: GP's mean
-        :param var: GP's variance
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: False|True
-        :param noise_model: noise model to use
-        :type noise_model: integer
-        """
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model_list[noise_model].predictive_values(mu,var,full_cov)
-
-    def _get_params(self):
-        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
-
-    def _get_param_names(self):
-        if len(self.noise_model_list) == 1:
-            names = self.noise_model_list[0]._get_param_names()
-        else:
-            names = []
-            for noise_model,i in zip(self.noise_model_list,range(len(self.n_list))):
-                names.append(''.join(noise_model._get_param_names() + ['_%s' %i]))
-        return names
-
-    def _set_params(self,p):
-        cs_params = np.cumsum([0]+self.n_params)
-
-        for i in range(len(self.n_params)):
-            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
-        self.precision = np.hstack([np.repeat(noise_model.precision,n) for noise_model,n in zip(self.noise_model_list,self.n_list)])[:,None]
-
-        self.V = self.precision * self.Y
-        self.VVT_factor = self.precision * self.YYT_factor
-        self.covariance_matrix = np.eye(self.N) * 1./self.precision
-
-    def _gradients(self,partial):
-        gradients = []
-        aux = np.cumsum([0]+self.n_list)
-        for ai,af,noise_model in zip(aux[:-1],aux[1:],self.noise_model_list):
-            gradients += [noise_model._gradients(partial[ai:af])]
-        return np.hstack(gradients)
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -1,60 +1,77 @@
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 import numpy as np
-import copy
-from ..core.parameterized import Parameterized
+from scipy import stats,special
+import scipy as sp
+import link_functions
+from ..util.misc import chain_1, chain_2, chain_3
+from scipy.integrate import quad
+import warnings
+from ..core.parameterization import Parameterized

-class likelihood(Parameterized):
+class Likelihood(Parameterized):
    """
-    The atom for a likelihood class
+    Likelihood base class, used to defing p(y|f).

-    This object interfaces the GP and the data. The most basic likelihood
-    (Gaussian) inherits directly from this, as does the EP algorithm
+    All instances use _inverse_ link functions, which can be swapped out. It is
+    expected that inheriting classes define a default inverse link function

-    Some things must be defined for this to work properly:
+    To use this class, inherit and define missing functionality.

-        - self.Y : the effective Gaussian target of the GP
-        - self.N, self.D : Y.shape
-        - self.covariance_matrix : the effective (noise) covariance of the GP targets
-        - self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
-        - self.is_heteroscedastic : enables significant computational savings in GP
-        - self.precision : a scalar or vector representation of the effective target precision
-        - self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
-        - self.V : self.precision * self.Y
+    Inheriting classes *must* implement:
+       pdf_link : a bound method which turns the output of the link function into the pdf
+       logpdf_link : the logarithm of the above
+
+    To enable use with EP, inheriting classes *must* define:
+       TODO: a suitable derivative function for any parameters of the class
+    It is also desirable to define:
+       moments_match_ep : a function to compute the EP moments If this isn't defined, the moments will be computed using 1D quadrature.
+
+    To enable use with Laplace approximation, inheriting classes *must* define:
+       Some derivative functions *AS TODO*
+
+    For exact Gaussian inference, define *JH TODO*

    """
-    def __init__(self):
-        Parameterized.__init__(self)
-        self.dZ_dK = 0
+    def __init__(self, gp_link, name):
+        super(Likelihood, self).__init__(name)
+        assert isinstance(gp_link,link_functions.GPTransformation), "gp_link is not a valid GPTransformation."
+        self.gp_link = gp_link
+        self.log_concave = False

-    def _get_params(self):
+    def _gradients(self,partial):
+        return np.zeros(0)
+
+    def update_gradients(self, partial):
+        if self.size > 0:
+            raise NotImplementedError('Must be implemented for likelihoods with parameters to be optimized')
+
+    def _preprocess_values(self,Y):
+        """
+        In case it is needed, this function assess the output values or makes any pertinent transformation on them.
+
+        :param Y: observed output
+        :type Y: Nx1 numpy.darray
+
+        """
+        return Y
+
+    def conditional_mean(self, gp):
+        """
+        The mean of the random variable conditioned on one value of the GP
+        """
        raise NotImplementedError

-    def _get_param_names(self):
-        raise NotImplementedError
-
-    def _set_params(self, x):
-        raise NotImplementedError
-
-    def fit_full(self, K):
+    def conditional_variance(self, gp):
        """
-        No approximations needed by default
+        The variance of the random variable conditioned on one value of the GP
        """
-        pass
-
-    def restart(self):
-        """
-        No need to restart if not an approximation
-        """
-        pass
-
-    def _gradients(self, partial):
-        raise NotImplementedError
-
-    def predictive_values(self, mu, var):
        raise NotImplementedError

    def log_predictive_density(self, y_test, mu_star, var_star):
        """
-        Calculation of the predictive density
+        Calculation of the log predictive density

        .. math:
            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
@ -66,4 +83,391 @@ class likelihood(Parameterized):
        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
        :type var_star: (Nx1) array
        """
+        assert y_test.shape==mu_star.shape
+        assert y_test.shape==var_star.shape
+        assert y_test.shape[1] == 1
+        def integral_generator(y, m, v):
+            """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
+            def f(f_star):
+                return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
+            return f
+
+        scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
+        scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
+        p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
+        return np.log(p_ystar)
+
+    def _moments_match_ep(self,obs,tau,v):
+        """
+        Calculation of moments using quadrature
+
+        :param obs: observed output
+        :param tau: cavity distribution 1st natural parameter (precision)
+        :param v: cavity distribution 2nd natural paramenter (mu*precision)
+        """
+        #Compute first integral for zeroth moment.
+        #NOTE constant np.sqrt(2*pi/tau) added at the end of the function
+        mu = v/tau
+        def int_1(f):
+            return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        z_scaled, accuracy = quad(int_1, -np.inf, np.inf)
+
+        #Compute second integral for first moment
+        def int_2(f):
+            return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        mean, accuracy = quad(int_2, -np.inf, np.inf)
+        mean /= z_scaled
+
+        #Compute integral for variance
+        def int_3(f):
+            return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        Ef2, accuracy = quad(int_3, -np.inf, np.inf)
+        Ef2 /= z_scaled
+        variance = Ef2 - mean**2
+
+        #Add constant to the zeroth moment
+        #NOTE: this constant is not needed in the other moments because it cancells out.
+        z = z_scaled/np.sqrt(2*np.pi/tau)
+
+        return z, mean, variance
+
+    def variational_expectations(self, Y, m, v, gh_points=None):
+        """
+        Use Gauss-Hermite Quadrature to compute 
+
+           E_p(f) [ log p(y|f) ]
+           d/dm E_p(f) [ log p(y|f) ]
+           d/dv E_p(f) [ log p(y|f) ]
+
+        where p(f) is a Gaussian with mean m and variance v. The shapes of Y, m and v should match.
+
+        if no gh_points are passed, we construct them using defualt options
+        """
+
+        if gh_points is None:
+            gh_x, gh_w = np.polynomial.hermite.hermgauss(12)
+        else:
+            gh_x, gh_w = gh_points
+
+        shape = m.shape
+        m,v,Y = m.flatten(), v.flatten(), Y.flatten()
+
+        #make a grid of points
+        X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
+
+        #evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
+        # broadcast needs to be handled carefully. 
+        logp = self.logpdf(X,Y[:,None])
+        dlogp_dx = self.dlogpdf_df(X, Y[:,None])
+        d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])
+
+        #clipping for numerical stability
+        logp = np.clip(logp,-1e6,1e6)
+        dlogp_dx = np.clip(dlogp_dx,-1e6,1e6)
+        d2logp_dx2 = np.clip(d2logp_dx2,-1e6,1e6)
+
+        #average over the gird to get derivatives of the Gaussian's parameters
+        F = np.dot(logp, gh_w)
+        dF_dm = np.dot(dlogp_dx, gh_w)
+        dF_dv = np.dot(d2logp_dx2, gh_w)/2.
+
+        if np.any(np.isnan(dF_dv)) or np.any(np.isinf(dF_dv)):
+            stop
+        if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
+            stop
+
+        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape)
+
+
+
+
+    def predictive_mean(self, mu, variance, Y_metadata=None):
+        """
+        Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
+
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior
+
+        """
+        #conditional_mean: the edpected value of y given some f, under this likelihood
+        def int_mean(f,m,v):
+            p = np.exp(-(0.5/v)*np.square(f - m))
+            #If p is zero then conditional_mean will overflow
+            if p < 1e-10:
+                return 0.
+            else:
+                return self.conditional_mean(f)*p
+        scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
+
+        return mean
+
+    def _conditional_mean(self, f):
+        """Quadrature calculation of the conditional mean: E(Y_star|f)"""
+        raise NotImplementedError, "implement this function to make predictions"
+
+    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
+        """
+        Approximation to the predictive variance: V(Y_star)
+
+        The following variance decomposition is used:
+        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior
+        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
+
+        """
+        #sigma2 = sigma**2
+        normalizer = np.sqrt(2*np.pi*variance)
+
+        # E( V(Y_star|f_star) )
+        def int_var(f,m,v):
+            p = np.exp(-(0.5/v)*np.square(f - m))
+            #If p is zero then conditional_variance will overflow
+            if p < 1e-10:
+                return 0.
+            else:
+                return self.conditional_variance(f)*p
+        scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        exp_var = np.array(scaled_exp_variance)[:,None] / normalizer
+
+        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
+
+        #E( E(Y_star|f_star) )**2
+        if predictive_mean is None:
+            predictive_mean = self.predictive_mean(mu,variance)
+        predictive_mean_sq = predictive_mean**2
+
+        #E( E(Y_star|f_star)**2 )
+        def int_pred_mean_sq(f,m,v,predictive_mean_sq):
+            p = np.exp(-(0.5/v)*np.square(f - m))
+            #If p is zero then conditional_mean**2 will overflow
+            if p < 1e-10:
+                return 0.
+            else:
+                return self.conditional_mean(f)**2*p
+
+        scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
+        exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
+
+        var_exp = exp_exp2 - predictive_mean_sq
+
+        # V(Y_star) = E[ V(Y_star|f_star) ] + V[ E(Y_star|f_star) ]
+        # V(Y_star) = E[ V(Y_star|f_star) ] + E(Y_star**2|f_star) - E[Y_star|f_star]**2
+        return exp_var + var_exp
+
+    def pdf_link(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def logpdf_link(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def dlogpdf_link_dtheta(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink_dtheta(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2_dtheta(self, inv_link_f, y, Y_metadata=None):
+        raise NotImplementedError
+
+    def pdf(self, f, y, Y_metadata=None):
+        """
+        Evaluates the link function link(f) then computes the likelihood (pdf) using it
+
+        .. math:
+            p(y|\\lambda(f))
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        inv_link_f = self.gp_link.transf(f)
+        return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+
+    def logpdf(self, f, y, Y_metadata=None):
+        """
+        Evaluates the link function link(f) then computes the log likelihood (log pdf) using it
+
+        .. math:
+            \\log p(y|\\lambda(f))
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution - not used
+        :returns: log likelihood evaluated for this point
+        :rtype: float
+        """
+        inv_link_f = self.gp_link.transf(f)
+        return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+
+    def dlogpdf_df(self, f, y, Y_metadata=None):
+        """
+        Evaluates the link function link(f) then computes the derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule
+
+        .. math::
+            \\frac{d\\log p(y|\\lambda(f))}{df} = \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d\\lambda(f)}{df}
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution - not used
+        :returns: derivative of log likelihood evaluated for this point
+        :rtype: 1xN array
+        """
+        inv_link_f = self.gp_link.transf(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+        dlink_df = self.gp_link.dtransf_df(f)
+        return chain_1(dlogpdf_dlink, dlink_df)
+
+    def d2logpdf_df2(self, f, y, Y_metadata=None):
+        """
+        Evaluates the link function link(f) then computes the second derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule
+
+        .. math::
+            \\frac{d^{2}\\log p(y|\\lambda(f))}{df^{2}} = \\frac{d^{2}\\log p(y|\\lambda(f))}{d^{2}\\lambda(f)}\\left(\\frac{d\\lambda(f)}{df}\\right)^{2} + \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d^{2}\\lambda(f)}{df^{2}}
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution - not used
+        :returns: second derivative of log likelihood evaluated for this point (diagonal only)
+        :rtype: 1xN array
+        """
+        inv_link_f = self.gp_link.transf(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+        dlink_df = self.gp_link.dtransf_df(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+
+    def d3logpdf_df3(self, f, y, Y_metadata=None):
+        """
+        Evaluates the link function link(f) then computes the third derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule
+
+        .. math::
+            \\frac{d^{3}\\log p(y|\\lambda(f))}{df^{3}} = \\frac{d^{3}\\log p(y|\\lambda(f)}{d\\lambda(f)^{3}}\\left(\\frac{d\\lambda(f)}{df}\\right)^{3} + 3\\frac{d^{2}\\log p(y|\\lambda(f)}{d\\lambda(f)^{2}}\\frac{d\\lambda(f)}{df}\\frac{d^{2}\\lambda(f)}{df^{2}} + \\frac{d\\log p(y|\\lambda(f)}{d\\lambda(f)}\\frac{d^{3}\\lambda(f)}{df^{3}}
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution - not used
+        :returns: third derivative of log likelihood evaluated for this point
+        :rtype: float
+        """
+        inv_link_f = self.gp_link.transf(f)
+        d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
+        dlink_df = self.gp_link.dtransf_df(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+        d3link_df3 = self.gp_link.d3transf_df3(f)
+        return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+
+    def dlogpdf_dtheta(self, f, y, Y_metadata=None):
+        """
+        TODO: Doc strings
+        """
+        if self.size > 0:
+            inv_link_f = self.gp_link.transf(f)
+            return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+        else:
+            # There are no parameters so return an empty array for derivatives
+            return np.zeros([1, 0])
+
+    def dlogpdf_df_dtheta(self, f, y, Y_metadata=None):
+        """
+        TODO: Doc strings
+        """
+        if self.size > 0:
+            inv_link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+            return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+        else:
+            # There are no parameters so return an empty array for derivatives
+            return np.zeros([f.shape[0], 0])
+
+    def d2logpdf_df2_dtheta(self, f, y, Y_metadata=None):
+        """
+        TODO: Doc strings
+        """
+        if self.size > 0:
+            inv_link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+            return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+        else:
+            # There are no parameters so return an empty array for derivatives
+            return np.zeros([f.shape[0], 0])
+
+    def _laplace_gradients(self, f, y, Y_metadata=None):
+        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, Y_metadata=Y_metadata)
+        dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, Y_metadata=Y_metadata)
+        d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, Y_metadata=Y_metadata)
+
+        #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(dlogpdf_dtheta) == self.size #1 x num_param array
+        assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix
+        assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix
+
+        return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
+
+    def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
+        """
+        Compute  mean, variance of the  predictive distibution.
+
+        :param mu: mean of the latent variable, f, of posterior
+        :param var: variance of the latent variable, f, of posterior
+        :param full_cov: whether to use the full covariance or just the diagonal
+        :type full_cov: Boolean
+        """
+
+        pred_mean = self.predictive_mean(mu, var, Y_metadata)
+        pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata)
+
+        return pred_mean, pred_var
+
+    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
+        #compute the quantiles by sampling!!!
+        N_samp = 1000
+        s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
+        #ss_f = s.flatten()
+        #ss_y = self.samples(ss_f, Y_metadata)
+        ss_y = self.samples(s, Y_metadata)
+        #ss_y = ss_y.reshape(mu.shape[0], N_samp)
+
+        return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
+
+    def samples(self, gp, Y_metadata=None):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
        raise NotImplementedError
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@ -1,13 +1,14 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
 import numpy as np
 from scipy import stats
 import scipy as sp
-import pylab as pb
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf

+_exp_lim_val = np.finfo(np.float64).max
+_lim_val = np.log(_exp_lim_val)
+
 class GPTransformation(object):
    """
    Link function class for doing non-Gaussian likelihoods approximation
@ -69,7 +70,7 @@ class Probit(GPTransformation):
    .. math::

        g(f) = \\Phi^{-1} (mu)
-
+    
    """
    def transf(self,f):
        return std_norm_cdf(f)
@ -86,6 +87,34 @@ class Probit(GPTransformation):
        f2 = f**2
        return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)

+
+class Cloglog(GPTransformation):
+    """
+    Complementary log-log link
+    .. math::
+
+        p(f) = 1 - e^{-e^f}
+
+        or
+
+        f = \log (-\log(1-p))
+    
+    """
+    def transf(self,f):
+        return 1-np.exp(-np.exp(f))
+
+    def dtransf_df(self,f):
+        return np.exp(f-np.exp(f))
+
+    def d2transf_df2(self,f):
+        ef = np.exp(f)
+        return -np.exp(f-ef)*(ef-1.)
+
+    def d3transf_df3(self,f):
+        ef = np.exp(f)
+        return np.exp(f-ef)*(1.-3*ef + ef**2)
+
+
 class Log(GPTransformation):
    """
    .. math::
@ -94,16 +123,16 @@ class Log(GPTransformation):

    """
    def transf(self,f):
-        return np.exp(f)
+        return np.exp(np.clip(f, -_lim_val, _lim_val))

    def dtransf_df(self,f):
-        return np.exp(f)
+        return np.exp(np.clip(f, -_lim_val, _lim_val))

    def d2transf_df2(self,f):
-        return np.exp(f)
+        return np.exp(np.clip(f, -_lim_val, _lim_val))

    def d3transf_df3(self,f):
-        return np.exp(f)
+        return np.exp(np.clip(f, -_lim_val, _lim_val))

 class Log_ex_1(GPTransformation):
    """
--- a/GPy/likelihoods/mixed_noise.py
+++ b/GPy/likelihoods/mixed_noise.py
@ -0,0 +1,82 @@
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from scipy import stats, special
+import link_functions
+from likelihood import Likelihood
+from gaussian import Gaussian
+from ..core.parameterization import Param
+from ..core.parameterization.transformations import Logexp
+from ..core.parameterization import Parameterized
+import itertools
+
+class MixedNoise(Likelihood):
+    def __init__(self, likelihoods_list, name='mixed_noise'):
+        #NOTE at the moment this likelihood only works for using a list of gaussians
+        super(Likelihood, self).__init__(name=name)
+
+        self.link_parameters(*likelihoods_list)
+        self.likelihoods_list = likelihoods_list
+        self.log_concave = False
+
+    def gaussian_variance(self, Y_metadata):
+        assert all([isinstance(l, Gaussian) for l in self.likelihoods_list])
+        ind = Y_metadata['output_index'].flatten()
+        variance = np.zeros(ind.size)
+        for lik, j in zip(self.likelihoods_list, range(len(self.likelihoods_list))):
+            variance[ind==j] = lik.variance
+        return variance
+
+    def betaY(self,Y,Y_metadata):
+        #TODO not here.
+        return Y/self.gaussian_variance(Y_metadata=Y_metadata)[:,None]
+
+    def update_gradients(self, gradients):
+        self.gradient = gradients
+
+    def exact_inference_gradients(self, dL_dKdiag, Y_metadata):
+        assert all([isinstance(l, Gaussian) for l in self.likelihoods_list])
+        ind = Y_metadata['output_index'].flatten()
+        return np.array([dL_dKdiag[ind==i].sum() for i in range(len(self.likelihoods_list))])
+
+    def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
+        ind = Y_metadata['output_index'].flatten()
+        _variance = np.array([self.likelihoods_list[j].variance for j in ind ])
+        if full_cov:
+            var += np.eye(var.shape[0])*_variance
+        else:
+            var += _variance
+        return mu, var
+
+    def predictive_variance(self, mu, sigma, Y_metadata):
+        _variance = self.gaussian_variance(Y_metadata)
+        return _variance + sigma**2
+
+    def predictive_quantiles(self, mu, var, quantiles, Y_metadata):
+        ind = Y_metadata['output_index'].flatten()
+        outputs = np.unique(ind)
+        Q = np.zeros( (mu.size,len(quantiles)) )
+        for j in outputs:
+            q = self.likelihoods_list[j].predictive_quantiles(mu[ind==j,:],
+                var[ind==j,:],quantiles,Y_metadata=None)
+            Q[ind==j,:] = np.hstack(q)
+        return [q[:,None] for q in Q.T]
+
+    def samples(self, gp, Y_metadata):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        N1, N2 = gp.shape
+        Ysim = np.zeros((N1,N2))
+        ind = Y_metadata['output_index'].flatten()
+        for j in np.unique(ind):
+            flt = ind==j
+            gp_filtered = gp[flt,:]
+            n1 = gp_filtered.shape[0]
+            lik = self.likelihoods_list[j]
+            _ysim = np.array([np.random.normal(lik.gp_link.transf(gpj), scale=np.sqrt(lik.variance), size=1) for gpj in gp_filtered.flatten()])
+            Ysim[flt,:] = _ysim.reshape(n1,N2)
+        return Ysim
--- a/GPy/likelihoods/noise_model_constructors.py
+++ b/GPy/likelihoods/noise_model_constructors.py
@ -1,121 +0,0 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import noise_models
-
-def bernoulli(gp_link=None):
-    """
-    Construct a bernoulli likelihood
-
-    :param gp_link: a GPy gp_link function
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Probit()
-    #else:
-    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
-
-    if isinstance(gp_link,noise_models.gp_transformations.Probit):
-        analytical_mean = True
-        analytical_variance = False
-
-    elif isinstance(gp_link,noise_models.gp_transformations.Heaviside):
-        analytical_mean = True
-        analytical_variance = True
-
-    else:
-        analytical_mean = False
-        analytical_variance = False
-
-    return noise_models.bernoulli_noise.Bernoulli(gp_link,analytical_mean,analytical_variance)
-
-def exponential(gp_link=None):
-
-    """
-    Construct a exponential likelihood
-
-    :param gp_link: a GPy gp_link function
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Log_ex_1()
-
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.exponential_noise.Exponential(gp_link,analytical_mean,analytical_variance)
-
-def gaussian_ep(gp_link=None,variance=1.):
-    """
-    Construct a gaussian likelihood
-
-    :param gp_link: a GPy gp_link function
-    :param variance: scalar
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
-    #else:
-    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
-
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.gaussian_noise.Gaussian(gp_link,analytical_mean,analytical_variance,variance)
-
-def poisson(gp_link=None):
-    """
-    Construct a Poisson likelihood
-
-    :param gp_link: a GPy gp_link function
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Log_ex_1()
-    #else:
-    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.poisson_noise.Poisson(gp_link,analytical_mean,analytical_variance)
-
-def gamma(gp_link=None,beta=1.):
-    """
-    Construct a Gamma likelihood
-
-    :param gp_link: a GPy gp_link function
-    :param beta: scalar
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Log_ex_1()
-    analytical_mean = False
-    analytical_variance = False
-    return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta)
-
-def gaussian(gp_link=None, variance=2, D=None, N=None):
-    """
-    Construct a Gaussian likelihood
-
-    :param gp_link: a GPy gp_link function
-    :param variance: variance
-    :type variance: scalar
-    :returns: Gaussian noise model:
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
-    analytical_mean = True
-    analytical_variance = True # ?
-    return noise_models.gaussian_noise.Gaussian(gp_link, analytical_mean,
-            analytical_variance, variance=variance, D=D, N=N)
-
-def student_t(gp_link=None, deg_free=5, sigma2=2):
-    """
-    Construct a Student t likelihood
-
-    :param gp_link: a GPy gp_link function
-    :param deg_free: degrees of freedom of student-t
-    :type deg_free: scalar
-    :param sigma2: variance
-    :type sigma2: scalar
-    :returns: Student-T noise model
-    """
-    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
-    analytical_mean = True
-    analytical_variance = True
-    return noise_models.student_t_noise.StudentT(gp_link, analytical_mean,
-            analytical_variance,deg_free, sigma2)
--- a/GPy/likelihoods/noise_models/init.py
+++ b/GPy/likelihoods/noise_models/init.py
@ -1,8 +0,0 @@
-import noise_distributions
-import bernoulli_noise
-import exponential_noise
-import gaussian_noise
-import gamma_noise
-import poisson_noise
-import student_t_noise
-import gp_transformations
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@ -1,300 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Gaussian(NoiseDistribution):
-    """
-    Gaussian likelihood
-
-    .. math::
-        \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
-
-    :param variance: variance value of the Gaussian distribution
-    :param N: Number of data points
-    :type N: int
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1., D=None, N=None):
-        self.variance = variance
-        self.N = N
-        self._set_params(np.asarray(variance))
-        super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)
-        if isinstance(gp_link , gp_transformations.Identity):
-            self.log_concave = True
-
-    def _get_params(self):
-        return np.array([self.variance])
-
-    def _get_param_names(self):
-        return ['noise_model_variance']
-
-    def _set_params(self, p):
-        self.variance = float(p)
-        self.I = np.eye(self.N)
-        self.covariance_matrix = self.I * self.variance
-        self.Ki = self.I*(1.0 / self.variance)
-        #self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
-        self.ln_det_K = self.N*np.log(self.variance)
-
-    def _gradients(self,partial):
-        return np.zeros(1)
-        #return np.sum(partial)
-
-    def _preprocess_values(self,Y):
-        """
-        Check if the values of the observations correspond to the values
-        assumed by the likelihood function.
-        """
-        return Y
-
-    def _moments_match_analytical(self,data_i,tau_i,v_i):
-        """
-        Moments match of the marginal approximation in EP algorithm
-
-        :param i: number of observation (int)
-        :param tau_i: precision of the cavity distribution (float)
-        :param v_i: mean/variance of the cavity distribution (float)
-        """
-        sigma2_hat = 1./(1./self.variance + tau_i)
-        mu_hat = sigma2_hat*(data_i/self.variance + v_i)
-        sum_var = self.variance + 1./tau_i
-        Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var)
-        return Z_hat, mu_hat, sigma2_hat
-
-    def _predictive_mean_analytical(self,mu,sigma):
-        new_sigma2 = self.predictive_variance(mu,sigma)
-        return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)
-
-    def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
-        return 1./(1./self.variance + 1./sigma**2)
-
-    def _mass(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
-                            Please negate your function and use pdf in noise_model.py, if implementing a likelihood\
-                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
-                            its derivatives")
-    def _nlog_mass(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
-                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
-                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
-                            its derivatives")
-
-    def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
-                            Please negate your function and use dlogpdf_df in noise_model.py, if implementing a likelihood\
-                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
-                            its derivatives")
-
-    def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
-                            Please negate your function and use d2logpdf_df2 in noise_model.py, if implementing a likelihood\
-                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
-                            its derivatives")
-
-    def pdf_link(self, link_f, y, extra_data=None):
-        """
-        Likelihood function given link(f)
-
-        .. math::
-            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: likelihood evaluated for this point
-        :rtype: float
-        """
-        #Assumes no covariance, exp, sum, log for numerical stability
-        return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
-
-    def logpdf_link(self, link_f, y, extra_data=None):
-        """
-        Log likelihood function given link(f)
-
-        .. math::
-            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: log likelihood evaluated for this point
-        :rtype: float
-        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
-
-    def dlogpdf_dlink(self, link_f, y, extra_data=None):
-        """
-        Gradient of the pdf at y, given link(f) w.r.t link(f)
-
-        .. math::
-            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: gradient of log likelihood evaluated at points link(f)
-        :rtype: Nx1 array
-        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        s2_i = (1.0/self.variance)
-        grad = s2_i*y - s2_i*link_f
-        return grad
-
-    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
-        """
-        Hessian at y, given link_f, w.r.t link_f.
-        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
-
-        The hessian will be 0 unless i == j
-
-        .. math::
-            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
-        :rtype: Nx1 array
-
-        .. Note::
-            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
-        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        hess = -(1.0/self.variance)*np.ones((self.N, 1))
-        return hess
-
-    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
-        """
-        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
-
-        .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = 0
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: third derivative of log likelihood evaluated at points link(f)
-        :rtype: Nx1 array
-        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None]
-        return d3logpdf_dlink3
-
-    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
-        """
-        Gradient of the log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
-
-        .. math::
-            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = -\\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
-        :rtype: float
-        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        e = y - link_f
-        s_4 = 1.0/(self.variance**2)
-        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.sum(np.square(e))
-        return np.sum(dlik_dsigma) # Sure about this sum?
-
-    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
-        """
-        Derivative of the dlogpdf_dlink w.r.t variance parameter (noise_variance)
-
-        .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)}) = \\frac{1}{\\sigma^{4}}(-y_{i} + \\lambda(f_{i}))
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
-        :rtype: Nx1 array
-        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        s_4 = 1.0/(self.variance**2)
-        dlik_grad_dsigma = -s_4*y + s_4*link_f
-        return dlik_grad_dsigma
-
-    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
-        """
-        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (noise_variance)
-
-        .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)}) = \\frac{1}{\\sigma^{4}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
-        :rtype: Nx1 array
-        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        s_4 = 1.0/(self.variance**2)
-        d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
-        return d2logpdf_dlink2_dvar
-
-    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
-        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
-        return np.asarray([[dlogpdf_dvar]])
-
-    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
-        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
-        return dlogpdf_dlink_dvar
-
-    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
-        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
-        return d2logpdf_dlink2_dvar
-
-    def _mean(self,gp):
-        """
-        Expected value of y under the Mass (or density) function p(y|f)
-
-        .. math::
-            E_{p(y|f)}[y]
-        """
-        return self.gp_link.transf(gp)
-
-    def _variance(self,gp):
-        """
-        Variance of y under the Mass (or density) function p(y|f)
-
-        .. math::
-            Var_{p(y|f)}[y]
-        """
-        return self.variance
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param gp: latent variable
-        """
-        orig_shape = gp.shape
-        gp = gp.flatten()
-        Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
-        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@ -1,434 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-import pylab as pb
-from GPy.util.plot import gpplot
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from GPy.util.misc import chain_1, chain_2, chain_3
-from scipy.integrate import quad
-import warnings
-
-class NoiseDistribution(object):
-    """
-    Likelihood class for doing approximations
-    """
-    def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
-        assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
-        self.gp_link = gp_link
-        self.analytical_mean = analytical_mean
-        self.analytical_variance = analytical_variance
-        if self.analytical_mean:
-            self.moments_match = self._moments_match_analytical
-            self.predictive_mean = self._predictive_mean_analytical
-        else:
-            self.moments_match = self._moments_match_numerical
-            self.predictive_mean = self._predictive_mean_numerical
-        if self.analytical_variance:
-            self.predictive_variance = self._predictive_variance_analytical
-        else:
-            self.predictive_variance = self._predictive_variance_numerical
-
-        self.log_concave = False
-
-    def _get_params(self):
-        return np.zeros(0)
-
-    def _get_param_names(self):
-        return []
-
-    def _set_params(self,p):
-        pass
-
-    def _gradients(self,partial):
-        return np.zeros(0)
-
-    def _preprocess_values(self,Y):
-        """
-        In case it is needed, this function assess the output values or makes any pertinent transformation on them.
-
-        :param Y: observed output
-        :type Y: Nx1 numpy.darray
-
-        """
-        return Y
-
-    def _moments_match_analytical(self,obs,tau,v):
-        """
-        If available, this function computes the moments analytically.
-        """
-        raise NotImplementedError
-
-    def log_predictive_density(self, y_test, mu_star, var_star):
-        """
-        Calculation of the log predictive density
-
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
-
-        :param y_test: test observations (y_{*})
-        :type y_test: (Nx1) array
-        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type mu_star: (Nx1) array
-        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type var_star: (Nx1) array
-        """
-        assert y_test.shape==mu_star.shape
-        assert y_test.shape==var_star.shape
-        assert y_test.shape[1] == 1
-        def integral_generator(y, m, v):
-            """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
-            def f(f_star):
-                return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
-            return f
-
-        scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
-        scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
-        p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
-        return np.log(p_ystar)
-
-    def _moments_match_numerical(self,obs,tau,v):
-        """
-        Calculation of moments using quadrature
-
-        :param obs: observed output
-        :param tau: cavity distribution 1st natural parameter (precision)
-        :param v: cavity distribution 2nd natural paramenter (mu*precision)
-        """
-        #Compute first integral for zeroth moment.
-        #NOTE constant np.sqrt(2*pi/tau) added at the end of the function
-        mu = v/tau
-        def int_1(f):
-            return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
-        z_scaled, accuracy = quad(int_1, -np.inf, np.inf)
-
-        #Compute second integral for first moment
-        def int_2(f):
-            return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
-        mean, accuracy = quad(int_2, -np.inf, np.inf)
-        mean /= z_scaled
-
-        #Compute integral for variance
-        def int_3(f):
-            return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
-        Ef2, accuracy = quad(int_3, -np.inf, np.inf)
-        Ef2 /= z_scaled
-        variance = Ef2 - mean**2
-
-        #Add constant to the zeroth moment
-        #NOTE: this constant is not needed in the other moments because it cancells out.
-        z = z_scaled/np.sqrt(2*np.pi/tau)
-
-        return z, mean, variance
-
-    def _predictive_mean_analytical(self,mu,sigma):
-        """
-        Predictive mean
-        .. math::
-            E(Y^{*}|Y) = E( E(Y^{*}|f^{*}, Y) )
-
-        If available, this function computes the predictive mean analytically.
-        """
-        raise NotImplementedError
-
-    def _predictive_variance_analytical(self,mu,sigma):
-        """
-        Predictive variance
-        .. math::
-            V(Y^{*}| Y) = E( V(Y^{*}|f^{*}, Y) ) + V( E(Y^{*}|f^{*}, Y) )
-
-        If available, this function computes the predictive variance analytically.
-        """
-        raise NotImplementedError
-
-    def _predictive_mean_numerical(self,mu,variance):
-        """
-        Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
-
-        :param mu: mean of posterior
-        :param sigma: standard deviation of posterior
-
-        """
-        #import ipdb; ipdb.set_trace()
-        def int_mean(f,m,v):
-            return self._mean(f)*np.exp(-(0.5/v)*np.square(f - m))
-        #scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
-        scaled_mean = [quad(int_mean, mj-6*np.sqrt(s2j), mj+6*np.sqrt(s2j), args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
-        mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
-
-        return mean
-
-    def _predictive_variance_numerical(self,mu,variance,predictive_mean=None):
-        """
-        Numerical approximation to the predictive variance: V(Y_star)
-
-        The following variance decomposition is used:
-        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
-
-        :param mu: mean of posterior
-        :param sigma: standard deviation of posterior
-        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
-
-        """
-        normalizer = np.sqrt(2*np.pi*variance)
-
-        # E( V(Y_star|f_star) )
-        def int_var(f,m,v):
-            return self._variance(f)*np.exp(-(0.5/v)*np.square(f - m))
-        #Most of the weight is within 6 stds and this avoids some negative infinity and infinity problems of taking f^2
-        scaled_exp_variance = [quad(int_var, mj-6*np.sqrt(s2j), mj+6*np.sqrt(s2j), args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
-        exp_var = np.array(scaled_exp_variance)[:,None] / normalizer
-
-        #V( E(Y_star|f_star) ) = E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
-
-        #E( E(Y_star|f_star) )**2
-        if predictive_mean is None:
-            predictive_mean = self.predictive_mean(mu,variance)
-        predictive_mean_sq = predictive_mean**2
-
-        #E( E(Y_star|f_star)**2 )
-        def int_pred_mean_sq(f,m,v):
-            return self._mean(f)**2*np.exp(-(0.5/v)*np.square(f - m))
-        scaled_exp_exp2 = [quad(int_pred_mean_sq, mj-6*np.sqrt(s2j), mj+6*np.sqrt(s2j), args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
-        exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
-
-        var_exp = exp_exp2 - predictive_mean_sq
-
-        # V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
-        return exp_var + var_exp
-
-    def pdf_link(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def logpdf_link(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def dlogpdf_dlink(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def d2logpdf_dlink2_dtheta(self, link_f, y, extra_data=None):
-        raise NotImplementedError
-
-    def pdf(self, f, y, extra_data=None):
-        """
-        Evaluates the link function link(f) then computes the likelihood (pdf) using it
-
-        .. math:
-            p(y|\\lambda(f))
-
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: likelihood evaluated for this point
-        :rtype: float
-        """
-        link_f = self.gp_link.transf(f)
-        return self.pdf_link(link_f, y, extra_data=extra_data)
-
-    def logpdf(self, f, y, extra_data=None):
-        """
-        Evaluates the link function link(f) then computes the log likelihood (log pdf) using it
-
-        .. math:
-            \\log p(y|\\lambda(f))
-
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: log likelihood evaluated for this point
-        :rtype: float
-        """
-        link_f = self.gp_link.transf(f)
-        return self.logpdf_link(link_f, y, extra_data=extra_data)
-
-    def dlogpdf_df(self, f, y, extra_data=None):
-        """
-        Evaluates the link function link(f) then computes the derivative of log likelihood using it
-        Uses the Faa di Bruno's formula for the chain rule
-
-        .. math::
-            \\frac{d\\log p(y|\\lambda(f))}{df} = \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d\\lambda(f)}{df}
-
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of log likelihood evaluated for this point
-        :rtype: 1xN array
-        """
-        link_f = self.gp_link.transf(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
-        dlink_df = self.gp_link.dtransf_df(f)
-        return chain_1(dlogpdf_dlink, dlink_df)
-
-    def d2logpdf_df2(self, f, y, extra_data=None):
-        """
-        Evaluates the link function link(f) then computes the second derivative of log likelihood using it
-        Uses the Faa di Bruno's formula for the chain rule
-
-        .. math::
-            \\frac{d^{2}\\log p(y|\\lambda(f))}{df^{2}} = \\frac{d^{2}\\log p(y|\\lambda(f))}{d^{2}\\lambda(f)}\\left(\\frac{d\\lambda(f)}{df}\\right)^{2} + \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d^{2}\\lambda(f)}{df^{2}}
-
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: second derivative of log likelihood evaluated for this point (diagonal only)
-        :rtype: 1xN array
-        """
-        link_f = self.gp_link.transf(f)
-        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
-        dlink_df = self.gp_link.dtransf_df(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
-        d2link_df2 = self.gp_link.d2transf_df2(f)
-        return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
-
-    def d3logpdf_df3(self, f, y, extra_data=None):
-        """
-        Evaluates the link function link(f) then computes the third derivative of log likelihood using it
-        Uses the Faa di Bruno's formula for the chain rule
-
-        .. math::
-            \\frac{d^{3}\\log p(y|\\lambda(f))}{df^{3}} = \\frac{d^{3}\\log p(y|\\lambda(f)}{d\\lambda(f)^{3}}\\left(\\frac{d\\lambda(f)}{df}\\right)^{3} + 3\\frac{d^{2}\\log p(y|\\lambda(f)}{d\\lambda(f)^{2}}\\frac{d\\lambda(f)}{df}\\frac{d^{2}\\lambda(f)}{df^{2}} + \\frac{d\\log p(y|\\lambda(f)}{d\\lambda(f)}\\frac{d^{3}\\lambda(f)}{df^{3}}
-
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: third derivative of log likelihood evaluated for this point
-        :rtype: float
-        """
-        link_f = self.gp_link.transf(f)
-        d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, extra_data=extra_data)
-        dlink_df = self.gp_link.dtransf_df(f)
-        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
-        d2link_df2 = self.gp_link.d2transf_df2(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
-        d3link_df3 = self.gp_link.d3transf_df3(f)
-        return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
-
-    def dlogpdf_dtheta(self, f, y, extra_data=None):
-        """
-        TODO: Doc strings
-        """
-        if len(self._get_param_names()) > 0:
-            link_f = self.gp_link.transf(f)
-            return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
-        else:
-            #Is no parameters so return an empty array for its derivatives
-            return np.empty([1, 0])
-
-    def dlogpdf_df_dtheta(self, f, y, extra_data=None):
-        """
-        TODO: Doc strings
-        """
-        if len(self._get_param_names()) > 0:
-            link_f = self.gp_link.transf(f)
-            dlink_df = self.gp_link.dtransf_df(f)
-            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
-            return chain_1(dlogpdf_dlink_dtheta, dlink_df)
-        else:
-            #Is no parameters so return an empty array for its derivatives
-            return np.empty([f.shape[0], 0])
-
-    def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
-        """
-        TODO: Doc strings
-        """
-        if len(self._get_param_names()) > 0:
-            link_f = self.gp_link.transf(f)
-            dlink_df = self.gp_link.dtransf_df(f)
-            d2link_df2 = self.gp_link.d2transf_df2(f)
-            d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
-            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
-            return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
-        else:
-            #Is no parameters so return an empty array for its derivatives
-            return np.empty([f.shape[0], 0])
-
-    def _laplace_gradients(self, f, y, extra_data=None):
-        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data)
-        dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, extra_data=extra_data)
-        d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, extra_data=extra_data)
-
-        #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
-        # ensure we have gradients for every parameter we want to optimize
-        assert dlogpdf_dtheta.shape[1] == len(self._get_param_names())
-        assert dlogpdf_df_dtheta.shape[1] == len(self._get_param_names())
-        assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
-        return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
-
-    def predictive_values(self, mu, var, full_cov=False, sampling=False, num_samples=10000):
-        """
-        Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.
-
-        :param mu: mean of the latent variable, f, of posterior
-        :param var: variance of the latent variable, f, of posterior
-        :param full_cov: whether to use the full covariance or just the diagonal
-        :type full_cov: Boolean
-        :param num_samples: number of samples to use in computing quantiles and
-                            possibly mean variance
-        :type num_samples: integer
-        :param sampling: Whether to use samples for mean and variances anyway
-        :type sampling: Boolean
-
-        """
-
-        if sampling:
-            #Get gp_samples f* using posterior mean and variance
-            if not full_cov:
-                gp_samples = np.random.multivariate_normal(mu.flatten(), np.diag(var.flatten()),
-                                                            size=num_samples).T
-            else:
-                gp_samples = np.random.multivariate_normal(mu.flatten(), var,
-                                                               size=num_samples).T
-            #Push gp samples (f*) through likelihood to give p(y*|f*)
-            samples = self.samples(gp_samples)
-            axis=-1
-
-            #Calculate mean, variance and precentiles from samples
-            warnings.warn("Using sampling to calculate mean, variance and predictive quantiles.")
-            pred_mean = np.mean(samples, axis=axis)[:,None]
-            pred_var = np.var(samples, axis=axis)[:,None]
-            q1 = np.percentile(samples, 2.5, axis=axis)[:,None]
-            q3 = np.percentile(samples, 97.5, axis=axis)[:,None]
-
-        else:
-            pred_mean = self.predictive_mean(mu, var)
-            pred_var = self.predictive_variance(mu, var, pred_mean)
-            warnings.warn("Predictive quantiles are only computed when sampling.")
-            q1 = np.repeat(np.nan,pred_mean.size)[:,None]
-            q3 = q1.copy()
-
-        return pred_mean, pred_var, q1, q3
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param gp: latent variable
-        """
-        raise NotImplementedError
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@ -1,15 +1,14 @@
 from __future__ import division
-# Copyright (c) 2012, 2013 Ricardo Andrade
+# Copyright (c) 2012-2014 Ricardo Andrade, Alan Saul
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
 from scipy import stats,special
 import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
+import link_functions
+from likelihood import Likelihood

-class Poisson(NoiseDistribution):
+class Poisson(Likelihood):
    """
    Poisson likelihood

@ -19,13 +18,19 @@ class Poisson(NoiseDistribution):
    .. Note::
        Y is expected to take values in {0,1,2,...}
    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
+    def __init__(self, gp_link=None):
+        if gp_link is None:
+            gp_link = link_functions.Log()

-    def _preprocess_values(self,Y): #TODO
-        return Y
+        super(Poisson, self).__init__(gp_link, name='Poisson')

-    def pdf_link(self, link_f, y, extra_data=None):
+    def _conditional_mean(self, f):
+        """
+        the expected value of y given a value of f
+        """
+        return self.gp_link.transf(gp)
+
+    def pdf_link(self, link_f, y, Y_metadata=None):
        """
        Likelihood function given link(f)

@ -36,14 +41,14 @@ class Poisson(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in poisson distribution
+        :param Y_metadata: Y_metadata which is not used in poisson distribution
        :returns: likelihood evaluated for this point
        :rtype: float
        """
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        return np.prod(stats.poisson.pmf(y,link_f))

-    def logpdf_link(self, link_f, y, extra_data=None):
+    def logpdf_link(self, link_f, y, Y_metadata=None):
        """
        Log Likelihood Function given link(f)

@ -54,7 +59,7 @@ class Poisson(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in poisson distribution
+        :param Y_metadata: Y_metadata which is not used in poisson distribution
        :returns: likelihood evaluated for this point
        :rtype: float

@ -62,7 +67,7 @@ class Poisson(NoiseDistribution):
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))

-    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)

@ -73,7 +78,7 @@ class Poisson(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in poisson distribution
+        :param Y_metadata: Y_metadata which is not used in poisson distribution
        :returns: gradient of likelihood evaluated at points
        :rtype: Nx1 array

@ -81,7 +86,7 @@ class Poisson(NoiseDistribution):
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        return y/link_f - 1

-    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
        """
        Hessian at y, given link(f), w.r.t link(f)
        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
@ -94,7 +99,7 @@ class Poisson(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in poisson distribution
+        :param Y_metadata: Y_metadata which is not used in poisson distribution
        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
        :rtype: Nx1 array

@ -109,7 +114,7 @@ class Poisson(NoiseDistribution):
        #transf = self.gp_link.transf(gp)
        #return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df

-    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
        """
        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)

@ -120,7 +125,7 @@ class Poisson(NoiseDistribution):
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in poisson distribution
+        :param Y_metadata: Y_metadata which is not used in poisson distribution
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
@ -128,19 +133,19 @@ class Poisson(NoiseDistribution):
        d3lik_dlink3 = 2*y/(link_f)**3
        return d3lik_dlink3

-    def _mean(self,gp):
+    def conditional_mean(self,gp):
        """
-        Mass (or density) function
+        The mean of the random variable conditioned on one value of the GP
        """
        return self.gp_link.transf(gp)

-    def _variance(self,gp):
+    def conditional_variance(self,gp):
        """
-        Mass (or density) function
+        The variance of the random variable conditioned on one value of the GP
        """
        return self.gp_link.transf(gp)

-    def samples(self, gp):
+    def samples(self, gp, Y_metadata=None):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@ -0,0 +1,279 @@
+# Copyright (c) 2012-2014 Ricardo Andrade, Alan Saul
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from scipy import stats, special
+import scipy as sp
+import link_functions
+from scipy import stats, integrate
+from scipy.special import gammaln, gamma
+from likelihood import Likelihood
+from ..core.parameterization import Param
+from ..core.parameterization.transformations import Logexp
+
+class StudentT(Likelihood):
+    """
+    Student T likelihood
+
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+
+    """
+    def __init__(self,gp_link=None, deg_free=5, sigma2=2):
+        if gp_link is None:
+            gp_link = link_functions.Identity()
+
+        super(StudentT, self).__init__(gp_link, name='Student_T')
+        # sigma2 is not a noise parameter, it is a squared scale.
+        self.sigma2 = Param('t_scale2', float(sigma2), Logexp())
+        self.v = Param('deg_free', float(deg_free))
+        self.link_parameter(self.sigma2)
+        self.link_parameter(self.v)
+        self.v.constrain_fixed()
+
+        self.log_concave = False
+
+    def parameters_changed(self):
+        self.variance = (self.v / float(self.v - 2)) * self.sigma2
+
+    def update_gradients(self, grads):
+        """
+        Pull out the gradients, be careful as the order must match the order
+        in which the parameters are added
+        """
+        self.sigma2.gradient = grads[0]
+        self.v.gradient = grads[1]
+
+    def pdf_link(self, inv_link_f, y, Y_metadata=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+
+        :param inv_link_f: latent variables link(f)
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        #Careful gamma(big_number) is infinity!
+        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
+                     / (np.sqrt(self.v * np.pi * self.sigma2)))
+                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
+                    )
+        return np.prod(objective)
+
+    def logpdf_link(self, inv_link_f, y, Y_metadata=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
+
+        :param inv_link_f: latent variables (link(f))
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        #FIXME:
+        #Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
+        #But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
+        #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
+        objective = (+ gammaln((self.v + 1) * 0.5)
+                    - gammaln(self.v * 0.5)
+                    - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                    - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
+                    )
+        return np.sum(objective)
+
+    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}
+
+        :param inv_link_f: latent variables (f)
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        return grad
+
+    def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
+
+        :param inv_link_f: latent variables inv_link(f)
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        return hess
+
+    def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}
+
+        :param inv_link_f: latent variables link(f)
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
+                    )
+        return d3lik_dlink3
+
+    def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None):
+        """
+        Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}
+
+        :param inv_link_f: latent variables link(f)
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: float
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
+        return np.sum(dlogpdf_dvar)
+
+    def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
+        """
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}
+
+        :param inv_link_f: latent variables inv_link_f
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None):
+        """
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}
+
+        :param inv_link_f: latent variables link(f)
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata which is not used in student t distribution
+        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
+        e = y - inv_link_f
+        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+                              / ((self.sigma2*self.v + (e**2))**3)
+                           )
+        return d2logpdf_dlink2_dvar
+
+    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
+        dlogpdf_dv = np.zeros_like(dlogpdf_dvar) #FIXME: Not done yet
+        return np.hstack((dlogpdf_dvar, dlogpdf_dv))
+
+    def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
+        dlogpdf_dlink_dv = np.zeros_like(dlogpdf_dlink_dvar) #FIXME: Not done yet
+        return np.hstack((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
+
+    def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
+        d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet
+        return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
+
+    def predictive_mean(self, mu, sigma, Y_metadata=None):
+        # The comment here confuses mean and median. 
+        return self.gp_link.transf(mu) # only true if link is monotonic, which it is.
+
+    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
+        if self.deg_free<=2.:
+            return np.empty(mu.shape)*np.nan # does not exist for degrees of freedom <= 2.
+        else:
+            return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata)
+
+    def conditional_mean(self, gp):
+        return self.gp_link.transf(gp)
+
+    def conditional_variance(self, gp):
+        return self.deg_free/(self.deg_free - 2.)
+
+    def samples(self, gp, Y_metadata=None):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        #FIXME: Very slow as we are computing a new random variable per input!
+        #Can't get it to sample all at the same time
+        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        dfs = np.ones_like(gp)*self.v
+        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
+                                        scale=scales)
+        return student_t_samples.reshape(orig_shape)