Merge branch 'devel' of github.com:SheffieldML/GPy into devel

2026-06-29 15:59:41 +02:00 · 2013-10-31 17:47:12 +00:00 · 2013-10-31 17:47:12 +00:00 · ddf106dd32
commit ddf106dd32
parent 490755130a ecfffc97e6
45 changed files with 3149 additions and 632 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,4 +45,4 @@ iterate.dat

 # git merge files #
 ###################
-*.orig
+*.orig
--- a/.travis.yml
+++ b/.travis.yml
@ -7,7 +7,7 @@ virtualenv:
  system_site_packages: true

 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
-before_install: 
+before_install:
  - sudo apt-get install -qq python-scipy python-pip
  - sudo apt-get install -qq python-matplotlib
  # Workaround for a permissions issue with Travis virtual machine images
@ -17,10 +17,10 @@ before_install:
  - sudo ln -s /run/shm /dev/shm

 install:
-  - pip install --upgrade numpy==1.7.1 
-  - pip install sphinx 
+  - pip install --upgrade numpy==1.7.1
+  - pip install sphinx
  - pip install nose
  - pip install . --use-mirrors
 # command to run tests, e.g. python setup.py test
-script: 
+script:
  - nosetests GPy/testing
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -6,7 +6,7 @@ import numpy as np
 import pylab as pb
 from .. import kern
 from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
-from ..likelihoods import EP
+from ..likelihoods import EP, Laplace
 from gp_base import GPBase

 class GP(GPBase):
@ -25,14 +25,23 @@ class GP(GPBase):
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        self.update_likelihood_approximation()


    def _set_params(self, p):
-        self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
-        self.likelihood._set_params(p[self.kern.num_params_transformed():])
+        new_kern_params = p[:self.kern.num_params_transformed()]
+        new_likelihood_params = p[self.kern.num_params_transformed():]
+        old_likelihood_params = self.likelihood._get_params()
+
+        self.kern._set_params_transformed(new_kern_params)
+        self.likelihood._set_params_transformed(new_likelihood_params)

        self.K = self.kern.K(self.X)
+
+        #Re fit likelihood approximation (if it is an approx), as parameters have changed
+        if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.K)
+
        self.K += self.likelihood.covariance_matrix

        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
@ -49,6 +58,10 @@ class GP(GPBase):
            tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
            self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)

+        #Adding dZ_dK (0 for a non-approximate likelihood, compensates for
+        #additional gradients of K when log-likelihood has non-zero Z term)
+        self.dL_dK += self.likelihood.dZ_dK
+
    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))

@ -88,7 +101,6 @@ class GP(GPBase):
        return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) -
            0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z)

-
    def _log_likelihood_gradients(self):
        """
        The gradient of all parameters.
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@ -99,13 +99,13 @@ class GPBase(Model):

        see also: gp_base.plot
        """
-        kwargs['use_raw_predict'] = True
+        kwargs['plot_raw'] = True
        self.plot(*args, **kwargs)

    def plot(self, plot_limits=None, which_data_rows='all',
            which_data_ycols='all', which_parts='all', fixed_inputs=[],
            levels=20, samples=0, fignum=None, ax=None, resolution=None,
-            use_raw_predict=False,
+            plot_raw=False,
            linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
        """ 
        Plot the posterior of the GP.
@ -170,15 +170,17 @@ class GPBase(Model):
                Xgrid[:,i] = v

            #make a prediction on the frame and plot it
-            if use_raw_predict:
+            if plot_raw:
                m, v = self._raw_predict(Xgrid, which_parts=which_parts)
                lower = m - 2*np.sqrt(v)
                upper = m + 2*np.sqrt(v)
+                Y = self.likelihood.Y
            else:
                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.data
            for d in which_data_ycols:
                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data_rows,free_dims], self.likelihood.data[which_data_rows, d], 'kx', mew=1.5)
+                ax.plot(Xu[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)

            #optionally plot some samples
            if samples: #NOTE not tested with fixed_inputs
@ -188,7 +190,7 @@ class GPBase(Model):
                    #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.

            #set the limits of the plot to some sensible values
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+            ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_xlim(xmin, xmax)
            ax.set_ylim(ymin, ymax)
@ -209,13 +211,14 @@ class GPBase(Model):
            #predict on the frame and plot
            if use_raw_predict:
                m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.Y
            else:
                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.data
            for d in which_data_ycols:
                m_d = m[:,d].reshape(resolution, resolution).T
                ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-                Y_d = self.likelihood.Y[which_data_rows,d]
-                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y_d, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)

            #set the limits of the plot to some sensible values
            ax.set_xlim(xmin[0], xmax[0])
@ -255,4 +258,17 @@ class GPBase(Model):
        self.X = state.pop()
        Model.setstate(self, state)

+    def log_predictive_density(self, x_test, y_test):
+        """
+        Calculation of the log predictive density

+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param x_test: test observations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        return self.likelihood.log_predictive_density(y_test, mu_star, var_star)
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -251,7 +251,7 @@ class Model(Parameterized):
            self._set_params_transformed(initial_parameters)

    def ensure_default_constraints(self):
-        """       
+        """
        Ensure that any variables which should clearly be positive
        have been constrained somehow. The method performs a regular
        expression search on parameter names looking for the terms
@ -274,7 +274,7 @@ class Model(Parameterized):
        """
        The objective function passed to the optimizer. It combines
        the likelihood and the priors.
-        
+
        Failures are handled robustly. The algorithm will try several times to
        return the objective, and will raise the original exception if it
        the objective cannot be computed.
@ -462,7 +462,7 @@ class Model(Parameterized):

            numerical_gradient = (f1 - f2) / (2 * dx)
            global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient)))
-            
+
            return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
        else:
            # check the gradient of each parameter individually, and do some pretty printing
@ -547,9 +547,9 @@ class Model(Parameterized):
        :param stop_crit: convergence criterion
        :type stop_crit: float

-        .. Note: kwargs are passed to update_likelihood and optimize functions. 
+        .. Note: kwargs are passed to update_likelihood and optimize functions.
        """
-        assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods"
+        assert isinstance(self.likelihood, (likelihoods.EP, likelihoods.EP_Mixed_Noise, likelihoods.Laplace)), "pseudo_EM is only available for approximate likelihoods"
        ll_change = stop_crit + 1.
        iteration = 0
        last_ll = -np.inf
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -324,7 +324,7 @@ class SparseGP(GPBase):


    def plot_f(self, samples=0, plot_limits=None, which_data_rows='all',
-            which_data_cols='all', which_parts='all', resolution=None,
+            which_data_ycols='all', which_parts='all', resolution=None,
            full_cov=False, fignum=None, ax=None):

        """
@ -359,7 +359,7 @@ class SparseGP(GPBase):
        if which_data_rows is 'all':
            which_data_rows = slice(None)

-        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
+        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, fignum=fignum, ax=ax)

        if self.X.shape[1] == 1:
            if self.has_uncertain_inputs:
@ -379,6 +379,7 @@ class SparseGP(GPBase):

    def plot(self, plot_limits=None, which_data_rows='all',
            which_data_ycols='all', which_parts='all', fixed_inputs=[],
+            plot_raw=False,
            levels=20, samples=0, fignum=None, ax=None, resolution=None):
        """ 
        Plot the posterior of the sparse GP.
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -43,7 +43,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None):

 def toy_linear_1d_classification(seed=default_seed):
    """
-    Simple 1D classification example
+    Simple 1D classification example using EP approximation

    :param seed: seed value for data generation (default is 4).
    :type seed: int
@ -61,6 +61,7 @@ def toy_linear_1d_classification(seed=default_seed):
    #m.update_likelihood_approximation()
    # Parameters optimization:
    #m.optimize()
+    #m.update_likelihood_approximation()
    m.pseudo_EM()

    # Plot
@ -71,6 +72,41 @@ def toy_linear_1d_classification(seed=default_seed):

    return m

+def toy_linear_1d_classification_laplace(seed=default_seed):
+    """
+    Simple 1D classification example using Laplace approximation
+
+    :param seed: seed value for data generation (default is 4).
+    :type seed: int
+
+    """
+
+    data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+    Y = data['Y'][:, 0:1]
+    Y[Y.flatten() == -1] = 0
+
+    bern_noise_model = GPy.likelihoods.bernoulli()
+    laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), bern_noise_model)
+
+    # Model definition
+    m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
+
+    print m
+    # Optimize
+    #m.update_likelihood_approximation()
+    # Parameters optimization:
+    m.optimize('bfgs', messages=1)
+    #m.pseudo_EM()
+
+    # Plot
+    fig, axes = pb.subplots(2,1)
+    m.plot_f(ax=axes[0])
+    m.plot(ax=axes[1])
+    print(m)
+
+    return m
+
+
 def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
    """
    Sparse 1D classification example
@ -116,7 +152,7 @@ def toy_heaviside(seed=default_seed):
    Y[Y.flatten() == -1] = 0

    # Model definition
-    noise_model = GPy.likelihoods.binomial(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
+    noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
    likelihood = GPy.likelihoods.EP(Y,noise_model)
    m = GPy.models.GPClassification(data['X'], likelihood=likelihood)

--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@ -0,0 +1,296 @@
+import GPy
+import numpy as np
+import matplotlib.pyplot as plt
+from GPy.util import datasets
+np.random.seed(1)
+
+def student_t_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    real_std = 0.1
+    #Start a function, any function
+    X = np.linspace(0.0, np.pi*2, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+    Yc = Y.copy()
+
+    X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
+    Y_full = np.sin(X_full)
+
+    Y = Y/Y.max()
+
+    #Slightly noisy data
+    Yc[75:80] += 1
+
+    #Very noisy data
+    #Yc[10] += 100
+    #Yc[25] += 10
+    #Yc[23] += 10
+    #Yc[26] += 1000
+    #Yc[24] += 10
+    #Yc = Yc/Yc.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 5
+    print "Real noise: ", real_std
+    initial_var_guess = 0.5
+
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
+
+    plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GPRegression(X, Y, kernel=kernel1)
+    # optimize
+    m.ensure_default_constraints()
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    m.optimize()
+    # plot
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Gaussian clean')
+    print m
+
+    #Corrupt
+    print "Corrupt Gaussian"
+    m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    m.optimize()
+    ax = plt.subplot(212)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Gaussian corrupt')
+    print m
+
+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = initial_var_guess
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
+    m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
+    m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    #m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Student-t rasm clean')
+
+    print "Corrupt student t, rasm"
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
+    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
+    m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    for a in range(1):
+        m.randomize()
+        m_start = m.copy()
+        print m
+        m.optimize('scg', messages=1)
+    print(m)
+    ax = plt.subplot(212)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Student-t rasm corrupt')
+
+    return m
+
+def boston_example():
+    import sklearn
+    from sklearn.cross_validation import KFold
+    optimizer='bfgs'
+    messages=0
+    data = datasets.boston_housing()
+    degrees_freedoms = [3, 5, 8, 10]
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    num_folds = 10
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
+    score_folds = np.zeros((num_models, num_folds))
+    pred_density = score_folds.copy()
+
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
+
+        noise = 1e-1 #np.exp(-2)
+        rbf_len = 0.5
+        data_axis_plot = 4
+        plot = False
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+
+        #Baseline
+        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
+
+        #Gaussian GP
+        print "Gauss GP"
+        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
+        mgp.ensure_default_constraints()
+        mgp.constrain_fixed('white', 1e-5)
+        mgp['rbf_len'] = rbf_len
+        mgp['noise'] = noise
+        print mgp
+        mgp.optimize(optimizer=optimizer, messages=messages)
+        Y_test_pred = mgp.predict(X_test)
+        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
+        print mgp
+        print pred_density
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('GP gauss')
+
+        print "Gaussian Laplace GP"
+        N, D = Y_train.shape
+        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
+        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
+        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
+        mg.ensure_default_constraints()
+        mg.constrain_positive('noise_variance')
+        mg.constrain_fixed('white', 1e-5)
+        mg['rbf_len'] = rbf_len
+        mg['noise'] = noise
+        print mg
+        try:
+            mg.optimize(optimizer=optimizer, messages=messages)
+        except Exception:
+            print "Blew up"
+        Y_test_pred = mg.predict(X_test)
+        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
+        print pred_density
+        print mg
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('Lap gauss')
+
+        for stu_num, df in enumerate(degrees_freedoms):
+            #Student T
+            print "Student-T GP {}df".format(df)
+            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
+            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
+            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
+            mstu_t.ensure_default_constraints()
+            mstu_t.constrain_fixed('white', 1e-5)
+            mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+            mstu_t['rbf_len'] = rbf_len
+            mstu_t['t_noise'] = noise
+            print mstu_t
+            try:
+                mstu_t.optimize(optimizer=optimizer, messages=messages)
+            except Exception:
+                print "Blew up"
+            Y_test_pred = mstu_t.predict(X_test)
+            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
+            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
+            print pred_density
+            print mstu_t
+            if plot:
+                plt.figure()
+                plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+                plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+                plt.title('Stu t {}df'.format(df))
+
+    print "Average scores: {}".format(np.mean(score_folds, 1))
+    print "Average pred density: {}".format(np.mean(pred_density, 1))
+
+    #Plotting
+    stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
+    legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
+
+    #Plot boxplots for RMSE density
+    fig = plt.figure()
+    ax=fig.add_subplot(111)
+    plt.title('RMSE')
+    bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
+    plt.setp(bp['boxes'], color='black')
+    plt.setp(bp['whiskers'], color='black')
+    plt.setp(bp['fliers'], color='red', marker='+')
+    xtickNames = plt.setp(ax, xticklabels=legends)
+    plt.setp(xtickNames, rotation=45, fontsize=8)
+    ax.set_ylabel('RMSE')
+    ax.set_xlabel('Distribution')
+    #Make grid and put it below boxes
+    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+              alpha=0.5)
+    ax.set_axisbelow(True)
+
+    #Plot boxplots for predictive density
+    fig = plt.figure()
+    ax=fig.add_subplot(111)
+    plt.title('Predictive density')
+    bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
+    plt.setp(bp['boxes'], color='black')
+    plt.setp(bp['whiskers'], color='black')
+    plt.setp(bp['fliers'], color='red', marker='+')
+    xtickNames = plt.setp(ax, xticklabels=legends[1:])
+    plt.setp(xtickNames, rotation=45, fontsize=8)
+    ax.set_ylabel('Mean Log probability P(Y*|Y)')
+    ax.set_xlabel('Distribution')
+    #Make grid and put it below boxes
+    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+              alpha=0.5)
+    ax.set_axisbelow(True)
+    return mstu_t
+
+def precipitation_example():
+    import sklearn
+    from sklearn.cross_validation import KFold
+    data = datasets.boston_housing()
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    num_folds = 10
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    score_folds = np.zeros((4, num_folds))
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+    #for train, test in kf:
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -270,6 +270,50 @@ def toy_rbf_1d_50(max_iters=100):
    print(m)
    return m

+def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
+    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    x_len = 400
+    X = np.linspace(0, 10, x_len)[:, None]
+    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+
+    noise_model = GPy.likelihoods.poisson()
+    likelihood = GPy.likelihoods.EP(Y,noise_model)
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+
+    # optimize
+    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+    # plot
+    m.plot()
+    print(m)
+    return m
+
+def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
+    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    x_len = 30
+    X = np.linspace(0, 10, x_len)[:, None]
+    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+
+    noise_model = GPy.likelihoods.poisson()
+    likelihood = GPy.likelihoods.Laplace(Y,noise_model)
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+
+    # optimize
+    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+    # plot
+    m.plot()
+    # plot the real underlying rate function
+    pb.plot(X, np.exp(f_true), '--k', linewidth=2)
+    print(m)
+    return m
+
+
+
 def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
--- a/GPy/kern/parts/hetero.py
+++ b/GPy/kern/parts/hetero.py
@ -1,7 +1,6 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from IPython.core.debugger import Tracer; debug_here=Tracer()
 from kernpart import Kernpart
 import numpy as np
 from ...util.linalg import tdot
--- a/GPy/likelihoods/init.py
+++ b/GPy/likelihoods/init.py
@ -1,8 +1,7 @@
 from ep import EP
+from laplace import Laplace
 from ep_mixed_noise import EP_Mixed_Noise
 from gaussian import Gaussian
 from gaussian_mixed_noise import Gaussian_Mixed_Noise
 import noise_models
 from noise_model_constructors import *
-# TODO: from Laplace import Laplace
-
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@ -19,7 +19,6 @@ class EP(likelihood):
        self.num_data, self.output_dim = self.data.shape
        self.is_heteroscedastic = True
        self.num_params = 0
-        self._transf_data = self.noise_model._preprocess_values(data)

        #Initial values - Likelihood approximation parameters:
        #p(y|f) = t(f|tau_tilde,v_tilde)
@ -55,6 +54,22 @@ class EP(likelihood):
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
        return self.noise_model.predictive_values(mu,var)

+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
+
    def _get_params(self):
        #return np.zeros(0)
        return self.noise_model._get_params()
@ -134,7 +149,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
@ -233,7 +248,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@ -336,7 +351,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -90,11 +90,25 @@ class Gaussian(likelihood):
            _95pc = mean + 2.*np.sqrt(true_var)
        return mean, true_var, _5pc, _95pc

-    def fit_full(self):
+    def log_predictive_density(self, y_test, mu_star, var_star):
        """
-        No approximations needed
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+
+        .. Note:
+            Works as if each test point was provided individually, i.e. not full_cov
        """
-        pass
+        y_rescaled = (y_test - self._offset)/self._scale
+        return -0.5*np.log(2*np.pi) -0.5*np.log(var_star + self._variance) -0.5*(np.square(y_rescaled - mu_star))/(var_star + self._variance)

    def _gradients(self, partial):
        return np.sum(partial)
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@ -0,0 +1,390 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+#
+#Parts of this file were influenced by the Matlab GPML framework written by
+#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
+#
+#The GPML code is released under the FreeBSD License.
+#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
+#
+#The code and associated documentation is available from
+#http://gaussianprocess.org/gpml/code.
+
+import numpy as np
+import scipy as sp
+from likelihood import likelihood
+from ..util.linalg import mdot, jitchol, pddet, dpotrs
+from functools import partial as partial_func
+
+class Laplace(likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self, data, noise_model, extra_data=None):
+        """
+        Laplace Approximation
+
+        Find the moments \hat{f} and the hessian at this point
+        (using Newton-Raphson) of the unnormalised posterior
+
+        Compute the GP variables (i.e. generate some Y^{squiggle} and
+        z^{squiggle} which makes a gaussian the same as the laplace
+        approximation to the posterior, but normalised
+
+        Arguments
+        ---------
+
+        :param data: array of data the likelihood function is approximating
+        :type data: NxD
+        :param noise_model: likelihood function - subclass of noise_model
+        :type noise_model: noise_model
+        :param extra_data: additional data used by some likelihood functions,
+        """
+        self.data = data
+        self.noise_model = noise_model
+        self.extra_data = extra_data
+
+        #Inital values
+        self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
+        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
+
+        self.restart()
+        likelihood.__init__(self)
+
+    def restart(self):
+        """
+        Reset likelihood variables to their defaults
+        """
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N, 1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:, None]
+        self.Z = 0
+        self.YYT = None
+
+        self.old_Ki_f = None
+
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions\
+                    with an Laplace likelihood")
+        return self.noise_model.predictive_values(mu, var)
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
+
+    def _get_params(self):
+        return np.asarray(self.noise_model._get_params())
+
+    def _get_param_names(self):
+        return self.noise_model._get_param_names()
+
+    def _set_params(self, p):
+        return self.noise_model._set_params(p)
+
+    def _shared_gradients_components(self):
+        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
+        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
+        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
+        return dL_dfhat, I_KW_i
+
+    def _Kgradients(self):
+        """
+        Gradients with respect to prior kernel parameters dL_dK to be chained
+        with dK_dthetaK to give dL_dthetaK
+        :returns: dL_dK matrix
+        :rtype: Matrix (1 x num_kernel_params)
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #Explicit
+        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
+        #expl_b = self.Wi_K_i
+        #expl = 0.5*expl_a - 0.5*expl_b
+        #dL_dthetaK_exp = dK_dthetaK(expl, X)
+
+        #Implicit
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
+
+        #No longer required as we are computing these in the gp already
+        #otherwise we would take them away and add them back
+        #dL_dthetaK_imp = dK_dthetaK(impl, X)
+        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
+        #dL_dK = expl + impl
+
+        #No need to compute explicit as we are computing dZ_dK to account
+        #for the difference between the K gradients of a normal GP,
+        #and the K gradients including the implicit part
+        dL_dK = impl
+        return dL_dK
+
+    def _gradients(self, partial):
+        """
+        Gradients with respect to likelihood parameters (dL_dthetaL)
+
+        :param partial: Not needed by this likelihood
+        :type partial: lambda function
+        :rtype: array of derivatives (1 x num_likelihood_params)
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #len(dlik_dthetaL)
+        num_params = len(self._get_param_names())
+        # make space for one derivative for each likelihood parameter
+        dL_dthetaL = np.zeros(num_params)
+        for thetaL_i in range(num_params):
+            #Explicit
+            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
+                             #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
+                             + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
+                             )
+
+            #Implicit
+            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
+            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+
+        return dL_dthetaL
+
+    def _compute_GP_variables(self):
+        """
+        Generate data Y which would give the normal distribution identical
+        to the laplace approximation to the posterior, but normalised
+
+        GPy expects a likelihood to be gaussian, so need to caluclate
+        the data Y^{\tilde} that makes the posterior match that found
+        by a laplace approximation to a non-gaussian likelihood but with
+        a gaussian likelihood
+
+        Firstly,
+        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
+        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
+        we wish to find the hessian \Sigma^{\tilde}
+        that has the same curvature but using our new simulated data Y^{\tilde}
+        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
+        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
+        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
+
+        Secondly,
+        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
+        So we can suck up any differences between that and our log marginal likelihood approximation
+        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
+        which we want to optimize instead, by equating them and rearranging, the difference is added onto
+        the log p(y) that GPy optimizes by default
+
+        Thirdly,
+        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
+        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
+        gp.py code
+        """
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
+
+        Y_tilde = Wi*self.Ki_f + self.f_hat
+
+        self.Wi_K_i = self.W12BiW12
+        self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        self.lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+
+        Z_tilde = (+ self.lik
+                   - 0.5*self.ln_B_det
+                   + 0.5*self.ln_det_Wi_K
+                   - 0.5*self.f_Ki_f
+                   + 0.5*self.y_Wi_Ki_i_y
+                  )
+
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
+
+        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
+        self.dZ_dK = self._Kgradients()
+        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm, find K and expand hessian
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
+
+        :param K: Prior covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        """
+        self.K = K.copy()
+
+        #Find mode
+        self.f_hat = self.rasm_mode(self.K)
+
+        #Compute hessian and other variables at mode
+        self._compute_likelihood_variables()
+
+        #Compute fake variables replicating laplace approximation to posterior
+        self._compute_GP_variables()
+
+    def _compute_likelihood_variables(self):
+        """
+        Compute the variables required to compute gaussian Y variables
+        """
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
+
+        self.Ki_f = self.Ki_f
+        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
+        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
+
+    def _compute_B_statistics(self, K, W, a):
+        """
+        Rasmussen suggests the use of a numerically stable positive definite matrix B
+        Which has a positive diagonal element and can be easyily inverted
+
+        :param K: Prior Covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param W: Negative hessian at a point (diagonal matrix)
+        :type W: Vector of diagonal values of hessian (1xN)
+        :param a: Matrix to calculate W12BiW12a
+        :type a: Matrix NxN
+        :returns: (W12BiW12, ln_B_det)
+        """
+        if not self.noise_model.log_concave:
+            #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
+            W[W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                # To cause the posterior to become less certain than the prior and likelihood,
+                                # This is a property only held by non-log-concave likelihoods
+
+
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(self.N) + W_12*K*W_12.T
+        L = jitchol(B)
+
+        W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
+        ln_B_det = 2*np.sum(np.log(np.diag(L)))
+        return W12BiW12, ln_B_det
+
+    def rasm_mode(self, K, MAX_ITER=30):
+        """
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+        Influenced by GPML (BSD) code, all errors are our own
+
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
+        :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :type MAX_ITER: scalar
+        :returns: f_hat, mode on which to make laplace approxmiation
+        :rtype: NxD matrix
+        """
+        #old_Ki_f = np.zeros((self.N, 1))
+
+        #Start f's at zero originally
+        if self.old_Ki_f is None:
+            old_Ki_f = np.zeros((self.N, 1))
+            f = np.dot(K, old_Ki_f)
+        else:
+            #Start at the old best point
+            old_Ki_f = self.old_Ki_f.copy()
+            f = self.f_hat.copy()
+
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
+
+        difference = np.inf
+        epsilon = 1e-5
+        #step_size = 1
+        #rs = 0
+        i = 0
+
+        while difference > epsilon and i < MAX_ITER:
+            W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)
+
+            W_f = W*f
+            grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)
+
+            b = W_f + grad
+            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb
+            dKi_f = full_step_Ki_f - old_Ki_f
+
+            f_old = f.copy()
+            def inner_obj(step_size, old_Ki_f, dKi_f, K):
+                Ki_f = old_Ki_f + step_size*dKi_f
+                f = np.dot(K, Ki_f)
+                # This is nasty, need to set something within an optimization though
+                self.tmp_Ki_f = Ki_f.copy()
+                self.tmp_f = f.copy()
+                return -obj(Ki_f, f)
+
+            i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
+            #Find the stepsize that minimizes the objective function using a brent line search
+            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
+            #steps than get this exact then make a step, if B was bigger it might be the other way around though
+            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
+            f = self.tmp_f.copy()
+            Ki_f = self.tmp_Ki_f.copy()
+
+            #Optimize without linesearch
+            #f_old = f.copy()
+            #update_passed = False
+            #while not update_passed:
+                #Ki_f = old_Ki_f + step_size*dKi_f
+                #f = np.dot(K, Ki_f)
+
+                #old_obj = new_obj
+                #new_obj = obj(Ki_f, f)
+                #difference = new_obj - old_obj
+                ##print "difference: ",difference
+                #if difference < 0:
+                    ##print "Objective function rose", np.float(difference)
+                    ##If the objective function isn't rising, restart optimization
+                    #step_size *= 0.8
+                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    ##objective function isn't increasing, try reducing step size
+                    #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    #old_obj = new_obj
+                    #rs += 1
+                #else:
+                    #update_passed = True
+
+            #old_Ki_f = self.Ki_f.copy()
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            #difference = np.abs(np.sum(f - f_old))
+            difference = np.abs(np.sum(Ki_f - old_Ki_f))
+            old_Ki_f = Ki_f.copy()
+            i += 1
+
+        self.old_Ki_f = old_Ki_f.copy()
+        if difference > epsilon:
+            print "Not perfect f_hat fit difference: {}".format(difference)
+
+        self.Ki_f = Ki_f
+        return f
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -23,6 +23,7 @@ class likelihood(Parameterized):
    """
    def __init__(self):
        Parameterized.__init__(self)
+        self.dZ_dK = 0

    def _get_params(self):
        raise NotImplementedError
@ -33,11 +34,36 @@ class likelihood(Parameterized):
    def _set_params(self, x):
        raise NotImplementedError

-    def fit(self):
-        raise NotImplementedError
+    def fit_full(self, K):
+        """
+        No approximations needed by default
+        """
+        pass
+
+    def restart(self):
+        """
+        No need to restart if not an approximation
+        """
+        pass

    def _gradients(self, partial):
        raise NotImplementedError

    def predictive_values(self, mu, var):
        raise NotImplementedError
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        raise NotImplementedError
--- a/GPy/likelihoods/noise_model_constructors.py
+++ b/GPy/likelihoods/noise_model_constructors.py
@ -4,9 +4,9 @@
 import numpy as np
 import noise_models

-def binomial(gp_link=None):
+def bernoulli(gp_link=None):
    """
-    Construct a binomial likelihood
+    Construct a bernoulli likelihood

    :param gp_link: a GPy gp_link function
    """
@ -27,16 +27,17 @@ def binomial(gp_link=None):
        analytical_mean = False
        analytical_variance = False

-    return noise_models.binomial_noise.Binomial(gp_link,analytical_mean,analytical_variance)
+    return noise_models.bernoulli_noise.Bernoulli(gp_link,analytical_mean,analytical_variance)

 def exponential(gp_link=None):
+
    """
-    Construct a binomial likelihood
+    Construct a exponential likelihood

    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
+        gp_link = noise_models.gp_transformations.Log_ex_1()

    analytical_mean = False
    analytical_variance = False
@ -85,4 +86,36 @@ def gamma(gp_link=None,beta=1.):
    analytical_variance = False
    return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta)

+def gaussian(gp_link=None, variance=2, D=None, N=None):
+    """
+    Construct a Gaussian likelihood

+    :param gp_link: a GPy gp_link function
+    :param variance: variance
+    :type variance: scalar
+    :returns: Gaussian noise model:
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Identity()
+    analytical_mean = True
+    analytical_variance = True # ?
+    return noise_models.gaussian_noise.Gaussian(gp_link, analytical_mean,
+            analytical_variance, variance=variance, D=D, N=N)
+
+def student_t(gp_link=None, deg_free=5, sigma2=2):
+    """
+    Construct a Student t likelihood
+
+    :param gp_link: a GPy gp_link function
+    :param deg_free: degrees of freedom of student-t
+    :type deg_free: scalar
+    :param sigma2: variance
+    :type sigma2: scalar
+    :returns: Student-T noise model
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Identity()
+    analytical_mean = True
+    analytical_variance = True
+    return noise_models.student_t_noise.StudentT(gp_link, analytical_mean,
+            analytical_variance,deg_free, sigma2)
--- a/GPy/likelihoods/noise_models/init.py
+++ b/GPy/likelihoods/noise_models/init.py
@ -1,7 +1,8 @@
 import noise_distributions
-import binomial_noise
+import bernoulli_noise
 import exponential_noise
 import gaussian_noise
 import gamma_noise
 import poisson_noise
+import student_t_noise
 import gp_transformations
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@ -0,0 +1,216 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import stats,special
+import scipy as sp
+from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import gp_transformations
+from noise_distributions import NoiseDistribution
+
+class Bernoulli(NoiseDistribution):
+    """
+    Bernoulli likelihood
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+    .. Note::
+        Y is expected to take values in {-1,1}
+        Probit likelihood usually used
+    """
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
+        super(Bernoulli, self).__init__(gp_link,analytical_mean,analytical_variance)
+
+    def _preprocess_values(self,Y):
+        """
+        Check if the values of the observations correspond to the values
+        assumed by the likelihood function.
+
+        ..Note:: Binary classification algorithm works better with classes {-1,1}
+        """
+        Y_prep = Y.copy()
+        Y1 = Y[Y.flatten()==1].size
+        Y2 = Y[Y.flatten()==0].size
+        assert Y1 + Y2 == Y.size, 'Bernoulli likelihood is meant to be used only with outputs in {0,1}.'
+        Y_prep[Y.flatten() == 0] = -1
+        return Y_prep
+
+    def _moments_match_analytical(self,data_i,tau_i,v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param i: number of observation (int)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        if data_i == 1:
+            sign = 1.
+        elif data_i == 0:
+            sign = -1
+        else:
+            raise ValueError("bad value for Bernouilli observation (0,1)")
+        if isinstance(self.gp_link,gp_transformations.Probit):
+            z = sign*v_i/np.sqrt(tau_i**2 + tau_i)
+            Z_hat = std_norm_cdf(z)
+            phi = std_norm_pdf(z)
+            mu_hat = v_i/tau_i + sign*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
+            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
+
+        elif isinstance(self.gp_link,gp_transformations.Heaviside):
+            a = sign*v_i/np.sqrt(tau_i)
+            Z_hat = std_norm_cdf(a)
+            N = std_norm_pdf(a)
+            mu_hat = v_i/tau_i + sign*N/Z_hat/np.sqrt(tau_i)
+            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
+            if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
+                stop
+        else:
+            raise ValueError("Exact moment matching not available for link {}".format(self.gp_link.gp_transformations.__name__))
+
+        return Z_hat, mu_hat, sigma2_hat
+
+    def _predictive_mean_analytical(self,mu,sigma):
+        if isinstance(self.gp_link,gp_transformations.Probit):
+            return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
+        elif isinstance(self.gp_link,gp_transformations.Heaviside):
+            return stats.norm.cdf(mu/sigma)
+        else:
+            raise NotImplementedError
+
+    def _predictive_variance_analytical(self,mu,sigma, pred_mean):
+        if isinstance(self.gp_link,gp_transformations.Heaviside):
+            return 0.
+        else:
+            raise NotImplementedError
+
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        .. Note:
+            Each y_i must be in {0,1}
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        objective = (link_f**y) * ((1.-link_f)**(1.-y))
+        return np.exp(np.sum(np.log(objective)))
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-f_{i})
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: log likelihood evaluated at points link(f)
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        #objective = y*np.log(link_f) + (1.-y)*np.log(link_f)
+        objective = np.where(y==1, np.log(link_f), np.log(1-link_f))
+        return np.sum(objective)
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the pdf at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: gradient of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = (y/link_f) - (1.-y)/(1-link_f)
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
+        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
+
+
+        .. math::
+            \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2)
+        return d2logpdf_dlink2
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i}}{(1-\\lambda(f))^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: third derivative of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3))
+        return d3logpdf_dlink3
+
+    def _mean(self,gp):
+        """
+        Mass (or density) function
+        """
+        return self.gp_link.transf(gp)
+
+    def _variance(self,gp):
+        """
+        Mass (or density) function
+        """
+        p = self.gp_link.transf(gp)
+        return p*(1.-p)
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        ns = np.ones_like(gp, dtype=int)
+        Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/binomial_noise.py
+++ b/GPy/likelihoods/noise_models/binomial_noise.py
@ -1,132 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Binomial(NoiseDistribution):
-    """
-    Probit likelihood
-    Y is expected to take values in {-1,1}
-    -----
-    $$
-    L(x) = \\Phi (Y_i*f_i)
-    $$
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Binomial, self).__init__(gp_link,analytical_mean,analytical_variance)
-
-    def _preprocess_values(self,Y):
-        """
-        Check if the values of the observations correspond to the values
-        assumed by the likelihood function.
-
-        ..Note:: Binary classification algorithm works better with classes {-1,1}
-        """
-        Y_prep = Y.copy()
-        Y1 = Y[Y.flatten()==1].size
-        Y2 = Y[Y.flatten()==0].size
-        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
-        Y_prep[Y.flatten() == 0] = -1
-        return Y_prep
-
-    def _moments_match_analytical(self,data_i,tau_i,v_i):
-        """
-        Moments match of the marginal approximation in EP algorithm
-
-        :param i: number of observation (int)
-        :param tau_i: precision of the cavity distribution (float)
-        :param v_i: mean/variance of the cavity distribution (float)
-        """
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
-            Z_hat = std_norm_cdf(z)
-            phi = std_norm_pdf(z)
-            mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
-            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
-
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            a = data_i*v_i/np.sqrt(tau_i)
-            Z_hat = std_norm_cdf(a)
-            N = std_norm_pdf(a)
-            mu_hat = v_i/tau_i + data_i*N/Z_hat/np.sqrt(tau_i)
-            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
-            if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
-                stop
-
-        return Z_hat, mu_hat, sigma2_hat
-
-    def _predictive_mean_analytical(self,mu,sigma):
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            return stats.norm.cdf(mu/sigma)
-        else:
-            raise NotImplementedError
-
-    def _predictive_variance_analytical(self,mu,sigma, pred_mean):
-        if isinstance(self.gp_link,gp_transformations.Heaviside):
-            return 0.
-        else:
-            raise NotImplementedError
-
-    def _mass(self,gp,obs):
-        #NOTE obs must be in {0,1}
-        p = self.gp_link.transf(gp)
-        return p**obs * (1.-p)**(1.-obs)
-
-    def _nlog_mass(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return obs*np.log(p) + (1.-obs)*np.log(1-p)
-
-    def _dnlog_mass_dgp(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        dp = self.gp_link.dtransf_df(gp)
-        return obs/p * dp - (1.-obs)/(1.-p) * dp
-
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return (obs/p + (1.-obs)/(1.-p))*self.gp_link.d2transf_df2(gp) + ((1.-obs)/(1.-p)**2-obs/p**2)*self.gp_link.dtransf_df(gp)
-
-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        p = self.gp_link.transf(gp)
-        return p*(1.-p)
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)*(1. - 2.*self.gp_link.transf(gp))
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
-
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param size: number of samples to compute
-        :param gp: latent variable
-        """
-        orig_shape = gp.shape
-        gp = gp.flatten()
-        Ysim = np.array([np.random.binomial(1,self.gp_link.transf(gpj),size=1) for gpj in gp])
-        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@ -24,24 +24,113 @@ class Exponential(NoiseDistribution):
    def _preprocess_values(self,Y):
        return Y

-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
        """
-        Mass (or density) function
-        """
-        return np.exp(-obs/self.gp_link.transf(gp))/self.gp_link.transf(gp)
+        Likelihood function given link(f)

-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        return obs/self.gp_link.transf(gp) + np.log(self.gp_link.transf(gp))
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})\\exp (-y\\lambda(f_{i}))

-    def _dnlog_mass_dgp(self,gp,obs):
-        return ( 1./self.gp_link.transf(gp) - obs/self.gp_link.transf(gp)**2) * self.gp_link.dtransf_df(gp)
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        log_objective = link_f*np.exp(-y*link_f)
+        return np.exp(np.sum(np.log(log_objective)))
+        #return np.exp(np.sum(-y/link_f - np.log(link_f) ))

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        fgp = self.gp_link.transf(gp)
-        return (2*obs/fgp**3 - 1./fgp**2) * self.gp_link.dtransf_df(gp)**2 + ( 1./fgp - obs/fgp**2) * self.gp_link.d2transf_df2(gp)
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\lambda(f_{i}) - y_{i}\\lambda(f_{i})
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        log_objective = np.log(link_f) - y*link_f
+        #logpdf_link = np.sum(-np.log(link_f) - y/link_f)
+        return np.sum(log_objective)
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\lambda(f)} - y_{i}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = 1./link_f - y
+        #grad = y/(link_f**2) - 1./link_f
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\frac{1}{\\lambda(f_{i})^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -1./(link_f**2)
+        #hess = -2*y/(link_f**3) + 1/(link_f**2)
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2}{\\lambda(f_{i})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = 2./(link_f**3)
+        #d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
+        return d3lik_dlink3

    def _mean(self,gp):
        """
@ -49,20 +138,19 @@ class Exponential(NoiseDistribution):
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)**2

-    def _dvariance_dgp(self,gp):
-        return 2*self.gp_link.transf(gp)*self.gp_link.dtransf_df(gp)
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.

-    def _d2variance_dgp2(self,gp):
-        return 2 * (self.gp_link.dtransf_df(gp)**2 + self.gp_link.transf(gp)*self.gp_link.d2transf_df2(gp))
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.random.exponential(1.0/self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@ -12,11 +12,11 @@ from noise_distributions import NoiseDistribution
 class Gamma(NoiseDistribution):
    """
    Gamma likelihood
-    Y is expected to take values in {0,1,2,...}
-    -----
-    $$
-    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
-    $$
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
+        \\alpha_{i} = \\beta y_{i}
+
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
        self.beta = beta
@ -25,26 +25,122 @@ class Gamma(NoiseDistribution):
    def _preprocess_values(self,Y):
        return Y

-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
        """
-        Mass (or density) function
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
-        alpha = self.gp_link.transf(gp)*self.beta
-        return obs**(alpha - 1.) * np.exp(-self.beta*obs) * self.beta**alpha / special.gamma(alpha)
+        alpha = link_f*self.beta
+        objective = (y**(alpha - 1.) * np.exp(-self.beta*y) * self.beta**alpha)/ special.gamma(alpha)
+        return np.exp(np.sum(np.log(objective)))

-    def _nlog_mass(self,gp,obs):
+    def logpdf_link(self, link_f, y, extra_data=None):
        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
        """
-        alpha = self.gp_link.transf(gp)*self.beta
-        return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        #alpha = self.gp_link.transf(gp)*self.beta
+        #return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
+        alpha = link_f*self.beta
+        log_objective = alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
+        return np.sum(log_objective)

-    def _dnlog_mass_dgp(self,gp,obs):
-        return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\beta (\\log \\beta y_{i}) - \\Psi(\\alpha_{i})\\beta\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = self.beta*np.log(self.beta*y) - special.psi(self.beta*link_f)*self.beta
+        #old
+        #return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -special.polygamma(1, self.beta*link_f)*(self.beta**2)
+        #old
+        #return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = -special.polygamma(2, self.beta*link_f)*(self.beta**3)
+        return d3lik_dlink3

    def _mean(self,gp):
        """
@ -52,20 +148,8 @@ class Gamma(NoiseDistribution):
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)/self.beta
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)/self.beta
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)/self.beta
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@ -12,11 +12,17 @@ class Gaussian(NoiseDistribution):
    """
    Gaussian likelihood

-    :param mean: mean value of the Gaussian distribution
-    :param variance: mean value of the Gaussian distribution
+    .. math::
+        \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+    :param variance: variance value of the Gaussian distribution
+    :param N: Number of data points
+    :type N: int
    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1.):
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1., D=None, N=None):
        self.variance = variance
+        self.N = N
+        self._set_params(np.asarray(variance))
        super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)

    def _get_params(self):
@ -25,8 +31,13 @@ class Gaussian(NoiseDistribution):
    def _get_param_names(self):
        return ['noise_model_variance']

-    def _set_params(self,p):
-        self.variance = p
+    def _set_params(self, p):
+        self.variance = float(p)
+        self.I = np.eye(self.N)
+        self.covariance_matrix = self.I * self.variance
+        self.Ki = self.I*(1.0 / self.variance)
+        #self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
+        self.ln_det_K = self.N*np.log(self.variance)

    def _gradients(self,partial):
        return np.zeros(1)
@ -57,42 +68,231 @@ class Gaussian(NoiseDistribution):
        new_sigma2 = self.predictive_variance(mu,sigma)
        return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)

-    def _predictive_variance_analytical(self,mu,sigma):
+    def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
        return 1./(1./self.variance + 1./sigma**2)

-    def _mass(self,gp,obs):
-        #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
-        return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance))
+    def _mass(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use pdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+    def _nlog_mass(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")

-    def _nlog_mass(self,gp,obs):
-        return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance))
+    def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use dlogpdf_df in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")

-    def _dnlog_mass_dgp(self,gp,obs):
-        return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
+    def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use d2logpdf_df2 in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        #Assumes no covariance, exp, sum, log for numerical stability
+        return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: log likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the pdf at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: gradient of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s2_i = (1.0/self.variance)
+        grad = s2_i*y - s2_i*link_f
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link_f, w.r.t link_f.
+        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
+
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        hess = -(1.0/self.variance)*np.ones((self.N, 1))
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = 0
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: third derivative of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None]
+        return d3logpdf_dlink3
+
+    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = -\\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
+        :rtype: float
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        e = y - link_f
+        s_4 = 1.0/(self.variance**2)
+        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.sum(np.square(e))
+        return np.sum(dlik_dsigma) # Sure about this sum?
+
+    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
+        """
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)}) = \\frac{1}{\\sigma^{4}}(-y_{i} + \\lambda(f_{i}))
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s_4 = 1.0/(self.variance**2)
+        dlik_grad_dsigma = -s_4*y + s_4*link_f
+        return dlik_grad_dsigma
+
+    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)}) = \\frac{1}{\\sigma^{4}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s_4 = 1.0/(self.variance**2)
+        d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
+        return d2logpdf_dlink2_dvar
+
+    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
+        return np.asarray([[dlogpdf_dvar]])
+
+    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
+        return d2logpdf_dlink2_dvar

    def _mean(self,gp):
        """
-        Mass (or density) function
+        Expected value of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            E_{p(y|f)}[y]
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
-        Mass (or density) function
+        Variance of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            Var_{p(y|f)}[y]
        """
        return self.variance

-    def _dvariance_dgp(self,gp):
-        return 0
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.

-    def _d2variance_dgp2(self,gp):
-        return 0
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@ -24,19 +24,25 @@ class GPTransformation(object):
        """
        Gaussian process tranformation function, latent space -> output space
        """
-        pass
+        raise NotImplementedError

    def dtransf_df(self,f):
        """
        derivative of transf(f) w.r.t. f
        """
-        pass
+        raise NotImplementedError

    def d2transf_df2(self,f):
        """
        second derivative of transf(f) w.r.t. f
        """
-        pass
+        raise NotImplementedError
+
+    def d3transf_df3(self,f):
+        """
+        third derivative of transf(f) w.r.t. f
+        """
+        raise NotImplementedError

 class Identity(GPTransformation):
    """
@ -49,10 +55,13 @@ class Identity(GPTransformation):
        return f

    def dtransf_df(self,f):
-        return 1.
+        return np.ones_like(f)

    def d2transf_df2(self,f):
-        return 0
+        return np.zeros_like(f)
+
+    def d3transf_df3(self,f):
+        return np.zeros_like(f)


 class Probit(GPTransformation):
@ -71,6 +80,10 @@ class Probit(GPTransformation):
    def d2transf_df2(self,f):
        return -f * std_norm_pdf(f)

+    def d3transf_df3(self,f):
+        f2 = f**2
+        return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
+
 class Log(GPTransformation):
    """
    .. math::
@ -87,6 +100,9 @@ class Log(GPTransformation):
    def d2transf_df2(self,f):
        return np.exp(f)

+    def d3transf_df3(self,f):
+        return np.exp(f)
+
 class Log_ex_1(GPTransformation):
    """
    .. math::
@ -104,15 +120,23 @@ class Log_ex_1(GPTransformation):
        aux = np.exp(f)/(1.+np.exp(f))
        return aux*(1.-aux)

+    def d3transf_df3(self,f):
+        aux = np.exp(f)/(1.+np.exp(f))
+        daux_df = aux*(1.-aux)
+        return daux_df - (2.*aux*daux_df)
+
 class Reciprocal(GPTransformation):
    def transf(self,f):
        return 1./f

    def dtransf_df(self,f):
-        return -1./f**2
+        return -1./(f**2)

    def d2transf_df2(self,f):
-        return 2./f**3
+        return 2./(f**3)
+
+    def d3transf_df3(self,f):
+        return -6./(f**4)

 class Heaviside(GPTransformation):
    """
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@ -9,15 +9,12 @@ import pylab as pb
 from GPy.util.plot import gpplot
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
-
+from GPy.util.misc import chain_1, chain_2, chain_3
+from scipy.integrate import quad

 class NoiseDistribution(object):
    """
-    Likelihood class for doing Expectation propagation
-
-    :param Y: observed output (Nx1 numpy.darray)
-
-    .. note:: Y values allowed depend on the LikelihoodFunction used
+    Likelihood class for doing approximations
    """
    def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
        assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
@ -35,6 +32,8 @@ class NoiseDistribution(object):
        else:
            self.predictive_variance = self._predictive_variance_numerical

+        self.log_concave = True
+
    def _get_params(self):
        return np.zeros(0)

@ -57,369 +56,379 @@ class NoiseDistribution(object):
        """
        return Y

-    def _product(self,gp,obs,mu,sigma):
-        """
-        Product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)
-
-    def _nlog_product_scaled(self,gp,obs,mu,sigma):
-        """
-        Negative log-product between the cavity distribution and a likelihood factor.
-
-        .. note:: The constant term in the Gaussian distribution is ignored.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)
-
-    def _dnlog_product_dgp(self,gp,obs,mu,sigma):
-        """
-        Derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)
-
-    def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
-        """
-        Second derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)
-
-    def _product_mode(self,obs,mu,sigma):
-        """
-        Newton's CG method to find the mode in _product (cavity x likelihood factor).
-
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False)
-
    def _moments_match_analytical(self,obs,tau,v):
        """
        If available, this function computes the moments analytically.
        """
-        pass
+        raise NotImplementedError
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        assert y_test.shape==mu_star.shape
+        assert y_test.shape==var_star.shape
+        assert y_test.shape[1] == 1
+        def integral_generator(y, m, v):
+            """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
+            def f(f_star):
+                return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
+            return f
+
+        scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
+        scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
+        p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
+        return np.log(p_ystar)

    def _moments_match_numerical(self,obs,tau,v):
        """
-        Lapace approximation to calculate the moments.
+        Calculation of moments using quadrature

        :param obs: observed output
        :param tau: cavity distribution 1st natural parameter (precision)
        :param v: cavity distribution 2nd natural paramenter (mu*precision)
-
        """
+        #Compute first integral for zeroth moment
        mu = v/tau
-        mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
-        sigma2_hat = 1./(tau + self._d2nlog_mass_dgp2(mu_hat,obs))
-        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self._mass(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
-        return Z_hat,mu_hat,sigma2_hat
+        def int_1(f):
+            return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        z, accuracy = quad(int_1, -np.inf, np.inf)
+        z /= np.sqrt(2*np.pi/tau)

-    def _nlog_conditional_mean_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's mean given the l.v.
+        #Compute second integral for first moment
+        def int_2(f):
+            return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        mean, accuracy = quad(int_2, -np.inf, np.inf)
+        mean /= np.sqrt(2*np.pi/tau)
+        mean /= z

-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
+        #Compute integral for variance
+        def int_3(f):
+            return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        Ef2, accuracy = quad(int_3, -np.inf, np.inf)
+        Ef2 /= np.sqrt(2*np.pi/tau)
+        Ef2 /= z
+        variance = Ef2 - mean**2

-        .. note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
-
-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))
-
-    def _dnlog_conditional_mean_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_conditional_mean_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_conditional_mean_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_conditional_mean_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2
-
-    def _nlog_exp_conditional_variance_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's variance given the l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E(V(Y_star|f_star))
-
-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp))
-
-    def _dnlog_exp_conditional_variance_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp)
-
-    def _d2nlog_exp_conditional_variance_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2
-
-    def _nlog_exp_conditional_mean_sq_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's mean squared given the l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E( E(Y_star|f_star)**2 )
-
-        """
-        return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))
-
-    def _dnlog_exp_conditional_mean_sq_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_exp_conditional_mean_sq_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )
+        return z, mean, variance

    def _predictive_mean_analytical(self,mu,sigma):
        """
+        Predictive mean
+        .. math::
+            E(Y^{*}|Y) = E( E(Y^{*}|f^{*}, Y) )
+
        If available, this function computes the predictive mean analytically.
        """
-        pass
+        raise NotImplementedError

    def _predictive_variance_analytical(self,mu,sigma):
        """
+        Predictive variance
+        .. math::
+            V(Y^{*}| Y) = E( V(Y^{*}|f^{*}, Y) ) + V( E(Y^{*}|f^{*}, Y) )
+
        If available, this function computes the predictive variance analytically.
        """
-        pass
+        raise NotImplementedError

    def _predictive_mean_numerical(self,mu,sigma):
        """
-        Laplace approximation to the predictive mean: E(Y_star) = E( E(Y_star|f_star) )
+        Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )

-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior

        """
-        maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False)
-        mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma)
-        """
+        #FIXME: Quadrature does not work!
+        raise NotImplementedError
+        sigma2 = sigma**2
+        #Compute first moment
+        def int_mean(f):
+            return self._mean(f)*np.exp(-(0.5/sigma2)*np.square(f - mu))
+        scaled_mean, accuracy = quad(int_mean, -np.inf, np.inf)
+        mean = scaled_mean / np.sqrt(2*np.pi*(sigma2))

-        pb.figure()
-        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
-        f = np.array([np.exp(-self._nlog_conditional_mean_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
-        pb.plot(x,f,'b-')
-        sigma2 = 1./self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma)
-        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
-        k = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
-        pb.plot(x,f2*mean,'r-')
-        pb.vlines(maximum,0,f.max())
-        """
        return mean

-    def _predictive_mean_sq(self,mu,sigma):
-        """
-        Laplace approximation to the predictive mean squared: E(Y_star**2) = E( E(Y_star|f_star)**2 )
-
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False)
-        mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma)
-        return mean_squared
-
    def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None):
        """
        Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )

-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior
        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.

        """
+        sigma2 = sigma**2
+        normalizer = np.sqrt(2*np.pi*sigma2)
+
        # E( V(Y_star|f_star) )
-        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False)
-        exp_var = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma))*sigma)
+        #Compute expected value of variance
+        def int_var(f):
+            return self._variance(f)*np.exp(-(0.5/sigma2)*np.square(f - mu))
+        scaled_exp_variance, accuracy = quad(int_var, -np.inf, np.inf)
+        exp_var = scaled_exp_variance / normalizer

-        """
-        pb.figure()
-        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
-        f = np.array([np.exp(-self._nlog_exp_conditional_variance_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
-        pb.plot(x,f,'b-')
-        sigma2 = 1./self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma)
-        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
-        k = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
-        pb.plot(x,f2*exp_var,'r--')
-        pb.vlines(maximum,0,f.max())
-        """
-
-        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star)**2 )
-        exp_exp2 = self._predictive_mean_sq(mu,sigma)
+        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
        if predictive_mean is None:
            predictive_mean = self.predictive_mean(mu,sigma)
+
+        predictive_mean_sq = predictive_mean**2
+        def int_pred_mean_sq(f):
+            return predictive_mean_sq*np.exp(-(0.5/(sigma2))*np.square(f - mu))
+
+        scaled_exp_exp2, accuracy = quad(int_pred_mean_sq, -np.inf, np.inf)
+        exp_exp2 = scaled_exp_exp2 / normalizer
+
        var_exp = exp_exp2 - predictive_mean**2
+        # V(Y_star | f_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
        return exp_var + var_exp

-    def _predictive_percentiles(self,p,mu,sigma):
+    def pdf_link(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def pdf(self, f, y, extra_data=None):
        """
-        Percentiles of the predictive distribution
+        Evaluates the link function link(f) then computes the likelihood (pdf) using it

-        :parm p: lower tail probability
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
+        .. math:
+            p(y|\\lambda(f))

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
        """
-        qf = stats.norm.ppf(p,mu,sigma)
-        return self.gp_link.transf(qf)
+        link_f = self.gp_link.transf(f)
+        return self.pdf_link(link_f, y, extra_data=extra_data)

-    def _nlog_joint_predictive_scaled(self,x,mu,sigma):
+    def logpdf(self, f, y, extra_data=None):
        """
-        Negative logarithm of the joint predictive distribution (latent variable and output).
+        Evaluates the link function link(f) then computes the log likelihood (log pdf) using it

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
+        .. math:
+            \\log p(y|\\lambda(f))

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: log likelihood evaluated for this point
+        :rtype: float
        """
-        return self._nlog_product_scaled(x[0],x[1],mu,sigma)
+        link_f = self.gp_link.transf(f)
+        return self.logpdf_link(link_f, y, extra_data=extra_data)

-    def _gradient_nlog_joint_predictive(self,x,mu,sigma):
+    def dlogpdf_df(self, f, y, extra_data=None):
        """
-        Gradient of _nlog_joint_predictive_scaled.
+        Evaluates the link function link(f) then computes the derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        .. note: Only available when the output is continuous
+        .. math::
+            \\frac{d\\log p(y|\\lambda(f))}{df} = \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d\\lambda(f)}{df}

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of log likelihood evaluated for this point
+        :rtype: 1xN array
        """
-        assert not self.discrete, "Gradient not available for discrete outputs."
-        return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0])))
+        link_f = self.gp_link.transf(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        return chain_1(dlogpdf_dlink, dlink_df)

-    def _hessian_nlog_joint_predictive(self,x,mu,sigma):
+    def d2logpdf_df2(self, f, y, extra_data=None):
        """
-        Hessian of _nlog_joint_predictive_scaled.
+        Evaluates the link function link(f) then computes the second derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        .. note: Only available when the output is continuous
+        .. math::
+            \\frac{d^{2}\\log p(y|\\lambda(f))}{df^{2}} = \\frac{d^{2}\\log p(y|\\lambda(f))}{d^{2}\\lambda(f)}\\left(\\frac{d\\lambda(f)}{df}\\right)^{2} + \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d^{2}\\lambda(f)}{df^{2}}

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: second derivative of log likelihood evaluated for this point (diagonal only)
+        :rtype: 1xN array
        """
-        assert not self.discrete, "Hessian not available for discrete outputs."
-        cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1])
-        return np.array((self._d2nlog_product_dgp2(gp=x[0],obs=x[1],mu=mu,sigma=sigma),cross_derivative,cross_derivative,self._d2nlog_mass_dobs2(obs=x[1],gp=x[0]))).reshape(2,2)
+        link_f = self.gp_link.transf(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)

-    def _joint_predictive_mode(self,mu,sigma):
+    def d3logpdf_df3(self, f, y, extra_data=None):
        """
-        Negative logarithm of the joint predictive distribution (latent variable and output).
+        Evaluates the link function link(f) then computes the third derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
+        .. math::
+            \\frac{d^{3}\\log p(y|\\lambda(f))}{df^{3}} = \\frac{d^{3}\\log p(y|\\lambda(f)}{d\\lambda(f)^{3}}\\left(\\frac{d\\lambda(f)}{df}\\right)^{3} + 3\\frac{d^{2}\\log p(y|\\lambda(f)}{d\\lambda(f)^{2}}\\frac{d\\lambda(f)}{df}\\frac{d^{2}\\lambda(f)}{df^{2}} + \\frac{d\\log p(y|\\lambda(f)}{d\\lambda(f)}\\frac{d^{3}\\lambda(f)}{df^{3}}

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: third derivative of log likelihood evaluated for this point
+        :rtype: float
        """
-        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)
+        link_f = self.gp_link.transf(f)
+        d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        d3link_df3 = self.gp_link.d3transf_df3(f)
+        return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)

-    def predictive_values(self,mu,var):
+    def dlogpdf_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([1, 0])
+
+    def dlogpdf_df_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+            return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([f.shape[0], 0])
+
+    def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+            return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([f.shape[0], 0])
+
+    def _laplace_gradients(self, f, y, extra_data=None):
+        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data)
+        dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, extra_data=extra_data)
+        d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, extra_data=extra_data)
+
+        #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
+        # ensure we have gradients for every parameter we want to optimize
+        assert dlogpdf_dtheta.shape[1] == len(self._get_param_names())
+        assert dlogpdf_df_dtheta.shape[1] == len(self._get_param_names())
+        assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
+        return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
+
+    def predictive_values(self, mu, var, full_cov=False, num_samples=30000,
+                          sampling=False):
        """
        Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.

-        :param mu: mean of the latent variable
-        :param var: variance of the latent variable
+        :param mu: mean of the latent variable, f, of posterior
+        :param var: variance of the latent variable, f, of posterior
+        :param full_cov: whether to use the full covariance or just the diagonal
+        :type full_cov: Boolean
+        :param num_samples: number of samples to use in computing quantiles and
+                            possibly mean variance
+        :type num_samples: integer
+        :param sampling: Whether to use samples for mean and variances anyway
+        :type sampling: Boolean

        """
-        if isinstance(mu,float) or isinstance(mu,int):
-            mu = [mu]
-            var = [var]
-        pred_mean = []
-        pred_var = []
-        q1 = []
-        q3 = []
-        for m,s in zip(mu,np.sqrt(var)):
-            pred_mean.append(self.predictive_mean(m,s))
-            pred_var.append(self.predictive_variance(m,s,pred_mean[-1]))
-            q1.append(self._predictive_percentiles(.025,m,s))
-            q3.append(self._predictive_percentiles(.975,m,s))
+
+        #Get gp_samples f* using posterior mean and variance
+        if not full_cov:
+            gp_samples = np.random.multivariate_normal(mu.flatten(), np.diag(var.flatten()),
+                                                        size=num_samples).T
+        else:
+            gp_samples = np.random.multivariate_normal(mu.flatten(), var,
+                                                           size=num_samples).T
+
+        #Push gp samples (f*) through likelihood to give p(y*|f*)
+        samples = self.samples(gp_samples)
+        axis=-1
+
+        if self.analytical_mean and not sampling:
+            pred_mean = self.predictive_mean(mu, np.sqrt(var))
+        else:
+            pred_mean = np.mean(samples, axis=axis)
+
+        if self.analytical_variance and not sampling:
+            pred_var = self.predictive_variance(mu, np.sqrt(var), pred_mean)
+        else:
+            pred_var = np.var(samples, axis=axis)
+
+        #Calculate quantiles from samples
+        q1 = np.percentile(samples, 2.5, axis=axis)
+        q3 = np.percentile(samples, 97.5, axis=axis)
+        print "WARNING: Using sampling to calculate predictive quantiles"
+
        pred_mean = np.vstack(pred_mean)
        pred_var = np.vstack(pred_var)
        q1 = np.vstack(q1)
        q3 = np.vstack(q3)
        return pred_mean, pred_var, q1, q3

-
    def samples(self, gp):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

        :param gp: latent variable
        """
-        pass
-
+        raise NotImplementedError
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@ -1,7 +1,7 @@
+from __future__ import division
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
 import numpy as np
 from scipy import stats,special
 import scipy as sp
@ -14,9 +14,10 @@ class Poisson(NoiseDistribution):
    Poisson likelihood

    .. math::
-        L(x) = \\exp(\\lambda) * \\frac{\\lambda^Y_i}{Y_i!}
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}

-    ..Note: Y is expected to take values in {0,1,2,...}
+    .. Note::
+        Y is expected to take values in {0,1,2,...}
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
@ -24,25 +25,108 @@ class Poisson(NoiseDistribution):
    def _preprocess_values(self,Y): #TODO
        return Y

-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
        """
-        Mass (or density) function
-        """
-        return stats.poisson.pmf(obs,self.gp_link.transf(gp))
+        Likelihood function given link(f)

-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        return self.gp_link.transf(gp) - obs * np.log(self.gp_link.transf(gp)) + np.log(special.gamma(obs+1))
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}

-    def _dnlog_mass_dgp(self,gp,obs):
-        return self.gp_link.dtransf_df(gp) * (1. - obs/self.gp_link.transf(gp))
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return np.prod(stats.poisson.pmf(y,link_f))

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        d2_df = self.gp_link.d2transf_df2(gp)
-        transf = self.gp_link.transf(gp)
-        return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = -\\lambda(f_{i}) + y_{i}\\log \\lambda(f_{i}) - \\log y_{i}!
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - 1
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return y/link_f - 1
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{-y_{i}}{\\lambda(f_{i})^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -y/(link_f**2)
+        return hess
+        #d2_df = self.gp_link.d2transf_df2(gp)
+        #transf = self.gp_link.transf(gp)
+        #return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f_{i})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = 2*y/(link_f)**3
+        return d3lik_dlink3

    def _mean(self,gp):
        """
@ -50,20 +134,19 @@ class Poisson(NoiseDistribution):
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)

-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.

-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.random.poisson(self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@ -0,0 +1,277 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from scipy import stats, special
+import scipy as sp
+import gp_transformations
+from noise_distributions import NoiseDistribution
+from scipy import stats, integrate
+from scipy.special import gammaln, gamma
+
+class StudentT(NoiseDistribution):
+    """
+    Student T likelihood
+
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+
+    """
+    def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2):
+        self.v = deg_free
+        self.sigma2 = sigma2
+
+        self._set_params(np.asarray(sigma2))
+        super(StudentT, self).__init__(gp_link,analytical_mean,analytical_variance)
+        self.log_concave = False
+
+    def _get_params(self):
+        return np.asarray(self.sigma2)
+
+    def _get_param_names(self):
+        return ["t_noise_std2"]
+
+    def _set_params(self, x):
+        self.sigma2 = float(x)
+
+    @property
+    def variance(self, extra_data=None):
+        return (self.v / float(self.v - 2)) * self.sigma2
+
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        #Careful gamma(big_number) is infinity!
+        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
+                     / (np.sqrt(self.v * np.pi * self.sigma2)))
+                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
+                    )
+        return np.prod(objective)
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        objective = (+ gammaln((self.v + 1) * 0.5)
+                     - gammaln(self.v * 0.5)
+                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                     - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
+                    )
+        return np.sum(objective)
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
+                    )
+        return d3lik_dlink3
+
+    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
+        return np.sum(dlogpdf_dvar)
+
+    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
+        """
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}
+
+        :param link_f: latent variables link_f
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+                              / ((self.sigma2*self.v + (e**2))**3)
+                           )
+        return d2logpdf_dlink2_dvar
+
+    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
+        return np.asarray([[dlogpdf_dvar]])
+
+    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
+        return d2logpdf_dlink2_dvar
+
+    def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
+        """
+        Compute predictive variance of student_t*normal p(y*|f*)p(f*)
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+
+        #FIXME: Not correct
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
+        #Which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = 1/(1/sigma**2 + 1/self.variance)
+
+        return true_var
+
+    def _predictive_mean_analytical(self, mu, sigma):
+        """
+        Compute mean of the prediction
+        """
+        #FIXME: Not correct
+        return mu
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        #FIXME: Very slow as we are computing a new random variable per input!
+        #Can't get it to sample all at the same time
+        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        dfs = np.ones_like(gp)*self.v
+        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
+                                        scale=scales)
+        return student_t_samples.reshape(orig_shape)
--- a/GPy/models/fitc_classification.py
+++ b/GPy/models/fitc_classification.py
@ -16,7 +16,7 @@ class FITCClassification(FITC):

    :param X: input observations
    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with probit link function
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -31,7 +31,7 @@ class FITCClassification(FITC):
            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)

        if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@ -15,7 +15,7 @@ class GPClassification(GP):

    :param X: input observations
    :param Y: observed values, can be None if likelihood is not None
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with Probit link_function
    :param kernel: a GPy kernel, defaults to rbf
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -31,7 +31,7 @@ class GPClassification(GP):
            kernel = kern.rbf(X.shape[1])

        if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@ -25,11 +25,12 @@ class GPRegression(GP):

    """

-    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False):
+    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, likelihood=None):
        if kernel is None:
            kernel = kern.rbf(X.shape[1])

-        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
+        if likelihood is None:
+            likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)

        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
        self.ensure_default_constraints()
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@ -26,40 +26,40 @@ class GradientChecker(Model):
        """
        :param f: Function to check gradient for
        :param df: Gradient of function to check
-        :param x0: 
+        :param x0:
            Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
-            Can be a list of arrays, if takes a list of arrays. This list will be passed 
+            Can be a list of arrays, if takes a list of arrays. This list will be passed
            to f and df in the same order as given here.
            If only one argument, make sure not to pass a list!!!
-            
+
        :type x0: [array-like] | array-like | float | int
        :param names:
            Names to print, when performing gradcheck. If a list was passed to x0
            a list of names with the same length is expected.
        :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
-        
+
        Examples:
        ---------
            from GPy.models import GradientChecker
            N, M, Q = 10, 5, 3
-        
+
            Sinusoid:
-            
+
                X = numpy.random.rand(N, Q)
                grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
                grad.checkgrad(verbose=1)
-    
+
            Using GPy:
-            
+
                X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
                kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
-                grad = GradientChecker(kern.K, 
+                grad = GradientChecker(kern.K,
                                       lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
                                       x0 = X.copy(),
-                                       names='X')  
+                                       names='X')
                grad.checkgrad(verbose=1)
                grad.randomize()
-                grad.checkgrad(verbose=1)      
+                grad.checkgrad(verbose=1)
        """
        Model.__init__(self)
        if isinstance(x0, (list, tuple)) and names is None:
@ -75,14 +75,14 @@ class GradientChecker(Model):
            self.names = names
            self.shapes = [get_shape(x0)]
        for name, xi in zip(self.names, at_least_one_element(x0)):
-            self.__setattr__(name, xi)
+            self.__setattr__(name, numpy.float_(xi))
 #         self._param_names = []
 #         for name, shape in zip(self.names, self.shapes):
 #             self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
        self.args = args
        self.kwargs = kwargs
-        self.f = f
-        self.df = df
+        self._f = f
+        self._df = df

    def _get_x(self):
        if len(self.names) > 1:
@ -90,10 +90,10 @@ class GradientChecker(Model):
        return [self.__getattribute__(self.names[0])] + list(self.args)

    def log_likelihood(self):
-        return float(numpy.sum(self.f(*self._get_x(), **self.kwargs)))
+        return float(numpy.sum(self._f(*self._get_x(), **self.kwargs)))

    def _log_likelihood_gradients(self):
-        return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten()
+        return numpy.atleast_1d(self._df(*self._get_x(), **self.kwargs)).flatten()


    def _get_params(self):
--- a/GPy/models/sparse_gp_classification.py
+++ b/GPy/models/sparse_gp_classification.py
@ -16,7 +16,7 @@ class SparseGPClassification(SparseGP):

    :param X: input observations
    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with probit link_function
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -31,7 +31,7 @@ class SparseGPClassification(SparseGP):
            kernel = kern.rbf(X.shape[1])# + kern.white(X.shape[1],1e-3)

        if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@ -37,7 +37,7 @@ def model_checkgrads(model):

 def model_instance(model):
    #assert isinstance(model, GPy.core.model)
-    return isinstance(model, GPy.core.model)
+    return isinstance(model, GPy.core.model.Model)

@nottest
 def test_models():
@ -54,7 +54,7 @@ def test_models():
        print "After"
        print functions
        for example in functions:
-            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100']:
+            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100', 'brendan_faces']:
                print "SKIPPING"
                continue

--- a/GPy/testing/gp_transformation_tests.py
+++ b/GPy/testing/gp_transformation_tests.py
@ -0,0 +1,61 @@
+from nose.tools import with_setup
+from GPy.models import GradientChecker
+from GPy.likelihoods.noise_models import gp_transformations
+import inspect
+import unittest
+import numpy as np
+
+class TestTransformations(object):
+    """
+    Generic transformations checker
+    """
+    def setUp(self):
+        N = 30
+        self.fs = [np.random.rand(N, 1), float(np.random.rand(1))]
+
+
+    def tearDown(self):
+        self.fs = None
+
+    def test_transformations(self):
+        self.setUp()
+        transformations = [gp_transformations.Identity(),
+                           gp_transformations.Log(),
+                           gp_transformations.Probit(),
+                           gp_transformations.Log_ex_1(),
+                           gp_transformations.Reciprocal(),
+                           ]
+
+        for transformation in transformations:
+            for f in self.fs:
+                yield self.t_dtransf_df, transformation, f
+                yield self.t_d2transf_df2, transformation, f
+                yield self.t_d3transf_df3, transformation, f
+
+    @with_setup(setUp, tearDown)
+    def t_dtransf_df(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.transf, transformation.dtransf_df, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2transf_df2(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.dtransf_df, transformation.d2transf_df2, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3transf_df3(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.d2transf_df2, transformation.d3transf_df3, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+#if __name__ == "__main__":
+    #print "Running unit tests"
+    #unittest.main()
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@ -0,0 +1,577 @@
+import numpy as np
+import unittest
+import GPy
+from GPy.models import GradientChecker
+import functools
+import inspect
+from GPy.likelihoods.noise_models import gp_transformations
+from functools import partial
+
+def dparam_partial(inst_func, *args):
+    """
+    If we have a instance method that needs to be called but that doesn't
+    take the parameter we wish to change to checkgrad, then this function
+    will change the variable using set params.
+
+    inst_func: should be a instance function of an object that we would like
+                to change
+    param: the param that will be given to set_params
+    args: anything else that needs to be given to the function (for example
+          the f or Y that are being used in the function whilst we tweak the
+          param
+    """
+    def param_func(param, inst_func, args):
+        inst_func.im_self._set_params(param)
+        return inst_func(*args)
+    return functools.partial(param_func, inst_func=inst_func, args=args)
+
+def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=False, verbose=False):
+    """
+    checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
+    However if we are holding other parameters fixed and moving something else
+    We need to check the gradient of each of the fixed parameters
+    (f and y for example) seperately,  whilst moving another parameter.
+    Otherwise f: gives back R^N and
+              df: gives back R^NxM where M is
+    The number of parameters and N is the number of data
+    Need to take a slice out from f and a slice out of df
+    """
+    #print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
+                                           #func.__name__, dfunc.__name__)
+    partial_f = dparam_partial(func, *args)
+    partial_df = dparam_partial(dfunc, *args)
+    gradchecking = True
+    for param in params:
+        fnum = np.atleast_1d(partial_f(param)).shape[0]
+        dfnum = np.atleast_1d(partial_df(param)).shape[0]
+        for fixed_val in range(dfnum):
+            #dlik and dlik_dvar gives back 1 value for each
+            f_ind = min(fnum, fixed_val+1) - 1
+            print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
+            #Make grad checker with this param moving, note that set_params is NOT being called
+            #The parameter is being set directly with __setattr__
+            grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
+                                   lambda x : np.atleast_1d(partial_df(x))[fixed_val],
+                                   param, 'p')
+            #This is not general for more than one param...
+            if constraints is not None:
+                for constraint in constraints:
+                    constraint('p', grad)
+            if randomize:
+                grad.randomize()
+            if verbose:
+                print grad
+                grad.checkgrad(verbose=1)
+            if not grad.checkgrad():
+                gradchecking = False
+
+    return gradchecking
+
+
+from nose.tools import with_setup
+class TestNoiseModels(object):
+    """
+    Generic model checker
+    """
+    def setUp(self):
+        self.N = 5
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
+
+        self.real_std = 0.1
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(self.N, 1)
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.positive_Y = np.exp(self.Y.copy())
+        tmp = np.round(self.X[:, 0]*3-3)[:, None] + np.random.randint(0,3, self.X.shape[0])[:, None]
+        self.integer_Y = np.where(tmp > 0, tmp, 0)
+
+        self.var = 0.2
+
+        self.var = np.random.rand(1)
+
+        #Make a bigger step as lower bound can be quite curved
+        self.step = 1e-3
+
+    def tearDown(self):
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_noise_models(self):
+        self.setUp()
+
+        ####################################################
+        # Constraint wrappers so we can just list them off #
+        ####################################################
+        def constrain_negative(regex, model):
+            model.constrain_negative(regex)
+
+        def constrain_positive(regex, model):
+            model.constrain_positive(regex)
+
+        def constrain_bounded(regex, model, lower, upper):
+            """
+            Used like: partial(constrain_bounded, lower=0, upper=1)
+            """
+            model.constrain_bounded(regex, lower, upper)
+
+        """
+        Dictionary where we nest models we would like to check
+            Name: {
+                "model": model_instance,
+                "grad_params": {
+                    "names": [names_of_params_we_want, to_grad_check],
+                    "vals": [values_of_params, to_start_at],
+                    "constrain": [constraint_wrappers, listed_here]
+                    },
+                "laplace": boolean_of_whether_model_should_work_for_laplace,
+                "ep": boolean_of_whether_model_should_work_for_laplace,
+                "link_f_constraints": [constraint_wrappers, listed_here]
+                }
+        """
+        noise_models = {"Student_t_default": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_1_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [1],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_small_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [0.01],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_approx_gauss": {
+                            "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_log": {
+                            "model": GPy.likelihoods.student_t(gp_link=gp_transformations.Log(), deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_default": {
+                            "model": GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True,
+                            "ep": True
+                            },
+                        "Gaussian_log": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_probit": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_log_ex": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Bernoulli_default": {
+                            "model": GPy.likelihoods.bernoulli(),
+                            "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
+                            "laplace": True,
+                            "Y": self.binary_Y,
+                            "ep": True
+                            },
+                        "Exponential_default": {
+                            "model": GPy.likelihoods.exponential(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True,
+                        },
+                        "Poisson_default": {
+                            "model": GPy.likelihoods.poisson(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.integer_Y,
+                            "laplace": True,
+                            "ep": False #Should work though...
+                        },
+                        "Gamma_default": {
+                            "model": GPy.likelihoods.gamma(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True
+                        }
+                    }
+
+        for name, attributes in noise_models.iteritems():
+            model = attributes["model"]
+            if "grad_params" in attributes:
+                params = attributes["grad_params"]
+                param_vals = params["vals"]
+                param_names= params["names"]
+                param_constraints = params["constraints"]
+            else:
+                params = []
+                param_vals = []
+                param_names = []
+                constrain_positive = []
+            if "link_f_constraints" in attributes:
+                link_f_constraints = attributes["link_f_constraints"]
+            else:
+                link_f_constraints = []
+            if "Y" in attributes:
+                Y = attributes["Y"].copy()
+            else:
+                Y = self.Y.copy()
+            if "f" in attributes:
+                f = attributes["f"].copy()
+            else:
+                f = self.f.copy()
+            if "laplace" in attributes:
+                laplace = attributes["laplace"]
+            else:
+                laplace = False
+            if "ep" in attributes:
+                ep = attributes["ep"]
+            else:
+                ep = False
+
+            if len(param_vals) > 1:
+                raise NotImplementedError("Cannot support multiple params in likelihood yet!")
+
+            #Required by all
+            #Normal derivatives
+            yield self.t_logpdf, model, Y, f
+            yield self.t_dlogpdf_df, model, Y, f
+            yield self.t_d2logpdf_df2, model, Y, f
+            #Link derivatives
+            yield self.t_dlogpdf_dlink, model, Y, f, link_f_constraints
+            yield self.t_d2logpdf_dlink2, model, Y, f, link_f_constraints
+            if laplace:
+                #Laplace only derivatives
+                yield self.t_d3logpdf_df3, model, Y, f
+                yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
+                #Params
+                yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_constraints
+                #Link params
+                yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_constraints
+
+                #laplace likelihood gradcheck
+                yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+            if ep:
+                #ep likelihood gradcheck
+                yield self.t_ep_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+
+
+        self.tearDown()
+
+    #############
+    # dpdf_df's #
+    #############
+    @with_setup(setUp, tearDown)
+    def t_logpdf(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        np.testing.assert_almost_equal(
+                               np.log(model.pdf(f.copy(), Y.copy())),
+                               model.logpdf(f.copy(), Y.copy()))
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.description = "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf, y=Y)
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_df2(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_df3(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y)
+        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    ##############
+    # df_dparams #
+    ##############
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_df2_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    ################
+    # dpdf_dlink's #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink(self, model, Y, f, link_f_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf_link, y=Y)
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        print grad
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_dlink2(self, model, Y, f, link_f_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print grad
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_dlink3(self, model, Y, f, link_f_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y)
+        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print grad
+        assert grad.checkgrad()
+
+    #################
+    # dlink_dparams #
+    #################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_link_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    ################
+    # laplace test #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_laplace_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        #Normalize
+        Y = Y/Y.max()
+        white_var = 0.001
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
+
+        m.randomize()
+        m.checkgrad(verbose=1, step=step)
+        print m
+        assert m.checkgrad(step=step)
+
+    ###########
+    # EP test #
+    ###########
+    @with_setup(setUp, tearDown)
+    def t_ep_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        #Normalize
+        Y = Y/Y.max()
+        white_var = 0.001
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
+
+        m.randomize()
+        m.checkgrad(verbose=1, step=step)
+        print m
+        assert m.checkgrad(step=step)
+
+
+class LaplaceTests(unittest.TestCase):
+    """
+    Specific likelihood tests, not general enough for the above tests
+    """
+
+    def setUp(self):
+        self.N = 5
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
+
+        self.real_std = 0.1
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(self.N, 1)
+
+        self.var = 0.2
+
+        self.var = np.random.rand(1)
+        self.stu_t = GPy.likelihoods.student_t(deg_free=5, sigma2=self.var)
+        self.gauss = GPy.likelihoods.gaussian(gp_transformations.Log(), variance=self.var, D=self.D, N=self.N)
+
+        #Make a bigger step as lower bound can be quite curved
+        self.step = 1e-6
+
+    def tearDown(self):
+        self.stu_t = None
+        self.gauss = None
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_gaussian_d2logpdf_df2_2(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = None
+        self.gauss = None
+
+        self.N = 2
+        self.D = 1
+        self.X = np.linspace(0, self.D, self.N)[:, None]
+        self.real_std = 0.2
+        noise = np.random.randn(*self.X.shape)*self.real_std
+        self.Y = np.sin(self.X*2*np.pi) + noise
+        self.f = np.random.rand(self.N, 1)
+        self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
+
+        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
+        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+if __name__ == "__main__":
+    print "Running unit tests"
+    unittest.main()
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -209,7 +209,7 @@ class GradientTests(unittest.TestCase):
        Z = np.linspace(0, 15, 4)[:, None]
        kernel = GPy.kern.rbf(1)
        m = GPy.models.SparseGPClassification(X,Y,kernel=kernel,Z=Z)
-        #distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        #distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
        #likelihood = GPy.likelihoods.EP(Y, distribution)
        #m = GPy.core.SparseGP(X, likelihood, kernel, Z)
        #m.ensure_default_constraints()
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -17,13 +17,13 @@ except ImportError:

 import sys, urllib

-def reporthook(a,b,c): 
+def reporthook(a,b,c):
    # ',' at the end of the line is important!
    #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c),
    #you can also use sys.stdout.write
    sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c))
    sys.stdout.flush()
-     
+
 # Global variables
 data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
@ -39,7 +39,7 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                       'license' : None,
                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
-                   
+
                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
                                      'files' : [['Index', 'housing.data', 'housing.names']],
                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
@ -164,14 +164,14 @@ def prompt_user(prompt):
        print(prompt)
        choice = raw_input().lower()
        # would like to test for exception here, but not sure if we can do that without importing IPython
-    except: 
+    except:
        print('Stdin is not implemented.')
        print('You need to set')
        print('overide_manual_authorize=True')
        print('to proceed with the download. Please set that variable and continue.')
        raise

-    
+
    if choice in yes:
        return True
    elif choice in no:
@ -189,7 +189,7 @@ def data_available(dataset_name=None):
            if not os.path.exists(os.path.join(data_path, dataset_name, file)):
                return False
    return True
-            
+
 def download_url(url, store_directory, save_name = None, messages = True, suffix=''):
    """Download a file from a url and save it to disk."""
    i = url.rfind('/')
@ -249,18 +249,18 @@ def download_data(dataset_name=None):
            for file in files:
                download_url(os.path.join(url,file), dataset_name, dataset_name)
    return True
-                  
+
 def data_details_return(data, data_set):
    """Update the data component of the data dictionary with details drawn from the data_resources."""
    data.update(data_resources[data_set])
    return data

-    
+
 def cmu_urls_files(subj_motions, messages = True):
    '''
-    Find which resources are missing on the local disk for the requested CMU motion capture motions. 
+    Find which resources are missing on the local disk for the requested CMU motion capture motions.
    '''
-    
+
    subjects_num = subj_motions[0]
    motions_num = subj_motions[1]

@ -280,15 +280,15 @@ def cmu_urls_files(subj_motions, messages = True):
            motions[i].append(curMot)

    all_skels = []
-    
+
    assert len(subjects) == len(motions)
-    
+
    all_motions = []
-            
+
    for i in range(len(subjects)):
        skel_dir = os.path.join(data_path, 'cmu_mocap')
        cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf')
-        
+
        url_required = False
        file_download = []
        if not os.path.exists(cur_skel_file):
@ -332,7 +332,7 @@ if gpxpy_available:
            points = [point for track in gpx.tracks for segment in track.segments for point in segment.points]
            data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points]
            X.append(np.asarray(data)[::sample_every, :])
-            gpx_file.close()        
+            gpx_file.close()
        return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set)

 del gpxpy_available
@ -408,7 +408,7 @@ def oil(data_set='three_phase_oil_flow'):
    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xtest' : Xtest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set)
    #else:
    # throw an error
-    
+
 def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'):
    np.random.seed(seed=seed)
    data = oil()
@ -622,7 +622,7 @@ def xw_pen(data_set='xw_pen'):
    X = np.arange(485)[:, None]
    return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen. Plot in original paper showed regression between time steps 175 and 275."}, data_set)

-    
+
 def download_rogers_girolami_data():
    if not data_available('rogers_girolami_data'):
        download_data(data_set)
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -61,6 +61,14 @@ def dpotri(A, lower=0):
    """
    return lapack.dpotri(A, lower=lower)

+def pddet(A):
+    """
+    Determinant of a positive definite matrix, only symmetric matricies though
+    """
+    L = jitchol(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a, b):
    """
    Efficiently compute the trace of the matrix product of a and b
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@ -5,6 +5,33 @@ import numpy as np
 from scipy import weave
 from config import *

+def chain_1(df_dg, dg_dx):
+    """
+    Generic chaining function for first derivative
+
+    .. math::
+        \\frac{d(f . g)}{dx} = \\frac{df}{dg} \\frac{dg}{dx}
+    """
+    return df_dg * dg_dx
+
+def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
+    """
+    Generic chaining function for second derivative
+
+    .. math::
+        \\frac{d^{2}(f . g)}{dx^{2}} = \\frac{d^{2}f}{dg^{2}}(\\frac{dg}{dx})^{2} + \\frac{df}{dg}\\frac{d^{2}g}{dx^{2}}
+    """
+    return d2f_dg2*(dg_dx**2) + df_dg*d2g_dx2
+
+def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
+    """
+    Generic chaining function for third derivative
+
+    .. math::
+        \\frac{d^{3}(f . g)}{dx^{3}} = \\frac{d^{3}f}{dg^{3}}(\\frac{dg}{dx})^{3} + 3\\frac{d^{2}f}{dg^{2}}\\frac{dg}{dx}\\frac{d^{2}g}{dx^{2}} + \\frac{df}{dg}\\frac{d^{3}g}{dx^{3}}
+    """
+    return d3f_dg3*(dg_dx**3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
+
 def opt_wrapper(m, **kwargs):
    """
    This function just wraps the optimization procedure of a GPy
--- a/GPy/util/univariate_Gaussian.py
+++ b/GPy/util/univariate_Gaussian.py
@ -13,24 +13,32 @@ def std_norm_cdf(x):
    Cumulative standard Gaussian distribution
    Based on Abramowitz, M. and Stegun, I. (1970)
    """
+    #Generalize for many x
+    x = np.asarray(x).copy()
+    cdf_x = np.zeros_like(x)
+    N = x.size
    support_code = "#include <math.h>"
    code = """

-    double sign = 1.0;
-    if (x < 0.0){
-        sign = -1.0;
-        x = -x;
+    double sign, t, erf;
+    for (int i=0; i<N; i++){
+        sign = 1.0;
+        if (x[i] < 0.0){
+            sign = -1.0;
+            x[i] = -x[i];
+        }
+        x[i] = x[i]/sqrt(2.0);
+
+        t = 1.0/(1.0 +  0.3275911*x[i]);
+
+        erf = 1. - exp(-x[i]*x[i])*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
+
+        //return_val = 0.5*(1.0 + sign*erf);
+        cdf_x[i] = 0.5*(1.0 + sign*erf);
    }
-    x = x/sqrt(2.0);
-
-    double t = 1.0/(1.0 +  0.3275911*x);
-
-    double erf = 1. - exp(-x*x)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
-
-    return_val = 0.5*(1.0 + sign*erf);
    """
-    x = float(x)
-    return weave.inline(code,arg_names=['x'],support_code=support_code)
+    weave.inline(code, arg_names=['x', 'cdf_x', 'N'], support_code=support_code)
+    return cdf_x

 def inv_std_norm_cdf(x):
    """
--- a/doc/GPy.examples.rst
+++ b/doc/GPy.examples.rst
@ -20,6 +20,14 @@ GPy.examples.dimensionality_reduction module
    :undoc-members:
    :show-inheritance:

+GPy.examples.laplace_approximations module
+------------------------------------------
+
+.. automodule:: GPy.examples.laplace_approximations
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.examples.regression module
 ------------------------------

--- a/doc/GPy.kern.parts.rst
+++ b/doc/GPy.kern.parts.rst
@ -28,6 +28,14 @@ GPy.kern.parts.Matern52 module
    :undoc-members:
    :show-inheritance:

+GPy.kern.parts.ODE_1 module
+---------------------------
+
+.. automodule:: GPy.kern.parts.ODE_1
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.kern.parts.bias module
 --------------------------

@ -44,6 +52,14 @@ GPy.kern.parts.coregionalize module
    :undoc-members:
    :show-inheritance:

+GPy.kern.parts.eq_ode1 module
+-----------------------------
+
+.. automodule:: GPy.kern.parts.eq_ode1
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.kern.parts.exponential module
 ---------------------------------

--- a/doc/GPy.likelihoods.noise_models.rst
+++ b/doc/GPy.likelihoods.noise_models.rst
@ -4,10 +4,10 @@ GPy.likelihoods.noise_models package
 Submodules
 ----------

-GPy.likelihoods.noise_models.binomial_noise module
--------------------------------------------------
+GPy.likelihoods.noise_models.bernoulli_noise module
+---------------------------------------------------

-.. automodule:: GPy.likelihoods.noise_models.binomial_noise
+.. automodule:: GPy.likelihoods.noise_models.bernoulli_noise
    :members:
    :undoc-members:
    :show-inheritance:
@ -60,6 +60,14 @@ GPy.likelihoods.noise_models.poisson_noise module
    :undoc-members:
    :show-inheritance:

+GPy.likelihoods.noise_models.student_t_noise module
+---------------------------------------------------
+
+.. automodule:: GPy.likelihoods.noise_models.student_t_noise
+    :members:
+    :undoc-members:
+    :show-inheritance:
+

 Module contents
 ---------------
--- a/doc/GPy.likelihoods.rst
+++ b/doc/GPy.likelihoods.rst
@ -43,6 +43,14 @@ GPy.likelihoods.gaussian_mixed_noise module
    :undoc-members:
    :show-inheritance:

+GPy.likelihoods.laplace module
+------------------------------
+
+.. automodule:: GPy.likelihoods.laplace
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.likelihoods.likelihood module
 ---------------------------------

--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@ -4,6 +4,14 @@ GPy.testing package
 Submodules
 ----------

+GPy.testing.bcgplvm_tests module
+--------------------------------
+
+.. automodule:: GPy.testing.bcgplvm_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.bgplvm_tests module
 -------------------------------

@ -28,6 +36,14 @@ GPy.testing.examples_tests module
    :undoc-members:
    :show-inheritance:

+GPy.testing.gp_transformation_tests module
+------------------------------------------
+
+.. automodule:: GPy.testing.gp_transformation_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.gplvm_tests module
 ------------------------------

@ -44,6 +60,14 @@ GPy.testing.kernel_tests module
    :undoc-members:
    :show-inheritance:

+GPy.testing.likelihoods_tests module
+------------------------------------
+
+.. automodule:: GPy.testing.likelihoods_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.mapping_tests module
 --------------------------------

--- a/doc/GPy.util.rst
+++ b/doc/GPy.util.rst
@ -27,6 +27,14 @@ GPy.util.classification module
    :undoc-members:
    :show-inheritance:

+GPy.util.config module
+----------------------
+
+.. automodule:: GPy.util.config
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.datasets module
 ------------------------

@ -43,6 +51,14 @@ GPy.util.decorators module
    :undoc-members:
    :show-inheritance:

+GPy.util.erfcx module
+---------------------
+
+.. automodule:: GPy.util.erfcx
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.linalg module
 ----------------------

@ -51,6 +67,14 @@ GPy.util.linalg module
    :undoc-members:
    :show-inheritance:

+GPy.util.ln_diff_erfs module
+----------------------------
+
+.. automodule:: GPy.util.ln_diff_erfs
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.misc module
 --------------------

@ -75,6 +99,14 @@ GPy.util.multioutput module
    :undoc-members:
    :show-inheritance:

+GPy.util.netpbmfile module
+--------------------------
+
+.. automodule:: GPy.util.netpbmfile
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.plot module
 --------------------

@ -99,6 +131,14 @@ GPy.util.squashers module
    :undoc-members:
    :show-inheritance:

+GPy.util.symbolic module
+------------------------
+
+.. automodule:: GPy.util.symbolic
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.univariate_Gaussian module
 -----------------------------------