Merge branch 'devel' of https://github.com/SheffieldML/GPy into devel

2026-07-11 16:22:13 +02:00 · 2013-11-18 10:47:02 +00:00 · 2013-11-18 10:47:02 +00:00 · 357c003b0a
commit 357c003b0a
parent b845c0d634 6e4d625460
67 changed files with 3945 additions and 1257 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,4 +45,4 @@ iterate.dat

 # git merge files #
 ###################
-*.orig
+*.orig
--- a/.travis.yml
+++ b/.travis.yml
@ -7,7 +7,7 @@ virtualenv:
  system_site_packages: true

 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
-before_install: 
+before_install:
  - sudo apt-get install -qq python-scipy python-pip
  - sudo apt-get install -qq python-matplotlib
  # Workaround for a permissions issue with Travis virtual machine images
@ -17,10 +17,10 @@ before_install:
  - sudo ln -s /run/shm /dev/shm

 install:
-  - pip install --upgrade numpy==1.7.1 
-  - pip install sphinx 
+  - pip install --upgrade numpy==1.7.1
+  - pip install sphinx
  - pip install nose
  - pip install . --use-mirrors
 # command to run tests, e.g. python setup.py test
-script: 
+script:
  - nosetests GPy/testing
--- a/GPy/core/fitc.py
+++ b/GPy/core/fitc.py
@ -159,7 +159,7 @@ class FITC(SparseGP):
        A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
        C = -self.output_dim * (np.sum(np.log(np.diag(self.LB))))
        D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
-        return A + C + D
+        return A + C + D + self.likelihood.Z

    def _log_likelihood_gradients(self):
        pass
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -6,7 +6,7 @@ import numpy as np
 import pylab as pb
 from .. import kern
 from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
-from ..likelihoods import EP
+from ..likelihoods import EP, Laplace
 from gp_base import GPBase

 class GP(GPBase):
@ -25,20 +25,23 @@ class GP(GPBase):
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        self.update_likelihood_approximation()

-    def getstate(self):
-        return GPBase.getstate(self)
-
-    def setstate(self, state):
-        GPBase.setstate(self, state)
-        self._set_params(self._get_params())

    def _set_params(self, p):
-        self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
-        self.likelihood._set_params(p[self.kern.num_params_transformed():])
+        new_kern_params = p[:self.kern.num_params_transformed()]
+        new_likelihood_params = p[self.kern.num_params_transformed():]
+        old_likelihood_params = self.likelihood._get_params()
+
+        self.kern._set_params_transformed(new_kern_params)
+        self.likelihood._set_params_transformed(new_likelihood_params)

        self.K = self.kern.K(self.X)
+
+        #Re fit likelihood approximation (if it is an approx), as parameters have changed
+        if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.K)
+
        self.K += self.likelihood.covariance_matrix

        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
@ -55,6 +58,10 @@ class GP(GPBase):
            tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
            self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)

+        #Adding dZ_dK (0 for a non-approximate likelihood, compensates for
+        #additional gradients of K when log-likelihood has non-zero Z term)
+        self.dL_dK += self.likelihood.dZ_dK
+
    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))

@ -94,19 +101,13 @@ class GP(GPBase):
        return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) -
            0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z)

-
    def _log_likelihood_gradients(self):
        """
        The gradient of all parameters.

        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
        """
-        #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
-        if not isinstance(self.likelihood,EP):
-            tmp = np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
-        else:
-            tmp = np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
-        return tmp
+        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
        """
@ -193,3 +194,11 @@ class GP(GPBase):
        """
        Xnew = self._add_output_index(Xnew, output)
        return self.predict(Xnew, which_parts=which_parts, full_cov=full_cov, likelihood_args=likelihood_args)
+
+    def getstate(self):
+        return GPBase.getstate(self)
+
+    def setstate(self, state):
+        GPBase.setstate(self, state)
+        self._set_params(self._get_params())
+
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@ -9,11 +9,16 @@ from ..likelihoods import Gaussian, Gaussian_Mixed_Noise
 class GPBase(Model):
    """
    Gaussian process base model for holding shared behaviour between
-    sparse_GP and GP models.
+    sparse_GP and GP models, and potentially other models in the future.
+
+    Here we define some functions that are use
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
+        if len(X.shape)==1:
+            X = X.reshape(-1,1)
+            warning.warn("One dimension output (N,) being reshaped to (N,1)")
        self.X = X
-        assert len(self.X.shape) == 2
+        assert len(self.X.shape) == 2, "too many dimensions for X input"
        self.num_data, self.input_dim = self.X.shape
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
@ -34,31 +39,8 @@ class GPBase(Model):
        # All leaf nodes should call self._set_params(self._get_params()) at
        # the end

-    def getstate(self):
-        """
-        Get the current state of the class, here we return everything that is needed to recompute the model.
-        """
-        return Model.getstate(self) + [self.X,
-                self.num_data,
-                self.input_dim,
-                self.kern,
-                self.likelihood,
-                self.output_dim,
-                self._Xoffset,
-                self._Xscale]

-    def setstate(self, state):
-        self._Xscale = state.pop()
-        self._Xoffset = state.pop()
-        self.output_dim = state.pop()
-        self.likelihood = state.pop()
-        self.kern = state.pop()
-        self.input_dim = state.pop()
-        self.num_data = state.pop()
-        self.X = state.pop()
-        Model.setstate(self, state)
-
-    def posterior_samples_f(self,X,size=10,which_parts='all',full_cov=True):
+    def posterior_samples_f(self,X,size=10,which_parts='all'):
        """
        Samples the posterior GP at the points X.

@ -72,16 +54,13 @@ class GPBase(Model):
        :type full_cov: bool.
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
-        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=full_cov)
+        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=True)
        v = v.reshape(m.size,-1) if len(v.shape)==3 else v
-        if not full_cov:
-            Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
-        else:
-            Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
+        Ysim = np.random.multivariate_normal(m.flatten(), v, size).T

        return Ysim

-    def posterior_samples(self,X,size=10,which_parts='all',full_cov=True,noise_model=None):
+    def posterior_samples(self,X,size=10,which_parts='all',noise_model=None):
        """
        Samples the posterior GP at the points X.

@ -97,7 +76,7 @@ class GPBase(Model):
        :type noise_model: integer.
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
-        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=full_cov)
+        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=True)
        if isinstance(self.likelihood,Gaussian):
            noise_std = np.sqrt(self.likelihood._get_params())
            Ysim += np.random.normal(0,noise_std,Ysim.shape)
@ -110,90 +89,43 @@ class GPBase(Model):

        return Ysim

-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def plot_f(self, *args, **kwargs):
        """
-        Plot the GP's view of the world, where the data is normalized and the
-          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
-          - In two dimsensions, a contour-plot shows the mean predicted function
-          - Not implemented in higher dimensions
+        Plot the GP's view of the world, where the data is normalized and before applying a likelihood.

-        :param samples: the number of a posteriori samples to plot
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param full_cov:
-        :type full_cov: bool
-                :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
+        This is a convenience function: we simply call self.plot with the
+        argument use_raw_predict set True. All args and kwargs are passed on to
+        plot.

-        :param output: which output to plot (for multiple output models only)
-        :type output: integer (first output is 0)
+        see also: gp_base.plot
        """
-        if which_data == 'all':
-            which_data = slice(None)
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if self.X.shape[1] == 1:
-            resolution = resolution or 200
-            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
-
-            m, v = self._raw_predict(Xnew, which_parts=which_parts)
-            if samples:
-                Ysim = self.posterior_samples_f(Xnew, samples, which_parts=which_parts, full_cov=True)
-                for yi in Ysim.T:
-                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
-            gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
-
-            ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
-            ax.set_xlim(xmin, xmax)
-            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 2:
-
-            resolution = resolution or 50
-            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
-            m, v = self._raw_predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
-            ax.set_xlim(xmin[0], xmax[0])
-            ax.set_ylim(xmin[1], xmax[1])
-
-            if samples:
-                warnings.warn("Samples only implemented for 1 dimensional inputs.")
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
-        """
-        Plot the GP with noise where the likelihood is Gaussian.
+        kwargs['plot_raw'] = True
+        self.plot(*args, **kwargs)

+    def plot(self, plot_limits=None, which_data_rows='all',
+            which_data_ycols='all', which_parts='all', fixed_inputs=[],
+            levels=20, samples=0, fignum=None, ax=None, resolution=None,
+            plot_raw=False,
+            linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+        """ 
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimsensions, a contour-plot shows the mean predicted function
-          - Not implemented in higher dimensions
+          - In higher dimensions, use fixed_inputs to plot the GP  with some of the inputs fixed.

        Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
+        using which_data_rowsm which_data_ycols and which_parts

        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
        :type plot_limits: np.array
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
+        :type which_data_rows: 'all' or a list of integers
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
+        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
+        :type fixed_inputs: a list of tuples
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param levels: number of levels to plot in a contour plot.
@ -205,216 +137,139 @@ class GPBase(Model):
        :param ax: axes to plot on.
        :type ax: axes handle
        :type output: integer (first output is 0)
-        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
-        :type fixed_inputs: a list of tuples
        :param linecol: color of line to plot.
        :type linecol:
        :param fillcol: color of fill
        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
        """
-        if which_data == 'all':
-            which_data = slice(None)
-
+        #deal with optional arguments
+        if which_data_rows == 'all':
+            which_data_rows = slice(None)
+        if which_data_ycols == 'all':
+            which_data_ycols = np.arange(self.output_dim)
+        if len(which_data_ycols)==0:
+            raise ValueError('No data selected for plotting')
        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)

-        plotdims = self.input_dim - len(fixed_inputs)
-        if plotdims == 1:
+        #work out what the inputs are for plotting (1D or 2D)
+        fixed_dims = np.array([i for i,v in fixed_inputs])
+        free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
+
+        #one dimensional plotting
+        if len(free_dims) == 1:
+
+            #define the frame on which to plot
            resolution = resolution or 200
-
            Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
-
-            fixed_dims = np.array([i for i,v in fixed_inputs])
-            freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
-
-            Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
+            Xnew, xmin, xmax = x_frame1D(Xu[:,free_dims], plot_limits=plot_limits)
            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
-            Xgrid[:,freedim] = Xnew
+            Xgrid[:,free_dims] = Xnew
            for i,v in fixed_inputs:
                Xgrid[:,i] = v

-            m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+            #make a prediction on the frame and plot it
+            if plot_raw:
+                m, v = self._raw_predict(Xgrid, which_parts=which_parts)
+                lower = m - 2*np.sqrt(v)
+                upper = m + 2*np.sqrt(v)
+                Y = self.likelihood.Y
+            else:
+                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=False) #Compute the exact mean
+                m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=True,num_samples=15000) #Apporximate the percentiles
+                Y = self.likelihood.data
+            for d in which_data_ycols:
+                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
+                ax.plot(Xu[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)

+            #optionally plot some samples
            if samples: #NOTE not tested with fixed_inputs
                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts, full_cov=True)
                for yi in Ysim.T:
                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                    #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.

-            for d in range(m.shape[1]):
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+            #set the limits of the plot to some sensible values
+            ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_xlim(xmin, xmax)
            ax.set_ylim(ymin, ymax)

-        elif self.X.shape[1] == 2:
+        #2D plotting
+        elif len(free_dims) == 2:

+            #define the frame for plotting on
            resolution = resolution or 50
-            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+            Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
+            Xnew, _, _, xmin, xmax = x_frame2D(Xu[:,free_dims], plot_limits, resolution)
+            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
+            Xgrid[:,free_dims] = Xnew
+            for i,v in fixed_inputs:
+                Xgrid[:,i] = v
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
-            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
-            Yf = self.likelihood.Y.flatten()
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
+
+            #predict on the frame and plot
+            if plot_raw:
+                m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.Y
+            else:
+                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,sampling=False)
+                Y = self.likelihood.data
+            for d in which_data_ycols:
+                m_d = m[:,d].reshape(resolution, resolution).T
+                ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+
+            #set the limits of the plot to some sensible values
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])

            if samples:
-                warnings.warn("Samples only implemented for 1 dimensional inputs.")
+                warnings.warn("Samples are rather difficult to plot for 2D inputs...")

        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

-    def plot_single_output_f(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def getstate(self):
        """
-        For a specific output, in a multioutput model, this function works just as plot_f on single output models.
-
-        :param output: which output to plot (for multiple output models only)
-        :type output: integer (first output is 0)
-        :param samples: the number of a posteriori samples to plot
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param full_cov:
-        :type full_cov: bool
-                :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
+        Get the curent state of the class. This is only used to efficiently
+        pickle the model. See also self.setstate
        """
-        assert output is not None, "An output must be specified."
-        assert len(self.likelihood.noise_model_list) > output, "The model has only %s outputs." %(self.output_dim + 1)
+        return Model.getstate(self) + [self.X,
+                self.num_data,
+                self.input_dim,
+                self.kern,
+                self.likelihood,
+                self.output_dim,
+                self._Xoffset,
+                self._Xscale]

-        if which_data == 'all':
-            which_data = slice(None)
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if self.X.shape[1] == 2:
-            Xu = self.X[self.X[:,-1]==output ,0:1]
-            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
-            Xnew_indexed = self._add_output_index(Xnew,output)
-
-            m, v = self._raw_predict(Xnew_indexed, which_parts=which_parts)
-
-            if samples:
-                Ysim = self.posterior_samples_f(Xnew_indexed, samples, which_parts=which_parts, full_cov=True)
-                for yi in Ysim.T:
-                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
-
-            gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
-            ax.plot(Xu[which_data], self.likelihood.Y[self.likelihood.index==output][:,None], 'kx', mew=1.5)
-            ax.set_xlim(xmin, xmax)
-            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 3:
-            raise NotImplementedError, "Plots not implemented for multioutput models with 2D inputs...yet"
-            #if samples:
-            #    warnings.warn("Samples only implemented for 1 dimensional inputs.")
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-
-    def plot_single_output(self, output=None, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+    def setstate(self, state):
        """
-        For a specific output, in a multioutput model, this function works just as plot_f on single output models.
-
-        :param output: which output to plot (for multiple output models only)
-        :type output: integer (first output is 0)
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :type plot_limits: np.array
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param levels: number of levels to plot in a contour plot.
-        :type levels: int
-        :param samples: the number of a posteriori samples to plot
-        :type samples: int
-        :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
-        :type output: integer (first output is 0)
-        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
-        :type fixed_inputs: a list of tuples
-        :param linecol: color of line to plot.
-        :type linecol:
-        :param fillcol: color of fill
-        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
+        Set the state of the model. Used for efficient pickling
        """
-        assert output is not None, "An output must be specified."
-        assert len(self.likelihood.noise_model_list) > output, "The model has only %s outputs." %(self.output_dim + 1)
-        if which_data == 'all':
-            which_data = slice(None)
+        self._Xscale = state.pop()
+        self._Xoffset = state.pop()
+        self.output_dim = state.pop()
+        self.likelihood = state.pop()
+        self.kern = state.pop()
+        self.input_dim = state.pop()
+        self.num_data = state.pop()
+        self.X = state.pop()
+        Model.setstate(self, state)

-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if self.X.shape[1] == 2:
-            resolution = resolution or 200
-
-            Xu = self.X[self.X[:,-1]==output,:] #keep the output of interest
-            Xu = self.X * self._Xscale + self._Xoffset
-            Xu = self.X[self.X[:,-1]==output ,0:1] #get rid of the index column
-
-            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
-            Xnew_indexed = self._add_output_index(Xnew,output)
-
-
-            m, v, lower, upper = self.predict(Xnew_indexed, which_parts=which_parts,noise_model=output)
-
-            if samples: #NOTE not tested with fixed_inputs
-                Ysim = self.posterior_samples(Xnew_indexed, samples, which_parts=which_parts, full_cov=True,noise_model=output)
-                for yi in Ysim.T:
-                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
-
-            for d in range(m.shape[1]):
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data], self.likelihood.noise_model_list[output].data, 'kx', mew=1.5)
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_xlim(xmin, xmax)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 3:
-            raise NotImplementedError, "Plots not implemented for multioutput models with 2D inputs...yet"
-            #if samples:
-            #    warnings.warn("Samples only implemented for 1 dimensional inputs.")
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-
-    def _add_output_index(self,X,output):
+    def log_predictive_density(self, x_test, y_test):
        """
-        In a multioutput model, appends an index column to X to specify the output it is related to.
+        Calculation of the log predictive density

-        :param X: Input data
-        :type X: np.ndarray, N x self.input_dim
-        :param output: output X is related to
-        :type output: integer in {0,..., output_dim-1}
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})

-        .. Note:: For multiple non-independent outputs models only.
+        :param x_test: test observations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
        """
-
-        assert hasattr(self,'multioutput'), 'This function is for multiple output models only.'
-
-        index = np.ones((X.shape[0],1))*output
-        return np.hstack((X,index))
+        mu_star, var_star = self._raw_predict(x_test)
+        return self.likelihood.log_predictive_density(y_test, mu_star, var_star)
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -251,7 +251,7 @@ class Model(Parameterized):
            self._set_params_transformed(initial_parameters)

    def ensure_default_constraints(self):
-        """       
+        """
        Ensure that any variables which should clearly be positive
        have been constrained somehow. The method performs a regular
        expression search on parameter names looking for the terms
@ -274,7 +274,7 @@ class Model(Parameterized):
        """
        The objective function passed to the optimizer. It combines
        the likelihood and the priors.
-        
+
        Failures are handled robustly. The algorithm will try several times to
        return the objective, and will raise the original exception if it
        the objective cannot be computed.
@ -462,7 +462,7 @@ class Model(Parameterized):

            numerical_gradient = (f1 - f2) / (2 * dx)
            global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient)))
-            
+
            return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
        else:
            # check the gradient of each parameter individually, and do some pretty printing
@ -547,9 +547,9 @@ class Model(Parameterized):
        :param stop_crit: convergence criterion
        :type stop_crit: float

-        .. Note: kwargs are passed to update_likelihood and optimize functions. 
+        .. Note: kwargs are passed to update_likelihood and optimize functions.
        """
-        assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods"
+        assert isinstance(self.likelihood, (likelihoods.EP, likelihoods.EP_Mixed_Noise, likelihoods.Laplace)), "pseudo_EM is only available for approximate likelihoods"
        ll_change = stop_crit + 1.
        iteration = 0
        last_ll = -np.inf
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -52,23 +52,6 @@ class SparseGP(GPBase):

        self._const_jitter = None

-    def getstate(self):
-        """
-        Get the current state of the class,
-        here just all the indices, rest can get recomputed
-        """
-        return GPBase.getstate(self) + [self.Z,
-                self.num_inducing,
-                self.has_uncertain_inputs,
-                self.X_variance]
-
-    def setstate(self, state):
-        self.X_variance = state.pop()
-        self.has_uncertain_inputs = state.pop()
-        self.num_inducing = state.pop()
-        self.Z = state.pop()
-        GPBase.setstate(self, state)
-
    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
@ -87,7 +70,6 @@ class SparseGP(GPBase):

        # factor Kmm
        self._Lm = jitchol(self.Kmm + self._const_jitter)
-        # TODO: no white kernel needed anymore, all noise in likelihood --------

        # The rather complex computations of self._A
        if self.has_uncertain_inputs:
@ -341,7 +323,10 @@ class SparseGP(GPBase):
        return mean, var, _025pm, _975pm


-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def plot_f(self, samples=0, plot_limits=None, which_data_rows='all',
+            which_data_ycols='all', which_parts='all', resolution=None,
+            full_cov=False, fignum=None, ax=None):
+
        """
        Plot the GP's view of the world, where the data is normalized and the
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -350,8 +335,8 @@ class SparseGP(GPBase):

        :param samples: the number of a posteriori samples to plot
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_rows: which if the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
@ -371,10 +356,10 @@ class SparseGP(GPBase):
            ax = fig.add_subplot(111)
        if fignum is None and ax is None:
                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
+        if which_data_rows is 'all':
+            which_data_rows = slice(None)

-        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
+        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, fignum=fignum, ax=ax)

        if self.X.shape[1] == 1:
            if self.has_uncertain_inputs:
@ -389,177 +374,98 @@ class SparseGP(GPBase):
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')

-
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

-    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
+    def plot(self, plot_limits=None, which_data_rows='all',
+            which_data_ycols='all', which_parts='all', fixed_inputs=[],
+            plot_raw=False,
+            levels=20, samples=0, fignum=None, ax=None, resolution=None):
+        """ 
+        Plot the posterior of the sparse GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimsensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, use fixed_inputs to plot the GP  with some of the inputs fixed.
+
+        Can plot only part of the data and part of the posterior functions
+        using which_data_rowsm which_data_ycols and which_parts
+
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
+        :type plot_limits: np.array
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
+        :type which_data_rows: 'all' or a list of integers
+        :param which_parts: which of the kernel functions to plot (additively)
+        :type which_parts: 'all', or list of bools
+        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
+        :type fixed_inputs: a list of tuples
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+        :type resolution: int
+        :param levels: number of levels to plot in a contour plot.
+        :type levels: int
+        :param samples: the number of a posteriori samples to plot
+        :type samples: int
+        :param fignum: figure to plot on.
+        :type fignum: figure number
+        :param ax: axes to plot on.
+        :type ax: axes handle
+        :type output: integer (first output is 0)
+        :param linecol: color of line to plot.
+        :type linecol:
+        :param fillcol: color of fill
+        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
+        """
+        #deal work out which ax to plot on
        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)

-        GPBase.plot(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, levels=20, fignum=fignum, ax=ax)
+        #work out what the inputs are for plotting (1D or 2D)
+        fixed_dims = np.array([i for i,v in fixed_inputs])
+        free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)

-        if self.X.shape[1] == 1:
+        #call the base plotting
+        GPBase.plot(self, samples=samples, plot_limits=plot_limits,
+                which_data_rows=which_data_rows,
+                which_data_ycols=which_data_ycols, fixed_inputs=fixed_inputs,
+                which_parts=which_parts, resolution=resolution, levels=20,
+                fignum=fignum, ax=ax)
+
+        if len(free_dims) == 1:
+            #plot errorbars for the uncertain inputs
            if self.has_uncertain_inputs:
                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
+                ax.errorbar(Xu[which_data_rows, 0], self.likelihood.data[which_data_rows, 0],
+                            xerr=2 * np.sqrt(self.X_variance[which_data_rows, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+
+            #plot the inducing inputs
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)

-        elif self.X.shape[1] == 2:
+        elif len(free_dims) == 2:
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')

-
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

-    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
+    def getstate(self):
        """
-        For a specific output, predict the function at the new point(s) Xnew.
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param output: output to predict
-        :type output: integer in {0,..., num_outputs-1}
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: bool
-        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
-        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
-
-        .. Note:: For multiple output models only
+        Get the current state of the class,
+        here just all the indices, rest can get recomputed
        """
+        return GPBase.getstate(self) + [self.Z,
+                self.num_inducing,
+                self.has_uncertain_inputs,
+                self.X_variance]

-        assert hasattr(self,'multioutput')
-        index = np.ones_like(Xnew)*output
-        Xnew = np.hstack((Xnew,index))
-
-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
-
-        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model = output)
-        return mean, var, _025pm, _975pm
-
-    def _raw_predict_single_output(self, _Xnew, output=0, X_variance_new=None, which_parts='all', full_cov=False,stop=False):
-        """
-        Internal helper function for making predictions for a specific output,
-        does not account for normalization or likelihood
-        ---------
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param output: output to predict
-        :type output: integer in {0,..., num_outputs-1}
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-
-        .. Note:: For multiple output models only
-        """
-        Bi, _ = dpotri(self.LB, lower=0)  # WTH? this lower switch should be 1, but that doesn't work!
-        symmetrify(Bi)
-        Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
-
-        if self.Cpsi1V is None:
-            psi1V = np.dot(self.psi1.T,self.likelihood.V)
-            tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
-            tmp, _ = dpotrs(self.LB, tmp, lower=1)
-            self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
-
-        assert hasattr(self,'multioutput')
-        index = np.ones_like(_Xnew)*output
-        _Xnew = np.hstack((_Xnew,index))
-
-        if X_variance_new is None:
-            Kx = self.kern.K(self.Z, _Xnew, which_parts=which_parts)
-            mu = np.dot(Kx.T, self.Cpsi1V)
-            if full_cov:
-                Kxx = self.kern.K(_Xnew, which_parts=which_parts)
-                var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
-            else:
-                Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
-                var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
-        else:
-            Kx = self.kern.psi1(self.Z, _Xnew, X_variance_new)
-            mu = np.dot(Kx, self.Cpsi1V)
-            if full_cov:
-                raise NotImplementedError, "TODO"
-            else:
-                Kxx = self.kern.psi0(self.Z, _Xnew, X_variance_new)
-                psi2 = self.kern.psi2(self.Z, _Xnew, X_variance_new)
-                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-
-        return mu, var[:, None]
+    def setstate(self, state):
+        self.X_variance = state.pop()
+        self.has_uncertain_inputs = state.pop()
+        self.num_inducing = state.pop()
+        self.Z = state.pop()
+        GPBase.setstate(self, state)


-    def plot_single_output_f(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
-
-        GPBase.plot_single_output_f(self, output=output, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
-
-        if self.X.shape[1] == 2:
-            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu[:,0], np.zeros_like(Zu[:,0]) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
-
-        elif self.X.shape[1] == 2:
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:2]
-            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
-
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-    def plot_single_output(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
-
-        GPBase.plot_single_output(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, levels=20, fignum=fignum, ax=ax, output=output)
-
-        if self.X.shape[1] == 2:
-            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
-
-        elif self.X.shape[1] == 3:
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@ -18,30 +18,16 @@ class SVIGP(GPBase):
    Stochastic Variational inference in a Gaussian Process

    :param X: inputs
-    :type X: np.ndarray (N x Q)
+    :type X: np.ndarray (num_data x num_inputs)
    :param Y: observed data
-    :type Y: np.ndarray of observations (N x D)
-    :param batchsize: the size of a h
-
-    Additional kwargs are used as for a sparse GP. They include:
-
+    :type Y: np.ndarray of observations (num_data x output_dim)
+    :param batchsize: the size of a minibatch
    :param q_u: canonical parameters of the distribution squasehd into a 1D array
    :type q_u: np.ndarray
-    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
    :param kernel: the kernel/covariance function. See link kernels
    :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
-    :type X_uncertainty: np.ndarray (N x Q) | None
-    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
-    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
-    :param beta: noise precision. TODO: ignore beta if doing EP
-    :type beta: float
-    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool
+    :param Z: inducing inputs
+    :type Z: np.ndarray (num_inducing x num_inputs)

    """

--- a/GPy/core/variational.py
+++ b/GPy/core/variational.py
@ -0,0 +1,19 @@
+'''
+Created on 6 Nov 2013
+
+@author: maxz
+'''
+from parameterized import Parameterized
+from parameter import Param
+
+class Normal(Parameterized):
+    '''
+    Normal distribution for variational approximations.
+    
+    holds the means and variances for a factorizing multivariate normal distribution
+    '''
+    def __init__(self, name, means, variances):
+        Parameterized.__init__(self, name=name)
+        self.means = Param("mean", means)
+        self.variances = Param('variance', variances)
+        self.add_parameters(self.means, self.variances)
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -43,7 +43,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None):

 def toy_linear_1d_classification(seed=default_seed):
    """
-    Simple 1D classification example
+    Simple 1D classification example using EP approximation

    :param seed: seed value for data generation (default is 4).
    :type seed: int
@ -61,6 +61,7 @@ def toy_linear_1d_classification(seed=default_seed):
    #m.update_likelihood_approximation()
    # Parameters optimization:
    #m.optimize()
+    #m.update_likelihood_approximation()
    m.pseudo_EM()

    # Plot
@ -71,6 +72,41 @@ def toy_linear_1d_classification(seed=default_seed):

    return m

+def toy_linear_1d_classification_laplace(seed=default_seed):
+    """
+    Simple 1D classification example using Laplace approximation
+
+    :param seed: seed value for data generation (default is 4).
+    :type seed: int
+
+    """
+
+    data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+    Y = data['Y'][:, 0:1]
+    Y[Y.flatten() == -1] = 0
+
+    bern_noise_model = GPy.likelihoods.bernoulli()
+    laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), bern_noise_model)
+
+    # Model definition
+    m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
+
+    print m
+    # Optimize
+    #m.update_likelihood_approximation()
+    # Parameters optimization:
+    m.optimize('bfgs', messages=1)
+    #m.pseudo_EM()
+
+    # Plot
+    fig, axes = pb.subplots(2,1)
+    m.plot_f(ax=axes[0])
+    m.plot(ax=axes[1])
+    print(m)
+
+    return m
+
+
 def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
    """
    Sparse 1D classification example
@ -116,7 +152,7 @@ def toy_heaviside(seed=default_seed):
    Y[Y.flatten() == -1] = 0

    # Model definition
-    noise_model = GPy.likelihoods.binomial(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
+    noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
    likelihood = GPy.likelihoods.EP(Y,noise_model)
    m = GPy.models.GPClassification(data['X'], likelihood=likelihood)

--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -12,10 +12,10 @@ from GPy.likelihoods.gaussian import Gaussian
 default_seed = np.random.seed(123344)

 def BGPLVM(seed=default_seed):
-    N = 5
-    num_inducing = 4
-    Q = 3
-    D = 2
+    N = 13
+    num_inducing = 5
+    Q = 6
+    D = 25
    # generate GPLVM-like data
    X = np.random.rand(N, Q)
    lengthscales = np.random.rand(Q)
@ -25,9 +25,12 @@ def BGPLVM(seed=default_seed):
    Y = np.random.multivariate_normal(np.zeros(N), K, D).T
    lik = Gaussian(Y, normalize=True)

-    k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
+    # k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
    # k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
    # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
+    # k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.rbf(Q, .3, np.ones(Q) * .2, ARD=True)
+    k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.linear(Q, np.ones(Q) * .2, ARD=True)
+    # k = GPy.kern.rbf(Q, .5, 2., ARD=0) + GPy.kern.rbf(Q, .3, .2, ARD=0)

    m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing)
    m.lengthscales = lengthscales
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@ -0,0 +1,296 @@
+import GPy
+import numpy as np
+import matplotlib.pyplot as plt
+from GPy.util import datasets
+np.random.seed(1)
+
+def student_t_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    real_std = 0.1
+    #Start a function, any function
+    X = np.linspace(0.0, np.pi*2, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+    Yc = Y.copy()
+
+    X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
+    Y_full = np.sin(X_full)
+
+    Y = Y/Y.max()
+
+    #Slightly noisy data
+    Yc[75:80] += 1
+
+    #Very noisy data
+    #Yc[10] += 100
+    #Yc[25] += 10
+    #Yc[23] += 10
+    #Yc[26] += 1000
+    #Yc[24] += 10
+    #Yc = Yc/Yc.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 5
+    print "Real noise: ", real_std
+    initial_var_guess = 0.5
+
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
+
+    plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GPRegression(X, Y, kernel=kernel1)
+    # optimize
+    m.ensure_default_constraints()
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    m.optimize()
+    # plot
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Gaussian clean')
+    print m
+
+    #Corrupt
+    print "Corrupt Gaussian"
+    m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    m.optimize()
+    ax = plt.subplot(212)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Gaussian corrupt')
+    print m
+
+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = initial_var_guess
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
+    m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
+    m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    #m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Student-t rasm clean')
+
+    print "Corrupt student t, rasm"
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
+    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
+    m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
+    m.constrain_fixed('white', 1e-4)
+    m.randomize()
+    for a in range(1):
+        m.randomize()
+        m_start = m.copy()
+        print m
+        m.optimize('scg', messages=1)
+    print(m)
+    ax = plt.subplot(212)
+    m.plot(ax=ax)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
+    plt.title('Student-t rasm corrupt')
+
+    return m
+
+def boston_example():
+    import sklearn
+    from sklearn.cross_validation import KFold
+    optimizer='bfgs'
+    messages=0
+    data = datasets.boston_housing()
+    degrees_freedoms = [3, 5, 8, 10]
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    num_folds = 10
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
+    score_folds = np.zeros((num_models, num_folds))
+    pred_density = score_folds.copy()
+
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
+
+        noise = 1e-1 #np.exp(-2)
+        rbf_len = 0.5
+        data_axis_plot = 4
+        plot = False
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+
+        #Baseline
+        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
+
+        #Gaussian GP
+        print "Gauss GP"
+        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
+        mgp.ensure_default_constraints()
+        mgp.constrain_fixed('white', 1e-5)
+        mgp['rbf_len'] = rbf_len
+        mgp['noise'] = noise
+        print mgp
+        mgp.optimize(optimizer=optimizer, messages=messages)
+        Y_test_pred = mgp.predict(X_test)
+        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
+        print mgp
+        print pred_density
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('GP gauss')
+
+        print "Gaussian Laplace GP"
+        N, D = Y_train.shape
+        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
+        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
+        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
+        mg.ensure_default_constraints()
+        mg.constrain_positive('noise_variance')
+        mg.constrain_fixed('white', 1e-5)
+        mg['rbf_len'] = rbf_len
+        mg['noise'] = noise
+        print mg
+        try:
+            mg.optimize(optimizer=optimizer, messages=messages)
+        except Exception:
+            print "Blew up"
+        Y_test_pred = mg.predict(X_test)
+        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
+        print pred_density
+        print mg
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('Lap gauss')
+
+        for stu_num, df in enumerate(degrees_freedoms):
+            #Student T
+            print "Student-T GP {}df".format(df)
+            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
+            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
+            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
+            mstu_t.ensure_default_constraints()
+            mstu_t.constrain_fixed('white', 1e-5)
+            mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+            mstu_t['rbf_len'] = rbf_len
+            mstu_t['t_noise'] = noise
+            print mstu_t
+            try:
+                mstu_t.optimize(optimizer=optimizer, messages=messages)
+            except Exception:
+                print "Blew up"
+            Y_test_pred = mstu_t.predict(X_test)
+            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
+            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
+            print pred_density
+            print mstu_t
+            if plot:
+                plt.figure()
+                plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+                plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+                plt.title('Stu t {}df'.format(df))
+
+    print "Average scores: {}".format(np.mean(score_folds, 1))
+    print "Average pred density: {}".format(np.mean(pred_density, 1))
+
+    #Plotting
+    stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
+    legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
+
+    #Plot boxplots for RMSE density
+    fig = plt.figure()
+    ax=fig.add_subplot(111)
+    plt.title('RMSE')
+    bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
+    plt.setp(bp['boxes'], color='black')
+    plt.setp(bp['whiskers'], color='black')
+    plt.setp(bp['fliers'], color='red', marker='+')
+    xtickNames = plt.setp(ax, xticklabels=legends)
+    plt.setp(xtickNames, rotation=45, fontsize=8)
+    ax.set_ylabel('RMSE')
+    ax.set_xlabel('Distribution')
+    #Make grid and put it below boxes
+    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+              alpha=0.5)
+    ax.set_axisbelow(True)
+
+    #Plot boxplots for predictive density
+    fig = plt.figure()
+    ax=fig.add_subplot(111)
+    plt.title('Predictive density')
+    bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
+    plt.setp(bp['boxes'], color='black')
+    plt.setp(bp['whiskers'], color='black')
+    plt.setp(bp['fliers'], color='red', marker='+')
+    xtickNames = plt.setp(ax, xticklabels=legends[1:])
+    plt.setp(xtickNames, rotation=45, fontsize=8)
+    ax.set_ylabel('Mean Log probability P(Y*|Y)')
+    ax.set_xlabel('Distribution')
+    #Make grid and put it below boxes
+    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+              alpha=0.5)
+    ax.set_axisbelow(True)
+    return mstu_t
+
+def precipitation_example():
+    import sklearn
+    from sklearn.cross_validation import KFold
+    data = datasets.boston_housing()
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    num_folds = 10
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    score_folds = np.zeros((4, num_folds))
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+    #for train, test in kf:
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -57,8 +57,8 @@ def coregionalization_toy(max_iters=100):
    m.optimize(max_iters=max_iters)

    fig, axes = pb.subplots(2,1)
-    m.plot_single_output(output=0,ax=axes[0])
-    m.plot_single_output(output=1,ax=axes[1])
+    m.plot(fixed_inputs=[(1,0)],ax=axes[0])
+    m.plot(fixed_inputs=[(1,1)],ax=axes[1])
    axes[0].set_title('Output 0')
    axes[1].set_title('Output 1')
    return m
@ -270,6 +270,50 @@ def toy_rbf_1d_50(max_iters=100):
    print(m)
    return m

+def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
+    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    x_len = 400
+    X = np.linspace(0, 10, x_len)[:, None]
+    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+
+    noise_model = GPy.likelihoods.poisson()
+    likelihood = GPy.likelihoods.EP(Y,noise_model)
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+
+    # optimize
+    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+    # plot
+    m.plot()
+    print(m)
+    return m
+
+def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
+    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    x_len = 30
+    X = np.linspace(0, 10, x_len)[:, None]
+    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+
+    noise_model = GPy.likelihoods.poisson()
+    likelihood = GPy.likelihoods.Laplace(Y,noise_model)
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+
+    # optimize
+    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+    # plot
+    m.plot()
+    # plot the real underlying rate function
+    pb.plot(X, np.exp(f_true), '--k', linewidth=2)
+    print(m)
+    return m
+
+
+
 def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
--- a/GPy/gpy_config.cfg
+++ b/GPy/gpy_config.cfg
@ -4,4 +4,4 @@
 # Enable openmp support. This speeds up some computations, depending on the number
 # of cores available. Setting up a compiler with openmp support can be difficult on 
 # some platforms, hence this option.
-openmp=True
+openmp=False
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -450,9 +450,21 @@ def prod(k1,k2,tensor=False):
 def symmetric(k):
    """
    Construct a symmetric kernel from an existing kernel
+
+    The symmetric kernel works by adding two GP functions together, and computing the overall covariance.
+
+    Let f ~ GP(x | 0, k(x, x')). Now let g = f(x) + f(-x).
+
+    It's easy to see that g is a symmetric function: g(x) = g(-x).
+
+    by construction, g, is a gaussian Process with mean 0 and covariance
+
+    k(x, x') + k(-x, x') + k(x, -x') + k(-x, -x')
+
+    This constructor builds a covariance function of this form from the initial kernel
    """
    k_ = k.copy()
-    k_.parts = [symmetric.Symmetric(p) for p in k.parts]
+    k_.parts = [parts.symmetric.Symmetric(p) for p in k.parts]
    return k_

 def coregionalize(output_dim,rank=1, W=None, kappa=None):
@ -604,4 +616,4 @@ def ODE_UY(input_dim=2, varianceU=1.,  varianceY=1., lengthscaleU=None,  lengths
    :rtype: kernel object
    """
    part = parts.ODE_UY.ODE_UY(input_dim, varianceU, varianceY, lengthscaleU, lengthscaleY)
-    return kern(input_dim, [part])
+    return kern(input_dim, [part])
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -440,67 +440,123 @@ class kern(Parameterized):

    def psi2(self, Z, mu, S):
        """
-        Computer the psi2 statistics for the covariance function.
-        
-        :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
-        :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
-        :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
-
+        :param Z: np.ndarray of inducing inputs (M x Q)
+        :param mu, S: np.ndarrays of means and variances (each N x Q)
+        :returns psi2: np.ndarray (N,M,M)
        """
        target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
        [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]

        # compute the "cross" terms
        # TODO: input_slices needed
-        crossterms = 0
+        from parts.white import White
+        from parts.rbf import RBF
+        from parts.rbf_inv import RBFInv
+        from parts.bias import Bias
+        from parts.linear import Linear
+        from parts.fixed import Fixed

-        for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self.parts, self.input_slices), 2):
-            if i_s1 == i_s2:
-                # TODO psi1 this must be faster/better/precached/more nice
-                tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
-                p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1)
-                tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
-                p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2)
-    
-                prod = np.multiply(tmp1, tmp2)
-                crossterms += prod[:, :, None] + prod[:, None, :]
-
-        # target += crossterms
-        return target + crossterms
+        for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self.parts, self.param_slices), 2):
+            # white doesn;t combine with anything
+            if isinstance(p1, White) or isinstance(p2, White):
+                pass
+            # rbf X bias
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
+                target += 2 * p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
+                tmp1 = p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
+                renorm = p1.variance*np.exp()
+                target += p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
+            # linear X bias
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
+                tmp = np.zeros((mu.shape[0], Z.shape[0]))
+                p2.psi1(Z, mu, S, tmp)
+                target += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
+                tmp = np.zeros((mu.shape[0], Z.shape[0]))
+                p1.psi1(Z, mu, S, tmp)
+                target += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
+            # rbf X any
+            elif isinstance(p1, (RBF, RBFInv)):
+                pass
+            elif isinstance(p2, (RBF, RBFInv)):
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"
+        return target

    def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
-        """Gradient of the psi2 statistics with respect to the parameters."""
        target = np.zeros(self.num_params)
        [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]

        # compute the "cross" terms
        # TODO: better looping, input_slices
-        for i1, i2 in itertools.permutations(range(len(self.parts)), 2):
+        for i1, i2 in itertools.combinations(range(len(self.parts)), 2):
            p1, p2 = self.parts[i1], self.parts[i2]
 #             ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
            ps1, ps2 = self.param_slices[i1], self.param_slices[i2]

-            tmp = np.zeros((mu.shape[0], Z.shape[0]))
-            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2])
+            # white doesn;t combine with anything
+            if p1.name == 'white' or p2.name == 'white':
+                pass
+            # rbf X bias
+            elif p1.name == 'bias' and p2.name == 'rbf':
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2])
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2._psi1 * 2., Z, mu, S, target[ps1])
+            elif p2.name == 'bias' and p1.name == 'rbf':
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1._psi1 * 2., Z, mu, S, target[ps2])
+            # linear X bias
+            elif p1.name == 'bias' and p2.name == 'linear':
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2]) # [ps1])
+                psi1 = np.zeros((mu.shape[0], Z.shape[0]))
+                p2.psi1(Z, mu, S, psi1)
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps1])
+            elif p2.name == 'bias' and p1.name == 'linear':
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
+                psi1 = np.zeros((mu.shape[0], Z.shape[0]))
+                p1.psi1(Z, mu, S, psi1)
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps2])
+            # rbf X any
+            
+            elif p1.name == 'linear' and p2.name == 'rbf':
+                raise NotImplementedError # TODO
+            elif p2.name == 'linear' and p1.name == 'rbf':
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"

        return self._transform_gradients(target)

    def dpsi2_dZ(self, dL_dpsi2, Z, mu, S):
        target = np.zeros_like(Z)
        [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
-        # target *= 2

        # compute the "cross" terms
        # TODO: we need input_slices here.
-        for p1, p2 in itertools.permutations(self.parts, 2):
-            if p1.name == 'linear' and p2.name == 'linear':
-                raise NotImplementedError("We don't handle linear/linear cross-terms")
-            tmp = np.zeros((mu.shape[0], Z.shape[0]))
-            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target)
+        for p1, p2 in itertools.combinations(self.parts, 2):
+            # white doesn;t combine with anything
+            if p1.name == 'white' or p2.name == 'white':
+                pass
+            # rbf X bias
+            elif p1.name == 'bias' and p2.name == 'rbf':
+                p2.dpsi1_dX(dL_dpsi2.sum(1).T * p1.variance, Z, mu, S, target)
+            elif p2.name == 'bias' and p1.name == 'rbf':
+                p1.dpsi1_dZ(dL_dpsi2.sum(1).T * p2.variance, Z, mu, S, target)
+            # linear X bias
+            elif p1.name == 'bias' and p2.name == 'linear':
+                p2.dpsi1_dZ(dL_dpsi2.sum(1).T * p1.variance, Z, mu, S, target)
+            elif p2.name == 'bias' and p1.name == 'linear':
+                p1.dpsi1_dZ(dL_dpsi2.sum(1).T * p2.variance, Z, mu, S, target)
+            # rbf X linear
+            elif p1.name == 'linear' and p2.name == 'rbf':
+                raise NotImplementedError # TODO
+            elif p2.name == 'linear' and p1.name == 'rbf':
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"

-        return target * 2
+        return target * 2.

    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S):
        target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
@ -508,16 +564,29 @@ class kern(Parameterized):

        # compute the "cross" terms
        # TODO: we need input_slices here.
-        for p1, p2 in itertools.permutations(self.parts, 2):
-            if p1.name == 'linear' and p2.name == 'linear':
-                raise NotImplementedError("We don't handle linear/linear cross-terms")
-
-            tmp = np.zeros((mu.shape[0], Z.shape[0]))
-            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S)
+        for p1, p2 in itertools.combinations(self.parts, 2):
+            # white doesn;t combine with anything
+            if p1.name == 'white' or p2.name == 'white':
+                pass
+            # rbf X bias
+            elif p1.name == 'bias' and p2.name == 'rbf':
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T * p1.variance * 2., Z, mu, S, target_mu, target_S)
+            elif p2.name == 'bias' and p1.name == 'rbf':
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T * p2.variance * 2., Z, mu, S, target_mu, target_S)
+            # linear X bias
+            elif p1.name == 'bias' and p2.name == 'linear':
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T * p1.variance * 2., Z, mu, S, target_mu, target_S)
+            elif p2.name == 'bias' and p1.name == 'linear':
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T * p2.variance * 2., Z, mu, S, target_mu, target_S)
+            # rbf X linear
+            elif p1.name == 'linear' and p2.name == 'rbf':
+                raise NotImplementedError # TODO
+            elif p2.name == 'linear' and p1.name == 'rbf':
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"

        return target_mu, target_S
-
    def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
        if which_parts == 'all':
            which_parts = [True] * self.num_parts
--- a/GPy/kern/parts/hetero.py
+++ b/GPy/kern/parts/hetero.py
@ -1,7 +1,6 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from IPython.core.debugger import Tracer; debug_here=Tracer()
 from kernpart import Kernpart
 import numpy as np
 from ...util.linalg import tdot
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@ -19,7 +19,10 @@ class Prod(Kernpart):
    """
    def __init__(self,k1,k2,tensor=False):
        self.num_params = k1.num_params + k2.num_params
-        self.name = '['+k1.name + '**' + k2.name +']'
+        if tensor:
+            self.name = '['+k1.name + '**' + k2.name +']'
+        else:
+            self.name = '['+k1.name + '*' + k2.name +']'
        self.k1 = k1
        self.k2 = k2
        if tensor:
@ -130,3 +133,13 @@ class Prod(Kernpart):
                self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1)
                self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2)

+    #def __getstate__(self):
+        #return [self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params]
+
+    #def __setstate__(self, state):
+        #self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params = state
+        #self._X, self._X2, self._params = np.empty(shape=(3,1))
+
+
+
+
--- a/GPy/kern/parts/symmetric.py
+++ b/GPy/kern/parts/symmetric.py
@ -56,7 +56,7 @@ class Symmetric(Kernpart):
        AX = np.dot(X,self.transform)
        if X2 is None:
            X2 = X
-            ZX2 = AX
+            AX2 = AX
        else:
            AX2 = np.dot(X2, self.transform)
        self.k.dK_dtheta(dL_dK,X,X2,target)
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
@ -1,7 +1,8 @@
 #include <math.h>
 #include <float.h>
 #include <stdlib.h>
-
+#include <iostream>
+#include <stdexcept>
 double DiracDelta(double x){
  // TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
    if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
@ -14,6 +15,7 @@ double DiracDelta(double x,int foo){
 };

 double sinc(double x){
+  // compute the sinc function
  if (x==0)
    return 1.0;
  else 
@ -21,6 +23,7 @@ double sinc(double x){
 }

 double sinc_grad(double x){
+  // compute the gradient of the sinc function.
  if (x==0)
    return 0.0;
  else 
@ -28,6 +31,7 @@ double sinc_grad(double x){
 }

 double erfcx(double x){
+  // compute the scaled complex error function.
  double xneg=-sqrt(log(DBL_MAX/2));
  double xmax = 1/(sqrt(M_PI)*DBL_MIN);
  xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
@ -50,12 +54,108 @@ double erfcx(double x){
 }

 double ln_diff_erf(double x0, double x1){
+  // stably compute the log of difference between two erfs.
+  if (x1>x0)
+    throw std::runtime_error("Error: second argument must be smaller than first in ln_diff_err");
+  return log(erf(x0) - erf(x1));
  if (x0==x1)
-    return INFINITY;
+    return -INFINITY;
  else if(x0<0 && x1>0 || x0>0 && x1<0)
    return log(erf(x0)-erf(x1));
  else if(x1>0)
-    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1)- x0*x0)-x1*x1;
+    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1- x0*x0))-x1*x1;
  else 
    return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
 }
+
+double h(double t, double tprime, double d_i, double d_j, double l){
+  // Compute the h function for the sim covariance.
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  
+  return sign_val*exp(half_l_di*half_l_di - d_i*(t-tprime) + ln_part_1 - log(d_i + d_j)) - sign_val*exp(half_l_di*half_l_di - d_i*t - d_j*tprime + ln_part_2 - log(d_i + d_j));
+}
+
+double dh_dl(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to lengthscale for sim covariance
+  // TODO a lot of energy wasted recomputing things here, need to do this in a shared way somehow ... perhaps needs rewrite of sympykern.
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  double diff_t = t - tprime;
+  double l2 = l*l;
+  double hv = h(t, tprime, d_i, d_j, l);
+  return 0.5*d_i*d_i*l*hv + 2/(sqrt(M_PI)*(d_i+d_j))*((-diff_t/l2-d_i/2)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2)*exp(-t*t/l2-d_j*tprime)-d_i/2*exp(-(d_i*t+d_j*tprime)));
+}
+
+double dh_dd_i(double t, double tprime, double d_i, double d_j, double l){
+  double diff_t = (t-tprime);
+  double l2 = l*l;
+  double hv = h(t, tprime, d_i, d_j, l);
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_1 = half_l_di;
+  arg_2 = half_l_di - t/l;
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
+
+  double base = ((0.5*d_i*l2*(d_i+d_j)-1)*hv 
+		 + (-diff_t*sign_val*exp(half_l_di*half_l_di
+					 -d_i*diff_t
+					 +ln_part_1)
+		    +t*sign_val*exp(half_l_di*half_l_di
+				    -d_i*t-d_j*tprime
+				    +ln_part_2))
+		 + l/sqrt(M_PI)*(-exp(-diff_t*diff_t/l2)
+			       +exp(-tprime*tprime/l2-d_i*t)
+			       +exp(-t*t/l2-d_j*tprime)
+			       -exp(-(d_i*t + d_j*tprime))));
+  return base/(d_i+d_j);
+}
+
+double dh_dd_j(double t, double tprime, double d_i, double d_j, double l){
+  double diff_t = (t-tprime);
+  double l2 = l*l;
+  double half_l_di = 0.5*l*d_i;
+  double hv = h(t, tprime, d_i, d_j, l);
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_1 = half_l_di;
+  arg_2 = half_l_di - t/l;
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
+  double base = tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2)-hv;
+  return base/(d_i+d_j);
+}
+
+
+double dh_dt(double t, double tprime, double d_i, double d_j, double l){
+  return 0.0;
+}
+
+double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
+  return 0.0;
+}
--- a/GPy/kern/parts/sympy_helpers.h
+++ b/GPy/kern/parts/sympy_helpers.h
@ -7,3 +7,10 @@ double sinc_grad(double x);

 double erfcx(double x);
 double ln_diff_erf(double x0, double x1);
+
+double h(double t, double tprime, double d_i, double d_j, double l);
+double dh_dl(double t, double tprime, double d_i, double d_j, double l);
+double dh_dd_i(double t, double tprime, double d_i, double d_j, double l);
+double dh_dd_j(double t, double tprime, double d_i, double d_j, double l);
+double dh_dt(double t, double tprime, double d_i, double d_j, double l);
+double dh_dtprime(double t, double tprime, double d_i, double d_j, double l);
--- a/GPy/likelihoods/init.py
+++ b/GPy/likelihoods/init.py
@ -1,7 +1,7 @@
 from ep import EP
+from laplace import Laplace
 from ep_mixed_noise import EP_Mixed_Noise
 from gaussian import Gaussian
 from gaussian_mixed_noise import Gaussian_Mixed_Noise
+import noise_models
 from noise_model_constructors import *
-# TODO: from Laplace import Laplace
-
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@ -19,7 +19,6 @@ class EP(likelihood):
        self.num_data, self.output_dim = self.data.shape
        self.is_heteroscedastic = True
        self.num_params = 0
-        self._transf_data = self.noise_model._preprocess_values(data)

        #Initial values - Likelihood approximation parameters:
        #p(y|f) = t(f|tau_tilde,v_tilde)
@ -50,10 +49,26 @@ class EP(likelihood):
        self.VVT_factor = self.V
        self.trYYT = 0.

-    def predictive_values(self,mu,var,full_cov):
+    def predictive_values(self,mu,var,full_cov,**noise_args):
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var)
+        return self.noise_model.predictive_values(mu,var,**noise_args)
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)

    def _get_params(self):
        #return np.zeros(0)
@ -134,7 +149,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
@ -233,7 +248,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@ -336,7 +351,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -69,7 +69,7 @@ class Gaussian(likelihood):
            self.covariance_matrix = np.eye(self.N) * x
            self._variance = x

-    def predictive_values(self, mu, var, full_cov):
+    def predictive_values(self, mu, var, full_cov, **likelihood_args):
        """
        Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval
        """
@ -90,11 +90,25 @@ class Gaussian(likelihood):
            _95pc = mean + 2.*np.sqrt(true_var)
        return mean, true_var, _5pc, _95pc

-    def fit_full(self):
+    def log_predictive_density(self, y_test, mu_star, var_star):
        """
-        No approximations needed
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+
+        .. Note:
+            Works as if each test point was provided individually, i.e. not full_cov
        """
-        pass
+        y_rescaled = (y_test - self._offset)/self._scale
+        return -0.5*np.log(2*np.pi) -0.5*np.log(var_star + self._variance) -0.5*(np.square(y_rescaled - mu_star))/(var_star + self._variance)

    def _gradients(self, partial):
        return np.sum(partial)
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@ -0,0 +1,390 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+#
+#Parts of this file were influenced by the Matlab GPML framework written by
+#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
+#
+#The GPML code is released under the FreeBSD License.
+#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
+#
+#The code and associated documentation is available from
+#http://gaussianprocess.org/gpml/code.
+
+import numpy as np
+import scipy as sp
+from likelihood import likelihood
+from ..util.linalg import mdot, jitchol, pddet, dpotrs
+from functools import partial as partial_func
+
+class Laplace(likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self, data, noise_model, extra_data=None):
+        """
+        Laplace Approximation
+
+        Find the moments \hat{f} and the hessian at this point
+        (using Newton-Raphson) of the unnormalised posterior
+
+        Compute the GP variables (i.e. generate some Y^{squiggle} and
+        z^{squiggle} which makes a gaussian the same as the laplace
+        approximation to the posterior, but normalised
+
+        Arguments
+        ---------
+
+        :param data: array of data the likelihood function is approximating
+        :type data: NxD
+        :param noise_model: likelihood function - subclass of noise_model
+        :type noise_model: noise_model
+        :param extra_data: additional data used by some likelihood functions,
+        """
+        self.data = data
+        self.noise_model = noise_model
+        self.extra_data = extra_data
+
+        #Inital values
+        self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
+        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
+
+        self.restart()
+        likelihood.__init__(self)
+
+    def restart(self):
+        """
+        Reset likelihood variables to their defaults
+        """
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N, 1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:, None]
+        self.Z = 0
+        self.YYT = None
+
+        self.old_Ki_f = None
+
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions\
+                    with an Laplace likelihood")
+        return self.noise_model.predictive_values(mu, var)
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
+
+    def _get_params(self):
+        return np.asarray(self.noise_model._get_params())
+
+    def _get_param_names(self):
+        return self.noise_model._get_param_names()
+
+    def _set_params(self, p):
+        return self.noise_model._set_params(p)
+
+    def _shared_gradients_components(self):
+        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
+        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
+        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
+        return dL_dfhat, I_KW_i
+
+    def _Kgradients(self):
+        """
+        Gradients with respect to prior kernel parameters dL_dK to be chained
+        with dK_dthetaK to give dL_dthetaK
+        :returns: dL_dK matrix
+        :rtype: Matrix (1 x num_kernel_params)
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #Explicit
+        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
+        #expl_b = self.Wi_K_i
+        #expl = 0.5*expl_a - 0.5*expl_b
+        #dL_dthetaK_exp = dK_dthetaK(expl, X)
+
+        #Implicit
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
+
+        #No longer required as we are computing these in the gp already
+        #otherwise we would take them away and add them back
+        #dL_dthetaK_imp = dK_dthetaK(impl, X)
+        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
+        #dL_dK = expl + impl
+
+        #No need to compute explicit as we are computing dZ_dK to account
+        #for the difference between the K gradients of a normal GP,
+        #and the K gradients including the implicit part
+        dL_dK = impl
+        return dL_dK
+
+    def _gradients(self, partial):
+        """
+        Gradients with respect to likelihood parameters (dL_dthetaL)
+
+        :param partial: Not needed by this likelihood
+        :type partial: lambda function
+        :rtype: array of derivatives (1 x num_likelihood_params)
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #len(dlik_dthetaL)
+        num_params = len(self._get_param_names())
+        # make space for one derivative for each likelihood parameter
+        dL_dthetaL = np.zeros(num_params)
+        for thetaL_i in range(num_params):
+            #Explicit
+            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
+                             #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
+                             + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
+                             )
+
+            #Implicit
+            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
+            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+
+        return dL_dthetaL
+
+    def _compute_GP_variables(self):
+        """
+        Generate data Y which would give the normal distribution identical
+        to the laplace approximation to the posterior, but normalised
+
+        GPy expects a likelihood to be gaussian, so need to caluclate
+        the data Y^{\tilde} that makes the posterior match that found
+        by a laplace approximation to a non-gaussian likelihood but with
+        a gaussian likelihood
+
+        Firstly,
+        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
+        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
+        we wish to find the hessian \Sigma^{\tilde}
+        that has the same curvature but using our new simulated data Y^{\tilde}
+        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
+        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
+        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
+
+        Secondly,
+        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
+        So we can suck up any differences between that and our log marginal likelihood approximation
+        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
+        which we want to optimize instead, by equating them and rearranging, the difference is added onto
+        the log p(y) that GPy optimizes by default
+
+        Thirdly,
+        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
+        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
+        gp.py code
+        """
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
+
+        Y_tilde = Wi*self.Ki_f + self.f_hat
+
+        self.Wi_K_i = self.W12BiW12
+        self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        self.lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+
+        Z_tilde = (+ self.lik
+                   - 0.5*self.ln_B_det
+                   + 0.5*self.ln_det_Wi_K
+                   - 0.5*self.f_Ki_f
+                   + 0.5*self.y_Wi_Ki_i_y
+                  )
+
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
+
+        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
+        self.dZ_dK = self._Kgradients()
+        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm, find K and expand hessian
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
+
+        :param K: Prior covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        """
+        self.K = K.copy()
+
+        #Find mode
+        self.f_hat = self.rasm_mode(self.K)
+
+        #Compute hessian and other variables at mode
+        self._compute_likelihood_variables()
+
+        #Compute fake variables replicating laplace approximation to posterior
+        self._compute_GP_variables()
+
+    def _compute_likelihood_variables(self):
+        """
+        Compute the variables required to compute gaussian Y variables
+        """
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
+
+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
+
+        self.Ki_f = self.Ki_f
+        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
+        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
+
+    def _compute_B_statistics(self, K, W, a):
+        """
+        Rasmussen suggests the use of a numerically stable positive definite matrix B
+        Which has a positive diagonal element and can be easyily inverted
+
+        :param K: Prior Covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param W: Negative hessian at a point (diagonal matrix)
+        :type W: Vector of diagonal values of hessian (1xN)
+        :param a: Matrix to calculate W12BiW12a
+        :type a: Matrix NxN
+        :returns: (W12BiW12, ln_B_det)
+        """
+        if not self.noise_model.log_concave:
+            #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
+            W[W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                # To cause the posterior to become less certain than the prior and likelihood,
+                                # This is a property only held by non-log-concave likelihoods
+
+
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(self.N) + W_12*K*W_12.T
+        L = jitchol(B)
+
+        W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
+        ln_B_det = 2*np.sum(np.log(np.diag(L)))
+        return W12BiW12, ln_B_det
+
+    def rasm_mode(self, K, MAX_ITER=30):
+        """
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+        Influenced by GPML (BSD) code, all errors are our own
+
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
+        :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :type MAX_ITER: scalar
+        :returns: f_hat, mode on which to make laplace approxmiation
+        :rtype: NxD matrix
+        """
+        #old_Ki_f = np.zeros((self.N, 1))
+
+        #Start f's at zero originally
+        if self.old_Ki_f is None:
+            old_Ki_f = np.zeros((self.N, 1))
+            f = np.dot(K, old_Ki_f)
+        else:
+            #Start at the old best point
+            old_Ki_f = self.old_Ki_f.copy()
+            f = self.f_hat.copy()
+
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
+
+        difference = np.inf
+        epsilon = 1e-5
+        #step_size = 1
+        #rs = 0
+        i = 0
+
+        while difference > epsilon and i < MAX_ITER:
+            W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)
+
+            W_f = W*f
+            grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)
+
+            b = W_f + grad
+            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb
+            dKi_f = full_step_Ki_f - old_Ki_f
+
+            f_old = f.copy()
+            def inner_obj(step_size, old_Ki_f, dKi_f, K):
+                Ki_f = old_Ki_f + step_size*dKi_f
+                f = np.dot(K, Ki_f)
+                # This is nasty, need to set something within an optimization though
+                self.tmp_Ki_f = Ki_f.copy()
+                self.tmp_f = f.copy()
+                return -obj(Ki_f, f)
+
+            i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
+            #Find the stepsize that minimizes the objective function using a brent line search
+            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
+            #steps than get this exact then make a step, if B was bigger it might be the other way around though
+            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
+            f = self.tmp_f.copy()
+            Ki_f = self.tmp_Ki_f.copy()
+
+            #Optimize without linesearch
+            #f_old = f.copy()
+            #update_passed = False
+            #while not update_passed:
+                #Ki_f = old_Ki_f + step_size*dKi_f
+                #f = np.dot(K, Ki_f)
+
+                #old_obj = new_obj
+                #new_obj = obj(Ki_f, f)
+                #difference = new_obj - old_obj
+                ##print "difference: ",difference
+                #if difference < 0:
+                    ##print "Objective function rose", np.float(difference)
+                    ##If the objective function isn't rising, restart optimization
+                    #step_size *= 0.8
+                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    ##objective function isn't increasing, try reducing step size
+                    #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    #old_obj = new_obj
+                    #rs += 1
+                #else:
+                    #update_passed = True
+
+            #old_Ki_f = self.Ki_f.copy()
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            #difference = np.abs(np.sum(f - f_old))
+            difference = np.abs(np.sum(Ki_f - old_Ki_f))
+            old_Ki_f = Ki_f.copy()
+            i += 1
+
+        self.old_Ki_f = old_Ki_f.copy()
+        if difference > epsilon:
+            print "Not perfect f_hat fit difference: {}".format(difference)
+
+        self.Ki_f = Ki_f
+        return f
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -23,6 +23,7 @@ class likelihood(Parameterized):
    """
    def __init__(self):
        Parameterized.__init__(self)
+        self.dZ_dK = 0

    def _get_params(self):
        raise NotImplementedError
@ -33,11 +34,36 @@ class likelihood(Parameterized):
    def _set_params(self, x):
        raise NotImplementedError

-    def fit(self):
-        raise NotImplementedError
+    def fit_full(self, K):
+        """
+        No approximations needed by default
+        """
+        pass
+
+    def restart(self):
+        """
+        No need to restart if not an approximation
+        """
+        pass

    def _gradients(self, partial):
        raise NotImplementedError

    def predictive_values(self, mu, var):
        raise NotImplementedError
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        raise NotImplementedError
--- a/GPy/likelihoods/noise_model_constructors.py
+++ b/GPy/likelihoods/noise_model_constructors.py
@ -4,9 +4,9 @@
 import numpy as np
 import noise_models

-def binomial(gp_link=None):
+def bernoulli(gp_link=None):
    """
-    Construct a binomial likelihood
+    Construct a bernoulli likelihood

    :param gp_link: a GPy gp_link function
    """
@ -27,16 +27,17 @@ def binomial(gp_link=None):
        analytical_mean = False
        analytical_variance = False

-    return noise_models.binomial_noise.Binomial(gp_link,analytical_mean,analytical_variance)
+    return noise_models.bernoulli_noise.Bernoulli(gp_link,analytical_mean,analytical_variance)

 def exponential(gp_link=None):
+
    """
-    Construct a binomial likelihood
+    Construct a exponential likelihood

    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
+        gp_link = noise_models.gp_transformations.Log_ex_1()

    analytical_mean = False
    analytical_variance = False
@ -85,4 +86,36 @@ def gamma(gp_link=None,beta=1.):
    analytical_variance = False
    return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta)

+def gaussian(gp_link=None, variance=2, D=None, N=None):
+    """
+    Construct a Gaussian likelihood

+    :param gp_link: a GPy gp_link function
+    :param variance: variance
+    :type variance: scalar
+    :returns: Gaussian noise model:
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Identity()
+    analytical_mean = True
+    analytical_variance = True # ?
+    return noise_models.gaussian_noise.Gaussian(gp_link, analytical_mean,
+            analytical_variance, variance=variance, D=D, N=N)
+
+def student_t(gp_link=None, deg_free=5, sigma2=2):
+    """
+    Construct a Student t likelihood
+
+    :param gp_link: a GPy gp_link function
+    :param deg_free: degrees of freedom of student-t
+    :type deg_free: scalar
+    :param sigma2: variance
+    :type sigma2: scalar
+    :returns: Student-T noise model
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Identity()
+    analytical_mean = True
+    analytical_variance = True
+    return noise_models.student_t_noise.StudentT(gp_link, analytical_mean,
+            analytical_variance,deg_free, sigma2)
--- a/GPy/likelihoods/noise_models/init.py
+++ b/GPy/likelihoods/noise_models/init.py
@ -1,7 +1,8 @@
 import noise_distributions
-import binomial_noise
+import bernoulli_noise
 import exponential_noise
 import gaussian_noise
 import gamma_noise
 import poisson_noise
+import student_t_noise
 import gp_transformations
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@ -0,0 +1,220 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import stats,special
+import scipy as sp
+from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import gp_transformations
+from noise_distributions import NoiseDistribution
+
+class Bernoulli(NoiseDistribution):
+    """
+    Bernoulli likelihood
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+    .. Note::
+        Y is expected to take values in {-1,1}
+        Probit likelihood usually used
+    """
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
+        super(Bernoulli, self).__init__(gp_link,analytical_mean,analytical_variance)
+
+    def _preprocess_values(self,Y):
+        """
+        Check if the values of the observations correspond to the values
+        assumed by the likelihood function.
+
+        ..Note:: Binary classification algorithm works better with classes {-1,1}
+        """
+        Y_prep = Y.copy()
+        Y1 = Y[Y.flatten()==1].size
+        Y2 = Y[Y.flatten()==0].size
+        assert Y1 + Y2 == Y.size, 'Bernoulli likelihood is meant to be used only with outputs in {0,1}.'
+        Y_prep[Y.flatten() == 0] = -1
+        return Y_prep
+
+    def _moments_match_analytical(self,data_i,tau_i,v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param i: number of observation (int)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        if data_i == 1:
+            sign = 1.
+        elif data_i == 0:
+            sign = -1
+        else:
+            raise ValueError("bad value for Bernouilli observation (0,1)")
+        if isinstance(self.gp_link,gp_transformations.Probit):
+            z = sign*v_i/np.sqrt(tau_i**2 + tau_i)
+            Z_hat = std_norm_cdf(z)
+            phi = std_norm_pdf(z)
+            mu_hat = v_i/tau_i + sign*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
+            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
+
+        elif isinstance(self.gp_link,gp_transformations.Heaviside):
+            a = sign*v_i/np.sqrt(tau_i)
+            Z_hat = std_norm_cdf(a)
+            N = std_norm_pdf(a)
+            mu_hat = v_i/tau_i + sign*N/Z_hat/np.sqrt(tau_i)
+            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
+            if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
+                stop
+        else:
+            raise ValueError("Exact moment matching not available for link {}".format(self.gp_link.gp_transformations.__name__))
+
+        return Z_hat, mu_hat, sigma2_hat
+
+    def _predictive_mean_analytical(self,mu,variance):
+
+        if isinstance(self.gp_link,gp_transformations.Probit):
+            return stats.norm.cdf(mu/np.sqrt(1+variance))
+
+        elif isinstance(self.gp_link,gp_transformations.Heaviside):
+            return stats.norm.cdf(mu/np.sqrt(variance))
+
+        else:
+            raise NotImplementedError
+
+    def _predictive_variance_analytical(self,mu,variance, pred_mean):
+
+        if isinstance(self.gp_link,gp_transformations.Heaviside):
+            return 0.
+        else:
+            raise NotImplementedError
+
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        .. Note:
+            Each y_i must be in {0,1}
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        objective = (link_f**y) * ((1.-link_f)**(1.-y))
+        return np.exp(np.sum(np.log(objective)))
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-f_{i})
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: log likelihood evaluated at points link(f)
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        #objective = y*np.log(link_f) + (1.-y)*np.log(link_f)
+        objective = np.where(y==1, np.log(link_f), np.log(1-link_f))
+        return np.sum(objective)
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the pdf at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: gradient of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = (y/link_f) - (1.-y)/(1-link_f)
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
+        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
+
+
+        .. math::
+            \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2)
+        return d2logpdf_dlink2
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i}}{(1-\\lambda(f))^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: third derivative of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3))
+        return d3logpdf_dlink3
+
+    def _mean(self,gp):
+        """
+        Mass (or density) function
+        """
+        return self.gp_link.transf(gp)
+
+    def _variance(self,gp):
+        """
+        Mass (or density) function
+        """
+        p = self.gp_link.transf(gp)
+        return p*(1.-p)
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        ns = np.ones_like(gp, dtype=int)
+        Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/binomial_noise.py
+++ b/GPy/likelihoods/noise_models/binomial_noise.py
@ -1,132 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Binomial(NoiseDistribution):
-    """
-    Probit likelihood
-    Y is expected to take values in {-1,1}
-    -----
-    $$
-    L(x) = \\Phi (Y_i*f_i)
-    $$
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Binomial, self).__init__(gp_link,analytical_mean,analytical_variance)
-
-    def _preprocess_values(self,Y):
-        """
-        Check if the values of the observations correspond to the values
-        assumed by the likelihood function.
-
-        ..Note:: Binary classification algorithm works better with classes {-1,1}
-        """
-        Y_prep = Y.copy()
-        Y1 = Y[Y.flatten()==1].size
-        Y2 = Y[Y.flatten()==0].size
-        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
-        Y_prep[Y.flatten() == 0] = -1
-        return Y_prep
-
-    def _moments_match_analytical(self,data_i,tau_i,v_i):
-        """
-        Moments match of the marginal approximation in EP algorithm
-
-        :param i: number of observation (int)
-        :param tau_i: precision of the cavity distribution (float)
-        :param v_i: mean/variance of the cavity distribution (float)
-        """
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
-            Z_hat = std_norm_cdf(z)
-            phi = std_norm_pdf(z)
-            mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
-            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
-
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            a = data_i*v_i/np.sqrt(tau_i)
-            Z_hat = std_norm_cdf(a)
-            N = std_norm_pdf(a)
-            mu_hat = v_i/tau_i + data_i*N/Z_hat/np.sqrt(tau_i)
-            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
-            if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
-                stop
-
-        return Z_hat, mu_hat, sigma2_hat
-
-    def _predictive_mean_analytical(self,mu,sigma):
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            return stats.norm.cdf(mu/sigma)
-        else:
-            raise NotImplementedError
-
-    def _predictive_variance_analytical(self,mu,sigma, pred_mean):
-        if isinstance(self.gp_link,gp_transformations.Heaviside):
-            return 0.
-        else:
-            raise NotImplementedError
-
-    def _mass(self,gp,obs):
-        #NOTE obs must be in {0,1}
-        p = self.gp_link.transf(gp)
-        return p**obs * (1.-p)**(1.-obs)
-
-    def _nlog_mass(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return obs*np.log(p) + (1.-obs)*np.log(1-p)
-
-    def _dnlog_mass_dgp(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        dp = self.gp_link.dtransf_df(gp)
-        return obs/p * dp - (1.-obs)/(1.-p) * dp
-
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return (obs/p + (1.-obs)/(1.-p))*self.gp_link.d2transf_df2(gp) + ((1.-obs)/(1.-p)**2-obs/p**2)*self.gp_link.dtransf_df(gp)
-
-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        p = self.gp_link.transf(gp)
-        return p*(1.-p)
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)*(1. - 2.*self.gp_link.transf(gp))
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
-
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param size: number of samples to compute
-        :param gp: latent variable
-        """
-        orig_shape = gp.shape
-        gp = gp.flatten()
-        Ysim = np.array([np.random.binomial(1,self.gp_link.transf(gpj),size=1) for gpj in gp])
-        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@ -24,24 +24,113 @@ class Exponential(NoiseDistribution):
    def _preprocess_values(self,Y):
        return Y

-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
        """
-        Mass (or density) function
-        """
-        return np.exp(-obs/self.gp_link.transf(gp))/self.gp_link.transf(gp)
+        Likelihood function given link(f)

-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        return obs/self.gp_link.transf(gp) + np.log(self.gp_link.transf(gp))
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})\\exp (-y\\lambda(f_{i}))

-    def _dnlog_mass_dgp(self,gp,obs):
-        return ( 1./self.gp_link.transf(gp) - obs/self.gp_link.transf(gp)**2) * self.gp_link.dtransf_df(gp)
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        log_objective = link_f*np.exp(-y*link_f)
+        return np.exp(np.sum(np.log(log_objective)))
+        #return np.exp(np.sum(-y/link_f - np.log(link_f) ))

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        fgp = self.gp_link.transf(gp)
-        return (2*obs/fgp**3 - 1./fgp**2) * self.gp_link.dtransf_df(gp)**2 + ( 1./fgp - obs/fgp**2) * self.gp_link.d2transf_df2(gp)
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\lambda(f_{i}) - y_{i}\\lambda(f_{i})
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        log_objective = np.log(link_f) - y*link_f
+        #logpdf_link = np.sum(-np.log(link_f) - y/link_f)
+        return np.sum(log_objective)
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\lambda(f)} - y_{i}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = 1./link_f - y
+        #grad = y/(link_f**2) - 1./link_f
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\frac{1}{\\lambda(f_{i})^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -1./(link_f**2)
+        #hess = -2*y/(link_f**3) + 1/(link_f**2)
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2}{\\lambda(f_{i})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = 2./(link_f**3)
+        #d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
+        return d3lik_dlink3

    def _mean(self,gp):
        """
@ -49,20 +138,19 @@ class Exponential(NoiseDistribution):
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)**2

-    def _dvariance_dgp(self,gp):
-        return 2*self.gp_link.transf(gp)*self.gp_link.dtransf_df(gp)
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.

-    def _d2variance_dgp2(self,gp):
-        return 2 * (self.gp_link.dtransf_df(gp)**2 + self.gp_link.transf(gp)*self.gp_link.d2transf_df2(gp))
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.random.exponential(1.0/self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@ -12,11 +12,11 @@ from noise_distributions import NoiseDistribution
 class Gamma(NoiseDistribution):
    """
    Gamma likelihood
-    Y is expected to take values in {0,1,2,...}
-    -----
-    $$
-    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
-    $$
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
+        \\alpha_{i} = \\beta y_{i}
+
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
        self.beta = beta
@ -25,26 +25,122 @@ class Gamma(NoiseDistribution):
    def _preprocess_values(self,Y):
        return Y

-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
        """
-        Mass (or density) function
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
-        alpha = self.gp_link.transf(gp)*self.beta
-        return obs**(alpha - 1.) * np.exp(-self.beta*obs) * self.beta**alpha / special.gamma(alpha)
+        alpha = link_f*self.beta
+        objective = (y**(alpha - 1.) * np.exp(-self.beta*y) * self.beta**alpha)/ special.gamma(alpha)
+        return np.exp(np.sum(np.log(objective)))

-    def _nlog_mass(self,gp,obs):
+    def logpdf_link(self, link_f, y, extra_data=None):
        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
        """
-        alpha = self.gp_link.transf(gp)*self.beta
-        return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        #alpha = self.gp_link.transf(gp)*self.beta
+        #return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
+        alpha = link_f*self.beta
+        log_objective = alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
+        return np.sum(log_objective)

-    def _dnlog_mass_dgp(self,gp,obs):
-        return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\beta (\\log \\beta y_{i}) - \\Psi(\\alpha_{i})\\beta\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = self.beta*np.log(self.beta*y) - special.psi(self.beta*link_f)*self.beta
+        #old
+        #return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -special.polygamma(1, self.beta*link_f)*(self.beta**2)
+        #old
+        #return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = -special.polygamma(2, self.beta*link_f)*(self.beta**3)
+        return d3lik_dlink3

    def _mean(self,gp):
        """
@ -52,20 +148,8 @@ class Gamma(NoiseDistribution):
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)/self.beta
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)/self.beta
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)/self.beta
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@ -12,11 +12,17 @@ class Gaussian(NoiseDistribution):
    """
    Gaussian likelihood

-    :param mean: mean value of the Gaussian distribution
-    :param variance: mean value of the Gaussian distribution
+    .. math::
+        \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+    :param variance: variance value of the Gaussian distribution
+    :param N: Number of data points
+    :type N: int
    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1.):
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1., D=None, N=None):
        self.variance = variance
+        self.N = N
+        self._set_params(np.asarray(variance))
        super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)

    def _get_params(self):
@ -25,8 +31,13 @@ class Gaussian(NoiseDistribution):
    def _get_param_names(self):
        return ['noise_model_variance']

-    def _set_params(self,p):
-        self.variance = p
+    def _set_params(self, p):
+        self.variance = float(p)
+        self.I = np.eye(self.N)
+        self.covariance_matrix = self.I * self.variance
+        self.Ki = self.I*(1.0 / self.variance)
+        #self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
+        self.ln_det_K = self.N*np.log(self.variance)

    def _gradients(self,partial):
        return np.zeros(1)
@ -57,42 +68,231 @@ class Gaussian(NoiseDistribution):
        new_sigma2 = self.predictive_variance(mu,sigma)
        return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)

-    def _predictive_variance_analytical(self,mu,sigma):
+    def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
        return 1./(1./self.variance + 1./sigma**2)

-    def _mass(self,gp,obs):
-        #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
-        return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance))
+    def _mass(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use pdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+    def _nlog_mass(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")

-    def _nlog_mass(self,gp,obs):
-        return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance))
+    def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use dlogpdf_df in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")

-    def _dnlog_mass_dgp(self,gp,obs):
-        return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
+    def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use d2logpdf_df2 in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        #Assumes no covariance, exp, sum, log for numerical stability
+        return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: log likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the pdf at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: gradient of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s2_i = (1.0/self.variance)
+        grad = s2_i*y - s2_i*link_f
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link_f, w.r.t link_f.
+        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
+
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        hess = -(1.0/self.variance)*np.ones((self.N, 1))
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = 0
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: third derivative of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None]
+        return d3logpdf_dlink3
+
+    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = -\\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
+        :rtype: float
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        e = y - link_f
+        s_4 = 1.0/(self.variance**2)
+        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.sum(np.square(e))
+        return np.sum(dlik_dsigma) # Sure about this sum?
+
+    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
+        """
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)}) = \\frac{1}{\\sigma^{4}}(-y_{i} + \\lambda(f_{i}))
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s_4 = 1.0/(self.variance**2)
+        dlik_grad_dsigma = -s_4*y + s_4*link_f
+        return dlik_grad_dsigma
+
+    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)}) = \\frac{1}{\\sigma^{4}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        s_4 = 1.0/(self.variance**2)
+        d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
+        return d2logpdf_dlink2_dvar
+
+    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
+        return np.asarray([[dlogpdf_dvar]])
+
+    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
+        return d2logpdf_dlink2_dvar

    def _mean(self,gp):
        """
-        Mass (or density) function
+        Expected value of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            E_{p(y|f)}[y]
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
-        Mass (or density) function
+        Variance of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            Var_{p(y|f)}[y]
        """
        return self.variance

-    def _dvariance_dgp(self,gp):
-        return 0
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.

-    def _d2variance_dgp2(self,gp):
-        return 0
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@ -24,19 +24,25 @@ class GPTransformation(object):
        """
        Gaussian process tranformation function, latent space -> output space
        """
-        pass
+        raise NotImplementedError

    def dtransf_df(self,f):
        """
        derivative of transf(f) w.r.t. f
        """
-        pass
+        raise NotImplementedError

    def d2transf_df2(self,f):
        """
        second derivative of transf(f) w.r.t. f
        """
-        pass
+        raise NotImplementedError
+
+    def d3transf_df3(self,f):
+        """
+        third derivative of transf(f) w.r.t. f
+        """
+        raise NotImplementedError

 class Identity(GPTransformation):
    """
@ -49,10 +55,13 @@ class Identity(GPTransformation):
        return f

    def dtransf_df(self,f):
-        return 1.
+        return np.ones_like(f)

    def d2transf_df2(self,f):
-        return 0
+        return np.zeros_like(f)
+
+    def d3transf_df3(self,f):
+        return np.zeros_like(f)


 class Probit(GPTransformation):
@ -69,8 +78,14 @@ class Probit(GPTransformation):
        return std_norm_pdf(f)

    def d2transf_df2(self,f):
+        #FIXME
        return -f * std_norm_pdf(f)

+    def d3transf_df3(self,f):
+        #FIXME
+        f2 = f**2
+        return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
+
 class Log(GPTransformation):
    """
    .. math::
@ -87,6 +102,9 @@ class Log(GPTransformation):
    def d2transf_df2(self,f):
        return np.exp(f)

+    def d3transf_df3(self,f):
+        return np.exp(f)
+
 class Log_ex_1(GPTransformation):
    """
    .. math::
@ -104,15 +122,23 @@ class Log_ex_1(GPTransformation):
        aux = np.exp(f)/(1.+np.exp(f))
        return aux*(1.-aux)

+    def d3transf_df3(self,f):
+        aux = np.exp(f)/(1.+np.exp(f))
+        daux_df = aux*(1.-aux)
+        return daux_df - (2.*aux*daux_df)
+
 class Reciprocal(GPTransformation):
-    def transf(sefl,f):
+    def transf(self,f):
        return 1./f

    def dtransf_df(self,f):
-        return -1./f**2
+        return -1./(f**2)

    def d2transf_df2(self,f):
-        return 2./f**3
+        return 2./(f**3)
+
+    def d3transf_df3(self,f):
+        return -6./(f**4)

 class Heaviside(GPTransformation):
    """
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@ -9,15 +9,13 @@ import pylab as pb
 from GPy.util.plot import gpplot
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
-
+from GPy.util.misc import chain_1, chain_2, chain_3
+from scipy.integrate import quad
+import warnings

 class NoiseDistribution(object):
    """
-    Likelihood class for doing Expectation propagation
-
-    :param Y: observed output (Nx1 numpy.darray)
-
-    .. note:: Y values allowed depend on the LikelihoodFunction used
+    Likelihood class for doing approximations
    """
    def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
        assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
@ -35,6 +33,8 @@ class NoiseDistribution(object):
        else:
            self.predictive_variance = self._predictive_variance_numerical

+        self.log_concave = True
+
    def _get_params(self):
        return np.zeros(0)

@ -57,363 +57,372 @@ class NoiseDistribution(object):
        """
        return Y

-    def _product(self,gp,obs,mu,sigma):
-        """
-        Product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)
-
-    def _nlog_product_scaled(self,gp,obs,mu,sigma):
-        """
-        Negative log-product between the cavity distribution and a likelihood factor.
-
-        .. note:: The constant term in the Gaussian distribution is ignored.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)
-
-    def _dnlog_product_dgp(self,gp,obs,mu,sigma):
-        """
-        Derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)
-
-    def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
-        """
-        Second derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)
-
-    def _product_mode(self,obs,mu,sigma):
-        """
-        Newton's CG method to find the mode in _product (cavity x likelihood factor).
-
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False)
-
    def _moments_match_analytical(self,obs,tau,v):
        """
        If available, this function computes the moments analytically.
        """
-        pass
+        raise NotImplementedError
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        assert y_test.shape==mu_star.shape
+        assert y_test.shape==var_star.shape
+        assert y_test.shape[1] == 1
+        def integral_generator(y, m, v):
+            """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
+            def f(f_star):
+                return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
+            return f
+
+        scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
+        scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
+        p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
+        return np.log(p_ystar)

    def _moments_match_numerical(self,obs,tau,v):
        """
-        Lapace approximation to calculate the moments.
+        Calculation of moments using quadrature

        :param obs: observed output
        :param tau: cavity distribution 1st natural parameter (precision)
        :param v: cavity distribution 2nd natural paramenter (mu*precision)
-
        """
+        #Compute first integral for zeroth moment.
+        #NOTE constant np.sqrt(2*pi/tau) added at the end of the function
        mu = v/tau
-        mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
-        sigma2_hat = 1./(tau + self._d2nlog_mass_dgp2(mu_hat,obs))
-        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self._mass(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
-        return Z_hat,mu_hat,sigma2_hat
+        def int_1(f):
+            return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        z_scaled, accuracy = quad(int_1, -np.inf, np.inf)

-    def _nlog_conditional_mean_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's mean given the l.v.
+        #Compute second integral for first moment
+        def int_2(f):
+            return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        mean, accuracy = quad(int_2, -np.inf, np.inf)
+        mean /= z_scaled

-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
+        #Compute integral for variance
+        def int_3(f):
+            return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        Ef2, accuracy = quad(int_3, -np.inf, np.inf)
+        Ef2 /= z_scaled
+        variance = Ef2 - mean**2

-        .. note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
+        #Add constant to the zeroth moment
+        #NOTE: this constant is not needed in the other moments because it cancells out.
+        z = z_scaled/np.sqrt(2*np.pi/tau)

-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))
-
-    def _dnlog_conditional_mean_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_conditional_mean_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_conditional_mean_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_conditional_mean_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2
-
-    def _nlog_exp_conditional_variance_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's variance given the l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E(V(Y_star|f_star))
-
-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp))
-
-    def _dnlog_exp_conditional_variance_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp)
-
-    def _d2nlog_exp_conditional_variance_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2
-
-    def _nlog_exp_conditional_mean_sq_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's mean squared given the l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E( E(Y_star|f_star)**2 )
-
-        """
-        return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))
-
-    def _dnlog_exp_conditional_mean_sq_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_exp_conditional_mean_sq_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )
+        return z, mean, variance

    def _predictive_mean_analytical(self,mu,sigma):
        """
+        Predictive mean
+        .. math::
+            E(Y^{*}|Y) = E( E(Y^{*}|f^{*}, Y) )
+
        If available, this function computes the predictive mean analytically.
        """
-        pass
+        raise NotImplementedError

    def _predictive_variance_analytical(self,mu,sigma):
        """
+        Predictive variance
+        .. math::
+            V(Y^{*}| Y) = E( V(Y^{*}|f^{*}, Y) ) + V( E(Y^{*}|f^{*}, Y) )
+
        If available, this function computes the predictive variance analytically.
        """
-        pass
+        raise NotImplementedError

-    def _predictive_mean_numerical(self,mu,sigma):
+    def _predictive_mean_numerical(self,mu,variance):
        """
-        Laplace approximation to the predictive mean: E(Y_star) = E( E(Y_star|f_star) )
+        Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )

-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior

        """
-        maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False)
-        mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma)
-        """
+        def int_mean(f,m,v):
+            return self._mean(f)*np.exp(-(0.5/v)*np.square(f - m))
+        scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))

-        pb.figure()
-        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
-        f = np.array([np.exp(-self._nlog_conditional_mean_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
-        pb.plot(x,f,'b-')
-        sigma2 = 1./self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma)
-        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
-        k = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
-        pb.plot(x,f2*mean,'r-')
-        pb.vlines(maximum,0,f.max())
-        """
        return mean

-    def _predictive_mean_sq(self,mu,sigma):
+    def _predictive_variance_numerical(self,mu,variance,predictive_mean=None):
        """
-        Laplace approximation to the predictive mean squared: E(Y_star**2) = E( E(Y_star|f_star)**2 )
+        Numerical approximation to the predictive variance: V(Y_star)

-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
+        The following variance decomposition is used:
+        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )

-        """
-        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False)
-        mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma)
-        return mean_squared
-
-    def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None):
-        """
-        Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
-
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior
        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.

        """
+        #sigma2 = sigma**2
+        normalizer = np.sqrt(2*np.pi*variance)
+
        # E( V(Y_star|f_star) )
-        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False)
-        exp_var = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma))*sigma)
+        def int_var(f,m,v):
+            return self._variance(f)*np.exp(-(0.5/v)*np.square(f - m))
+        scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        exp_var = np.array(scaled_exp_variance)[:,None] / normalizer

-        """
-        pb.figure()
-        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
-        f = np.array([np.exp(-self._nlog_exp_conditional_variance_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
-        pb.plot(x,f,'b-')
-        sigma2 = 1./self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma)
-        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
-        k = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
-        pb.plot(x,f2*exp_var,'r--')
-        pb.vlines(maximum,0,f.max())
-        """
+        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2

-        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star)**2 )
-        exp_exp2 = self._predictive_mean_sq(mu,sigma)
+        #E( E(Y_star|f_star) )**2
        if predictive_mean is None:
-            predictive_mean = self.predictive_mean(mu,sigma)
-        var_exp = exp_exp2 - predictive_mean**2
+            predictive_mean = self.predictive_mean(mu,variance)
+        predictive_mean_sq = predictive_mean**2
+
+        #E( E(Y_star|f_star)**2 )
+        def int_pred_mean_sq(f,m,v,predictive_mean_sq):
+            return self._mean(f)**2*np.exp(-(0.5/v)*np.square(f - m))
+        scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
+        exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
+
+        var_exp = exp_exp2 - predictive_mean_sq
+
+        # V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
        return exp_var + var_exp

-    def _predictive_percentiles(self,p,mu,sigma):
+    def pdf_link(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def pdf(self, f, y, extra_data=None):
        """
-        Percentiles of the predictive distribution
+        Evaluates the link function link(f) then computes the likelihood (pdf) using it

-        :parm p: lower tail probability
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
+        .. math:
+            p(y|\\lambda(f))

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
        """
-        qf = stats.norm.ppf(p,mu,sigma)
-        return self.gp_link.transf(qf)
+        link_f = self.gp_link.transf(f)
+        return self.pdf_link(link_f, y, extra_data=extra_data)

-    def _nlog_joint_predictive_scaled(self,x,mu,sigma):
+    def logpdf(self, f, y, extra_data=None):
        """
-        Negative logarithm of the joint predictive distribution (latent variable and output).
+        Evaluates the link function link(f) then computes the log likelihood (log pdf) using it

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
+        .. math:
+            \\log p(y|\\lambda(f))

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: log likelihood evaluated for this point
+        :rtype: float
        """
-        return self._nlog_product_scaled(x[0],x[1],mu,sigma)
+        link_f = self.gp_link.transf(f)
+        return self.logpdf_link(link_f, y, extra_data=extra_data)

-    def _gradient_nlog_joint_predictive(self,x,mu,sigma):
+    def dlogpdf_df(self, f, y, extra_data=None):
        """
-        Gradient of _nlog_joint_predictive_scaled.
+        Evaluates the link function link(f) then computes the derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        .. note: Only available when the output is continuous
+        .. math::
+            \\frac{d\\log p(y|\\lambda(f))}{df} = \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d\\lambda(f)}{df}

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of log likelihood evaluated for this point
+        :rtype: 1xN array
        """
-        assert not self.discrete, "Gradient not available for discrete outputs."
-        return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0])))
+        link_f = self.gp_link.transf(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        return chain_1(dlogpdf_dlink, dlink_df)

-    def _hessian_nlog_joint_predictive(self,x,mu,sigma):
+    def d2logpdf_df2(self, f, y, extra_data=None):
        """
-        Hessian of _nlog_joint_predictive_scaled.
+        Evaluates the link function link(f) then computes the second derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        .. note: Only available when the output is continuous
+        .. math::
+            \\frac{d^{2}\\log p(y|\\lambda(f))}{df^{2}} = \\frac{d^{2}\\log p(y|\\lambda(f))}{d^{2}\\lambda(f)}\\left(\\frac{d\\lambda(f)}{df}\\right)^{2} + \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d^{2}\\lambda(f)}{df^{2}}

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: second derivative of log likelihood evaluated for this point (diagonal only)
+        :rtype: 1xN array
        """
-        assert not self.discrete, "Hessian not available for discrete outputs."
-        cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1])
-        return np.array((self._d2nlog_product_dgp2(gp=x[0],obs=x[1],mu=mu,sigma=sigma),cross_derivative,cross_derivative,self._d2nlog_mass_dobs2(obs=x[1],gp=x[0]))).reshape(2,2)
+        link_f = self.gp_link.transf(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)

-    def _joint_predictive_mode(self,mu,sigma):
+    def d3logpdf_df3(self, f, y, extra_data=None):
        """
-        Negative logarithm of the joint predictive distribution (latent variable and output).
+        Evaluates the link function link(f) then computes the third derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule

-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
+        .. math::
+            \\frac{d^{3}\\log p(y|\\lambda(f))}{df^{3}} = \\frac{d^{3}\\log p(y|\\lambda(f)}{d\\lambda(f)^{3}}\\left(\\frac{d\\lambda(f)}{df}\\right)^{3} + 3\\frac{d^{2}\\log p(y|\\lambda(f)}{d\\lambda(f)^{2}}\\frac{d\\lambda(f)}{df}\\frac{d^{2}\\lambda(f)}{df^{2}} + \\frac{d\\log p(y|\\lambda(f)}{d\\lambda(f)}\\frac{d^{3}\\lambda(f)}{df^{3}}

+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: third derivative of log likelihood evaluated for this point
+        :rtype: float
        """
-        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)
+        link_f = self.gp_link.transf(f)
+        d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        d3link_df3 = self.gp_link.d3transf_df3(f)
+        return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)

-    def predictive_values(self,mu,var):
+    def dlogpdf_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([1, 0])
+
+    def dlogpdf_df_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+            return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([f.shape[0], 0])
+
+    def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+            return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([f.shape[0], 0])
+
+    def _laplace_gradients(self, f, y, extra_data=None):
+        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data)
+        dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, extra_data=extra_data)
+        d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, extra_data=extra_data)
+
+        #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
+        # ensure we have gradients for every parameter we want to optimize
+        assert dlogpdf_dtheta.shape[1] == len(self._get_param_names())
+        assert dlogpdf_df_dtheta.shape[1] == len(self._get_param_names())
+        assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
+        return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
+
+    def predictive_values(self, mu, var, full_cov=False, sampling=False, num_samples=10000):
        """
        Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.

-        :param mu: mean of the latent variable
-        :param var: variance of the latent variable
+        :param mu: mean of the latent variable, f, of posterior
+        :param var: variance of the latent variable, f, of posterior
+        :param full_cov: whether to use the full covariance or just the diagonal
+        :type full_cov: Boolean
+        :param num_samples: number of samples to use in computing quantiles and
+                            possibly mean variance
+        :type num_samples: integer
+        :param sampling: Whether to use samples for mean and variances anyway
+        :type sampling: Boolean

        """
-        if isinstance(mu,float) or isinstance(mu,int):
-            mu = [mu]
-            var = [var]
-        pred_mean = []
-        pred_var = []
-        q1 = []
-        q3 = []
-        for m,s in zip(mu,np.sqrt(var)):
-            pred_mean.append(self.predictive_mean(m,s))
-            pred_var.append(self.predictive_variance(m,s,pred_mean[-1]))
-            q1.append(self._predictive_percentiles(.025,m,s))
-            q3.append(self._predictive_percentiles(.975,m,s))
-        pred_mean = np.vstack(pred_mean)
-        pred_var = np.vstack(pred_var)
-        q1 = np.vstack(q1)
-        q3 = np.vstack(q3)
-        return pred_mean, pred_var, q1, q3

+        if sampling:
+            #Get gp_samples f* using posterior mean and variance
+            if not full_cov:
+                gp_samples = np.random.multivariate_normal(mu.flatten(), np.diag(var.flatten()),
+                                                            size=num_samples).T
+            else:
+                gp_samples = np.random.multivariate_normal(mu.flatten(), var,
+                                                               size=num_samples).T
+            #Push gp samples (f*) through likelihood to give p(y*|f*)
+            samples = self.samples(gp_samples)
+            axis=-1
+
+            #Calculate mean, variance and precentiles from samples
+            print "WARNING: Using sampling to calculate mean, variance and predictive quantiles."
+            pred_mean = np.mean(samples, axis=axis)[:,None]
+            pred_var = np.var(samples, axis=axis)[:,None]
+            q1 = np.percentile(samples, 2.5, axis=axis)[:,None]
+            q3 = np.percentile(samples, 97.5, axis=axis)[:,None]
+
+        else:
+
+            pred_mean = self.predictive_mean(mu, var)
+            pred_var = self.predictive_variance(mu, var, pred_mean)
+            print "WARNING: Predictive quantiles are only computed when sampling."
+            q1 = np.repeat(np.nan,pred_mean.size)[:,None]
+            q3 = q1.copy()
+
+        return pred_mean, pred_var, q1, q3

    def samples(self, gp):
        """
@ -421,5 +430,4 @@ class NoiseDistribution(object):

        :param gp: latent variable
        """
-        pass
-
+        raise NotImplementedError
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@ -1,7 +1,7 @@
+from __future__ import division
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
 import numpy as np
 from scipy import stats,special
 import scipy as sp
@ -14,9 +14,10 @@ class Poisson(NoiseDistribution):
    Poisson likelihood

    .. math::
-        L(x) = \\exp(\\lambda) * \\frac{\\lambda^Y_i}{Y_i!}
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}

-    ..Note: Y is expected to take values in {0,1,2,...}
+    .. Note::
+        Y is expected to take values in {0,1,2,...}
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
@ -24,25 +25,108 @@ class Poisson(NoiseDistribution):
    def _preprocess_values(self,Y): #TODO
        return Y

-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
        """
-        Mass (or density) function
-        """
-        return stats.poisson.pmf(obs,self.gp_link.transf(gp))
+        Likelihood function given link(f)

-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        return self.gp_link.transf(gp) - obs * np.log(self.gp_link.transf(gp)) + np.log(special.gamma(obs+1))
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}

-    def _dnlog_mass_dgp(self,gp,obs):
-        return self.gp_link.dtransf_df(gp) * (1. - obs/self.gp_link.transf(gp))
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return np.prod(stats.poisson.pmf(y,link_f))

-    def _d2nlog_mass_dgp2(self,gp,obs):
-        d2_df = self.gp_link.d2transf_df2(gp)
-        transf = self.gp_link.transf(gp)
-        return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = -\\lambda(f_{i}) + y_{i}\\log \\lambda(f_{i}) - \\log y_{i}!
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - 1
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return y/link_f - 1
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{-y_{i}}{\\lambda(f_{i})^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -y/(link_f**2)
+        return hess
+        #d2_df = self.gp_link.d2transf_df2(gp)
+        #transf = self.gp_link.transf(gp)
+        #return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f_{i})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = 2*y/(link_f)**3
+        return d3lik_dlink3

    def _mean(self,gp):
        """
@ -50,20 +134,19 @@ class Poisson(NoiseDistribution):
        """
        return self.gp_link.transf(gp)

-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)

-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.

-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.random.poisson(self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@ -0,0 +1,277 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from scipy import stats, special
+import scipy as sp
+import gp_transformations
+from noise_distributions import NoiseDistribution
+from scipy import stats, integrate
+from scipy.special import gammaln, gamma
+
+class StudentT(NoiseDistribution):
+    """
+    Student T likelihood
+
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+
+    """
+    def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2):
+        self.v = deg_free
+        self.sigma2 = sigma2
+
+        self._set_params(np.asarray(sigma2))
+        super(StudentT, self).__init__(gp_link,analytical_mean,analytical_variance)
+        self.log_concave = False
+
+    def _get_params(self):
+        return np.asarray(self.sigma2)
+
+    def _get_param_names(self):
+        return ["t_noise_std2"]
+
+    def _set_params(self, x):
+        self.sigma2 = float(x)
+
+    @property
+    def variance(self, extra_data=None):
+        return (self.v / float(self.v - 2)) * self.sigma2
+
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        #Careful gamma(big_number) is infinity!
+        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
+                     / (np.sqrt(self.v * np.pi * self.sigma2)))
+                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
+                    )
+        return np.prod(objective)
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        objective = (+ gammaln((self.v + 1) * 0.5)
+                     - gammaln(self.v * 0.5)
+                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                     - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
+                    )
+        return np.sum(objective)
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
+                    )
+        return d3lik_dlink3
+
+    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
+        return np.sum(dlogpdf_dvar)
+
+    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
+        """
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}
+
+        :param link_f: latent variables link_f
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
+        """
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution
+        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        e = y - link_f
+        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+                              / ((self.sigma2*self.v + (e**2))**3)
+                           )
+        return d2logpdf_dlink2_dvar
+
+    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
+        return np.asarray([[dlogpdf_dvar]])
+
+    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
+        return d2logpdf_dlink2_dvar
+
+    def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
+        """
+        Compute predictive variance of student_t*normal p(y*|f*)p(f*)
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+
+        #FIXME: Not correct
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
+        #Which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = 1/(1/sigma**2 + 1/self.variance)
+
+        return true_var
+
+    def _predictive_mean_analytical(self, mu, sigma):
+        """
+        Compute mean of the prediction
+        """
+        #FIXME: Not correct
+        return mu
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        #FIXME: Very slow as we are computing a new random variable per input!
+        #Can't get it to sample all at the same time
+        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        dfs = np.ones_like(gp)*self.v
+        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
+                                        scale=scales)
+        return student_t_samples.reshape(orig_shape)
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -1,18 +1,19 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from gp_regression import GPRegression
-from gp_classification import GPClassification
-from sparse_gp_regression import SparseGPRegression
-from svigp_regression import SVIGPRegression
-from sparse_gp_classification import SparseGPClassification
-from fitc_classification import FITCClassification
-from gplvm import GPLVM
-from bcgplvm import BCGPLVM
-from sparse_gplvm import SparseGPLVM
-from warped_gp import WarpedGP
-from bayesian_gplvm import BayesianGPLVM
-from mrd import MRD
-from gradient_checker import GradientChecker
-from gp_multioutput_regression import GPMultioutputRegression
-from sparse_gp_multioutput_regression import SparseGPMultioutputRegression
+from gp_regression import GPRegression; _gp_regression = gp_regression ; del gp_regression 
+from gp_classification import GPClassification; _gp_classification = gp_classification ; del gp_classification 
+from sparse_gp_regression import SparseGPRegression; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
+from svigp_regression import SVIGPRegression; _svigp_regression = svigp_regression ; del svigp_regression 
+from sparse_gp_classification import SparseGPClassification; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
+from fitc_classification import FITCClassification; _fitc_classification = fitc_classification ; del fitc_classification 
+from gplvm import GPLVM; _gplvm = gplvm ; del gplvm 
+from bcgplvm import BCGPLVM; _bcgplvm = bcgplvm; del bcgplvm
+from sparse_gplvm import SparseGPLVM; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
+from warped_gp import WarpedGP; _warped_gp = warped_gp ; del warped_gp 
+from bayesian_gplvm import BayesianGPLVM; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
+from mrd import MRD; _mrd = mrd ; del mrd 
+from gradient_checker import GradientChecker; _gradient_checker = gradient_checker ; del gradient_checker 
+from gp_multioutput_regression import GPMultioutputRegression; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
+from sparse_gp_multioutput_regression import SparseGPMultioutputRegression; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
+
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@ -49,18 +49,6 @@ class BayesianGPLVM(SparseGP, GPLVM):
        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
        self.ensure_default_constraints()

-    def getstate(self):
-        """
-        Get the current state of the class,
-        here just all the indices, rest can get recomputed
-        """
-        return SparseGP.getstate(self) + [self.init]
-
-    def setstate(self, state):
-        self._const_jitter = None
-        self.init = state.pop()
-        SparseGP.setstate(self, state)
-
    def _get_param_names(self):
        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
@ -285,6 +273,19 @@ class BayesianGPLVM(SparseGP, GPLVM):
        fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
        return fig

+    def getstate(self):
+        """
+        Get the current state of the class,
+        here just all the indices, rest can get recomputed
+        """
+        return SparseGP.getstate(self) + [self.init]
+
+    def setstate(self, state):
+        self._const_jitter = None
+        self.init = state.pop()
+        SparseGP.setstate(self, state)
+
+
 def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
    objective function for fitting the latent variables for test points
--- a/GPy/models/bcgplvm.py
+++ b/GPy/models/bcgplvm.py
@ -7,7 +7,7 @@ import pylab as pb
 import sys, pdb
 from ..core import GP
 from ..models import GPLVM
-from ..mappings import *
+from ..mappings import Kernel


 class BCGPLVM(GPLVM):
--- a/GPy/models/fitc_classification.py
+++ b/GPy/models/fitc_classification.py
@ -16,7 +16,7 @@ class FITCClassification(FITC):

    :param X: input observations
    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with probit link function
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -31,7 +31,7 @@ class FITCClassification(FITC):
            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)

        if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@ -15,7 +15,7 @@ class GPClassification(GP):

    :param X: input observations
    :param Y: observed values, can be None if likelihood is not None
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with Probit link_function
    :param kernel: a GPy kernel, defaults to rbf
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -31,7 +31,7 @@ class GPClassification(GP):
            kernel = kern.rbf(X.shape[1])

        if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@ -25,11 +25,12 @@ class GPRegression(GP):

    """

-    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False):
+    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, likelihood=None):
        if kernel is None:
            kernel = kern.rbf(X.shape[1])

-        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
+        if likelihood is None:
+            likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)

        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
        self.ensure_default_constraints()
@ -39,5 +40,3 @@ class GPRegression(GP):

    def setstate(self, state):
        return GP.setstate(self, state)
-
-    pass
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@ -44,12 +44,6 @@ class GPLVM(GP):
            Xr[:PC.shape[0], :PC.shape[1]] = PC
        return Xr

-    def getstate(self):
-        return GP.getstate(self)
-
-    def setstate(self, state):
-        GP.setstate(self, state)
-
    def _get_param_names(self):
        return sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], []) + GP._get_param_names(self)

@ -68,7 +62,7 @@ class GPLVM(GP):
    def jacobian(self,X):
        target = np.zeros((X.shape[0],X.shape[1],self.output_dim))
        for i in range(self.output_dim):
-        	target[:,:,i]=self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
+        	target[:,:,i] = self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
        return target
   
    def magnification(self,X):
@ -91,3 +85,11 @@ class GPLVM(GP):

    def plot_magnification(self, *args, **kwargs):
        return util.plot_latent.plot_magnification(self, *args, **kwargs)
+
+    def getstate(self):
+        return GP.getstate(self)
+
+    def setstate(self, state):
+        GP.setstate(self, state)
+
+
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@ -26,40 +26,40 @@ class GradientChecker(Model):
        """
        :param f: Function to check gradient for
        :param df: Gradient of function to check
-        :param x0: 
+        :param x0:
            Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
-            Can be a list of arrays, if takes a list of arrays. This list will be passed 
+            Can be a list of arrays, if takes a list of arrays. This list will be passed
            to f and df in the same order as given here.
            If only one argument, make sure not to pass a list!!!
-            
+
        :type x0: [array-like] | array-like | float | int
        :param names:
            Names to print, when performing gradcheck. If a list was passed to x0
            a list of names with the same length is expected.
        :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
-        
+
        Examples:
        ---------
            from GPy.models import GradientChecker
            N, M, Q = 10, 5, 3
-        
+
            Sinusoid:
-            
+
                X = numpy.random.rand(N, Q)
                grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
                grad.checkgrad(verbose=1)
-    
+
            Using GPy:
-            
+
                X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
                kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
-                grad = GradientChecker(kern.K, 
+                grad = GradientChecker(kern.K,
                                       lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
                                       x0 = X.copy(),
-                                       names='X')  
+                                       names='X')
                grad.checkgrad(verbose=1)
                grad.randomize()
-                grad.checkgrad(verbose=1)      
+                grad.checkgrad(verbose=1)
        """
        Model.__init__(self)
        if isinstance(x0, (list, tuple)) and names is None:
@ -75,14 +75,14 @@ class GradientChecker(Model):
            self.names = names
            self.shapes = [get_shape(x0)]
        for name, xi in zip(self.names, at_least_one_element(x0)):
-            self.__setattr__(name, xi)
+            self.__setattr__(name, numpy.float_(xi))
 #         self._param_names = []
 #         for name, shape in zip(self.names, self.shapes):
 #             self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
        self.args = args
        self.kwargs = kwargs
-        self.f = f
-        self.df = df
+        self._f = f
+        self._df = df

    def _get_x(self):
        if len(self.names) > 1:
@ -90,10 +90,10 @@ class GradientChecker(Model):
        return [self.__getattribute__(self.names[0])] + list(self.args)

    def log_likelihood(self):
-        return float(numpy.sum(self.f(*self._get_x(), **self.kwargs)))
+        return float(numpy.sum(self._f(*self._get_x(), **self.kwargs)))

    def _log_likelihood_gradients(self):
-        return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten()
+        return numpy.atleast_1d(self._df(*self._get_x(), **self.kwargs)).flatten()


    def _get_params(self):
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@ -81,29 +81,6 @@ class MRD(Model):
        Model.__init__(self)
        self.ensure_default_constraints()

-    def getstate(self):
-        return Model.getstate(self) + [self.names,
-                self.bgplvms,
-                self.gref,
-                self.nparams,
-                self.input_dim,
-                self.num_inducing,
-                self.num_data,
-                self.NQ,
-                self.MQ]
-
-    def setstate(self, state):
-        self.MQ = state.pop()
-        self.NQ = state.pop()
-        self.num_data = state.pop()
-        self.num_inducing = state.pop()
-        self.input_dim = state.pop()
-        self.nparams = state.pop()
-        self.gref = state.pop()
-        self.bgplvms = state.pop()
-        self.names = state.pop()
-        Model.setstate(self, state)
-
    @property
    def X(self):
        return self.gref.X
@ -371,4 +348,28 @@ class MRD(Model):
        pylab.draw()
        fig.tight_layout()

+    def getstate(self):
+        return Model.getstate(self) + [self.names,
+                self.bgplvms,
+                self.gref,
+                self.nparams,
+                self.input_dim,
+                self.num_inducing,
+                self.num_data,
+                self.NQ,
+                self.MQ]
+
+    def setstate(self, state):
+        self.MQ = state.pop()
+        self.NQ = state.pop()
+        self.num_data = state.pop()
+        self.num_inducing = state.pop()
+        self.input_dim = state.pop()
+        self.nparams = state.pop()
+        self.gref = state.pop()
+        self.bgplvms = state.pop()
+        self.names = state.pop()
+        Model.setstate(self, state)
+
+

--- a/GPy/models/sparse_gp_classification.py
+++ b/GPy/models/sparse_gp_classification.py
@ -16,7 +16,7 @@ class SparseGPClassification(SparseGP):

    :param X: input observations
    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with probit link_function
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -31,7 +31,7 @@ class SparseGPClassification(SparseGP):
            kernel = kern.rbf(X.shape[1])# + kern.white(X.shape[1],1e-3)

        if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@ -37,7 +37,7 @@ def model_checkgrads(model):

 def model_instance(model):
    #assert isinstance(model, GPy.core.model)
-    return isinstance(model, GPy.core.model)
+    return isinstance(model, GPy.core.model.Model)

@nottest
 def test_models():
@ -54,7 +54,7 @@ def test_models():
        print "After"
        print functions
        for example in functions:
-            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100']:
+            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100', 'brendan_faces']:
                print "SKIPPING"
                continue

--- a/GPy/testing/gp_transformation_tests.py
+++ b/GPy/testing/gp_transformation_tests.py
@ -0,0 +1,61 @@
+from nose.tools import with_setup
+from GPy.models import GradientChecker
+from GPy.likelihoods.noise_models import gp_transformations
+import inspect
+import unittest
+import numpy as np
+
+class TestTransformations(object):
+    """
+    Generic transformations checker
+    """
+    def setUp(self):
+        N = 30
+        self.fs = [np.random.rand(N, 1), float(np.random.rand(1))]
+
+
+    def tearDown(self):
+        self.fs = None
+
+    def test_transformations(self):
+        self.setUp()
+        transformations = [gp_transformations.Identity(),
+                           gp_transformations.Log(),
+                           gp_transformations.Probit(),
+                           gp_transformations.Log_ex_1(),
+                           gp_transformations.Reciprocal(),
+                           ]
+
+        for transformation in transformations:
+            for f in self.fs:
+                yield self.t_dtransf_df, transformation, f
+                yield self.t_d2transf_df2, transformation, f
+                yield self.t_d3transf_df3, transformation, f
+
+    @with_setup(setUp, tearDown)
+    def t_dtransf_df(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.transf, transformation.dtransf_df, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2transf_df2(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.dtransf_df, transformation.d2transf_df2, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3transf_df3(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.d2transf_df2, transformation.d3transf_df3, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+#if __name__ == "__main__":
+    #print "Running unit tests"
+    #unittest.main()
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@ -0,0 +1,577 @@
+import numpy as np
+import unittest
+import GPy
+from GPy.models import GradientChecker
+import functools
+import inspect
+from GPy.likelihoods.noise_models import gp_transformations
+from functools import partial
+
+def dparam_partial(inst_func, *args):
+    """
+    If we have a instance method that needs to be called but that doesn't
+    take the parameter we wish to change to checkgrad, then this function
+    will change the variable using set params.
+
+    inst_func: should be a instance function of an object that we would like
+                to change
+    param: the param that will be given to set_params
+    args: anything else that needs to be given to the function (for example
+          the f or Y that are being used in the function whilst we tweak the
+          param
+    """
+    def param_func(param, inst_func, args):
+        inst_func.im_self._set_params(param)
+        return inst_func(*args)
+    return functools.partial(param_func, inst_func=inst_func, args=args)
+
+def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=False, verbose=False):
+    """
+    checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
+    However if we are holding other parameters fixed and moving something else
+    We need to check the gradient of each of the fixed parameters
+    (f and y for example) seperately,  whilst moving another parameter.
+    Otherwise f: gives back R^N and
+              df: gives back R^NxM where M is
+    The number of parameters and N is the number of data
+    Need to take a slice out from f and a slice out of df
+    """
+    #print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
+                                           #func.__name__, dfunc.__name__)
+    partial_f = dparam_partial(func, *args)
+    partial_df = dparam_partial(dfunc, *args)
+    gradchecking = True
+    for param in params:
+        fnum = np.atleast_1d(partial_f(param)).shape[0]
+        dfnum = np.atleast_1d(partial_df(param)).shape[0]
+        for fixed_val in range(dfnum):
+            #dlik and dlik_dvar gives back 1 value for each
+            f_ind = min(fnum, fixed_val+1) - 1
+            print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
+            #Make grad checker with this param moving, note that set_params is NOT being called
+            #The parameter is being set directly with __setattr__
+            grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
+                                   lambda x : np.atleast_1d(partial_df(x))[fixed_val],
+                                   param, 'p')
+            #This is not general for more than one param...
+            if constraints is not None:
+                for constraint in constraints:
+                    constraint('p', grad)
+            if randomize:
+                grad.randomize()
+            if verbose:
+                print grad
+                grad.checkgrad(verbose=1)
+            if not grad.checkgrad():
+                gradchecking = False
+
+    return gradchecking
+
+
+from nose.tools import with_setup
+class TestNoiseModels(object):
+    """
+    Generic model checker
+    """
+    def setUp(self):
+        self.N = 5
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
+
+        self.real_std = 0.1
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(self.N, 1)
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.positive_Y = np.exp(self.Y.copy())
+        tmp = np.round(self.X[:, 0]*3-3)[:, None] + np.random.randint(0,3, self.X.shape[0])[:, None]
+        self.integer_Y = np.where(tmp > 0, tmp, 0)
+
+        self.var = 0.2
+
+        self.var = np.random.rand(1)
+
+        #Make a bigger step as lower bound can be quite curved
+        self.step = 1e-3
+
+    def tearDown(self):
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_noise_models(self):
+        self.setUp()
+
+        ####################################################
+        # Constraint wrappers so we can just list them off #
+        ####################################################
+        def constrain_negative(regex, model):
+            model.constrain_negative(regex)
+
+        def constrain_positive(regex, model):
+            model.constrain_positive(regex)
+
+        def constrain_bounded(regex, model, lower, upper):
+            """
+            Used like: partial(constrain_bounded, lower=0, upper=1)
+            """
+            model.constrain_bounded(regex, lower, upper)
+
+        """
+        Dictionary where we nest models we would like to check
+            Name: {
+                "model": model_instance,
+                "grad_params": {
+                    "names": [names_of_params_we_want, to_grad_check],
+                    "vals": [values_of_params, to_start_at],
+                    "constrain": [constraint_wrappers, listed_here]
+                    },
+                "laplace": boolean_of_whether_model_should_work_for_laplace,
+                "ep": boolean_of_whether_model_should_work_for_laplace,
+                "link_f_constraints": [constraint_wrappers, listed_here]
+                }
+        """
+        noise_models = {"Student_t_default": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_1_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [1],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_small_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [0.01],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_approx_gauss": {
+                            "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_log": {
+                            "model": GPy.likelihoods.student_t(gp_link=gp_transformations.Log(), deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_default": {
+                            "model": GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True,
+                            "ep": True
+                            },
+                        "Gaussian_log": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_probit": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_log_ex": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Bernoulli_default": {
+                            "model": GPy.likelihoods.bernoulli(),
+                            "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
+                            "laplace": True,
+                            "Y": self.binary_Y,
+                            "ep": True
+                            },
+                        "Exponential_default": {
+                            "model": GPy.likelihoods.exponential(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True,
+                        },
+                        "Poisson_default": {
+                            "model": GPy.likelihoods.poisson(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.integer_Y,
+                            "laplace": True,
+                            "ep": False #Should work though...
+                        },
+                        "Gamma_default": {
+                            "model": GPy.likelihoods.gamma(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True
+                        }
+                    }
+
+        for name, attributes in noise_models.iteritems():
+            model = attributes["model"]
+            if "grad_params" in attributes:
+                params = attributes["grad_params"]
+                param_vals = params["vals"]
+                param_names= params["names"]
+                param_constraints = params["constraints"]
+            else:
+                params = []
+                param_vals = []
+                param_names = []
+                constrain_positive = []
+            if "link_f_constraints" in attributes:
+                link_f_constraints = attributes["link_f_constraints"]
+            else:
+                link_f_constraints = []
+            if "Y" in attributes:
+                Y = attributes["Y"].copy()
+            else:
+                Y = self.Y.copy()
+            if "f" in attributes:
+                f = attributes["f"].copy()
+            else:
+                f = self.f.copy()
+            if "laplace" in attributes:
+                laplace = attributes["laplace"]
+            else:
+                laplace = False
+            if "ep" in attributes:
+                ep = attributes["ep"]
+            else:
+                ep = False
+
+            if len(param_vals) > 1:
+                raise NotImplementedError("Cannot support multiple params in likelihood yet!")
+
+            #Required by all
+            #Normal derivatives
+            yield self.t_logpdf, model, Y, f
+            yield self.t_dlogpdf_df, model, Y, f
+            yield self.t_d2logpdf_df2, model, Y, f
+            #Link derivatives
+            yield self.t_dlogpdf_dlink, model, Y, f, link_f_constraints
+            yield self.t_d2logpdf_dlink2, model, Y, f, link_f_constraints
+            if laplace:
+                #Laplace only derivatives
+                yield self.t_d3logpdf_df3, model, Y, f
+                yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
+                #Params
+                yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_constraints
+                #Link params
+                yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_constraints
+
+                #laplace likelihood gradcheck
+                yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+            if ep:
+                #ep likelihood gradcheck
+                yield self.t_ep_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+
+
+        self.tearDown()
+
+    #############
+    # dpdf_df's #
+    #############
+    @with_setup(setUp, tearDown)
+    def t_logpdf(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        np.testing.assert_almost_equal(
+                               np.log(model.pdf(f.copy(), Y.copy())),
+                               model.logpdf(f.copy(), Y.copy()))
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.description = "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf, y=Y)
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_df2(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_df3(self, model, Y, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y)
+        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    ##############
+    # df_dparams #
+    ##############
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_df2_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    ################
+    # dpdf_dlink's #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink(self, model, Y, f, link_f_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf_link, y=Y)
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        print grad
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_dlink2(self, model, Y, f, link_f_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print grad
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_dlink3(self, model, Y, f, link_f_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y)
+        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        print grad
+        assert grad.checkgrad()
+
+    #################
+    # dlink_dparams #
+    #################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_link_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    ################
+    # laplace test #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_laplace_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        #Normalize
+        Y = Y/Y.max()
+        white_var = 0.001
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
+
+        m.randomize()
+        m.checkgrad(verbose=1, step=step)
+        print m
+        assert m.checkgrad(step=step)
+
+    ###########
+    # EP test #
+    ###########
+    @with_setup(setUp, tearDown)
+    def t_ep_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        #Normalize
+        Y = Y/Y.max()
+        white_var = 0.001
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
+
+        m.randomize()
+        m.checkgrad(verbose=1, step=step)
+        print m
+        assert m.checkgrad(step=step)
+
+
+class LaplaceTests(unittest.TestCase):
+    """
+    Specific likelihood tests, not general enough for the above tests
+    """
+
+    def setUp(self):
+        self.N = 5
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
+
+        self.real_std = 0.1
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(self.N, 1)
+
+        self.var = 0.2
+
+        self.var = np.random.rand(1)
+        self.stu_t = GPy.likelihoods.student_t(deg_free=5, sigma2=self.var)
+        self.gauss = GPy.likelihoods.gaussian(gp_transformations.Log(), variance=self.var, D=self.D, N=self.N)
+
+        #Make a bigger step as lower bound can be quite curved
+        self.step = 1e-6
+
+    def tearDown(self):
+        self.stu_t = None
+        self.gauss = None
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_gaussian_d2logpdf_df2_2(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = None
+        self.gauss = None
+
+        self.N = 2
+        self.D = 1
+        self.X = np.linspace(0, self.D, self.N)[:, None]
+        self.real_std = 0.2
+        noise = np.random.randn(*self.X.shape)*self.real_std
+        self.Y = np.sin(self.X*2*np.pi) + noise
+        self.f = np.random.rand(self.N, 1)
+        self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
+
+        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
+        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+if __name__ == "__main__":
+    print "Running unit tests"
+    unittest.main()
--- a/GPy/testing/psi_stat_expectation_tests.py
+++ b/GPy/testing/psi_stat_expectation_tests.py
@ -28,8 +28,8 @@ def ard(p):
 class Test(unittest.TestCase):
    input_dim = 9
    num_inducing = 4
-    N = 3
-    Nsamples = 5e6
+    N = 30
+    Nsamples = 9e6

    def setUp(self):
        i_s_dim_list = [2,4,3]
@ -45,20 +45,26 @@ class Test(unittest.TestCase):
                                         input_slices = input_slices
                                         )
        self.kerns = (
-                    input_slice_kern,
+#                     input_slice_kern,
 #                       (GPy.kern.rbf(self.input_dim, ARD=True) +
 #                        GPy.kern.linear(self.input_dim, ARD=True) +
 #                        GPy.kern.bias(self.input_dim) +
 #                        GPy.kern.white(self.input_dim)),
 #                     (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-#                      GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-#                      GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
-#                      GPy.kern.bias(self.input_dim) +
-#                      GPy.kern.white(self.input_dim)),
-#                       GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
+#                     GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+#                     GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
+#                     GPy.kern.bias(self.input_dim) +
+#                     GPy.kern.white(self.input_dim)),
+        (GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
+                    GPy.kern.bias(self.input_dim, np.random.rand()) +
+                    GPy.kern.white(self.input_dim, np.random.rand())),
+                (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+                    GPy.kern.bias(self.input_dim, np.random.rand()) +
+                    GPy.kern.white(self.input_dim, np.random.rand())),
+#                     GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
-#                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
+#                     GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
 #                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
 #                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
 #                       GPy.kern.bias(self.input_dim), GPy.kern.white(self.input_dim),
@ -79,7 +85,7 @@ class Test(unittest.TestCase):

    def test_psi1(self):
        for kern in self.kerns:
-            Nsamples = np.floor(self.Nsamples/300.)
+            Nsamples = np.floor(self.Nsamples/self.N)
            psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
            K_ = np.zeros((Nsamples, self.num_inducing))
            diffs = []
@ -105,7 +111,7 @@ class Test(unittest.TestCase):

    def test_psi2(self):
        for kern in self.kerns:
-            Nsamples = self.Nsamples/300.
+            Nsamples = int(np.floor(self.Nsamples/self.N))
            psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
            K_ = np.zeros((self.num_inducing, self.num_inducing))
            diffs = []
@ -119,10 +125,10 @@ class Test(unittest.TestCase):
            try:
                import pylab
                pylab.figure(msg)
-                pylab.plot(diffs)
+                pylab.plot(diffs, marker='x', mew=1.3)
 #                 print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
-                self.assertTrue(np.allclose(psi2.squeeze(), K_,
-                                            rtol=1e-1, atol=.1),
+                self.assertTrue(np.allclose(psi2.squeeze(), K_),
+                                            #rtol=1e-1, atol=.1),
                                msg=msg + ": not matching")
 #                 sys.stdout.write(".")
            except:
@ -135,7 +141,7 @@ class Test(unittest.TestCase):
 if __name__ == "__main__":
    sys.argv = ['',
         #'Test.test_psi0',
-         'Test.test_psi1',
+         #'Test.test_psi1',
         'Test.test_psi2',
         ]
    unittest.main()
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -209,7 +209,7 @@ class GradientTests(unittest.TestCase):
        Z = np.linspace(0, 15, 4)[:, None]
        kernel = GPy.kern.rbf(1)
        m = GPy.models.SparseGPClassification(X,Y,kernel=kernel,Z=Z)
-        #distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        #distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
        #likelihood = GPy.likelihoods.EP(Y, distribution)
        #m = GPy.core.SparseGP(X, likelihood, kernel, Z)
        #m.ensure_default_constraints()
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@ -0,0 +1,24 @@
+import numpy as np
+
+def get_blocks(A, blocksizes):
+    assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can;t blockify this non-square matrix"
+    N = np.sum(blocksizes)
+    assert A.shape[0] == N, "bad blocksizes"
+    num_blocks = len(blocksizes)
+    B = np.empty(shape=(num_blocks, num_blocks), dtype=np.object)
+    count_i = 0
+    for Bi, i in enumerate(blocksizes):
+        count_j = 0
+        for Bj, j in enumerate(blocksizes):
+            B[Bi, Bj] = A[count_i:count_i + i, count_j : count_j + j]
+            count_j += j
+        count_i += i
+    return B
+
+
+
+if __name__=='__main__':
+    A = np.zeros((5,5))
+    B = get_blocks(A,[2,3])
+    B[0,0] += 7
+    print B
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@ -5,13 +5,18 @@ import ConfigParser
 import os
 config = ConfigParser.ConfigParser()

-user_file = os.path.join(os.getenv('HOME'),'.gpy_config.cfg')
-default_file = os.path.join('..','gpy_config.cfg')
+home = os.getenv('HOME') or os.getenv('USERPROFILE')
+user_file = os.path.join(home,'.gpy_config.cfg')
+default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'gpy_config.cfg'))
+print user_file, os.path.isfile(user_file)
+print default_file, os.path.isfile(default_file)

 # 1. check if the user has a ~/.gpy_config.cfg
 if os.path.isfile(user_file):
    config.read(user_file)
-else:
+elif os.path.isfile(default_file):
    # 2. if not, use the default one
-    path = os.path.dirname(__file__)
-    config.read(os.path.join(path,default_file))
+    config.read(default_file)
+else:
+    #3. panic
+    raise ValueError, "no configuration file found"
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -17,13 +17,13 @@ except ImportError:

 import sys, urllib

-def reporthook(a,b,c): 
+def reporthook(a,b,c):
    # ',' at the end of the line is important!
    #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c),
    #you can also use sys.stdout.write
    sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c))
    sys.stdout.flush()
-     
+
 # Global variables
 data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
@ -39,7 +39,7 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                       'license' : None,
                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
-                   
+
                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
                                      'files' : [['Index', 'housing.data', 'housing.names']],
                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
@ -164,14 +164,14 @@ def prompt_user(prompt):
        print(prompt)
        choice = raw_input().lower()
        # would like to test for exception here, but not sure if we can do that without importing IPython
-    except: 
+    except:
        print('Stdin is not implemented.')
        print('You need to set')
        print('overide_manual_authorize=True')
        print('to proceed with the download. Please set that variable and continue.')
        raise

-    
+
    if choice in yes:
        return True
    elif choice in no:
@ -189,7 +189,7 @@ def data_available(dataset_name=None):
            if not os.path.exists(os.path.join(data_path, dataset_name, file)):
                return False
    return True
-            
+
 def download_url(url, store_directory, save_name = None, messages = True, suffix=''):
    """Download a file from a url and save it to disk."""
    i = url.rfind('/')
@ -249,18 +249,18 @@ def download_data(dataset_name=None):
            for file in files:
                download_url(os.path.join(url,file), dataset_name, dataset_name)
    return True
-                  
+
 def data_details_return(data, data_set):
    """Update the data component of the data dictionary with details drawn from the data_resources."""
    data.update(data_resources[data_set])
    return data

-    
+
 def cmu_urls_files(subj_motions, messages = True):
    '''
-    Find which resources are missing on the local disk for the requested CMU motion capture motions. 
+    Find which resources are missing on the local disk for the requested CMU motion capture motions.
    '''
-    
+
    subjects_num = subj_motions[0]
    motions_num = subj_motions[1]

@ -280,15 +280,15 @@ def cmu_urls_files(subj_motions, messages = True):
            motions[i].append(curMot)

    all_skels = []
-    
+
    assert len(subjects) == len(motions)
-    
+
    all_motions = []
-            
+
    for i in range(len(subjects)):
        skel_dir = os.path.join(data_path, 'cmu_mocap')
        cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf')
-        
+
        url_required = False
        file_download = []
        if not os.path.exists(cur_skel_file):
@ -332,7 +332,7 @@ if gpxpy_available:
            points = [point for track in gpx.tracks for segment in track.segments for point in segment.points]
            data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points]
            X.append(np.asarray(data)[::sample_every, :])
-            gpx_file.close()        
+            gpx_file.close()
        return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set)

 del gpxpy_available
@ -408,7 +408,7 @@ def oil(data_set='three_phase_oil_flow'):
    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xtest' : Xtest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set)
    #else:
    # throw an error
-    
+
 def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'):
    np.random.seed(seed=seed)
    data = oil()
@ -622,7 +622,7 @@ def xw_pen(data_set='xw_pen'):
    X = np.arange(485)[:, None]
    return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen. Plot in original paper showed regression between time steps 175 and 275."}, data_set)

-    
+
 def download_rogers_girolami_data():
    if not data_available('rogers_girolami_data'):
        download_data(data_set)
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -61,6 +61,14 @@ def dpotri(A, lower=0):
    """
    return lapack.dpotri(A, lower=lower)

+def pddet(A):
+    """
+    Determinant of a positive definite matrix, only symmetric matricies though
+    """
+    L = jitchol(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a, b):
    """
    Efficiently compute the trace of the matrix product of a and b
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@ -5,6 +5,33 @@ import numpy as np
 from scipy import weave
 from config import *

+def chain_1(df_dg, dg_dx):
+    """
+    Generic chaining function for first derivative
+
+    .. math::
+        \\frac{d(f . g)}{dx} = \\frac{df}{dg} \\frac{dg}{dx}
+    """
+    return df_dg * dg_dx
+
+def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
+    """
+    Generic chaining function for second derivative
+
+    .. math::
+        \\frac{d^{2}(f . g)}{dx^{2}} = \\frac{d^{2}f}{dg^{2}}(\\frac{dg}{dx})^{2} + \\frac{df}{dg}\\frac{d^{2}g}{dx^{2}}
+    """
+    return d2f_dg2*(dg_dx**2) + df_dg*d2g_dx2
+
+def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
+    """
+    Generic chaining function for third derivative
+
+    .. math::
+        \\frac{d^{3}(f . g)}{dx^{3}} = \\frac{d^{3}f}{dg^{3}}(\\frac{dg}{dx})^{3} + 3\\frac{d^{2}f}{dg^{2}}\\frac{dg}{dx}\\frac{d^{2}g}{dx^{2}} + \\frac{df}{dg}\\frac{d^{3}g}{dx^{3}}
+    """
+    return d3f_dg3*(dg_dx**3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
+
 def opt_wrapper(m, **kwargs):
    """
    This function just wraps the optimization procedure of a GPy
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@ -1,4 +1,4 @@
-from sympy import Function, S, oo, I, cos, sin, asin, log, erf,pi,exp
+from sympy import Function, S, oo, I, cos, sin, asin, log, erf,pi,exp,sqrt,sign


 class ln_diff_erf(Function):
@ -19,15 +19,84 @@ class ln_diff_erf(Function):
        if x0.is_Number and x1.is_Number:            
            return log(erf(x0)-erf(x1))

-class sim_h(Function):
+class dh_dd_i(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+
+            diff_t = (t-tprime)
+            l2 = l*l
+            h = h(t, tprime, d_i, d_j, l)
+            half_l_di = 0.5*l*d_i
+            arg_1 = half_l_di + tprime/l
+            arg_2 = half_l_di - (t-tprime)/l
+            ln_part_1 = ln_diff_erf(arg_1, arg_2)
+            arg_1 = half_l_di 
+            arg_2 = half_l_di - t/l
+            sign_val = sign(t/l)
+            ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+            base = ((0.5*d_i*l2*(d_i+d_j)-1)*h 
+                    + (-diff_t*sign_val*exp(half_l_di*half_l_di
+                                          -d_i*diff_t
+                                          +ln_part_1)
+                       +t*sign_val*exp(half_l_di*half_l_di
+                                          -d_i*t-d_j*tprime
+                                          +ln_part_2))
+                    + l/sqrt(pi)*(-exp(-diff_t*diff_t/l2)
+                                     +exp(-tprime*tprime/l2-d_i*t)
+                                     +exp(-t*t/l2-d_j*tprime)
+                                     -exp(-(d_i*t + d_j*tprime))))
+            return base/(d_i+d_j)
+
+class dh_dd_j(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            diff_t = (t-tprime)
+            l2 = l*l
+            half_l_di = 0.5*l*d_i
+            h = h(t, tprime, d_i, d_j, l)
+            arg_1 = half_l_di + tprime/l
+            arg_2 = half_l_di - (t-tprime)/l
+            ln_part_1 = ln_diff_erf(arg_1, arg_2)
+            arg_1 = half_l_di 
+            arg_2 = half_l_di - t/l
+            sign_val = sign(t/l)
+            ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+            sign_val = sign(t/l)
+            base = tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2)-h
+            return base/(d_i+d_j)
+    
+class dh_dl(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+
+            diff_t = (t-tprime)
+            l2 = l*l
+            h = h(t, tprime, d_i, d_j, l)
+            return 0.5*d_i*d_i*l*h + 2./(sqrt(pi)*(d_i+d_j))*((-diff_t/l2-d_i/2.)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2.)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2.)*exp(-t*t/l2-d_j*tprime)-d_i/2.*exp(-(d_i*t+d_j*tprime)))
+
+class dh_dt(Function):
    nargs = 5
-
-    def fdiff(self, argindex=1):
-        pass
-    
    @classmethod
    def eval(cls, t, tprime, d_i, d_j, l):
-        # putting in the is_Number stuff forces it to look for a fdiff method for derivative.
        if (t.is_Number
            and tprime.is_Number
            and d_i.is_Number
@ -40,13 +109,119 @@ class sim_h(Function):
                or l is S.NaN):
                return S.NaN
            else:
-                return (exp((d_j/2*l)**2)/(d_i+d_j)
-                        *(exp(-d_j*(tprime - t))
-                          *(erf((tprime-t)/l - d_j/2*l)
-                            + erf(t/l + d_j/2*l))
-                          - exp(-(d_j*tprime + d_i))
-                          *(erf(tprime/l - d_j/2*l)
-                            + erf(d_j/2*l))))
+                half_l_di = 0.5*l*d_i
+                arg_1 = half_l_di + tprime/l
+                arg_2 = half_l_di - (t-tprime)/l
+                ln_part_1 = ln_diff_erf(arg_1, arg_2)
+                arg_1 = half_l_di 
+                arg_2 = half_l_di - t/l
+                sign_val = sign(t/l)
+                ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+                
+                return (sign_val*exp(half_l_di*half_l_di
+                                        - d_i*(t-tprime)
+                                        + ln_part_1
+                                        - log(d_i + d_j))
+                        - sign_val*exp(half_l_di*half_l_di
+                                          - d_i*t - d_j*tprime
+                                          + ln_part_2
+                                          - log(d_i + d_j))).diff(t)
+
+class dh_dtprime(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            if (t is S.NaN
+                or tprime is S.NaN
+                or d_i is S.NaN
+                or d_j is S.NaN
+                or l is S.NaN):
+                return S.NaN
+            else:
+                half_l_di = 0.5*l*d_i
+                arg_1 = half_l_di + tprime/l
+                arg_2 = half_l_di - (t-tprime)/l
+                ln_part_1 = ln_diff_erf(arg_1, arg_2)
+                arg_1 = half_l_di 
+                arg_2 = half_l_di - t/l
+                sign_val = sign(t/l)
+                ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+                
+                return (sign_val*exp(half_l_di*half_l_di
+                                        - d_i*(t-tprime)
+                                        + ln_part_1
+                                        - log(d_i + d_j))
+                        - sign_val*exp(half_l_di*half_l_di
+                                          - d_i*t - d_j*tprime
+                                          + ln_part_2
+                                          - log(d_i + d_j))).diff(tprime)
+
+
+class h(Function):
+    nargs = 5
+    def fdiff(self, argindex=5):
+        t, tprime, d_i, d_j, l = self.args
+        if argindex == 1:
+            return dh_dt(t, tprime, d_i, d_j, l)
+        elif argindex == 2:
+            return dh_dtprime(t, tprime, d_i, d_j, l)
+        elif argindex == 3:
+            return dh_dd_i(t, tprime, d_i, d_j, l)
+        elif argindex == 4:
+            return dh_dd_j(t, tprime, d_i, d_j, l)
+        elif argindex == 5:
+            return dh_dl(t, tprime, d_i, d_j, l)
+                                                                
+    
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        # putting in the is_Number stuff forces it to look for a fdiff method for derivative. If it's left out, then when asking for self.diff, it just does the diff on the eval symbolic terms directly. We want to avoid that because we are looking to ensure everything is numerically stable. Maybe it's because of the if statement that this happens? 
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            if (t is S.NaN
+                or tprime is S.NaN
+                or d_i is S.NaN
+                or d_j is S.NaN
+                or l is S.NaN):
+                return S.NaN
+            else:
+                half_l_di = 0.5*l*d_i
+                arg_1 = half_l_di + tprime/l
+                arg_2 = half_l_di - (t-tprime)/l
+                ln_part_1 = ln_diff_erf(arg_1, arg_2)
+                arg_1 = half_l_di 
+                arg_2 = half_l_di - t/l
+                sign_val = sign(t/l)
+                ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+                
+                return (sign_val*exp(half_l_di*half_l_di
+                                        - d_i*(t-tprime)
+                                        + ln_part_1
+                                        - log(d_i + d_j))
+                        - sign_val*exp(half_l_di*half_l_di
+                                          - d_i*t - d_j*tprime
+                                          + ln_part_2
+                                          - log(d_i + d_j)))
+            
+                                  
+                # return (exp((d_j/2.*l)**2)/(d_i+d_j)
+                #         *(exp(-d_j*(tprime - t))
+                #           *(erf((tprime-t)/l - d_j/2.*l)
+                #             + erf(t/l + d_j/2.*l))
+                #           - exp(-(d_j*tprime + d_i))
+                #           *(erf(tprime/l - d_j/2.*l)
+                #             + erf(d_j/2.*l))))

 class erfc(Function):
    nargs = 1
--- a/GPy/util/univariate_Gaussian.py
+++ b/GPy/util/univariate_Gaussian.py
@ -13,24 +13,32 @@ def std_norm_cdf(x):
    Cumulative standard Gaussian distribution
    Based on Abramowitz, M. and Stegun, I. (1970)
    """
+    #Generalize for many x
+    x = np.asarray(x).copy()
+    cdf_x = np.zeros_like(x)
+    N = x.size
    support_code = "#include <math.h>"
    code = """

-    double sign = 1.0;
-    if (x < 0.0){
-        sign = -1.0;
-        x = -x;
+    double sign, t, erf;
+    for (int i=0; i<N; i++){
+        sign = 1.0;
+        if (x[i] < 0.0){
+            sign = -1.0;
+            x[i] = -x[i];
+        }
+        x[i] = x[i]/sqrt(2.0);
+
+        t = 1.0/(1.0 +  0.3275911*x[i]);
+
+        erf = 1. - exp(-x[i]*x[i])*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
+
+        //return_val = 0.5*(1.0 + sign*erf);
+        cdf_x[i] = 0.5*(1.0 + sign*erf);
    }
-    x = x/sqrt(2.0);
-
-    double t = 1.0/(1.0 +  0.3275911*x);
-
-    double erf = 1. - exp(-x*x)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
-
-    return_val = 0.5*(1.0 + sign*erf);
    """
-    x = float(x)
-    return weave.inline(code,arg_names=['x'],support_code=support_code)
+    weave.inline(code, arg_names=['x', 'cdf_x', 'N'], support_code=support_code)
+    return cdf_x

 def inv_std_norm_cdf(x):
    """
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@ -222,7 +222,7 @@ class TanhWarpingFunction_d(WarpingFunction):
        """


-        mpsi = psi.coSpy()
+        mpsi = psi.copy()
        d = psi[-1]
        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)

--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -2,3 +2,5 @@ include *.txt
 recursive-include doc *.txt
 include *.md
 recursive-include doc *.md
+include *.cfg
+recursive-include doc *.cfg
--- a/doc/GPy.examples.rst
+++ b/doc/GPy.examples.rst
@ -20,6 +20,14 @@ GPy.examples.dimensionality_reduction module
    :undoc-members:
    :show-inheritance:

+GPy.examples.laplace_approximations module
+------------------------------------------
+
+.. automodule:: GPy.examples.laplace_approximations
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.examples.regression module
 ------------------------------

--- a/doc/GPy.kern.parts.rst
+++ b/doc/GPy.kern.parts.rst
@ -28,6 +28,14 @@ GPy.kern.parts.Matern52 module
    :undoc-members:
    :show-inheritance:

+GPy.kern.parts.ODE_1 module
+---------------------------
+
+.. automodule:: GPy.kern.parts.ODE_1
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.kern.parts.bias module
 --------------------------

@ -44,6 +52,14 @@ GPy.kern.parts.coregionalize module
    :undoc-members:
    :show-inheritance:

+GPy.kern.parts.eq_ode1 module
+-----------------------------
+
+.. automodule:: GPy.kern.parts.eq_ode1
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.kern.parts.exponential module
 ---------------------------------

--- a/doc/GPy.likelihoods.noise_models.rst
+++ b/doc/GPy.likelihoods.noise_models.rst
@ -4,10 +4,10 @@ GPy.likelihoods.noise_models package
 Submodules
 ----------

-GPy.likelihoods.noise_models.binomial_noise module
--------------------------------------------------
+GPy.likelihoods.noise_models.bernoulli_noise module
+---------------------------------------------------

-.. automodule:: GPy.likelihoods.noise_models.binomial_noise
+.. automodule:: GPy.likelihoods.noise_models.bernoulli_noise
    :members:
    :undoc-members:
    :show-inheritance:
@ -60,6 +60,14 @@ GPy.likelihoods.noise_models.poisson_noise module
    :undoc-members:
    :show-inheritance:

+GPy.likelihoods.noise_models.student_t_noise module
+---------------------------------------------------
+
+.. automodule:: GPy.likelihoods.noise_models.student_t_noise
+    :members:
+    :undoc-members:
+    :show-inheritance:
+

 Module contents
 ---------------
--- a/doc/GPy.likelihoods.rst
+++ b/doc/GPy.likelihoods.rst
@ -43,6 +43,14 @@ GPy.likelihoods.gaussian_mixed_noise module
    :undoc-members:
    :show-inheritance:

+GPy.likelihoods.laplace module
+------------------------------
+
+.. automodule:: GPy.likelihoods.laplace
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.likelihoods.likelihood module
 ---------------------------------

--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@ -4,6 +4,14 @@ GPy.testing package
 Submodules
 ----------

+GPy.testing.bcgplvm_tests module
+--------------------------------
+
+.. automodule:: GPy.testing.bcgplvm_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.bgplvm_tests module
 -------------------------------

@ -28,6 +36,14 @@ GPy.testing.examples_tests module
    :undoc-members:
    :show-inheritance:

+GPy.testing.gp_transformation_tests module
+------------------------------------------
+
+.. automodule:: GPy.testing.gp_transformation_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.gplvm_tests module
 ------------------------------

@ -44,6 +60,14 @@ GPy.testing.kernel_tests module
    :undoc-members:
    :show-inheritance:

+GPy.testing.likelihoods_tests module
+------------------------------------
+
+.. automodule:: GPy.testing.likelihoods_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.mapping_tests module
 --------------------------------

--- a/doc/GPy.util.rst
+++ b/doc/GPy.util.rst
@ -27,6 +27,14 @@ GPy.util.classification module
    :undoc-members:
    :show-inheritance:

+GPy.util.config module
+----------------------
+
+.. automodule:: GPy.util.config
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.datasets module
 ------------------------

@ -43,6 +51,14 @@ GPy.util.decorators module
    :undoc-members:
    :show-inheritance:

+GPy.util.erfcx module
+---------------------
+
+.. automodule:: GPy.util.erfcx
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.linalg module
 ----------------------

@ -51,6 +67,14 @@ GPy.util.linalg module
    :undoc-members:
    :show-inheritance:

+GPy.util.ln_diff_erfs module
+----------------------------
+
+.. automodule:: GPy.util.ln_diff_erfs
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.misc module
 --------------------

@ -75,6 +99,14 @@ GPy.util.multioutput module
    :undoc-members:
    :show-inheritance:

+GPy.util.netpbmfile module
+--------------------------
+
+.. automodule:: GPy.util.netpbmfile
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.plot module
 --------------------

@ -99,6 +131,14 @@ GPy.util.squashers module
    :undoc-members:
    :show-inheritance:

+GPy.util.symbolic module
+------------------------
+
+.. automodule:: GPy.util.symbolic
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.univariate_Gaussian module
 -----------------------------------