mirror of https://github.com/SheffieldML/GPy.git
synced 2026-05-15 06:52:39 +02:00

Merge branch 'devel'

This commit is contained in: commit ebd2752d5b

194 changed files with 15884 additions and 11008 deletions
.gitignore (vendored): 1 line changed

@@ -9,7 +9,6 @@
dist
build
eggs
parts
bin
var
sdist
GPy/FAQ.txt (new file): 8 lines

@@ -0,0 +1,8 @@
Frequently Asked Questions
--------------------------

Unit tests are run through Travis CI. They can be run locally by entering the GPy root directory and running

    nosetests testing/

Documentation is handled by Sphinx. To build the documentation:
@@ -5,6 +5,7 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)

import core
import models
import mappings
import inference
import util
import examples
GPy/coding_style_guide.txt (new file): 10 lines

@@ -0,0 +1,10 @@
In this text document we describe the coding conventions used in GPy to keep things consistent.

All arrays containing data are two dimensional. The first dimension is the number of data, the second is the number of features. This keeps things consistent with the idea of a design matrix.

Input matrices are either X or t; output matrices are Y.

Input dimensionality is input_dim, output dimensionality is output_dim, and the number of data is num_data.

Data sets are preprocessed in the datasets.py file. This file also records where each data set was obtained from in the dictionary stored in the file. Long term we should move this dictionary to sqlite or similar.
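As a minimal sketch of the array convention described above (illustrative only, not part of the commit; the names and shapes are arbitrary examples):

    import numpy as np

    num_data, input_dim, output_dim = 50, 3, 2
    X = np.random.randn(num_data, input_dim)    # inputs: one row per data point
    Y = np.random.randn(num_data, output_dim)   # outputs: same number of rows as X
    assert X.ndim == 2 and Y.ndim == 2
    assert X.shape[0] == Y.shape[0] == num_data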
@@ -2,9 +2,10 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)

from model import *
from parameterised import *
from parameterized import *
import priors
from gp import GP
from sparse_gp import SparseGP
from fitc import FITC
from svigp import SVIGP
from mapping import *
@@ -2,6 +2,22 @@
Created on 4 Jun 2013

@author: maxz

(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
These domains specify the legitimate range in which the parameters live.

:const:`~GPy.core.domains.REAL`:
    real domain, all values in the real numbers are allowed

:const:`~GPy.core.domains.POSITIVE`:
    positive domain, only positive real values are allowed

:const:`~GPy.core.domains.NEGATIVE`:
    same as :const:`~GPy.core.domains.POSITIVE`, but only negative values are allowed

:const:`~GPy.core.domains.BOUNDED`:
    only values within the bounded range are allowed;
    the bounds are specified within the object with the bounded range
'''

REAL = 'real'
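A brief, hedged sketch of how these domain constants are consumed (the import mirrors the one added to GPy/core/model.py later in this diff; the helper itself is hypothetical):

    from GPy.core.domains import POSITIVE, REAL

    def prior_matches_constraint(prior, is_constrained_positive):
        # hypothetical helper: a prior declared on the POSITIVE domain should
        # only be placed on positively constrained parameters
        if prior.domain is POSITIVE:
            return is_constrained_positive
        return prior.domain is REAL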
@@ -11,25 +11,27 @@ from sparse_gp import SparseGP

class FITC(SparseGP):
    """
    sparse FITC approximation

    Sparse FITC approximation

    :param X: inputs
    :type X: np.ndarray (num_data x Q)
    :param likelihood: a likelihood instance, containing the observed data
    :type likelihood: GPy.likelihood.(Gaussian | EP)
    :param kernel : the kernel (covariance function). See link kernels
    :param kernel: the kernel (covariance function). See link kernels
    :type kernel: a GPy.kern.kern instance
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool

    """

    def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
        SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False)
        assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"

    def update_likelihood_approximation(self):
    def update_likelihood_approximation(self, **kwargs):
        """
        Approximates a non-Gaussian likelihood using Expectation Propagation
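A hedged sketch of constructing the model whose signature is documented above (illustrative only, not part of the commit; the kernel and likelihood constructors are assumptions about the GPy API of this era):

    import numpy as np
    import GPy

    X = np.random.rand(100, 1)
    Y = np.sin(6 * X) + 0.05 * np.random.randn(100, 1)
    kernel = GPy.kern.rbf(1)                     # assumed RBF kernel constructor
    likelihood = GPy.likelihoods.Gaussian(Y)     # assumed Gaussian likelihood wrapping the data
    Z = X[::10].copy()                           # 10 inducing inputs taken from the data
    m = GPy.core.FITC(X, likelihood, kernel, Z)  # single output, as asserted in __init__
    m.update_likelihood_approximation()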
@@ -37,7 +39,7 @@ class FITC(SparseGP):
        this function does nothing
        """
        self.likelihood.restart()
        self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
        self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0, **kwargs)
        self._set_params(self._get_params())

    def _compute_kernel_matrices(self):
@@ -120,11 +122,11 @@ class FITC(SparseGP):
            _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
            self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
            self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
            self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm ,self.Z)
            self._dKmm_dX += self.kern.dK_dX(_dKmm ,self.Z)
            self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])

        # the partial derivative vector for the likelihood
        if self.likelihood.Nparams == 0:
        if self.likelihood.num_params == 0:
            # save computation here.
            self.partial_for_likelihood = None
        elif self.likelihood.is_heteroscedastic:
@@ -140,7 +142,6 @@ class FITC(SparseGP):

            dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
            dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
            dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)

            dD_dnoise_1 = mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T)
            alpha = mdot(LBiLmipsi1,self.V_star)
@@ -158,7 +159,7 @@ class FITC(SparseGP):
        A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
        C = -self.output_dim * (np.sum(np.log(np.diag(self.LB))))
        D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
        return A + C + D
        return A + C + D + self.likelihood.Z

    def _log_likelihood_gradients(self):
        pass
@@ -174,7 +175,7 @@ class FITC(SparseGP):

    def dL_dZ(self):
        dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
        dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm,X=self.Z)
        dL_dZ += self.kern.dK_dX(self._dL_dKmm,X=self.Z)
        dL_dZ += self._dpsi1_dX
        dL_dZ += self._dKmm_dX
        return dL_dZ

GPy/core/gp.py: 116 lines changed
@@ -6,8 +6,7 @@ import numpy as np
import pylab as pb
from .. import kern
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
#from ..util.plot import gpplot, Tango
from ..likelihoods import EP
from ..likelihoods import EP, Laplace
from gp_base import GPBase

class GP(GPBase):
@@ -16,50 +15,60 @@ class GP(GPBase):

    :param X: input observations
    :param kernel: a GPy kernel, defaults to rbf+white
    :parm likelihood: a GPy likelihood
    :param likelihood: a GPy likelihood
    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :rtype: model object
    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
    :type powerep: list

    .. Note:: Multiple independent outputs are allowed using columns of Y

    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
        self._set_params(self._get_params())
        self.update_likelihood_approximation()


    def _set_params(self, p):
        self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
        self.likelihood._set_params(p[self.kern.num_params_transformed():])
        new_kern_params = p[:self.kern.num_params_transformed()]
        new_likelihood_params = p[self.kern.num_params_transformed():]
        old_likelihood_params = self.likelihood._get_params()

        self.kern._set_params_transformed(new_kern_params)
        self.likelihood._set_params_transformed(new_likelihood_params)

        self.K = self.kern.K(self.X)

        #Re fit likelihood approximation (if it is an approx), as parameters have changed
        if isinstance(self.likelihood, Laplace):
            self.likelihood.fit_full(self.K)

        self.K += self.likelihood.covariance_matrix

        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)

        # the gradient of the likelihood wrt the covariance matrix
        if self.likelihood.YYT is None:
            #alpha = np.dot(self.Ki, self.likelihood.Y)
            alpha,_ = dpotrs(self.L, self.likelihood.Y,lower=1)
            # alpha = np.dot(self.Ki, self.likelihood.Y)
            alpha, _ = dpotrs(self.L, self.likelihood.Y, lower=1)

            self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki)
        else:
            #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
            # tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
            tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
            tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
            self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)

        #Adding dZ_dK (0 for a non-approximate likelihood, compensates for
        #additional gradients of K when log-likelihood has non-zero Z term)
        self.dL_dK += self.likelihood.dZ_dK

    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))


    def _get_param_names(self):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()

    def update_likelihood_approximation(self):
    def update_likelihood_approximation(self, **kwargs):
        """
        Approximates a non-gaussian likelihood using Expectation Propagation
@@ -67,8 +76,8 @@ class GP(GPBase):
        this function does nothing
        """
        self.likelihood.restart()
        self.likelihood.fit_full(self.kern.K(self.X))
        self._set_params(self._get_params()) # update the GP
        self.likelihood.fit_full(self.kern.K(self.X), **kwargs)
        self._set_params(self._get_params()) # update the GP

    def _model_fit_term(self):
        """
@@ -77,7 +86,7 @@ class GP(GPBase):
        if self.likelihood.YYT is None:
            tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
            return -0.5 * np.sum(np.square(tmp))
            #return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
            # return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
        else:
            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
@@ -89,8 +98,8 @@ class GP(GPBase):
        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
        """
        return -0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z

        return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) -
                0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z)

    def _log_likelihood_gradients(self):
        """
@@ -100,13 +109,13 @@ class GP(GPBase):
        """
        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
        """
        Internal helper function for making predictions, does not account
        for normalization or likelihood
        """
        Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
        #KiKx = np.dot(self.Ki, Kx)
        Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
        # KiKx = np.dot(self.Ki, Kx)
        KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
        mu = np.dot(KiKx.T, self.likelihood.Y)
        if full_cov:
@@ -120,20 +129,19 @@ class GP(GPBase):
            debug_this # @UndefinedVariable
        return mu, var

    def predict(self, Xnew, which_parts='all', full_cov=False):
    def predict(self, Xnew, which_parts='all', full_cov=False, **likelihood_args):
        """
        Predict the function(s) at the new point(s) Xnew.
        Arguments
        ---------

        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param which_parts: specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
        :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
        :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim


        If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
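A hedged usage sketch for predict (the regression-model constructor shown is an assumption about the surrounding API; the four return values follow the docstring above):

    import numpy as np
    import GPy

    X = np.linspace(0, 1, 40)[:, None]
    Y = np.sin(10 * X) + 0.1 * np.random.randn(40, 1)
    m = GPy.models.GPRegression(X, Y)            # assumed convenience constructor
    m.ensure_default_constraints()
    m.optimize()
    Xnew = np.linspace(0, 1.2, 200)[:, None]
    mean, var, lower, upper = m.predict(Xnew)    # mean, variance and 95% bounds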
@@ -145,6 +153,52 @@ class GP(GPBase):
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)

        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)

        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
        return mean, var, _025pm, _975pm

    def _raw_predict_single_output(self, _Xnew, output, which_parts='all', full_cov=False,stop=False):
        """
        For a specific output, calls _raw_predict() at the new point(s) _Xnew.
        This function calls _add_output_index(), so _Xnew should not have an index column specifying the output.
        ---------

        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param output: output to predict
        :type output: integer in {0,..., output_dim-1}
        :param which_parts: specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal

        .. Note:: For multiple non-independent outputs models only.
        """
        _Xnew = self._add_output_index(_Xnew, output)
        return self._raw_predict(_Xnew, which_parts=which_parts,full_cov=full_cov, stop=stop)

    def predict_single_output(self, Xnew,output=0, which_parts='all', full_cov=False, likelihood_args=dict()):
        """
        For a specific output, calls predict() at the new point(s) Xnew.
        This function calls _add_output_index(), so Xnew should not have an index column specifying the output.

        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param which_parts: specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
        :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim

        .. Note:: For multiple non-independent outputs models only.
        """
        Xnew = self._add_output_index(Xnew, output)
        return self.predict(Xnew, which_parts=which_parts, full_cov=full_cov, likelihood_args=likelihood_args)

    def getstate(self):
        return GPBase.getstate(self)

    def setstate(self, state):
        GPBase.setstate(self, state)
        self._set_params(self._get_params())
@@ -3,16 +3,22 @@ from .. import kern
from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
import pylab as pb
from GPy.core.model import Model
import warnings
from ..likelihoods import Gaussian, Gaussian_Mixed_Noise

class GPBase(Model):
    """
    Gaussian Process Model for holding shared behaviour between
    sprase_GP and GP models
    """
    Gaussian process base model for holding shared behaviour between
    sparse_GP and GP models, and potentially other models in the future.

    Here we define some functions that are used.
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        if len(X.shape)==1:
            X = X.reshape(-1,1)
            warnings.warn("One dimension output (N,) being reshaped to (N,1)")
        self.X = X
        assert len(self.X.shape) == 2
        assert len(self.X.shape) == 2, "too many dimensions for X input"
        self.num_data, self.input_dim = self.X.shape
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
@@ -29,108 +35,241 @@ class GPBase(Model):
        self._Xscale = np.ones((1, self.input_dim))

        super(GPBase, self).__init__()
        #Model.__init__(self)
        # Model.__init__(self)
        # All leaf nodes should call self._set_params(self._get_params()) at
        # the end

    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):

    def posterior_samples_f(self,X,size=10,which_parts='all'):
        """
        Plot the GP's view of the world, where the data is normalized and the
        likelihood is Gaussian.
        Samples the posterior GP at the points X.

        :param samples: the number of a posteriori samples to plot
        :param which_data: which if the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :param X: The points at which to take the samples.
        :type X: np.ndarray, Nnew x self.input_dim.
        :param size: the number of a posteriori samples to plot.
        :type size: int.
        :param which_parts: which of the kernel functions to plot (additively).
        :type which_parts: 'all', or list of bools.
        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
        :type full_cov: bool.
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=True)
        v = v.reshape(m.size,-1) if len(v.shape)==3 else v
        Ysim = np.random.multivariate_normal(m.flatten(), v, size).T

        return Ysim

    def posterior_samples(self,X,size=10,which_parts='all',noise_model=None):
        """
        Samples the posterior GP at the points X.

        :param X: the points at which to take the samples.
        :type X: np.ndarray, Nnew x self.input_dim.
        :param size: the number of a posteriori samples to plot.
        :type size: int.
        :param which_parts: which of the kernel functions to plot (additively).
        :type which_parts: 'all', or list of bools.
        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
        :type full_cov: bool.
        :param noise_model: for mixed noise likelihood, the noise model to use in the samples.
        :type noise_model: integer.
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts)
        if isinstance(self.likelihood,Gaussian):
            noise_std = np.sqrt(self.likelihood._get_params())
            Ysim += np.random.normal(0,noise_std,Ysim.shape)
        elif isinstance(self.likelihood,Gaussian_Mixed_Noise):
            assert noise_model is not None, "A noise model must be specified."
            noise_std = np.sqrt(self.likelihood._get_params()[noise_model])
            Ysim += np.random.normal(0,noise_std,Ysim.shape)
        else:
            Ysim = self.likelihood.noise_model.samples(Ysim)

        return Ysim
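A short, hedged sketch of the two sampling methods above (m is assumed to be any fitted GPy model exposing them):

    import numpy as np

    Xplot = np.linspace(0, 1, 100)[:, None]
    F = m.posterior_samples_f(Xplot, size=5)   # latent-function samples, one column per draw
    Y = m.posterior_samples(Xplot, size=5)     # samples pushed through the likelihood noise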
    def plot_f(self, *args, **kwargs):
        """
        Plot the GP's view of the world, where the data is normalized and before applying a likelihood.

        This is a convenience function: we simply call self.plot with the
        argument use_raw_predict set True. All args and kwargs are passed on to
        plot.

        see also: gp_base.plot
        """
        kwargs['plot_raw'] = True
        self.plot(*args, **kwargs)
    def plot(self, plot_limits=None, which_data_rows='all',
             which_data_ycols='all', which_parts='all', fixed_inputs=[],
             levels=20, samples=0, fignum=None, ax=None, resolution=None,
             plot_raw=False,
             linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
        """
        Plot the posterior of the GP.
        - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
        - In two dimensions, a contour-plot shows the mean predicted function
        - In higher dimensions, we've no implemented this yet !TODO!
        - In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.

        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions
        """
        if which_data == 'all':
            which_data = slice(None)
        using which_data_rows, which_data_ycols and which_parts

        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
        :type plot_limits: np.array
        :param which_data_rows: which of the training data to plot (default all)
        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
        :param which_data_ycols: when the data has several columns (independent outputs), only plot these
        :type which_data_rows: 'all' or a list of integers
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param levels: number of levels to plot in a contour plot.
        :type levels: int
        :param samples: the number of a posteriori samples to plot
        :type samples: int
        :param fignum: figure to plot on.
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        :type output: integer (first output is 0)
        :param linecol: color of line to plot.
        :type linecol:
        :param fillcol: color of fill
        :param levels: for 2D plotting, the number of contour levels to use. If ax is None, create a new figure
        """
        #deal with optional arguments
        if which_data_rows == 'all':
            which_data_rows = slice(None)
        if which_data_ycols == 'all':
            which_data_ycols = np.arange(self.output_dim)
        if len(which_data_ycols)==0:
            raise ValueError('No data selected for plotting')
        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)

        if self.X.shape[1] == 1:
            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict(Xnew, which_parts=which_parts)
                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
                ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
        #work out what the inputs are for plotting (1D or 2D)
        fixed_dims = np.array([i for i,v in fixed_inputs])
        free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)

        #one dimensional plotting
        if len(free_dims) == 1:

            #define the frame on which to plot
            resolution = resolution or 200
            Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
            Xnew, xmin, xmax = x_frame1D(Xu[:,free_dims], plot_limits=plot_limits)
            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
            Xgrid[:,free_dims] = Xnew
            for i,v in fixed_inputs:
                Xgrid[:,i] = v

            #make a prediction on the frame and plot it
            if plot_raw:
                m, v = self._raw_predict(Xgrid, which_parts=which_parts)
                lower = m - 2*np.sqrt(v)
                upper = m + 2*np.sqrt(v)
                Y = self.likelihood.Y
            else:
                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax)
                for i in range(samples):
                    ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
                ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
                ax.set_xlim(xmin, xmax)
                ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
                ax.set_ylim(ymin, ymax)
                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=False) #Compute the exact mean
                m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=True, num_samples=15000) #Approximate the percentiles
                Y = self.likelihood.data
            for d in which_data_ycols:
                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
                ax.plot(Xu[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)

        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
            #optionally plot some samples
            if samples: #NOTE not tested with fixed_inputs
                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts)
                for yi in Ysim.T:
                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                    #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.

    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None):
        """
        TODO: Docstrings!

        :param levels: for 2D plotting, the number of contour levels to use
        is ax is None, create a new figure
        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)

        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)

        if self.X.shape[1] == 1:

            Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now

            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
            for d in range(m.shape[1]):
                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax)
                ax.plot(Xu[which_data], self.likelihood.data[which_data, d], 'kx', mew=1.5)
            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
            #set the limits of the plot to some sensible values
            ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_xlim(xmin, xmax)
            ax.set_ylim(ymin, ymax)

        elif self.X.shape[1] == 2: # FIXME
        #2D plotting
        elif len(free_dims) == 2:

            #define the frame for plotting on
            resolution = resolution or 50
            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
            Xnew, _, _, xmin, xmax = x_frame2D(Xu[:,free_dims], plot_limits, resolution)
            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
            Xgrid[:,free_dims] = Xnew
            for i,v in fixed_inputs:
                Xgrid[:,i] = v
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
            Yf = self.likelihood.Y.flatten()
            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable

            #predict on the frame and plot
            if plot_raw:
                m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
                Y = self.likelihood.Y
            else:
                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,sampling=False)
                Y = self.likelihood.data
            for d in which_data_ycols:
                m_d = m[:,d].reshape(resolution, resolution).T
                ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)

            #set the limits of the plot to some sensible values
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])

            if samples:
                warnings.warn("Samples are rather difficult to plot for 2D inputs...")

        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
    def getstate(self):
        """
        Get the current state of the class. This is only used to efficiently
        pickle the model. See also self.setstate
        """
        return Model.getstate(self) + [self.X,
                self.num_data,
                self.input_dim,
                self.kern,
                self.likelihood,
                self.output_dim,
                self._Xoffset,
                self._Xscale]

    def setstate(self, state):
        """
        Set the state of the model. Used for efficient pickling
        """
        self._Xscale = state.pop()
        self._Xoffset = state.pop()
        self.output_dim = state.pop()
        self.likelihood = state.pop()
        self.kern = state.pop()
        self.input_dim = state.pop()
        self.num_data = state.pop()
        self.X = state.pop()
        Model.setstate(self, state)

    def log_predictive_density(self, x_test, y_test):
        """
        Calculation of the log predictive density

        .. math::
            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}, \\sigma^{2}_{*})

        :param x_test: test observations (x_{*})
        :type x_test: (Nx1) array
        :param y_test: test observations (y_{*})
        :type y_test: (Nx1) array
        """
        mu_star, var_star = self._raw_predict(x_test)
        return self.likelihood.log_predictive_density(y_test, mu_star, var_star)
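A hedged sketch of scoring held-out data with log_predictive_density (m, x_test and y_test are assumed to exist with the (Nx1) shapes documented above):

    lpd = m.log_predictive_density(x_test, y_test)  # one log-density value per test point
    print lpd.mean()                                # average predictive log-density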
GPy/core/mapping.py (new file): 192 lines

@@ -0,0 +1,192 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

from ..util.plot import Tango, x_frame1D, x_frame2D
from parameterized import Parameterized
import numpy as np
import pylab as pb

class Mapping(Parameterized):
    """
    Base model for shared behavior between models that can act like a mapping.
    """

    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim

        super(Mapping, self).__init__()
        # Model.__init__(self)
        # All leaf nodes should call self._set_params(self._get_params()) at
        # the end

    def f(self, X):
        raise NotImplementedError

    def df_dX(self, dL_df, X):
        """Evaluate derivatives of mapping outputs with respect to inputs.

        :param dL_df: gradient of the objective with respect to the function.
        :type dL_df: ndarray (num_data x output_dim)
        :param X: the input locations where derivatives are to be evaluated.
        :type X: ndarray (num_data x input_dim)
        :returns: matrix containing gradients of the function with respect to the inputs.
        """
        raise NotImplementedError

    def df_dtheta(self, dL_df, X):
        """The gradient of the outputs of the mapping with respect to each of the parameters.

        :param dL_df: gradient of the objective with respect to the function.
        :type dL_df: ndarray (num_data x output_dim)
        :param X: input locations where the function is evaluated.
        :type X: ndarray (num_data x input_dim)
        :returns: Matrix containing gradients with respect to parameters of each output for each input data.
        :rtype: ndarray (num_params length)
        """
        raise NotImplementedError
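To illustrate the interface that subclasses are expected to fill in, here is a hedged sketch of a simple linear mapping (hypothetical, not the GPy.mappings implementation; it also provides the parameter accessors that Parameterized-based models expect):

    import numpy as np
    from GPy.core.mapping import Mapping

    class LinearMapping(Mapping):
        # hypothetical subclass: f(X) = X W, with the entries of W as parameters
        def __init__(self, input_dim, output_dim):
            Mapping.__init__(self, input_dim, output_dim)
            self.W = np.random.randn(input_dim, output_dim)
            self.num_params = self.W.size

        def f(self, X):
            return np.dot(X, self.W)

        def df_dtheta(self, dL_df, X):
            # gradient of sum(dL_df * f(X)) with respect to the flattened W
            return np.dot(X.T, dL_df).flatten()

        def df_dX(self, dL_df, X):
            # gradient of sum(dL_df * f(X)) with respect to X
            return np.dot(dL_df, self.W.T)

        def _get_params(self):
            return self.W.flatten()

        def _set_params(self, x):
            self.W = x.reshape(self.input_dim, self.output_dim)

        def _get_param_names(self):
            return ['W_%i_%i' % (i, j) for i in range(self.input_dim) for j in range(self.output_dim)]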
    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
        """
        Plot the mapping.

        Plots the mapping associated with the model.
        - In one dimension, the function is plotted.
        - In two dimensions, a contour-plot shows the function
        - In higher dimensions, we've not implemented this yet !TODO!

        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions

        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
        :type plot_limits: np.array
        :param which_data: which of the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param levels: number of levels to plot in a contour plot.
        :type levels: int
        :param samples: the number of a posteriori samples to plot
        :type samples: int
        :param fignum: figure to plot on.
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param linecol: color of line to plot.
        :type linecol:
        :param levels: for 2D plotting, the number of contour levels to use. If ax is None, create a new figure

        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)

        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)

        plotdims = self.input_dim - len(fixed_inputs)

        if plotdims == 1:

            Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now

            fixed_dims = np.array([i for i,v in fixed_inputs])
            freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)

            Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
            Xgrid[:,freedim] = Xnew
            for i,v in fixed_inputs:
                Xgrid[:,i] = v

            f = self.predict(Xgrid, which_parts=which_parts)
            for d in range(y.shape[1]):
                ax.plot(Xnew, f[:, d], edgecol=linecol)

        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
            f = self.predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            ax.contour(x, y, f, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])

        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

from GPy.core.model import Model

class Mapping_check_model(Model):
    """
    This is a dummy model class used as a base class for checking that the
    gradients of a given mapping are implemented correctly. It enables
    checkgradient() to be called independently on each mapping.
    """
    def __init__(self, mapping=None, dL_df=None, X=None):
        num_samples = 20
        if mapping==None:
            mapping = GPy.mapping.linear(1, 1)
        if X==None:
            X = np.random.randn(num_samples, mapping.input_dim)
        if dL_df==None:
            dL_df = np.ones((num_samples, mapping.output_dim))

        self.mapping=mapping
        self.X = X
        self.dL_df = dL_df
        self.num_params = self.mapping.num_params
        Model.__init__(self)

    def _get_params(self):
        return self.mapping._get_params()

    def _get_param_names(self):
        return self.mapping._get_param_names()

    def _set_params(self, x):
        self.mapping._set_params(x)

    def log_likelihood(self):
        return (self.dL_df*self.mapping.f(self.X)).sum()

    def _log_likelihood_gradients(self):
        raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class."

class Mapping_check_df_dtheta(Mapping_check_model):
    """This class allows gradient checks for the gradient of a mapping with respect to parameters. """
    def __init__(self, mapping=None, dL_df=None, X=None):
        Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)

    def _log_likelihood_gradients(self):
        return self.mapping.df_dtheta(self.dL_df, self.X)


class Mapping_check_df_dX(Mapping_check_model):
    """This class allows gradient checks for the gradient of a mapping with respect to X. """
    def __init__(self, mapping=None, dL_df=None, X=None):
        Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)

        if dL_df==None:
            dL_df = np.ones((self.X.shape[0],self.mapping.output_dim))
        self.num_params = self.X.shape[0]*self.mapping.input_dim

    def _log_likelihood_gradients(self):
        return self.mapping.df_dX(self.dL_df, self.X).flatten()

    def _get_param_names(self):
        return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]

    def _get_params(self):
        return self.X.flatten()

    def _set_params(self, x):
        self.X=x.reshape(self.X.shape)
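A hedged sketch of running the gradient checkers defined above against such a mapping (checkgrad is inherited from Model; LinearMapping is the hypothetical subclass sketched earlier):

    from GPy.core.mapping import Mapping_check_df_dtheta, Mapping_check_df_dX

    mapping = LinearMapping(input_dim=3, output_dim=2)
    Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=True)  # parameter gradients
    Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=True)      # input gradients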
@@ -1,4 +1,4 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Copyright (c) 2012, 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
@@ -6,49 +6,74 @@ from .. import likelihoods
from ..inference import optimization
from ..util.linalg import jitchol
from GPy.util.misc import opt_wrapper
from parameterised import Parameterised
from parameterized import Parameterized
import multiprocessing as mp
import numpy as np
from GPy.core.domains import POSITIVE, REAL
from numpy.linalg.linalg import LinAlgError
# import numdifftools as ndt

class Model(Parameterised):
class Model(Parameterized):
    _fail_count = 0 # Count of failed optimization steps (see objective)
    _allowed_failures = 10 # number of allowed failures
    def __init__(self):
        Parameterised.__init__(self)
        Parameterized.__init__(self)
        self.priors = None
        self.optimization_runs = []
        self.sampling_runs = []
        self.preferred_optimizer = 'scg'
        # self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
    def _get_params(self):
        raise NotImplementedError, "this needs to be implemented to use the Model class"
    def _set_params(self, x):
        raise NotImplementedError, "this needs to be implemented to use the Model class"
    def log_likelihood(self):
        raise NotImplementedError, "this needs to be implemented to use the Model class"
        raise NotImplementedError, "this needs to be implemented to use the model class"
    def _log_likelihood_gradients(self):
        raise NotImplementedError, "this needs to be implemented to use the Model class"
        raise NotImplementedError, "this needs to be implemented to use the model class"

    def getstate(self):
        """
        Get the current state of the class.
        Inherited from Parameterized, so add those parameters to the state

        :return: list of states from the model.

        """
        return Parameterized.getstate(self) + \
            [self.priors, self.optimization_runs,
             self.sampling_runs, self.preferred_optimizer]

    def setstate(self, state):
        """
        set state from previous call to getstate
        call Parameterized with the rest of the state

        :param state: the state of the model.
        :type state: list as returned from getstate.

        """
        self.preferred_optimizer = state.pop()
        self.sampling_runs = state.pop()
        self.optimization_runs = state.pop()
        self.priors = state.pop()
        Parameterized.setstate(self, state)

    def set_prior(self, regexp, what):
        """
        Sets priors on the Model parameters.

        Arguments
        ---------
        regexp -- string, regexp, or integer array
        what -- instance of a Prior class
        Sets priors on the model parameters.

        Notes
        -----
        Asserts that the Prior is suitable for the constraint. If the
        **Notes**

        Asserts that the prior is suitable for the constraint. If the
        wrong constraint is in place, an error is raised. If no
        constraint is in place, one is added (warning printed).

        For tied parameters, the Prior will only be "counted" once, thus
        a Prior object is only inserted on the first tied index
        For tied parameters, the prior will only be "counted" once, thus
        a prior object is only inserted on the first tied index

        :param regexp: regular expression of parameters on which priors need to be set.
        :type param: string, regexp, or integer array
        :param what: prior to set on parameter.
        :type what: GPy.core.Prior type

        """
        if self.priors is None:
            self.priors = [None for i in range(self._get_params().size)]
@@ -58,12 +83,12 @@ class Model(Parameterised):
        # check tied situation
        tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))]
        if len(tie_partial_matches):
            raise ValueError, "cannot place Prior across partial ties"
            raise ValueError, "cannot place prior across partial ties"
        tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ]
        if len(tie_matches) > 1:
            raise ValueError, "cannot place Prior across multiple ties"
            raise ValueError, "cannot place prior across multiple ties"
        elif len(tie_matches) == 1:
            which = which[:1] # just place a Prior object on the first parameter
            which = which[:1] # just place a prior object on the first parameter


        # check constraints are okay
@@ -75,7 +100,7 @@ class Model(Parameterised):
        else:
            constrained_positive_indices = np.zeros(shape=(0,))
        bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices)
        assert not np.any(which[:, None] == bad_constraints), "constraint and Prior incompatible"
        assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible"
        unconst = np.setdiff1d(which, constrained_positive_indices)
        if len(unconst):
            print "Warning: constraining parameters to be positive:"
@@ -83,17 +108,22 @@ class Model(Parameterised):
            print '\n'
            self.constrain_positive(unconst)
        elif what.domain is REAL:
            assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and Prior incompatible"
            assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible"
        else:
            raise ValueError, "Prior not recognised"
            raise ValueError, "prior not recognised"

        # store the Prior in a local list
        # store the prior in a local list
        for w in which:
            self.priors[w] = what

    def get_gradient(self, name, return_names=False):
        """
        Get Model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
        Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.

        :param name: the name of parameters required (as a regular expression).
        :type name: regular expression
        :param return_names: whether or not to return the names matched (default False)
        :type return_names: bool
        """
        matches = self.grep_param_names(name)
        if len(matches):
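A hedged usage sketch for set_prior (the Gamma prior class is an assumption about GPy.priors; the regular expression matches any parameter whose name contains 'lengthscale'):

    import GPy

    m.ensure_default_constraints()       # lengthscales get a positivity constraint
    prior = GPy.priors.Gamma(1., 1.)     # assumed prior class living on the POSITIVE domain
    m.set_prior('.*lengthscale', prior)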
@@ -133,14 +163,14 @@ class Model(Parameterised):

    def randomize(self):
        """
        Randomize the Model.
        Make this draw from the Prior if one exists, else draw from N(0,1)
        Randomize the model.
        Make this draw from the prior if one exists, else draw from N(0,1)
        """
        # first take care of all parameters (from N(0,1))
        x = self._get_params_transformed()
        x = np.random.randn(x.size)
        self._set_params_transformed(x)
        # now draw from Prior where possible
        # now draw from prior where possible
        x = self._get_params()
        if self.priors is not None:
            [np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
@@ -150,25 +180,34 @@ class Model(Parameterised):

    def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
        """
        Perform random restarts of the Model, and set the Model to the best
        Perform random restarts of the model, and set the model to the best
        seen solution.

        If the robust flag is set, exceptions raised during optimizations will
        be handled silently. If _all_ runs fail, the Model is reset to the
        be handled silently. If _all_ runs fail, the model is reset to the
        existing parameter values.

        Notes
        -----
        **kwargs are passed to the optimizer. They can be:
        :max_f_eval: maximum number of function evaluations
        :messages: whether to display during optimisation
        :verbose: whether to show information about the current restart
        :parallel: whether to run each restart as a separate process. It relies on the multiprocessing module.
        :num_processes: number of workers in the multiprocessing pool
        **Notes**

        .. Note:: If num_processes is None, the number of workers in the multiprocessing pool is automatically
            set to the number of processors on the current machine.
        :param num_restarts: number of restarts to use (default 10)
        :type num_restarts: int
        :param robust: whether to handle exceptions silently or not (default False)
        :type robust: bool
        :param parallel: whether to run each restart as a separate process. It relies on the multiprocessing module.
        :type parallel: bool
        :param num_processes: number of workers in the multiprocessing pool
        :type num_processes: int

        \*\*kwargs are passed to the optimizer. They can be:

        :param max_f_eval: maximum number of function evaluations
        :type max_f_eval: int
        :param max_iters: maximum number of iterations
        :type max_iters: int
        :param messages: whether to display during optimisation
        :type messages: bool

        .. note:: If num_processes is None, the number of workers in the multiprocessing pool is automatically set to the number of processors on the current machine.

        """
        initial_parameters = self._get_params_transformed()
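A hedged example of the restart interface documented above (m is assumed to be any GPy model):

    m.ensure_default_constraints()
    m.optimize_restarts(num_restarts=5, robust=True, parallel=False, max_f_eval=1000)
    print m        # the best of the five runs is now set on the model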
@@ -213,9 +252,14 @@ class Model(Parameterised):

    def ensure_default_constraints(self):
        """
        Ensure that any variables which should clearly be positive have been constrained somehow.
        Ensure that any variables which should clearly be positive
        have been constrained somehow. The method performs a regular
        expression search on parameter names looking for the terms
        'variance', 'lengthscale', 'precision' and 'kappa'. If any of
        these terms are present in the name the parameter is
        constrained positive.
        """
        positive_strings = ['variance', 'lengthscale', 'precision', 'kappa']
        positive_strings = ['variance', 'lengthscale', 'precision', 'decay', 'kappa']
        # param_names = self._get_param_names()
        currently_constrained = self.all_constrained_indices()
        to_make_positive = []
@@ -228,11 +272,15 @@ class Model(Parameterised):

    def objective_function(self, x):
        """
        The objective function passed to the optimizer. It combines the likelihood and the priors.
        The objective function passed to the optimizer. It combines
        the likelihood and the priors.

        Failures are handled robustly. The algorithm will try several times to
        return the objective, and will raise the original exception if
        the objective cannot be computed.

        :param x: the parameters of the model.
        :type x: np.array
        """
        try:
            self._set_params_transformed(x)
@@ -249,39 +297,53 @@ class Model(Parameterised):
        Gets the gradients from the likelihood and the priors.

        Failures are handled robustly. The algorithm will try several times to
        return the objective, and will raise the original exception if
        return the gradients, and will raise the original exception if
        the objective cannot be computed.

        :param x: the parameters of the model.
        :type x: np.array
        """
        try:
            self._set_params_transformed(x)
            obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
            self._fail_count = 0
        except (LinAlgError, ZeroDivisionError, ValueError) as e:
            if self._fail_count >= self._allowed_failures:
                raise e
            self._fail_count += 1
            obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
            obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
        return obj_grads

    def objective_and_gradients(self, x):
        """
        Compute the objective function of the model and the gradient of the model at the point given by x.

        :param x: the point at which gradients are to be computed.
        :type x: np.array
        """

        try:
            self._set_params_transformed(x)
            obj_f = -self.log_likelihood() - self.log_prior()
            self._fail_count = 0
            obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
        except (LinAlgError, ZeroDivisionError, ValueError) as e:
            if self._fail_count >= self._allowed_failures:
                raise e
            self._fail_count += 1
            obj_f = np.inf
            obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
            obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
        return obj_f, obj_grads

    def optimize(self, optimizer=None, start=None, **kwargs):
        """
        Optimize the Model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
        kwargs are passed to the optimizer. They can be:

        :max_f_eval: maximum number of function evaluations
        :param max_f_eval: maximum number of function evaluations
        :type max_f_eval: int
        :messages: whether to display during optimisation
        :type messages: bool
        :param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
        :type optimizer: string TODO: valid strings?
        """
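A hedged sketch of a single optimization run with the keyword arguments listed above ('scg' is the default preferred optimizer set in __init__):

    m.optimize(optimizer='scg', messages=True, max_f_eval=2000)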
@@ -293,7 +355,9 @@ class Model(Parameterised):

        optimizer = optimization.get_optimizer(optimizer)
        opt = optimizer(start, model=self, **kwargs)

        opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)

        self.optimization_runs.append(opt)

        self._set_params_transformed(opt.x_opt)
@@ -305,14 +369,14 @@ class Model(Parameterised):
        self.optimization_runs.append(sgd)

    def Laplace_covariance(self):
        """return the covariance matric of a Laplace approximatino at the current (stationary) point"""
        # TODO add in the Prior contributions for MAP estimation
        """return the covariance matrix of a Laplace approximation at the current (stationary) point."""
        # TODO add in the prior contributions for MAP estimation
        # TODO fix the hessian for tied, constrained and fixed components
        if hasattr(self, 'log_likelihood_hessian'):
            A = -self.log_likelihood_hessian()

        else:
            print "numerically calculating hessian. please be patient!"
            print "numerically calculating Hessian. please be patient!"
            x = self._get_params()
            def f(x):
                self._set_params(x)
@ -326,8 +390,8 @@ class Model(Parameterised):
|
|||
return A
|
||||
|
||||
def Laplace_evidence(self):
|
||||
"""Returns an estiamte of the Model evidence based on the Laplace approximation.
|
||||
Uses a numerical estimate of the hessian if none is available analytically"""
|
||||
"""Returns an estiamte of the model evidence based on the Laplace approximation.
|
||||
Uses a numerical estimate of the Hessian if none is available analytically."""
|
||||
A = self.Laplace_covariance()
|
||||
try:
|
||||
hld = np.sum(np.log(np.diag(jitchol(A)[0])))
|
||||
|
|
@ -336,39 +400,47 @@ class Model(Parameterised):
|
|||
return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld
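The hld term above is half the log-determinant of A taken from its Cholesky factor; a standalone NumPy check of that identity (not GPy code):

import numpy as np
A = np.array([[2.0, 0.3], [0.3, 1.5]])                   # any positive-definite matrix
hld = np.sum(np.log(np.diag(np.linalg.cholesky(A))))     # as computed via jitchol above
assert np.isclose(hld, 0.5 * np.log(np.linalg.det(A)))   # sum(log(diag(chol(A)))) == 0.5*log|A|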
|
||||
|
||||
def __str__(self):
|
||||
s = Parameterised.__str__(self).split('\n')
|
||||
s = Parameterized.__str__(self).split('\n')
|
||||
#def __str__(self, names=None):
|
||||
# if names is None:
|
||||
# names = self._get_print_names()
|
||||
#s = Parameterized.__str__(self, names=names).split('\n')
|
||||
# add priors to the string
|
||||
if self.priors is not None:
|
||||
strs = [str(p) if p is not None else '' for p in self.priors]
|
||||
else:
|
||||
strs = [''] * len(self._get_params())
|
||||
# strs = [''] * len(self._get_param_names())
|
||||
# name_indices = self.grep_param_names("|".join(names))
|
||||
# strs = np.array(strs)[name_indices]
|
||||
width = np.array(max([len(p) for p in strs] + [5])) + 4
|
||||
|
||||
log_like = self.log_likelihood()
|
||||
log_prior = self.log_prior()
|
||||
obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like)
|
||||
if len(''.join(strs)) != 0:
|
||||
obj_funct += ', Log Prior: {0:.3e}, LL+Prior = {0:.3e}'.format(log_prior, log_like + log_prior)
|
||||
obj_funct += ', Log prior: {0:.3e}, LL+prior = {1:.3e}'.format(log_prior, log_like + log_prior)
|
||||
obj_funct += '\n\n'
|
||||
s[0] = obj_funct + s[0]
|
||||
s[0] += "|{h:^{col}}".format(h='Prior', col=width)
|
||||
s[0] += "|{h:^{col}}".format(h='prior', col=width)
|
||||
s[1] += '-' * (width + 1)
|
||||
|
||||
for p in range(2, len(strs) + 2):
|
||||
s[p] += '|{Prior:^{width}}'.format(Prior=strs[p - 2], width=width)
|
||||
s[p] += '|{prior:^{width}}'.format(prior=strs[p - 2], width=width)
|
||||
|
||||
return '\n'.join(s)
|
||||
|
||||
|
||||
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
|
||||
"""
|
||||
Check the gradient of the Model by comparing to a numerical estimate.
|
||||
If the verbose flag is passed, invividual components are tested (and printed)
|
||||
Check the gradient of the model by comparing to a numerical
|
||||
estimate. If the verbose flag is passed, individual
|
||||
components are tested (and printed)
|
||||
|
||||
:param verbose: If True, print a "full" checking of each parameter
|
||||
:type verbose: bool
|
||||
:param step: The size of the step around which to linearise the objective
|
||||
:type step: float (defaul 1e-6)
|
||||
:type step: float (default 1e-6)
|
||||
:param tolerance: the tolerance allowed (see note)
|
||||
:type tolerance: float (default 1e-3)
|
||||
|
||||
|
|
@ -381,7 +453,12 @@ class Model(Parameterised):
|
|||
|
||||
if not verbose:
|
||||
# just check the global ratio
|
||||
dx = step * np.sign(np.random.uniform(-1, 1, x.size))
|
||||
|
||||
#choose a random direction to find the linear approximation in
|
||||
if x.size==2:
|
||||
dx = step * np.ones(2) # random direction for 2 parameters can fail due to symmetry
|
||||
else:
|
||||
dx = step * np.sign(np.random.uniform(-1, 1, x.size))
|
||||
|
||||
# evaluate around the point x
|
||||
f1, g1 = self.objective_and_gradients(x + dx)
|
||||
|
|
@ -389,9 +466,9 @@ class Model(Parameterised):
|
|||
gradient = self.objective_function_gradients(x)
|
||||
|
||||
numerical_gradient = (f1 - f2) / (2 * dx)
|
||||
global_ratio = (f1 - f2) / (2 * np.dot(dx, gradient))
|
||||
global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient)))
|
||||
|
||||
return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() - 1) < tolerance
|
||||
return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
|
||||
else:
|
||||
# check the gradient of each parameter individually, and do some pretty printing
|
||||
try:
|
||||
|
|
@ -429,7 +506,7 @@ class Model(Parameterised):
|
|||
gradient = self.objective_function_gradients(x)[i]
|
||||
|
||||
numerical_gradient = (f1 - f2) / (2 * step)
|
||||
ratio = (f1 - f2) / (2 * step * gradient)
|
||||
ratio = (f1 - f2) / (2 * step * np.where(gradient==0, 1e-312, gradient))
|
||||
difference = np.abs((f1 - f2) / 2 / step - gradient)
|
||||
|
||||
if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
|
||||
|
|
@ -445,43 +522,40 @@ class Model(Parameterised):
|
|||
|
||||
def input_sensitivity(self):
|
||||
"""
|
||||
return an array describing the sesitivity of the Model to each input
|
||||
return an array describing the sensitivity of the model to each input
|
||||
|
||||
NB. Right now, we're basing this on the lengthscales (or
|
||||
variances) of the kernel. TODO: proper sensitivity analysis
|
||||
where we integrate across the Model inputs and evaluate the
|
||||
effect on the variance of the Model output. """
|
||||
where we integrate across the model inputs and evaluate the
|
||||
effect on the variance of the model output. """
|
||||
|
||||
if not hasattr(self, 'kern'):
|
||||
raise ValueError, "this Model has no kernel"
|
||||
raise ValueError, "this model has no kernel"
|
||||
|
||||
k = [p for p in self.kern.parts if p.name in ['rbf', 'linear']]
|
||||
k = [p for p in self.kern.parts if p.name in ['rbf', 'linear', 'rbf_inv']]
|
||||
if (not len(k) == 1) or (not k[0].ARD):
|
||||
raise ValueError, "cannot determine sensitivity for this kernel"
|
||||
k = k[0]
|
||||
|
||||
if k.name == 'rbf':
|
||||
return k.lengthscale
|
||||
return 1. / k.lengthscale
|
||||
elif k.name == 'rbf_inv':
|
||||
return k.inv_lengthscale
|
||||
elif k.name == 'linear':
|
||||
return 1. / k.variances
|
||||
return k.variances
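A hedged usage sketch of input_sensitivity; the lowercase GPy.kern.rbf constructor with ARD=True and GPy.models.GPRegression are assumed names, not confirmed by this diff:

import numpy as np
import GPy

X = np.random.rand(50, 3)
Y = np.sin(4. * X[:, :1]) + 0.01 * np.random.randn(50, 1)   # only the first input matters
k = GPy.kern.rbf(3, ARD=True)                                # assumed ARD kernel constructor
m = GPy.models.GPRegression(X, Y, kernel=k)                  # assumed model class
m.optimize()
print m.input_sensitivity()   # one value per input dimension; larger means more relevant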
|
||||
|
||||
|
||||
def pseudo_EM(self, epsilon=.1, **kwargs):
|
||||
def pseudo_EM(self, stop_crit=.1, **kwargs):
|
||||
"""
|
||||
TODO: Should this not be in the GP class?
|
||||
EM - like algorithm for Expectation Propagation and Laplace approximation
|
||||
|
||||
kwargs are passed to the optimize function. They can be:
|
||||
|
||||
:epsilon: convergence criterion
|
||||
:max_f_eval: maximum number of function evaluations
|
||||
:messages: whether to display during optimisation
|
||||
:param optimzer: whice optimizer to use (defaults to self.preferred optimizer)
|
||||
:type optimzer: string TODO: valid strings?
|
||||
:param stop_crit: convergence criterion
|
||||
:type stop_crit: float
|
||||
|
||||
.. Note: kwargs are passed to update_likelihood and optimize functions.
|
||||
"""
|
||||
assert isinstance(self.likelihood, likelihoods.EP), "pseudo_EM is only available for EP likelihoods"
|
||||
ll_change = epsilon + 1.
|
||||
assert isinstance(self.likelihood, (likelihoods.EP, likelihoods.EP_Mixed_Noise, likelihoods.Laplace)), "pseudo_EM is only available for approximate likelihoods"
|
||||
ll_change = stop_crit + 1.
|
||||
iteration = 0
|
||||
last_ll = -np.inf
|
||||
|
||||
|
|
@ -489,10 +563,25 @@ class Model(Parameterised):
|
|||
alpha = 0
|
||||
stop = False
|
||||
|
||||
#Handle **kwargs
|
||||
ep_args = {}
|
||||
for arg in kwargs.keys():
|
||||
if arg in ('epsilon','power_ep'):
|
||||
ep_args[arg] = kwargs[arg]
|
||||
del kwargs[arg]
|
||||
|
||||
while not stop:
|
||||
last_approximation = self.likelihood.copy()
|
||||
last_params = self._get_params()
|
||||
self.update_likelihood_approximation()
|
||||
if len(ep_args) == 2:
|
||||
self.update_likelihood_approximation(epsilon=ep_args['epsilon'],power_ep=ep_args['power_ep'])
|
||||
elif len(ep_args) == 1:
|
||||
if ep_args.keys()[0] == 'epsilon':
|
||||
self.update_likelihood_approximation(epsilon=ep_args['epsilon'])
|
||||
elif ep_args.keys()[0] == 'power_ep':
|
||||
self.update_likelihood_approximation(power_ep=ep_args['power_ep'])
|
||||
else:
|
||||
self.update_likelihood_approximation()
|
||||
new_ll = self.log_likelihood()
|
||||
ll_change = new_ll - last_ll
|
||||
|
||||
|
|
@ -504,7 +593,7 @@ class Model(Parameterised):
|
|||
else:
|
||||
self.optimize(**kwargs)
|
||||
last_ll = self.log_likelihood()
|
||||
if ll_change < epsilon:
|
||||
if ll_change < stop_crit:
|
||||
stop = True
|
||||
iteration += 1
|
||||
if stop:
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import cPickle
|
|||
import warnings
|
||||
import transformations
|
||||
|
||||
class Parameterised(object):
|
||||
class Parameterized(object):
|
||||
def __init__(self):
|
||||
"""
|
||||
This is the base class for model and kernel. Mostly just handles tieing and constraining of parameters
|
||||
|
|
@ -20,55 +20,66 @@ class Parameterised(object):
|
|||
self.constrained_indices = []
|
||||
self.constraints = []
|
||||
|
||||
def pickle(self, filename, protocol= -1):
|
||||
f = file(filename, 'w')
|
||||
cPickle.dump(self, f, protocol)
|
||||
f.close()
|
||||
def _get_params(self):
|
||||
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||
def _set_params(self, x):
|
||||
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||
|
||||
def _get_param_names(self):
|
||||
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||
#def _get_print_names(self):
|
||||
# """ Override for which names to print out, when using print m """
|
||||
# return self._get_param_names()
|
||||
|
||||
def pickle(self, filename, protocol=None):
|
||||
if protocol is None:
|
||||
if self._has_get_set_state():
|
||||
protocol = 0
|
||||
else:
|
||||
protocol = -1
|
||||
with open(filename, 'w') as f:
|
||||
cPickle.dump(self, f, protocol)
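A sketch of the round trip this method enables, assuming a model m already exists; reading back is plain cPickle:

import cPickle
m.pickle('my_model.pickle')               # write via the method above
with open('my_model.pickle', 'r') as f:   # the file was written in mode 'w'
    m2 = cPickle.load(f)                  # setstate/__setstate__ restore the caches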
|
||||
|
||||
def copy(self):
|
||||
"""Returns a (deep) copy of the current model """
|
||||
return copy.deepcopy(self)
|
||||
|
||||
@property
|
||||
def params(self):
|
||||
def __getstate__(self):
|
||||
if self._has_get_set_state():
|
||||
return self.getstate()
|
||||
return self.__dict__
|
||||
|
||||
def __setstate__(self, state):
|
||||
if self._has_get_set_state():
|
||||
self.setstate(state) # set state
|
||||
self._set_params(self._get_params()) # restore all values
|
||||
return
|
||||
self.__dict__ = state
|
||||
|
||||
def _has_get_set_state(self):
|
||||
return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__)
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Returns a **copy** of parameters in non transformed space
|
||||
Get the current state of the class,
|
||||
here just all the indices, rest can get recomputed
|
||||
For inheriting from Parameterized:
|
||||
|
||||
:see_also: :py:func:`GPy.core.Parameterised.params_transformed`
|
||||
Always append the state of the inherited object
|
||||
and call down to the inherited object in setstate!!
|
||||
"""
|
||||
return self._get_params()
|
||||
return [self.tied_indices,
|
||||
self.fixed_indices,
|
||||
self.fixed_values,
|
||||
self.constrained_indices,
|
||||
self.constraints]
|
||||
|
||||
@params.setter
|
||||
def params(self, params):
|
||||
self._set_params(params)
|
||||
|
||||
@property
|
||||
def params_transformed(self):
|
||||
"""
|
||||
Returns a **copy** of parameters in transformed space
|
||||
|
||||
:see_also: :py:func:`GPy.core.Parameterised.params`
|
||||
"""
|
||||
return self._get_params_transformed()
|
||||
|
||||
@params_transformed.setter
|
||||
def params_transformed(self, params):
|
||||
self._set_params_transformed(params)
|
||||
|
||||
_get_set_deprecation = """get and set methods wont be available at next minor release
|
||||
in the next releases you will get and set with following syntax:
|
||||
Assume m is a model class:
|
||||
print m['var'] # > prints all parameters matching 'var'
|
||||
m['var'] = 2. # > sets all parameters matching 'var' to 2.
|
||||
m['var'] = <array-like> # > sets parameters matching 'var' to <array-like>
|
||||
"""
|
||||
def get(self, regexp):
|
||||
warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2)
|
||||
return self[regexp]
|
||||
|
||||
def set(self, regexp, val):
|
||||
warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2)
|
||||
self[regexp] = val
|
||||
def setstate(self, state):
|
||||
self.constraints = state.pop()
|
||||
self.constrained_indices = state.pop()
|
||||
self.fixed_values = state.pop()
|
||||
self.fixed_indices = state.pop()
|
||||
self.tied_indices = state.pop()
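An illustrative subclass (not part of GPy) following the convention asked for in the getstate docstring: append your own state after the parent's and pop it off in reverse order before delegating:

class MyParameterized(Parameterized):     # illustrative only; the attributes are made up
    def getstate(self):
        # parent state first, own attributes appended at the end
        return Parameterized.getstate(self) + [self.my_cache, self.my_flag]

    def setstate(self, state):
        # pop own attributes in reverse order, then hand the rest down
        self.my_flag = state.pop()
        self.my_cache = state.pop()
        Parameterized.setstate(self, state)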
|
||||
|
||||
def __getitem__(self, regexp, return_names=False):
|
||||
"""
|
||||
|
|
@ -95,13 +106,16 @@ class Parameterised(object):
|
|||
if len(matches):
|
||||
val = np.array(val)
|
||||
assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches))
|
||||
x = self.params
|
||||
x = self._get_params()
|
||||
x[matches] = val
|
||||
self.params = x
|
||||
self._set_params(x)
|
||||
else:
|
||||
raise AttributeError, "no parameter matches %s" % name
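A short sketch of the regexp-based indexing implemented by __getitem__/__setitem__; the parameter names are made up for illustration:

print m['.*lengthscale']      # print every parameter whose name matches the pattern
m['.*lengthscale'] = 2.       # set all matches to the same value
m['noise_variance'] = 0.01    # or address a single parameter by (part of) its name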
|
||||
|
||||
def tie_params(self, regexp):
|
||||
"""
|
||||
Tie (all!) parameters matching the regular expression `regexp`.
|
||||
"""
|
||||
matches = self.grep_param_names(regexp)
|
||||
assert matches.size > 0, "need at least something to tie together"
|
||||
if len(self.tied_indices):
|
||||
|
|
@ -154,7 +168,7 @@ class Parameterised(object):
|
|||
return len(self._get_params()) - removed
|
||||
|
||||
def unconstrain(self, regexp):
|
||||
"""Unconstrain matching parameters. does not untie parameters"""
|
||||
"""Unconstrain matching parameters. Does not untie parameters"""
|
||||
matches = self.grep_param_names(regexp)
|
||||
|
||||
# transformed constraints:
|
||||
|
|
@ -181,7 +195,7 @@ class Parameterised(object):
|
|||
|
||||
def constrain_negative(self, regexp):
|
||||
""" Set negative constraints. """
|
||||
self.constrain(regexp, transformations.negative_exponent())
|
||||
self.constrain(regexp, transformations.negative_logexp())
|
||||
|
||||
def constrain_positive(self, regexp):
|
||||
""" Set positive constraints. """
|
||||
|
|
@ -217,16 +231,19 @@ class Parameterised(object):
|
|||
|
||||
def constrain_fixed(self, regexp, value=None):
|
||||
"""
|
||||
Arguments
|
||||
---------
|
||||
:param regexp: np.array(dtype=int), or regular expression object or string
|
||||
:param value: a float to fix the matched values to. If the value is not specified,
|
||||
the parameter is fixed to the current value
|
||||
|
||||
Notes
|
||||
-----
|
||||
:param regexp: which parameters need to be fixed.
|
||||
:type regexp: ndarray(dtype=int) or regular expression object or string
|
||||
:param value: the vlaue to fix the parameters to. If the value is not specified,
|
||||
the parameter is fixed to the current value
|
||||
:type value: float
|
||||
|
||||
**Notes**
|
||||
|
||||
Fixing a parameter which is tied to another, or constrained in some way will result in an error.
|
||||
To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes
|
||||
|
||||
To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes.
|
||||
|
||||
"""
|
||||
matches = self.grep_param_names(regexp)
|
||||
overlap = set(matches).intersection(set(self.all_constrained_indices()))
|
||||
|
|
@ -321,19 +338,30 @@ class Parameterised(object):
|
|||
n = [nn for i, nn in enumerate(n) if not i in remove]
|
||||
return n
|
||||
|
||||
#@property
|
||||
#def all(self):
|
||||
# return self.__str__(self._get_param_names())
|
||||
|
||||
|
||||
#def __str__(self, names=None, nw=30):
|
||||
def __str__(self, nw=30):
|
||||
"""
|
||||
Return a string describing the parameter names and their ties and constraints
|
||||
"""
|
||||
names = self._get_param_names()
|
||||
#if names is None:
|
||||
# names = self._get_print_names()
|
||||
#name_indices = self.grep_param_names("|".join(names))
|
||||
N = len(names)
|
||||
|
||||
if not N:
|
||||
return "This object has no free parameters."
|
||||
header = ['Name', 'Value', 'Constraints', 'Ties']
|
||||
values = self._get_params() # map(str,self._get_params())
|
||||
#values = self._get_params()[name_indices] # map(str,self._get_params())
|
||||
# sort out the constraints
|
||||
constraints = [''] * len(names)
|
||||
#constraints = [''] * len(self._get_param_names())
|
||||
for i, t in zip(self.constrained_indices, self.constraints):
|
||||
for ii in i:
|
||||
constraints[ii] = t.__str__()
|
||||
|
|
@ -346,7 +374,10 @@ class Parameterised(object):
|
|||
for j in tie:
|
||||
ties[j] = '(' + str(i) + ')'
|
||||
|
||||
values = ['%.4f' % float(v) for v in values]
|
||||
if values.size == 1:
|
||||
values = ['%.4f' %float(values)]
|
||||
else:
|
||||
values = ['%.4f' % float(v) for v in values]
|
||||
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
|
||||
max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
|
||||
max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
|
||||
|
|
@ -361,3 +392,77 @@ class Parameterised(object):
|
|||
|
||||
|
||||
return ('\n'.join([header_string[0], separator] + param_string)) + '\n'
|
||||
|
||||
def grep_model(self,regexp):
|
||||
regexp_indices = self.grep_param_names(regexp)
|
||||
all_names = self._get_param_names()
|
||||
|
||||
names = [all_names[pj] for pj in regexp_indices]
|
||||
N = len(names)
|
||||
|
||||
if not N:
|
||||
return "Match not found."
|
||||
|
||||
header = ['Name', 'Value', 'Constraints', 'Ties']
|
||||
all_values = self._get_params()
|
||||
values = np.array([all_values[pj] for pj in regexp_indices])
|
||||
constraints = [''] * len(names)
|
||||
|
||||
_constrained_indices,aux = self._pick_elements(regexp_indices,self.constrained_indices)
|
||||
_constraints = [self.constraints[pj] for pj in aux]
|
||||
|
||||
for i, t in zip(_constrained_indices, _constraints):
|
||||
for ii in i:
|
||||
iii = regexp_indices.tolist().index(ii)
|
||||
constraints[iii] = t.__str__()
|
||||
|
||||
_fixed_indices,aux = self._pick_elements(regexp_indices,self.fixed_indices)
|
||||
for i in _fixed_indices:
|
||||
for ii in i:
|
||||
iii = regexp_indices.tolist().index(ii)
|
||||
constraints[ii] = 'Fixed'
|
||||
|
||||
_tied_indices,aux = self._pick_elements(regexp_indices,self.tied_indices)
|
||||
ties = [''] * len(names)
|
||||
for i,ti in zip(_tied_indices,aux):
|
||||
for ii in i:
|
||||
iii = regexp_indices.tolist().index(ii)
|
||||
ties[iii] = '(' + str(ti) + ')'
|
||||
|
||||
if values.size == 1:
|
||||
values = ['%.4f' %float(values)]
|
||||
else:
|
||||
values = ['%.4f' % float(v) for v in values]
|
||||
|
||||
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
|
||||
max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
|
||||
max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
|
||||
max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
|
||||
cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
|
||||
|
||||
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
|
||||
header_string = map(lambda x: '|'.join(x), [header_string])
|
||||
separator = '-' * len(header_string[0])
|
||||
param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
|
||||
|
||||
print header_string[0]
|
||||
print separator
|
||||
for string in param_string:
|
||||
print string
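Illustrative calls (the name patterns are made up):

m.grep_model('rbf')              # pretty-print only the rbf kernel parameters
m.grep_model('.*lengthscale')    # or anything matching a regular expression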
|
||||
|
||||
def _pick_elements(self,regexp_ind,array_list):
|
||||
"""Removes from array_list the elements different from regexp_ind"""
|
||||
new_array_list = [] #New list with elements matching regexp_ind
|
||||
array_indices = [] #Indices that matches the arrays in new_array_list and array_list
|
||||
|
||||
array_index = 0
|
||||
for array in array_list:
|
||||
_new = []
|
||||
for ai in array:
|
||||
if ai in regexp_ind:
|
||||
_new.append(ai)
|
||||
if len(_new):
|
||||
new_array_list.append(np.array(_new))
|
||||
array_indices.append(array_index)
|
||||
array_index += 1
|
||||
return new_array_list, array_indices
|
||||
|
|
@ -5,7 +5,7 @@ import numpy as np
|
|||
import pylab as pb
|
||||
from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri
|
||||
from scipy import linalg
|
||||
from ..likelihoods import Gaussian
|
||||
from ..likelihoods import Gaussian, EP,EP_Mixed_Noise
|
||||
from gp_base import GPBase
|
||||
|
||||
class SparseGP(GPBase):
|
||||
|
|
@ -16,16 +16,17 @@ class SparseGP(GPBase):
|
|||
:type X: np.ndarray (num_data x input_dim)
|
||||
:param likelihood: a likelihood instance, containing the observed data
|
||||
:type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
|
||||
:param kernel : the kernel (covariance function). See link kernels
|
||||
:param kernel: the kernel (covariance function). See link kernels
|
||||
:type kernel: a GPy.kern.kern instance
|
||||
:param X_variance: The uncertainty in the measurements of X (Gaussian variance)
|
||||
:type X_variance: np.ndarray (num_data x input_dim) | None
|
||||
:param Z: inducing inputs (optional, see note)
|
||||
:type Z: np.ndarray (num_inducing x input_dim) | None
|
||||
:param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None)
|
||||
:param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
|
||||
:type num_inducing: int
|
||||
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
|
||||
:param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
|
||||
:type normalize_(X|Y): bool
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
|
||||
|
|
@ -33,10 +34,10 @@ class SparseGP(GPBase):
|
|||
|
||||
self.Z = Z
|
||||
self.num_inducing = Z.shape[0]
|
||||
self.likelihood = likelihood
|
||||
|
||||
if X_variance is None:
|
||||
self.has_uncertain_inputs = False
|
||||
self.X_variance = None
|
||||
else:
|
||||
assert X_variance.shape == X.shape
|
||||
self.has_uncertain_inputs = True
|
||||
|
|
@ -49,6 +50,8 @@ class SparseGP(GPBase):
|
|||
if self.has_uncertain_inputs:
|
||||
self.X_variance /= np.square(self._Xscale)
|
||||
|
||||
self._const_jitter = None
|
||||
|
||||
def _compute_kernel_matrices(self):
|
||||
# kernel computations, using BGPLVM notation
|
||||
self.Kmm = self.kern.K(self.Z)
|
||||
|
|
@ -62,11 +65,13 @@ class SparseGP(GPBase):
|
|||
self.psi2 = None
|
||||
|
||||
def _computations(self):
|
||||
if self._const_jitter is None or not(self._const_jitter.shape[0] == self.num_inducing):
|
||||
self._const_jitter = np.eye(self.num_inducing) * 1e-7
|
||||
|
||||
# factor Kmm
|
||||
self.Lm = jitchol(self.Kmm)
|
||||
self._Lm = jitchol(self.Kmm + self._const_jitter)
|
||||
|
||||
# The rather complex computations of self.A
|
||||
# The rather complex computations of self._A
|
||||
if self.has_uncertain_inputs:
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0)
|
||||
|
|
@ -74,44 +79,48 @@ class SparseGP(GPBase):
|
|||
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
||||
evals, evecs = linalg.eigh(psi2_beta)
|
||||
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
||||
if not np.array_equal(evals, clipped_evals):
|
||||
pass # print evals
|
||||
tmp = evecs * np.sqrt(clipped_evals)
|
||||
tmp = tmp.T
|
||||
else:
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data,1)))
|
||||
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data, 1)))
|
||||
else:
|
||||
tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
|
||||
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp.T), lower=1)
|
||||
self.A = tdot(tmp)
|
||||
|
||||
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
|
||||
self._A = tdot(tmp)
|
||||
|
||||
# factor B
|
||||
self.B = np.eye(self.num_inducing) + self.A
|
||||
self.B = np.eye(self.num_inducing) + self._A
|
||||
self.LB = jitchol(self.B)
|
||||
|
||||
# TODO: make a switch for either first compute psi1V, or VV.T
|
||||
self.psi1V = np.dot(self.psi1.T, self.likelihood.V)
|
||||
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
|
||||
self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor)
|
||||
|
||||
# back substutue C into psi1V
|
||||
tmp, info1 = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
|
||||
self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
|
||||
tmp, info2 = dpotrs(self.LB, tmp, lower=1)
|
||||
self.Cpsi1V, info3 = dtrtrs(self.Lm, tmp, lower=1, trans=1)
|
||||
# back substutue C into psi1Vf
|
||||
tmp, info1 = dtrtrs(self._Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0)
|
||||
self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
|
||||
# tmp, info2 = dpotrs(self.LB, tmp, lower=1)
|
||||
tmp, info2 = dtrtrs(self.LB, self._LBi_Lmi_psi1Vf, lower=1, trans=1)
|
||||
self.Cpsi1Vf, info3 = dtrtrs(self._Lm, tmp, lower=1, trans=1)
|
||||
|
||||
# Compute dL_dKmm
|
||||
tmp = tdot(self._LBi_Lmi_psi1V)
|
||||
tmp = tdot(self._LBi_Lmi_psi1Vf)
|
||||
self.data_fit = np.trace(tmp)
|
||||
self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.output_dim * np.eye(self.num_inducing) + tmp)
|
||||
tmp = -0.5 * self.DBi_plus_BiPBi
|
||||
tmp += -0.5 * self.B * self.output_dim
|
||||
tmp += self.output_dim * np.eye(self.num_inducing)
|
||||
self.dL_dKmm = backsub_both_sides(self.Lm, tmp)
|
||||
self.dL_dKmm = backsub_both_sides(self._Lm, tmp)
|
||||
|
||||
# Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case
|
||||
self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
|
||||
self.dL_dpsi1 = np.dot(self.Cpsi1V, self.likelihood.V.T).T
|
||||
dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
|
||||
self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T)
|
||||
dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
|
||||
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
|
||||
if self.has_uncertain_inputs:
|
||||
self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
|
||||
else:
|
||||
|
|
@ -129,27 +138,45 @@ class SparseGP(GPBase):
|
|||
|
||||
|
||||
# the partial derivative vector for the likelihood
|
||||
if self.likelihood.Nparams == 0:
|
||||
if self.likelihood.num_params == 0:
|
||||
# save computation here.
|
||||
self.partial_for_likelihood = None
|
||||
elif self.likelihood.is_heteroscedastic:
|
||||
raise NotImplementedError, "heteroscedatic derivates not implemented"
|
||||
|
||||
if self.has_uncertain_inputs:
|
||||
raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
|
||||
|
||||
else:
|
||||
|
||||
LBi = chol_inv(self.LB)
|
||||
Lmi_psi1, nil = dtrtrs(self._Lm, np.asfortranarray(self.psi1.T), lower=1, trans=0)
|
||||
_LBi_Lmi_psi1, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
|
||||
|
||||
|
||||
self.partial_for_likelihood = -0.5 * self.likelihood.precision + 0.5 * self.likelihood.V**2
|
||||
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0 - np.sum(Lmi_psi1**2,0))[:,None] * self.likelihood.precision**2
|
||||
|
||||
self.partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*self.likelihood.precision**2
|
||||
|
||||
self.partial_for_likelihood += -np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * self.likelihood.Y * self.likelihood.precision**2
|
||||
self.partial_for_likelihood += 0.5*np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * self.likelihood.precision**2
|
||||
|
||||
else:
|
||||
# likelihood is not heterscedatic
|
||||
# likelihood is not heteroscedastic
|
||||
self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
|
||||
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision)
|
||||
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - np.sum(np.square(self._LBi_Lmi_psi1V)))
|
||||
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision)
|
||||
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit)
|
||||
|
||||
def log_likelihood(self):
|
||||
""" Compute the (lower bound on the) log marginal likelihood """
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
|
||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
|
||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self._A))
|
||||
else:
|
||||
A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
|
||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
|
||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self._A))
|
||||
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
|
||||
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
|
||||
D = 0.5 * self.data_fit
|
||||
return A + B + C + D + self.likelihood.Z
|
||||
|
||||
def _set_params(self, p):
|
||||
|
|
@ -158,15 +185,19 @@ class SparseGP(GPBase):
|
|||
self.likelihood._set_params(p[self.Z.size + self.kern.num_params:])
|
||||
self._compute_kernel_matrices()
|
||||
self._computations()
|
||||
self.Cpsi1V = None
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
|
||||
|
||||
def _get_param_names(self):
|
||||
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[])\
|
||||
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\
|
||||
+ self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||
|
||||
def update_likelihood_approximation(self):
|
||||
#def _get_print_names(self):
|
||||
# return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||
|
||||
def update_likelihood_approximation(self, **kwargs):
|
||||
"""
|
||||
Approximates a non-gaussian likelihood using Expectation Propagation
|
||||
|
||||
|
|
@ -176,14 +207,14 @@ class SparseGP(GPBase):
|
|||
if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
|
||||
self.likelihood.restart()
|
||||
if self.has_uncertain_inputs:
|
||||
Lmi = chol_inv(self.Lm)
|
||||
Lmi = chol_inv(self._Lm)
|
||||
Kmmi = tdot(Lmi.T)
|
||||
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
|
||||
|
||||
self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perfomr a FITC-EP.#TODO solve potential confusion
|
||||
self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi, **kwargs) # This uses the fit_FITC code, but does not perform a FITC-EP. #TODO solve potential confusion
|
||||
# raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
|
||||
else:
|
||||
self.likelihood.fit_DTC(self.Kmm, self.psi1.T)
|
||||
self.likelihood.fit_DTC(self.Kmm, self.psi1.T, **kwargs)
|
||||
# self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
|
||||
self._set_params(self._get_params()) # update the GP
|
||||
|
||||
|
|
@ -209,7 +240,7 @@ class SparseGP(GPBase):
|
|||
"""
|
||||
The derivative of the bound wrt the inducing inputs Z
|
||||
"""
|
||||
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm, self.Z) # factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
|
||||
dL_dZ = self.kern.dK_dX(self.dL_dKmm, self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
|
||||
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
|
||||
|
|
@ -218,11 +249,20 @@ class SparseGP(GPBase):
|
|||
return dL_dZ
|
||||
|
||||
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
||||
"""Internal helper function for making predictions, does not account for normalization"""
|
||||
"""
|
||||
Internal helper function for making predictions, does not account for
|
||||
normalization or likelihood function
|
||||
"""
|
||||
|
||||
Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
|
||||
Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
|
||||
symmetrify(Bi)
|
||||
Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi)
|
||||
Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
|
||||
|
||||
if self.Cpsi1V is None:
|
||||
psi1V = np.dot(self.psi1.T, self.likelihood.V)
|
||||
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
|
||||
tmp, _ = dpotrs(self.LB, tmp, lower=1)
|
||||
self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
|
||||
|
||||
if X_variance_new is None:
|
||||
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
|
||||
|
|
@ -234,7 +274,7 @@ class SparseGP(GPBase):
|
|||
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
|
||||
var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
|
||||
else:
|
||||
# assert which_p.Tarts=='all', "swithching out parts of variational kernels is not implemented"
|
||||
# assert which_parts=='all', "switching out parts of variational kernels is not implemented"
|
||||
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
|
||||
mu = np.dot(Kx, self.Cpsi1V)
|
||||
if full_cov:
|
||||
|
|
@ -246,19 +286,19 @@ class SparseGP(GPBase):
|
|||
|
||||
return mu, var[:, None]
|
||||
|
||||
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
||||
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, **likelihood_args):
|
||||
"""
|
||||
Predict the function(s) at the new point(s) Xnew.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
**Arguments**
|
||||
|
||||
:param Xnew: The points at which to make a prediction
|
||||
:type Xnew: np.ndarray, Nnew x self.input_dim
|
||||
:param X_variance_new: The uncertainty in the prediction points
|
||||
:type X_variance_new: np.ndarray, Nnew x self.input_dim
|
||||
:param which_parts: specifies which outputs kernel(s) to use in prediction
|
||||
:type which_parts: ('all', list of bools)
|
||||
:param full_cov: whether to return the folll covariance matrix, or just the diagonal
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal
|
||||
:type full_cov: bool
|
||||
:rtype: posterior mean, a Numpy array, Nnew x self.input_dim
|
||||
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
|
||||
|
|
@ -278,22 +318,52 @@ class SparseGP(GPBase):
|
|||
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
|
||||
|
||||
# now push through likelihood
|
||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
|
||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
|
||||
|
||||
return mean, var, _025pm, _975pm
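A hedged prediction sketch for an already-fitted sparse model m (its construction is assumed); only predict, _raw_predict and their return values come from the code above:

import numpy as np
Xnew = np.linspace(0., 1., 100)[:, None]
mean, var, lower, upper = m.predict(Xnew)   # posterior mean, variance and 2.5%/97.5% quantiles
mean_f, var_f = m._raw_predict(Xnew)        # latent function, before the likelihood is applied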
|
||||
|
||||
def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
|
||||
|
||||
def plot_f(self, samples=0, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', which_parts='all', resolution=None,
|
||||
full_cov=False, fignum=None, ax=None):
|
||||
|
||||
"""
|
||||
Plot the GP's view of the world, where the data is normalized and the likelihood is Gaussian.
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
- In two dimensions, a contour-plot shows the mean predicted function
|
||||
- Not implemented in higher dimensions
|
||||
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
|
||||
:param which_parts: which of the kernel functions to plot (additively)
|
||||
:type which_parts: 'all', or list of bools
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param full_cov:
|
||||
:type full_cov: bool
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
|
||||
:param output: which output to plot (for multiple output models only)
|
||||
:type output: integer (first output is 0)
|
||||
"""
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
if fignum is None and ax is None:
|
||||
fignum = fig.num
|
||||
if which_data_rows is 'all':
|
||||
which_data_rows = slice(None)
|
||||
|
||||
if which_data is 'all':
|
||||
which_data = slice(None)
|
||||
GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, fignum=fignum, ax=ax)
|
||||
|
||||
GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax)
|
||||
if self.X.shape[1] == 1:
|
||||
if self.has_uncertain_inputs:
|
||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||
ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
|
||||
xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
|
||||
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
|
||||
|
|
@ -303,3 +373,99 @@ class SparseGP(GPBase):
|
|||
elif self.X.shape[1] == 2:
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
|
||||
|
||||
else:
|
||||
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
||||
|
||||
def plot(self, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', which_parts='all', fixed_inputs=[],
|
||||
plot_raw=False,
|
||||
levels=20, samples=0, fignum=None, ax=None, resolution=None):
|
||||
"""
|
||||
Plot the posterior of the sparse GP.
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
- In two dimensions, a contour-plot shows the mean predicted function
|
||||
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data and part of the posterior functions
|
||||
using which_data_rows, which_data_ycols and which_parts
|
||||
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:type plot_limits: np.array
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
|
||||
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
|
||||
:type which_data_rows: 'all' or a list of integers
|
||||
:param which_parts: which of the kernel functions to plot (additively)
|
||||
:type which_parts: 'all', or list of bools
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:type output: integer (first output is 0)
|
||||
:param linecol: color of line to plot.
|
||||
:type linecol:
|
||||
:param fillcol: color of fill
|
||||
:param levels: for 2D plotting, the number of contour levels to use. If ax is None, a new figure is created.
|
||||
"""
|
||||
# work out which ax to plot on
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
#work out what the inputs are for plotting (1D or 2D)
|
||||
fixed_dims = np.array([i for i,v in fixed_inputs])
|
||||
free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
|
||||
|
||||
#call the base plotting
|
||||
GPBase.plot(self, samples=samples, plot_limits=plot_limits,
|
||||
which_data_rows=which_data_rows,
|
||||
which_data_ycols=which_data_ycols, fixed_inputs=fixed_inputs,
|
||||
which_parts=which_parts, resolution=resolution, levels=20,
|
||||
fignum=fignum, ax=ax)
|
||||
|
||||
if len(free_dims) == 1:
|
||||
#plot errorbars for the uncertain inputs
|
||||
if self.has_uncertain_inputs:
|
||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||
ax.errorbar(Xu[which_data_rows, 0], self.likelihood.data[which_data_rows, 0],
|
||||
xerr=2 * np.sqrt(self.X_variance[which_data_rows, 0]),
|
||||
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
|
||||
|
||||
#plot the inducing inputs
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
|
||||
|
||||
elif len(free_dims) == 2:
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
|
||||
|
||||
else:
|
||||
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Get the current state of the class,
|
||||
here just all the indices, rest can get recomputed
|
||||
"""
|
||||
return GPBase.getstate(self) + [self.Z,
|
||||
self.num_inducing,
|
||||
self.has_uncertain_inputs,
|
||||
self.X_variance]
|
||||
|
||||
def setstate(self, state):
|
||||
self.X_variance = state.pop()
|
||||
self.has_uncertain_inputs = state.pop()
|
||||
self.num_inducing = state.pop()
|
||||
self.Z = state.pop()
|
||||
GPBase.setstate(self, state)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -14,35 +14,22 @@ import sys
|
|||
|
||||
class SVIGP(GPBase):
|
||||
"""
|
||||
|
||||
Stochastic Variational inference in a Gaussian Process
|
||||
|
||||
:param X: inputs
|
||||
:type X: np.ndarray (N x Q)
|
||||
:type X: np.ndarray (num_data x num_inputs)
|
||||
:param Y: observed data
|
||||
:type Y: np.ndarray of observations (N x D)
|
||||
:param batchsize: the size of a h
|
||||
|
||||
Additional kwargs are used as for a sparse GP. They include
|
||||
|
||||
:type Y: np.ndarray of observations (num_data x output_dim)
|
||||
:param batchsize: the size of a minibatch
|
||||
:param q_u: canonical parameters of the distribution squashed into a 1D array
|
||||
:type q_u: np.ndarray
|
||||
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
|
||||
:type M: int
|
||||
:param kernel : the kernel/covariance function. See link kernels
|
||||
:param kernel: the kernel/covariance function. See link kernels
|
||||
:type kernel: a GPy kernel
|
||||
:param Z: inducing inputs (optional, see note)
|
||||
:type Z: np.ndarray (M x Q) | None
|
||||
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
|
||||
:type X_uncertainty: np.ndarray (N x Q) | None
|
||||
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
|
||||
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
|
||||
:type M: int
|
||||
:param beta: noise precision. TODO> ignore beta if doing EP
|
||||
:type beta: float
|
||||
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
|
||||
:type normalize_(X|Y): bool
|
||||
"""
|
||||
:param Z: inducing inputs
|
||||
:type Z: np.ndarray (num_inducing x num_inputs)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
|
||||
GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
|
||||
|
|
@ -91,6 +78,58 @@ class SVIGP(GPBase):
|
|||
self._param_steplength_trace = []
|
||||
self._vb_steplength_trace = []
|
||||
|
||||
def getstate(self):
|
||||
steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength]
|
||||
return GPBase.getstate(self) + \
|
||||
[self.get_vb_param(),
|
||||
self.Z,
|
||||
self.num_inducing,
|
||||
self.has_uncertain_inputs,
|
||||
self.X_variance,
|
||||
self.X_batch,
|
||||
self.X_variance_batch,
|
||||
steplength_params,
|
||||
self.batchcounter,
|
||||
self.batchsize,
|
||||
self.epochs,
|
||||
self.momentum,
|
||||
self.data_prop,
|
||||
self._param_trace,
|
||||
self._param_steplength_trace,
|
||||
self._vb_steplength_trace,
|
||||
self._ll_trace,
|
||||
self._grad_trace,
|
||||
self.Y,
|
||||
self._permutation,
|
||||
self.iterations
|
||||
]
|
||||
|
||||
def setstate(self, state):
|
||||
self.iterations = state.pop()
|
||||
self._permutation = state.pop()
|
||||
self.Y = state.pop()
|
||||
self._grad_trace = state.pop()
|
||||
self._ll_trace = state.pop()
|
||||
self._vb_steplength_trace = state.pop()
|
||||
self._param_steplength_trace = state.pop()
|
||||
self._param_trace = state.pop()
|
||||
self.data_prop = state.pop()
|
||||
self.momentum = state.pop()
|
||||
self.epochs = state.pop()
|
||||
self.batchsize = state.pop()
|
||||
self.batchcounter = state.pop()
|
||||
steplength_params = state.pop()
|
||||
(self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params
|
||||
self.X_variance_batch = state.pop()
|
||||
self.X_batch = state.pop()
|
||||
self.X_variance = state.pop()
|
||||
self.has_uncertain_inputs = state.pop()
|
||||
self.num_inducing = state.pop()
|
||||
self.Z = state.pop()
|
||||
vb_param = state.pop()
|
||||
GPBase.setstate(self, state)
|
||||
self.set_vb_param(vb_param)
|
||||
|
||||
def _compute_kernel_matrices(self):
|
||||
# kernel computations, using BGPLVM notation
|
||||
self.Kmm = self.kern.K(self.Z)
|
||||
|
|
@ -166,7 +205,7 @@ class SVIGP(GPBase):
|
|||
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
|
||||
else:
|
||||
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
||||
evals, evecs = linalg.eigh(psi2_beta)
|
||||
evals, evecs = np.linalg.eigh(psi2_beta)
|
||||
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
||||
tmp = evecs * np.sqrt(clipped_evals)
|
||||
else:
|
||||
|
|
@ -296,8 +335,8 @@ class SVIGP(GPBase):
|
|||
|
||||
#callback
|
||||
if i and not i%callback_interval:
|
||||
callback()
|
||||
time.sleep(0.1)
|
||||
callback(self) # Change this to callback()
|
||||
time.sleep(0.01)
|
||||
|
||||
if self.epochs > 10:
|
||||
self._adapt_steplength()
|
||||
|
|
@ -313,13 +352,13 @@ class SVIGP(GPBase):
|
|||
assert self.vb_steplength > 0
|
||||
|
||||
if self.adapt_param_steplength:
|
||||
# self._adaptive_param_steplength()
|
||||
self._adaptive_param_steplength()
|
||||
# self._adaptive_param_steplength_log()
|
||||
self._adaptive_param_steplength_from_vb()
|
||||
# self._adaptive_param_steplength_from_vb()
|
||||
self._param_steplength_trace.append(self.param_steplength)
|
||||
|
||||
def _adaptive_param_steplength(self):
|
||||
decr_factor = 0.1
|
||||
decr_factor = 0.02
|
||||
g_tp = self._transform_gradients(self._log_likelihood_gradients())
|
||||
self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
|
||||
self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
|
||||
|
|
@ -353,7 +392,7 @@ class SVIGP(GPBase):
|
|||
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
|
||||
|
||||
def _adaptive_vb_steplength_KL(self):
|
||||
decr_factor = 1 #0.1
|
||||
decr_factor = 0.1
|
||||
natgrad = self.vb_grad_natgrad()
|
||||
g_t1 = natgrad[0]
|
||||
g_t2 = natgrad[1]
|
||||
|
|
@ -393,7 +432,7 @@ class SVIGP(GPBase):
|
|||
else:
|
||||
return mu, diag_var[:,None]
|
||||
|
||||
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
||||
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, sampling=False, num_samples=15000):
|
||||
# normalize X values
|
||||
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
|
||||
if X_variance_new is not None:
|
||||
|
|
@ -403,7 +442,7 @@ class SVIGP(GPBase):
|
|||
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
|
||||
|
||||
# now push through likelihood
|
||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
|
||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, sampling=sampling, num_samples=num_samples)
|
||||
|
||||
return mean, var, _025pm, _975pm
|
||||
|
||||
|
|
@ -449,7 +488,7 @@ class SVIGP(GPBase):
|
|||
ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
|
||||
|
||||
if self.input_dim==2:
|
||||
ax.scatter(self.X_all[:,0], self.X_all[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
|
||||
ax.scatter(self.X[:,0], self.X[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
|
||||
ax.plot(Zu[:,0], Zu[:,1], 'w^')
|
||||
|
||||
def plot_traces(self):
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@
|
|||
|
||||
import numpy as np
|
||||
from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED
|
||||
import sys
|
||||
lim_val = -np.log(sys.float_info.epsilon)
|
||||
|
||||
class transformation(object):
|
||||
domain = None
|
||||
|
|
@ -16,27 +18,43 @@ class transformation(object):
|
|||
def gradfactor(self, f):
|
||||
""" df_dx evaluated at self.f(x)=f"""
|
||||
raise NotImplementedError
|
||||
|
||||
def initialize(self, f):
|
||||
""" produce a sensible initial values for f(x)"""
|
||||
""" produce a sensible initial value for f(x)"""
|
||||
raise NotImplementedError
|
||||
|
||||
def __str__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
class logexp(transformation):
|
||||
domain = POSITIVE
|
||||
def f(self, x):
|
||||
return np.log(1. + np.exp(x))
|
||||
return np.where(x>lim_val, x, np.log(1. + np.exp(x)))
|
||||
def finv(self, f):
|
||||
return np.log(np.exp(f) - 1.)
|
||||
return np.where(f>lim_val, f, np.log(np.exp(f) - 1.))
|
||||
def gradfactor(self, f):
|
||||
ef = np.exp(f)
|
||||
return (ef - 1.) / ef
|
||||
return np.where(f>lim_val, 1., 1 - np.exp(-f))
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '(+ve)'
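A standalone sanity check of the clipped transform above (run with the logexp class in scope); f and finv stay finite and mutually inverse even where a naive exp(x) would overflow, although NumPy may still print an overflow warning from the unselected np.where branch:

import numpy as np
t = logexp()
x = np.array([-5., 0., 5., 800.])       # 800 is far beyond lim_val
assert np.allclose(t.finv(t.f(x)), x)   # round trip is exact in both regimes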
|
||||
|
||||
class logexp_clipped(transformation):
|
||||
class negative_logexp(transformation):
|
||||
domain = NEGATIVE
|
||||
def f(self, x):
|
||||
return -logexp.f(x)
|
||||
def finv(self, f):
|
||||
return logexp.finv(-f)
|
||||
def gradfactor(self, f):
|
||||
return -logexp.gradfactor(-f)
|
||||
def initialize(self, f):
|
||||
return -logexp.initialize(f)
|
||||
def __str__(self):
|
||||
return '(-ve)'
|
||||
|
||||
class logexp_clipped(logexp):
|
||||
max_bound = 1e100
|
||||
min_bound = 1e-10
|
||||
log_max_bound = np.log(max_bound)
|
||||
|
|
@ -66,7 +84,7 @@ class logexp_clipped(transformation):
|
|||
class exponent(transformation):
|
||||
domain = POSITIVE
|
||||
def f(self, x):
|
||||
return np.exp(x)
|
||||
return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val))
|
||||
def finv(self, x):
|
||||
return np.log(x)
|
||||
def gradfactor(self, f):
|
||||
|
|
@ -78,18 +96,16 @@ class exponent(transformation):
|
|||
def __str__(self):
|
||||
return '(+ve)'
|
||||
|
||||
class negative_exponent(transformation):
|
||||
class negative_exponent(exponent):
|
||||
domain = NEGATIVE
|
||||
def f(self, x):
|
||||
return -np.exp(x)
|
||||
def finv(self, x):
|
||||
return np.log(-x)
|
||||
return -exponent.f(x)
|
||||
def finv(self, f):
|
||||
return exponent.finv(-f)
|
||||
def gradfactor(self, f):
|
||||
return f
|
||||
def initialize(self, f):
|
||||
if np.any(f > 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return -np.abs(f)
|
||||
return -exponent.initialize(f) #np.abs(f)
|
||||
def __str__(self):
|
||||
return '(-ve)'
|
||||
|
||||
|
|
|
|||
19
GPy/core/variational.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
'''
|
||||
Created on 6 Nov 2013
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
from parameterized import Parameterized
|
||||
from parameter import Param
|
||||
|
||||
class Normal(Parameterized):
|
||||
'''
|
||||
Normal distribution for variational approximations.
|
||||
|
||||
holds the means and variances for a factorizing multivariate normal distribution
|
||||
'''
|
||||
def __init__(self, name, means, variances):
|
||||
Parameterized.__init__(self, name=name)
|
||||
self.means = Param("mean", means)
|
||||
self.variances = Param('variance', variances)
|
||||
self.add_parameters(self.means, self.variances)
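A hypothetical instantiation of this variational distribution; the shapes are illustrative and Param/Parameterized come from the new parameter framework imported above:

import numpy as np
q = Normal('q_X', means=np.zeros((10, 2)), variances=np.ones((10, 2)))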
|
||||
|
|
@ -6,66 +6,48 @@
|
|||
Gaussian Processes classification
|
||||
"""
|
||||
import pylab as pb
|
||||
import numpy as np
|
||||
import GPy
|
||||
|
||||
default_seed = 10000
|
||||
def crescent_data(seed=default_seed): # FIXME
|
||||
"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||
|
||||
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
|
||||
:param seed : seed value for data generation.
|
||||
:type seed: int
|
||||
:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
|
||||
:type inducing: int
|
||||
def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
|
||||
"""
|
||||
Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||
|
||||
data = GPy.util.datasets.crescent_data(seed=seed)
|
||||
Y = data['Y']
|
||||
Y[Y.flatten()==-1] = 0
|
||||
|
||||
m = GPy.models.GPClassification(data['X'], Y)
|
||||
#m.update_likelihood_approximation()
|
||||
#m.optimize()
|
||||
m.pseudo_EM()
|
||||
print(m)
|
||||
m.plot()
|
||||
return m
|
||||
|
||||
def oil(num_inducing=50):
|
||||
"""
|
||||
Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||
"""
|
||||
data = GPy.util.datasets.oil()
|
||||
X = data['X'][:600,:]
|
||||
X_test = data['X'][600:,:]
|
||||
Y = data['Y'][:600, 0:1]
|
||||
X = data['X']
|
||||
Xtest = data['Xtest']
|
||||
Y = data['Y'][:, 0:1]
|
||||
Ytest = data['Ytest'][:, 0:1]
|
||||
Y[Y.flatten()==-1] = 0
|
||||
Y_test = data['Y'][600:, 0:1]
|
||||
Ytest[Ytest.flatten()==-1] = 0
|
||||
|
||||
# Create GP model
|
||||
m = GPy.models.SparseGPClassification(X, Y,num_inducing=num_inducing)
|
||||
m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)
|
||||
|
||||
# Constrain all parameters to be positive
|
||||
m.constrain_positive('')
|
||||
m.tie_params('.*len')
|
||||
m['.*len'] = 10.
|
||||
m.update_likelihood_approximation()
|
||||
|
||||
# Optimize
|
||||
m.optimize()
|
||||
if optimize:
|
||||
m.optimize(max_iters=max_iters)
|
||||
print(m)
|
||||
|
||||
#Test
|
||||
probs = m.predict(X_test)[0]
|
||||
GPy.util.classification.conf_matrix(probs,Y_test)
|
||||
probs = m.predict(Xtest)[0]
|
||||
GPy.util.classification.conf_matrix(probs, Ytest)
|
||||
return m
|
||||
|
||||
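A hedged usage sketch for the oil classifier above (assuming this file is importable as GPy.examples.classification; the 12-dimensional input size and the kernel choice are illustrative):

import GPy

# an explicit kernel instead of the default; the oil flow inputs are 12-dimensional
k = GPy.kern.rbf(12, ARD=True) + GPy.kern.white(12)
m = GPy.examples.classification.oil(num_inducing=30, max_iters=50,
                                    kernel=k, optimize=True, plot=False)
print(m)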
def toy_linear_1d_classification(seed=default_seed):
|
||||
def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
|
||||
"""
|
||||
Simple 1D classification example
|
||||
:param seed : seed value for data generation (default is 4).
|
||||
Simple 1D classification example using EP approximation
|
||||
|
||||
:param seed: seed value for data generation (default is 4).
|
||||
:type seed: int
|
||||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
|
|
@ -76,24 +58,65 @@ def toy_linear_1d_classification(seed=default_seed):
|
|||
m = GPy.models.GPClassification(data['X'], Y)
|
||||
|
||||
# Optimize
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
#m.optimize()
|
||||
m.pseudo_EM()
|
||||
if optimize:
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
#m.optimize()
|
||||
#m.update_likelihood_approximation()
|
||||
m.pseudo_EM()
|
||||
|
||||
# Plot
|
||||
fig, axes = pb.subplots(2,1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
print(m)
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
|
||||
def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True):
|
||||
"""
|
||||
Simple 1D classification example using Laplace approximation
|
||||
|
||||
:param seed: seed value for data generation (default is 4).
|
||||
:type seed: int
|
||||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
bern_noise_model = GPy.likelihoods.bernoulli()
|
||||
laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), bern_noise_model)
|
||||
|
||||
# Model definition
|
||||
m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
|
||||
print m
|
||||
|
||||
# Optimize
|
||||
if optimize:
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
m.optimize('bfgs', messages=1)
|
||||
#m.pseudo_EM()
|
||||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True):
|
||||
"""
|
||||
Sparse 1D classification example
|
||||
:param seed : seed value for data generation (default is 4).
|
||||
|
||||
:param seed: seed value for data generation (default is 4).
|
||||
:type seed: int
|
||||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
|
|
@ -101,68 +124,91 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
|
|||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
# Model definition
|
||||
m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
|
||||
m['.*len']= 4.
|
||||
m = GPy.models.SparseGPClassification(data['X'], Y, num_inducing=num_inducing)
|
||||
m['.*len'] = 4.
|
||||
|
||||
# Optimize
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
#m.optimize()
|
||||
m.pseudo_EM()
|
||||
if optimize:
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
#m.optimize()
|
||||
m.pseudo_EM()
|
||||
|
||||
# Plot
|
||||
fig, axes = pb.subplots(2,1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
print(m)
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def sparse_crescent_data(num_inducing=10, seed=default_seed):
|
||||
def toy_heaviside(seed=default_seed, optimize=True, plot=True):
|
||||
"""
|
||||
Run a Gaussian process classification with DTC approximation on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||
Simple 1D classification example using a Heaviside GP transformation
|
||||
|
||||
:param seed: seed value for data generation (default is 4).
|
||||
:type seed: int
|
||||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
# Model definition
|
||||
noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
|
||||
likelihood = GPy.likelihoods.EP(Y, noise_model)
|
||||
m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
|
||||
|
||||
# Optimize
|
||||
if optimize:
|
||||
m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
m.optimize()
|
||||
#m.pseudo_EM()
|
||||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, optimize=True, plot=True):
|
||||
"""
|
||||
Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||
|
||||
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
|
||||
:param seed : seed value for data generation.
|
||||
:type seed: int
|
||||
:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
|
||||
:param inducing: number of inducing variables (only used for 'FITC' or 'DTC').
|
||||
:type inducing: int
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.crescent_data(seed=seed)
|
||||
Y = data['Y']
|
||||
Y[Y.flatten()==-1]=0
|
||||
|
||||
m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
|
||||
m['.*len'] = 10.
|
||||
#m.update_likelihood_approximation()
|
||||
#m.optimize()
|
||||
m.pseudo_EM()
|
||||
print(m)
|
||||
m.plot()
|
||||
return m
|
||||
|
||||
def FITC_crescent_data(num_inducing=10, seed=default_seed):
|
||||
"""
|
||||
Run a Gaussian process classification with FITC approximation on the crescent data. The demonstration uses EP to approximate the likelihood.
|
||||
|
||||
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
|
||||
:param seed : seed value for data generation.
|
||||
:param seed: seed value for data generation.
|
||||
:type seed: int
|
||||
:param num_inducing: number of inducing variables (only used for 'FITC' or 'DTC').
|
||||
:type num_inducing: int
|
||||
:param kernel: kernel to use in the model
|
||||
:type kernel: a GPy kernel
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.crescent_data(seed=seed)
|
||||
Y = data['Y']
|
||||
Y[Y.flatten()==-1]=0
|
||||
Y[Y.flatten()==-1] = 0
|
||||
|
||||
m = GPy.models.FITCClassification(data['X'], Y,num_inducing=num_inducing)
|
||||
m.constrain_bounded('.*len',1.,1e3)
|
||||
m['.*len'] = 3.
|
||||
#m.update_likelihood_approximation()
|
||||
#m.optimize()
|
||||
m.pseudo_EM()
|
||||
print(m)
|
||||
m.plot()
|
||||
if model_type == 'Full':
|
||||
m = GPy.models.GPClassification(data['X'], Y, kernel=kernel)
|
||||
|
||||
elif model_type == 'DTC':
|
||||
m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
|
||||
m['.*len'] = 10.
|
||||
|
||||
elif model_type == 'FITC':
|
||||
m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
|
||||
m['.*len'] = 3.
|
||||
|
||||
if optimize:
|
||||
m.pseudo_EM()
|
||||
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
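Since crescent_data now dispatches on model_type, a quick comparison sketch (again assuming the GPy.examples.classification import path) could look like:

import GPy

models = {}
for mt in ['Full', 'DTC', 'FITC']:
    # EP approximation in every case; only the sparsity scheme differs
    models[mt] = GPy.examples.classification.crescent_data(
        model_type=mt, num_inducing=10, optimize=True, plot=False)
for mt, m in models.items():
    print("{}: {}".format(mt, m.log_likelihood()))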
|
|
|||
|
|
@ -1,70 +1,93 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
import numpy as _np
|
||||
default_seed = _np.random.seed(123344)
|
||||
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt, cm
|
||||
def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
|
||||
"""
|
||||
Model for testing purposes. Samples from a GP with an rbf kernel and learns
|
||||
the samples with a new kernel. Normally not for optimization, just model checking
|
||||
"""
|
||||
from GPy.likelihoods.gaussian import Gaussian
|
||||
import GPy
|
||||
|
||||
import GPy
|
||||
from GPy.core.transformations import logexp
|
||||
from GPy.models.bayesian_gplvm import BayesianGPLVM
|
||||
num_inputs = 13
|
||||
num_inducing = 5
|
||||
if plot:
|
||||
output_dim = 1
|
||||
input_dim = 2
|
||||
else:
|
||||
input_dim = 2
|
||||
output_dim = 25
|
||||
|
||||
default_seed = np.random.seed(123344)
|
||||
|
||||
def BGPLVM(seed=default_seed):
|
||||
N = 10
|
||||
num_inducing = 3
|
||||
Q = 2
|
||||
D = 4
|
||||
# generate GPLVM-like data
|
||||
X = np.random.rand(N, Q)
|
||||
k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
|
||||
X = _np.random.rand(num_inputs, input_dim)
|
||||
lengthscales = _np.random.rand(input_dim)
|
||||
k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True)
|
||||
+ GPy.kern.white(input_dim, 0.01))
|
||||
K = k.K(X)
|
||||
Y = np.random.multivariate_normal(np.zeros(N), K, Q).T
|
||||
Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T
|
||||
lik = Gaussian(Y, normalize=True)
|
||||
|
||||
k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True) + GPy.kern.white(Q)
|
||||
# k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
|
||||
# k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
|
||||
# k = GPy.kern.rbf(Q, ARD = False) + GPy.kern.white(Q, 0.00001)
|
||||
k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
|
||||
# k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||
# k = GPy.kern.rbf(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001)
|
||||
# k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
|
||||
# k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0)
|
||||
# k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
|
||||
|
||||
m = GPy.models.BayesianGPLVM(Y, Q, kernel=k, num_inducing=num_inducing)
|
||||
# m.constrain_positive('(rbf|bias|noise|white|S)')
|
||||
# m.constrain_fixed('S', 1)
|
||||
m = GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
|
||||
m.lengthscales = lengthscales
|
||||
|
||||
# pb.figure()
|
||||
# m.plot()
|
||||
# pb.title('PCA initialisation')
|
||||
# pb.figure()
|
||||
# m.optimize(messages = 1)
|
||||
# m.plot()
|
||||
# pb.title('After optimisation')
|
||||
m.randomize()
|
||||
m.checkgrad(verbose=1)
|
||||
if plot:
|
||||
import matplotlib.pyplot as pb
|
||||
m.plot()
|
||||
pb.title('PCA initialisation')
|
||||
|
||||
if optimize:
|
||||
m.optimize('scg', messages=verbose)
|
||||
if plot:
|
||||
m.plot()
|
||||
pb.title('After optimisation')
|
||||
|
||||
return m
|
||||
|
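The test model above is meant for gradient checking rather than fitting; a minimal sketch of that use, assuming this file is importable as GPy.examples.dimensionality_reduction:

import GPy

m = GPy.examples.dimensionality_reduction.bgplvm_test_model(optimize=False, plot=False)
# checkgrad compares the analytic gradients against finite differences
assert m.checkgrad(), "analytic and numerical gradients disagree"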
||||
def GPLVM_oil_100(optimize=True):
|
||||
def gplvm_oil_100(optimize=True, verbose=1, plot=True):
|
||||
import GPy
|
||||
data = GPy.util.datasets.oil_100()
|
||||
Y = data['X']
|
||||
|
||||
# create simple GP model
|
||||
kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6)
|
||||
m = GPy.models.GPLVM(Y, 6, kernel=kernel)
|
||||
m.data_labels = data['Y'].argmax(axis=1)
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize('scg', messages=1)
|
||||
|
||||
# plot
|
||||
print(m)
|
||||
m.plot_latent(labels=m.data_labels)
|
||||
if optimize: m.optimize('scg', messages=verbose)
|
||||
if plot: m.plot_latent(labels=m.data_labels)
|
||||
return m
|
||||
|
||||
def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False):
|
||||
from GPy.util.datasets import swiss_roll_generated
|
||||
from GPy.core.transformations import logexp_clipped
|
||||
def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
|
||||
import GPy
|
||||
_np.random.seed(0)
|
||||
data = GPy.util.datasets.oil()
|
||||
Y = data['X'][:N]
|
||||
Y = Y - Y.mean(0)
|
||||
Y /= Y.std(0)
|
||||
# Create the model
|
||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q)
|
||||
m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
|
||||
m.data_labels = data['Y'][:N].argmax(axis=1)
|
||||
|
||||
data = swiss_roll_generated(N=N, sigma=sigma)
|
||||
if optimize: m.optimize('scg', messages=verbose, max_iters=max_iters)
|
||||
if plot:
|
||||
m.plot_latent(labels=m.data_labels)
|
||||
m.kern.plot_ARD()
|
||||
return m
|
||||
|
||||
def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2):
|
||||
import GPy
|
||||
from GPy.util.datasets import swiss_roll_generated
|
||||
from GPy.models import BayesianGPLVM
|
||||
|
||||
data = swiss_roll_generated(num_samples=N, sigma=sigma)
|
||||
Y = data['Y']
|
||||
Y -= Y.mean()
|
||||
Y /= Y.std()
|
||||
|
|
@ -77,116 +100,98 @@ def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False
|
|||
iso = Isomap().fit(Y)
|
||||
X = iso.embedding_
|
||||
if Q > 2:
|
||||
X = np.hstack((X, np.random.randn(N, Q - 2)))
|
||||
X = _np.hstack((X, _np.random.randn(N, Q - 2)))
|
||||
except ImportError:
|
||||
X = np.random.randn(N, Q)
|
||||
X = _np.random.randn(N, Q)
|
||||
|
||||
if plot:
|
||||
from mpl_toolkits import mplot3d
|
||||
import pylab
|
||||
fig = pylab.figure("Swiss Roll Data")
|
||||
import matplotlib.pyplot as plt
|
||||
from mpl_toolkits.mplot3d import Axes3D # @UnusedImport
|
||||
fig = plt.figure("Swiss Roll Data")
|
||||
ax = fig.add_subplot(121, projection='3d')
|
||||
ax.scatter(*Y.T, c=c)
|
||||
ax.set_title("Swiss Roll")
|
||||
|
||||
ax = fig.add_subplot(122)
|
||||
ax.scatter(*X.T[:2], c=c)
|
||||
ax.set_title("Initialization")
|
||||
|
||||
ax.set_title("BGPLVM init")
|
||||
|
||||
var = .5
|
||||
S = (var * np.ones_like(X) + np.clip(np.random.randn(N, Q) * var ** 2,
|
||||
S = (var * _np.ones_like(X) + _np.clip(_np.random.randn(N, Q) * var ** 2,
|
||||
- (1 - var),
|
||||
(1 - var))) + .001
|
||||
Z = np.random.permutation(X)[:num_inducing]
|
||||
Z = _np.random.permutation(X)[:num_inducing]
|
||||
|
||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
|
||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
|
||||
|
||||
m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
|
||||
m.data_colors = c
|
||||
m.data_t = t
|
||||
|
||||
m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0)
|
||||
m['noise_variance'] = Y.var() / 100.
|
||||
m['bias_variance'] = 0.05
|
||||
|
||||
if optimize:
|
||||
m.optimize('scg', messages=1)
|
||||
m.optimize('scg', messages=verbose, max_iters=2e3)
|
||||
|
||||
if plot:
|
||||
fig = plt.figure('fitted')
|
||||
ax = fig.add_subplot(111)
|
||||
s = m.input_sensitivity().argsort()[::-1][:2]
|
||||
ax.scatter(*m.X.T[s], c=c)
|
||||
|
||||
return m
|
||||
|
||||
def BGPLVM_oil(optimize=True, N=200, Q=10, num_inducing=15, max_f_eval=50, plot=False, **k):
|
||||
np.random.seed(0)
|
||||
def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
|
||||
import GPy
|
||||
from GPy.likelihoods import Gaussian
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
_np.random.seed(0)
|
||||
data = GPy.util.datasets.oil()
|
||||
|
||||
# create simple GP model
|
||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
|
||||
kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2))
|
||||
Y = data['X'][:N]
|
||||
Yn = Y - Y.mean(0)
|
||||
Yn /= Yn.std(0)
|
||||
|
||||
Yn = Gaussian(Y, normalize=True)
|
||||
m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
|
||||
m.data_labels = data['Y'][:N].argmax(axis=1)
|
||||
m['noise'] = Yn.Y.var() / 100.
|
||||
|
||||
# m.constrain('variance|leng', logexp_clipped())
|
||||
m['.*lengt'] = 1. # m.X.var(0).max() / m.X.var(0)
|
||||
m['noise'] = Yn.var() / 100.
|
||||
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.constrain_fixed('noise')
|
||||
m.optimize('scg', messages=1, max_f_eval=100, gtol=.05)
|
||||
m.constrain_positive('noise')
|
||||
m.optimize('scg', messages=1, max_f_eval=max_f_eval, gtol=.05)
|
||||
m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)
|
||||
|
||||
if plot:
|
||||
y = m.likelihood.Y[0, :]
|
||||
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
plt.sca(latent_axes)
|
||||
m.plot_latent()
|
||||
m.plot_latent(ax=latent_axes)
|
||||
data_show = GPy.util.visualize.vector_show(y)
|
||||
lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes) # , sense_axes=sense_axes)
|
||||
lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable
|
||||
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
|
||||
raw_input('Press enter to finish')
|
||||
plt.close(fig)
|
||||
return m
|
||||
|
||||
def oil_100():
|
||||
data = GPy.util.datasets.oil_100()
|
||||
m = GPy.models.GPLVM(data['X'], 2)
|
||||
|
||||
# optimize
|
||||
m.optimize(messages=1, max_iters=2)
|
||||
|
||||
# plot
|
||||
print(m)
|
||||
# m.plot_latent(labels=data['Y'].argmax(axis=1))
|
||||
return m
|
||||
|
||||
|
||||
|
||||
def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
||||
x = np.linspace(0, 4 * np.pi, N)[:, None]
|
||||
s1 = np.vectorize(lambda x: np.sin(x))
|
||||
s2 = np.vectorize(lambda x: np.cos(x))
|
||||
s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
|
||||
sS = np.vectorize(lambda x: np.sin(2 * x))
|
||||
x = _np.linspace(0, 4 * _np.pi, N)[:, None]
|
||||
s1 = _np.vectorize(lambda x: _np.sin(x))
|
||||
s2 = _np.vectorize(lambda x: _np.cos(x))
|
||||
s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
|
||||
sS = _np.vectorize(lambda x: _np.sin(2 * x))
|
||||
|
||||
s1 = s1(x)
|
||||
s2 = s2(x)
|
||||
s3 = s3(x)
|
||||
sS = sS(x)
|
||||
|
||||
S1 = np.hstack([s1, sS])
|
||||
S2 = np.hstack([s2, s3, sS])
|
||||
S3 = np.hstack([s3, sS])
|
||||
S1 = _np.hstack([s1, sS])
|
||||
S2 = _np.hstack([s2, s3, sS])
|
||||
S3 = _np.hstack([s3, sS])
|
||||
|
||||
Y1 = S1.dot(np.random.randn(S1.shape[1], D1))
|
||||
Y2 = S2.dot(np.random.randn(S2.shape[1], D2))
|
||||
Y3 = S3.dot(np.random.randn(S3.shape[1], D3))
|
||||
Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
|
||||
Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
|
||||
Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
|
||||
|
||||
Y1 += .3 * np.random.randn(*Y1.shape)
|
||||
Y2 += .2 * np.random.randn(*Y2.shape)
|
||||
Y3 += .1 * np.random.randn(*Y3.shape)
|
||||
Y1 += .3 * _np.random.randn(*Y1.shape)
|
||||
Y2 += .2 * _np.random.randn(*Y2.shape)
|
||||
Y3 += .25 * _np.random.randn(*Y3.shape)
|
||||
|
||||
Y1 -= Y1.mean(0)
|
||||
Y2 -= Y2.mean(0)
|
||||
|
|
@ -201,6 +206,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
|||
|
||||
if plot_sim:
|
||||
import pylab
|
||||
import matplotlib.cm as cm
|
||||
import itertools
|
||||
fig = pylab.figure("MRD Simulation Data", figsize=(8, 6))
|
||||
fig.clf()
|
||||
|
|
@ -211,179 +217,252 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
|||
ax.legend()
|
||||
for i, Y in enumerate(Ylist):
|
||||
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
|
||||
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
|
||||
ax.imshow(Y, aspect='auto', cmap=cm.gray)
|
||||
ax.set_title("Y{}".format(i + 1))
|
||||
pylab.draw()
|
||||
pylab.tight_layout()
|
||||
|
||||
return slist, [S1, S2, S3], Ylist
|
||||
|
||||
def bgplvm_simulation_matlab_compare():
|
||||
from GPy.util.datasets import simulation_BGPLVM
|
||||
sim_data = simulation_BGPLVM()
|
||||
Y = sim_data['Y']
|
||||
S = sim_data['S']
|
||||
mu = sim_data['mu']
|
||||
num_inducing, [_, Q] = 3, mu.shape
|
||||
# def bgplvm_simulation_matlab_compare():
|
||||
# from GPy.util.datasets import simulation_BGPLVM
|
||||
# from GPy import kern
|
||||
# from GPy.models import BayesianGPLVM
|
||||
#
|
||||
# sim_data = simulation_BGPLVM()
|
||||
# Y = sim_data['Y']
|
||||
# mu = sim_data['mu']
|
||||
# num_inducing, [_, Q] = 3, mu.shape
|
||||
#
|
||||
# k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
|
||||
# m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
|
||||
# _debug=False)
|
||||
# m.auto_scale_factor = True
|
||||
# m['noise'] = Y.var() / 100.
|
||||
# m['linear_variance'] = .01
|
||||
# return m
|
||||
|
||||
from GPy.models import mrd
|
||||
def bgplvm_simulation(optimize=True, verbose=1,
|
||||
plot=True, plot_sim=False,
|
||||
max_iters=2e4,
|
||||
):
|
||||
from GPy import kern
|
||||
reload(mrd); reload(kern)
|
||||
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
|
||||
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
|
||||
# X=mu,
|
||||
# X_variance=S,
|
||||
_debug=False)
|
||||
m.auto_scale_factor = True
|
||||
m['noise'] = Y.var() / 100.
|
||||
m['linear_variance'] = .01
|
||||
return m
|
||||
|
||||
def bgplvm_simulation(optimize='scg',
|
||||
plot=True,
|
||||
max_f_eval=2e4):
|
||||
# from GPy.core.transformations import logexp_clipped
|
||||
D1, D2, D3, N, num_inducing, Q = 15, 8, 8, 100, 3, 5
|
||||
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot)
|
||||
|
||||
from GPy.models import mrd
|
||||
from GPy import kern
|
||||
reload(mrd); reload(kern)
|
||||
|
||||
from GPy.models import BayesianGPLVM
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
Y = Ylist[0]
|
||||
|
||||
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k, _debug=True)
|
||||
# m.constrain('variance|noise', logexp_clipped())
|
||||
k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
|
||||
m['noise'] = Y.var() / 100.
|
||||
m['linear_variance'] = .01
|
||||
|
||||
if optimize:
|
||||
print "Optimizing model:"
|
||||
m.optimize(optimize, max_iters=max_f_eval,
|
||||
max_f_eval=max_f_eval,
|
||||
messages=True, gtol=.05)
|
||||
m.optimize('scg', messages=verbose, max_iters=max_iters,
|
||||
gtol=.05)
|
||||
if plot:
|
||||
m.plot_X_1d("BGPLVM Latent Space 1D")
|
||||
m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
|
||||
return m
|
||||
|
||||
def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
|
||||
D1, D2, D3, N, num_inducing, Q = 150, 200, 400, 500, 3, 7
|
||||
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
|
||||
from GPy.models import mrd
|
||||
def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
|
||||
from GPy import kern
|
||||
from GPy.models import MRD
|
||||
from GPy.likelihoods import Gaussian
|
||||
|
||||
reload(mrd); reload(kern)
|
||||
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
|
||||
|
||||
k = kern.linear(Q, [.05] * Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
|
||||
m = mrd.MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
|
||||
k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
|
||||
m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
|
||||
m.ensure_default_constraints()
|
||||
|
||||
for i, Y in enumerate(Ylist):
|
||||
m['{}_noise'.format(i + 1)] = Y.var() / 100.
|
||||
|
||||
|
||||
# DEBUG
|
||||
# np.seterr("raise")
|
||||
for i, bgplvm in enumerate(m.bgplvms):
|
||||
m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500.
|
||||
|
||||
if optimize:
|
||||
print "Optimizing Model:"
|
||||
m.optimize(messages=1, max_iters=8e3, max_f_eval=8e3, gtol=.1)
|
||||
m.optimize(messages=verbose, max_iters=8e3, gtol=.1)
|
||||
if plot:
|
||||
m.plot_X_1d("MRD Latent Space 1D")
|
||||
m.plot_scales("MRD Scales")
|
||||
return m
|
||||
|
||||
def brendan_faces():
|
||||
from GPy import kern
|
||||
def brendan_faces(optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.brendan_faces()
|
||||
Q = 2
|
||||
Y = data['Y'][0:-1:10, :]
|
||||
# Y = data['Y']
|
||||
Y = data['Y']
|
||||
Yn = Y - Y.mean()
|
||||
Yn /= Yn.std()
|
||||
|
||||
m = GPy.models.GPLVM(Yn, Q)
|
||||
# m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=100)
|
||||
|
||||
# optimize
|
||||
m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
|
||||
|
||||
m.optimize('scg', messages=1, max_f_eval=10000)
|
||||
if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
|
||||
|
||||
ax = m.plot_latent(which_indices=(0, 1))
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
|
||||
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
lvm_visualizer.close()
|
||||
if plot:
|
||||
ax = m.plot_latent(which_indices=(0, 1))
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def stick():
|
||||
data = GPy.util.datasets.stick()
|
||||
m = GPy.models.GPLVM(data['Y'], 2)
|
||||
def olivetti_faces(optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.olivetti_faces()
|
||||
Q = 2
|
||||
Y = data['Y']
|
||||
Yn = Y - Y.mean()
|
||||
Yn /= Yn.std()
|
||||
|
||||
m = GPy.models.GPLVM(Yn, Q)
|
||||
if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
|
||||
if plot:
|
||||
ax = m.plot_latent(which_indices=(0, 1))
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
|
||||
import GPy
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
# optimize
|
||||
m.optimize(messages=1, max_f_eval=10000)
|
||||
m._set_params(m._get_params())
|
||||
if range is None:
|
||||
Y = data['Y'].copy()
|
||||
else:
|
||||
Y = data['Y'][range[0]:range[1], :].copy()
|
||||
if plot:
|
||||
y = Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.data_play(Y, data_show, frame_rate)
|
||||
return Y
|
||||
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
lvm_visualizer.close()
|
||||
def stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
# optimize
|
||||
m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
if plot and GPy.util.visualize.visual_available:
|
||||
plt.clf()
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def cmu_mocap(subject='35', motion=['01'], in_place=True):
|
||||
def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
# optimize
|
||||
mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
|
||||
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
if plot and GPy.util.visualize.visual_available:
|
||||
plt.clf()
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
# optimize
|
||||
back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
|
||||
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
|
||||
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
if plot and GPy.util.visualize.visual_available:
|
||||
plt.clf()
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def robot_wireless(optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.robot_wireless()
|
||||
# optimize
|
||||
m = GPy.models.GPLVM(data['Y'], 2)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
m._set_params(m._get_params())
|
||||
if plot:
|
||||
m.plot_latent()
|
||||
|
||||
return m
|
||||
|
||||
def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
|
||||
from GPy.models import BayesianGPLVM
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
Q = 6
|
||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
|
||||
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
|
||||
# optimize
|
||||
m.ensure_default_constraints()
|
||||
if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300)
|
||||
m._set_params(m._get_params())
|
||||
if plot:
|
||||
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
plt.sca(latent_axes)
|
||||
m.plot_latent()
|
||||
y = m.likelihood.Y[0, :].copy()
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
|
||||
def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
|
||||
data = GPy.util.datasets.cmu_mocap(subject, motion)
|
||||
Y = data['Y']
|
||||
if in_place:
|
||||
# Make figure move in place.
|
||||
data['Y'][:, 0:3] = 0.0
|
||||
|
||||
m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)
|
||||
|
||||
# optimize
|
||||
m.optimize(messages=1, max_f_eval=10000)
|
||||
if optimize:
|
||||
m.optimize(messages=verbose, max_f_eval=10000)
|
||||
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
|
||||
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
lvm_visualizer.close()
|
||||
if plot:
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
|
||||
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
lvm_visualizer.close()
|
||||
|
||||
return m
|
||||
|
||||
# def BGPLVM_oil():
|
||||
# data = GPy.util.datasets.oil()
|
||||
# Y, X = data['Y'], data['X']
|
||||
# X -= X.mean(axis=0)
|
||||
# X /= X.std(axis=0)
|
||||
#
|
||||
# Q = 10
|
||||
# num_inducing = 30
|
||||
#
|
||||
# kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
|
||||
# m = GPy.models.BayesianGPLVM(X, Q, kernel=kernel, num_inducing=num_inducing)
|
||||
# # m.scale_factor = 100.0
|
||||
# m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)')
|
||||
# from sklearn import cluster
|
||||
# km = cluster.KMeans(num_inducing, verbose=10)
|
||||
# Z = km.fit(m.X).cluster_centers_
|
||||
# # Z = GPy.util.misc.kmm_init(m.X, num_inducing)
|
||||
# m.set('iip', Z)
|
||||
# m.set('bias', 1e-4)
|
||||
# # optimize
|
||||
#
|
||||
# import pdb; pdb.set_trace()
|
||||
# m.optimize('tnc', messages=1)
|
||||
# print m
|
||||
# m.plot_latent(labels=data['Y'].argmax(axis=1))
|
||||
# return m
|
||||
|
||||
|
|
|
|||
286
GPy/examples/non_gaussian.py
Normal file
286
GPy/examples/non_gaussian.py
Normal file
|
|
@ -0,0 +1,286 @@
|
|||
import GPy
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from GPy.util import datasets
|
||||
|
||||
def student_t_approx(optimize=True, plot=True):
|
||||
"""
|
||||
Example of regressing with a student t likelihood using Laplace
|
||||
"""
|
||||
real_std = 0.1
|
||||
#Start a function, any function
|
||||
X = np.linspace(0.0, np.pi*2, 100)[:, None]
|
||||
Y = np.sin(X) + np.random.randn(*X.shape)*real_std
|
||||
Y = Y/Y.max()
|
||||
Yc = Y.copy()
|
||||
|
||||
X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
|
||||
Y_full = np.sin(X_full)
|
||||
Y_full = Y_full/Y_full.max()
|
||||
|
||||
#Slightly noisy data
|
||||
Yc[75:80] += 1
|
||||
|
||||
#Very noisy data
|
||||
#Yc[10] += 100
|
||||
#Yc[25] += 10
|
||||
#Yc[23] += 10
|
||||
#Yc[26] += 1000
|
||||
#Yc[24] += 10
|
||||
#Yc = Yc/Yc.max()
|
||||
|
||||
#Add student t random noise to datapoints
|
||||
deg_free = 5
|
||||
print "Real noise: ", real_std
|
||||
initial_var_guess = 0.5
|
||||
edited_real_sd = initial_var_guess
|
||||
|
||||
# Kernel object
|
||||
kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
|
||||
kernel2 = kernel1.copy()
|
||||
kernel3 = kernel1.copy()
|
||||
kernel4 = kernel1.copy()
|
||||
|
||||
#Gaussian GP model on clean data
|
||||
m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
|
||||
# optimize
|
||||
m1.ensure_default_constraints()
|
||||
m1.constrain_fixed('white', 1e-5)
|
||||
m1.randomize()
|
||||
|
||||
#Gaussian GP model on corrupt data
|
||||
m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
|
||||
m2.ensure_default_constraints()
|
||||
m2.constrain_fixed('white', 1e-5)
|
||||
m2.randomize()
|
||||
|
||||
#Student t GP model on clean data
|
||||
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
|
||||
m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
|
||||
m3.ensure_default_constraints()
|
||||
m3.constrain_bounded('t_noise', 1e-6, 10.)
|
||||
m3.constrain_fixed('white', 1e-5)
|
||||
m3.randomize()
|
||||
|
||||
#Student t GP model on corrupt data
|
||||
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
|
||||
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
|
||||
m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
|
||||
m4.ensure_default_constraints()
|
||||
m4.constrain_bounded('t_noise', 1e-6, 10.)
|
||||
m4.constrain_fixed('white', 1e-5)
|
||||
m4.randomize()
|
||||
|
||||
if optimize:
|
||||
optimizer='scg'
|
||||
print "Clean Gaussian"
|
||||
m1.optimize(optimizer, messages=1)
|
||||
print "Corrupt Gaussian"
|
||||
m2.optimize(optimizer, messages=1)
|
||||
print "Clean student t"
|
||||
m3.optimize(optimizer, messages=1)
|
||||
print "Corrupt student t"
|
||||
m4.optimize(optimizer, messages=1)
|
||||
|
||||
if plot:
|
||||
plt.figure(1)
|
||||
plt.suptitle('Gaussian likelihood')
|
||||
ax = plt.subplot(211)
|
||||
m1.plot(ax=ax)
|
||||
plt.plot(X_full, Y_full)
|
||||
plt.ylim(-1.5, 1.5)
|
||||
plt.title('Gaussian clean')
|
||||
|
||||
ax = plt.subplot(212)
|
||||
m2.plot(ax=ax)
|
||||
plt.plot(X_full, Y_full)
|
||||
plt.ylim(-1.5, 1.5)
|
||||
plt.title('Gaussian corrupt')
|
||||
|
||||
plt.figure(2)
|
||||
plt.suptitle('Student-t likelihood')
|
||||
ax = plt.subplot(211)
|
||||
m3.plot(ax=ax)
|
||||
plt.plot(X_full, Y_full)
|
||||
plt.ylim(-1.5, 1.5)
|
||||
plt.title('Student-t rasm clean')
|
||||
|
||||
ax = plt.subplot(212)
|
||||
m4.plot(ax=ax)
|
||||
plt.plot(X_full, Y_full)
|
||||
plt.ylim(-1.5, 1.5)
|
||||
plt.title('Student-t rasm corrupt')
|
||||
|
||||
return m1, m2, m3, m4
|
||||
|
||||
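The Student-t/Laplace construction used four times above can be isolated in a short sketch; the call signatures follow the code in this file and should be treated as assumptions for this development version:

import numpy as np
import GPy

X = np.linspace(0, 2 * np.pi, 30)[:, None]
Y = np.sin(X) + 0.1 * np.random.randn(*X.shape)

# Student-t noise model wrapped in the Laplace approximation, as in m3/m4 above
t_dist = GPy.likelihoods.noise_model_constructors.student_t(deg_free=4, sigma2=0.1)
lik = GPy.likelihoods.Laplace(Y.copy(), t_dist)

kern = GPy.kern.rbf(1) + GPy.kern.white(1)
m = GPy.models.GPRegression(X, Y, kern, likelihood=lik)
m.ensure_default_constraints()
m.constrain_fixed('white', 1e-5)
m.optimize('scg')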
def boston_example(optimize=True, plot=True):
|
||||
import sklearn
|
||||
from sklearn.cross_validation import KFold
|
||||
optimizer='bfgs'
|
||||
messages=0
|
||||
data = datasets.boston_housing()
|
||||
degrees_freedoms = [3, 5, 8, 10]
|
||||
X = data['X'].copy()
|
||||
Y = data['Y'].copy()
|
||||
X = X-X.mean(axis=0)
|
||||
X = X/X.std(axis=0)
|
||||
Y = Y-Y.mean()
|
||||
Y = Y/Y.std()
|
||||
num_folds = 10
|
||||
kf = KFold(len(Y), n_folds=num_folds, indices=True)
|
||||
num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
|
||||
score_folds = np.zeros((num_models, num_folds))
|
||||
pred_density = score_folds.copy()
|
||||
|
||||
def rmse(Y, Ystar):
|
||||
return np.sqrt(np.mean((Y-Ystar)**2))
|
||||
|
||||
for n, (train, test) in enumerate(kf):
|
||||
X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
|
||||
print "Fold {}".format(n)
|
||||
|
||||
noise = 1e-1 #np.exp(-2)
|
||||
rbf_len = 0.5
|
||||
data_axis_plot = 4
|
||||
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
|
||||
#Baseline
|
||||
score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
|
||||
|
||||
#Gaussian GP
|
||||
print "Gauss GP"
|
||||
mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
|
||||
mgp.ensure_default_constraints()
|
||||
mgp.constrain_fixed('white', 1e-5)
|
||||
mgp['rbf_len'] = rbf_len
|
||||
mgp['noise'] = noise
|
||||
print mgp
|
||||
if optimize:
|
||||
mgp.optimize(optimizer=optimizer, messages=messages)
|
||||
Y_test_pred = mgp.predict(X_test)
|
||||
score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
|
||||
pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
|
||||
print mgp
|
||||
print pred_density
|
||||
|
||||
print "Gaussian Laplace GP"
|
||||
N, D = Y_train.shape
|
||||
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
|
||||
g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
|
||||
mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
|
||||
mg.ensure_default_constraints()
|
||||
mg.constrain_positive('noise_variance')
|
||||
mg.constrain_fixed('white', 1e-5)
|
||||
mg['rbf_len'] = rbf_len
|
||||
mg['noise'] = noise
|
||||
print mg
|
||||
if optimize:
|
||||
mg.optimize(optimizer=optimizer, messages=messages)
|
||||
Y_test_pred = mg.predict(X_test)
|
||||
score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
|
||||
pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
|
||||
print pred_density
|
||||
print mg
|
||||
|
||||
for stu_num, df in enumerate(degrees_freedoms):
|
||||
#Student T
|
||||
print "Student-T GP {}df".format(df)
|
||||
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
|
||||
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
|
||||
mstu_t.ensure_default_constraints()
|
||||
mstu_t.constrain_fixed('white', 1e-5)
|
||||
mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
|
||||
mstu_t['rbf_len'] = rbf_len
|
||||
mstu_t['t_noise'] = noise
|
||||
print mstu_t
|
||||
if optimize:
|
||||
mstu_t.optimize(optimizer=optimizer, messages=messages)
|
||||
Y_test_pred = mstu_t.predict(X_test)
|
||||
score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
|
||||
pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
|
||||
print pred_density
|
||||
print mstu_t
|
||||
|
||||
if plot:
|
||||
plt.figure()
|
||||
plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
|
||||
plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
|
||||
plt.title('GP gauss')
|
||||
|
||||
plt.figure()
|
||||
plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
|
||||
plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
|
||||
plt.title('Lap gauss')
|
||||
|
||||
plt.figure()
|
||||
plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
|
||||
plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
|
||||
plt.title('Stu t {}df'.format(df))
|
||||
|
||||
print "Average scores: {}".format(np.mean(score_folds, 1))
|
||||
print "Average pred density: {}".format(np.mean(pred_density, 1))
|
||||
|
||||
if plot:
|
||||
#Plotting
|
||||
stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
|
||||
legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
|
||||
|
||||
#Plot boxplots for RMSE density
|
||||
fig = plt.figure()
|
||||
ax=fig.add_subplot(111)
|
||||
plt.title('RMSE')
|
||||
bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
|
||||
plt.setp(bp['boxes'], color='black')
|
||||
plt.setp(bp['whiskers'], color='black')
|
||||
plt.setp(bp['fliers'], color='red', marker='+')
|
||||
xtickNames = plt.setp(ax, xticklabels=legends)
|
||||
plt.setp(xtickNames, rotation=45, fontsize=8)
|
||||
ax.set_ylabel('RMSE')
|
||||
ax.set_xlabel('Distribution')
|
||||
#Make grid and put it below boxes
|
||||
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
|
||||
alpha=0.5)
|
||||
ax.set_axisbelow(True)
|
||||
|
||||
#Plot boxplots for predictive density
|
||||
fig = plt.figure()
|
||||
ax=fig.add_subplot(111)
|
||||
plt.title('Predictive density')
|
||||
bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
|
||||
plt.setp(bp['boxes'], color='black')
|
||||
plt.setp(bp['whiskers'], color='black')
|
||||
plt.setp(bp['fliers'], color='red', marker='+')
|
||||
xtickNames = plt.setp(ax, xticklabels=legends[1:])
|
||||
plt.setp(xtickNames, rotation=45, fontsize=8)
|
||||
ax.set_ylabel('Mean Log probability P(Y*|Y)')
|
||||
ax.set_xlabel('Distribution')
|
||||
#Make grid and put it below boxes
|
||||
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
|
||||
alpha=0.5)
|
||||
ax.set_axisbelow(True)
|
||||
return mstu_t
|
||||
|
||||
#def precipitation_example():
|
||||
#import sklearn
|
||||
#from sklearn.cross_validation import KFold
|
||||
#data = datasets.boston_housing()
|
||||
#X = data['X'].copy()
|
||||
#Y = data['Y'].copy()
|
||||
#X = X-X.mean(axis=0)
|
||||
#X = X/X.std(axis=0)
|
||||
#Y = Y-Y.mean()
|
||||
#Y = Y/Y.std()
|
||||
#import ipdb; ipdb.set_trace() # XXX BREAKPOINT
|
||||
#num_folds = 10
|
||||
#kf = KFold(len(Y), n_folds=num_folds, indices=True)
|
||||
#score_folds = np.zeros((4, num_folds))
|
||||
#def rmse(Y, Ystar):
|
||||
#return np.sqrt(np.mean((Y-Ystar)**2))
|
||||
##for train, test in kf:
|
||||
#for n, (train, test) in enumerate(kf):
|
||||
#X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
|
||||
#print "Fold {}".format(n)
|
||||
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
"""
|
||||
Gaussian Processes regression examples
|
||||
"""
|
||||
|
|
@ -9,192 +8,163 @@ import pylab as pb
|
|||
import numpy as np
|
||||
import GPy
|
||||
|
||||
|
||||
def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d()
|
||||
def olympic_marathon_men(optimize=True, plot=True):
|
||||
"""Run a standard Gaussian process regression on the Olympic marathon data."""
|
||||
data = GPy.util.datasets.olympic_marathon_men()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
||||
# optimize
|
||||
m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
|
||||
# plot
|
||||
m.plot()
|
||||
print(m)
|
||||
return m
|
||||
|
||||
def rogers_girolami_olympics(optim_iters=100):
|
||||
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
|
||||
data = GPy.util.datasets.rogers_girolami_olympics()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
||||
|
||||
#set the lengthscale to be something sensible (defaults to 1)
|
||||
# set the lengthscale to be something sensible (defaults to 1)
|
||||
m['rbf_lengthscale'] = 10
|
||||
|
||||
# optimize
|
||||
m.optimize(max_f_eval=optim_iters)
|
||||
if optimize:
|
||||
m.optimize('bfgs', max_iters=200)
|
||||
if plot:
|
||||
m.plot(plot_limits=(1850, 2050))
|
||||
|
||||
# plot
|
||||
m.plot(plot_limits = (1850, 2050))
|
||||
print(m)
|
||||
return m
|
||||
|
||||
def toy_rbf_1d_50(optim_iters=100):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d_50()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
||||
|
||||
# optimize
|
||||
m.optimize(max_f_eval=optim_iters)
|
||||
|
||||
# plot
|
||||
m.plot()
|
||||
print(m)
|
||||
return m
|
||||
|
||||
def silhouette(optim_iters=100):
|
||||
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
|
||||
data = GPy.util.datasets.silhouette()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
||||
|
||||
# optimize
|
||||
m.optimize(messages=True,max_f_eval=optim_iters)
|
||||
|
||||
print(m)
|
||||
return m
|
||||
|
||||
def coregionalisation_toy2(optim_iters=100):
|
||||
def coregionalization_toy2(optimize=True, plot=True):
|
||||
"""
|
||||
A simple demonstration of coregionalisation on two sinusoidal functions.
|
||||
A simple demonstration of coregionalization on two sinusoidal functions.
|
||||
"""
|
||||
X1 = np.random.rand(50,1)*8
|
||||
X2 = np.random.rand(30,1)*5
|
||||
index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
|
||||
X = np.hstack((np.vstack((X1,X2)),index))
|
||||
Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
|
||||
Y2 = np.sin(X2) + np.random.randn(*X2.shape)*0.05 + 2.
|
||||
Y = np.vstack((Y1,Y2))
|
||||
#build a design matrix with a column of integers indicating the output
|
||||
X1 = np.random.rand(50, 1) * 8
|
||||
X2 = np.random.rand(30, 1) * 5
|
||||
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
|
||||
X = np.hstack((np.vstack((X1, X2)), index))
|
||||
|
||||
#build a suitable set of observed variables
|
||||
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
|
||||
Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
|
||||
Y = np.vstack((Y1, Y2))
|
||||
|
||||
#build the kernel
|
||||
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
|
||||
k2 = GPy.kern.Coregionalise(2,1)
|
||||
k = k1.prod(k2,tensor=True)
|
||||
m = GPy.models.GPRegression(X,Y,kernel=k)
|
||||
m.constrain_fixed('.*rbf_var',1.)
|
||||
#m.constrain_positive('.*kappa')
|
||||
m.optimize('sim',messages=1,max_f_eval=optim_iters)
|
||||
k2 = GPy.kern.coregionalize(2,1)
|
||||
k = k1**k2
|
||||
m = GPy.models.GPRegression(X, Y, kernel=k)
|
||||
m.constrain_fixed('.*rbf_var', 1.)
|
||||
|
||||
if optimize:
|
||||
m.optimize('bfgs', max_iters=100)
|
||||
|
||||
if plot:
|
||||
m.plot(fixed_inputs=[(1,0)])
|
||||
m.plot(fixed_inputs=[(1,1)], ax=pb.gca())
|
||||
|
||||
pb.figure()
|
||||
Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
|
||||
Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
|
||||
mean, var,low,up = m.predict(Xtest1)
|
||||
GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
|
||||
mean, var,low,up = m.predict(Xtest2)
|
||||
GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
|
||||
pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
|
||||
pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
|
||||
return m
|
||||
|
||||
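To predict a single output from the coregionalized model returned above, the output index is appended as an extra input column; a minimal sketch reusing the conventions of the plotting code in this function:

import numpy as np

m = coregionalization_toy2(optimize=True, plot=False)
Xnew = np.linspace(0, 8, 50)[:, None]
# a column of ones selects the second output; zeros would select the first
Xnew_aug = np.hstack((Xnew, np.ones_like(Xnew)))
mean, var, lower, upper = m.predict(Xnew_aug)
print(mean.shape)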
def coregionalisation_toy(optim_iters=100):
|
||||
"""
|
||||
A simple demonstration of coregionalisation on two sinusoidal functions.
|
||||
"""
|
||||
X1 = np.random.rand(50,1)*8
|
||||
X2 = np.random.rand(30,1)*5
|
||||
index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
|
||||
X = np.hstack((np.vstack((X1,X2)),index))
|
||||
Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
|
||||
Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05
|
||||
Y = np.vstack((Y1,Y2))
|
||||
#FIXME: Needs recovering once likelihoods are consolidated
|
||||
#def coregionalization_toy(optimize=True, plot=True):
|
||||
# """
|
||||
# A simple demonstration of coregionalization on two sinusoidal functions.
|
||||
# """
|
||||
# X1 = np.random.rand(50, 1) * 8
|
||||
# X2 = np.random.rand(30, 1) * 5
|
||||
# X = np.vstack((X1, X2))
|
||||
# Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
|
||||
# Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
|
||||
# Y = np.vstack((Y1, Y2))
|
||||
#
|
||||
# k1 = GPy.kern.rbf(1)
|
||||
# m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
|
||||
# m.constrain_fixed('.*rbf_var', 1.)
|
||||
# m.optimize(max_iters=100)
|
||||
#
|
||||
# fig, axes = pb.subplots(2,1)
|
||||
# m.plot(fixed_inputs=[(1,0)],ax=axes[0])
|
||||
# m.plot(fixed_inputs=[(1,1)],ax=axes[1])
|
||||
# axes[0].set_title('Output 0')
|
||||
# axes[1].set_title('Output 1')
|
||||
# return m
|
||||
|
||||
k1 = GPy.kern.rbf(1)
|
||||
k2 = GPy.kern.Coregionalise(2,2)
|
||||
k = k1.prod(k2,tensor=True)
|
||||
m = GPy.models.GPRegression(X,Y,kernel=k)
|
||||
m.constrain_fixed('.*rbf_var',1.)
|
||||
#m.constrain_positive('kappa')
|
||||
m.optimize(max_f_eval=optim_iters)
|
||||
def coregionalization_sparse(optimize=True, plot=True):
|
||||
"""
|
||||
A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
|
||||
"""
|
||||
#fetch the data from the non sparse examples
|
||||
m = coregionalization_toy2(optimize=False, plot=False)
|
||||
X, Y = m.X, m.likelihood.Y
|
||||
|
||||
#construct a model
|
||||
m = GPy.models.SparseGPRegression(X,Y)
|
||||
m.constrain_fixed('iip_\d+_1') # don't optimize the inducing input indexes
|
||||
|
||||
if optimize:
|
||||
m.optimize('bfgs', max_iters=100, messages=1)
|
||||
|
||||
if plot:
|
||||
m.plot(fixed_inputs=[(1,0)])
|
||||
m.plot(fixed_inputs=[(1,1)], ax=pb.gca())
|
||||
|
||||
pb.figure()
|
||||
Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
|
||||
Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
|
||||
mean, var,low,up = m.predict(Xtest1)
|
||||
GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
|
||||
mean, var,low,up = m.predict(Xtest2)
|
||||
GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
|
||||
pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
|
||||
pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
|
||||
return m
|
||||
|
||||
|
||||
def coregionalisation_sparse(optim_iters=100):
|
||||
def epomeo_gpx(max_iters=200, optimize=True, plot=True):
|
||||
"""
|
||||
A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
|
||||
Perform Gaussian process regression on the latitude and longitude data
|
||||
from the Mount Epomeo runs. Requires gpxpy to be installed on your system
|
||||
to load in the data.
|
||||
"""
|
||||
X1 = np.random.rand(500,1)*8
|
||||
X2 = np.random.rand(300,1)*5
|
||||
index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
|
||||
X = np.hstack((np.vstack((X1,X2)),index))
|
||||
Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
|
||||
Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05
|
||||
Y = np.vstack((Y1,Y2))
|
||||
data = GPy.util.datasets.epomeo_gpx()
|
||||
num_data_list = []
|
||||
for Xpart in data['X']:
|
||||
num_data_list.append(Xpart.shape[0])
|
||||
|
||||
num_inducing = 40
|
||||
Z = np.hstack((np.random.rand(num_inducing,1)*8,np.random.randint(0,2,num_inducing)[:,None]))
|
||||
num_data_array = np.array(num_data_list)
|
||||
num_data = num_data_array.sum()
|
||||
Y = np.zeros((num_data, 2))
|
||||
t = np.zeros((num_data, 2))
|
||||
start = 0
|
||||
for Xpart, index in zip(data['X'], range(len(data['X']))):
|
||||
end = start+Xpart.shape[0]
|
||||
t[start:end, :] = np.hstack((Xpart[:, 0:1],
|
||||
index*np.ones((Xpart.shape[0], 1))))
|
||||
Y[start:end, :] = Xpart[:, 1:3]
|
||||
|
||||
num_inducing = 200
|
||||
Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None],
|
||||
np.random.randint(0, 4, num_inducing)[:, None]))
|
||||
|
||||
k1 = GPy.kern.rbf(1)
|
||||
k2 = GPy.kern.Coregionalise(2,2)
|
||||
k = k1.prod(k2,tensor=True) + GPy.kern.white(2,0.001)
|
||||
k2 = GPy.kern.coregionalize(output_dim=5, rank=5)
|
||||
k = k1**k2
|
||||
|
||||
m = GPy.models.SparseGPRegression(X,Y,kernel=k,Z=Z)
|
||||
m.constrain_fixed('.*rbf_var',1.)
|
||||
m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
|
||||
m.constrain_fixed('.*rbf_var', 1.)
|
||||
m.constrain_fixed('iip')
|
||||
m.constrain_bounded('noise_variance',1e-3,1e-1)
|
||||
m.optimize_restarts(5, robust=True, messages=1, max_f_eval=optim_iters)
|
||||
m.constrain_bounded('noise_variance', 1e-3, 1e-1)
|
||||
m.optimize(max_iters=max_iters,messages=True)
|
||||
|
||||
#plotting:
|
||||
pb.figure()
|
||||
Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
|
||||
Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
|
||||
mean, var,low,up = m.predict(Xtest1)
|
||||
GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
|
||||
mean, var,low,up = m.predict(Xtest2)
|
||||
GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
|
||||
pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
|
||||
pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
|
||||
y = pb.ylim()[0]
|
||||
pb.plot(Z[:,0][Z[:,1]==0],np.zeros(np.sum(Z[:,1]==0))+y,'r|',mew=2)
|
||||
pb.plot(Z[:,0][Z[:,1]==1],np.zeros(np.sum(Z[:,1]==1))+y,'g|',mew=2)
|
||||
return m
|
||||
|
||||
|
||||
def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000, optim_iters=300):
|
||||
"""Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisey mode is higher."""
|
||||
def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300, optimize=True, plot=True):
|
||||
"""
|
||||
Show an example of a multimodal error surface for Gaussian process
|
||||
regression. Gene 939 has bimodal behaviour where the noisy mode is
|
||||
higher.
|
||||
"""
|
||||
|
||||
# Contour over a range of length scales and signal/noise ratios.
|
||||
length_scales = np.linspace(0.1, 60., resolution)
|
||||
log_SNRs = np.linspace(-3., 4., resolution)
|
||||
|
||||
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number)
|
||||
#data['Y'] = data['Y'][0::2, :]
|
||||
#data['X'] = data['X'][0::2, :]
|
||||
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
|
||||
# data['Y'] = data['Y'][0::2, :]
|
||||
# data['X'] = data['X'][0::2, :]
|
||||
|
||||
data['Y'] = data['Y'] - np.mean(data['Y'])
|
||||
|
||||
lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
|
||||
pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
|
||||
ax = pb.gca()
|
||||
pb.xlabel('length scale')
|
||||
pb.ylabel('log_10 SNR')
|
||||
if plot:
|
||||
pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
|
||||
ax = pb.gca()
|
||||
pb.xlabel('length scale')
|
||||
pb.ylabel('log_10 SNR')
|
||||
|
||||
xlim = ax.get_xlim()
|
||||
ylim = ax.get_ylim()
|
||||
xlim = ax.get_xlim()
|
||||
ylim = ax.get_ylim()
|
||||
|
||||
# Now run a few optimizations
|
||||
models = []
|
||||
|
|
@ -202,124 +172,347 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
|
|||
optim_point_y = np.empty(2)
|
||||
np.random.seed(seed=seed)
|
||||
for i in range(0, model_restarts):
|
||||
#kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
|
||||
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3,1), lengthscale=np.random.uniform(5,50))
|
||||
# kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
|
||||
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
|
||||
|
||||
m = GPy.models.GPRegression(data['X'],data['Y'], kernel=kern)
|
||||
m['noise_variance'] = np.random.uniform(1e-3,1)
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
|
||||
m['noise_variance'] = np.random.uniform(1e-3, 1)
|
||||
optim_point_x[0] = m['rbf_lengthscale']
|
||||
optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||
|
||||
# optimize
|
||||
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_f_eval=optim_iters)
|
||||
if optimize:
|
||||
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
|
||||
|
||||
optim_point_x[1] = m['rbf_lengthscale']
|
||||
optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||
|
||||
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1]-optim_point_x[0], optim_point_y[1]-optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
|
||||
if plot:
|
||||
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
|
||||
models.append(m)
|
||||
|
||||
ax.set_xlim(xlim)
|
||||
ax.set_ylim(ylim)
|
||||
return m #(models, lls)
|
||||
if plot:
|
||||
ax.set_xlim(xlim)
|
||||
ax.set_ylim(ylim)
|
||||
return m # (models, lls)
|
||||
|
||||
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
|
||||
"""Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
|
||||
"""
|
||||
Evaluate the GP objective function for a given data set for a range of
|
||||
signal to noise ratios and a range of lengthscales.
|
||||
|
||||
:data_set: A data set from the utils.datasets directory.
|
||||
:length_scales: a list of length scales to explore for the contour plot.
|
||||
:log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
|
||||
:kernel: a kernel to use for the 'signal' portion of the data."""
|
||||
:kernel: a kernel to use for the 'signal' portion of the data.
|
||||
"""
|
||||
|
||||
lls = []
|
||||
total_var = np.var(data['Y'])
|
||||
kernel = kernel_call(1, variance=1., lengthscale=1.)
|
||||
Model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel)
|
||||
model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel)
|
||||
for log_SNR in log_SNRs:
|
||||
SNR = 10.**log_SNR
|
||||
noise_var = total_var/(1.+SNR)
|
||||
noise_var = total_var / (1. + SNR)
|
||||
signal_var = total_var - noise_var
|
||||
Model.kern['.*variance'] = signal_var
|
||||
Model['noise_variance'] = noise_var
|
||||
model.kern['.*variance'] = signal_var
|
||||
model['noise_variance'] = noise_var
|
||||
length_scale_lls = []
|
||||
|
||||
for length_scale in length_scales:
|
||||
Model['.*lengthscale'] = length_scale
|
||||
length_scale_lls.append(Model.log_likelihood())
|
||||
model['.*lengthscale'] = length_scale
|
||||
length_scale_lls.append(model.log_likelihood())
|
||||
|
||||
lls.append(length_scale_lls)
|
||||
|
||||
return np.array(lls)
|
||||
|
||||
def sparse_GP_regression_1D(N = 400, num_inducing = 5, optim_iters=100):
|
||||
"""Run a 1D example of a sparse GP regression."""
|
||||
# sample inputs and outputs
|
||||
X = np.random.uniform(-3.,3.,(N,1))
|
||||
Y = np.sin(X)+np.random.randn(N,1)*0.05
|
||||
# construct kernel
|
||||
rbf = GPy.kern.rbf(1)
|
||||
noise = GPy.kern.white(1)
|
||||
kernel = rbf + noise
|
||||
|
||||
def olympic_100m_men(optimize=True, plot=True):
|
||||
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
|
||||
data = GPy.util.datasets.olympic_100m_men()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel, num_inducing=num_inducing)
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
||||
# set the lengthscale to be something sensible (defaults to 1)
|
||||
m['rbf_lengthscale'] = 10
|
||||
|
||||
m.checkgrad(verbose=1)
|
||||
m.optimize('tnc', messages = 1, max_f_eval=optim_iters)
|
||||
m.plot()
|
||||
if optimize:
|
||||
m.optimize('bfgs', max_iters=200)
|
||||
|
||||
if plot:
|
||||
m.plot(plot_limits=(1850, 2050))
|
||||
return m
|
||||
|
||||
def sparse_GP_regression_2D(N = 400, num_inducing = 50, optim_iters=100):
|
||||
"""Run a 2D example of a sparse GP regression."""
|
||||
X = np.random.uniform(-3.,3.,(N,2))
|
||||
Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05
|
||||
|
||||
# construct kernel
|
||||
rbf = GPy.kern.rbf(2)
|
||||
noise = GPy.kern.white(2)
|
||||
kernel = rbf + noise
|
||||
def toy_rbf_1d(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.SparseGPRegression(X,Y,kernel, num_inducing = num_inducing)
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
||||
if optimize:
|
||||
m.optimize('bfgs')
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
return m
|
||||
|
||||
def toy_rbf_1d_50(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d_50()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
||||
if optimize:
|
||||
m.optimize('bfgs')
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
return m
|
||||
|
||||
|
||||
def toy_poisson_rbf_1d(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
x_len = 400
|
||||
X = np.linspace(0, 10, x_len)[:, None]
|
||||
f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
|
||||
Y = np.array([np.random.poisson(np.exp(f)) for f in f_true]).reshape(x_len,1)
|
||||
|
||||
noise_model = GPy.likelihoods.poisson()
|
||||
likelihood = GPy.likelihoods.EP(Y,noise_model)
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
|
||||
|
||||
if optimize:
|
||||
m.optimize('bfgs')
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
return m
|
||||
|
||||
def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
optimizer='scg'
|
||||
x_len = 30
|
||||
X = np.linspace(0, 10, x_len)[:, None]
|
||||
f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
|
||||
Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
|
||||
|
||||
noise_model = GPy.likelihoods.poisson()
|
||||
likelihood = GPy.likelihoods.Laplace(Y,noise_model)
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer)
|
||||
if plot:
|
||||
m.plot()
|
||||
# plot the real underlying rate function
|
||||
pb.plot(X, np.exp(f_true), '--k', linewidth=2)
|
||||
|
||||
return m
|
||||
|
||||
def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
|
||||
# Create an artificial dataset where the values in the targets (Y)
|
||||
# only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
|
||||
# see if this dependency can be recovered
|
||||
X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||
X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||
X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0))
|
||||
X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0))
|
||||
X = np.hstack((X1, X2, X3, X4))
|
||||
|
||||
Y1 = np.asarray(2 * X[:, 0] + 3).reshape(-1, 1)
|
||||
Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0])).reshape(-1, 1)
|
||||
Y = np.hstack((Y1, Y2))
|
||||
|
||||
Y = np.dot(Y, np.random.rand(2, D));
|
||||
Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
|
||||
Y -= Y.mean()
|
||||
Y /= Y.std()
|
||||
|
||||
if kernel_type == 'linear':
|
||||
kernel = GPy.kern.linear(X.shape[1], ARD=1)
|
||||
elif kernel_type == 'rbf_inv':
|
||||
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
|
||||
else:
|
||||
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
|
||||
kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
m = GPy.models.GPRegression(X, Y, kernel)
|
||||
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
|
||||
# m.set_prior('.*lengthscale',len_prior)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||
|
||||
if plot:
|
||||
m.kern.plot_ARD()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
|
||||
# Create an artificial dataset where the values in the targets (Y)
|
||||
# only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
|
||||
# see if this dependency can be recovered
|
||||
X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||
X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||
X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0))
|
||||
X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0))
|
||||
X = np.hstack((X1, X2, X3, X4))
|
||||
|
||||
Y1 = np.asarray(2 * X[:, 0] + 3)[:, None]
|
||||
Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0]))[:, None]
|
||||
Y = np.hstack((Y1, Y2))
|
||||
|
||||
Y = np.dot(Y, np.random.rand(2, D));
|
||||
Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
|
||||
Y -= Y.mean()
|
||||
Y /= Y.std()
|
||||
|
||||
if kernel_type == 'linear':
|
||||
kernel = GPy.kern.linear(X.shape[1], ARD=1)
|
||||
elif kernel_type == 'rbf_inv':
|
||||
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
|
||||
else:
|
||||
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
|
||||
kernel += GPy.kern.bias(X.shape[1])
|
||||
X_variance = np.ones(X.shape) * 0.5
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
|
||||
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
|
||||
# m.set_prior('.*lengthscale',len_prior)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||
|
||||
if plot:
|
||||
m.kern.plot_ARD()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
|
||||
"""Predict the location of a robot given wirelss signal strength readings."""
|
||||
data = GPy.util.datasets.robot_wireless()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize(messages=True, max_iters=max_iters)
|
||||
|
||||
Xpredict = m.predict(data['Ytest'])[0]
|
||||
if plot:
|
||||
pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
|
||||
pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
|
||||
pb.axis('equal')
|
||||
pb.title('WiFi Localization with Gaussian Processes')
|
||||
pb.legend(('True Location', 'Predicted Location'))
|
||||
|
||||
sse = ((data['Xtest'] - Xpredict)**2).sum()
|
||||
|
||||
print m
|
||||
print('Sum of squares error on test data: ' + str(sse))
|
||||
return m
|
||||
|
||||
def silhouette(max_iters=100, optimize=True, plot=True):
|
||||
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
|
||||
data = GPy.util.datasets.silhouette()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize(messages=True, max_iters=max_iters)
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True):
|
||||
"""Run a 1D example of a sparse GP regression."""
|
||||
# sample inputs and outputs
|
||||
X = np.random.uniform(-3., 3., (num_samples, 1))
|
||||
Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05
|
||||
# construct kernel
|
||||
rbf = GPy.kern.rbf(1)
|
||||
# create simple GP Model
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
|
||||
m.checkgrad(verbose=1)
|
||||
|
||||
if optimize:
|
||||
m.optimize('tnc', messages=1, max_iters=max_iters)
|
||||
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
return m
|
||||
|
||||
def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True):
|
||||
"""Run a 2D example of a sparse GP regression."""
|
||||
X = np.random.uniform(-3., 3., (num_samples, 2))
|
||||
Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
|
||||
|
||||
# construct kernel
|
||||
rbf = GPy.kern.rbf(2)
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
|
||||
|
||||
# constrain all parameters to be positive (but not inducing inputs)
|
||||
m.set('.*len',2.)
|
||||
m['.*len'] = 2.
|
||||
|
||||
m.checkgrad()
|
||||
|
||||
# optimize and plot
|
||||
m.optimize('tnc', messages = 1, max_f_eval=optim_iters)
|
||||
m.plot()
|
||||
print(m)
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize('tnc', messages=1, max_iters=max_iters)
|
||||
|
||||
# plot
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def uncertain_inputs_sparse_regression(optim_iters=100):
|
||||
def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
|
||||
"""Run a 1D example of a sparse GP regression with uncertain inputs."""
|
||||
fig, axes = pb.subplots(1,2,figsize=(12,5))
|
||||
fig, axes = pb.subplots(1, 2, figsize=(12, 5))
|
||||
|
||||
# sample inputs and outputs
|
||||
S = np.ones((20,1))
|
||||
X = np.random.uniform(-3.,3.,(20,1))
|
||||
Y = np.sin(X)+np.random.randn(20,1)*0.05
|
||||
#likelihood = GPy.likelihoods.Gaussian(Y)
|
||||
Z = np.random.uniform(-3.,3.,(7,1))
|
||||
S = np.ones((20, 1))
|
||||
X = np.random.uniform(-3., 3., (20, 1))
|
||||
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
|
||||
# likelihood = GPy.likelihoods.Gaussian(Y)
|
||||
Z = np.random.uniform(-3., 3., (7, 1))
|
||||
|
||||
k = GPy.kern.rbf(1) + GPy.kern.white(1)
|
||||
k = GPy.kern.rbf(1)
|
||||
|
||||
# create simple GP Model - no input uncertainty on this one
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
|
||||
m.optimize('scg', messages=1, max_f_eval=optim_iters)
|
||||
m.plot(ax=axes[0])
|
||||
axes[0].set_title('no input uncertainty')
|
||||
|
||||
if optimize:
|
||||
m.optimize('scg', messages=1, max_iters=max_iters)
|
||||
|
||||
#the same Model with uncertainty
|
||||
if plot:
|
||||
m.plot(ax=axes[0])
|
||||
axes[0].set_title('no input uncertainty')
|
||||
print m
|
||||
|
||||
# the same Model with uncertainty
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
|
||||
m.optimize('scg', messages=1, max_f_eval=optim_iters)
|
||||
m.plot(ax=axes[1])
|
||||
axes[1].set_title('with input uncertainty')
|
||||
print(m)
|
||||
|
||||
fig.canvas.draw()
|
||||
if optimize:
|
||||
m.optimize('scg', messages=1, max_iters=max_iters)
|
||||
if plot:
|
||||
m.plot(ax=axes[1])
|
||||
axes[1].set_title('with input uncertainty')
|
||||
fig.canvas.draw()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ import pylab as pb
|
|||
import numpy as np
|
||||
import GPy
|
||||
|
||||
def toy_1d():
|
||||
def toy_1d(optimize=True, plot=True):
|
||||
N = 2000
|
||||
M = 20
|
||||
|
||||
|
|
@ -16,25 +16,22 @@ def toy_1d():
|
|||
|
||||
m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
|
||||
m.constrain_bounded('noise_variance',1e-3,1e-1)
|
||||
m.constrain_bounded('white_variance',1e-3,1e-1)
|
||||
|
||||
m.param_steplength = 1e-4
|
||||
|
||||
fig = pb.figure()
|
||||
ax = fig.add_subplot(111)
|
||||
def cb():
|
||||
ax.cla()
|
||||
m.plot(ax=ax,Z_height=-3)
|
||||
ax.set_ylim(-3,3)
|
||||
fig.canvas.draw()
|
||||
if plot:
|
||||
fig = pb.figure()
|
||||
ax = fig.add_subplot(111)
|
||||
def cb(foo):
|
||||
ax.cla()
|
||||
m.plot(ax=ax,Z_height=-3)
|
||||
ax.set_ylim(-3,3)
|
||||
fig.canvas.draw()
|
||||
|
||||
m.optimize(500, callback=cb, callback_interval=1)
|
||||
if optimize:
|
||||
m.optimize(500, callback=cb, callback_interval=1)
|
||||
|
||||
m.plot_traces()
|
||||
if plot:
|
||||
m.plot_traces()
|
||||
return m
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ pb.ion()
|
|||
import numpy as np
|
||||
import GPy
|
||||
|
||||
def tuto_GP_regression():
|
||||
def tuto_GP_regression(optimize=True, plot=True):
|
||||
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
|
||||
|
||||
X = np.random.uniform(-3.,3.,(20,1))
|
||||
|
|
@ -22,7 +22,8 @@ def tuto_GP_regression():
|
|||
m = GPy.models.GPRegression(X, Y, kernel)
|
||||
|
||||
print m
|
||||
m.plot()
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
m.constrain_positive('')
|
||||
|
||||
|
|
@ -31,9 +32,9 @@ def tuto_GP_regression():
|
|||
m.constrain_bounded('.*lengthscale',1.,10. )
|
||||
m.constrain_fixed('.*noise',0.0025)
|
||||
|
||||
m.optimize()
|
||||
|
||||
m.optimize_restarts(num_restarts = 10)
|
||||
if optimize:
|
||||
m.optimize()
|
||||
m.optimize_restarts(num_restarts = 10)
|
||||
|
||||
#######################################################
|
||||
#######################################################
|
||||
|
|
@ -51,12 +52,15 @@ def tuto_GP_regression():
|
|||
m.constrain_positive('')
|
||||
|
||||
# optimize and plot
|
||||
m.optimize('tnc', max_f_eval = 1000)
|
||||
m.plot()
|
||||
print(m)
|
||||
if optimize:
|
||||
m.optimize('tnc', max_f_eval = 1000)
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
print m
|
||||
return(m)
|
||||
|
||||
def tuto_kernel_overview():
|
||||
def tuto_kernel_overview(optimize=True, plot=True):
|
||||
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
|
||||
ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
|
||||
ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
|
||||
|
|
@ -64,9 +68,10 @@ def tuto_kernel_overview():
|
|||
|
||||
print ker2
|
||||
|
||||
ker1.plot()
|
||||
ker2.plot()
|
||||
ker3.plot()
|
||||
if plot:
|
||||
ker1.plot()
|
||||
ker2.plot()
|
||||
ker3.plot()
|
||||
|
||||
k1 = GPy.kern.rbf(1,1.,2.)
|
||||
k2 = GPy.kern.Matern32(1, 0.5, 0.2)
|
||||
|
|
@ -114,30 +119,32 @@ def tuto_kernel_overview():
|
|||
|
||||
# Create GP regression model
|
||||
m = GPy.models.GPRegression(X, Y, Kanova)
|
||||
fig = pb.figure(figsize=(5,5))
|
||||
ax = fig.add_subplot(111)
|
||||
m.plot(ax=ax)
|
||||
|
||||
pb.figure(figsize=(20,3))
|
||||
pb.subplots_adjust(wspace=0.5)
|
||||
axs = pb.subplot(1,5,1)
|
||||
m.plot(ax=axs)
|
||||
pb.subplot(1,5,2)
|
||||
pb.ylabel("= ",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,3)
|
||||
m.plot(ax=axs, which_parts=[False,True,False,False])
|
||||
pb.ylabel("cst +",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,4)
|
||||
m.plot(ax=axs, which_parts=[False,False,True,False])
|
||||
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,5)
|
||||
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
|
||||
m.plot(ax=axs, which_parts=[False,False,False,True])
|
||||
if plot:
|
||||
fig = pb.figure(figsize=(5,5))
|
||||
ax = fig.add_subplot(111)
|
||||
m.plot(ax=ax)
|
||||
|
||||
pb.figure(figsize=(20,3))
|
||||
pb.subplots_adjust(wspace=0.5)
|
||||
axs = pb.subplot(1,5,1)
|
||||
m.plot(ax=axs)
|
||||
pb.subplot(1,5,2)
|
||||
pb.ylabel("= ",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,3)
|
||||
m.plot(ax=axs, which_parts=[False,True,False,False])
|
||||
pb.ylabel("cst +",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,4)
|
||||
m.plot(ax=axs, which_parts=[False,False,True,False])
|
||||
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,5)
|
||||
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
|
||||
m.plot(ax=axs, which_parts=[False,False,False,True])
|
||||
|
||||
return(m)
|
||||
|
||||
|
||||
def model_interaction():
|
||||
def model_interaction(optimize=True, plot=True):
|
||||
X = np.random.randn(20,1)
|
||||
Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
|
||||
k = GPy.kern.rbf(1) + GPy.kern.bias(1)
|
||||
|
|
|
|||
7
GPy/gpy_config.cfg
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# This is the configuration file for GPy
|
||||
|
||||
[parallel]
|
||||
# Enable openmp support. This speeds up some computations, depending on the number
|
||||
# of cores available. Setting up a compiler with openmp support can be difficult on
|
||||
# some platforms, hence this option.
|
||||
openmp=False
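
This file uses plain INI syntax, so a minimal sketch of reading it with the Python 2 standard library looks like the following (how GPy itself loads the file is not shown in this hunk, so treat the path and usage as assumptions):

import ConfigParser

config = ConfigParser.ConfigParser()
config.read('GPy/gpy_config.cfg')
# getboolean turns the 'False' string into a Python boolean
use_openmp = config.getboolean('parallel', 'openmp')
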
|
||||
|
|
@ -233,7 +233,7 @@ class CGD(Async_Optimize):
|
|||
"""
|
||||
opt_async(self, f, df, x0, callback, update_rule=FletcherReeves,
|
||||
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
|
||||
report_every=10, *args, **kwargs)
|
||||
report_every=10, \*args, \*\*kwargs)
|
||||
|
||||
callback gets called every `report_every` iterations
|
||||
|
||||
|
|
@ -244,16 +244,14 @@ class CGD(Async_Optimize):
|
|||
|
||||
f, and df will be called with
|
||||
|
||||
f(xi, *args, **kwargs)
|
||||
df(xi, *args, **kwargs)
|
||||
f(xi, \*args, \*\*kwargs)
|
||||
df(xi, \*args, \*\*kwargs)
|
||||
|
||||
**returns**
|
||||
-----------
|
||||
**Returns:**
|
||||
|
||||
Started `Process` object, optimizing asynchronously
|
||||
|
||||
**calls**
|
||||
---------
|
||||
**Calls:**
|
||||
|
||||
callback(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message)
|
||||
|
||||
|
|
@ -265,7 +263,7 @@ class CGD(Async_Optimize):
|
|||
"""
|
||||
opt(self, f, df, x0, callback=None, update_rule=FletcherReeves,
|
||||
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
|
||||
report_every=10, *args, **kwargs)
|
||||
report_every=10, \*args, \*\*kwargs)
|
||||
|
||||
Minimize f, calling callback every `report_every` iterations with following syntax:
|
||||
|
||||
|
|
@ -276,11 +274,10 @@ class CGD(Async_Optimize):
|
|||
|
||||
f, and df will be called with
|
||||
|
||||
f(xi, *args, **kwargs)
|
||||
df(xi, *args, **kwargs)
|
||||
f(xi, \*args, \*\*kwargs)
|
||||
df(xi, \*args, \*\*kwargs)
|
||||
|
||||
**returns**
|
||||
---------
|
||||
|
||||
x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message
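
As a sketch of the callback syntax documented earlier in this class (the seven values listed above), any callable of this form will do; the body below is illustrative only:

def report(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message):
    # invoked every `report_every` iterations with the optimizer's current state
    print 'iter %d: f = %.4e, f-calls = %d, status = %s' % (iteration, f_opt, function_calls, status_message)
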
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
import pylab as pb
|
||||
import datetime as dt
|
||||
from scipy import optimize
|
||||
from warnings import warn
|
||||
|
||||
try:
|
||||
import rasmussens_minimize as rasm
|
||||
|
|
@ -28,7 +29,7 @@ class Optimizer():
|
|||
|
||||
"""
|
||||
def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, max_iters=1e3,
|
||||
ftol=None, gtol=None, xtol=None):
|
||||
ftol=None, gtol=None, xtol=None, bfgs_factor=None):
|
||||
self.opt_name = None
|
||||
self.x_init = x_init
|
||||
self.messages = messages
|
||||
|
|
@ -38,6 +39,7 @@ class Optimizer():
|
|||
self.status = None
|
||||
self.max_f_eval = int(max_f_eval)
|
||||
self.max_iters = int(max_iters)
|
||||
self.bfgs_factor = bfgs_factor
|
||||
self.trace = None
|
||||
self.time = "Not available"
|
||||
self.xtol = xtol
|
||||
|
|
@ -127,9 +129,11 @@ class opt_lbfgsb(Optimizer):
|
|||
print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
|
||||
if self.gtol is not None:
|
||||
opt_dict['pgtol'] = self.gtol
|
||||
if self.bfgs_factor is not None:
|
||||
opt_dict['factr'] = self.bfgs_factor
|
||||
|
||||
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
|
||||
maxfun=self.max_f_eval, **opt_dict)
|
||||
maxfun=self.max_iters, **opt_dict)
|
||||
self.x_opt = opt_result[0]
|
||||
self.f_opt = f_fp(self.x_opt)[0]
|
||||
self.funct_eval = opt_result[2]['funcalls']
|
||||
|
|
@ -198,17 +202,22 @@ class opt_rasm(Optimizer):
|
|||
|
||||
class opt_SCG(Optimizer):
|
||||
def __init__(self, *args, **kwargs):
|
||||
if 'max_f_eval' in kwargs:
|
||||
warn("max_f_eval deprecated for SCG optimizer: use max_iters instead!\nIgnoring max_f_eval!", FutureWarning)
|
||||
Optimizer.__init__(self, *args, **kwargs)
|
||||
|
||||
self.opt_name = "Scaled Conjugate Gradients"
|
||||
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
assert not f is None
|
||||
assert not fp is None
|
||||
|
||||
opt_result = SCG(f, fp, self.x_init, display=self.messages,
|
||||
maxiters=self.max_iters,
|
||||
max_f_eval=self.max_f_eval,
|
||||
xtol=self.xtol, ftol=self.ftol,
|
||||
gtol=self.gtol)
|
||||
|
||||
self.x_opt = opt_result[0]
|
||||
self.trace = opt_result[1]
|
||||
self.f_opt = self.trace[-1]
|
||||
|
|
|
|||
|
|
@ -26,13 +26,16 @@ import numpy as np
|
|||
import sys
|
||||
|
||||
|
||||
def print_out(len_maxiters, display, fnow, current_grad, beta, iteration):
|
||||
if display:
|
||||
print '\r',
|
||||
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
|
||||
sys.stdout.flush()
|
||||
def print_out(len_maxiters, fnow, current_grad, beta, iteration):
|
||||
print '\r',
|
||||
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
|
||||
sys.stdout.flush()
|
||||
|
||||
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None):
|
||||
def exponents(fnow, current_grad):
|
||||
exps = [np.abs(fnow), current_grad]
|
||||
return np.sign(exps) * np.log10(exps).astype(int)
|
||||
|
||||
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None):
|
||||
"""
|
||||
Optimisation through Scaled Conjugate Gradients (SCG)
|
||||
|
||||
|
|
@ -52,11 +55,14 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
|||
ftol = 1e-6
|
||||
if gtol is None:
|
||||
gtol = 1e-5
|
||||
|
||||
sigma0 = 1.0e-8
|
||||
fold = f(x, *optargs) # Initial function value.
|
||||
function_eval = 1
|
||||
fnow = fold
|
||||
gradnew = gradf(x, *optargs) # Initial gradient.
|
||||
if any(np.isnan(gradnew)):
|
||||
raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
|
||||
current_grad = np.dot(gradnew, gradnew)
|
||||
gradold = gradnew.copy()
|
||||
d = -gradnew # Initial search direction.
|
||||
|
|
@ -64,7 +70,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
|||
nsuccess = 0 # nsuccess counts number of successes.
|
||||
beta = 1.0 # Initial scale parameter.
|
||||
betamin = 1.0e-60 # Lower bound on scale.
|
||||
betamax = 1.0e100 # Upper bound on scale.
|
||||
betamax = 1.0e50 # Upper bound on scale.
|
||||
status = "Not converged"
|
||||
|
||||
flog = [fold]
|
||||
|
|
@ -74,6 +80,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
|||
len_maxiters = len(str(maxiters))
|
||||
if display:
|
||||
print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
|
||||
exps = exponents(fnow, current_grad)
|
||||
p_iter = iteration
|
||||
|
||||
# Main optimization loop.
|
||||
while iteration < maxiters:
|
||||
|
|
@ -103,9 +111,9 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
|||
fnew = f(xnew, *optargs)
|
||||
function_eval += 1
|
||||
|
||||
if function_eval >= max_f_eval:
|
||||
status = "Maximum number of function evaluations exceeded"
|
||||
break
|
||||
# if function_eval >= max_f_eval:
|
||||
# status = "maximum number of function evaluations exceeded"
|
||||
# break
|
||||
# return x, flog, function_eval, status
|
||||
|
||||
Delta = 2.*(fnew - fold) / (alpha * mu)
|
||||
|
|
@ -122,15 +130,28 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
|||
flog.append(fnow) # Current function value
|
||||
|
||||
iteration += 1
|
||||
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
|
||||
if display:
|
||||
print_out(len_maxiters, fnow, current_grad, beta, iteration)
|
||||
n_exps = exponents(fnow, current_grad)
|
||||
if iteration - p_iter >= 20 * np.random.rand():
|
||||
a = iteration >= p_iter * 2.78
|
||||
b = np.any(n_exps < exps)
|
||||
if a or b:
|
||||
p_iter = iteration
|
||||
print ''
|
||||
if b:
|
||||
exps = n_exps
|
||||
|
||||
if success:
|
||||
# Test for termination
|
||||
if (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol):
|
||||
status = 'converged'
|
||||
|
||||
if (np.abs(fnew - fold) < ftol):
|
||||
status = 'converged - relative reduction in objective'
|
||||
break
|
||||
# return x, flog, function_eval, status
|
||||
|
||||
elif (np.max(np.abs(alpha * d)) < xtol):
|
||||
status = 'converged - relative stepsize'
|
||||
break
|
||||
else:
|
||||
# Update variables for new position
|
||||
gradnew = gradf(x, *optargs)
|
||||
|
|
@ -139,7 +160,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
|||
fold = fnew
|
||||
# If the gradient is zero then we are done.
|
||||
if current_grad <= gtol:
|
||||
status = 'converged'
|
||||
status = 'converged - relative reduction in gradient'
|
||||
break
|
||||
# return x, flog, function_eval, status
|
||||
|
||||
|
|
@ -164,6 +185,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
|||
status = "maxiter exceeded"
|
||||
|
||||
if display:
|
||||
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
|
||||
print_out(len_maxiters, fnow, current_grad, beta, iteration)
|
||||
print ""
|
||||
print status
|
||||
return x, flog, function_eval, status
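
A minimal usage sketch of SCG on a simple quadratic, relying only on the signature and return values shown in this file (the objective and starting point are made up for the example):

import numpy as np

def f(x):
    return float(np.sum((x - 1.0) ** 2))

def gradf(x):
    return 2.0 * (x - 1.0)

x0 = np.zeros(3)
x_opt, flog, n_evals, status = SCG(f, gradf, x0, maxiters=200, display=False)
# x_opt should end up close to [1, 1, 1]; flog holds the objective value per iteration
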
|
||||
|
|
|
|||
|
|
@ -10,11 +10,10 @@ class opt_SGD(Optimizer):
|
|||
"""
|
||||
Optimize using stochastic gradient descent.
|
||||
|
||||
*** Parameters ***
|
||||
Model: reference to the Model object
|
||||
iterations: number of iterations
|
||||
learning_rate: learning rate
|
||||
momentum: momentum
|
||||
:param Model: reference to the Model object
|
||||
:param iterations: number of iterations
|
||||
:param learning_rate: learning rate
|
||||
:param momentum: momentum
|
||||
|
||||
"""
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, symmetric, Coregionalise, rational_quadratic, Fixed, rbfcos, IndependentOutputs
|
||||
from constructors import *
|
||||
try:
|
||||
from constructors import rbf_sympy, sympykern # these depend on sympy
|
||||
except:
|
||||
pass
|
||||
from kern import kern
|
||||
from kern import *
|
||||
|
|
|
|||
|
|
@ -1,33 +1,27 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from kern import kern
|
||||
import parts
|
||||
|
||||
from rbf import rbf as rbfpart
|
||||
from white import white as whitepart
|
||||
from linear import linear as linearpart
|
||||
from exponential import exponential as exponentialpart
|
||||
from Matern32 import Matern32 as Matern32part
|
||||
from Matern52 import Matern52 as Matern52part
|
||||
from bias import bias as biaspart
|
||||
from fixed import Fixed as fixedpart
|
||||
from finite_dimensional import finite_dimensional as finite_dimensionalpart
|
||||
from spline import spline as splinepart
|
||||
from Brownian import Brownian as Brownianpart
|
||||
from periodic_exponential import periodic_exponential as periodic_exponentialpart
|
||||
from periodic_Matern32 import periodic_Matern32 as periodic_Matern32part
|
||||
from periodic_Matern52 import periodic_Matern52 as periodic_Matern52part
|
||||
from prod import prod as prodpart
|
||||
from symmetric import symmetric as symmetric_part
|
||||
from coregionalise import Coregionalise as coregionalise_part
|
||||
from rational_quadratic import rational_quadratic as rational_quadraticpart
|
||||
from rbfcos import rbfcos as rbfcospart
|
||||
from independent_outputs import IndependentOutputs as independent_output_part
|
||||
#TODO these constructors are not as clean as we'd like. Tidy the code up
|
||||
#using meta-classes to make the objects construct properly without them.
|
||||
|
||||
def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False):
|
||||
"""
|
||||
Construct an RBF kernel parameterized by its inverse lengthscale
|
||||
|
||||
:param input_dim: dimensionality of the kernel, obligatory
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
:param inv_lengthscale: the inverse lengthscale of the kernel
|
||||
:type inv_lengthscale: float
|
||||
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
|
||||
"""
|
||||
|
|
@ -41,35 +35,122 @@ def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
|
|||
:type lengthscale: float
|
||||
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = rbfpart(input_dim,variance,lengthscale,ARD)
|
||||
part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def linear(input_dim,variances=None,ARD=False):
|
||||
"""
|
||||
Construct a linear kernel.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
input_dimD (int), obligatory
|
||||
variances (np.ndarray)
|
||||
ARD (boolean)
|
||||
:param input_dim: dimensionality of the kernel, obligatory
|
||||
:type input_dim: int
|
||||
:param variances:
|
||||
:type variances: np.ndarray
|
||||
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = linearpart(input_dim,variances,ARD)
|
||||
part = parts.linear.Linear(input_dim,variances,ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False):
|
||||
"""
|
||||
Construct an MLP kernel
|
||||
|
||||
:param input_dim: dimensionality of the kernel, obligatory
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
:param weight_variance: the variance of the input weights in the neural network
|
||||
:type weight_variance: vector of weight variances for input weights in the neural network (length 1 if kernel is isotropic)
|
||||
:param bias_variance: the variance of the biases in the neural network.
|
||||
:type bias_variance: float
|
||||
:param ARD: Auto Relevance Determination (allows for ARD version of covariance)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def gibbs(input_dim,variance=1., mapping=None):
|
||||
"""
|
||||
|
||||
Gibbs and MacKay non-stationary covariance function.
|
||||
|
||||
.. math::
|
||||
|
||||
r = \\sqrt{((x_i - x_j)'*(x_i - x_j))}
|
||||
|
||||
k(x_i, x_j) = \\sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
|
||||
|
||||
Z = \\sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x'))}
|
||||
|
||||
Where :math:`l(x)` is a function giving the length scale as a function of space.
|
||||
|
||||
This is the non stationary kernel proposed by Mark Gibbs in his 1997
|
||||
thesis. It is similar to an RBF but has a length scale that varies
|
||||
with input location. This leads to an additional term in front of
|
||||
the kernel.
|
||||
|
||||
The parameters are :math:`\\sigma^2`, the process variance, and the parameters of l(x), which is a function that can be specified by the user; by default a multi-layer perceptron is used.
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance :math:`\\sigma^2`
|
||||
:type variance: float
|
||||
:param mapping: the mapping that gives the lengthscale across the input space.
|
||||
:type mapping: GPy.core.Mapping
|
||||
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
|
||||
:type ARD: Boolean
|
||||
:rtype: Kernpart object
|
||||
|
||||
"""
|
||||
part = parts.gibbs.Gibbs(input_dim,variance,mapping)
|
||||
return kern(input_dim, [part])
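
A short usage sketch of this constructor, based only on the signature above (with mapping=None a default lengthscale mapping is assumed to be built internally):

import numpy as np
import GPy

k = GPy.kern.gibbs(input_dim=1, variance=1.)    # default lengthscale mapping
X = np.linspace(-1., 1., 10)[:, None]
K = k.K(X)    # covariance matrix with an input-dependent lengthscale
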
|
||||
|
||||
def hetero(input_dim, mapping=None, transform=None):
|
||||
"""
|
||||
"""
|
||||
part = parts.hetero.Hetero(input_dim,mapping,transform)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False):
|
||||
"""
|
||||
Construct a polynomial kernel
|
||||
|
||||
:param input_dim: dimensionality of the kernel, obligatory
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
:param weight_variance: the variance of the input weights
|
||||
:type weight_variance: vector of weight variances for input weights.
|
||||
:param bias_variance: the variance of the biases.
|
||||
:type bias_variance: float
|
||||
:param degree: the degree of the polynomial
|
||||
:type degree: int
|
||||
:param ARD: Auto Relevance Determination (allows for ARD version of covariance)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def white(input_dim,variance=1.):
|
||||
"""
|
||||
Construct a white kernel.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
input_dimD (int), obligatory
|
||||
variance (float)
|
||||
:param input_dim: dimensionality of the kernel, obligatory
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
|
||||
"""
|
||||
part = whitepart(input_dim,variance)
|
||||
part = parts.white.White(input_dim,variance)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
|
||||
def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
|
||||
"""
|
||||
Construct an exponential kernel
|
||||
|
|
@ -82,8 +163,9 @@ def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
|
|||
:type lengthscale: float
|
||||
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = exponentialpart(input_dim,variance, lengthscale, ARD)
|
||||
part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
|
||||
|
|
@ -98,8 +180,9 @@ def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
|
|||
:type lengthscale: float
|
||||
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = Matern32part(input_dim,variance, lengthscale, ARD)
|
||||
part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
|
||||
|
|
@ -114,31 +197,38 @@ def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
|
|||
:type lengthscale: float
|
||||
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
|
||||
:type ARD: Boolean
|
||||
|
||||
"""
|
||||
part = Matern52part(input_dim, variance, lengthscale, ARD)
|
||||
part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def bias(input_dim, variance=1.):
|
||||
"""
|
||||
Construct a bias kernel.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
input_dim (int), obligatory
|
||||
variance (float)
|
||||
:param input_dim: dimensionality of the kernel, obligatory
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
|
||||
"""
|
||||
part = biaspart(input_dim, variance)
|
||||
part = parts.bias.Bias(input_dim, variance)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def finite_dimensional(input_dim, F, G, variances=1., weights=None):
|
||||
"""
|
||||
Construct a finite dimensional kernel.
|
||||
input_dim: int - the number of input dimensions
|
||||
F: np.array of functions with shape (n,) - the n basis functions
|
||||
G: np.array with shape (n,n) - the Gram matrix associated to F
|
||||
variances : np.ndarray with shape (n,)
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param F: np.array of functions with shape (n,) - the n basis functions
|
||||
:type F: np.array
|
||||
:param G: np.array with shape (n,n) - the Gram matrix associated to F
|
||||
:type G: np.array
|
||||
:param variances: np.ndarray with shape (n,)
|
||||
:type variances: np.ndarray
|
||||
"""
|
||||
part = finite_dimensionalpart(input_dim, F, G, variances, weights)
|
||||
part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def spline(input_dim, variance=1.):
|
||||
|
|
@ -149,8 +239,9 @@ def spline(input_dim, variance=1.):
|
|||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
|
||||
"""
|
||||
part = splinepart(input_dim, variance)
|
||||
part = parts.spline.Spline(input_dim, variance)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def Brownian(input_dim, variance=1.):
|
||||
|
|
@ -161,43 +252,111 @@ def Brownian(input_dim, variance=1.):
|
|||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
|
||||
"""
|
||||
part = Brownianpart(input_dim, variance)
|
||||
part = parts.Brownian.Brownian(input_dim, variance)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
try:
|
||||
import sympy as sp
|
||||
from sympykern import spkern
|
||||
from sympy.parsing.sympy_parser import parse_expr
|
||||
sympy_available = True
|
||||
except ImportError:
|
||||
sympy_available = False
|
||||
|
||||
if sympy_available:
|
||||
from parts.sympykern import spkern
|
||||
from sympy.parsing.sympy_parser import parse_expr
|
||||
from GPy.util import symbolic
|
||||
|
||||
def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.):
|
||||
"""
|
||||
Radial Basis Function covariance.
|
||||
"""
|
||||
X = [sp.var('x%i' % i) for i in range(input_dim)]
|
||||
Z = [sp.var('z%i' % i) for i in range(input_dim)]
|
||||
rbf_variance = sp.var('rbf_variance',positive=True)
|
||||
X = sp.symbols('x_:' + str(input_dim))
|
||||
Z = sp.symbols('z_:' + str(input_dim))
|
||||
variance = sp.var('variance',positive=True)
|
||||
if ARD:
|
||||
rbf_lengthscales = [sp.var('rbf_lengthscale_%i' % i, positive=True) for i in range(input_dim)]
|
||||
dist_string = ' + '.join(['(x%i-z%i)**2/rbf_lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
|
||||
lengthscales = sp.symbols('lengthscale_:' + str(input_dim))
|
||||
dist_string = ' + '.join(['(x_%i-z_%i)**2/lengthscale%i**2' % (i, i, i) for i in range(input_dim)])
|
||||
dist = parse_expr(dist_string)
|
||||
f = rbf_variance*sp.exp(-dist/2.)
|
||||
f = variance*sp.exp(-dist/2.)
|
||||
else:
|
||||
rbf_lengthscale = sp.var('rbf_lengthscale',positive=True)
|
||||
dist_string = ' + '.join(['(x%i-z%i)**2' % (i, i) for i in range(input_dim)])
|
||||
lengthscale = sp.var('lengthscale',positive=True)
|
||||
dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(input_dim)])
|
||||
dist = parse_expr(dist_string)
|
||||
f = rbf_variance*sp.exp(-dist/(2*rbf_lengthscale**2))
|
||||
return kern(input_dim, [spkern(input_dim, f)])
|
||||
f = variance*sp.exp(-dist/(2*lengthscale**2))
|
||||
return kern(input_dim, [spkern(input_dim, f, name='rbf_sympy')])
|
||||
|
||||
def sympykern(input_dim, k):
|
||||
def eq_sympy(input_dim, output_dim, ARD=False):
|
||||
"""
|
||||
A kernel from a symbolic sympy representation
|
||||
Latent force model covariance, exponentiated quadratic with multiple outputs. Derived from a diffusion equation with the initial spatial condition laid down by a Gaussian process with lengthscale given by shared_lengthscale.
|
||||
|
||||
See IEEE Trans Pattern Anal Mach Intell. 2013 Nov;35(11):2693-705. doi: 10.1109/TPAMI.2013.86. Linear latent force models using Gaussian processes. Alvarez MA, Luengo D, Lawrence ND.
|
||||
|
||||
:param input_dim: Dimensionality of the kernel
|
||||
:type input_dim: int
|
||||
:param output_dim: number of outputs in the covariance function.
|
||||
:type output_dim: int
|
||||
:param ARD: whether or not to use ARD (default False).
|
||||
:type ARD: bool
|
||||
|
||||
"""
|
||||
return kern(input_dim, [spkern(input_dim, k)])
|
||||
real_input_dim = input_dim
|
||||
if output_dim>1:
|
||||
real_input_dim -= 1
|
||||
X = sp.symbols('x_:' + str(real_input_dim))
|
||||
Z = sp.symbols('z_:' + str(real_input_dim))
|
||||
scale = sp.var('scale_i scale_j',positive=True)
|
||||
if ARD:
|
||||
lengthscales = [sp.var('lengthscale%i_i lengthscale%i_j' % i, positive=True) for i in range(real_input_dim)]
|
||||
shared_lengthscales = [sp.var('shared_lengthscale%i' % i, positive=True) for i in range(real_input_dim)]
|
||||
dist_string = ' + '.join(['(x_%i-z_%i)**2/(shared_lengthscale%i**2 + lengthscale%i_i**2 + lengthscale%i_j**2)' % (i, i, i) for i in range(real_input_dim)])
|
||||
dist = parse_expr(dist_string)
|
||||
f = variance*sp.exp(-dist/2.)
|
||||
else:
|
||||
lengthscales = sp.var('lengthscale_i lengthscale_j',positive=True)
|
||||
shared_lengthscale = sp.var('shared_lengthscale',positive=True)
|
||||
dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(real_input_dim)])
|
||||
dist = parse_expr(dist_string)
|
||||
f = scale_i*scale_j*sp.exp(-dist/(2*(lengthscale_i**2 + lengthscale_j**2 + shared_lengthscale**2)))
|
||||
return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])
|
||||
|
||||
def ode1_eq(output_dim=1):
|
||||
"""
|
||||
Latent force model covariance, first order differential
|
||||
equation driven by exponentiated quadratic.
|
||||
|
||||
See N. D. Lawrence, G. Sanguinetti and M. Rattray. (2007)
|
||||
'Modelling transcriptional regulation using Gaussian
|
||||
processes' in B. Schoelkopf, J. C. Platt and T. Hofmann (eds)
|
||||
Advances in Neural Information Processing Systems, MIT Press,
|
||||
Cambridge, MA, pp 785--792.
|
||||
|
||||
:param output_dim: number of outputs in the covariance function.
|
||||
:type output_dim: int
|
||||
"""
|
||||
input_dim = 2
|
||||
x_0, z_0, decay_i, decay_j, scale_i, scale_j, lengthscale = sp.symbols('x_0, z_0, decay_i, decay_j, scale_i, scale_j, lengthscale')
|
||||
f = scale_i*scale_j*(symbolic.h(x_0, z_0, decay_i, decay_j, lengthscale)
|
||||
+ symbolic.h(z_0, x_0, decay_j, decay_i, lengthscale))
|
||||
return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='ode1_eq')])
|
||||
|
||||
def sympykern(input_dim, k=None, output_dim=1, name=None, param=None):
|
||||
"""
|
||||
A base kernel object, where all the hard work in done by sympy.
|
||||
|
||||
:param k: the covariance function
|
||||
:type k: a positive definite sympy function of x1, z1, x2, z2...
|
||||
|
||||
To construct a new sympy kernel, you'll need to define:
|
||||
- a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
|
||||
- that's it! we'll extract the variables from the function k.
|
||||
|
||||
Note:
|
||||
- to handle multiple inputs, call them x1, z1, etc
|
||||
- to handle multiple correlated outputs, you'll need to define each covariance function and 'cross' covariance function. TODO
|
||||
"""
|
||||
return kern(input_dim, [spkern(input_dim, k=k, output_dim=output_dim, name=name, param=param)])
|
||||
del sympy_available
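
A sketch of defining a custom covariance through the sympykern wrapper above, assuming sympy is importable; the exponential form chosen here is only an illustration of the k(x, z) convention described in the docstring:

import sympy as sp
import GPy

x_0, z_0 = sp.symbols('x_0 z_0')
variance, lengthscale = sp.symbols('variance lengthscale', positive=True)
# a 1-d exponential covariance written as a sympy expression of x_0 and z_0
f = variance * sp.exp(-sp.sqrt((x_0 - z_0) ** 2) / lengthscale)
k = GPy.kern.sympykern(1, k=f, name='exp_sympy')
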
|
||||
|
||||
def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
|
||||
|
|
@ -214,8 +373,9 @@ def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 *
|
|||
:type period: float
|
||||
:param n_freq: the number of frequencies considered for the periodic subspace
|
||||
:type n_freq: int
|
||||
|
||||
"""
|
||||
part = periodic_exponentialpart(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
||||
part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
|
||||
|
|
@ -232,8 +392,9 @@ def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi
|
|||
:type period: float
|
||||
:param n_freq: the number of frequencies considered for the periodic subspace
|
||||
:type n_freq: int
|
||||
|
||||
"""
|
||||
part = periodic_Matern32part(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
||||
part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
|
||||
|
|
@ -250,8 +411,9 @@ def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi
|
|||
:type period: float
|
||||
:param n_freq: the number of frequencies considered for the periodic subspace
|
||||
:type n_freq: int
|
||||
|
||||
"""
|
||||
part = periodic_Matern52part(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
||||
part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def prod(k1,k2,tensor=False):
|
||||
|
|
@ -260,21 +422,60 @@ def prod(k1,k2,tensor=False):
|
|||
|
||||
:param k1, k2: the kernels to multiply
|
||||
:type k1, k2: kernpart
|
||||
:param tensor: The kernels are either multiplied as functions defined on the same input space (default) or on the product of the input spaces
|
||||
:type tensor: Boolean
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
part = prodpart(k1,k2,tensor)
|
||||
part = parts.prod.Prod(k1, k2, tensor)
|
||||
return kern(part.input_dim, [part])
|
||||
|
||||
def symmetric(k):
|
||||
"""
|
||||
Construct a symmetrical kernel from an existing kernel
|
||||
Construct a symmetric kernel from an existing kernel
|
||||
|
||||
The symmetric kernel works by adding two GP functions together, and computing the overall covariance.
|
||||
|
||||
Let f ~ GP(x | 0, k(x, x')). Now let g = f(x) + f(-x).
|
||||
|
||||
It's easy to see that g is a symmetric function: g(x) = g(-x).
|
||||
|
||||
By construction, g is a Gaussian process with mean 0 and covariance
|
||||
|
||||
k(x, x') + k(-x, x') + k(x, -x') + k(-x, -x')
|
||||
|
||||
This constructor builds a covariance function of this form from the initial kernel
|
||||
"""
|
||||
k_ = k.copy()
|
||||
k_.parts = [symmetric_part(p) for p in k.parts]
|
||||
k_.parts = [parts.symmetric.Symmetric(p) for p in k.parts]
|
||||
return k_
|
||||
|
||||
def Coregionalise(Nout,R=1, W=None, kappa=None):
|
||||
p = coregionalise_part(Nout,R,W,kappa)
|
||||
def coregionalize(output_dim,rank=1, W=None, kappa=None):
|
||||
"""
|
||||
Coregionalization matrix B, of the form:
|
||||
|
||||
.. math::
|
||||
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}
|
||||
|
||||
An intrinsic/linear coregionalization kernel of the form:
|
||||
|
||||
.. math::
|
||||
k_2(x, y)=\mathbf{B} k(x, y)
|
||||
|
||||
It is obtained as the tensor product between a kernel k(x,y) and B.
|
||||
|
||||
:param output_dim: the number of outputs to coregionalize
|
||||
:type output_dim: int
|
||||
:param rank: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
|
||||
:type rank: int
|
||||
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
|
||||
:type W: numpy array of dimensionality (output_dim, rank)
|
||||
:param kappa: a vector which allows the outputs to behave independently
|
||||
:type kappa: numpy array of dimensionality (output_dim,)
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
p = parts.coregionalize.Coregionalize(output_dim,rank,W,kappa)
|
||||
return kern(1,[p])
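
A short usage sketch of this constructor: build B explicitly from a low-rank W plus a diagonal kappa and combine it with a data kernel via the tensor product, as described above (the W and kappa values are arbitrary here):

import numpy as np
import GPy

W = np.random.randn(3, 1)       # low-rank part of B
kappa = 0.5 * np.ones(3)        # diagonal part letting outputs behave independently
B = GPy.kern.coregionalize(output_dim=3, rank=1, W=W, kappa=kappa)
icm = GPy.kern.rbf(1) ** B      # intrinsic coregionalization model over (x, output index)
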
|
||||
|
||||
|
||||
|
|
@ -291,36 +492,111 @@ def rational_quadratic(input_dim, variance=1., lengthscale=1., power=1.):
|
|||
:rtype: kern object
|
||||
|
||||
"""
|
||||
part = rational_quadraticpart(input_dim, variance, lengthscale, power)
|
||||
part = parts.rational_quadratic.RationalQuadratic(input_dim, variance, lengthscale, power)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def Fixed(input_dim, K, variance=1.):
|
||||
def fixed(input_dim, K, variance=1.):
|
||||
"""
|
||||
Construct a Fixed effect kernel.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
input_dim (int), obligatory
|
||||
K (np.array), obligatory
|
||||
variance (float)
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int (input_dim=1 is the only value currently supported)
|
||||
:param K: the fixed covariance matrix
|
||||
:type K: np.array
|
||||
:param variance: kernel variance
|
||||
:type variance: float
|
||||
:rtype: kern object
|
||||
"""
|
||||
part = fixedpart(input_dim, K, variance)
|
||||
part = parts.fixed.Fixed(input_dim, K, variance)
|
||||
return kern(input_dim, [part])
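
A usage sketch based only on the signature above: a precomputed covariance over the training points is passed in and scaled by the kernel's variance parameter (the matrix here is arbitrary):

import numpy as np
import GPy

X = np.random.rand(10, 1)
K_pre = np.dot(X, X.T)            # any precomputed PSD matrix over the training points
k = GPy.kern.fixed(1, K_pre)      # fixed-effect kernel, scaled by its variance parameter
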
|
||||
|
||||
def rbfcos(input_dim, variance=1., frequencies=None, bandwidths=None, ARD=False):
|
||||
"""
|
||||
Construct an rbfcos kernel.
|
||||
"""
|
||||
part = rbfcospart(input_dim, variance, frequencies, bandwidths, ARD)
|
||||
part = parts.rbfcos.RBFCos(input_dim, variance, frequencies, bandwidths, ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def IndependentOutputs(k):
|
||||
def independent_outputs(k):
|
||||
"""
|
||||
Construct a kernel with independent outputs from an existing kernel
|
||||
"""
|
||||
for sl in k.input_slices:
|
||||
assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
|
||||
parts = [independent_output_part(p) for p in k.parts]
|
||||
return kern(k.input_dim+1,parts)
|
||||
_parts = [parts.independent_outputs.IndependentOutputs(p) for p in k.parts]
|
||||
return kern(k.input_dim+1,_parts)
|
||||
|
||||
def hierarchical(k):
|
||||
"""
|
||||
TODO This can't be right! Construct a kernel with independent outputs from an existing kernel
|
||||
"""
|
||||
# for sl in k.input_slices:
|
||||
# assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
|
||||
_parts = [parts.hierarchical.Hierarchical(k.parts)]
|
||||
return kern(k.input_dim+len(k.parts),_parts)
|
||||
|
||||
def build_lcm(input_dim, output_dim, kernel_list = [], rank=1,W=None,kappa=None):
|
||||
"""
|
||||
Builds a kernel of a linear coregionalization model
|
||||
|
||||
:param input_dim: Input dimensionality
|
||||
:param output_dim: Number of outputs
|
||||
:param kernel_list: List of kernels to coregionalize; each element of the list is multiplied by a different coregionalization matrix
|
||||
:type kernel_list: list of GPy kernels
|
||||
:param rank: rank of the coregionalization matrices, i.e. the number of columns of each 'coregion_W' matrix
|
||||
:type rank: integer
|
||||
|
||||
.. note:: each kernel's input dimensionality is overwritten to fit input_dim
|
||||
|
||||
"""
|
||||
|
||||
for k in kernel_list:
|
||||
if k.input_dim <> input_dim:
|
||||
k.input_dim = input_dim
|
||||
warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
|
||||
|
||||
k_coreg = coregionalize(output_dim,rank,W,kappa)
|
||||
kernel = kernel_list[0]**k_coreg.copy()
|
||||
|
||||
for k in kernel_list[1:]:
|
||||
k_coreg = coregionalize(output_dim,rank,W,kappa)
|
||||
kernel += k**k_coreg.copy()
|
||||
|
||||
return kernel
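A hedged usage sketch of build_lcm, mirroring its own body above (each kernel in kernel_list is tensor-multiplied, via **, with a fresh coregionalize kernel and the results are summed); the base kernels are arbitrary examples and the name GPy.kern.build_lcm assumes this constructor is exposed alongside the others in this file:

    import GPy

    k_list = [GPy.kern.rbf(1), GPy.kern.bias(1)]        # example base kernels
    k_lcm = GPy.kern.build_lcm(input_dim=1, output_dim=2,
                               kernel_list=k_list, rank=1)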
|
||||
|
||||
def ODE_1(input_dim=1, varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
|
||||
"""
|
||||
kernel resulting from a first order ODE with an OU driving GP
|
||||
|
||||
:param input_dim: the number of input dimensions; has to be equal to one
|
||||
:type input_dim: int
|
||||
:param varianceU: variance of the driving GP
|
||||
:type varianceU: float
|
||||
:param lengthscaleU: lengthscale of the driving GP
|
||||
:type lengthscaleU: float
|
||||
:param varianceY: 'variance' of the transfer function
|
||||
:type varianceY: float
|
||||
:param lengthscaleY: 'lengthscale' of the transfer function
|
||||
:type lengthscaleY: float
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
part = parts.ODE_1.ODE_1(input_dim, varianceU, varianceY, lengthscaleU, lengthscaleY)
|
||||
return kern(input_dim, [part])
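A minimal, hedged example of the ODE_1 constructor just defined; the parameter values are arbitrary and GPy.kern.ODE_1 assumes the constructor is exposed like the others in this file:

    import numpy as np
    import GPy

    k = GPy.kern.ODE_1(input_dim=1, varianceU=1., varianceY=1.,
                       lengthscaleU=1., lengthscaleY=1.)
    X = np.linspace(0, 5, 20)[:, None]   # (num_data, 1) design matrix
    K = k.K(X)                           # 20 x 20 covariance matrix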
|
||||
|
||||
def ODE_UY(input_dim=2, varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
|
||||
"""
|
||||
kernel resulting from a first order ODE with an OU driving GP
|
||||
:param input_dim: the number of input dimensions; has to be equal to two
|
||||
:type input_dim: int
|
||||
:param input_lengthU: the number of input U length
|
||||
:param varianceU: variance of the driving GP
|
||||
:type varianceU: float
|
||||
:param varianceY: 'variance' of the transfer function
|
||||
:type varianceY: float
|
||||
:param lengthscaleY: 'lengthscale' of the transfer function
|
||||
:type lengthscaleY: float
|
||||
:rtype: kernel object
|
||||
"""
|
||||
part = parts.ODE_UY.ODE_UY(input_dim, varianceU, varianceY, lengthscaleU, lengthscaleY)
|
||||
return kern(input_dim, [part])
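ODE_UY covers two interacting processes, the driving GP U and the response Y. As the part's K() further down in this diff shows, the first column of X carries the input value and the last column an integer index (0 for U rows, 1 for Y rows). A hedged sketch, again assuming the constructor is exposed as GPy.kern.ODE_UY:

    import numpy as np
    import GPy

    k = GPy.kern.ODE_UY(input_dim=2)

    t = np.linspace(0, 5, 10)[:, None]
    X_u = np.hstack([t, np.zeros_like(t)])   # rows belonging to the driving process U
    X_y = np.hstack([t, np.ones_like(t)])    # rows belonging to the response Y
    X = np.vstack([X_u, X_y])                # (20, 2): value column + output index column

    K = k.K(X)                               # joint covariance over the U and Y rows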
|
||||
|
|
|
|||
671
GPy/kern/kern.py
|
|
@ -1,20 +1,26 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from ..core.parameterised import Parameterised
|
||||
from kernpart import Kernpart
|
||||
from ..core.parameterized import Parameterized
|
||||
from parts.kernpart import Kernpart
|
||||
import itertools
|
||||
from prod import prod
|
||||
from parts.prod import Prod as prod
|
||||
from matplotlib.transforms import offset_copy
|
||||
|
||||
class kern(Parameterised):
|
||||
class kern(Parameterized):
|
||||
def __init__(self, input_dim, parts=[], input_slices=None):
|
||||
"""
|
||||
This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of variaous things like which parameters live where.
|
||||
This is the main kernel class for GPy. It handles multiple
|
||||
(additive) kernel functions, and keeps track of various things
|
||||
like which parameters live where.
|
||||
|
||||
The technical code for kernels is divided into _parts_ (see e.g. rbf.py). This obnject contains a list of parts, which are computed additively. For multiplication, special _prod_ parts are used.
|
||||
The technical code for kernels is divided into _parts_ (see
|
||||
e.g. rbf.py). This object contains a list of parts, which are
|
||||
computed additively. For multiplication, special _prod_ parts
|
||||
are used.
|
||||
|
||||
:param input_dim: The dimensionality of the kernel's input space
|
||||
:type input_dim: int
|
||||
|
|
@ -25,11 +31,16 @@ class kern(Parameterised):
|
|||
|
||||
"""
|
||||
self.parts = parts
|
||||
self.Nparts = len(parts)
|
||||
self.num_parts = len(parts)
|
||||
self.num_params = sum([p.num_params for p in self.parts])
|
||||
|
||||
self.input_dim = input_dim
|
||||
|
||||
part_names = [k.name for k in self.parts]
|
||||
self.name=''
|
||||
for name in part_names:
|
||||
self.name += name + '+'
|
||||
self.name = self.name[:-1]
|
||||
# deal with input_slices
|
||||
if input_slices is None:
|
||||
self.input_slices = [slice(None) for p in self.parts]
|
||||
|
|
@ -42,29 +53,111 @@ class kern(Parameterised):
|
|||
|
||||
self.compute_param_slices()
|
||||
|
||||
Parameterised.__init__(self)
|
||||
Parameterized.__init__(self)
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Get the current state of the class,
|
||||
here just all the indices, rest can get recomputed
|
||||
"""
|
||||
return Parameterized.getstate(self) + [self.parts,
|
||||
self.num_parts,
|
||||
self.num_params,
|
||||
self.input_dim,
|
||||
self.input_slices,
|
||||
self.param_slices
|
||||
]
|
||||
|
||||
def setstate(self, state):
|
||||
self.param_slices = state.pop()
|
||||
self.input_slices = state.pop()
|
||||
self.input_dim = state.pop()
|
||||
self.num_params = state.pop()
|
||||
self.num_parts = state.pop()
|
||||
self.parts = state.pop()
|
||||
Parameterized.setstate(self, state)
|
||||
|
||||
|
||||
def plot_ARD(self, fignum=None, ax=None):
|
||||
"""If an ARD kernel is present, it bar-plots the ARD parameters"""
|
||||
def plot_ARD(self, fignum=None, ax=None, title='', legend=False):
|
||||
"""If an ARD kernel is present, plot a bar representation using matplotlib
|
||||
|
||||
:param fignum: figure number of the plot
|
||||
:param ax: matplotlib axis to plot on
|
||||
:param title:
|
||||
title of the plot,
|
||||
pass '' to not print a title
|
||||
pass None for a generic title
|
||||
"""
|
||||
if ax is None:
|
||||
fig = pb.figure(fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
else:
|
||||
fig = ax.figure
|
||||
from GPy.util import Tango
|
||||
from matplotlib.textpath import TextPath
|
||||
Tango.reset()
|
||||
xticklabels = []
|
||||
bars = []
|
||||
x0 = 0
|
||||
for p in self.parts:
|
||||
c = Tango.nextMedium()
|
||||
if hasattr(p, 'ARD') and p.ARD:
|
||||
ax.set_title('ARD parameters, %s kernel' % p.name)
|
||||
|
||||
if title is None:
|
||||
ax.set_title('ARD parameters, %s kernel' % p.name)
|
||||
else:
|
||||
ax.set_title(title)
|
||||
if p.name == 'linear':
|
||||
ard_params = p.variances
|
||||
else:
|
||||
ard_params = 1. / p.lengthscale
|
||||
|
||||
ax.bar(np.arange(len(ard_params)) - 0.4, ard_params)
|
||||
ax.set_xticks(np.arange(len(ard_params)))
|
||||
ax.set_xticklabels([r"${}$".format(i) for i in range(len(ard_params))])
|
||||
x = np.arange(x0, x0 + len(ard_params))
|
||||
bars.append(ax.bar(x, ard_params, align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name))
|
||||
xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))])
|
||||
x0 += len(ard_params)
|
||||
x = np.arange(x0)
|
||||
transOffset = offset_copy(ax.transData, fig=fig,
|
||||
x=0., y= -2., units='points')
|
||||
transOffsetUp = offset_copy(ax.transData, fig=fig,
|
||||
x=0., y=1., units='points')
|
||||
for bar in bars:
|
||||
for patch, num in zip(bar.patches, np.arange(len(bar.patches))):
|
||||
height = patch.get_height()
|
||||
xi = patch.get_x() + patch.get_width() / 2.
|
||||
va = 'top'
|
||||
c = 'w'
|
||||
t = TextPath((0, 0), "${xi}$".format(xi=xi), rotation=0, usetex=True, ha='center')
|
||||
transform = transOffset
|
||||
if patch.get_extents().height <= t.get_extents().height + 3:
|
||||
va = 'bottom'
|
||||
c = 'k'
|
||||
transform = transOffsetUp
|
||||
ax.text(xi, height, "${xi}$".format(xi=int(num)), color=c, rotation=0, ha='center', va=va, transform=transform)
|
||||
# for xi, t in zip(x, xticklabels):
|
||||
# ax.text(xi, maxi / 2, t, rotation=90, ha='center', va='center')
|
||||
# ax.set_xticklabels(xticklabels, rotation=17)
|
||||
ax.set_xticks([])
|
||||
ax.set_xlim(-.5, x0 - .5)
|
||||
if legend:
|
||||
if title is '':
|
||||
mode = 'expand'
|
||||
if len(bars) > 1:
|
||||
mode = 'expand'
|
||||
ax.legend(bbox_to_anchor=(0., 1.02, 1., 1.02), loc=3,
|
||||
ncol=len(bars), mode=mode, borderaxespad=0.)
|
||||
fig.tight_layout(rect=(0, 0, 1, .9))
|
||||
else:
|
||||
ax.legend()
|
||||
return ax
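A small usage sketch for the plot_ARD defined above; the kernel is only an example and needs at least one part constructed with ARD=True for bars to be drawn:

    import GPy

    k = GPy.kern.rbf(3, ARD=True) + GPy.kern.linear(3, ARD=True)
    ax = k.plot_ARD(title=None, legend=True)   # title=None gives the generic per-part title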
|
||||
|
||||
def _transform_gradients(self, g):
|
||||
"""
|
||||
Apply the transformations of the kernel so that the returned vector
|
||||
represents the gradient in the transformed space (i.e. that given by
|
||||
get_params_transformed())
|
||||
|
||||
:param g: the gradient vector for the current model, usually created by dK_dtheta
|
||||
"""
|
||||
x = self._get_params()
|
||||
[np.put(x, i, x * t.gradfactor(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
|
||||
[np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
|
||||
|
|
@ -75,7 +168,9 @@ class kern(Parameterised):
|
|||
return g
|
||||
|
||||
def compute_param_slices(self):
|
||||
"""create a set of slices that can index the parameters of each part"""
|
||||
"""
|
||||
Create a set of slices that can index the parameters of each part.
|
||||
"""
|
||||
self.param_slices = []
|
||||
count = 0
|
||||
for p in self.parts:
|
||||
|
|
@ -83,16 +178,23 @@ class kern(Parameterised):
|
|||
count += p.num_params
|
||||
|
||||
def __add__(self, other):
|
||||
"""
|
||||
Shortcut for `add`.
|
||||
"""
|
||||
""" Overloading of the '+' operator. for more control, see self.add """
|
||||
return self.add(other)
|
||||
|
||||
def add(self, other, tensor=False):
|
||||
"""
|
||||
Add another kernel to this one. Both kernels are defined on the same _space_
|
||||
Add another kernel to this one.
|
||||
|
||||
If Tensor is False, both kernels are defined on the same _space_. then
|
||||
the created kernel will have the same number of inputs as self and
|
||||
other (which must be the same).
|
||||
|
||||
If Tensor is True, then the dimensions are stacked 'horizontally', so
|
||||
that the resulting kernel has self.input_dim + other.input_dim
|
||||
|
||||
:param other: the other kernel to be added
|
||||
:type other: GPy.kern
|
||||
|
||||
"""
|
||||
if tensor:
|
||||
D = self.input_dim + other.input_dim
|
||||
|
|
@ -121,16 +223,24 @@ class kern(Parameterised):
|
|||
return newkern
|
||||
|
||||
def __mul__(self, other):
|
||||
"""
|
||||
Shortcut for `prod`.
|
||||
"""
|
||||
""" Here we overload the '*' operator. See self.prod for more information"""
|
||||
return self.prod(other)
|
||||
|
||||
def __pow__(self, other, tensor=False):
|
||||
"""
|
||||
Shortcut for tensor `prod`.
|
||||
"""
|
||||
return self.prod(other, tensor=True)
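Putting the three operators together (a brief sketch; the kernel choices are arbitrary):

    import GPy

    k1 = GPy.kern.rbf(2)
    k2 = GPy.kern.linear(2)

    k_sum   = k1 + k2                    # add on the same 2-D input space
    k_prod  = k1 * k2                    # prod on the same input space
    k_tens  = k1 ** k2                   # tensor prod: input_dim becomes 2 + 2 = 4
    k_stack = k1.add(k2, tensor=True)    # tensor add, also 4 input dimensions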
|
||||
|
||||
def prod(self, other, tensor=False):
|
||||
"""
|
||||
multiply two kernels (either on the same space, or on the tensor product of the input space)
|
||||
Multiply two kernels (either on the same space, or on the tensor product of the input space).
|
||||
|
||||
:param other: the other kernel to be added
|
||||
:type other: GPy.kern
|
||||
:param tensor: whether or not to use the tensor space (default is false).
|
||||
:type tensor: bool
|
||||
|
||||
"""
|
||||
K1 = self.copy()
|
||||
K2 = other.copy()
|
||||
|
|
@ -199,7 +309,7 @@ class kern(Parameterised):
|
|||
[p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
|
||||
|
||||
def _get_param_names(self):
|
||||
# this is a bit nasty: we wat to distinguish between parts with the same name by appending a count
|
||||
# this is a bit nasty: we want to distinguish between parts with the same name by appending a count
|
||||
part_names = np.array([k.name for k in self.parts], dtype=np.str)
|
||||
counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
|
||||
cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
|
||||
|
|
@ -208,8 +318,19 @@ class kern(Parameterised):
|
|||
return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self.parts)], [])
|
||||
|
||||
def K(self, X, X2=None, which_parts='all'):
|
||||
"""
|
||||
Compute the kernel function.
|
||||
|
||||
:param X: the first set of inputs to the kernel
|
||||
:param X2: (optional) the second set of arguments to the kernel. If X2
|
||||
is None, this is passed through to the 'part' object, which
|
||||
handles this as X2 == X.
|
||||
:param which_parts: a list of booleans detailing whether to include
|
||||
each of the part functions. By default, 'all'
|
||||
indicates [True]*self.num_parts
|
||||
"""
|
||||
if which_parts == 'all':
|
||||
which_parts = [True] * self.Nparts
|
||||
which_parts = [True] * self.num_parts
|
||||
assert X.shape[1] == self.input_dim
|
||||
if X2 is None:
|
||||
target = np.zeros((X.shape[0], X.shape[0]))
|
||||
|
|
@ -221,12 +342,16 @@ class kern(Parameterised):
|
|||
|
||||
def dK_dtheta(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
:param dL_dK: An array of dL_dK derivaties, dL_dK
|
||||
:type dL_dK: Np.ndarray (N x num_inducing)
|
||||
Compute the gradient of the covariance function with respect to the parameters.
|
||||
|
||||
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
|
||||
:type dL_dK: np.ndarray (num_samples x num_inducing)
|
||||
:param X: Observed data inputs
|
||||
:type X: np.ndarray (N x input_dim)
|
||||
:param X2: Observed dara inputs (optional, defaults to X)
|
||||
:type X: np.ndarray (num_samples x input_dim)
|
||||
:param X2: Observed data inputs (optional, defaults to X)
|
||||
:type X2: np.ndarray (num_inducing x input_dim)
|
||||
|
||||
returns: dL_dtheta
|
||||
"""
|
||||
assert X.shape[1] == self.input_dim
|
||||
target = np.zeros(self.num_params)
|
||||
|
|
@ -238,8 +363,15 @@ class kern(Parameterised):
|
|||
return self._transform_gradients(target)
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2=None):
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
"""Compute the gradient of the objective function with respect to X.
|
||||
|
||||
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
|
||||
:type dL_dK: np.ndarray (num_samples x num_inducing)
|
||||
:param X: Observed data inputs
|
||||
:type X: np.ndarray (num_samples x input_dim)
|
||||
:param X2: Observed data inputs (optional, defaults to X)
|
||||
:type X2: np.ndarray (num_inducing x input_dim)"""
|
||||
|
||||
target = np.zeros_like(X)
|
||||
if X2 is None:
|
||||
[p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
|
||||
|
|
@ -248,14 +380,16 @@ class kern(Parameterised):
|
|||
return target
|
||||
|
||||
def Kdiag(self, X, which_parts='all'):
|
||||
"""Compute the diagonal of the covariance function for inputs X."""
|
||||
if which_parts == 'all':
|
||||
which_parts = [True] * self.Nparts
|
||||
which_parts = [True] * self.num_parts
|
||||
assert X.shape[1] == self.input_dim
|
||||
target = np.zeros(X.shape[0])
|
||||
[p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on]
|
||||
return target
|
||||
|
||||
def dKdiag_dtheta(self, dL_dKdiag, X):
|
||||
"""Compute the gradient of the diagonal of the covariance function with respect to the parameters."""
|
||||
assert X.shape[1] == self.input_dim
|
||||
assert dL_dKdiag.size == X.shape[0]
|
||||
target = np.zeros(self.num_params)
|
||||
|
|
@ -278,6 +412,9 @@ class kern(Parameterised):
|
|||
[p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
|
||||
return self._transform_gradients(target)
|
||||
|
||||
def dpsi0_dZ(self, dL_dpsi0, Z, mu, S):
|
||||
return np.zeros_like(Z)
|
||||
|
||||
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S):
|
||||
target_mu, target_S = np.zeros_like(mu), np.zeros_like(S)
|
||||
[p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
|
||||
|
|
@ -299,90 +436,253 @@ class kern(Parameterised):
|
|||
return target
|
||||
|
||||
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
|
||||
"""return shapes are N,num_inducing,input_dim"""
|
||||
"""return shapes are num_samples,num_inducing,input_dim"""
|
||||
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
|
||||
[p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
|
||||
return target_mu, target_S
|
||||
|
||||
def psi2(self, Z, mu, S):
|
||||
"""
|
||||
:param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
|
||||
:param mu, S: np.ndarrays of means and variances (each N x input_dim)
|
||||
:returns psi2: np.ndarray (N,num_inducing,num_inducing)
|
||||
:param Z: np.ndarray of inducing inputs (M x Q)
|
||||
:param mu, S: np.ndarrays of means and variances (each N x Q)
|
||||
:returns psi2: np.ndarray (N,M,M)
|
||||
"""
|
||||
target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
|
||||
[p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
|
||||
|
||||
# compute the "cross" terms
|
||||
# TODO: input_slices needed
|
||||
crossterms = 0
|
||||
from parts.white import White
|
||||
from parts.rbf import RBF
|
||||
from parts.rbf_inv import RBFInv
|
||||
from parts.bias import Bias
|
||||
from parts.linear import Linear
|
||||
from parts.fixed import Fixed
|
||||
|
||||
for p1, p2 in itertools.combinations(self.parts, 2):
|
||||
for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self.parts, self.input_slices), 2):
|
||||
# white doesn't combine with anything
|
||||
if isinstance(p1, White) or isinstance(p2, White):
|
||||
pass
|
||||
# rbf X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
|
||||
target += p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
|
||||
target += p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
|
||||
# linear X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (Linear, RBF, RBFInv)):
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p2.psi1(Z, mu, S, tmp)
|
||||
target += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (Linear, RBF, RBFInv)):
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p1.psi1(Z, mu, S, tmp)
|
||||
target += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
|
||||
# rbf X any
|
||||
elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
|
||||
if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
|
||||
p1t = p1; p1 = p2; p2 = p1t; del p1t
|
||||
N, M = mu.shape[0], Z.shape[0]; NM=N*M
|
||||
psi11 = np.zeros((N, M))
|
||||
psi12 = np.zeros((NM, M))
|
||||
p1.psi1(Z, mu, S, psi11)
|
||||
Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
|
||||
Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
|
||||
|
||||
# TODO psi1 this must be faster/better/precached/more nice
|
||||
tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p1.psi1(Z, mu, S, tmp1)
|
||||
tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p2.psi1(Z, mu, S, tmp2)
|
||||
|
||||
prod = np.multiply(tmp1, tmp2)
|
||||
crossterms += prod[:, :, None] + prod[:, None, :]
|
||||
|
||||
target += crossterms
|
||||
p2.psi1(Z, Mu, Sigma, psi12)
|
||||
eK2 = psi12.reshape(N, M, M)
|
||||
crossterms = eK2 * (psi11[:, :, None] + psi11[:, None, :])
|
||||
target += crossterms
|
||||
else:
|
||||
raise NotImplementedError, "psi2 cannot be computed for this kernel"
|
||||
return target
|
||||
|
||||
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
|
||||
target = np.zeros(self.num_params)
|
||||
[p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
|
||||
|
||||
from parts.white import White
|
||||
from parts.rbf import RBF
|
||||
from parts.rbf_inv import RBFInv
|
||||
from parts.bias import Bias
|
||||
from parts.linear import Linear
|
||||
from parts.fixed import Fixed
|
||||
|
||||
# compute the "cross" terms
|
||||
# TODO: better looping, input_slices
|
||||
for i1, i2 in itertools.permutations(range(len(self.parts)), 2):
|
||||
for i1, i2 in itertools.combinations(range(len(self.parts)), 2):
|
||||
p1, p2 = self.parts[i1], self.parts[i2]
|
||||
# ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
|
||||
#ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
|
||||
ps1, ps2 = self.param_slices[i1], self.param_slices[i2]
|
||||
if isinstance(p1, White) or isinstance(p2, White):
|
||||
pass
|
||||
# rbf X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
|
||||
p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2])
|
||||
p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2._psi1 * 2., Z, mu, S, target[ps1])
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
|
||||
p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
|
||||
p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1._psi1 * 2., Z, mu, S, target[ps2])
|
||||
# linear X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
|
||||
p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2]) # [ps1])
|
||||
psi1 = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p2.psi1(Z, mu, S, psi1)
|
||||
p1.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps1])
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
|
||||
p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
|
||||
psi1 = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p1.psi1(Z, mu, S, psi1)
|
||||
p2.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps2])
|
||||
# rbf X any
|
||||
elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
|
||||
if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
|
||||
# turn around to have rbf in front
|
||||
p1, p2 = self.parts[i2], self.parts[i1]
|
||||
ps1, ps2 = self.param_slices[i2], self.param_slices[i1]
|
||||
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p1.psi1(Z, mu, S, tmp)
|
||||
p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2])
|
||||
N, M = mu.shape[0], Z.shape[0]; NM=N*M
|
||||
|
||||
psi11 = np.zeros((N, M))
|
||||
p1.psi1(Z, mu, S, psi11)
|
||||
|
||||
Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
|
||||
Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
|
||||
|
||||
tmp1 = np.zeros_like(target[ps1])
|
||||
tmp2 = np.zeros_like(target[ps2])
|
||||
# for n in range(N):
|
||||
# for m in range(M):
|
||||
# for m_prime in range(M):
|
||||
# p1.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*psi12_t.reshape(N,M,M)[n:n+1,m:m+1,m_prime:m_prime+1])[0], Z[m:m+1], mu[n:n+1], S[n:n+1], tmp2)#Z[m_prime:m_prime+1], mu[n:n+1], S[n:n+1], tmp2)
|
||||
# p1.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*psi12_t.reshape(N,M,M)[n:n+1,m_prime:m_prime+1,m:m+1])[0], Z[m_prime:m_prime+1], mu[n:n+1], S[n:n+1], tmp2)
|
||||
# Mu, Sigma= Mu.reshape(N,M,self.input_dim), Sigma.reshape(N,M,self.input_dim)
|
||||
# p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m_prime:m_prime+1]))[0], Z[m:m+1], Mu[n:n+1,m], Sigma[n:n+1,m], target[ps2])
|
||||
# p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m:m+1]))[0], Z[m_prime:m_prime+1], Mu[n:n+1, m_prime], Sigma[n:n+1, m_prime], target[ps2])#Z[m_prime:m_prime+1], Mu[n+m:(n+m)+1], Sigma[n+m:(n+m)+1], target[ps2])
|
||||
|
||||
if isinstance(p1, RBF) and isinstance(p2, RBF):
|
||||
psi12 = np.zeros((N, M))
|
||||
p2.psi1(Z, mu, S, psi12)
|
||||
Mu2, Sigma2 = p2._crossterm_mu_S(Z, mu, S)
|
||||
Mu2, Sigma2 = Mu2.reshape(NM,self.input_dim), Sigma2.reshape(NM,self.input_dim)
|
||||
p1.dpsi1_dtheta((dL_dpsi2*(psi12[:,:,None] + psi12[:,None,:])).reshape(NM,M), Z, Mu2, Sigma2, tmp1)
|
||||
pass
|
||||
|
||||
if isinstance(p1, RBF) and isinstance(p2, Linear):
|
||||
#import ipdb;ipdb.set_trace()
|
||||
pass
|
||||
|
||||
p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, tmp2)
|
||||
|
||||
target[ps1] += tmp1
|
||||
target[ps2] += tmp2
|
||||
else:
|
||||
raise NotImplementedError, "psi2 cannot be computed for this kernel"
|
||||
|
||||
return self._transform_gradients(target)
|
||||
|
||||
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S):
|
||||
target = np.zeros_like(Z)
|
||||
[p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
|
||||
# target *= 2
|
||||
|
||||
from parts.white import White
|
||||
from parts.rbf import RBF
|
||||
from parts.rbf_inv import RBFInv
|
||||
from parts.bias import Bias
|
||||
from parts.linear import Linear
|
||||
from parts.fixed import Fixed
|
||||
|
||||
# compute the "cross" terms
|
||||
# TODO: we need input_slices here.
|
||||
for p1, p2 in itertools.permutations(self.parts, 2):
|
||||
if p1.name == 'linear' and p2.name == 'linear':
|
||||
raise NotImplementedError("We don't handle linear/linear cross-terms")
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p1.psi1(Z, mu, S, tmp)
|
||||
p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target)
|
||||
# TODO: better looping, input_slices
|
||||
for p1, p2 in itertools.combinations(self.parts, 2):
|
||||
if isinstance(p1, White) or isinstance(p2, White):
|
||||
pass
|
||||
# rbf X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
|
||||
p2.dpsi1_dZ(dL_dpsi2.sum(1) * p1.variance, Z, mu, S, target)
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
|
||||
p1.dpsi1_dZ(dL_dpsi2.sum(1) * p2.variance, Z, mu, S, target)
|
||||
# linear X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
|
||||
p2.dpsi1_dZ(dL_dpsi2.sum(1) * p1.variance, Z, mu, S, target)
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
|
||||
p1.dpsi1_dZ(dL_dpsi2.sum(1) * p2.variance, Z, mu, S, target)
|
||||
# rbf X any
|
||||
elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
|
||||
if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
|
||||
p1t = p1; p1 = p2; p2 = p1t; del p1t
|
||||
N, M = mu.shape[0], Z.shape[0]; NM=N*M
|
||||
psi11 = np.zeros((N, M))
|
||||
psi12 = np.zeros((NM, M))
|
||||
#psi12_t = np.zeros((N,M))
|
||||
|
||||
p1.psi1(Z, mu, S, psi11)
|
||||
Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
|
||||
Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
|
||||
|
||||
p2.psi1(Z, Mu, Sigma, psi12)
|
||||
tmp1 = np.zeros_like(target)
|
||||
p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, tmp1)
|
||||
p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, tmp1)
|
||||
target += tmp1
|
||||
|
||||
#p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
|
||||
p2.dpsi1_dZ((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
|
||||
else:
|
||||
raise NotImplementedError, "psi2 cannot be computed for this kernel"
|
||||
return target * 2
|
||||
|
||||
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S):
|
||||
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
|
||||
[p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
|
||||
|
||||
from parts.white import White
|
||||
from parts.rbf import RBF
|
||||
from parts.rbf_inv import RBFInv
|
||||
from parts.bias import Bias
|
||||
from parts.linear import Linear
|
||||
from parts.fixed import Fixed
|
||||
|
||||
# compute the "cross" terms
|
||||
# TODO: we need input_slices here.
|
||||
for p1, p2 in itertools.permutations(self.parts, 2):
|
||||
if p1.name == 'linear' and p2.name == 'linear':
|
||||
raise NotImplementedError("We don't handle linear/linear cross-terms")
|
||||
# TODO: better looping, input_slices
|
||||
for p1, p2 in itertools.combinations(self.parts, 2):
|
||||
if isinstance(p1, White) or isinstance(p2, White):
|
||||
pass
|
||||
# rbf X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
|
||||
p2.dpsi1_dmuS(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target_mu, target_S)
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
|
||||
p1.dpsi1_dmuS(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target_mu, target_S)
|
||||
# linear X bias
|
||||
elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
|
||||
p2.dpsi1_dmuS(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target_mu, target_S)
|
||||
elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
|
||||
p1.dpsi1_dmuS(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target_mu, target_S)
|
||||
# rbf X any
|
||||
elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
|
||||
if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
|
||||
p1t = p1; p1 = p2; p2 = p1t; del p1t
|
||||
N, M = mu.shape[0], Z.shape[0]; NM=N*M
|
||||
psi11 = np.zeros((N, M))
|
||||
psi12 = np.zeros((NM, M))
|
||||
#psi12_t = np.zeros((N,M))
|
||||
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
p1.psi1(Z, mu, S, tmp)
|
||||
p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S)
|
||||
p1.psi1(Z, mu, S, psi11)
|
||||
Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
|
||||
Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
|
||||
|
||||
p2.psi1(Z, Mu, Sigma, psi12)
|
||||
p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, target_mu, target_S)
|
||||
p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, target_mu, target_S)
|
||||
|
||||
#p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
|
||||
p2.dpsi1_dmuS((dL_dpsi2*(psi11[:,:,None])).sum(1)*2, Z, Mu.reshape(N,M,self.input_dim).sum(1), Sigma.reshape(N,M,self.input_dim).sum(1), target_mu, target_S)
|
||||
else:
|
||||
raise NotImplementedError, "psi2 cannot be computed for this kernel"
|
||||
return target_mu, target_S
|
||||
|
||||
def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
|
||||
if which_parts == 'all':
|
||||
which_parts = [True] * self.Nparts
|
||||
which_parts = [True] * self.num_parts
|
||||
if self.input_dim == 1:
|
||||
if x is None:
|
||||
x = np.zeros((1, 1))
|
||||
|
|
@ -435,3 +735,232 @@ class kern(Parameterised):
|
|||
pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))
|
||||
else:
|
||||
raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
|
||||
|
||||
from ..core.model import Model
|
||||
class Kern_check_model(Model):
|
||||
"""This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
|
||||
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||
num_samples = 20
|
||||
num_samples2 = 10
|
||||
if kernel==None:
|
||||
import GPy
|
||||
kernel = GPy.kern.rbf(1)
|
||||
del GPy
|
||||
if X==None:
|
||||
X = np.random.normal(size=(num_samples, kernel.input_dim))
|
||||
if dL_dK==None:
|
||||
if X2==None:
|
||||
dL_dK = np.ones((X.shape[0], X.shape[0]))
|
||||
else:
|
||||
dL_dK = np.ones((X.shape[0], X2.shape[0]))
|
||||
|
||||
self.kernel=kernel
|
||||
self.X = X
|
||||
self.X2 = X2
|
||||
self.dL_dK = dL_dK
|
||||
#self.constrained_indices=[]
|
||||
#self.constraints=[]
|
||||
super(Kern_check_model, self).__init__()
|
||||
|
||||
def is_positive_definite(self):
|
||||
v = np.linalg.eig(self.kernel.K(self.X))[0]
|
||||
if any(v<-10*sys.float_info.epsilon):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _get_params(self):
|
||||
return self.kernel._get_params()
|
||||
|
||||
def _get_param_names(self):
|
||||
return self.kernel._get_param_names()
|
||||
|
||||
def _set_params(self, x):
|
||||
self.kernel._set_params(x)
|
||||
|
||||
def log_likelihood(self):
|
||||
return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum()
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
|
||||
|
||||
class Kern_check_dK_dtheta(Kern_check_model):
|
||||
"""This class allows gradient checks for the gradient of a kernel with respect to parameters. """
|
||||
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2)
|
||||
|
||||
class Kern_check_dKdiag_dtheta(Kern_check_model):
|
||||
"""This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters."""
|
||||
def __init__(self, kernel=None, dL_dK=None, X=None):
|
||||
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
|
||||
if dL_dK==None:
|
||||
self.dL_dK = np.ones((self.X.shape[0]))
|
||||
|
||||
def log_likelihood(self):
|
||||
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
return self.kernel.dKdiag_dtheta(self.dL_dK, self.X)
|
||||
|
||||
class Kern_check_dK_dX(Kern_check_model):
|
||||
"""This class allows gradient checks for the gradient of a kernel with respect to X. """
|
||||
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
return self.kernel.dK_dX(self.dL_dK, self.X, self.X2).flatten()
|
||||
|
||||
def _get_param_names(self):
|
||||
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
|
||||
|
||||
def _get_params(self):
|
||||
return self.X.flatten()
|
||||
|
||||
def _set_params(self, x):
|
||||
self.X=x.reshape(self.X.shape)
|
||||
|
||||
class Kern_check_dKdiag_dX(Kern_check_model):
|
||||
"""This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """
|
||||
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
|
||||
if dL_dK==None:
|
||||
self.dL_dK = np.ones((self.X.shape[0]))
|
||||
|
||||
def log_likelihood(self):
|
||||
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten()
|
||||
|
||||
def _get_param_names(self):
|
||||
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
|
||||
|
||||
def _get_params(self):
|
||||
return self.X.flatten()
|
||||
|
||||
def _set_params(self, x):
|
||||
self.X=x.reshape(self.X.shape)
|
||||
|
||||
def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=False):
|
||||
"""This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set.
|
||||
|
||||
:param kern: the kernel to be tested.
|
||||
:type kern: GPy.kern.Kernpart
|
||||
:param X: X input values to test the covariance function.
|
||||
:type X: ndarray
|
||||
:param X2: X2 input values to test the covariance function.
|
||||
:type X2: ndarray
|
||||
|
||||
"""
|
||||
pass_checks = True
|
||||
if X==None:
|
||||
X = np.random.randn(10, kern.input_dim)
|
||||
if X_positive:
|
||||
X = abs(X)
|
||||
if output_ind is not None:
|
||||
assert(output_ind<kern.input_dim)
|
||||
X[:, output_ind] = np.random.randint(low=0,high=kern.parts[0].output_dim, size=X.shape[0])
|
||||
if X2==None:
|
||||
X2 = np.random.randn(20, kern.input_dim)
|
||||
if X_positive:
|
||||
X2 = abs(X2)
|
||||
if output_ind is not None:
|
||||
assert(output_ind<kern.input_dim)
|
||||
X2[:, output_ind] = np.random.randint(low=0, high=kern.parts[0].output_dim, size=X2.shape[0])
|
||||
|
||||
if verbose:
|
||||
print("Checking covariance function is positive definite.")
|
||||
result = Kern_check_model(kern, X=X).is_positive_definite()
|
||||
if result and verbose:
|
||||
print("Check passed.")
|
||||
if not result:
|
||||
print("Positive definite check failed for " + kern.name + " covariance function.")
|
||||
pass_checks = False
|
||||
return False
|
||||
|
||||
if verbose:
|
||||
print("Checking gradients of K(X, X) wrt theta.")
|
||||
result = Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose)
|
||||
if result and verbose:
|
||||
print("Check passed.")
|
||||
if not result:
|
||||
print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
|
||||
Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
|
||||
pass_checks = False
|
||||
return False
|
||||
|
||||
if verbose:
|
||||
print("Checking gradients of K(X, X2) wrt theta.")
|
||||
result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
|
||||
if result and verbose:
|
||||
print("Check passed.")
|
||||
if not result:
|
||||
print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
|
||||
Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
|
||||
pass_checks = False
|
||||
return False
|
||||
|
||||
if verbose:
|
||||
print("Checking gradients of Kdiag(X) wrt theta.")
|
||||
result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose)
|
||||
if result and verbose:
|
||||
print("Check passed.")
|
||||
if not result:
|
||||
print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
|
||||
Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
|
||||
pass_checks = False
|
||||
return False
|
||||
|
||||
if verbose:
|
||||
print("Checking gradients of K(X, X) wrt X.")
|
||||
try:
|
||||
result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
|
||||
except NotImplementedError:
|
||||
result=True
|
||||
if verbose:
|
||||
print("dK_dX not implemented for " + kern.name)
|
||||
if result and verbose:
|
||||
print("Check passed.")
|
||||
if not result:
|
||||
print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
|
||||
Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=True)
|
||||
pass_checks = False
|
||||
return False
|
||||
|
||||
if verbose:
|
||||
print("Checking gradients of K(X, X2) wrt X.")
|
||||
try:
|
||||
result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
|
||||
except NotImplementedError:
|
||||
result=True
|
||||
if verbose:
|
||||
print("dK_dX not implemented for " + kern.name)
|
||||
if result and verbose:
|
||||
print("Check passed.")
|
||||
if not result:
|
||||
print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
|
||||
Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=True)
|
||||
pass_checks = False
|
||||
return False
|
||||
|
||||
if verbose:
|
||||
print("Checking gradients of Kdiag(X) wrt X.")
|
||||
try:
|
||||
result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
|
||||
except NotImplementedError:
|
||||
result=True
|
||||
if verbose:
|
||||
print("dK_dX not implemented for " + kern.name)
|
||||
if result and verbose:
|
||||
print("Check passed.")
|
||||
if not result:
|
||||
print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
|
||||
Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
|
||||
pass_checks = False
|
||||
return False
|
||||
|
||||
return pass_checks
|
||||
del Model
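kern_test above can be run directly on any kernel; a short hedged example (rbf is just a stand-in, and the import path follows the file this diff modifies, GPy/kern/kern.py):

    from GPy.kern.kern import kern_test
    import GPy

    k = GPy.kern.rbf(2)
    kern_test(k, verbose=True)   # positive-definiteness check plus all the gradient checks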
|
||||
|
|
|
|||
|
|
@ -1,56 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
class Kernpart(object):
|
||||
def __init__(self,input_dim):
|
||||
"""
|
||||
The base class for a kernpart: a positive definite function which forms part of a kernel
|
||||
|
||||
:param input_dim: the number of input dimensions to the function
|
||||
:type input_dim: int
|
||||
|
||||
Do not instantiate.
|
||||
"""
|
||||
self.input_dim = input_dim
|
||||
self.num_params = 1
|
||||
self.name = 'unnamed'
|
||||
|
||||
def _get_params(self):
|
||||
raise NotImplementedError
|
||||
def _set_params(self,x):
|
||||
raise NotImplementedError
|
||||
def _get_param_names(self):
|
||||
raise NotImplementedError
|
||||
def K(self,X,X2,target):
|
||||
raise NotImplementedError
|
||||
def Kdiag(self,X,target):
|
||||
raise NotImplementedError
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
raise NotImplementedError
|
||||
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
||||
raise NotImplementedError
|
||||
def psi0(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
|
||||
raise NotImplementedError
|
||||
def psi1(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi1_dtheta(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
|
||||
raise NotImplementedError
|
||||
def psi2(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
|
||||
raise NotImplementedError
|
||||
def dK_dX(self,X,X2,target):
|
||||
raise NotImplementedError
|
||||
|
|
@ -21,7 +21,7 @@ class Brownian(Kernpart):
|
|||
def __init__(self,input_dim,variance=1.):
|
||||
self.input_dim = input_dim
|
||||
assert self.input_dim==1, "Brownian motion in 1D only"
|
||||
self.num_params = 1.
|
||||
self.num_params = 1
|
||||
self.name = 'Brownian'
|
||||
self._set_params(np.array([variance]).flatten())
|
||||
|
||||
|
|
@ -98,9 +98,13 @@ class Matern32(Kernpart):
|
|||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
"""derivative of the covariance matrix with respect to X."""
|
||||
if X2 is None: X2 = X
|
||||
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
|
||||
ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
|
||||
if X2 is None:
|
||||
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X[None, :, :]) / self.lengthscale), -1))[:, :, None]
|
||||
ddist_dX = 2*(X[:, None, :] - X[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
|
||||
|
||||
else:
|
||||
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
|
||||
ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
|
||||
dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
|
||||
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
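The extra factor of 2 introduced above for the X2 is None branch comes from the chain rule: when both arguments of the kernel are the same X, the input x_n enters the covariance matrix through row n and column n, so

.. math::
    \frac{\partial L}{\partial x_n}
    = \sum_j \frac{\partial L}{\partial K_{nj}}\,\frac{\partial k(x_n, x_j)}{\partial x_n}
    + \sum_i \frac{\partial L}{\partial K_{in}}\,\frac{\partial k(x_i, x_n)}{\partial x_n},

and for a symmetric kernel with a symmetric dL_dK (as supplied by the GP objectives) the two sums are equal, which the patch folds into the factor of 2 on ddist_dX. The same reasoning applies to the Matern52 change below.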
|
||||
|
||||
|
|
@ -98,9 +98,12 @@ class Matern52(Kernpart):
|
|||
|
||||
def dK_dX(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to X."""
|
||||
if X2 is None: X2 = X
|
||||
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
|
||||
ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
|
||||
if X2 is None:
|
||||
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X[None,:,:])/self.lengthscale),-1))[:,:,None]
|
||||
ddist_dX = 2*(X[:,None,:]-X[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
|
||||
else:
|
||||
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
|
||||
ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
|
||||
dK_dX = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
|
||||
target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
|
||||
|
||||
161
GPy/kern/parts/ODE_1.py
Normal file
|
|
@ -0,0 +1,161 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
|
||||
class ODE_1(Kernpart):
|
||||
"""
|
||||
kernel resulting from a first order ODE with an OU driving GP
|
||||
|
||||
:param input_dim: the number of input dimensions; has to be equal to one
|
||||
:type input_dim: int
|
||||
:param varianceU: variance of the driving GP
|
||||
:type varianceU: float
|
||||
:param lengthscaleU: lengthscale of the driving GP (sqrt(3)/lengthscaleU)
|
||||
:type lengthscaleU: float
|
||||
:param varianceY: 'variance' of the transfer function
|
||||
:type varianceY: float
|
||||
:param lengthscaleY: 'lengthscale' of the transfer function (1/lengthscaleY)
|
||||
:type lengthscaleY: float
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
def __init__(self, input_dim=1, varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
|
||||
assert input_dim==1, "Only defined for input_dim = 1"
|
||||
self.input_dim = input_dim
|
||||
self.num_params = 4
|
||||
self.name = 'ODE_1'
|
||||
if lengthscaleU is not None:
|
||||
lengthscaleU = np.asarray(lengthscaleU)
|
||||
assert lengthscaleU.size == 1, "lengthscaleU should be one dimensional"
|
||||
else:
|
||||
lengthscaleU = np.ones(1)
|
||||
if lengthscaleY is not None:
|
||||
lengthscaleY = np.asarray(lengthscaleY)
|
||||
assert lengthscaleY.size == 1, "lengthscaleY should be one dimensional"
|
||||
else:
|
||||
lengthscaleY = np.ones(1)
|
||||
#lengthscaleY = 0.5
|
||||
self._set_params(np.hstack((varianceU, varianceY, lengthscaleU,lengthscaleY)))
|
||||
|
||||
def _get_params(self):
|
||||
"""return the value of the parameters."""
|
||||
return np.hstack((self.varianceU,self.varianceY, self.lengthscaleU,self.lengthscaleY))
|
||||
|
||||
def _set_params(self, x):
|
||||
"""set the value of the parameters."""
|
||||
assert x.size == self.num_params
|
||||
self.varianceU = x[0]
|
||||
self.varianceY = x[1]
|
||||
self.lengthscaleU = x[2]
|
||||
self.lengthscaleY = x[3]
|
||||
|
||||
def _get_param_names(self):
|
||||
"""return parameter names."""
|
||||
return ['varianceU','varianceY', 'lengthscaleU', 'lengthscaleY']
|
||||
|
||||
|
||||
def K(self, X, X2, target):
|
||||
"""Compute the covariance matrix between X and X2."""
|
||||
if X2 is None: X2 = X
|
||||
# i1 = X[:,1]
|
||||
# i2 = X2[:,1]
|
||||
# X = X[:,0].reshape(-1,1)
|
||||
# X2 = X2[:,0].reshape(-1,1)
|
||||
dist = np.abs(X - X2.T)
|
||||
|
||||
ly=1/self.lengthscaleY
|
||||
lu=np.sqrt(3)/self.lengthscaleU
|
||||
#ly=self.lengthscaleY
|
||||
#lu=self.lengthscaleU
|
||||
|
||||
k1 = np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
|
||||
k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
|
||||
k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
|
||||
|
||||
np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
|
||||
|
||||
def Kdiag(self, X, target):
|
||||
"""Compute the diagonal of the covariance matrix associated to X."""
|
||||
ly=1/self.lengthscaleY
|
||||
lu=np.sqrt(3)/self.lengthscaleU
|
||||
#ly=self.lengthscaleY
|
||||
#lu=self.lengthscaleU
|
||||
|
||||
k1 = (2*lu+ly)/(lu+ly)**2
|
||||
k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
|
||||
k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
|
||||
|
||||
np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
|
||||
|
||||
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
if X2 is None: X2 = X
|
||||
dist = np.abs(X - X2.T)
|
||||
|
||||
ly=1/self.lengthscaleY
|
||||
lu=np.sqrt(3)/self.lengthscaleU
|
||||
#ly=self.lengthscaleY
|
||||
#lu=self.lengthscaleU
|
||||
|
||||
dk1theta1 = np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
|
||||
#c=np.sqrt(3)
|
||||
#t1=c/lu
|
||||
#t2=1/ly
|
||||
#dk1theta1=np.exp(-dist*ly)*t2*( (2*c*t2+2*t1)/(c*t2+t1)**2 -2*(2*c*t2*t1+t1**2)/(c*t2+t1)**3 )
|
||||
|
||||
dk2theta1 = 1*(
|
||||
np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
|
||||
+np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
|
||||
+np.exp(-dist*ly)*2*(ly-lu)**(-2)
|
||||
+np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
|
||||
)
|
||||
|
||||
dk3theta1 = np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
|
||||
|
||||
dktheta1 = self.varianceU*self.varianceY*(dk1theta1+dk2theta1+dk3theta1)
|
||||
|
||||
|
||||
|
||||
|
||||
dk1theta2 = np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )
|
||||
|
||||
dk2theta2 = 1*(
|
||||
np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
|
||||
+np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
|
||||
)
|
||||
|
||||
dk3theta2 = np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
|
||||
|
||||
dktheta2 = self.varianceU*self.varianceY*(dk1theta2 + dk2theta2 +dk3theta2)
|
||||
|
||||
|
||||
|
||||
k1 = np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
|
||||
k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
|
||||
k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
|
||||
dkdvar = k1+k2+k3
|
||||
|
||||
target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
|
||||
target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
|
||||
target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
|
||||
target[3] += np.sum(dktheta2*(-self.lengthscaleY**(-2)) * dL_dK)
|
||||
|
||||
|
||||
# def dKdiag_dtheta(self, dL_dKdiag, X, target):
|
||||
# """derivative of the diagonal of the covariance matrix with respect to the parameters."""
|
||||
# # NB: derivative of diagonal elements wrt lengthscale is 0
|
||||
# target[0] += np.sum(dL_dKdiag)
|
||||
|
||||
# def dK_dX(self, dL_dK, X, X2, target):
|
||||
# """derivative of the covariance matrix with respect to X."""
|
||||
# if X2 is None: X2 = X
|
||||
# dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
|
||||
# ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
|
||||
# dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
|
||||
# target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
|
||||
|
||||
# def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||
# pass
|
||||
253
GPy/kern/parts/ODE_UY.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
|
||||
def index_to_slices(index):
|
||||
"""
|
||||
take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start, stop points for each integer in the index.
|
||||
|
||||
e.g.
|
||||
>>> index = np.asarray([0,0,0,1,1,1,2,2,2])
|
||||
returns
|
||||
>>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
|
||||
|
||||
or, a more complicated example
|
||||
>>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
|
||||
returns
|
||||
>>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
|
||||
"""
|
||||
|
||||
#construct the return structure
|
||||
ind = np.asarray(index,dtype=np.int64)
|
||||
ret = [[] for i in range(ind.max()+1)]
|
||||
|
||||
#find the switchpoints
|
||||
ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
|
||||
switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
|
||||
|
||||
[ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
|
||||
return ret
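The examples in the docstring above can be checked directly (a small sketch; the import path follows the file this diff adds, GPy/kern/parts/ODE_UY.py, and relies on Python's slice equality):

    import numpy as np
    from GPy.kern.parts.ODE_UY import index_to_slices

    index = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    assert index_to_slices(index) == [[slice(0, 3)], [slice(3, 6)], [slice(6, 9)]]

    index = np.asarray([0, 0, 1, 1, 0, 2, 2, 2, 1, 1])
    assert index_to_slices(index) == [[slice(0, 2), slice(4, 5)],
                                      [slice(2, 4), slice(8, 10)],
                                      [slice(5, 8)]]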
|
||||
|
||||
class ODE_UY(Kernpart):
|
||||
"""
|
||||
kernel resulting from a first order ODE with an OU driving GP
|
||||
|
||||
:param input_dim: the number of input dimensions; has to be equal to two
|
||||
:type input_dim: int
|
||||
:param input_lengthU: the number of input U length
|
||||
:type input_dim: int
|
||||
:param varianceU: variance of the driving GP
|
||||
:type varianceU: float
|
||||
:param lengthscaleU: lengthscale of the driving GP (sqrt(3)/lengthscaleU)
|
||||
:type lengthscaleU: float
|
||||
:param varianceY: 'variance' of the transfer function
|
||||
:type varianceY: float
|
||||
:param lengthscaleY: 'lengthscale' of the transfer function (1/lengthscaleY)
|
||||
:type lengthscaleY: float
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
|
||||
def __init__(self, input_dim=2,varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
|
||||
assert input_dim==2, "Only defined for input_dim = 2"
|
||||
self.input_dim = input_dim
|
||||
self.num_params = 4
|
||||
self.name = 'ODE_UY'
|
||||
|
||||
|
||||
if lengthscaleU is not None:
|
||||
lengthscaleU = np.asarray(lengthscaleU)
|
||||
assert lengthscaleU.size == 1, "lengthscaleU should be one dimensional"
|
||||
else:
|
||||
lengthscaleU = np.ones(1)
|
||||
if lengthscaleY is not None:
|
||||
lengthscaleY = np.asarray(lengthscaleY)
|
||||
assert lengthscaleY.size == 1, "lengthscaleY should be one dimensional"
|
||||
else:
|
||||
lengthscaleY = np.ones(1)
|
||||
#lengthscaleY = 0.5
|
||||
self._set_params(np.hstack((varianceU, varianceY, lengthscaleU,lengthscaleY)))
|
||||
|
||||
def _get_params(self):
|
||||
"""return the value of the parameters."""
|
||||
return np.hstack((self.varianceU,self.varianceY, self.lengthscaleU,self.lengthscaleY))
|
||||
|
||||
def _set_params(self, x):
|
||||
"""set the value of the parameters."""
|
||||
assert x.size == self.num_params
|
||||
|
||||
self.varianceU = x[0]
|
||||
self.varianceY = x[1]
|
||||
self.lengthscaleU = x[2]
|
||||
self.lengthscaleY = x[3]
|
||||
|
||||
|
||||
def _get_param_names(self):
|
||||
"""return parameter names."""
|
||||
return ['varianceU','varianceY', 'lengthscaleU', 'lengthscaleY']
|
||||
|
||||
|
||||
    def K(self, X, X2, target):
        """Compute the covariance matrix between X and X2."""
        X, slices = X[:,:-1], index_to_slices(X[:,-1])
        if X2 is None:
            X2, slices2 = X, slices
        else:
            X2, slices2 = X2[:,:-1], index_to_slices(X2[:,-1])

        #rdist = X[:,0][:,None] - X2[:,0][:,None].T
        rdist = X - X2.T
        ly = 1/self.lengthscaleY
        lu = np.sqrt(3)/self.lengthscaleU
        #iu = self.input_lengthU # dimension of U

        Vu = self.varianceU
        Vy = self.varianceY

        kuu = lambda dist: Vu * (1 + lu*np.abs(dist)) * np.exp(-lu*np.abs(dist))

        k1 = lambda dist: np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly)) / (ly-lu)**2
        k3 = lambda dist: np.exp(-lu*dist) * ((1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2)
        kyy = lambda dist: Vu*Vy*(k1(dist) + k2(dist) + k3(dist))

        kyu3 = lambda dist: np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
        kyup = lambda dist: Vu*Vy*(k1(dist)+k2(dist)) # t>0 kyu
        kyun = lambda dist: Vu*Vy*(kyu3(dist))        # t<0 kyu

        kuyp = lambda dist: Vu*Vy*(kyu3(dist))        # t>0 kuy
        kuyn = lambda dist: Vu*Vy*(k1(dist)+k2(dist)) # t<0 kuy

        for i, s1 in enumerate(slices):
            for j, s2 in enumerate(slices2):
                for ss1 in s1:
                    for ss2 in s2:
                        if i==0 and j==0:
                            target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
                        elif i==0 and j==1:
                            target[ss1,ss2] = np.where(rdist[ss1,ss2]>0, kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]])))
                        elif i==1 and j==1:
                            target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
                        else:
                            target[ss1,ss2] = np.where(rdist[ss1,ss2]>0, kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]])))

        #KUU = kuu(np.abs(rdist[:iu,:iu]))
        #KYY = kyy(np.abs(rdist[iu:,iu:]))
        #KYU = np.where(rdist[iu:,:iu]>0, kyup(np.abs(rdist[iu:,:iu])), kyun(np.abs(rdist[iu:,:iu])))
        #KUY = np.where(rdist[:iu,iu:]>0, kuyp(np.abs(rdist[:iu,iu:])), kuyn(np.abs(rdist[:iu,iu:])))
        #ker = np.vstack((np.hstack([KUU,KUY]), np.hstack([KYU,KYY])))
        #np.add(ker, target, target)
    def Kdiag(self, X, target):
        """Compute the diagonal of the covariance matrix associated to X."""
        ly = 1/self.lengthscaleY
        lu = np.sqrt(3)/self.lengthscaleU
        #ly = self.lengthscaleY
        #lu = self.lengthscaleU

        k1 = (2*lu+ly)/(lu+ly)**2
        k2 = (ly-2*lu + 2*lu-ly) / (ly-lu)**2
        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2

        slices = index_to_slices(X[:,-1])

        for i, ss1 in enumerate(slices):
            for s1 in ss1:
                if i==0:
                    target[s1] += self.varianceU
                elif i==1:
                    target[s1] += self.varianceU*self.varianceY*(k1+k2+k3)
                else:
                    raise ValueError("invalid input/output index")

        #target[slices[0][0]] += self.varianceU # matern32 diag
        #target[slices[1][0]] += self.varianceU*self.varianceY*(k1+k2+k3) # diag
    def dK_dtheta(self, dL_dK, X, X2, target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        dist = np.abs(X - X2.T)

        ly = 1/self.lengthscaleY
        lu = np.sqrt(3)/self.lengthscaleU
        #ly = self.lengthscaleY
        #lu = self.lengthscaleU

        dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
        #c = np.sqrt(3)
        #t1 = c/lu
        #t2 = 1/ly
        #dk1theta1 = np.exp(-dist*ly)*t2*( (2*c*t2+2*t1)/(c*t2+t1)**2 -2*(2*c*t2*t1+t1**2)/(c*t2+t1)**3 )

        dk2theta1 = lambda dist: 1*(
            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
            + np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
            + np.exp(-dist*ly)*2*(ly-lu)**(-2)
            + np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
            )

        dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)

        # evaluate the three terms at dist (the lambdas themselves cannot be summed)
        dktheta1 = self.varianceU*self.varianceY*(dk1theta1(dist) + dk2theta1(dist) + dk3theta1(dist))

        dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )

        dk2theta2 = lambda dist: 1*(
            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
            + np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
            )

        dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3

        dktheta2 = self.varianceU*self.varianceY*(dk1theta2(dist) + dk2theta2(dist) + dk3theta2(dist))

        k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
        k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
        dkdvar = k1(dist) + k2(dist) + k3(dist)

        target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
        target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
        target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
        target[3] += np.sum(dktheta2*(-self.lengthscaleY**(-2)) * dL_dK)
    # def dKdiag_dtheta(self, dL_dKdiag, X, target):
    #     """derivative of the diagonal of the covariance matrix with respect to the parameters."""
    #     # NB: derivative of diagonal elements wrt lengthscale is 0
    #     target[0] += np.sum(dL_dKdiag)

    # def dK_dX(self, dL_dK, X, X2, target):
    #     """derivative of the covariance matrix with respect to X."""
    #     if X2 is None: X2 = X
    #     dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
    #     ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
    #     dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
    #     target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)

    # def dKdiag_dX(self, dL_dKdiag, X, target):
    #     pass
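
A minimal, hedged usage sketch for the ODE_UY part above (illustrative only, not part
of the original diff). It assumes the Kernpart conventions used throughout
GPy/kern/parts: the last input column holds output indices (0 for the driving process
U, 1 for the output Y) and K accumulates into a preallocated target array.

    import numpy as np
    # hypothetical direct use; the part would normally be wrapped by GPy.kern
    k = ODE_UY(input_dim=2, varianceU=1., varianceY=1., lengthscaleU=2., lengthscaleY=3.)
    t = np.linspace(0., 5., 10)[:, None]
    X = np.vstack([np.hstack([t, np.zeros_like(t)]),   # rows of the driving process U
                   np.hstack([t, np.ones_like(t)])])   # rows of the output process Y
    K = np.zeros((X.shape[0], X.shape[0]))
    k.K(X, None, K)                                    # fills the joint (U, Y) covariance block-wise
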
31
GPy/kern/parts/__init__.py
Normal file
@ -0,0 +1,31 @@
import bias
import Brownian
import coregionalize
import exponential
import eq_ode1
import finite_dimensional
import fixed
import gibbs
import hetero
import hierarchical
import independent_outputs
import linear
import Matern32
import Matern52
import mlp
import ODE_1
import ODE_UY
import periodic_exponential
import periodic_Matern32
import periodic_Matern52
import poly
import prod_orthogonal
import prod
import rational_quadratic
import rbfcos
import rbf
import rbf_inv
import spline
import symmetric
import sympy_helpers
import white
@ -6,7 +6,7 @@ from kernpart import Kernpart
import numpy as np
import hashlib

class bias(Kernpart):
class Bias(Kernpart):
    def __init__(self,input_dim,variance=1.):
        """
        :param input_dim: the number of input dimensions
@ -7,26 +7,51 @@ from GPy.util.linalg import mdot, pdinv
|
|||
import pdb
|
||||
from scipy import weave
|
||||
|
||||
class Coregionalise(Kernpart):
|
||||
class Coregionalize(Kernpart):
|
||||
"""
|
||||
Kernel for Intrinsic Corregionalization Models
|
||||
Covariance function for intrinsic/linear coregionalization models
|
||||
|
||||
This covariance has the form:
|
||||
.. math::
|
||||
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(\boldsymbol{\kappa})
|
||||
|
||||
An intrinsic/linear coregionalization covariance function of the form:
|
||||
.. math::
|
||||
|
||||
k_2(x, y)=\mathbf{B} k(x, y)
|
||||
|
||||
it is obtained as the tensor product between a covariance function
|
||||
k(x,y) and B.
|
||||
|
||||
:param output_dim: number of outputs to coregionalize
|
||||
:type output_dim: int
|
||||
:param rank: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
|
||||
:type rank: int
|
||||
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
|
||||
:type W: numpy array of dimensionality (num_outpus, W_columns)
|
||||
:param kappa: a vector which allows the outputs to behave independently
|
||||
:type kappa: numpy array of dimensionality (output_dim,)
|
||||
|
||||
.. note: see coregionalization examples in GPy.examples.regression for some usage.
|
||||
"""
|
||||
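# A hedged illustration (illustrative, not from the original code) of how the
# coregionalization matrix described above is assembled from W and kappa:
#     W = 0.5*np.random.randn(3, 1)           # output_dim=3, rank=1
#     kappa = 0.5*np.ones(3)
#     B = np.dot(W, W.T) + np.diag(kappa)     # 3x3, positive semi-definite
# Entry B[i, j] then scales the shared covariance k(x, y) between outputs i and j,
# exactly as _set_params below constructs self.B.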
def __init__(self,Nout,R=1, W=None, kappa=None):
|
||||
def __init__(self, output_dim, rank=1, W=None, kappa=None):
|
||||
self.input_dim = 1
|
||||
self.name = 'coregion'
|
||||
self.Nout = Nout
|
||||
self.R = R
|
||||
self.output_dim = output_dim
|
||||
self.rank = rank
|
||||
if self.rank>output_dim-1:
|
||||
print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
|
||||
if W is None:
|
||||
self.W = np.ones((self.Nout,self.R))
|
||||
self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
|
||||
else:
|
||||
assert W.shape==(self.Nout,self.R)
|
||||
assert W.shape==(self.output_dim,self.rank)
|
||||
self.W = W
|
||||
if kappa is None:
|
||||
kappa = np.ones(self.Nout)
|
||||
kappa = 0.5*np.ones(self.output_dim)
|
||||
else:
|
||||
assert kappa.shape==(self.Nout,)
|
||||
assert kappa.shape==(self.output_dim,)
|
||||
self.kappa = kappa
|
||||
self.num_params = self.Nout*(self.R + 1)
|
||||
self.num_params = self.output_dim*(self.rank + 1)
|
||||
self._set_params(np.hstack([self.W.flatten(),self.kappa]))
|
||||
|
||||
def _get_params(self):
|
||||
|
|
@ -34,12 +59,12 @@ class Coregionalise(Kernpart):
|
|||
|
||||
def _set_params(self,x):
|
||||
assert x.size == self.num_params
|
||||
self.kappa = x[-self.Nout:]
|
||||
self.W = x[:-self.Nout].reshape(self.Nout,self.R)
|
||||
self.kappa = x[-self.output_dim:]
|
||||
self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
|
||||
self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
|
||||
|
||||
def _get_param_names(self):
|
||||
return sum([['W%i_%i'%(i,j) for j in range(self.R)] for i in range(self.Nout)],[]) + ['kappa_%i'%i for i in range(self.Nout)]
|
||||
return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
|
||||
|
||||
def K(self,index,index2,target):
|
||||
index = np.asarray(index,dtype=np.int)
|
||||
|
|
@ -57,26 +82,26 @@ class Coregionalise(Kernpart):
|
|||
if index2 is None:
|
||||
code="""
|
||||
for(int i=0;i<N; i++){
|
||||
target[i+i*N] += B[index[i]+Nout*index[i]];
|
||||
target[i+i*N] += B[index[i]+output_dim*index[i]];
|
||||
for(int j=0; j<i; j++){
|
||||
target[j+i*N] += B[index[i]+Nout*index[j]];
|
||||
target[j+i*N] += B[index[i]+output_dim*index[j]];
|
||||
target[i+j*N] += target[j+i*N];
|
||||
}
|
||||
}
|
||||
"""
|
||||
N,B,Nout = index.size, self.B, self.Nout
|
||||
weave.inline(code,['target','index','N','B','Nout'])
|
||||
N,B,output_dim = index.size, self.B, self.output_dim
|
||||
weave.inline(code,['target','index','N','B','output_dim'])
|
||||
else:
|
||||
index2 = np.asarray(index2,dtype=np.int)
|
||||
code="""
|
||||
for(int i=0;i<num_inducing; i++){
|
||||
for(int j=0; j<N; j++){
|
||||
target[i+j*num_inducing] += B[Nout*index[j]+index2[i]];
|
||||
target[i+j*num_inducing] += B[output_dim*index[j]+index2[i]];
|
||||
}
|
||||
}
|
||||
"""
|
||||
N,num_inducing,B,Nout = index.size,index2.size, self.B, self.Nout
|
||||
weave.inline(code,['target','index','index2','N','num_inducing','B','Nout'])
|
||||
N,num_inducing,B,output_dim = index.size,index2.size, self.B, self.output_dim
|
||||
weave.inline(code,['target','index','index2','N','num_inducing','B','output_dim'])
|
||||
|
||||
|
||||
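# A hedged numpy equivalent (illustrative, not from the original code) of the two
# weave kernels above: for integer index vectors the whole block update is just
#     target += self.B[index[:, None], index2[None, :]]
# the inlined C exists only to avoid the temporary array and to exploit symmetry
# in the X == X2 case.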
def Kdiag(self,index,target):
|
||||
|
|
@ -93,12 +118,12 @@ class Coregionalise(Kernpart):
|
|||
code="""
|
||||
for(int i=0; i<num_inducing; i++){
|
||||
for(int j=0; j<N; j++){
|
||||
dL_dK_small[index[j] + Nout*index2[i]] += dL_dK[i+j*num_inducing];
|
||||
dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
|
||||
}
|
||||
}
|
||||
"""
|
||||
N, num_inducing, Nout = index.size, index2.size, self.Nout
|
||||
weave.inline(code, ['N','num_inducing','Nout','dL_dK','dL_dK_small','index','index2'])
|
||||
N, num_inducing, output_dim = index.size, index2.size, self.output_dim
|
||||
weave.inline(code, ['N','num_inducing','output_dim','dL_dK','dL_dK_small','index','index2'])
|
||||
|
||||
dkappa = np.diag(dL_dK_small)
|
||||
dL_dK_small += dL_dK_small.T
|
||||
|
|
@ -115,8 +140,8 @@ class Coregionalise(Kernpart):
|
|||
ii,jj = ii.T, jj.T
|
||||
|
||||
dL_dK_small = np.zeros_like(self.B)
|
||||
for i in range(self.Nout):
|
||||
for j in range(self.Nout):
|
||||
for i in range(self.output_dim):
|
||||
for j in range(self.output_dim):
|
||||
tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
|
||||
dL_dK_small[i,j] = tmp
|
||||
|
||||
|
|
@ -128,15 +153,13 @@ class Coregionalise(Kernpart):
|
|||
|
||||
def dKdiag_dtheta(self,dL_dKdiag,index,target):
|
||||
index = np.asarray(index,dtype=np.int).flatten()
|
||||
dL_dKdiag_small = np.zeros(self.Nout)
|
||||
for i in range(self.Nout):
|
||||
dL_dKdiag_small = np.zeros(self.output_dim)
|
||||
for i in range(self.output_dim):
|
||||
dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
|
||||
dW = 2.*self.W*dL_dKdiag_small[:,None]
|
||||
dkappa = dL_dKdiag_small
|
||||
target += np.hstack([dW.flatten(),dkappa])
|
||||
|
||||
def dK_dX(self,dL_dK,X,X2,target):
|
||||
#NOTE In this case, pass is equivalent to returning zero.
|
||||
pass
|
||||
|
||||
|
||||
|
||||
556
GPy/kern/parts/eq_ode1.py
Normal file
@ -0,0 +1,556 @@
# Copyright (c) 2013, GPy Authors, see AUTHORS.txt
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
from GPy.util.linalg import mdot, pdinv
|
||||
from GPy.util.ln_diff_erfs import ln_diff_erfs
|
||||
import pdb
|
||||
from scipy import weave
|
||||
|
||||
class Eq_ode1(Kernpart):
|
||||
"""
|
||||
Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
|
||||
|
||||
The outputs of this kernel have the form
|
||||
.. math::
|
||||
\frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} f_i(t-\delta_j) +\sqrt{\kappa_j}g_j(t) - d_jy_j(t)
|
||||
|
||||
where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes governed by an exponentiated quadratic covariance.
|
||||
|
||||
:param output_dim: number of outputs driven by latent function.
|
||||
:type output_dim: int
|
||||
:param W: sensitivities of each output to the latent driving function.
|
||||
:type W: ndarray (output_dim x rank).
|
||||
:param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
|
||||
:type rank: int
|
||||
:param decay: decay rates for the first order system.
|
||||
:type decay: array of length output_dim.
|
||||
:param delay: delay between latent force and output response.
|
||||
:type delay: array of length output_dim.
|
||||
:param kappa: diagonal term that allows each latent output to have an independent component to the response.
|
||||
:type kappa: array of length output_dim.
|
||||
|
||||
.. Note: see first order differential equation examples in GPy.examples.regression for some usage.
|
||||
"""
|
||||
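# A hedged note (illustrative, not from the original code): with a zero initial
# condition the differential equation in the docstring is solved by the convolution
#     y_j(t) = int_0^t exp(-d_j*(t-u)) * ( sum_i w_{j,i} f_i(u - delta_j)
#                                          + sqrt(kappa_j) g_j(u) ) du,
# so covariances between outputs and the latent forces reduce to integrals of the
# exponentiated quadratic against an exponential decay, which is why the code below
# works in terms of erf differences via ln_diff_erfs.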
def __init__(self,output_dim, W=None, rank=1, kappa=None, lengthscale=1.0, decay=None, delay=None):
|
||||
self.rank = rank
|
||||
self.input_dim = 1
|
||||
self.name = 'eq_ode1'
|
||||
self.output_dim = output_dim
|
||||
self.lengthscale = lengthscale
|
||||
self.num_params = self.output_dim*self.rank + 1 + (self.output_dim - 1)
|
||||
if kappa is not None:
|
||||
self.num_params+=self.output_dim
|
||||
if delay is not None:
|
||||
assert delay.shape==(self.output_dim-1,)
|
||||
self.num_params+=self.output_dim-1
|
||||
self.rank = rank
|
||||
if W is None:
|
||||
self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
|
||||
else:
|
||||
assert W.shape==(self.output_dim,self.rank)
|
||||
self.W = W
|
||||
if decay is None:
|
||||
self.decay = np.ones(self.output_dim-1)
|
||||
if kappa is not None:
|
||||
assert kappa.shape==(self.output_dim,)
|
||||
self.kappa = kappa
|
||||
|
||||
self.delay = delay
|
||||
self.is_normalized = True
|
||||
self.is_stationary = False
|
||||
self.gaussian_initial = False
|
||||
self._set_params(self._get_params())
|
||||
|
||||
def _get_params(self):
|
||||
param_list = [self.W.flatten()]
|
||||
if self.kappa is not None:
|
||||
param_list.append(self.kappa)
|
||||
param_list.append(self.decay)
|
||||
if self.delay is not None:
|
||||
param_list.append(self.delay)
|
||||
param_list.append(self.lengthscale)
|
||||
return np.hstack(param_list)
|
||||
|
||||
def _set_params(self,x):
|
||||
assert x.size == self.num_params
|
||||
end = self.output_dim*self.rank
|
||||
self.W = x[:end].reshape(self.output_dim,self.rank)
|
||||
start = end
|
||||
self.B = np.dot(self.W,self.W.T)
|
||||
if self.kappa is not None:
|
||||
end+=self.output_dim
|
||||
self.kappa = x[start:end]
|
||||
self.B += np.diag(self.kappa)
|
||||
start=end
|
||||
end+=self.output_dim-1
|
||||
self.decay = x[start:end]
|
||||
start=end
|
||||
if self.delay is not None:
|
||||
end+=self.output_dim-1
|
||||
self.delay = x[start:end]
|
||||
start=end
|
||||
end+=1
|
||||
self.lengthscale = x[start]
|
||||
self.sigma = np.sqrt(2)*self.lengthscale
|
||||
|
||||
|
||||
def _get_param_names(self):
|
||||
param_names = sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[])
|
||||
if self.kappa is not None:
|
||||
param_names += ['kappa_%i'%i for i in range(self.output_dim)]
|
||||
param_names += ['decay_%i'%i for i in range(1,self.output_dim)]
|
||||
if self.delay is not None:
|
||||
param_names += ['delay_%i'%i for i in 1+range(1,self.output_dim)]
|
||||
param_names+= ['lengthscale']
|
||||
return param_names
|
||||
|
||||
def K(self,X,X2,target):
|
||||
|
||||
if X.shape[1] > 2:
|
||||
raise ValueError('Input matrix for ode1 covariance should have at most two columns, one containing times, the other output indices')
|
||||
|
||||
self._K_computations(X, X2)
|
||||
target += self._scale*self._K_dvar
|
||||
|
||||
if self.gaussian_initial:
|
||||
# Add covariance associated with initial condition.
|
||||
t1_mat = self._t[self._rorder, None]
|
||||
t2_mat = self._t2[None, self._rorder2]
|
||||
target+=self.initial_variance * np.exp(- self.decay * (t1_mat + t2_mat))
|
||||
|
||||
def Kdiag(self,index,target):
|
||||
#target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
|
||||
pass
|
||||
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
|
||||
# First extract times and indices.
|
||||
self._extract_t_indices(X, X2, dL_dK=dL_dK)
|
||||
self._dK_ode_dtheta(target)
|
||||
|
||||
|
||||
def _dK_ode_dtheta(self, target):
|
||||
"""Do all the computations for the ode parts of the covariance function."""
|
||||
t_ode = self._t[self._index>0]
|
||||
dL_dK_ode = self._dL_dK[self._index>0, :]
|
||||
index_ode = self._index[self._index>0]-1
|
||||
if self._t2 is None:
|
||||
if t_ode.size==0:
|
||||
return
|
||||
t2_ode = t_ode
|
||||
dL_dK_ode = dL_dK_ode[:, self._index>0]
|
||||
index2_ode = index_ode
|
||||
else:
|
||||
t2_ode = self._t2[self._index2>0]
|
||||
dL_dK_ode = dL_dK_ode[:, self._index2>0]
|
||||
if t_ode.size==0 or t2_ode.size==0:
|
||||
return
|
||||
index2_ode = self._index2[self._index2>0]-1
|
||||
|
||||
h1 = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary, update_derivatives=True)
|
||||
#self._dK_ddelay = self._dh_ddelay
|
||||
self._dK_dsigma = self._dh_dsigma
|
||||
|
||||
if self._t2 is None:
|
||||
h2 = h1
|
||||
else:
|
||||
h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary, update_derivatives=True)
|
||||
|
||||
#self._dK_ddelay += self._dh_ddelay.T
|
||||
self._dK_dsigma += self._dh_dsigma.T
|
||||
# C1 = self.sensitivity
|
||||
# C2 = self.sensitivity
|
||||
|
||||
# K = 0.5 * (h1 + h2.T)
|
||||
# var2 = C1*C2
|
||||
# if self.is_normalized:
|
||||
# dk_dD1 = (sum(sum(dL_dK.*dh1_dD1)) + sum(sum(dL_dK.*dh2_dD1.T)))*0.5*var2
|
||||
# dk_dD2 = (sum(sum(dL_dK.*dh1_dD2)) + sum(sum(dL_dK.*dh2_dD2.T)))*0.5*var2
|
||||
# dk_dsigma = 0.5 * var2 * sum(sum(dL_dK.*dK_dsigma))
|
||||
# dk_dC1 = C2 * sum(sum(dL_dK.*K))
|
||||
# dk_dC2 = C1 * sum(sum(dL_dK.*K))
|
||||
# else:
|
||||
# K = np.sqrt(np.pi) * K
|
||||
# dk_dD1 = (sum(sum(dL_dK.*dh1_dD1)) + * sum(sum(dL_dK.*K))
|
||||
# dk_dC2 = self.sigma * C1 * sum(sum(dL_dK.*K))
|
||||
|
||||
|
||||
# dk_dSim1Variance = dk_dC1
|
||||
# Last element is the length scale.
|
||||
(dL_dK_ode[:, :, None]*self._dh_ddelay[:, None, :]).sum(2)
|
||||
|
||||
target[-1] += (dL_dK_ode*self._dK_dsigma/np.sqrt(2)).sum()
|
||||
|
||||
|
||||
# # only pass the gradient with respect to the inverse width to one
|
||||
# # of the gradient vectors ... otherwise it is counted twice.
|
||||
# g1 = real([dk_dD1 dk_dinvWidth dk_dSim1Variance])
|
||||
# g2 = real([dk_dD2 0 dk_dSim2Variance])
|
||||
# return g1, g2"""
|
||||
|
||||
def dKdiag_dtheta(self,dL_dKdiag,index,target):
|
||||
pass
|
||||
|
||||
def dK_dX(self,dL_dK,X,X2,target):
|
||||
pass
|
||||
|
||||
def _extract_t_indices(self, X, X2=None, dL_dK=None):
|
||||
"""Extract times and output indices from the input matrix X. Times are ordered according to their index for convenience of computation, this ordering is stored in self._order and self.order2. These orderings are then mapped back to the original ordering (in X) using self._rorder and self._rorder2. """
|
||||
|
||||
# TODO: some fast checking here to see if this needs recomputing?
|
||||
self._t = X[:, 0]
|
||||
if not X.shape[1] == 2:
|
||||
raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
|
||||
self._index = np.asarray(X[:, 1],dtype=np.int)
|
||||
# Sort indices so that outputs are in blocks for computational
|
||||
# convenience.
|
||||
self._order = self._index.argsort()
|
||||
self._index = self._index[self._order]
|
||||
self._t = self._t[self._order]
|
||||
self._rorder = self._order.argsort() # rorder is for reversing the order
|
||||
|
||||
if X2 is None:
|
||||
self._t2 = None
|
||||
self._index2 = None
|
||||
self._order2 = self._order
|
||||
self._rorder2 = self._rorder
|
||||
else:
|
||||
if not X2.shape[1] == 2:
|
||||
raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
|
||||
self._t2 = X2[:, 0]
|
||||
self._index2 = np.asarray(X2[:, 1],dtype=np.int)
|
||||
self._order2 = self._index2.argsort()
|
||||
self._index2 = self._index2[self._order2]
|
||||
self._t2 = self._t2[self._order2]
|
||||
self._rorder2 = self._order2.argsort() # rorder2 is for reversing order
|
||||
|
||||
if dL_dK is not None:
|
||||
self._dL_dK = dL_dK[self._order, :]
|
||||
self._dL_dK = self._dL_dK[:, self._order2]
|
||||
|
||||
def _K_computations(self, X, X2):
|
||||
"""Perform main body of computations for the ode1 covariance function."""
|
||||
# First extract times and indices.
|
||||
self._extract_t_indices(X, X2)
|
||||
|
||||
self._K_compute_eq()
|
||||
self._K_compute_ode_eq()
|
||||
if X2 is None:
|
||||
self._K_eq_ode = self._K_ode_eq.T
|
||||
else:
|
||||
self._K_compute_ode_eq(transpose=True)
|
||||
self._K_compute_ode()
|
||||
|
||||
if X2 is None:
|
||||
self._K_dvar = np.zeros((self._t.shape[0], self._t.shape[0]))
|
||||
else:
|
||||
self._K_dvar = np.zeros((self._t.shape[0], self._t2.shape[0]))
|
||||
|
||||
# Reorder values of blocks for placing back into _K_dvar.
|
||||
self._K_dvar = np.vstack((np.hstack((self._K_eq, self._K_eq_ode)),
|
||||
np.hstack((self._K_ode_eq, self._K_ode))))
|
||||
self._K_dvar = self._K_dvar[self._rorder, :]
|
||||
self._K_dvar = self._K_dvar[:, self._rorder2]
|
||||
|
||||
|
||||
if X2 is None:
|
||||
# Matrix giving scales of each output
|
||||
self._scale = np.zeros((self._t.size, self._t.size))
|
||||
code="""
|
||||
for(int i=0;i<N; i++){
|
||||
scale_mat[i+i*N] = B[index[i]+output_dim*(index[i])];
|
||||
for(int j=0; j<i; j++){
|
||||
scale_mat[j+i*N] = B[index[i]+output_dim*index[j]];
|
||||
scale_mat[i+j*N] = scale_mat[j+i*N];
|
||||
}
|
||||
}
|
||||
"""
|
||||
scale_mat, B, index = self._scale, self.B, self._index
|
||||
N, output_dim = self._t.size, self.output_dim
|
||||
weave.inline(code,['index',
|
||||
'scale_mat', 'B',
|
||||
'N', 'output_dim'])
|
||||
else:
|
||||
self._scale = np.zeros((self._t.size, self._t2.size))
|
||||
code = """
|
||||
for(int i=0; i<N; i++){
|
||||
for(int j=0; j<N2; j++){
|
||||
scale_mat[i+j*N] = B[index[i]+output_dim*index2[j]];
|
||||
}
|
||||
}
|
||||
"""
|
||||
scale_mat, B, index, index2 = self._scale, self.B, self._index, self._index2
|
||||
N, N2, output_dim = self._t.size, self._t2.size, self.output_dim
|
||||
weave.inline(code, ['index', 'index2',
|
||||
'scale_mat', 'B',
|
||||
'N', 'N2', 'output_dim'])
|
||||
|
||||
|
||||
|
||||
def _K_compute_eq(self):
|
||||
"""Compute covariance for latent covariance."""
|
||||
t_eq = self._t[self._index==0]
|
||||
if self._t2 is None:
|
||||
if t_eq.size==0:
|
||||
self._K_eq = np.zeros((0, 0))
|
||||
return
|
||||
self._dist2 = np.square(t_eq[:, None] - t_eq[None, :])
|
||||
else:
|
||||
t2_eq = self._t2[self._index2==0]
|
||||
if t_eq.size==0 or t2_eq.size==0:
|
||||
self._K_eq = np.zeros((t_eq.size, t2_eq.size))
|
||||
return
|
||||
self._dist2 = np.square(t_eq[:, None] - t2_eq[None, :])
|
||||
|
||||
self._K_eq = np.exp(-self._dist2/(2*self.lengthscale*self.lengthscale))
|
||||
if self.is_normalized:
|
||||
self._K_eq/=(np.sqrt(2*np.pi)*self.lengthscale)
|
||||
|
||||
def _K_compute_ode_eq(self, transpose=False):
|
||||
"""Compute the cross covariances between latent exponentiated quadratic and observed ordinary differential equations.
|
||||
|
||||
:param transpose: if set to false the exponentiated quadratic is on the rows of the matrix and is computed according to self._t, if set to true it is on the columns and is computed according to self._t2 (default=False).
|
||||
:type transpose: bool"""
|
||||
|
||||
if self._t2 is not None:
|
||||
if transpose:
|
||||
t_eq = self._t[self._index==0]
|
||||
t_ode = self._t2[self._index2>0]
|
||||
index_ode = self._index2[self._index2>0]-1
|
||||
else:
|
||||
t_eq = self._t2[self._index2==0]
|
||||
t_ode = self._t[self._index>0]
|
||||
index_ode = self._index[self._index>0]-1
|
||||
else:
|
||||
t_eq = self._t[self._index==0]
|
||||
t_ode = self._t[self._index>0]
|
||||
index_ode = self._index[self._index>0]-1
|
||||
|
||||
if t_ode.size==0 or t_eq.size==0:
|
||||
if transpose:
|
||||
self._K_eq_ode = np.zeros((t_eq.shape[0], t_ode.shape[0]))
|
||||
else:
|
||||
self._K_ode_eq = np.zeros((t_ode.shape[0], t_eq.shape[0]))
|
||||
return
|
||||
|
||||
t_ode_mat = t_ode[:, None]
|
||||
t_eq_mat = t_eq[None, :]
|
||||
if self.delay is not None:
|
||||
t_ode_mat -= self.delay[index_ode, None]
|
||||
diff_t = (t_ode_mat - t_eq_mat)
|
||||
|
||||
inv_sigma_diff_t = 1./self.sigma*diff_t
|
||||
decay_vals = self.decay[index_ode][:, None]
|
||||
half_sigma_d_i = 0.5*self.sigma*decay_vals
|
||||
|
||||
if self.is_stationary:
|
||||
ln_part, signs = ln_diff_erfs(np.inf, half_sigma_d_i - inv_sigma_diff_t, return_sign=True)
|
||||
else:
|
||||
ln_part, signs = ln_diff_erfs(half_sigma_d_i + t_eq_mat/self.sigma, half_sigma_d_i - inv_sigma_diff_t, return_sign=True)
|
||||
sK = signs*np.exp(half_sigma_d_i*half_sigma_d_i - decay_vals*diff_t + ln_part)
|
||||
|
||||
sK *= 0.5
|
||||
|
||||
if not self.is_normalized:
|
||||
sK *= np.sqrt(np.pi)*self.sigma
|
||||
|
||||
|
||||
if transpose:
|
||||
self._K_eq_ode = sK.T
|
||||
else:
|
||||
self._K_ode_eq = sK
|
||||
|
||||
def _K_compute_ode(self):
|
||||
# Compute covariances between outputs of the ODE models.
|
||||
|
||||
t_ode = self._t[self._index>0]
|
||||
index_ode = self._index[self._index>0]-1
|
||||
if self._t2 is None:
|
||||
if t_ode.size==0:
|
||||
self._K_ode = np.zeros((0, 0))
|
||||
return
|
||||
t2_ode = t_ode
|
||||
index2_ode = index_ode
|
||||
else:
|
||||
t2_ode = self._t2[self._index2>0]
|
||||
if t_ode.size==0 or t2_ode.size==0:
|
||||
self._K_ode = np.zeros((t_ode.size, t2_ode.size))
|
||||
return
|
||||
index2_ode = self._index2[self._index2>0]-1
|
||||
|
||||
# When index is identical
|
||||
h = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary)
|
||||
|
||||
if self._t2 is None:
|
||||
self._K_ode = 0.5 * (h + h.T)
|
||||
else:
|
||||
h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary)
|
||||
self._K_ode = 0.5 * (h + h2.T)
|
||||
|
||||
if not self.is_normalized:
|
||||
self._K_ode *= np.sqrt(np.pi)*self.sigma
|
||||
def _compute_diag_H(self, t, index, update_derivatives=False, stationary=False):
|
||||
"""Helper function for computing H for the diagonal only.
|
||||
:param t: time input.
|
||||
:type t: array
|
||||
:param index: first output indices
|
||||
:type index: array of int.
|
||||
:param index: second output indices
|
||||
:type index: array of int.
|
||||
:param update_derivatives: whether or not to update the derivative portions (default False).
|
||||
:type update_derivatives: bool
|
||||
:param stationary: whether to compute the stationary version of the covariance (default False).
|
||||
:type stationary: bool"""
|
||||
|
||||
"""if delta_i~=delta_j:
|
||||
[h, dh_dD_i, dh_dD_j, dh_dsigma] = np.diag(simComputeH(t, index, t, index, update_derivatives=True, stationary=self.is_stationary))
|
||||
else:
|
||||
Decay = self.decay[index]
|
||||
if self.delay is not None:
|
||||
t = t - self.delay[index]
|
||||
|
||||
t_squared = t*t
|
||||
half_sigma_decay = 0.5*self.sigma*Decay
|
||||
[ln_part_1, sign1] = ln_diff_erfs(half_sigma_decay + t/self.sigma,
|
||||
half_sigma_decay)
|
||||
|
||||
[ln_part_2, sign2] = ln_diff_erfs(half_sigma_decay,
|
||||
half_sigma_decay - t/self.sigma)
|
||||
|
||||
h = (sign1*np.exp(half_sigma_decay*half_sigma_decay
|
||||
+ ln_part_1
|
||||
- log(Decay + D_j))
|
||||
- sign2*np.exp(half_sigma_decay*half_sigma_decay
|
||||
- (Decay + D_j)*t
|
||||
+ ln_part_2
|
||||
- log(Decay + D_j)))
|
||||
|
||||
sigma2 = self.sigma*self.sigma
|
||||
|
||||
if update_derivatives:
|
||||
|
||||
dh_dD_i = ((0.5*Decay*sigma2*(Decay + D_j)-1)*h
|
||||
+ t*sign2*np.exp(
|
||||
half_sigma_decay*half_sigma_decay-(Decay+D_j)*t + ln_part_2
|
||||
)
|
||||
+ self.sigma/np.sqrt(np.pi)*
|
||||
(-1 + np.exp(-t_squared/sigma2-Decay*t)
|
||||
+ np.exp(-t_squared/sigma2-D_j*t)
|
||||
- np.exp(-(Decay + D_j)*t)))
|
||||
|
||||
dh_dD_i = (dh_dD_i/(Decay+D_j)).real
|
||||
|
||||
|
||||
|
||||
dh_dD_j = (t*sign2*np.exp(
|
||||
half_sigma_decay*half_sigma_decay-(Decay + D_j)*t+ln_part_2
|
||||
)
|
||||
-h)
|
||||
dh_dD_j = (dh_dD_j/(Decay + D_j)).real
|
||||
|
||||
dh_dsigma = 0.5*Decay*Decay*self.sigma*h \
|
||||
+ 2/(np.sqrt(np.pi)*(Decay+D_j))\
|
||||
*((-Decay/2) \
|
||||
+ (-t/sigma2+Decay/2)*np.exp(-t_squared/sigma2 - Decay*t) \
|
||||
- (-t/sigma2-Decay/2)*np.exp(-t_squared/sigma2 - D_j*t) \
|
||||
- Decay/2*np.exp(-(Decay+D_j)*t))"""
|
||||
pass
|
||||
|
||||
def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=False):
|
||||
"""Helper function for computing part of the ode1 covariance function.
|
||||
|
||||
:param t: first time input.
|
||||
:type t: array
|
||||
:param index: Indices of first output.
|
||||
:type index: array of int
|
||||
:param t2: second time input.
|
||||
:type t2: array
|
||||
:param index2: Indices of second output.
|
||||
:type index2: array of int
|
||||
:param update_derivatives: whether to update derivatives (default is False)
|
||||
:return h : result of this subcomponent of the kernel for the given values.
|
||||
:rtype: ndarray
|
||||
"""
|
||||
|
||||
if stationary:
|
||||
raise NotImplementedError, "Error, stationary version of this covariance not yet implemented."
|
||||
# Vector of decays and delays associated with each output.
|
||||
Decay = self.decay[index]
|
||||
Decay2 = self.decay[index2]
|
||||
t_mat = t[:, None]
|
||||
t2_mat = t2[None, :]
|
||||
if self.delay is not None:
|
||||
Delay = self.delay[index]
|
||||
Delay2 = self.delay[index2]
|
||||
t_mat-=Delay[:, None]
|
||||
t2_mat-=Delay2[None, :]
|
||||
|
||||
diff_t = (t_mat - t2_mat)
|
||||
inv_sigma_diff_t = 1./self.sigma*diff_t
|
||||
half_sigma_decay_i = 0.5*self.sigma*Decay[:, None]
|
||||
|
||||
ln_part_1, sign1 = ln_diff_erfs(half_sigma_decay_i + t2_mat/self.sigma,
|
||||
half_sigma_decay_i - inv_sigma_diff_t,
|
||||
return_sign=True)
|
||||
ln_part_2, sign2 = ln_diff_erfs(half_sigma_decay_i,
|
||||
half_sigma_decay_i - t_mat/self.sigma,
|
||||
return_sign=True)
|
||||
|
||||
h = sign1*np.exp(half_sigma_decay_i
|
||||
*half_sigma_decay_i
|
||||
-Decay[:, None]*diff_t+ln_part_1
|
||||
-np.log(Decay[:, None] + Decay2[None, :]))
|
||||
h -= sign2*np.exp(half_sigma_decay_i*half_sigma_decay_i
|
||||
-Decay[:, None]*t_mat-Decay2[None, :]*t2_mat+ln_part_2
|
||||
-np.log(Decay[:, None] + Decay2[None, :]))
|
||||
|
||||
if update_derivatives:
|
||||
sigma2 = self.sigma*self.sigma
|
||||
# Update ith decay gradient
|
||||
|
||||
dh_ddecay = ((0.5*Decay[:, None]*sigma2*(Decay[:, None] + Decay2[None, :])-1)*h
|
||||
+ (-diff_t*sign1*np.exp(
|
||||
half_sigma_decay_i*half_sigma_decay_i-Decay[:, None]*diff_t+ln_part_1
|
||||
)
|
||||
+t_mat*sign2*np.exp(
|
||||
half_sigma_decay_i*half_sigma_decay_i-Decay[:, None]*t_mat
|
||||
- Decay2*t2_mat+ln_part_2))
|
||||
+self.sigma/np.sqrt(np.pi)*(
|
||||
-np.exp(
|
||||
-diff_t*diff_t/sigma2
|
||||
)+np.exp(
|
||||
-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat
|
||||
)+np.exp(
|
||||
-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat
|
||||
)-np.exp(
|
||||
-(Decay[:, None]*t_mat + Decay2[None, :]*t2_mat)
|
||||
)
|
||||
))
|
||||
self._dh_ddecay = (dh_ddecay/(Decay[:, None]+Decay2[None, :])).real
|
||||
|
||||
# Update jth decay gradient
|
||||
dh_ddecay2 = (t2_mat*sign2
|
||||
*np.exp(
|
||||
half_sigma_decay_i*half_sigma_decay_i
|
||||
-(Decay[:, None]*t_mat + Decay2[None, :]*t2_mat)
|
||||
+ln_part_2
|
||||
)
|
||||
-h)
|
||||
self._dh_ddecay2 = (dh_ddecay2/(Decay[:, None] + Decay2[None, :])).real
|
||||
|
||||
# Update sigma gradient
|
||||
self._dh_dsigma = (half_sigma_decay_i*Decay[:, None]*h
|
||||
+ 2/(np.sqrt(np.pi)
|
||||
*(Decay[:, None]+Decay2[None, :]))
|
||||
*((-diff_t/sigma2-Decay[:, None]/2)
|
||||
*np.exp(-diff_t*diff_t/sigma2)
|
||||
+ (-t2_mat/sigma2+Decay[:, None]/2)
|
||||
*np.exp(-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat)
|
||||
- (-t_mat/sigma2-Decay[:, None]/2)
|
||||
*np.exp(-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat)
|
||||
- Decay[:, None]/2
|
||||
*np.exp(-(Decay[:, None]*t_mat+Decay2[None, :]*t2_mat))))
|
||||
|
||||
return h
|
||||
|
|
@ -6,7 +6,7 @@ from kernpart import Kernpart
|
|||
import numpy as np
|
||||
from scipy import integrate
|
||||
|
||||
class exponential(Kernpart):
|
||||
class Exponential(Kernpart):
|
||||
"""
|
||||
Exponential kernel (aka Ornstein-Uhlenbeck or Matern 1/2)
|
||||
|
||||
|
|
@ -4,9 +4,9 @@
|
|||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
from ..util.linalg import pdinv,mdot
|
||||
from ...util.linalg import pdinv,mdot
|
||||
|
||||
class finite_dimensional(Kernpart):
|
||||
class FiniteDimensional(Kernpart):
|
||||
def __init__(self, input_dim, F, G, variance=1., weights=None):
|
||||
"""
|
||||
Arguments
|
||||
|
|
@ -15,7 +15,7 @@ class Fixed(Kernpart):
|
|||
self.input_dim = input_dim
|
||||
self.fixed_K = K
|
||||
self.num_params = 1
|
||||
self.name = 'Fixed'
|
||||
self.name = 'fixed'
|
||||
self._set_params(np.array([variance]).flatten())
|
||||
|
||||
def _get_params(self):
|
||||
154
GPy/kern/parts/gibbs.py
Normal file
@ -0,0 +1,154 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
from ...util.linalg import tdot
|
||||
from ...core.mapping import Mapping
|
||||
import GPy
|
||||
|
||||
class Gibbs(Kernpart):
|
||||
"""
|
||||
Gibbs non-stationary covariance function.
|
||||
|
||||
.. math::
|
||||
|
||||
    r = \sqrt{(x_i - x_j)^\top (x_i - x_j)}

    k(x_i, x_j) = \sigma^2\, Z \exp\left(-\frac{r^2}{l(x_i)^2 + l(x_j)^2}\right)

    Z = \left(\frac{2\, l(x_i)\, l(x_j)}{l(x_i)^2 + l(x_j)^2}\right)^{q/2}
|
||||
|
||||
where :math:`l(x)` is a function giving the length scale as a function of space and :math:`q` is the dimensionality of the input space.
|
||||
This is the non stationary kernel proposed by Mark Gibbs in his 1997
|
||||
thesis. It is similar to an RBF but has a length scale that varies
|
||||
with input location. This leads to an additional term in front of
|
||||
the kernel.
|
||||
|
||||
The parameters are :math:`\sigma^2`, the process variance, and
|
||||
the parameters of l(x) which is a function that can be
|
||||
specified by the user, by default a multi-layer perceptron is
|
||||
used.
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance :math:`\sigma^2`
|
||||
:type variance: float
|
||||
:param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
|
||||
:type mapping: GPy.core.Mapping
|
||||
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
|
||||
:type ARD: Boolean
|
||||
:rtype: Kernpart object
|
||||
|
||||
See Mark Gibbs's thesis for more details: Gibbs,
|
||||
M. N. (1997). Bayesian Gaussian Processes for Regression and
|
||||
Classification. PhD thesis, Department of Physics, University of
|
||||
Cambridge. Or also see Page 93 of Gaussian Processes for Machine
|
||||
Learning by Rasmussen and Williams. Although note that we do not
|
||||
constrain the lengthscale to be positive by default. This allows
|
||||
anticorrelation to occur. The positive constraint can be included
|
||||
by the user manually.
|
||||
|
||||
"""
|
||||
|
||||
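# A hedged sketch (illustrative, not from the original code) of the Gibbs covariance
# for a user-supplied lengthscale function l(x), matching the formula above:
#     import numpy as np
#     def gibbs_k(X, X2, variance, l):
#         lx, lx2 = l(X), l(X2)                 # (N,1) and (M,1) lengthscales
#         w2 = lx**2 + (lx2**2).T               # l(x)^2 + l(x')^2
#         r2 = np.square(X[:, None, :] - X2[None, :, :]).sum(-1)
#         Z = (2.*lx*lx2.T/w2)**(X.shape[1]/2.)
#         return variance*Z*np.exp(-r2/w2)
# _K_computations below does the same thing, with extra sign/abs handling so that
# negative (anticorrelating) lengthscales remain valid.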
def __init__(self, input_dim, variance=1., mapping=None, ARD=False):
|
||||
self.input_dim = input_dim
|
||||
self.ARD = ARD
|
||||
if not mapping:
|
||||
mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
|
||||
if not ARD:
|
||||
self.num_params=1+mapping.num_params
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
self.mapping = mapping
|
||||
self.name='gibbs'
|
||||
self._set_params(np.hstack((variance, self.mapping._get_params())))
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.variance, self.mapping._get_params()))
|
||||
|
||||
def _set_params(self, x):
|
||||
assert x.size == (self.num_params)
|
||||
self.variance = x[0]
|
||||
self.mapping._set_params(x[1:])
|
||||
|
||||
def _get_param_names(self):
|
||||
return ['variance'] + self.mapping._get_param_names()
|
||||
|
||||
def K(self, X, X2, target):
|
||||
"""Return covariance between X and X2."""
|
||||
self._K_computations(X, X2)
|
||||
target += self.variance*self._K_dvar
|
||||
|
||||
def Kdiag(self, X, target):
|
||||
"""Compute the diagonal of the covariance matrix for X."""
|
||||
np.add(target, self.variance, target)
|
||||
|
||||
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance with respect to the parameters."""
|
||||
self._K_computations(X, X2)
|
||||
self._dK_computations(dL_dK)
|
||||
if X2==None:
|
||||
gmapping = self.mapping.df_dtheta(2*self._dL_dl[:, None], X)
|
||||
else:
|
||||
gmapping = self.mapping.df_dtheta(self._dL_dl[:, None], X)
|
||||
gmapping += self.mapping.df_dtheta(self._dL_dl_two[:, None], X2)
|
||||
|
||||
target+= np.hstack([(dL_dK*self._K_dvar).sum(), gmapping])
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance matrix with respect to X."""
|
||||
# First account for gradients arising from presence of X in exponent.
|
||||
self._K_computations(X, X2)
|
||||
if X2 is None:
|
||||
_K_dist = 2*(X[:, None, :] - X[None, :, :])
|
||||
else:
|
||||
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_co
|
||||
dK_dX = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
|
||||
target += np.sum(dK_dX*dL_dK.T[:, :, None], 0)
|
||||
# Now account for gradients arising from presence of X in lengthscale.
|
||||
self._dK_computations(dL_dK)
|
||||
if X2 is None:
|
||||
target += 2.*self.mapping.df_dX(self._dL_dl[:, None], X)
|
||||
else:
|
||||
target += self.mapping.df_dX(self._dL_dl[:, None], X)
|
||||
|
||||
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||
"""Gradient of diagonal of covariance with respect to X."""
|
||||
pass
|
||||
|
||||
def dKdiag_dtheta(self, dL_dKdiag, X, target):
|
||||
"""Gradient of diagonal of covariance with respect to parameters."""
|
||||
target[0] += np.sum(dL_dKdiag)
|
||||
|
||||
|
||||
|
||||
def _K_computations(self, X, X2=None):
|
||||
"""Pre-computations for the covariance function (used both when computing the covariance and its gradients). Here self._dK_dvar and self._K_dist2 are updated."""
|
||||
self._lengthscales=self.mapping.f(X)
|
||||
self._lengthscales2=np.square(self._lengthscales)
|
||||
if X2==None:
|
||||
self._lengthscales_two = self._lengthscales
|
||||
self._lengthscales_two2 = self._lengthscales2
|
||||
Xsquare = np.square(X).sum(1)
|
||||
self._K_dist2 = -2.*tdot(X) + Xsquare[:, None] + Xsquare[None, :]
|
||||
else:
|
||||
self._lengthscales_two = self.mapping.f(X2)
|
||||
self._lengthscales_two2 = np.square(self._lengthscales_two)
|
||||
self._K_dist2 = -2.*np.dot(X, X2.T) + np.square(X).sum(1)[:, None] + np.square(X2).sum(1)[None, :]
|
||||
self._w2 = self._lengthscales2 + self._lengthscales_two2.T
|
||||
prod_length = self._lengthscales*self._lengthscales_two.T
|
||||
self._K_exponential = np.exp(-self._K_dist2/self._w2)
|
||||
self._K_dvar = np.sign(prod_length)*(2*np.abs(prod_length)/self._w2)**(self.input_dim/2.)*np.exp(-self._K_dist2/self._w2)
|
||||
|
||||
def _dK_computations(self, dL_dK):
|
||||
"""Pre-computations for the gradients of the covaraince function. Here the gradient of the covariance with respect to all the individual lengthscales is computed.
|
||||
:param dL_dK: the gradient of the objective with respect to the covariance function.
|
||||
:type dL_dK: ndarray"""
|
||||
|
||||
self._dL_dl = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales_two.T**4 - self._lengthscales**4) + 2*self._lengthscales2*self._K_dist2)/(self._w2*self._w2*self._lengthscales)).sum(1)
|
||||
if self._lengthscales_two is self._lengthscales:
|
||||
self._dL_dl_two = None
|
||||
else:
|
||||
self._dL_dl_two = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales**4 - self._lengthscales_two.T**4 ) + 2*self._lengthscales_two2.T*self._K_dist2)/(self._w2*self._w2*self._lengthscales_two.T)).sum(0)
|
||||
103
GPy/kern/parts/hetero.py
Normal file
@ -0,0 +1,103 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
from ...util.linalg import tdot
|
||||
from ...core.mapping import Mapping
|
||||
import GPy
|
||||
|
||||
class Hetero(Kernpart):
|
||||
"""
|
||||
TODO: Need to constrain the function outputs
|
||||
positive (still thinking of best way of doing this!!! Yes, intend to use
|
||||
transformations, but what's the *best* way). Currently just squaring output.
|
||||
|
||||
Heteroscedastic noise which depends on input location. See, for example,
|
||||
this paper by Goldberg et al.
|
||||
|
||||
.. math::
|
||||
|
||||
k(x_i, x_j) = \delta_{i,j} \sigma^2(x_i)
|
||||
|
||||
where :math:`\sigma^2(x)` is a function giving the variance as a function of input space and :math:`\delta_{i,j}` is the Kronecker delta function.
|
||||
|
||||
The parameters are the parameters of \sigma^2(x) which is a
|
||||
function that can be specified by the user, by default a
multi-layer perceptron is used.
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
|
||||
:type mapping: GPy.core.Mapping
|
||||
:rtype: Kernpart object
|
||||
|
||||
See this paper:
|
||||
|
||||
Goldberg, P. W. Williams, C. K. I. and Bishop,
|
||||
C. M. (1998) Regression with Input-dependent Noise: a Gaussian
|
||||
Process Treatment In Advances in Neural Information Processing
|
||||
Systems, Volume 10, pp. 493-499. MIT Press
|
||||
|
||||
for a Gaussian process treatment of this problem.
|
||||
|
||||
"""
|
||||
|
||||
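# A hedged illustration (illustrative, not from the original code): because of the
# Kronecker delta the full covariance is diagonal, e.g. for a variance function s2
#     K = np.diag(s2(X).flatten())
# which is why K() below only touches np.diag_indices_from(target), and why the code
# currently squares mapping.f(X) to keep the noise variance positive.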
def __init__(self, input_dim, mapping=None, transform=None):
|
||||
self.input_dim = input_dim
|
||||
if not mapping:
|
||||
mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
|
||||
if not transform:
|
||||
transform = GPy.core.transformations.logexp()
|
||||
|
||||
self.transform = transform
|
||||
self.mapping = mapping
|
||||
self.name='hetero'
|
||||
self.num_params=self.mapping.num_params
|
||||
self._set_params(self.mapping._get_params())
|
||||
|
||||
def _get_params(self):
|
||||
return self.mapping._get_params()
|
||||
|
||||
def _set_params(self, x):
|
||||
assert x.size == (self.num_params)
|
||||
self.mapping._set_params(x)
|
||||
|
||||
def _get_param_names(self):
|
||||
return self.mapping._get_param_names()
|
||||
|
||||
def K(self, X, X2, target):
|
||||
"""Return covariance between X and X2."""
|
||||
if (X2 is None) or (X2 is X):
|
||||
target[np.diag_indices_from(target)] += self._Kdiag(X)
|
||||
|
||||
def Kdiag(self, X, target):
|
||||
"""Compute the diagonal of the covariance matrix for X."""
|
||||
target+=self._Kdiag(X)
|
||||
|
||||
def _Kdiag(self, X):
|
||||
"""Helper function for computing the diagonal elements of the covariance."""
|
||||
return self.mapping.f(X).flatten()**2
|
||||
|
||||
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance with respect to the parameters."""
|
||||
if (X2 is None) or (X2 is X):
|
||||
dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
|
||||
self.dKdiag_dtheta(dL_dKdiag, X, target)
|
||||
|
||||
def dKdiag_dtheta(self, dL_dKdiag, X, target):
|
||||
"""Gradient of diagonal of covariance with respect to parameters."""
|
||||
target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None]*self.mapping.f(X), X)
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance matrix with respect to X."""
|
||||
if X2==None or X2 is X:
|
||||
dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
|
||||
self.dKdiag_dX(dL_dKdiag, X, target)
|
||||
|
||||
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||
"""Gradient of diagonal of covariance with respect to X."""
|
||||
target += 2.*self.mapping.df_dX(dL_dKdiag[:, None], X)*self.mapping.f(X)
|
||||
|
||||
|
||||
|
||||
76
GPy/kern/parts/hierarchical.py
Normal file
@ -0,0 +1,76 @@
# Copyright (c) 2012, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
from independent_outputs import index_to_slices
|
||||
|
||||
class Hierarchical(Kernpart):
|
||||
"""
|
||||
A kernel part which can represent a hierarchy of independence: a generalisation of independent_outputs
|
||||
|
||||
"""
|
||||
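# A hedged example (illustrative, not from the original code) of the expected input
# layout: with levels=2 the last two columns of X hold integer indices, one per level
# of the hierarchy, e.g. (the column names here are purely illustrative)
#     X = np.hstack([X_data, top_level_index[:, None], group_index[:, None]])
# _sort_slices below peels these columns off and converts each one to slices with
# index_to_slices, so every part only ever sees the rows it is responsible for.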
def __init__(self,parts):
|
||||
self.levels = len(parts)
|
||||
self.input_dim = parts[0].input_dim + 1
|
||||
self.num_params = np.sum([k.num_params for k in parts])
|
||||
self.name = 'hierarchy'
|
||||
self.parts = parts
|
||||
|
||||
self.param_starts = np.hstack((0,np.cumsum([k.num_params for k in self.parts[:-1]])))
|
||||
self.param_stops = np.cumsum([k.num_params for k in self.parts])
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack([k._get_params() for k in self.parts])
|
||||
|
||||
def _set_params(self,x):
|
||||
[k._set_params(x[start:stop]) for k, start, stop in zip(self.parts, self.param_starts, self.param_stops)]
|
||||
|
||||
def _get_param_names(self):
|
||||
return sum([[str(i)+'_'+k.name+'_'+n for n in k._get_param_names()] for i,k in enumerate(self.parts)],[])
|
||||
|
||||
def _sort_slices(self,X,X2):
|
||||
slices = [index_to_slices(x) for x in X[:,-self.levels:].T]
|
||||
X = X[:,:-self.levels]
|
||||
if X2 is None:
|
||||
slices2 = slices
|
||||
X2 = X
|
||||
else:
|
||||
slices2 = [index_to_slices(x) for x in X2[:,-self.levels:].T]
|
||||
X2 = X2[:,:-self.levels]
|
||||
return X, X2, slices, slices2
|
||||
|
||||
def K(self,X,X2,target):
|
||||
X, X2, slices, slices2 = self._sort_slices(X,X2)
|
||||
|
||||
[[[[k.K(X[s],X2[s2],target[s,s2]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_,slices2_)] for k, slices_, slices2_ in zip(self.parts,slices,slices2)]
|
||||
|
||||
def Kdiag(self,X,target):
|
||||
raise NotImplementedError
|
||||
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
#[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices]
|
||||
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
X, X2, slices, slices2 = self._sort_slices(X,X2)
|
||||
[[[[k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)]
|
||||
|
||||
|
||||
def dK_dX(self,dL_dK,X,X2,target):
|
||||
raise NotImplementedError
|
||||
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
#if X2 is None:
|
||||
#X2,slices2 = X,slices
|
||||
#else:
|
||||
#X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
#[[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
|
||||
#
|
||||
def dKdiag_dX(self,dL_dKdiag,X,target):
|
||||
raise NotImplementedError
|
||||
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
#[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target[s,:-1]) for s in slices_i] for slices_i in slices]
|
||||
|
||||
|
||||
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
||||
raise NotImplementedError
|
||||
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
#[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target) for s in slices_i] for slices_i in slices]
|
||||
125
GPy/kern/parts/kernpart.py
Normal file
@ -0,0 +1,125 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np  # needed by Kernpart_stationary and Kernpart_inner below

class Kernpart(object):
|
||||
def __init__(self,input_dim):
|
||||
"""
|
||||
The base class for a kernpart: a positive definite function which forms part of a covariance function (kernel).
|
||||
|
||||
:param input_dim: the number of input dimensions to the function
|
||||
:type input_dim: int
|
||||
|
||||
Do not instantiate.
|
||||
"""
|
||||
# the input dimensionality for the covariance
|
||||
self.input_dim = input_dim
|
||||
# the number of optimisable parameters
|
||||
self.num_params = 1
|
||||
# the name of the covariance function.
|
||||
self.name = 'unnamed'
|
||||
|
||||
def _get_params(self):
|
||||
raise NotImplementedError
|
||||
def _set_params(self,x):
|
||||
raise NotImplementedError
|
||||
def _get_param_names(self):
|
||||
raise NotImplementedError
|
||||
def K(self,X,X2,target):
|
||||
raise NotImplementedError
|
||||
def Kdiag(self,X,target):
|
||||
raise NotImplementedError
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
raise NotImplementedError
|
||||
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
||||
# In the base case compute this by calling dK_dtheta. Need to
|
||||
# override for stationary covariances (for example) to save
|
||||
# time.
|
||||
for i in range(X.shape[0]):
|
||||
self.dK_dtheta(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
|
||||
def psi0(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
|
||||
raise NotImplementedError
|
||||
def psi1(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi1_dtheta(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
|
||||
raise NotImplementedError
|
||||
def psi2(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
|
||||
raise NotImplementedError
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
raise NotImplementedError
|
||||
def dKdiag_dX(self, dL_dK, X, target):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
|
||||
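# A hedged sketch (illustrative, not from the original code) of the minimal interface
# a new part must provide, using the accumulate-into-target convention above; it
# loosely mirrors the existing white-noise part:
#     class White(Kernpart):
#         def __init__(self, input_dim, variance=1.):
#             Kernpart.__init__(self, input_dim)
#             self.name, self.num_params = 'white', 1
#             self._set_params(np.array([variance]))
#         def _get_params(self): return np.array([self.variance])
#         def _set_params(self, x): self.variance = x[0]
#         def _get_param_names(self): return ['variance']
#         def K(self, X, X2, target):
#             if X2 is None or X2 is X:
#                 target[np.diag_indices_from(target)] += self.variance
#         def Kdiag(self, X, target):
#             target += self.variance
#         def dK_dtheta(self, dL_dK, X, X2, target):
#             if X2 is None or X2 is X:
#                 target[0] += np.trace(dL_dK)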
class Kernpart_stationary(Kernpart):
|
||||
def __init__(self, input_dim, lengthscale=None, ARD=False):
|
||||
self.input_dim = input_dim
|
||||
self.ARD = ARD
|
||||
if not ARD:
|
||||
self.num_params = 2
|
||||
if lengthscale is not None:
|
||||
self.lengthscale = np.asarray(lengthscale)
|
||||
assert self.lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
|
||||
else:
|
||||
self.lengthscale = np.ones(1)
|
||||
else:
|
||||
self.num_params = self.input_dim + 1
|
||||
if lengthscale is not None:
|
||||
self.lengthscale = np.asarray(lengthscale)
|
||||
assert self.lengthscale.size == self.input_dim, "bad number of lengthscales"
|
||||
else:
|
||||
self.lengthscale = np.ones(self.input_dim)
|
||||
|
||||
# initialize cache
|
||||
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
|
||||
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||
|
||||
def _set_params(self, x):
|
||||
self.lengthscale = x
|
||||
self.lengthscale2 = np.square(self.lengthscale)
|
||||
# reset cached results
|
||||
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
|
||||
|
||||
|
||||
def dKdiag_dtheta(self, dL_dKdiag, X, target):
|
||||
# For stationary covariances, derivative of diagonal elements
|
||||
# wrt lengthscale is 0.
|
||||
target[0] += np.sum(dL_dKdiag)
|
||||
|
||||
def dKdiag_dX(self, dL_dK, X, target):
|
||||
pass # true for all stationary kernels
|
||||
|
||||
|
||||
class Kernpart_inner(Kernpart):
|
||||
def __init__(self,input_dim):
|
||||
"""
|
||||
The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs.
|
||||
|
||||
:param input_dim: the number of input dimensions to the function
|
||||
:type input_dim: int
|
||||
|
||||
Do not instantiate.
|
||||
"""
|
||||
Kernpart.__init__(self, input_dim)
|
||||
|
||||
# initialize cache
|
||||
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
|
||||
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||
|
||||
|
||||
|
|
@ -4,10 +4,12 @@
|
|||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
from ..util.linalg import tdot
|
||||
from ...util.linalg import tdot
|
||||
from ...util.misc import fast_array_equal
|
||||
from scipy import weave
|
||||
from ...util.config import *
|
||||
|
||||
class linear(Kernpart):
|
||||
class Linear(Kernpart):
|
||||
"""
|
||||
Linear kernel
|
||||
|
||||
|
|
@ -50,6 +52,26 @@ class linear(Kernpart):
|
|||
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
|
||||
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||
|
||||
# a set of optional args to pass to weave
|
||||
weave_options_openmp = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'],
|
||||
'extra_link_args' : ['-lgomp'],
|
||||
'libraries': ['gomp']}
|
||||
weave_options_noopenmp = {'extra_compile_args': ['-O3']}
|
||||
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
self.weave_options = weave_options_openmp
|
||||
self.weave_support_code = """
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
"""
|
||||
else:
|
||||
self.weave_options = weave_options_noopenmp
|
||||
self.weave_support_code = """
|
||||
#include <math.h>
|
||||
"""
|
||||
|
||||
def _get_params(self):
|
||||
return self.variances
|
||||
|
||||
|
|
@ -98,7 +120,10 @@ class linear(Kernpart):
|
|||
target += tmp.sum()
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
|
||||
if X2 is None:
|
||||
target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
|
||||
else:
|
||||
target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
|
||||
|
||||
def dKdiag_dX(self,dL_dKdiag,X,target):
|
||||
target += 2.*self.variances*dL_dKdiag[:,None]*X
|
||||
|
|
@ -140,28 +165,24 @@ class linear(Kernpart):
|
|||
self.dK_dX(dL_dpsi1.T, Z, mu, target)
|
||||
|
||||
def psi2(self, Z, mu, S, target):
|
||||
"""
|
||||
returns N,num_inducing,num_inducing matrix
|
||||
"""
|
||||
self._psi_computations(Z, mu, S)
|
||||
# psi2_old = self.ZZ * np.square(self.variances) * self.mu2_S[:, None, None, :]
|
||||
# target += psi2.sum(-1)
|
||||
# slow way of doing it, but right
|
||||
# psi2_real = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
|
||||
# for n in range(mu.shape[0]):
|
||||
# for m_prime in range(Z.shape[0]):
|
||||
# for m in range(Z.shape[0]):
|
||||
# tmp = self._Z[m:m + 1] * self.variances
|
||||
# tmp = np.dot(tmp, (tdot(self._mu[n:n + 1].T) + np.diag(S[n])))
|
||||
# psi2_real[n, m, m_prime] = np.dot(tmp, (
|
||||
# self._Z[m_prime:m_prime + 1] * self.variances).T)
|
||||
# mu2_S = (self._mu[:, None, :] * self._mu[:, :, None])
|
||||
# mu2_S[:, np.arange(self.input_dim), np.arange(self.input_dim)] += self._S
|
||||
# psi2 = (self.ZA[None, :, None, :] * mu2_S[:, None]).sum(-1)
|
||||
# psi2 = (psi2[:, :, None] * self.ZA[None, None]).sum(-1)
|
||||
# psi2_tensor = np.tensordot(self.ZZ[None, :, :, :] * np.square(self.variances), self.mu2_S[:, None, None, :], ((3), (3))).squeeze().T
|
||||
target += self._psi2
|
||||
|
||||
def psi2_new(self,Z,mu,S,target):
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
self.K(mu,Z,tmp)
|
||||
target += tmp[:,:,None]*tmp[:,None,:] + np.sum(S[:,None,None,:]*self.variances**2*Z[None,:,None,:]*Z[None,None,:,:],-1)
|
||||
|
||||
def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target):
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
self.K(mu,Z,tmp)
|
||||
self.dK_dtheta(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target)
|
||||
result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0)
|
||||
if self.ARD:
|
||||
target += result.sum(0).sum(0).sum(0)
|
||||
else:
|
||||
target += result.sum()
|
||||
|
||||
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
|
||||
self._psi_computations(Z, mu, S)
|
||||
tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
|
||||
|
|
@ -170,6 +191,15 @@ class linear(Kernpart):
|
|||
else:
|
||||
target += tmp.sum()
|
||||
|
||||
def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
|
||||
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||
self.K(mu,Z,tmp)
|
||||
self.dK_dX(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu)
|
||||
|
||||
Zs = Z*self.variances
|
||||
Zs_sq = Zs[:,None,:]*Zs[None,:,:]
|
||||
target_S += (dL_dpsi2[:,:,:,None]*Zs_sq[None,:,:,:]).sum(1).sum(1)
|
||||
|
||||
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
|
||||
"""Think N,num_inducing,num_inducing,input_dim """
|
||||
self._psi_computations(Z, mu, S)
|
||||
|
|
@ -181,11 +211,17 @@ class linear(Kernpart):
|
|||
#target_mu_dummy += (dL_dpsi2[:, :, :, None] * muAZZA).sum(1).sum(1)
|
||||
#target_S_dummy += (dL_dpsi2[:, :, :, None] * self.ZA[None, :, None, :] * self.ZA[None, None, :, :]).sum(1).sum(1)
|
||||
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
pragma_string = "#pragma omp parallel for private(m,mm,q,qq,factor,tmp)"
|
||||
else:
|
||||
pragma_string = ''
|
||||
|
||||
# Using weave, we can exploit the symmetry of this problem:
|
||||
code = """
|
||||
int n, m, mm,q,qq;
|
||||
double factor,tmp;
|
||||
#pragma omp parallel for private(m,mm,q,qq,factor,tmp)
|
||||
%s
|
||||
for(n=0;n<N;n++){
|
||||
for(m=0;m<num_inducing;m++){
|
||||
for(mm=0;mm<=m;mm++){
|
||||
|
|
@ -209,19 +245,13 @@ class linear(Kernpart):
|
|||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
support_code = """
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
"""
|
||||
weave_options = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
|
||||
'extra_link_args' : ['-lgomp']}
|
||||
""" % pragma_string
|
||||
|
||||
N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
|
||||
weave.inline(code, support_code=support_code, libraries=['gomp'],
|
||||
arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
|
||||
type_converters=weave.converters.blitz,**weave_options)
|
||||
|
||||
N,num_inducing,input_dim = int(mu.shape[0]),int(Z.shape[0]),int(mu.shape[1])
|
||||
weave.inline(code, support_code=self.weave_support_code,
|
||||
arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
|
||||
type_converters=weave.converters.blitz,**self.weave_options)
|
||||
|
||||
|
||||
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
|
||||
|
|
@ -231,9 +261,15 @@ class linear(Kernpart):
|
|||
#dummy_target += psi2_dZ.sum(0).sum(0)
|
||||
|
||||
AZA = self.variances*self.ZAinner
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
pragma_string = '#pragma omp parallel for private(n,mm,q)'
|
||||
else:
|
||||
pragma_string = ''
|
||||
|
||||
code="""
|
||||
int n,m,mm,q;
|
||||
#pragma omp parallel for private(n,mm,q)
|
||||
%s
|
||||
for(m=0;m<num_inducing;m++){
|
||||
for(q=0;q<input_dim;q++){
|
||||
for(mm=0;mm<num_inducing;mm++){
|
||||
|
|
@ -243,22 +279,13 @@ class linear(Kernpart):
|
|||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
support_code = """
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
"""
|
||||
weave_options = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
|
||||
'extra_link_args' : ['-lgomp']}
|
||||
""" % pragma_string
|
||||
|
||||
N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
|
||||
weave.inline(code, support_code=support_code, libraries=['gomp'],
|
||||
|
||||
N,num_inducing,input_dim = int(mu.shape[0]),int(Z.shape[0]),int(mu.shape[1])
|
||||
weave.inline(code, support_code=self.weave_support_code,
|
||||
arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
|
||||
type_converters=weave.converters.blitz,**weave_options)
|
||||
|
||||
|
||||
|
||||
type_converters=weave.converters.blitz,**self.weave_options)
|
||||
|
||||
|
||||
#---------------------------------------#
|
||||
|
|
@ -266,7 +293,7 @@ class linear(Kernpart):
|
|||
#---------------------------------------#
|
||||
|
||||
def _K_computations(self, X, X2):
|
||||
if not (np.array_equal(X, self._Xcache) and np.array_equal(X2, self._X2cache)):
|
||||
if not (fast_array_equal(X, self._Xcache) and fast_array_equal(X2, self._X2cache)):
|
||||
self._Xcache = X.copy()
|
||||
if X2 is None:
|
||||
self._dot_product = tdot(X)
|
||||
|
|
@ -277,8 +304,8 @@ class linear(Kernpart):
|
|||
|
||||
def _psi_computations(self, Z, mu, S):
|
||||
# here are the "statistics" for psi1 and psi2
|
||||
Zv_changed = not (np.array_equal(Z, self._Z) and np.array_equal(self.variances, self._variances))
|
||||
muS_changed = not (np.array_equal(mu, self._mu) and np.array_equal(S, self._S))
|
||||
Zv_changed = not (fast_array_equal(Z, self._Z) and fast_array_equal(self.variances, self._variances))
|
||||
muS_changed = not (fast_array_equal(mu, self._mu) and fast_array_equal(S, self._S))
|
||||
if Zv_changed:
|
||||
# Z has changed, compute Z specific stuff
|
||||
# self.ZZ = Z[:,None,:]*Z[None,:,:] # num_inducing,num_inducing,input_dim
|
||||
162
GPy/kern/parts/mlp.py
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
four_over_tau = 2./np.pi
|
||||
|
||||
class MLP(Kernpart):
|
||||
"""
|
||||
|
||||
Multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)
|
||||
|
||||
.. math::
|
||||
|
||||
k(x,y) = \\sigma^{2}\\frac{2}{\\pi } \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y + \\sigma_b^2 +1}} \\right )
|
||||
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance :math:`\sigma^2`
|
||||
:type variance: float
|
||||
:param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
|
||||
:type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
|
||||
:param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
|
||||
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
|
||||
:type ARD: Boolean
|
||||
:rtype: Kernpart object
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False):
|
||||
self.input_dim = input_dim
|
||||
self.ARD = ARD
|
||||
if not ARD:
|
||||
self.num_params=3
|
||||
if weight_variance is not None:
|
||||
weight_variance = np.asarray(weight_variance)
|
||||
assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
|
||||
else:
|
||||
weight_variance = 100.*np.ones(1)
|
||||
else:
|
||||
self.num_params = self.input_dim + 2
|
||||
if weight_variance is not None:
|
||||
weight_variance = np.asarray(weight_variance)
|
||||
assert weight_variance.size == self.input_dim, "bad number of weight variances"
|
||||
else:
|
||||
weight_variance = np.ones(self.input_dim)
|
||||
raise NotImplementedError
|
||||
|
||||
self.name='mlp'
|
||||
self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))
|
||||
|
||||
def _set_params(self, x):
|
||||
assert x.size == (self.num_params)
|
||||
self.variance = x[0]
|
||||
self.weight_variance = x[1:-1]
|
||||
self.weight_std = np.sqrt(self.weight_variance)
|
||||
self.bias_variance = x[-1]
|
||||
|
||||
def _get_param_names(self):
|
||||
if self.num_params == 3:
|
||||
return ['variance', 'weight_variance', 'bias_variance']
|
||||
else:
|
||||
return ['variance'] + ['weight_variance_%i' % i for i in range(self.weight_variance.size)] + ['bias_variance']
|
||||
|
||||
def K(self, X, X2, target):
|
||||
"""Return covariance between X and X2."""
|
||||
self._K_computations(X, X2)
|
||||
target += self.variance*self._K_dvar
|
||||
|
||||
def Kdiag(self, X, target):
|
||||
"""Compute the diagonal of the covariance matrix for X."""
|
||||
self._K_diag_computations(X)
|
||||
target+= self.variance*self._K_diag_dvar
|
||||
|
||||
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance with respect to the parameters."""
|
||||
self._K_computations(X, X2)
|
||||
denom3 = self._K_denom*self._K_denom*self._K_denom
|
||||
base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
|
||||
base_cov_grad = base*dL_dK
|
||||
|
||||
if X2 is None:
|
||||
vec = np.diag(self._K_inner_prod)
|
||||
target[1] += ((self._K_inner_prod/self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
|
||||
+np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
|
||||
target[2] += ((1./self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*((vec[None, :]+vec[:, None])*self.weight_variance
|
||||
+2.*self.bias_variance + 2.))*base_cov_grad).sum()
|
||||
else:
|
||||
vec1 = (X*X).sum(1)
|
||||
vec2 = (X2*X2).sum(1)
|
||||
target[1] += ((self._K_inner_prod/self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
|
||||
target[2] += ((1./self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*((vec1[:, None]+vec2[None, :])*self.weight_variance
|
||||
+ 2*self.bias_variance + 2.))*base_cov_grad).sum()
|
||||
|
||||
target[0] += np.sum(self._K_dvar*dL_dK)
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance matrix with respect to X"""
|
||||
self._K_computations(X, X2)
|
||||
arg = self._K_asin_arg
|
||||
numer = self._K_numer
|
||||
denom = self._K_denom
|
||||
denom3 = denom*denom*denom
|
||||
if X2 is not None:
|
||||
vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
|
||||
target += four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
|
||||
else:
|
||||
vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
|
||||
target += 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
|
||||
|
||||
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||
"""Gradient of diagonal of covariance with respect to X"""
|
||||
self._K_diag_computations(X)
|
||||
arg = self._K_diag_asin_arg
|
||||
denom = self._K_diag_denom
|
||||
numer = self._K_diag_numer
|
||||
target += four_over_tau*2.*self.weight_variance*self.variance*X*(1/denom*(1 - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
|
||||
|
||||
|
||||
def _K_computations(self, X, X2):
|
||||
"""Pre-computations for the covariance matrix (used for computing the covariance and its gradients."""
|
||||
if self.ARD:
|
||||
pass
|
||||
else:
|
||||
if X2 is None:
|
||||
self._K_inner_prod = np.dot(X,X.T)
|
||||
self._K_numer = self._K_inner_prod*self.weight_variance+self.bias_variance
|
||||
vec = np.diag(self._K_numer) + 1.
|
||||
self._K_denom = np.sqrt(np.outer(vec,vec))
|
||||
self._K_asin_arg = self._K_numer/self._K_denom
|
||||
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
|
||||
else:
|
||||
self._K_inner_prod = np.dot(X,X2.T)
|
||||
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
|
||||
vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
|
||||
vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
|
||||
self._K_denom = np.sqrt(np.outer(vec1,vec2))
|
||||
self._K_asin_arg = self._K_numer/self._K_denom
|
||||
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
|
||||
|
||||
def _K_diag_computations(self, X):
|
||||
"""Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients)."""
|
||||
if self.ARD:
|
||||
pass
|
||||
else:
|
||||
self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
|
||||
self._K_diag_denom = self._K_diag_numer+1.
|
||||
self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
|
||||
self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
|
||||
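A minimal sketch (illustration only, not part of this diff) evaluating the arc-sine covariance of the MLP kernel above directly with numpy; the inputs and parameter values are made up:

import numpy as np

# made-up parameter values: variance, weight_variance, bias_variance
sigma2, sigma2_w, sigma2_b = 1.0, 2.0, 100.0
x, y = np.array([0.5, -0.3]), np.array([-1.2, 0.7])

numer = sigma2_w * x.dot(y) + sigma2_b
denom = np.sqrt(sigma2_w * x.dot(x) + sigma2_b + 1.) * np.sqrt(sigma2_w * y.dot(y) + sigma2_b + 1.)
k_xy = sigma2 * (2. / np.pi) * np.arcsin(numer / denom)
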
38
GPy/kern/parts/odekern1.c
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
#include <math.h>

/* covariance of the latent process u between times t1 and t2 */
double k_uu(double t1, double t2, double theta1, double theta2, double sig1, double sig2)
{
    double kern = 0;
    double dist = 0;

    dist = sqrt(t2*t2 - t1*t1);

    kern = sig1*(1 + theta1*dist)*exp(-theta1*dist);

    return kern;
}


/* covariance of the output process y between times t1 and t2 */
double k_yy(double t1, double t2, double theta1, double theta2, double sig1, double sig2)
{
    double kern = 0;
    double dist = 0;

    dist = sqrt(t2*t2 - t1*t1);

    kern = sig1*sig2 * ( exp(-theta1*dist)*(theta2 - 2*theta1 + theta1*theta2*dist - theta1*theta1*dist) +
                         exp(-dist) ) / ((theta2-theta1)*(theta2-theta1));

    return kern;
}
|
|
@ -7,7 +7,7 @@ import numpy as np
|
|||
from GPy.util.linalg import mdot
|
||||
from GPy.util.decorators import silence_errors
|
||||
|
||||
class periodic_Matern32(Kernpart):
|
||||
class PeriodicMatern32(Kernpart):
|
||||
"""
|
||||
Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for input_dim=1.
|
||||
|
||||
|
|
@ -113,7 +113,7 @@ class periodic_Matern32(Kernpart):
|
|||
|
||||
@silence_errors
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
|
||||
if X2 is None: X2 = X
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
|
|
@ -7,7 +7,7 @@ import numpy as np
|
|||
from GPy.util.linalg import mdot
|
||||
from GPy.util.decorators import silence_errors
|
||||
|
||||
class periodic_Matern52(Kernpart):
|
||||
class PeriodicMatern52(Kernpart):
|
||||
"""
|
||||
Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1.
|
||||
|
||||
|
|
@ -115,7 +115,7 @@ class periodic_Matern52(Kernpart):
|
|||
|
||||
@silence_errors
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
|
||||
if X2 is None: X2 = X
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
|
|
@ -7,7 +7,7 @@ import numpy as np
|
|||
from GPy.util.linalg import mdot
|
||||
from GPy.util.decorators import silence_errors
|
||||
|
||||
class periodic_exponential(Kernpart):
|
||||
class PeriodicExponential(Kernpart):
|
||||
"""
|
||||
Kernel of the periodic subspace (up to a given frequency) of a exponential (Matern 1/2) RKHS. Only defined for input_dim=1.
|
||||
|
||||
|
|
@ -111,7 +111,7 @@ class periodic_exponential(Kernpart):
|
|||
|
||||
@silence_errors
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
|
||||
if X2 is None: X2 = X
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
138
GPy/kern/parts/poly.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
four_over_tau = 2./np.pi
|
||||
|
||||
class POLY(Kernpart):
|
||||
"""
|
||||
|
||||
Polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel:
|
||||
|
||||
.. math::
|
||||
k(x, y) = \sigma^2 (\sigma_w^2 x^\top y + \sigma_b^2)^d
|
||||
|
||||
The kernel parameters are :math:`\sigma^2` (variance), :math:`\sigma^2_w`
|
||||
(weight_variance), :math:`\sigma^2_b` (bias_variance) and d
|
||||
(degree). Only gradients of the first three are provided for
|
||||
kernel optimisation, it is assumed that polynomial degree would
|
||||
be set by hand.
|
||||
|
||||
The kernel is not recommended as it is badly behaved when
:math:`\sigma^2_w x^\top y + \sigma^2_b` has a magnitude greater than one. For completeness
there is an automatic relevance determination version of this
kernel provided (NOT YET IMPLEMENTED).
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance :math:`\sigma^2`
|
||||
:type variance: float
|
||||
:param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
|
||||
:type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
|
||||
:param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
|
||||
:param degree: the degree of the polynomial.
|
||||
:type degree: int
|
||||
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
|
||||
:type ARD: Boolean
|
||||
:rtype: Kernpart object
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=1., degree=2, ARD=False):
|
||||
self.input_dim = input_dim
|
||||
self.ARD = ARD
|
||||
if not ARD:
|
||||
self.num_params=3
|
||||
if weight_variance is not None:
|
||||
weight_variance = np.asarray(weight_variance)
|
||||
assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
|
||||
else:
|
||||
weight_variance = 1.*np.ones(1)
|
||||
else:
|
||||
self.num_params = self.input_dim + 2
|
||||
if weight_variance is not None:
|
||||
weight_variance = np.asarray(weight_variance)
|
||||
assert weight_variance.size == self.input_dim, "bad number of weight variances"
|
||||
else:
|
||||
weight_variance = np.ones(self.input_dim)
|
||||
raise NotImplementedError
|
||||
self.degree=degree
|
||||
self.name='poly_deg' + str(self.degree)
|
||||
self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))
|
||||
|
||||
def _set_params(self, x):
|
||||
assert x.size == (self.num_params)
|
||||
self.variance = x[0]
|
||||
self.weight_variance = x[1:-1]
|
||||
self.weight_std = np.sqrt(self.weight_variance)
|
||||
self.bias_variance = x[-1]
|
||||
|
||||
def _get_param_names(self):
|
||||
if self.num_params == 3:
|
||||
return ['variance', 'weight_variance', 'bias_variance']
|
||||
else:
|
||||
return ['variance'] + ['weight_variance_%i' % i for i in range(self.weight_variance.size)] + ['bias_variance']
|
||||
|
||||
def K(self, X, X2, target):
|
||||
"""Return covariance between X and X2."""
|
||||
self._K_computations(X, X2)
|
||||
target += self.variance*self._K_dvar
|
||||
|
||||
def Kdiag(self, X, target):
|
||||
"""Compute the diagonal of the covariance matrix for X."""
|
||||
self._K_diag_computations(X)
|
||||
target+= self.variance*self._K_diag_dvar
|
||||
|
||||
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance with respect to the parameters."""
|
||||
self._K_computations(X, X2)
|
||||
base = self.variance*self.degree*self._K_poly_arg**(self.degree-1)
|
||||
base_cov_grad = base*dL_dK
|
||||
|
||||
|
||||
|
||||
target[0] += np.sum(self._K_dvar*dL_dK)
|
||||
target[1] += (self._K_inner_prod*base_cov_grad).sum()
|
||||
target[2] += base_cov_grad.sum()
|
||||
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
"""Derivative of the covariance matrix with respect to X"""
|
||||
self._K_computations(X, X2)
|
||||
arg = self._K_poly_arg
|
||||
if X2 is None:
|
||||
target += 2*self.weight_variance*self.degree*self.variance*(((X[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
|
||||
else:
|
||||
target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
|
||||
|
||||
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||
"""Gradient of diagonal of covariance with respect to X"""
|
||||
self._K_diag_computations(X)
|
||||
arg = self._K_diag_poly_arg
|
||||
target += 2.*self.weight_variance*self.degree*self.variance*X*dL_dKdiag[:, None]*(arg**(self.degree-1))[:, None]
|
||||
|
||||
|
||||
def _K_computations(self, X, X2):
|
||||
if self.ARD:
|
||||
pass
|
||||
else:
|
||||
if X2 is None:
|
||||
self._K_inner_prod = np.dot(X,X.T)
|
||||
else:
|
||||
self._K_inner_prod = np.dot(X,X2.T)
|
||||
self._K_poly_arg = self._K_inner_prod*self.weight_variance + self.bias_variance
|
||||
self._K_dvar = self._K_poly_arg**self.degree
|
||||
|
||||
def _K_diag_computations(self, X):
|
||||
if self.ARD:
|
||||
pass
|
||||
else:
|
||||
self._K_diag_poly_arg = (X*X).sum(1)*self.weight_variance + self.bias_variance
|
||||
self._K_diag_dvar = self._K_diag_poly_arg**self.degree
|
||||
|
|
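A minimal sketch (illustration only, not part of this diff) evaluating the polynomial covariance of the poly kernel above with numpy; the inputs and parameter values are made up:

import numpy as np

# made-up parameter values: variance, weight_variance, bias_variance, degree
sigma2, sigma2_w, sigma2_b, degree = 1.0, 1.0, 1.0, 2
x, y = np.array([0.3, -0.1]), np.array([0.2, 0.4])

k_xy = sigma2 * (sigma2_w * x.dot(y) + sigma2_b) ** degree
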
@ -2,10 +2,11 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
from coregionalize import Coregionalize
|
||||
import numpy as np
|
||||
import hashlib
|
||||
|
||||
class prod(Kernpart):
|
||||
class Prod(Kernpart):
|
||||
"""
|
||||
Computes the product of 2 kernels
|
||||
|
||||
|
|
@ -18,7 +19,10 @@ class prod(Kernpart):
|
|||
"""
|
||||
def __init__(self,k1,k2,tensor=False):
|
||||
self.num_params = k1.num_params + k2.num_params
|
||||
self.name = k1.name + '<times>' + k2.name
|
||||
if tensor:
|
||||
self.name = '['+k1.name + '**' + k2.name +']'
|
||||
else:
|
||||
self.name = '['+k1.name + '*' + k2.name +']'
|
||||
self.k1 = k1
|
||||
self.k2 = k2
|
||||
if tensor:
|
||||
|
|
@ -51,8 +55,18 @@ class prod(Kernpart):
|
|||
self._K_computations(X,X2)
|
||||
target += self._K1 * self._K2
|
||||
|
||||
def K1(self,X, X2):
|
||||
"""Compute the part of the kernel associated with k1."""
|
||||
self._K_computations(X, X2)
|
||||
return self._K1
|
||||
|
||||
def K2(self, X, X2):
|
||||
"""Compute the part of the kernel associated with k2."""
|
||||
self._K_computations(X, X2)
|
||||
return self._K2
|
||||
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
"""Derivative of the covariance matrix with respect to the parameters."""
|
||||
self._K_computations(X,X2)
|
||||
if X2 is None:
|
||||
self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
|
||||
|
|
@ -80,8 +94,18 @@ class prod(Kernpart):
|
|||
def dK_dX(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to X."""
|
||||
self._K_computations(X,X2)
|
||||
self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target)
|
||||
self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target)
|
||||
if X2 is None:
|
||||
if not isinstance(self.k1,Coregionalize) and not isinstance(self.k2,Coregionalize):
|
||||
self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1])
|
||||
self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2])
|
||||
else:#if isinstance(self.k1,Coregionalize) or isinstance(self.k2,Coregionalize):
|
||||
#NOTE The indices column in the inputs makes the ki.dK_dX fail when passing None instead of X[:,self.slicei]
|
||||
X2 = X
|
||||
self.k1.dK_dX(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
|
||||
self.k2.dK_dX(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
|
||||
else:
|
||||
self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
|
||||
self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
|
||||
|
||||
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||
K1 = np.zeros(X.shape[0])
|
||||
|
|
@ -89,8 +113,8 @@ class prod(Kernpart):
|
|||
self.k1.Kdiag(X[:,self.slice1],K1)
|
||||
self.k2.Kdiag(X[:,self.slice2],K2)
|
||||
|
||||
self.k1.dK_dX(dL_dKdiag*K2, X[:,self.slice1], target)
|
||||
self.k2.dK_dX(dL_dKdiag*K1, X[:,self.slice2], target)
|
||||
self.k1.dK_dX(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1])
|
||||
self.k2.dK_dX(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2])
|
||||
|
||||
def _K_computations(self,X,X2):
|
||||
if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
|
||||
|
|
@ -109,3 +133,13 @@ class prod(Kernpart):
|
|||
self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1)
|
||||
self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2)
|
||||
|
||||
#def __getstate__(self):
|
||||
#return [self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params]
|
||||
|
||||
#def __setstate__(self, state):
|
||||
#self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params = state
|
||||
#self._X, self._X2, self._params = np.empty(shape=(3,1))
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
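A minimal sketch (illustration only, not part of this diff) of what the product part above computes: the two covariances are simply multiplied element-wise, e.g. an RBF part times a linear part, with made-up values:

import numpy as np

x, y = np.array([0.2, -0.5]), np.array([1.0, 0.3])
k_rbf = 1.0 * np.exp(-0.5 * np.sum(np.square(x - y)))  # RBF part, variance 1, lengthscale 1
k_lin = 0.5 * x.dot(y)                                  # linear part, variance 0.5
k_prod = k_rbf * k_lin                                  # the Prod part returns the product
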
@ -5,7 +5,7 @@
|
|||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
|
||||
class rational_quadratic(Kernpart):
|
||||
class RationalQuadratic(Kernpart):
|
||||
"""
|
||||
rational quadratic kernel
|
||||
|
||||
|
|
@ -57,7 +57,7 @@ class rational_quadratic(Kernpart):
|
|||
dist2 = np.square((X-X2.T)/self.lengthscale)
|
||||
|
||||
dvar = (1 + dist2/2.)**(-self.power)
|
||||
dl = self.power * self.variance * dist2 * self.lengthscale**(-3) * (1 + dist2/2./self.power)**(-self.power-1)
|
||||
dl = self.power * self.variance * dist2 / self.lengthscale * (1 + dist2/2.)**(-self.power-1)
|
||||
dp = - self.variance * np.log(1 + dist2/2.) * (1 + dist2/2.)**(-self.power)
|
||||
|
||||
target[0] += np.sum(dvar*dL_dK)
|
||||
|
|
@ -70,10 +70,12 @@ class rational_quadratic(Kernpart):
|
|||
|
||||
def dK_dX(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to X."""
|
||||
if X2 is None: X2 = X
|
||||
dist2 = np.square((X-X2.T)/self.lengthscale)
|
||||
|
||||
dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 * (1 + dist2/2./self.lengthscale)**(-self.power-1)
|
||||
if X2 is None:
|
||||
dist2 = np.square((X-X.T)/self.lengthscale)
|
||||
dX = -2.*self.variance*self.power * (X-X.T)/self.lengthscale**2 * (1 + dist2/2./self.lengthscale)**(-self.power-1)
|
||||
else:
|
||||
dist2 = np.square((X-X2.T)/self.lengthscale)
|
||||
dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 * (1 + dist2/2./self.lengthscale)**(-self.power-1)
|
||||
target += np.sum(dL_dK*dX,1)[:,np.newaxis]
|
||||
|
||||
def dKdiag_dX(self,dL_dKdiag,X,target):
|
||||
|
|
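A minimal sketch (illustration only, not part of this diff) checking the corrected lengthscale gradient of the rational quadratic kernel above against finite differences; all values are made up:

import numpy as np

var, lengthscale, power = 1.5, 0.7, 2.0
x, x2 = 0.3, -0.4

def k(ell):
    dist2 = ((x - x2) / ell) ** 2
    return var * (1. + dist2 / 2.) ** (-power)

dist2 = ((x - x2) / lengthscale) ** 2
dl = power * var * dist2 / lengthscale * (1. + dist2 / 2.) ** (-power - 1.)  # as in the new line above
eps = 1e-6
dl_fd = (k(lengthscale + eps) - k(lengthscale - eps)) / (2. * eps)
assert abs(dl - dl_fd) < 1e-5
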
@ -4,11 +4,12 @@
|
|||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
import hashlib
|
||||
from scipy import weave
|
||||
from ..util.linalg import tdot
|
||||
from ...util.linalg import tdot
|
||||
from ...util.misc import fast_array_equal
|
||||
from ...util.config import *
|
||||
|
||||
class rbf(Kernpart):
|
||||
class RBF(Kernpart):
|
||||
"""
|
||||
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
|
||||
|
||||
|
|
@ -57,12 +58,27 @@ class rbf(Kernpart):
|
|||
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||
|
||||
# a set of optional args to pass to weave
|
||||
self.weave_options = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
|
||||
'extra_link_args' : ['-lgomp']}
|
||||
weave_options_openmp = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'],
|
||||
'extra_link_args' : ['-lgomp'],
|
||||
'libraries': ['gomp']}
|
||||
weave_options_noopenmp = {'extra_compile_args': ['-O3']}
|
||||
|
||||
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
self.weave_options = weave_options_openmp
|
||||
self.weave_support_code = """
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
"""
|
||||
else:
|
||||
self.weave_options = weave_options_noopenmp
|
||||
self.weave_support_code = """
|
||||
#include <math.h>
|
||||
"""
|
||||
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.variance, self.lengthscale))
|
||||
|
||||
|
|
@ -110,8 +126,8 @@ class rbf(Kernpart):
|
|||
target(q+1) += var_len3(q)*tmp;
|
||||
}
|
||||
"""
|
||||
num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
|
||||
weave.inline(code, arg_names=['num_data','num_inducing','input_dim','X','X2','target','dvardLdK','var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||
num_data, num_inducing, input_dim = int(X.shape[0]), int(X.shape[0]), int(self.input_dim)
|
||||
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||
else:
|
||||
code = """
|
||||
int q,i,j;
|
||||
|
|
@ -126,9 +142,9 @@ class rbf(Kernpart):
|
|||
target(q+1) += var_len3(q)*tmp;
|
||||
}
|
||||
"""
|
||||
num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
|
||||
#[np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
|
||||
weave.inline(code, arg_names=['num_data','num_inducing','input_dim','X','X2','target','dvardLdK','var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||
num_data, num_inducing, input_dim = int(X.shape[0]), int(X2.shape[0]), int(self.input_dim)
|
||||
# [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
|
||||
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||
else:
|
||||
target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
|
||||
|
||||
|
|
@ -138,7 +154,10 @@ class rbf(Kernpart):
|
|||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
self._K_computations(X, X2)
|
||||
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
|
||||
if X2 is None:
|
||||
_K_dist = 2*(X[:, None, :] - X[None, :, :])
|
||||
else:
|
||||
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
|
||||
dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
|
||||
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
|
||||
|
||||
|
|
@ -165,10 +184,9 @@ class rbf(Kernpart):
|
|||
|
||||
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
|
||||
self._psi_computations(Z, mu, S)
|
||||
denom_deriv = S[:, None, :] / (self.lengthscale ** 3 + self.lengthscale * S[:, None, :])
|
||||
d_length = self._psi1[:, :, None] * (self.lengthscale * np.square(self._psi1_dist / (self.lengthscale2 + S[:, None, :])) + denom_deriv)
|
||||
target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
|
||||
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
||||
d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale)
|
||||
dpsi1_dlength = d_length * np.atleast_3d(dL_dpsi1)
|
||||
if not self.ARD:
|
||||
target[1] += dpsi1_dlength.sum()
|
||||
else:
|
||||
|
|
@ -190,12 +208,19 @@ class rbf(Kernpart):
|
|||
self._psi_computations(Z, mu, S)
|
||||
target += self._psi2
|
||||
|
||||
def _crossterm_mu_S(self, Z, mu, S):
|
||||
# compute the crossterm expectation for K as the other kernel:
|
||||
Sigma = 1./self.lengthscale2[None,None,:] + 1./S[:,None,:] # is independent across M,
|
||||
Sigma_tilde = (self.lengthscale2[None, :] + S)
|
||||
M = (S*mu/Sigma_tilde)[:, None, :] + (self.lengthscale2[None,:]*Z)[None, :, :]/Sigma_tilde[:, None, :]
|
||||
# make sure return is [N x M x Q]
|
||||
return M, Sigma.repeat(Z.shape[0],1)
|
||||
|
||||
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
|
||||
"""Shape N,num_inducing,num_inducing,Ntheta"""
|
||||
self._psi_computations(Z, mu, S)
|
||||
d_var = 2.*self._psi2 / self.variance
|
||||
d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
|
||||
|
||||
target[0] += np.sum(dL_dpsi2 * d_var)
|
||||
dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
|
||||
if not self.ARD:
|
||||
|
|
@ -222,9 +247,10 @@ class rbf(Kernpart):
|
|||
#---------------------------------------#
|
||||
|
||||
def _K_computations(self, X, X2):
|
||||
if not (np.array_equal(X, self._X) and np.array_equal(X2, self._X2) and np.array_equal(self._params , self._get_params())):
|
||||
params = self._get_params()
|
||||
if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2) and fast_array_equal(self._params , params)):
|
||||
self._X = X.copy()
|
||||
self._params == self._get_params().copy()
|
||||
self._params = params.copy()
|
||||
if X2 is None:
|
||||
self._X2 = None
|
||||
X = X / self.lengthscale
|
||||
|
|
@ -239,55 +265,61 @@ class rbf(Kernpart):
|
|||
|
||||
def _psi_computations(self, Z, mu, S):
|
||||
# here are the "statistics" for psi1 and psi2
|
||||
if not np.array_equal(Z, self._Z):
|
||||
#Z has changed, compute Z specific stuff
|
||||
self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # M,M,Q
|
||||
self._psi2_Zdist = 0.5*(Z[:,None,:]-Z[None,:,:]) # M,M,Q
|
||||
self._psi2_Zdist_sq = np.square(self._psi2_Zdist/self.lengthscale) # M,M,Q
|
||||
self._Z = Z
|
||||
Z_changed = not fast_array_equal(Z, self._Z)
|
||||
if Z_changed:
|
||||
# Z has changed, compute Z specific stuff
|
||||
self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
|
||||
self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
|
||||
self._psi2_Zdist_sq = np.square(self._psi2_Zdist / self.lengthscale) # M,M,Q
|
||||
|
||||
if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
|
||||
#something's changed. recompute EVERYTHING
|
||||
if Z_changed or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S):
|
||||
# something's changed. recompute EVERYTHING
|
||||
|
||||
#psi1
|
||||
self._psi1_denom = S[:,None,:]/self.lengthscale2 + 1.
|
||||
self._psi1_dist = Z[None,:,:]-mu[:,None,:]
|
||||
self._psi1_dist_sq = np.square(self._psi1_dist)/self.lengthscale2/self._psi1_denom
|
||||
self._psi1_exponent = -0.5*np.sum(self._psi1_dist_sq+np.log(self._psi1_denom),-1)
|
||||
self._psi1 = self.variance*np.exp(self._psi1_exponent)
|
||||
# psi1
|
||||
self._psi1_denom = S[:, None, :] / self.lengthscale2 + 1.
|
||||
self._psi1_dist = Z[None, :, :] - mu[:, None, :]
|
||||
self._psi1_dist_sq = np.square(self._psi1_dist) / self.lengthscale2 / self._psi1_denom
|
||||
self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
|
||||
self._psi1 = self.variance * np.exp(self._psi1_exponent)
|
||||
|
||||
#psi2
|
||||
self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscale2+1. # N,M,M,Q
|
||||
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu,self._psi2_Zhat)
|
||||
#self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
|
||||
#self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
|
||||
#self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
|
||||
self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M,Q
|
||||
# psi2
|
||||
self._psi2_denom = 2.*S[:, None, None, :] / self.lengthscale2 + 1. # N,M,M,Q
|
||||
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
|
||||
# self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
|
||||
# self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
|
||||
# self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
|
||||
self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
|
||||
|
||||
#store matrices for caching
|
||||
self._Z, self._mu, self._S = Z, mu,S
|
||||
# store matrices for caching
|
||||
self._Z, self._mu, self._S = Z, mu, S
|
||||
|
||||
def weave_psi2(self,mu,Zhat):
|
||||
N,input_dim = mu.shape
|
||||
def weave_psi2(self, mu, Zhat):
|
||||
N, input_dim = mu.shape
|
||||
num_inducing = Zhat.shape[0]
|
||||
|
||||
mudist = np.empty((N,num_inducing,num_inducing,input_dim))
|
||||
mudist_sq = np.empty((N,num_inducing,num_inducing,input_dim))
|
||||
psi2_exponent = np.zeros((N,num_inducing,num_inducing))
|
||||
psi2 = np.empty((N,num_inducing,num_inducing))
|
||||
mudist = np.empty((N, num_inducing, num_inducing, input_dim))
|
||||
mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
|
||||
psi2_exponent = np.zeros((N, num_inducing, num_inducing))
|
||||
psi2 = np.empty((N, num_inducing, num_inducing))
|
||||
|
||||
psi2_Zdist_sq = self._psi2_Zdist_sq
|
||||
_psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
|
||||
half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
|
||||
_psi2_denom = self._psi2_denom.squeeze().reshape(-1, input_dim)
|
||||
half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(-1, input_dim)
|
||||
variance_sq = float(np.square(self.variance))
|
||||
if self.ARD:
|
||||
lengthscale2 = self.lengthscale2
|
||||
else:
|
||||
lengthscale2 = np.ones(input_dim) * self.lengthscale2
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
pragma_string = '#pragma omp parallel for private(tmp)'
|
||||
else:
|
||||
pragma_string = ''
|
||||
|
||||
code = """
|
||||
double tmp;
|
||||
|
||||
#pragma omp parallel for private(tmp)
|
||||
%s
|
||||
for (int n=0; n<N; n++){
|
||||
for (int m=0; m<num_inducing; m++){
|
||||
for (int mm=0; mm<(m+1); mm++){
|
||||
|
|
@ -317,14 +349,21 @@ class rbf(Kernpart):
|
|||
}
|
||||
}
|
||||
|
||||
"""
|
||||
""" % pragma_string
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
pragma_string = '#include <omp.h>'
|
||||
else:
|
||||
pragma_string = ''
|
||||
|
||||
support_code = """
|
||||
#include <omp.h>
|
||||
%s
|
||||
#include <math.h>
|
||||
"""
|
||||
weave.inline(code, support_code=support_code, libraries=['gomp'],
|
||||
arg_names=['N','num_inducing','input_dim','mu','Zhat','mudist_sq','mudist','lengthscale2','_psi2_denom','psi2_Zdist_sq','psi2_exponent','half_log_psi2_denom','psi2','variance_sq'],
|
||||
""" % pragma_string
|
||||
|
||||
N, num_inducing, input_dim = int(N), int(num_inducing), int(input_dim)
|
||||
weave.inline(code, support_code=support_code,
|
||||
arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
|
||||
type_converters=weave.converters.blitz, **self.weave_options)
|
||||
|
||||
return mudist, mudist_sq, psi2_exponent, psi2
|
||||
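A minimal sketch (illustration only, not part of this diff) writing out the psi1 statistic from _psi_computations above for a single data point and inducing input; the values are made up:

import numpy as np

variance, lengthscale2 = 1.0, np.array([0.5, 2.0])         # kernel variance and squared lengthscales
mu_n, S_n = np.array([0.1, -0.2]), np.array([0.05, 0.3])   # variational mean and variance of one input
z_m = np.array([0.4, 0.0])                                  # one inducing input

denom = S_n / lengthscale2 + 1.
dist_sq = np.square(z_m - mu_n) / lengthscale2 / denom
psi1_nm = variance * np.exp(-0.5 * np.sum(dist_sq + np.log(denom)))
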
341
GPy/kern/parts/rbf_inv.py
Normal file
|
|
@ -0,0 +1,341 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from rbf import RBF
|
||||
import numpy as np
|
||||
import hashlib
|
||||
from scipy import weave
|
||||
from ...util.linalg import tdot
|
||||
from ...util.config import *
|
||||
|
||||
|
||||
class RBFInv(RBF):
|
||||
"""
|
||||
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel. It only
|
||||
differs from RBF in that here the parametrization is wrt the inverse lengthscale:
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \ \\text{ where } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
|
||||
|
||||
where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
:param lengthscale: the vector of lengthscale of the kernel
|
||||
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
|
||||
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
|
||||
:type ARD: Boolean
|
||||
:rtype: kernel object
|
||||
|
||||
.. Note: this object implements both the ARD and 'spherical' version of the function
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., inv_lengthscale=None, ARD=False):
|
||||
self.input_dim = input_dim
|
||||
self.name = 'rbf_inv'
|
||||
self.ARD = ARD
|
||||
if not ARD:
|
||||
self.num_params = 2
|
||||
if inv_lengthscale is not None:
|
||||
inv_lengthscale = np.asarray(inv_lengthscale)
|
||||
assert inv_lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
|
||||
else:
|
||||
inv_lengthscale = np.ones(1)
|
||||
else:
|
||||
self.num_params = self.input_dim + 1
|
||||
if inv_lengthscale is not None:
|
||||
inv_lengthscale = np.asarray(inv_lengthscale)
|
||||
assert inv_lengthscale.size == self.input_dim, "bad number of lengthscales"
|
||||
else:
|
||||
inv_lengthscale = np.ones(self.input_dim)
|
||||
|
||||
self._set_params(np.hstack((variance, inv_lengthscale.flatten())))
|
||||
|
||||
# initialize cache
|
||||
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
|
||||
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||
|
||||
# a set of optional args to pass to weave
|
||||
weave_options_openmp = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'],
|
||||
'extra_link_args' : ['-lgomp'],
|
||||
'libraries': ['gomp']}
|
||||
weave_options_noopenmp = {'extra_compile_args': ['-O3']}
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
self.weave_options = weave_options_openmp
|
||||
self.weave_support_code = """
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
"""
|
||||
else:
|
||||
self.weave_options = weave_options_noopenmp
|
||||
self.weave_support_code = """
|
||||
#include <math.h>
|
||||
"""
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.variance, self.inv_lengthscale))
|
||||
|
||||
def _set_params(self, x):
|
||||
assert x.size == (self.num_params)
|
||||
self.variance = x[0]
|
||||
self.inv_lengthscale = x[1:]
|
||||
self.inv_lengthscale2 = np.square(self.inv_lengthscale)
|
||||
# TODO: We can rewrite everything with inv_lengthscale and never need to do the below
|
||||
self.lengthscale = 1. / self.inv_lengthscale
|
||||
self.lengthscale2 = np.square(self.lengthscale)
|
||||
# reset cached results
|
||||
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
|
||||
|
||||
def _get_param_names(self):
|
||||
if self.num_params == 2:
|
||||
return ['variance', 'inv_lengthscale']
|
||||
else:
|
||||
return ['variance'] + ['inv_lengthscale%i' % i for i in range(self.inv_lengthscale.size)]
|
||||
|
||||
# TODO: Rewrite computations so that lengthscale is not needed (but only inv. lengthscale)
|
||||
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
self._K_computations(X, X2)
|
||||
target[0] += np.sum(self._K_dvar * dL_dK)
|
||||
if self.ARD:
|
||||
dvardLdK = self._K_dvar * dL_dK
|
||||
var_len3 = self.variance / np.power(self.lengthscale, 3)
|
||||
len2 = self.lengthscale2
|
||||
if X2 is None:
|
||||
# save computation for the symmetrical case
|
||||
dvardLdK = dvardLdK + dvardLdK.T
|
||||
code = """
|
||||
int q,i,j;
|
||||
double tmp;
|
||||
for(q=0; q<input_dim; q++){
|
||||
tmp = 0;
|
||||
for(i=0; i<num_data; i++){
|
||||
for(j=0; j<i; j++){
|
||||
tmp += (X(i,q)-X(j,q))*(X(i,q)-X(j,q))*dvardLdK(i,j);
|
||||
}
|
||||
}
|
||||
target(q+1) += var_len3(q)*tmp*(-len2(q));
|
||||
}
|
||||
"""
|
||||
num_data, num_inducing, input_dim = int(X.shape[0]), int(X.shape[0]), int(self.input_dim)
|
||||
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||
else:
|
||||
code = """
|
||||
int q,i,j;
|
||||
double tmp;
|
||||
for(q=0; q<input_dim; q++){
|
||||
tmp = 0;
|
||||
for(i=0; i<num_data; i++){
|
||||
for(j=0; j<num_inducing; j++){
|
||||
tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
|
||||
}
|
||||
}
|
||||
target(q+1) += var_len3(q)*tmp*(-len2(q));
|
||||
}
|
||||
"""
|
||||
num_data, num_inducing, input_dim = int(X.shape[0]), int(X2.shape[0]), int(self.input_dim)
|
||||
# [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
|
||||
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||
else:
|
||||
target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) * (-self.lengthscale2)
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2, target):
|
||||
self._K_computations(X, X2)
|
||||
if X2 is None:
|
||||
_K_dist = 2*(X[:, None, :] - X[None, :, :])
|
||||
else:
|
||||
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
|
||||
dK_dX = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
|
||||
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
|
||||
|
||||
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||
pass
|
||||
|
||||
|
||||
#---------------------------------------#
|
||||
# PSI statistics #
|
||||
#---------------------------------------#
|
||||
|
||||
# def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
|
||||
# self._psi_computations(Z, mu, S)
|
||||
# denom_deriv = S[:, None, :] / (self.lengthscale ** 3 + self.lengthscale * S[:, None, :])
|
||||
# d_length = self._psi1[:, :, None] * (self.lengthscale * np.square(self._psi1_dist / (self.lengthscale2 + S[:, None, :])) + denom_deriv)
|
||||
# target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
|
||||
# dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
||||
# if not self.ARD:
|
||||
# target[1] += dpsi1_dlength.sum()*(-self.lengthscale2)
|
||||
# else:
|
||||
# target[1:] += dpsi1_dlength.sum(0).sum(0)*(-self.lengthscale2)
|
||||
# #target[1:] = target[1:]*(-self.lengthscale2)
|
||||
|
||||
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
|
||||
self._psi_computations(Z, mu, S)
|
||||
tmp = 1 + S[:, None, :] * self.inv_lengthscale2
|
||||
# d_inv_length_old = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / (self.lengthscale * self._psi1_denom) + self.inv_lengthscale) / self.inv_lengthscale2
|
||||
d_length = -(self._psi1[:, :, None] * ((np.square(self._psi1_dist) * self.inv_lengthscale) / (tmp ** 2) + (S[:, None, :] * self.inv_lengthscale) / (tmp)))
|
||||
# d_inv_length = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / self._psi1_denom + self.lengthscale)
|
||||
target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
|
||||
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
||||
if not self.ARD:
|
||||
target[1] += dpsi1_dlength.sum() # *(-self.lengthscale2)
|
||||
else:
|
||||
target[1:] += dpsi1_dlength.sum(0).sum(0) # *(-self.lengthscale2)
|
||||
# target[1:] = target[1:]*(-self.lengthscale2)
|
||||
|
||||
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
|
||||
self._psi_computations(Z, mu, S)
|
||||
dpsi1_dZ = -self._psi1[:, :, None] * ((self.inv_lengthscale2 * self._psi1_dist) / self._psi1_denom)
|
||||
target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
|
||||
|
||||
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
|
||||
self._psi_computations(Z, mu, S)
|
||||
tmp = (self._psi1[:, :, None] * self.inv_lengthscale2) / self._psi1_denom
|
||||
target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
|
||||
target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
|
||||
|
||||
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
|
||||
"""Shape N,num_inducing,num_inducing,Ntheta"""
|
||||
self._psi_computations(Z, mu, S)
|
||||
d_var = 2.*self._psi2 / self.variance
|
||||
# d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
|
||||
d_length = -2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] * self.inv_lengthscale2) / (self.inv_lengthscale * self._psi2_denom)
|
||||
target[0] += np.sum(dL_dpsi2 * d_var)
|
||||
dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
|
||||
if not self.ARD:
|
||||
target[1] += dpsi2_dlength.sum() # *(-self.lengthscale2)
|
||||
else:
|
||||
target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0) # *(-self.lengthscale2)
|
||||
# target[1:] = target[1:]*(-self.lengthscale2)
|
||||
|
||||
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
|
||||
self._psi_computations(Z, mu, S)
|
||||
        term1 = self._psi2_Zdist * self.inv_lengthscale2 # num_inducing, num_inducing, input_dim
        term2 = (self._psi2_mudist * self.inv_lengthscale2) / self._psi2_denom # N, num_inducing, num_inducing, input_dim
        dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
        target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)

    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
        """Think N,num_inducing,num_inducing,input_dim """
        self._psi_computations(Z, mu, S)
        tmp = (self.inv_lengthscale2 * self._psi2[:, :, :, None]) / self._psi2_denom
        target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
        target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)

    #---------------------------------------#
    #            Precomputations            #
    #---------------------------------------#

    def _K_computations(self, X, X2):
        if not (np.array_equal(X, self._X) and np.array_equal(X2, self._X2) and np.array_equal(self._params, self._get_params())):
            self._X = X.copy()
            self._params = self._get_params().copy()
            if X2 is None:
                self._X2 = None
                X = X * self.inv_lengthscale
                Xsquare = np.sum(np.square(X), 1)
                self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
            else:
                self._X2 = X2.copy()
                X = X * self.inv_lengthscale
                X2 = X2 * self.inv_lengthscale
                self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
            self._K_dvar = np.exp(-0.5 * self._K_dist2)

    def _psi_computations(self, Z, mu, S):
        # here are the "statistics" for psi1 and psi2
        if not np.array_equal(Z, self._Z):
            # Z has changed, compute Z specific stuff
            self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
            self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
            self._psi2_Zdist_sq = np.square(self._psi2_Zdist * self.inv_lengthscale) # M,M,Q

        if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
            # something's changed. recompute EVERYTHING

            # psi1
            self._psi1_denom = S[:, None, :] * self.inv_lengthscale2 + 1.
            self._psi1_dist = Z[None, :, :] - mu[:, None, :]
            self._psi1_dist_sq = (np.square(self._psi1_dist) * self.inv_lengthscale2) / self._psi1_denom
            self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
            self._psi1 = self.variance * np.exp(self._psi1_exponent)

            # psi2
            self._psi2_denom = 2.*S[:, None, None, :] * self.inv_lengthscale2 + 1. # N,M,M,Q
            self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
            # self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
            # self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
            # self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
            self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q

            # store matrices for caching
            self._Z, self._mu, self._S = Z, mu, S

    def weave_psi2(self, mu, Zhat):
        N, input_dim = int(mu.shape[0]), int(mu.shape[1])
        num_inducing = int(Zhat.shape[0])

        mudist = np.empty((N, num_inducing, num_inducing, input_dim))
        mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
        psi2_exponent = np.zeros((N, num_inducing, num_inducing))
        psi2 = np.empty((N, num_inducing, num_inducing))

        psi2_Zdist_sq = self._psi2_Zdist_sq
        _psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
        half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
        variance_sq = float(np.square(self.variance))
        if self.ARD:
            inv_lengthscale2 = self.inv_lengthscale2
        else:
            inv_lengthscale2 = np.ones(input_dim) * self.inv_lengthscale2

        if config.getboolean('parallel', 'openmp'):
            pragma_string = '#pragma omp parallel for private(tmp)'
        else:
            pragma_string = ''

        code = """
        double tmp;

        %s
        for (int n=0; n<N; n++){
            for (int m=0; m<num_inducing; m++){
                for (int mm=0; mm<(m+1); mm++){
                    for (int q=0; q<input_dim; q++){
                        //compute mudist
                        tmp = mu(n,q) - Zhat(m,mm,q);
                        mudist(n,m,mm,q) = tmp;
                        mudist(n,mm,m,q) = tmp;

                        //now mudist_sq
                        tmp = tmp*tmp*inv_lengthscale2(q)/_psi2_denom(n,q);
                        mudist_sq(n,m,mm,q) = tmp;
                        mudist_sq(n,mm,m,q) = tmp;

                        //now psi2_exponent
                        tmp = -psi2_Zdist_sq(m,mm,q) - tmp - half_log_psi2_denom(n,q);
                        psi2_exponent(n,mm,m) += tmp;
                        if (m != mm){
                            psi2_exponent(n,m,mm) += tmp;
                        }
                        //psi2 would be computed like this, but np is faster
                        //tmp = variance_sq*exp(psi2_exponent(n,m,mm));
                        //psi2(n,m,mm) = tmp;
                        //psi2(n,mm,m) = tmp;
                    }
                }
            }
        }

        """ % pragma_string

        weave.inline(code, support_code=self.weave_support_code,
                     arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'inv_lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
                     type_converters=weave.converters.blitz, **self.weave_options)

        return mudist, mudist_sq, psi2_exponent, psi2
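For reference only (not part of this changeset): the inline C loop above fills the same arrays as the commented-out NumPy lines in _psi_computations, so a pure-NumPy fallback for weave_psi2 would look roughly like the sketch below (attribute names taken from the code above; unoptimised and illustrative only).

    def numpy_psi2(self, mu, Zhat):
        # Illustrative, unoptimised equivalent of weave_psi2; mu is (N, Q), Zhat is (M, M, Q).
        mudist = mu[:, None, None, :] - Zhat[None, :, :, :]                       # N,M,M,Q
        mudist_sq = np.square(mudist) * self.inv_lengthscale2 / self._psi2_denom  # N,M,M,Q
        psi2_exponent = np.sum(-self._psi2_Zdist_sq - mudist_sq
                               - 0.5 * np.log(self._psi2_denom), -1)              # N,M,M
        psi2 = np.square(self.variance) * np.exp(psi2_exponent)                   # N,M,M
        return mudist, mudist_sq, psi2_exponent, psi2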
@@ -6,7 +6,7 @@
from kernpart import Kernpart
import numpy as np

class rbfcos(Kernpart):
class RBFCos(Kernpart):
    def __init__(self,input_dim,variance=1.,frequencies=None,bandwidths=None,ARD=False):
        self.input_dim = input_dim
        self.name = 'rbfcos'

@@ -9,7 +9,7 @@ def theta(x):
    """Heaviside step function"""
    return np.where(x>=0.,1.,0.)

class spline(Kernpart):
class Spline(Kernpart):
    """
    Spline kernel

@@ -4,7 +4,7 @@
from kernpart import Kernpart
import numpy as np

class symmetric(Kernpart):
class Symmetric(Kernpart):
    """
    Symmetrical kernels

@@ -56,7 +56,7 @@ class symmetric(Kernpart):
        AX = np.dot(X,self.transform)
        if X2 is None:
            X2 = X
            ZX2 = AX
            AX2 = AX
        else:
            AX2 = np.dot(X2, self.transform)
        self.k.dK_dtheta(dL_dK,X,X2,target)
196
GPy/kern/parts/sympy_helpers.cpp
Normal file
@@ -0,0 +1,196 @@
|
|||
#include "Python.h"
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
double DiracDelta(double x){
|
||||
// TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
|
||||
if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
|
||||
return 1.0;
|
||||
else
|
||||
return 0.0;
|
||||
};
|
||||
double DiracDelta(double x,int foo){
|
||||
return 0.0;
|
||||
};
|
||||
|
||||
double sinc(double x){
|
||||
// compute the sinc function
|
||||
if (x==0)
|
||||
return 1.0;
|
||||
else
|
||||
return sin(x)/x;
|
||||
}
|
||||
|
||||
double sinc_grad(double x){
|
||||
// compute the gradient of the sinc function.
|
||||
if (x==0)
|
||||
return 0.0;
|
||||
else
|
||||
return (x*cos(x) - sin(x))/(x*x);
|
||||
}
|
||||
double erfcx(double x){
|
||||
// Based on code by Soren Hauberg 2010 for Octave.
|
||||
// compute the scaled complex error function.
|
||||
//return erfc(x)*exp(x*x);
|
||||
double xneg=-sqrt(log(DBL_MAX/2));
|
||||
double xmax = 1/(sqrt(M_PI)*DBL_MIN);
|
||||
xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
|
||||
// Find values where erfcx can be evaluated
|
||||
double t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
|
||||
double u = t-0.5;
|
||||
double y = (((((((((u * 0.00127109764952614092 + 1.19314022838340944e-4) * u
|
||||
- 0.003963850973605135) * u - 8.70779635317295828e-4) * u
|
||||
+ 0.00773672528313526668) * u + 0.00383335126264887303) * u
|
||||
- 0.0127223813782122755) * u - 0.0133823644533460069) * u
|
||||
+ 0.0161315329733252248) * u + 0.0390976845588484035) * u + 0.00249367200053503304;
|
||||
y = ((((((((((((y * u - 0.0838864557023001992) * u -
|
||||
0.119463959964325415) * u + 0.0166207924969367356) * u +
|
||||
0.357524274449531043) * u + 0.805276408752910567) * u +
|
||||
1.18902982909273333) * u + 1.37040217682338167) * u +
|
||||
1.31314653831023098) * u + 1.07925515155856677) * u +
|
||||
0.774368199119538609) * u + 0.490165080585318424) * u +
|
||||
0.275374741597376782) * t;
|
||||
|
||||
if (x<xneg)
|
||||
return -INFINITY;
|
||||
else if (x<0)
|
||||
return 2.0*exp(x*x)-y;
|
||||
else if (x>xmax)
|
||||
return 0.0;
|
||||
else
|
||||
return y;
|
||||
}
|
||||
|
||||
double ln_diff_erf(double x0, double x1){
|
||||
// stably compute the log of difference between two erfs.
|
||||
if (x1>x0){
|
||||
PyErr_SetString(PyExc_RuntimeError,"second argument must be smaller than or equal to first in ln_diff_erf");
|
||||
throw 1;
|
||||
}
|
||||
if (x0==x1){
|
||||
PyErr_WarnEx(PyExc_RuntimeWarning,"divide by zero encountered in log", 1);
|
||||
return -INFINITY;
|
||||
}
|
||||
else if(x0<0 && x1>0 || x0>0 && x1<0) //x0 and x1 have opposite signs
|
||||
return log(erf(x0)-erf(x1));
|
||||
else if(x0>0) //x0 positive, x1 non-negative
|
||||
return log(erfcx(x1)-erfcx(x0)*exp(x1*x1- x0*x0))-x1*x1;
|
||||
else //x0 and x1 non-positive
|
||||
return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
|
||||
}
|
||||
// TODO: For all these computations of h, things are rather inefficient at the moment. Need to recode sympykern to allow the precomputations to take place and all the gradients to be computed in one function. Not sure of the best way forward for that yet. Neil
|
||||
double h(double t, double tprime, double d_i, double d_j, double l){
|
||||
// Compute the h function for the sim covariance.
|
||||
double half_l_di = 0.5*l*d_i;
|
||||
double arg_1 = half_l_di + tprime/l;
|
||||
double arg_2 = half_l_di - (t-tprime)/l;
|
||||
double ln_part_1 = ln_diff_erf(arg_1, arg_2);
|
||||
arg_2 = half_l_di - t/l;
|
||||
double sign_val = 1.0;
|
||||
if(t/l==0)
|
||||
sign_val = 0.0;
|
||||
else if (t/l < 0)
|
||||
sign_val = -1.0;
|
||||
arg_2 = half_l_di - t/l;
|
||||
double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
|
||||
// if either ln_part_1 or ln_part_2 are -inf, don't bother computing rest of that term.
|
||||
double part_1 = 0.0;
|
||||
if(isfinite(ln_part_1))
|
||||
part_1 = sign_val*exp(half_l_di*half_l_di - d_i*(t-tprime) + ln_part_1 - log(d_i + d_j));
|
||||
double part_2 = 0.0;
|
||||
if(isfinite(ln_part_2))
|
||||
part_2 = sign_val*exp(half_l_di*half_l_di - d_i*t - d_j*tprime + ln_part_2 - log(d_i + d_j));
|
||||
return part_1 - part_2;
|
||||
}
|
||||
|
||||
|
||||
double dh_dd_i(double t, double tprime, double d_i, double d_j, double l){
|
||||
double diff_t = (t-tprime);
|
||||
double l2 = l*l;
|
||||
double hv = h(t, tprime, d_i, d_j, l);
|
||||
double half_l_di = 0.5*l*d_i;
|
||||
double arg_1 = half_l_di + tprime/l;
|
||||
double arg_2 = half_l_di - (t-tprime)/l;
|
||||
double ln_part_1 = ln_diff_erf(arg_1, arg_2);
|
||||
arg_1 = half_l_di;
|
||||
arg_2 = half_l_di - t/l;
|
||||
double sign_val = 1.0;
|
||||
if(t/l==0)
|
||||
sign_val = 0.0;
|
||||
else if (t/l < 0)
|
||||
sign_val = -1.0;
|
||||
double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
|
||||
double base = (0.5*d_i*l2*(d_i+d_j)-1)*hv;
|
||||
if(isfinite(ln_part_1))
|
||||
base -= diff_t*sign_val*exp(half_l_di*half_l_di
|
||||
-d_i*diff_t
|
||||
+ln_part_1);
|
||||
if(isfinite(ln_part_2))
|
||||
base += t*sign_val*exp(half_l_di*half_l_di
|
||||
-d_i*t-d_j*tprime
|
||||
+ln_part_2);
|
||||
base += l/sqrt(M_PI)*(-exp(-diff_t*diff_t/l2)
|
||||
+exp(-tprime*tprime/l2-d_i*t)
|
||||
+exp(-t*t/l2-d_j*tprime)
|
||||
-exp(-(d_i*t + d_j*tprime)));
|
||||
return base/(d_i+d_j);
|
||||
|
||||
}
|
||||
|
||||
double dh_dd_j(double t, double tprime, double d_i, double d_j, double l){
|
||||
double half_l_di = 0.5*l*d_i;
|
||||
double hv = h(t, tprime, d_i, d_j, l);
|
||||
double sign_val = 1.0;
|
||||
if(t/l==0)
|
||||
sign_val = 0.0;
|
||||
else if (t/l < 0)
|
||||
sign_val = -1.0;
|
||||
double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
|
||||
double base = -hv;
|
||||
if(isfinite(ln_part_2))
|
||||
base += tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2);
|
||||
return base/(d_i+d_j);
|
||||
}
|
||||
|
||||
double dh_dl(double t, double tprime, double d_i, double d_j, double l){
|
||||
// compute gradient of h function with respect to lengthscale for sim covariance
|
||||
// TODO a lot of energy wasted recomputing things here, need to do this in a shared way somehow ... perhaps needs rewrite of sympykern.
|
||||
double half_l_di = 0.5*l*d_i;
|
||||
double arg_1 = half_l_di + tprime/l;
|
||||
double arg_2 = half_l_di - (t-tprime)/l;
|
||||
double ln_part_1 = ln_diff_erf(arg_1, arg_2);
|
||||
arg_2 = half_l_di - t/l;
|
||||
double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
|
||||
double diff_t = t - tprime;
|
||||
double l2 = l*l;
|
||||
double hv = h(t, tprime, d_i, d_j, l);
|
||||
return 0.5*d_i*d_i*l*hv + 2/(sqrt(M_PI)*(d_i+d_j))*((-diff_t/l2-d_i/2)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2)*exp(-t*t/l2-d_j*tprime)-d_i/2*exp(-(d_i*t+d_j*tprime)));
|
||||
}
|
||||
|
||||
double dh_dt(double t, double tprime, double d_i, double d_j, double l){
|
||||
// compute gradient of h function with respect to t.
|
||||
double diff_t = t - tprime;
|
||||
double half_l_di = 0.5*l*d_i;
|
||||
double arg_1 = half_l_di + tprime/l;
|
||||
double arg_2 = half_l_di - diff_t/l;
|
||||
double ln_part_1 = ln_diff_erf(arg_1, arg_2);
|
||||
arg_2 = half_l_di - t/l;
|
||||
double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
|
||||
|
||||
return (d_i*exp(ln_part_2-d_i*t - d_j*tprime) - d_i*exp(ln_part_1-d_i*diff_t) + 2*exp(-d_i*diff_t - pow(half_l_di - diff_t/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(half_l_di - t/l,2))/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
|
||||
}
|
||||
|
||||
double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
|
||||
// compute gradient of h function with respect to tprime.
|
||||
double diff_t = t - tprime;
|
||||
double half_l_di = 0.5*l*d_i;
|
||||
double arg_1 = half_l_di + tprime/l;
|
||||
double arg_2 = half_l_di - diff_t/l;
|
||||
double ln_part_1 = ln_diff_erf(arg_1, arg_2);
|
||||
arg_2 = half_l_di - t/l;
|
||||
double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
|
||||
|
||||
return (d_i*exp(ln_part_1-d_i*diff_t) + d_j*exp(ln_part_2-d_i*t - d_j*tprime) + (-2*exp(-pow(half_l_di - diff_t/l,2)) + 2*exp(-pow(half_l_di + tprime/l,2)))*exp(-d_i*diff_t)/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
|
||||
}
|
||||
16
GPy/kern/parts/sympy_helpers.h
Normal file
@@ -0,0 +1,16 @@
#include <math.h>
double DiracDelta(double x);
double DiracDelta(double x, int foo);

double sinc(double x);
double sinc_grad(double x);

double erfcx(double x);
double ln_diff_erf(double x0, double x1);

double h(double t, double tprime, double d_i, double d_j, double l);
double dh_dl(double t, double tprime, double d_i, double d_j, double l);
double dh_dd_i(double t, double tprime, double d_i, double d_j, double l);
double dh_dd_j(double t, double tprime, double d_i, double d_j, double l);
double dh_dt(double t, double tprime, double d_i, double d_j, double l);
double dh_dtprime(double t, double tprime, double d_i, double d_j, double l);
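For reference, ln_diff_erf above computes log(erf(x0) - erf(x1)) without catastrophic cancellation by switching to the scaled complementary error function when both arguments share a sign. A rough scalar NumPy/SciPy translation of the C implementation in sympy_helpers.cpp (illustrative only; scipy.special.erfcx assumed available):

    import numpy as np
    from scipy.special import erf, erfcx

    def ln_diff_erf(x0, x1):
        """Stably compute log(erf(x0) - erf(x1)) for x1 <= x0."""
        if x1 > x0:
            raise ValueError("second argument must be smaller than or equal to first")
        if x0 == x1:
            return -np.inf
        if x0 * x1 < 0:
            # opposite signs: the difference is O(1), no cancellation to worry about
            return np.log(erf(x0) - erf(x1))
        if x0 > 0:
            # both non-negative: work with the scaled complementary erf
            return np.log(erfcx(x1) - erfcx(x0) * np.exp(x1 ** 2 - x0 ** 2)) - x1 ** 2
        # both non-positive
        return np.log(erfcx(-x0) - erfcx(-x1) * np.exp(x0 ** 2 - x1 ** 2)) - x0 ** 2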
71
GPy/kern/parts/sympy_helpers.py
Normal file
@@ -0,0 +1,71 @@
# Code for testing functions written in sympy_helpers.cpp
from scipy import weave
import tempfile
import os
import numpy as np
current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
extra_compile_args = []

weave_kwargs = {
    'support_code': "",
    'include_dirs':[tempfile.gettempdir(), current_dir],
    'headers':['"parts/sympy_helpers.h"'],
    'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],
    'extra_compile_args':extra_compile_args,
    'extra_link_args':['-lgomp'],
    'verbose':True}

def erfcx(x):
    code = """
    // Code for computing scaled complementary erf
    int i;
    int dim;
    int elements = Ntarget[0];
    for (dim=1; dim<Dtarget; dim++)
        elements *= Ntarget[dim];
    for (i=0;i<elements;i++)
        target[i] = erfcx(x[i]);
    """
    x = np.asarray(x)
    arg_names = ['target','x']
    target = np.zeros_like(x)
    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
    return target

def ln_diff_erf(x, y):
    code = """
    // Code for computing the log of the difference of two erfs
    int i;
    int dim;
    int elements = Ntarget[0];
    for (dim=1; dim<Dtarget; dim++)
        elements *= Ntarget[dim];
    for (i=0;i<elements;i++)
        target[i] = ln_diff_erf(x[i], y[i]);
    """
    x = np.asarray(x)
    y = np.asarray(y)
    assert(x.shape==y.shape)
    target = np.zeros_like(x)
    arg_names = ['target','x', 'y']
    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
    return target

def h(t, tprime, d_i, d_j, l):
    code = """
    // Code for computing the 1st order ODE h helper function.
    int i;
    int dim;
    int elements = Ntarget[0];
    for (dim=1; dim<Dtarget; dim++)
        elements *= Ntarget[dim];
    for (i=0;i<elements;i++)
        target[i] = h(t[i], tprime[i], d_i, d_j, l);
    """
    t = np.asarray(t)
    tprime = np.asarray(tprime)
    assert(tprime.shape==t.shape)
    target = np.zeros_like(t)
    arg_names = ['target','t', 'tprime', 'd_i', 'd_j', 'l']
    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
    return target
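These wrappers are easy to sanity-check against scipy.special; a rough usage sketch (import path assumed, adjust to your checkout):

    import numpy as np
    from scipy import special
    from GPy.kern.parts import sympy_helpers as sh  # path assumed

    x = np.linspace(-2., 2., 9)
    # weave-compiled erfcx vs scipy's implementation
    print np.abs(sh.erfcx(x) - special.erfcx(x)).max()
    # stable log-difference-of-erfs vs the naive formula (safe here, arguments are O(1))
    print np.abs(sh.ln_diff_erf(x + 1., x) - np.log(special.erf(x + 1.) - special.erf(x))).max()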
461
GPy/kern/parts/sympykern.py
Normal file
@@ -0,0 +1,461 @@
|
|||
import numpy as np
|
||||
import sympy as sp
|
||||
from sympy.utilities.codegen import codegen
|
||||
from sympy.core.cache import clear_cache
|
||||
from scipy import weave
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
|
||||
import tempfile
|
||||
import pdb
|
||||
import ast
|
||||
from kernpart import Kernpart
|
||||
from ...util.config import config
|
||||
|
||||
class spkern(Kernpart):
|
||||
"""
|
||||
A kernel object, where all the hard work is done by sympy.
|
||||
|
||||
:param k: the covariance function
|
||||
:type k: a positive definite sympy function of x_0, z_0, x_1, z_1, x_2, z_2...
|
||||
|
||||
To construct a new sympy kernel, you'll need to define:
|
||||
- a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
|
||||
- that's it! we'll extract the variables from the function k.
|
||||
|
||||
Note:
|
||||
- to handle multiple inputs, call them x_1, z_1, etc
|
||||
- to handle multiple correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j.
|
||||
"""
|
||||
def __init__(self, input_dim, k=None, output_dim=1, name=None, param=None):
|
||||
if name is None:
|
||||
self.name='sympykern'
|
||||
else:
|
||||
self.name = name
|
||||
if k is None:
|
||||
raise ValueError, "You must provide an argument for the covariance function."
|
||||
self._sp_k = k
|
||||
sp_vars = [e for e in k.atoms() if e.is_Symbol]
|
||||
self._sp_x= sorted([e for e in sp_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:]))
|
||||
self._sp_z= sorted([e for e in sp_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:]))
|
||||
# Check that variable names make sense.
|
||||
assert all([x.name=='x_%i'%i for i,x in enumerate(self._sp_x)])
|
||||
assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)])
|
||||
assert len(self._sp_x)==len(self._sp_z)
|
||||
self.input_dim = len(self._sp_x)
|
||||
self._real_input_dim = self.input_dim
|
||||
if output_dim > 1:
|
||||
self.input_dim += 1
|
||||
assert self.input_dim == input_dim
|
||||
self.output_dim = output_dim
|
||||
# extract parameter names
|
||||
thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name)
|
||||
|
||||
|
||||
# Look for parameters with index.
|
||||
if self.output_dim>1:
|
||||
self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name)
|
||||
self._sp_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name)
|
||||
# Make sure parameter appears with both indices!
|
||||
assert len(self._sp_theta_i)==len(self._sp_theta_j)
|
||||
assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j)])
|
||||
|
||||
# Extract names of shared parameters
|
||||
self._sp_theta = [theta for theta in thetas if theta not in self._sp_theta_i and theta not in self._sp_theta_j]
|
||||
|
||||
self.num_split_params = len(self._sp_theta_i)
|
||||
self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i]
|
||||
for theta in self._split_theta_names:
|
||||
setattr(self, theta, np.ones(self.output_dim))
|
||||
|
||||
self.num_shared_params = len(self._sp_theta)
|
||||
self.num_params = self.num_shared_params+self.num_split_params*self.output_dim
|
||||
|
||||
else:
|
||||
self.num_split_params = 0
|
||||
self._split_theta_names = []
|
||||
self._sp_theta = thetas
|
||||
self.num_shared_params = len(self._sp_theta)
|
||||
self.num_params = self.num_shared_params
|
||||
|
||||
for theta in self._sp_theta:
|
||||
val = 1.0
|
||||
if param is not None:
|
||||
if param.has_key(theta):
|
||||
val = param[theta]
|
||||
setattr(self, theta.name, val)
|
||||
#deal with param
|
||||
self._set_params(self._get_params())
|
||||
|
||||
#Differentiate!
|
||||
self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
|
||||
if self.output_dim > 1:
|
||||
self._sp_dk_dtheta_i = [sp.diff(k,theta).simplify() for theta in self._sp_theta_i]
|
||||
|
||||
self._sp_dk_dx = [sp.diff(k,xi).simplify() for xi in self._sp_x]
|
||||
|
||||
if False:
|
||||
self.compute_psi_stats()
|
||||
|
||||
self._gen_code()
|
||||
|
||||
if False:
|
||||
extra_compile_args = ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5']
|
||||
else:
|
||||
extra_compile_args = []
|
||||
|
||||
self.weave_kwargs = {
|
||||
'support_code':self._function_code,
|
||||
'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')],
|
||||
'headers':['"sympy_helpers.h"'],
|
||||
'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],
|
||||
'extra_compile_args':extra_compile_args,
|
||||
'extra_link_args':[],
|
||||
'verbose':True}
|
||||
if config.getboolean('parallel', 'openmp'): self.weave_kwargs.append('-lgomp')
|
||||
|
||||
def __add__(self,other):
|
||||
return spkern(self._sp_k+other._sp_k)
|
||||
|
||||
def _gen_code(self):
|
||||
"""Generates the C functions necessary for computing the covariance function using the sympy objects as input."""
|
||||
#TODO: maybe generate one C function only to save compile time? Also easier to take that as a basis and hand craft other covariances??
|
||||
|
||||
#generate c functions from sympy objects
|
||||
argument_sequence = self._sp_x+self._sp_z+self._sp_theta
|
||||
code_list = [('k',self._sp_k)]
|
||||
# gradients with respect to covariance input
|
||||
code_list += [('dk_d%s'%x.name,dx) for x,dx in zip(self._sp_x,self._sp_dk_dx)]
|
||||
# gradient with respect to parameters
|
||||
code_list += [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta,self._sp_dk_dtheta)]
|
||||
# gradient with respect to multiple output parameters
|
||||
if self.output_dim > 1:
|
||||
argument_sequence += self._sp_theta_i + self._sp_theta_j
|
||||
code_list += [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta_i,self._sp_dk_dtheta_i)]
|
||||
(foo_c,self._function_code), (foo_h,self._function_header) = \
|
||||
codegen(code_list, "C",'foobar',argument_sequence=argument_sequence)
|
||||
#put the header file where we can find it
|
||||
f = file(os.path.join(tempfile.gettempdir(),'foobar.h'),'w')
|
||||
f.write(self._function_header)
|
||||
f.close()
|
||||
|
||||
# Substitute any known derivatives which sympy doesn't compute
|
||||
self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
|
||||
|
||||
|
||||
############################################################
|
||||
# This is the basic argument construction for the C code. #
|
||||
############################################################
|
||||
|
||||
arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x]
|
||||
+ ["Z2(j, %s)"%z.name[2:] for z in self._sp_z])
|
||||
|
||||
# for multiple outputs need to also provide these arguments reversed.
|
||||
if self.output_dim>1:
|
||||
reverse_arg_list = list(arg_list)
|
||||
reverse_arg_list.reverse()
|
||||
|
||||
# Add in any 'shared' parameters to the list.
|
||||
param_arg_list = [shared_params.name for shared_params in self._sp_theta]
|
||||
arg_list += param_arg_list
|
||||
|
||||
precompute_list=[]
|
||||
if self.output_dim > 1:
|
||||
reverse_arg_list+=list(param_arg_list)
|
||||
split_param_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['ii', 'jj'] for theta in self._sp_theta_i]
|
||||
split_param_reverse_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['jj', 'ii'] for theta in self._sp_theta_i]
|
||||
arg_list += split_param_arg_list
|
||||
reverse_arg_list += split_param_reverse_arg_list
|
||||
# Extract the right output indices from the inputs.
|
||||
c_define_output_indices = [' '*16 + "int %s=(int)%s(%s, %i);"%(index, var, index2, self.input_dim-1) for index, var, index2 in zip(['ii', 'jj'], ['X2', 'Z2'], ['i', 'j'])]
|
||||
precompute_list += c_define_output_indices
|
||||
reverse_arg_string = ", ".join(reverse_arg_list)
|
||||
arg_string = ", ".join(arg_list)
|
||||
precompute_string = "\n".join(precompute_list)
|
||||
|
||||
# Code to compute the arguments string needed when only X is provided.
|
||||
X_arg_string = re.sub('Z','X',arg_string)
|
||||
# Code to compute argument string when only diagonal is required.
|
||||
diag_arg_string = re.sub('int jj','//int jj',X_arg_string)
|
||||
diag_arg_string = re.sub('j','i',diag_arg_string)
|
||||
if precompute_string == '':
|
||||
# if it's not multioutput, the precompute strings are set to zero
|
||||
diag_precompute_string = ''
|
||||
diag_precompute_replace = ''
|
||||
else:
|
||||
# for multioutput we need to extract the index of the output from the input.
|
||||
diag_precompute_string = precompute_list[0]
|
||||
diag_precompute_replace = precompute_list[1]
|
||||
|
||||
|
||||
# Here's the code to do the looping for K
|
||||
self._K_code =\
|
||||
"""
|
||||
// _K_code
|
||||
// Code for computing the covariance function.
|
||||
int i;
|
||||
int j;
|
||||
int N = target_array->dimensions[0];
|
||||
int num_inducing = target_array->dimensions[1];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for private(j)
|
||||
for (i=0;i<N;i++){
|
||||
for (j=0;j<num_inducing;j++){
|
||||
%s
|
||||
//target[i*num_inducing+j] =
|
||||
TARGET2(i, j) += k(%s);
|
||||
}
|
||||
}
|
||||
%s
|
||||
"""%(precompute_string,arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
self._K_code_X = """
|
||||
// _K_code_X
|
||||
// Code for computing the covariance function.
|
||||
int i;
|
||||
int j;
|
||||
int N = target_array->dimensions[0];
|
||||
int num_inducing = target_array->dimensions[1];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for private(j)
|
||||
for (i=0;i<N;i++){
|
||||
%s // int ii=(int)X2(i, 1);
|
||||
TARGET2(i, i) += k(%s);
|
||||
for (j=0;j<i;j++){
|
||||
%s //int jj=(int)X2(j, 1);
|
||||
double kval = k(%s); //double kval = k(X2(i, 0), shared_lengthscale, LENGTHSCALE1(ii), SCALE1(ii));
|
||||
TARGET2(i, j) += kval;
|
||||
TARGET2(j, i) += kval;
|
||||
}
|
||||
}
|
||||
/*%s*/
|
||||
"""%(diag_precompute_string, diag_arg_string, re.sub('Z2', 'X2', diag_precompute_replace), X_arg_string,str(self._sp_k)) #adding a string representation forces recompile when needed
|
||||
|
||||
# Code to do the looping for Kdiag
|
||||
self._Kdiag_code =\
|
||||
"""
|
||||
// _Kdiag_code
|
||||
// Code for computing diagonal of covariance function.
|
||||
int i;
|
||||
int N = target_array->dimensions[0];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for
|
||||
for (i=0;i<N;i++){
|
||||
%s
|
||||
//target[i] =
|
||||
TARGET1(i)=k(%s);
|
||||
}
|
||||
%s
|
||||
"""%(diag_precompute_string,diag_arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
# Code to compute gradients
|
||||
grad_func_list = []
|
||||
if self.output_dim>1:
|
||||
grad_func_list += c_define_output_indices
|
||||
grad_func_list += [' '*16 + 'TARGET1(%i+ii) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)]
|
||||
grad_func_list += [' '*16 + 'TARGET1(%i+jj) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)]
|
||||
grad_func_list += ([' '*16 + 'TARGET1(%i) += PARTIAL2(i, j)*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in enumerate(self._sp_theta)])
|
||||
grad_func_string = '\n'.join(grad_func_list)
|
||||
|
||||
self._dK_dtheta_code =\
|
||||
"""
|
||||
// _dK_dtheta_code
|
||||
// Code for computing gradient of covariance with respect to parameters.
|
||||
int i;
|
||||
int j;
|
||||
int N = partial_array->dimensions[0];
|
||||
int num_inducing = partial_array->dimensions[1];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for private(j)
|
||||
for (i=0;i<N;i++){
|
||||
for (j=0;j<num_inducing;j++){
|
||||
%s
|
||||
}
|
||||
}
|
||||
%s
|
||||
"""%(grad_func_string,"/*"+str(self._sp_k)+"*/") # adding a string representation forces recompile when needed
|
||||
|
||||
|
||||
# Code to compute gradients for Kdiag TODO: needs clean up
|
||||
diag_grad_func_string = re.sub('Z','X',grad_func_string,count=0)
|
||||
diag_grad_func_string = re.sub('int jj','//int jj',diag_grad_func_string)
|
||||
diag_grad_func_string = re.sub('j','i',diag_grad_func_string)
|
||||
diag_grad_func_string = re.sub('PARTIAL2\(i, i\)','PARTIAL1(i)',diag_grad_func_string)
|
||||
self._dKdiag_dtheta_code =\
|
||||
"""
|
||||
// _dKdiag_dtheta_code
|
||||
// Code for computing gradient of diagonal with respect to parameters.
|
||||
int i;
|
||||
int N = partial_array->dimensions[0];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
for (i=0;i<N;i++){
|
||||
%s
|
||||
}
|
||||
%s
|
||||
"""%(diag_grad_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
# Code for gradients wrt X, TODO: may need to deal with special case where one input is actually an output.
|
||||
gradX_func_list = []
|
||||
if self.output_dim>1:
|
||||
gradX_func_list += c_define_output_indices
|
||||
gradX_func_list += ["TARGET2(i, %i) += PARTIAL2(i, j)*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)]
|
||||
gradX_func_string = "\n".join(gradX_func_list)
|
||||
|
||||
self._dK_dX_code = \
|
||||
"""
|
||||
// _dK_dX_code
|
||||
// Code for computing gradient of covariance with respect to inputs.
|
||||
int i;
|
||||
int j;
|
||||
int N = partial_array->dimensions[0];
|
||||
int num_inducing = partial_array->dimensions[1];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for private(j)
|
||||
for (i=0;i<N; i++){
|
||||
for (j=0; j<num_inducing; j++){
|
||||
%s
|
||||
}
|
||||
}
|
||||
%s
|
||||
"""%(gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
|
||||
diag_gradX_func_string = re.sub('Z','X',gradX_func_string,count=0)
|
||||
diag_gradX_func_string = re.sub('int jj','//int jj',diag_gradX_func_string)
|
||||
diag_gradX_func_string = re.sub('j','i',diag_gradX_func_string)
|
||||
diag_gradX_func_string = re.sub('PARTIAL2\(i, i\)','2*PARTIAL1(i)',diag_gradX_func_string)
|
||||
|
||||
# Code for gradients of Kdiag wrt X
|
||||
self._dKdiag_dX_code= \
|
||||
"""
|
||||
// _dKdiag_dX_code
|
||||
// Code for computing gradient of diagonal with respect to inputs.
|
||||
int N = partial_array->dimensions[0];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
for (int i=0;i<N; i++){
|
||||
%s
|
||||
}
|
||||
%s
|
||||
"""%(diag_gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a
|
||||
# string representation forces recompile when needed Get rid
|
||||
# of Zs in argument for diagonal. TODO: Why wasn't
|
||||
# diag_func_string called here? Need to check that.
|
||||
#self._dKdiag_dX_code = self._dKdiag_dX_code.replace('Z[j', 'X[i')
|
||||
|
||||
# Code to use when only X is provided.
|
||||
self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[')
|
||||
self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= PARTIAL2(', '+= 2*PARTIAL2(')
|
||||
self._dK_dtheta_code_X = self._dK_dtheta_code_X.replace('Z2(', 'X2(')
|
||||
self._dK_dX_code_X = self._dK_dX_code_X.replace('Z2(', 'X2(')
|
||||
|
||||
|
||||
#TODO: insert multiple functions here via string manipulation
|
||||
#TODO: similar functions for psi_stats
|
||||
def _get_arg_names(self, Z=None, partial=None):
|
||||
arg_names = ['target','X']
|
||||
for shared_params in self._sp_theta:
|
||||
arg_names += [shared_params.name]
|
||||
if Z is not None:
|
||||
arg_names += ['Z']
|
||||
if partial is not None:
|
||||
arg_names += ['partial']
|
||||
if self.output_dim>1:
|
||||
arg_names += self._split_theta_names
|
||||
arg_names += ['output_dim']
|
||||
return arg_names
|
||||
|
||||
def _weave_inline(self, code, X, target, Z=None, partial=None):
|
||||
output_dim = self.output_dim
|
||||
for shared_params in self._sp_theta:
|
||||
locals()[shared_params.name] = getattr(self, shared_params.name)
|
||||
|
||||
# Need to extract parameters first
|
||||
for split_params in self._split_theta_names:
|
||||
locals()[split_params] = getattr(self, split_params)
|
||||
arg_names = self._get_arg_names(Z, partial)
|
||||
weave.inline(code=code, arg_names=arg_names,**self.weave_kwargs)
|
||||
|
||||
def K(self,X,Z,target):
|
||||
if Z is None:
|
||||
self._weave_inline(self._K_code_X, X, target)
|
||||
else:
|
||||
self._weave_inline(self._K_code, X, target, Z)
|
||||
|
||||
|
||||
def Kdiag(self,X,target):
|
||||
self._weave_inline(self._Kdiag_code, X, target)
|
||||
|
||||
def dK_dtheta(self,partial,X,Z,target):
|
||||
if Z is None:
|
||||
self._weave_inline(self._dK_dtheta_code_X, X, target, Z, partial)
|
||||
else:
|
||||
self._weave_inline(self._dK_dtheta_code, X, target, Z, partial)
|
||||
|
||||
def dKdiag_dtheta(self,partial,X,target):
|
||||
self._weave_inline(self._dKdiag_dtheta_code, X, target, Z=None, partial=partial)
|
||||
|
||||
def dK_dX(self,partial,X,Z,target):
|
||||
if Z is None:
|
||||
self._weave_inline(self._dK_dX_code_X, X, target, Z, partial)
|
||||
else:
|
||||
self._weave_inline(self._dK_dX_code, X, target, Z, partial)
|
||||
|
||||
def dKdiag_dX(self,partial,X,target):
|
||||
self._weave_inline(self._dKdiag_dX_code, X, target, Z=None, partial=partial)
|
||||
|
||||
def compute_psi_stats(self):
|
||||
#define some normal distributions
|
||||
mus = [sp.var('mu_%i'%i,real=True) for i in range(self.input_dim)]
|
||||
Ss = [sp.var('S_%i'%i,positive=True) for i in range(self.input_dim)]
|
||||
normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]
|
||||
|
||||
#do some integration!
|
||||
#self._sp_psi0 = ??
|
||||
self._sp_psi1 = self._sp_k
|
||||
for i in range(self.input_dim):
|
||||
print 'performing integrals %i of %i'%(i+1,2*self.input_dim)
|
||||
sys.stdout.flush()
|
||||
self._sp_psi1 *= normals[i]
|
||||
self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
|
||||
clear_cache()
|
||||
self._sp_psi1 = self._sp_psi1.simplify()
|
||||
|
||||
#and here's psi2 (eek!)
|
||||
zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
|
||||
self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
|
||||
for i in range(self.input_dim):
|
||||
print 'performing integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
|
||||
sys.stdout.flush()
|
||||
self._sp_psi2 *= normals[i]
|
||||
self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
|
||||
clear_cache()
|
||||
self._sp_psi2 = self._sp_psi2.simplify()
|
||||
|
||||
|
||||
def _set_params(self,param):
|
||||
assert param.size == (self.num_params)
|
||||
for i, shared_params in enumerate(self._sp_theta):
|
||||
setattr(self, shared_params.name, param[i])
|
||||
|
||||
if self.output_dim>1:
|
||||
for i, split_params in enumerate(self._split_theta_names):
|
||||
start = self.num_shared_params + i*self.output_dim
|
||||
end = self.num_shared_params + (i+1)*self.output_dim
|
||||
setattr(self, split_params, param[start:end])
|
||||
|
||||
|
||||
def _get_params(self):
|
||||
params = np.zeros(0)
|
||||
for shared_params in self._sp_theta:
|
||||
params = np.hstack((params, getattr(self, shared_params.name)))
|
||||
if self.output_dim>1:
|
||||
for split_params in self._split_theta_names:
|
||||
params = np.hstack((params, getattr(self, split_params).flatten()))
|
||||
return params
|
||||
|
||||
def _get_param_names(self):
|
||||
if self.output_dim>1:
|
||||
return [x.name for x in self._sp_theta] + [x.name[:-2] + str(i) for x in self._sp_theta_i for i in range(self.output_dim)]
|
||||
else:
|
||||
return [x.name for x in self._sp_theta]
|
||||
|
|
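Given the docstring above, constructing a sympy-backed kernel part boils down to writing the covariance as an expression in x_0, z_0 and named parameters; a minimal, hedged construction sketch (expression and import path are illustrative, not taken from this commit):

    import sympy as sp
    from GPy.kern.parts.sympykern import spkern  # path assumed

    x_0, z_0 = sp.symbols('x_0 z_0')
    variance, lengthscale = sp.symbols('variance lengthscale', positive=True)

    # a 1-D RBF written as a sympy expression of x_0 and z_0
    k_expr = variance * sp.exp(-(x_0 - z_0) ** 2 / (2 * lengthscale ** 2))

    # parameter names are picked up from the free symbols; initial values can be passed via `param`
    part = spkern(input_dim=1, k=k_expr, param={variance: 1.0, lengthscale: 2.0})
    print part._get_param_names()  # e.g. ['lengthscale', 'variance']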
@@ -1,10 +1,10 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)


from kernpart import Kernpart
import numpy as np
class white(Kernpart):

class White(Kernpart):
    """
    White noise kernel.

@@ -51,10 +51,10 @@ class white(Kernpart):
        pass

    def psi0(self,Z,mu,S,target):
        target += self.variance
        pass # target += self.variance

    def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
        target += dL_dpsi0.sum()
        pass # target += dL_dpsi0.sum()

    def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
        pass
@@ -1,10 +0,0 @@
#include <math.h>
double DiracDelta(double x){
  if((x<0.000001) & (x>-0.000001))//go on, laught at my c++ skills
    return 1.0;
  else
    return 0.0;
};
double DiracDelta(double x,int foo){
  return 0.0;
};

@@ -1,3 +0,0 @@
#include <math.h>
double DiracDelta(double x);
double DiracDelta(double x, int foo);

@ -1,258 +0,0 @@
|
|||
import numpy as np
|
||||
import sympy as sp
|
||||
from sympy.utilities.codegen import codegen
|
||||
from sympy.core.cache import clear_cache
|
||||
from scipy import weave
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
|
||||
import tempfile
|
||||
import pdb
|
||||
from kernpart import Kernpart
|
||||
|
||||
class spkern(Kernpart):
|
||||
"""
|
||||
A kernel object, where all the hard work in done by sympy.
|
||||
|
||||
:param k: the covariance function
|
||||
:type k: a positive definite sympy function of x1, z1, x2, z2...
|
||||
|
||||
To construct a new sympy kernel, you'll need to define:
|
||||
- a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
|
||||
- that's it! we'll extract the variables from the function k.
|
||||
|
||||
Note:
|
||||
- to handle multiple inputs, call them x1, z1, etc
|
||||
- to handle multpile correlated outputs, you'll need to define each covariance function and 'cross' variance function. TODO
|
||||
"""
|
||||
def __init__(self,input_dim,k,param=None):
|
||||
self.name='sympykern'
|
||||
self._sp_k = k
|
||||
sp_vars = [e for e in k.atoms() if e.is_Symbol]
|
||||
self._sp_x= sorted([e for e in sp_vars if e.name[0]=='x'],key=lambda x:int(x.name[1:]))
|
||||
self._sp_z= sorted([e for e in sp_vars if e.name[0]=='z'],key=lambda z:int(z.name[1:]))
|
||||
assert all([x.name=='x%i'%i for i,x in enumerate(self._sp_x)])
|
||||
assert all([z.name=='z%i'%i for i,z in enumerate(self._sp_z)])
|
||||
assert len(self._sp_x)==len(self._sp_z)
|
||||
self.input_dim = len(self._sp_x)
|
||||
assert self.input_dim == input_dim
|
||||
self._sp_theta = sorted([e for e in sp_vars if not (e.name[0]=='x' or e.name[0]=='z')],key=lambda e:e.name)
|
||||
self.num_params = len(self._sp_theta)
|
||||
|
||||
#deal with param
|
||||
if param is None:
|
||||
param = np.ones(self.num_params)
|
||||
assert param.size==self.num_params
|
||||
self._set_params(param)
|
||||
|
||||
#Differentiate!
|
||||
self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
|
||||
self._sp_dk_dx = [sp.diff(k,xi).simplify() for xi in self._sp_x]
|
||||
#self._sp_dk_dz = [sp.diff(k,zi) for zi in self._sp_z]
|
||||
|
||||
#self.compute_psi_stats()
|
||||
self._gen_code()
|
||||
|
||||
self.weave_kwargs = {\
|
||||
'support_code':self._function_code,\
|
||||
'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'kern/')],\
|
||||
'headers':['"sympy_helpers.h"'],\
|
||||
'sources':[os.path.join(current_dir,"kern/sympy_helpers.cpp")],\
|
||||
#'extra_compile_args':['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'],\
|
||||
'extra_compile_args':[],\
|
||||
'extra_link_args':['-lgomp'],\
|
||||
'verbose':True}
|
||||
|
||||
def __add__(self,other):
|
||||
return spkern(self._sp_k+other._sp_k)
|
||||
|
||||
def compute_psi_stats(self):
|
||||
#define some normal distributions
|
||||
mus = [sp.var('mu%i'%i,real=True) for i in range(self.input_dim)]
|
||||
Ss = [sp.var('S%i'%i,positive=True) for i in range(self.input_dim)]
|
||||
normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]
|
||||
|
||||
#do some integration!
|
||||
#self._sp_psi0 = ??
|
||||
self._sp_psi1 = self._sp_k
|
||||
for i in range(self.input_dim):
|
||||
print 'perfoming integrals %i of %i'%(i+1,2*self.input_dim)
|
||||
sys.stdout.flush()
|
||||
self._sp_psi1 *= normals[i]
|
||||
self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
|
||||
clear_cache()
|
||||
self._sp_psi1 = self._sp_psi1.simplify()
|
||||
|
||||
#and here's psi2 (eek!)
|
||||
zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
|
||||
self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
|
||||
for i in range(self.input_dim):
|
||||
print 'perfoming integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
|
||||
sys.stdout.flush()
|
||||
self._sp_psi2 *= normals[i]
|
||||
self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
|
||||
clear_cache()
|
||||
self._sp_psi2 = self._sp_psi2.simplify()
|
||||
|
||||
|
||||
def _gen_code(self):
|
||||
#generate c functions from sympy objects
|
||||
(foo_c,self._function_code),(foo_h,self._function_header) = \
|
||||
codegen([('k',self._sp_k)] \
|
||||
+ [('dk_d%s'%x.name,dx) for x,dx in zip(self._sp_x,self._sp_dk_dx)]\
|
||||
#+ [('dk_d%s'%z.name,dz) for z,dz in zip(self._sp_z,self._sp_dk_dz)]\
|
||||
+ [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta,self._sp_dk_dtheta)]\
|
||||
,"C",'foobar',argument_sequence=self._sp_x+self._sp_z+self._sp_theta)
|
||||
#put the header file where we can find it
|
||||
f = file(os.path.join(tempfile.gettempdir(),'foobar.h'),'w')
|
||||
f.write(self._function_header)
|
||||
f.close()
|
||||
|
||||
#get rid of derivatives of DiracDelta
|
||||
self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
|
||||
|
||||
#Here's some code to do the looping for K
|
||||
arglist = ", ".join(["X[i*input_dim+%s]"%x.name[1:] for x in self._sp_x]\
|
||||
+ ["Z[j*input_dim+%s]"%z.name[1:] for z in self._sp_z]\
|
||||
+ ["param[%i]"%i for i in range(self.num_params)])
|
||||
|
||||
self._K_code =\
|
||||
"""
|
||||
int i;
|
||||
int j;
|
||||
int N = target_array->dimensions[0];
|
||||
int num_inducing = target_array->dimensions[1];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for private(j)
|
||||
for (i=0;i<N;i++){
|
||||
for (j=0;j<num_inducing;j++){
|
||||
target[i*num_inducing+j] = k(%s);
|
||||
}
|
||||
}
|
||||
%s
|
||||
"""%(arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
diag_arglist = re.sub('Z','X',arglist)
|
||||
diag_arglist = re.sub('j','i',diag_arglist)
|
||||
#Here's some code to do the looping for Kdiag
|
||||
self._Kdiag_code =\
|
||||
"""
|
||||
int i;
|
||||
int N = target_array->dimensions[0];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for
|
||||
for (i=0;i<N;i++){
|
||||
target[i] = k(%s);
|
||||
}
|
||||
%s
|
||||
"""%(diag_arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
#here's some code to compute gradients
|
||||
funclist = '\n'.join([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arglist) for i,theta in enumerate(self._sp_theta)])
|
||||
self._dK_dtheta_code =\
|
||||
"""
|
||||
int i;
|
||||
int j;
|
||||
int N = partial_array->dimensions[0];
|
||||
int num_inducing = partial_array->dimensions[1];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for private(j)
|
||||
for (i=0;i<N;i++){
|
||||
for (j=0;j<num_inducing;j++){
|
||||
%s
|
||||
}
|
||||
}
|
||||
%s
|
||||
"""%(funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
#here's some code to compute gradients for Kdiag TODO: thius is yucky.
|
||||
diag_funclist = re.sub('Z','X',funclist,count=0)
|
||||
diag_funclist = re.sub('j','i',diag_funclist)
|
||||
diag_funclist = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_funclist)
|
||||
self._dKdiag_dtheta_code =\
|
||||
"""
|
||||
int i;
|
||||
int N = partial_array->dimensions[0];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
for (i=0;i<N;i++){
|
||||
%s
|
||||
}
|
||||
%s
|
||||
"""%(diag_funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
#Here's some code to do gradients wrt x
|
||||
gradient_funcs = "\n".join(["target[i*input_dim+%i] += partial[i*num_inducing+j]*dk_dx%i(%s);"%(q,q,arglist) for q in range(self.input_dim)])
|
||||
self._dK_dX_code = \
|
||||
"""
|
||||
int i;
|
||||
int j;
|
||||
int N = partial_array->dimensions[0];
|
||||
int num_inducing = partial_array->dimensions[1];
|
||||
int input_dim = X_array->dimensions[1];
|
||||
//#pragma omp parallel for private(j)
|
||||
for (i=0;i<N; i++){
|
||||
for (j=0; j<num_inducing; j++){
|
||||
%s
|
||||
//if(isnan(target[i*input_dim+2])){printf("%%f\\n",dk_dx2(X[i*input_dim+0], X[i*input_dim+1], X[i*input_dim+2], Z[j*input_dim+0], Z[j*input_dim+1], Z[j*input_dim+2], param[0], param[1], param[2], param[3], param[4], param[5]));}
|
||||
//if(isnan(target[i*input_dim+2])){printf("%%f,%%f,%%i,%%i\\n", X[i*input_dim+2], Z[j*input_dim+2],i,j);}
|
||||
|
||||
}
|
||||
}
|
||||
%s
|
||||
"""%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
#now for gradients of Kdiag wrt X
|
||||
self._dKdiag_dX_code= \
|
||||
"""
|
||||
int i;
|
||||
int j;
|
||||
int N = partial_array->dimensions[0];
|
||||
int num_inducing = 0;
|
||||
int input_dim = X_array->dimensions[1];
|
||||
for (i=0;i<N; i++){
|
||||
j = i;
|
||||
%s
|
||||
}
|
||||
%s
|
||||
"""%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
|
||||
|
||||
|
||||
#TODO: insert multiple functions here via string manipulation
|
||||
#TODO: similar functions for psi_stats
|
||||
|
||||
def K(self,X,Z,target):
|
||||
param = self._param
|
||||
weave.inline(self._K_code,arg_names=['target','X','Z','param'],**self.weave_kwargs)
|
||||
|
||||
def Kdiag(self,X,target):
|
||||
param = self._param
|
||||
weave.inline(self._Kdiag_code,arg_names=['target','X','param'],**self.weave_kwargs)
|
||||
|
||||
def dK_dtheta(self,partial,X,Z,target):
|
||||
param = self._param
|
||||
weave.inline(self._dK_dtheta_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
|
||||
|
||||
def dKdiag_dtheta(self,partial,X,target):
|
||||
param = self._param
|
||||
Z = X
|
||||
weave.inline(self._dKdiag_dtheta_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
|
||||
|
||||
def dK_dX(self,partial,X,Z,target):
|
||||
param = self._param
|
||||
weave.inline(self._dK_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
|
||||
|
||||
def dKdiag_dX(self,partial,X,target):
|
||||
param = self._param
|
||||
Z = X
|
||||
weave.inline(self._dKdiag_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
|
||||
|
||||
def _set_params(self,param):
|
||||
#print param.flags['C_CONTIGUOUS']
|
||||
self._param = param.copy()
|
||||
|
||||
def _get_params(self):
|
||||
return self._param
|
||||
|
||||
def _get_param_names(self):
|
||||
return [x.name for x in self._sp_theta]
|
||||
|
|
@@ -1,4 +1,7 @@
from ep import EP
from laplace import Laplace
from ep_mixed_noise import EP_Mixed_Noise
from gaussian import Gaussian
# TODO: from Laplace import Laplace
import likelihood_functions as functions
from gaussian_mixed_noise import Gaussian_Mixed_Noise
import noise_models
from noise_model_constructors import *

@ -4,60 +4,87 @@ from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
|
|||
from likelihood import likelihood
|
||||
|
||||
class EP(likelihood):
|
||||
def __init__(self,data,LikelihoodFunction,epsilon=1e-3,power_ep=[1.,1.]):
|
||||
def __init__(self,data,noise_model):
|
||||
"""
|
||||
Expectation Propagation
|
||||
|
||||
Arguments
|
||||
---------
|
||||
epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
|
||||
LikelihoodFunction : a likelihood function (see likelihood_functions.py)
|
||||
:param data: data to model
|
||||
:type data: numpy array
|
||||
:param noise_model: noise distribution
|
||||
:type noise_model: A GPy noise model
|
||||
|
||||
"""
|
||||
self.LikelihoodFunction = LikelihoodFunction
|
||||
self.epsilon = epsilon
|
||||
self.eta, self.delta = power_ep
|
||||
self.noise_model = noise_model
|
||||
self.data = data
|
||||
self.N, self.output_dim = self.data.shape
|
||||
self.num_data, self.output_dim = self.data.shape
|
||||
self.is_heteroscedastic = True
|
||||
self.Nparams = 0
|
||||
self._transf_data = self.LikelihoodFunction._preprocess_values(data)
|
||||
self.num_params = 0
|
||||
|
||||
#Initial values - Likelihood approximation parameters:
|
||||
#p(y|f) = t(f|tau_tilde,v_tilde)
|
||||
self.tau_tilde = np.zeros(self.N)
|
||||
self.v_tilde = np.zeros(self.N)
|
||||
self.tau_tilde = np.zeros(self.num_data)
|
||||
self.v_tilde = np.zeros(self.num_data)
|
||||
|
||||
#initial values for the GP variables
|
||||
self.Y = np.zeros((self.N,1))
|
||||
self.covariance_matrix = np.eye(self.N)
|
||||
self.precision = np.ones(self.N)[:,None]
|
||||
self.Y = np.zeros((self.num_data,1))
|
||||
self.covariance_matrix = np.eye(self.num_data)
|
||||
self.precision = np.ones(self.num_data)[:,None]
|
||||
self.Z = 0
|
||||
self.YYT = None
|
||||
self.V = self.precision * self.Y
|
||||
self.VVT_factor = self.V
|
||||
self.trYYT = 0.
|
||||
|
||||
super(EP, self).__init__()
|
||||
|
||||
def restart(self):
|
||||
self.tau_tilde = np.zeros(self.N)
|
||||
self.v_tilde = np.zeros(self.N)
|
||||
self.Y = np.zeros((self.N,1))
|
||||
self.covariance_matrix = np.eye(self.N)
|
||||
self.precision = np.ones(self.N)[:,None]
|
||||
self.tau_tilde = np.zeros(self.num_data)
|
||||
self.v_tilde = np.zeros(self.num_data)
|
||||
self.Y = np.zeros((self.num_data,1))
|
||||
self.covariance_matrix = np.eye(self.num_data)
|
||||
self.precision = np.ones(self.num_data)[:,None]
|
||||
self.Z = 0
|
||||
self.YYT = None
|
||||
self.V = self.precision * self.Y
|
||||
self.VVT_factor = self.V
|
||||
self.trYYT = 0.
|
||||
|
||||
def predictive_values(self,mu,var,full_cov):
|
||||
def predictive_values(self,mu,var,full_cov,**noise_args):
|
||||
if full_cov:
|
||||
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
|
||||
return self.LikelihoodFunction.predictive_values(mu,var)
|
||||
return self.noise_model.predictive_values(mu,var,**noise_args)
|
||||
|
||||
def log_predictive_density(self, y_test, mu_star, var_star):
|
||||
"""
|
||||
Calculation of the log predictive density
|
||||
|
||||
.. math::
    p(y_{*}|D) = \int p(y_{*}|f_{*}) p(f_{*}|\mu_{*}, \sigma^{2}_{*}) df_{*}
|
||||
|
||||
:param y_test: test observations (y_{*})
|
||||
:type y_test: (Nx1) array
|
||||
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type mu_star: (Nx1) array
|
||||
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type var_star: (Nx1) array
|
||||
"""
|
||||
return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
|
||||
|
||||
def _get_params(self):
|
||||
return np.zeros(0)
|
||||
#return np.zeros(0)
|
||||
return self.noise_model._get_params()
|
||||
|
||||
def _get_param_names(self):
|
||||
return []
|
||||
#return []
|
||||
return self.noise_model._get_param_names()
|
||||
|
||||
def _set_params(self,p):
|
||||
pass # TODO: the EP likelihood might want to take some parameters...
|
||||
#pass # TODO: the EP likelihood might want to take some parameters...
|
||||
self.noise_model._set_params(p)
|
||||
|
||||
def _gradients(self,partial):
|
||||
return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
|
||||
#return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
|
||||
return self.noise_model._gradients(partial)
|
||||
|
||||
def _compute_GP_variables(self):
|
||||
#Variables to be called from GP
|
||||
|
|
@ -65,20 +92,32 @@ class EP(likelihood):
|
|||
sigma_sum = 1./self.tau_ + 1./self.tau_tilde
|
||||
mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
|
||||
self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
|
||||
self.Z += 0.5*self.num_data*np.log(2*np.pi)
|
||||
|
||||
self.Y = mu_tilde[:,None]
|
||||
self.YYT = np.dot(self.Y,self.Y.T)
|
||||
self.covariance_matrix = np.diag(1./self.tau_tilde)
|
||||
self.precision = self.tau_tilde[:,None]
|
||||
self.V = self.precision * self.Y
|
||||
self.VVT_factor = self.V
|
||||
self.trYYT = np.trace(self.YYT)
|
||||
|
||||
def fit_full(self,K):
|
||||
def fit_full(self, K, epsilon=1e-3,power_ep=[1.,1.]):
|
||||
"""
|
||||
The expectation-propagation algorithm.
|
||||
For nomenclature see Rasmussen & Williams 2006.
|
||||
|
||||
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
|
||||
:type epsilon: float
|
||||
:param power_ep: Power EP parameters
|
||||
:type power_ep: list of floats
|
||||
|
||||
"""
|
||||
self.epsilon = epsilon
|
||||
self.eta, self.delta = power_ep
|
||||
|
||||
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
|
||||
mu = np.zeros(self.N)
|
||||
mu = np.zeros(self.num_data)
|
||||
Sigma = K.copy()
|
||||
|
||||
"""
|
||||
|
|
@ -87,15 +126,15 @@ class EP(likelihood):
|
|||
sigma_ = 1./tau_
|
||||
mu_ = v_/tau_
|
||||
"""
|
||||
self.tau_ = np.empty(self.N,dtype=float)
|
||||
self.v_ = np.empty(self.N,dtype=float)
|
||||
self.tau_ = np.empty(self.num_data,dtype=float)
|
||||
self.v_ = np.empty(self.num_data,dtype=float)
|
||||
|
||||
#Initial values - Marginal moments
|
||||
z = np.empty(self.N,dtype=float)
|
||||
self.Z_hat = np.empty(self.N,dtype=float)
|
||||
phi = np.empty(self.N,dtype=float)
|
||||
mu_hat = np.empty(self.N,dtype=float)
|
||||
sigma2_hat = np.empty(self.N,dtype=float)
|
||||
z = np.empty(self.num_data,dtype=float)
|
||||
self.Z_hat = np.empty(self.num_data,dtype=float)
|
||||
phi = np.empty(self.num_data,dtype=float)
|
||||
mu_hat = np.empty(self.num_data,dtype=float)
|
||||
sigma2_hat = np.empty(self.num_data,dtype=float)
|
||||
|
||||
#Approximation
|
||||
epsilon_np1 = self.epsilon + 1.
|
||||
|
|
@ -104,13 +143,13 @@ class EP(likelihood):
|
|||
self.np1 = [self.tau_tilde.copy()]
|
||||
self.np2 = [self.v_tilde.copy()]
|
||||
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
|
||||
update_order = np.random.permutation(self.N)
|
||||
update_order = np.random.permutation(self.num_data)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
|
||||
self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
|
||||
#Marginal moments
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
|
||||
#Site parameters update
|
||||
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
|
||||
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
|
||||
|
|
@ -122,23 +161,32 @@ class EP(likelihood):
|
|||
self.iterations += 1
|
||||
#Sigma recomputation with Cholesky decomposition
|
||||
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
|
||||
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
|
||||
B = np.eye(self.num_data) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
|
||||
L = jitchol(B)
|
||||
V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
|
||||
Sigma = K - np.dot(V.T,V)
|
||||
mu = np.dot(Sigma,self.v_tilde)
|
||||
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
|
||||
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
|
||||
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
|
||||
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
|
||||
self.np1.append(self.tau_tilde.copy())
|
||||
self.np2.append(self.v_tilde.copy())
|
||||
|
||||
return self._compute_GP_variables()
|
||||
|
||||
def fit_DTC(self, Kmm, Kmn):
|
||||
def fit_DTC(self, Kmm, Kmn, epsilon=1e-3,power_ep=[1.,1.]):
|
||||
"""
|
||||
The expectation-propagation algorithm with sparse pseudo-input.
|
||||
For nomenclature see ... 2013.
|
||||
|
||||
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
|
||||
:type epsilon: float
|
||||
:param power_ep: Power EP parameters
|
||||
:type power_ep: list of floats
|
||||
|
||||
"""
|
||||
self.epsilon = epsilon
|
||||
self.eta, self.delta = power_ep
|
||||
|
||||
num_inducing = Kmm.shape[0]
|
||||
|
||||
#TODO: this doesn't work with uncertain inputs!
|
||||
|
|
@ -167,7 +215,7 @@ class EP(likelihood):
|
|||
Sigma = Diag + P*R.T*R*P.T + K
|
||||
mu = w + P*Gamma
|
||||
"""
|
||||
mu = np.zeros(self.N)
|
||||
mu = np.zeros(self.num_data)
|
||||
LLT = Kmm.copy()
|
||||
Sigma_diag = Qnn_diag.copy()
|
||||
|
||||
|
|
@ -177,15 +225,15 @@ class EP(likelihood):
|
|||
sigma_ = 1./tau_
|
||||
mu_ = v_/tau_
|
||||
"""
|
||||
self.tau_ = np.empty(self.N,dtype=float)
|
||||
self.v_ = np.empty(self.N,dtype=float)
|
||||
self.tau_ = np.empty(self.num_data,dtype=float)
|
||||
self.v_ = np.empty(self.num_data,dtype=float)
|
||||
|
||||
#Initial values - Marginal moments
|
||||
z = np.empty(self.N,dtype=float)
|
||||
self.Z_hat = np.empty(self.N,dtype=float)
|
||||
phi = np.empty(self.N,dtype=float)
|
||||
mu_hat = np.empty(self.N,dtype=float)
|
||||
sigma2_hat = np.empty(self.N,dtype=float)
|
||||
z = np.empty(self.num_data,dtype=float)
|
||||
self.Z_hat = np.empty(self.num_data,dtype=float)
|
||||
phi = np.empty(self.num_data,dtype=float)
|
||||
mu_hat = np.empty(self.num_data,dtype=float)
|
||||
sigma2_hat = np.empty(self.num_data,dtype=float)
|
||||
|
||||
#Approximation
|
||||
epsilon_np1 = 1
|
||||
|
|
@ -194,13 +242,13 @@ class EP(likelihood):
|
|||
np1 = [self.tau_tilde.copy()]
|
||||
np2 = [self.v_tilde.copy()]
|
||||
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
|
||||
update_order = np.random.permutation(self.N)
|
||||
update_order = np.random.permutation(self.num_data)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
|
||||
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
|
||||
#Marginal moments
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
|
||||
#Site parameters update
|
||||
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
|
||||
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
|
||||
|
|
@ -223,18 +271,26 @@ class EP(likelihood):
|
|||
Sigma_diag = np.sum(V*V,-2)
|
||||
Knmv_tilde = np.dot(Kmn,self.v_tilde)
|
||||
mu = np.dot(V2.T,Knmv_tilde)
|
||||
epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
|
||||
epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
|
||||
epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.num_data
|
||||
epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.num_data
|
||||
np1.append(self.tau_tilde.copy())
|
||||
np2.append(self.v_tilde.copy())
|
||||
|
||||
self._compute_GP_variables()
|
||||
|
||||
def fit_FITC(self, Kmm, Kmn, Knn_diag):
|
||||
def fit_FITC(self, Kmm, Kmn, Knn_diag, epsilon=1e-3,power_ep=[1.,1.]):
|
||||
"""
|
||||
The expectation-propagation algorithm with sparse pseudo-input.
|
||||
For nomenclature see Naish-Guzman and Holden, 2008.
|
||||
|
||||
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
|
||||
:type epsilon: float
|
||||
:param power_ep: Power EP parameters
|
||||
:type power_ep: list of floats
|
||||
"""
|
||||
self.epsilon = epsilon
|
||||
self.eta, self.delta = power_ep
|
||||
|
||||
num_inducing = Kmm.shape[0]
|
||||
|
||||
"""
|
||||
|
|
@ -257,9 +313,9 @@ class EP(likelihood):
|
|||
Sigma = Diag + P*R.T*R*P.T + K
|
||||
mu = w + P*Gamma
|
||||
"""
|
||||
self.w = np.zeros(self.N)
|
||||
self.w = np.zeros(self.num_data)
|
||||
self.Gamma = np.zeros(num_inducing)
|
||||
mu = np.zeros(self.N)
|
||||
mu = np.zeros(self.num_data)
|
||||
P = P0.copy()
|
||||
R = R0.copy()
|
||||
Diag = Diag0.copy()
|
||||
|
|
@ -272,15 +328,15 @@ class EP(likelihood):
|
|||
sigma_ = 1./tau_
|
||||
mu_ = v_/tau_
|
||||
"""
|
||||
self.tau_ = np.empty(self.N,dtype=float)
|
||||
self.v_ = np.empty(self.N,dtype=float)
|
||||
self.tau_ = np.empty(self.num_data,dtype=float)
|
||||
self.v_ = np.empty(self.num_data,dtype=float)
|
||||
|
||||
#Initial values - Marginal moments
|
||||
z = np.empty(self.N,dtype=float)
|
||||
self.Z_hat = np.empty(self.N,dtype=float)
|
||||
phi = np.empty(self.N,dtype=float)
|
||||
mu_hat = np.empty(self.N,dtype=float)
|
||||
sigma2_hat = np.empty(self.N,dtype=float)
|
||||
z = np.empty(self.num_data,dtype=float)
|
||||
self.Z_hat = np.empty(self.num_data,dtype=float)
|
||||
phi = np.empty(self.num_data,dtype=float)
|
||||
mu_hat = np.empty(self.num_data,dtype=float)
|
||||
sigma2_hat = np.empty(self.num_data,dtype=float)
|
||||
|
||||
#Approximation
|
||||
epsilon_np1 = 1
|
||||
|
|
@ -289,13 +345,13 @@ class EP(likelihood):
|
|||
self.np1 = [self.tau_tilde.copy()]
|
||||
self.np2 = [self.v_tilde.copy()]
|
||||
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
|
||||
update_order = np.random.permutation(self.N)
|
||||
update_order = np.random.permutation(self.num_data)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
|
||||
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
|
||||
#Marginal moments
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
|
||||
#Site parameters update
|
||||
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
|
||||
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
|
||||
|
|
@ -328,8 +384,8 @@ class EP(likelihood):
|
|||
self.w = Diag * self.v_tilde
|
||||
self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
|
||||
mu = self.w + np.dot(P,self.Gamma)
|
||||
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
|
||||
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
|
||||
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
|
||||
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
|
||||
self.np1.append(self.tau_tilde.copy())
|
||||
self.np2.append(self.v_tilde.copy())
|
||||
|
||||
|
|
|
|||
385
GPy/likelihoods/ep_mixed_noise.py
Normal file
385
GPy/likelihoods/ep_mixed_noise.py
Normal file
|
|
@ -0,0 +1,385 @@
|
|||
# Copyright (c) 2013, Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
|
||||
from likelihood import likelihood
|
||||
|
||||
class EP_Mixed_Noise(likelihood):
|
||||
def __init__(self,data_list,noise_model_list,epsilon=1e-3,power_ep=[1.,1.]):
|
||||
"""
|
||||
Expectation Propagation
|
||||
|
||||
Arguments
|
||||
---------
|
||||
:param data_list: list of outputs
|
||||
:param noise_model_list: a list of noise models
|
||||
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations
|
||||
:type epsilon: float
|
||||
:param power_ep: list of power ep parameters
|
||||
"""
|
||||
assert len(data_list) == len(noise_model_list)
|
||||
self.noise_model_list = noise_model_list
|
||||
n_list = [data.size for data in data_list]
|
||||
self.n_models = len(data_list)
|
||||
self.n_params = [noise_model._get_params().size for noise_model in noise_model_list]
|
||||
self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.n_models),n_list)])
|
||||
self.epsilon = epsilon
|
||||
self.eta, self.delta = power_ep
|
||||
self.data = np.vstack(data_list)
|
||||
self.N, self.output_dim = self.data.shape
|
||||
self.is_heteroscedastic = True
|
||||
self.num_params = 0#FIXME
|
||||
self._transf_data = np.vstack([noise_model._preprocess_values(data) for noise_model,data in zip(noise_model_list,data_list)])
|
||||
#TODO non-gaussian index
|
||||
|
||||
#Initial values - Likelihood approximation parameters:
|
||||
#p(y|f) = t(f|tau_tilde,v_tilde)
|
||||
self.tau_tilde = np.zeros(self.N)
|
||||
self.v_tilde = np.zeros(self.N)
|
||||
|
||||
#initial values for the GP variables
|
||||
self.Y = np.zeros((self.N,1))
|
||||
self.covariance_matrix = np.eye(self.N)
|
||||
self.precision = np.ones(self.N)[:,None]
|
||||
self.Z = 0
|
||||
self.YYT = None
|
||||
self.V = self.precision * self.Y
|
||||
self.VVT_factor = self.V
|
||||
self.trYYT = 0.
|
||||
|
||||
def restart(self):
|
||||
self.tau_tilde = np.zeros(self.N)
|
||||
self.v_tilde = np.zeros(self.N)
|
||||
self.Y = np.zeros((self.N,1))
|
||||
self.covariance_matrix = np.eye(self.N)
|
||||
self.precision = np.ones(self.N)[:,None]
|
||||
self.Z = 0
|
||||
self.YYT = None
|
||||
self.V = self.precision * self.Y
|
||||
self.VVT_factor = self.V
|
||||
self.trYYT = 0.
|
||||
|
||||
def predictive_values(self,mu,var,full_cov,noise_model):
|
||||
"""
|
||||
Predicts the output given the GP
|
||||
|
||||
:param mu: GP's mean
|
||||
:param var: GP's variance
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal
|
||||
:type full_cov: False|True
|
||||
:param noise_model: noise model to use
|
||||
:type noise_model: integer
|
||||
"""
|
||||
if full_cov:
|
||||
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
|
||||
#_mu = []
|
||||
#_var = []
|
||||
#_q1 = []
|
||||
#_q2 = []
|
||||
#for m,v,o in zip(mu,var,output.flatten()):
|
||||
# a,b,c,d = self.noise_model_list[int(o)].predictive_values(m,v)
|
||||
# _mu.append(a)
|
||||
# _var.append(b)
|
||||
# _q1.append(c)
|
||||
# _q2.append(d)
|
||||
#return np.vstack(_mu),np.vstack(_var),np.vstack(_q1),np.vstack(_q2)
|
||||
return self.noise_model_list[noise_model].predictive_values(mu,var)
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
|
||||
|
||||
def _get_param_names(self):
|
||||
names = []
|
||||
for noise_model in self.noise_model_list:
|
||||
names += noise_model._get_param_names()
|
||||
return names
|
||||
|
||||
def _set_params(self,p):
|
||||
cs_params = np.cumsum([0]+self.n_params)
|
||||
for i in range(len(self.n_params)):
|
||||
self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
|
||||
|
||||
def _gradients(self,partial):
|
||||
#NOTE this is not tested
|
||||
return np.hstack([noise_model._gradients(partial) for noise_model in self.noise_model_list])
|
||||
|
||||
def _compute_GP_variables(self):
|
||||
#Variables to be called from GP
|
||||
mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
|
||||
sigma_sum = 1./self.tau_ + 1./self.tau_tilde
|
||||
mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
|
||||
self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
|
||||
|
||||
self.Y = mu_tilde[:,None]
|
||||
self.YYT = np.dot(self.Y,self.Y.T)
|
||||
self.covariance_matrix = np.diag(1./self.tau_tilde)
|
||||
self.precision = self.tau_tilde[:,None]
|
||||
self.V = self.precision * self.Y
|
||||
self.VVT_factor = self.V
|
||||
self.trYYT = np.trace(self.YYT)
|
||||
|
||||
def fit_full(self,K):
|
||||
"""
|
||||
The expectation-propagation algorithm.
|
||||
For nomenclature see Rasmussen & Williams 2006.
|
||||
"""
|
||||
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
|
||||
mu = np.zeros(self.N)
|
||||
Sigma = K.copy()
|
||||
|
||||
"""
|
||||
Initial values - Cavity distribution parameters:
|
||||
q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
|
||||
sigma_ = 1./tau_
|
||||
mu_ = v_/tau_
|
||||
"""
|
||||
self.tau_ = np.empty(self.N,dtype=float)
|
||||
self.v_ = np.empty(self.N,dtype=float)
|
||||
|
||||
#Initial values - Marginal moments
|
||||
z = np.empty(self.N,dtype=float)
|
||||
self.Z_hat = np.empty(self.N,dtype=float)
|
||||
phi = np.empty(self.N,dtype=float)
|
||||
mu_hat = np.empty(self.N,dtype=float)
|
||||
sigma2_hat = np.empty(self.N,dtype=float)
|
||||
|
||||
#Approximation
|
||||
epsilon_np1 = self.epsilon + 1.
|
||||
epsilon_np2 = self.epsilon + 1.
|
||||
self.iterations = 0
|
||||
self.np1 = [self.tau_tilde.copy()]
|
||||
self.np2 = [self.v_tilde.copy()]
|
||||
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
|
||||
update_order = np.random.permutation(self.N)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
|
||||
self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
|
||||
#Marginal moments
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
|
||||
#Site parameters update
|
||||
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
|
||||
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
|
||||
self.tau_tilde[i] += Delta_tau
|
||||
self.v_tilde[i] += Delta_v
|
||||
#Posterior distribution parameters update
|
||||
DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i])))
|
||||
mu = np.dot(Sigma,self.v_tilde)
|
||||
self.iterations += 1
|
||||
#Sigma recomptutation with Cholesky decompositon
|
||||
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
|
||||
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
|
||||
L = jitchol(B)
|
||||
V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
|
||||
Sigma = K - np.dot(V.T,V)
|
||||
mu = np.dot(Sigma,self.v_tilde)
|
||||
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
|
||||
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
|
||||
self.np1.append(self.tau_tilde.copy())
|
||||
self.np2.append(self.v_tilde.copy())
|
||||
|
||||
return self._compute_GP_variables()
|
||||
|
||||
def fit_DTC(self, Kmm, Kmn):
|
||||
"""
|
||||
The expectation-propagation algorithm with sparse pseudo-input.
|
||||
For nomenclature see ... 2013.
|
||||
"""
|
||||
num_inducing = Kmm.shape[0]
|
||||
|
||||
#TODO: this doesn't work with uncertain inputs!
|
||||
|
||||
"""
|
||||
Prior approximation parameters:
|
||||
q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
|
||||
Sigma0 = Qnn = Knm*Kmmi*Kmn
|
||||
"""
|
||||
KmnKnm = np.dot(Kmn,Kmn.T)
|
||||
Lm = jitchol(Kmm)
|
||||
Lmi = chol_inv(Lm)
|
||||
Kmmi = np.dot(Lmi.T,Lmi)
|
||||
KmmiKmn = np.dot(Kmmi,Kmn)
|
||||
Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
|
||||
LLT0 = Kmm.copy()
|
||||
|
||||
#Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
|
||||
#KmnKnm = np.dot(Kmn, Kmn.T)
|
||||
#KmmiKmn = np.dot(Kmmi,Kmn)
|
||||
#Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
|
||||
#LLT0 = Kmm.copy()
|
||||
|
||||
"""
|
||||
Posterior approximation: q(f|y) = N(f| mu, Sigma)
|
||||
Sigma = Diag + P*R.T*R*P.T + K
|
||||
mu = w + P*Gamma
|
||||
"""
|
||||
mu = np.zeros(self.N)
|
||||
LLT = Kmm.copy()
|
||||
Sigma_diag = Qnn_diag.copy()
|
||||
|
||||
"""
|
||||
Initial values - Cavity distribution parameters:
|
||||
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
|
||||
sigma_ = 1./tau_
|
||||
mu_ = v_/tau_
|
||||
"""
|
||||
self.tau_ = np.empty(self.N,dtype=float)
|
||||
self.v_ = np.empty(self.N,dtype=float)
|
||||
|
||||
#Initial values - Marginal moments
|
||||
z = np.empty(self.N,dtype=float)
|
||||
self.Z_hat = np.empty(self.N,dtype=float)
|
||||
phi = np.empty(self.N,dtype=float)
|
||||
mu_hat = np.empty(self.N,dtype=float)
|
||||
sigma2_hat = np.empty(self.N,dtype=float)
|
||||
|
||||
#Approximation
|
||||
epsilon_np1 = 1
|
||||
epsilon_np2 = 1
|
||||
self.iterations = 0
|
||||
np1 = [self.tau_tilde.copy()]
|
||||
np2 = [self.v_tilde.copy()]
|
||||
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
|
||||
update_order = np.random.permutation(self.N)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
|
||||
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
|
||||
#Marginal moments
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
|
||||
#Site parameters update
|
||||
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
|
||||
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
|
||||
self.tau_tilde[i] += Delta_tau
|
||||
self.v_tilde[i] += Delta_v
|
||||
#Posterior distribution parameters update
|
||||
DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
|
||||
L = jitchol(LLT)
|
||||
#cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
|
||||
V,info = dtrtrs(L,Kmn,lower=1)
|
||||
Sigma_diag = np.sum(V*V,-2)
|
||||
si = np.sum(V.T*V[:,i],-1)
|
||||
mu += (Delta_v-Delta_tau*mu[i])*si
|
||||
self.iterations += 1
|
||||
#Sigma recomputation with Cholesky decompositon
|
||||
LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
|
||||
L = jitchol(LLT)
|
||||
V,info = dtrtrs(L,Kmn,lower=1)
|
||||
V2,info = dtrtrs(L.T,V,lower=0)
|
||||
Sigma_diag = np.sum(V*V,-2)
|
||||
Knmv_tilde = np.dot(Kmn,self.v_tilde)
|
||||
mu = np.dot(V2.T,Knmv_tilde)
|
||||
epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
|
||||
epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
|
||||
np1.append(self.tau_tilde.copy())
|
||||
np2.append(self.v_tilde.copy())
|
||||
|
||||
self._compute_GP_variables()
|
||||
|
||||
def fit_FITC(self, Kmm, Kmn, Knn_diag):
|
||||
"""
|
||||
The expectation-propagation algorithm with sparse pseudo-input.
|
||||
For nomenclature see Naish-Guzman and Holden, 2008.
|
||||
"""
|
||||
num_inducing = Kmm.shape[0]
|
||||
|
||||
"""
|
||||
Prior approximation parameters:
|
||||
q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
|
||||
Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
|
||||
"""
|
||||
Lm = jitchol(Kmm)
|
||||
Lmi = chol_inv(Lm)
|
||||
Kmmi = np.dot(Lmi.T,Lmi)
|
||||
P0 = Kmn.T
|
||||
KmnKnm = np.dot(P0.T, P0)
|
||||
KmmiKmn = np.dot(Kmmi,P0.T)
|
||||
Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
|
||||
Diag0 = Knn_diag - Qnn_diag
|
||||
R0 = jitchol(Kmmi).T
|
||||
|
||||
"""
|
||||
Posterior approximation: q(f|y) = N(f| mu, Sigma)
|
||||
Sigma = Diag + P*R.T*R*P.T + K
|
||||
mu = w + P*Gamma
|
||||
"""
|
||||
self.w = np.zeros(self.N)
|
||||
self.Gamma = np.zeros(num_inducing)
|
||||
mu = np.zeros(self.N)
|
||||
P = P0.copy()
|
||||
R = R0.copy()
|
||||
Diag = Diag0.copy()
|
||||
Sigma_diag = Knn_diag
|
||||
RPT0 = np.dot(R0,P0.T)
|
||||
|
||||
"""
|
||||
Initial values - Cavity distribution parameters:
|
||||
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
|
||||
sigma_ = 1./tau_
|
||||
mu_ = v_/tau_
|
||||
"""
|
||||
self.tau_ = np.empty(self.N,dtype=float)
|
||||
self.v_ = np.empty(self.N,dtype=float)
|
||||
|
||||
#Initial values - Marginal moments
|
||||
z = np.empty(self.N,dtype=float)
|
||||
self.Z_hat = np.empty(self.N,dtype=float)
|
||||
phi = np.empty(self.N,dtype=float)
|
||||
mu_hat = np.empty(self.N,dtype=float)
|
||||
sigma2_hat = np.empty(self.N,dtype=float)
|
||||
|
||||
#Approximation
|
||||
epsilon_np1 = 1
|
||||
epsilon_np2 = 1
|
||||
self.iterations = 0
|
||||
self.np1 = [self.tau_tilde.copy()]
|
||||
self.np2 = [self.v_tilde.copy()]
|
||||
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
|
||||
update_order = np.random.permutation(self.N)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
|
||||
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
|
||||
#Marginal moments
|
||||
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
|
||||
#Site parameters update
|
||||
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
|
||||
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
|
||||
self.tau_tilde[i] += Delta_tau
|
||||
self.v_tilde[i] += Delta_v
|
||||
#Posterior distribution parameters update
|
||||
dtd1 = Delta_tau*Diag[i] + 1.
|
||||
dii = Diag[i]
|
||||
Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
|
||||
pi_ = P[i,:].reshape(1,num_inducing)
|
||||
P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
|
||||
Rp_i = np.dot(R,pi_.T)
|
||||
RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
|
||||
R = jitchol(RTR).T
|
||||
self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
|
||||
self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
|
||||
RPT = np.dot(R,P.T)
|
||||
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
|
||||
mu = self.w + np.dot(P,self.Gamma)
|
||||
self.iterations += 1
|
||||
#Sigma recomptutation with Cholesky decompositon
|
||||
Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
|
||||
Diag = Diag0 * Iplus_Dprod_i
|
||||
P = Iplus_Dprod_i[:,None] * P0
|
||||
safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
|
||||
L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
|
||||
R,info = dtrtrs(L,R0,lower=1)
|
||||
RPT = np.dot(R,P.T)
|
||||
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
|
||||
self.w = Diag * self.v_tilde
|
||||
self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
|
||||
mu = self.w + np.dot(P,self.Gamma)
|
||||
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
|
||||
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
|
||||
self.np1.append(self.tau_tilde.copy())
|
||||
self.np2.append(self.v_tilde.copy())
|
||||
|
||||
return self._compute_GP_variables()
|
||||
|
|
@ -1,19 +1,21 @@
|
|||
import numpy as np
|
||||
from likelihood import likelihood
|
||||
from ..util.linalg import jitchol
|
||||
|
||||
|
||||
class Gaussian(likelihood):
|
||||
"""
|
||||
Likelihood class for doing Expectation propagation
|
||||
|
||||
:param Y: observed output (Nx1 numpy.darray)
|
||||
..Note:: Y values allowed depend on the likelihood_function used
|
||||
:param variance :
|
||||
:param data: observed output
|
||||
:type data: Nx1 numpy.darray
|
||||
:param variance: noise parameter
|
||||
:param normalize: whether to normalize the data before computing (predictions will be in original scales)
|
||||
:type normalize: False|True
|
||||
"""
|
||||
def __init__(self, data, variance=1., normalize=False):
|
||||
self.is_heteroscedastic = False
|
||||
self.Nparams = 1
|
||||
self.num_params = 1
|
||||
self.Z = 0. # a correction factor which accounts for the approximation made
|
||||
N, self.output_dim = data.shape
|
||||
|
||||
|
|
@ -32,6 +34,8 @@ class Gaussian(likelihood):
|
|||
self._variance = np.asarray(variance) + 1.
|
||||
self._set_params(np.asarray(variance))
|
||||
|
||||
super(Gaussian, self).__init__()
|
||||
|
||||
def set_data(self, data):
|
||||
self.data = data
|
||||
self.N, D = data.shape
|
||||
|
|
@ -40,9 +44,11 @@ class Gaussian(likelihood):
|
|||
if D > self.N:
|
||||
self.YYT = np.dot(self.Y, self.Y.T)
|
||||
self.trYYT = np.trace(self.YYT)
|
||||
self.YYT_factor = jitchol(self.YYT)
|
||||
else:
|
||||
self.YYT = None
|
||||
self.trYYT = np.sum(np.square(self.Y))
|
||||
self.YYT_factor = self.Y
|
||||
|
||||
def _get_params(self):
|
||||
return np.asarray(self._variance)
|
||||
|
|
@ -53,16 +59,17 @@ class Gaussian(likelihood):
|
|||
def _set_params(self, x):
|
||||
x = np.float64(x)
|
||||
if np.all(self._variance != x):
|
||||
if x == 0.:
|
||||
if x == 0.:#special case of zero noise
|
||||
self.precision = np.inf
|
||||
self.V = None
|
||||
else:
|
||||
self.precision = 1. / x
|
||||
self.V = (self.precision) * self.Y
|
||||
self.VVT_factor = self.precision * self.YYT_factor
|
||||
self.covariance_matrix = np.eye(self.N) * x
|
||||
self._variance = x
|
||||
|
||||
def predictive_values(self, mu, var, full_cov):
|
||||
def predictive_values(self, mu, var, full_cov, **likelihood_args):
|
||||
"""
|
||||
Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval
|
||||
"""
|
||||
|
|
@ -83,11 +90,25 @@ class Gaussian(likelihood):
|
|||
_95pc = mean + 2.*np.sqrt(true_var)
|
||||
return mean, true_var, _5pc, _95pc
|
||||
|
||||
def fit_full(self):
|
||||
def log_predictive_density(self, y_test, mu_star, var_star):
|
||||
"""
|
||||
No approximations needed
|
||||
Calculation of the log predictive density
|
||||
|
||||
.. math:
|
||||
p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
|
||||
|
||||
:param y_test: test observations (y_{*})
|
||||
:type y_test: (Nx1) array
|
||||
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type mu_star: (Nx1) array
|
||||
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type var_star: (Nx1) array
|
||||
|
||||
.. Note:
|
||||
Works as if each test point was provided individually, i.e. not full_cov
|
||||
"""
|
||||
pass
|
||||
y_rescaled = (y_test - self._offset)/self._scale
|
||||
return -0.5*np.log(2*np.pi) -0.5*np.log(var_star + self._variance) -0.5*(np.square(y_rescaled - mu_star))/(var_star + self._variance)
|
||||
|
||||
def _gradients(self, partial):
|
||||
return np.sum(partial)
|
||||
|
|
|
|||
108
GPy/likelihoods/gaussian_mixed_noise.py
Normal file
108
GPy/likelihoods/gaussian_mixed_noise.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
# Copyright (c) 2013, Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
|
||||
from likelihood import likelihood
|
||||
from . import Gaussian
|
||||
|
||||
|
||||
class Gaussian_Mixed_Noise(likelihood):
|
||||
"""
|
||||
Gaussian Likelihood for multiple outputs
|
||||
|
||||
This is a wrapper around likelihood.Gaussian class
|
||||
|
||||
:param data_list: data observations
|
||||
:type data_list: list of numpy arrays (num_data_output_i x 1), one array per output
|
||||
:param noise_params: noise parameters of each output
|
||||
:type noise_params: list of floats, one per output
|
||||
:param normalize: whether to normalize the data before computing (predictions will be in original scales)
|
||||
:type normalize: False|True
|
||||
"""
|
||||
def __init__(self, data_list, noise_params=None, normalize=True):
|
||||
self.num_params = len(data_list)
|
||||
self.n_list = [data.size for data in data_list]
|
||||
self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.num_params),self.n_list)])
|
||||
|
||||
if noise_params is None:
|
||||
noise_params = [1.] * self.num_params
|
||||
else:
|
||||
assert self.num_params == len(noise_params), 'Number of noise parameters does not match the number of noise models.'
|
||||
|
||||
self.noise_model_list = [Gaussian(Y,variance=v,normalize = normalize) for Y,v in zip(data_list,noise_params)]
|
||||
self.n_params = [noise_model._get_params().size for noise_model in self.noise_model_list]
|
||||
self.data = np.vstack(data_list)
|
||||
self.N, self.output_dim = self.data.shape
|
||||
self._offset = np.zeros((1, self.output_dim))
|
||||
self._scale = np.ones((1, self.output_dim))
|
||||
|
||||
self.is_heteroscedastic = True
|
||||
self.Z = 0. # a correction factor which accounts for the approximation made
|
||||
|
||||
self.set_data(data_list)
|
||||
self._set_params(np.asarray(noise_params))
|
||||
|
||||
super(Gaussian_Mixed_Noise, self).__init__()
|
||||
|
||||
def set_data(self, data_list):
|
||||
self.data = np.vstack(data_list)
|
||||
self.N, D = self.data.shape
|
||||
assert D == self.output_dim
|
||||
self.Y = (self.data - self._offset) / self._scale
|
||||
if D > self.N:
|
||||
raise NotImplementedError
|
||||
#self.YYT = np.dot(self.Y, self.Y.T)
|
||||
#self.trYYT = np.trace(self.YYT)
|
||||
#self.YYT_factor = jitchol(self.YYT)
|
||||
else:
|
||||
self.YYT = None
|
||||
self.trYYT = np.sum(np.square(self.Y))
|
||||
self.YYT_factor = self.Y
|
||||
|
||||
def predictive_values(self,mu,var,full_cov,noise_model):
|
||||
"""
|
||||
Predicts the output given the GP
|
||||
|
||||
:param mu: GP's mean
|
||||
:param var: GP's variance
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal
|
||||
:type full_cov: False|True
|
||||
:param noise_model: noise model to use
|
||||
:type noise_model: integer
|
||||
"""
|
||||
if full_cov:
|
||||
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
|
||||
return self.noise_model_list[noise_model].predictive_values(mu,var,full_cov)
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
|
||||
|
||||
def _get_param_names(self):
|
||||
if len(self.noise_model_list) == 1:
|
||||
names = self.noise_model_list[0]._get_param_names()
|
||||
else:
|
||||
names = []
|
||||
for noise_model,i in zip(self.noise_model_list,range(len(self.n_list))):
|
||||
names.append(''.join(noise_model._get_param_names() + ['_%s' %i]))
|
||||
return names
|
||||
|
||||
def _set_params(self,p):
|
||||
cs_params = np.cumsum([0]+self.n_params)
|
||||
|
||||
for i in range(len(self.n_params)):
|
||||
self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
|
||||
self.precision = np.hstack([np.repeat(noise_model.precision,n) for noise_model,n in zip(self.noise_model_list,self.n_list)])[:,None]
|
||||
|
||||
self.V = self.precision * self.Y
|
||||
self.VVT_factor = self.precision * self.YYT_factor
|
||||
self.covariance_matrix = np.eye(self.N) * 1./self.precision
|
||||
|
||||
def _gradients(self,partial):
|
||||
gradients = []
|
||||
aux = np.cumsum([0]+self.n_list)
|
||||
for ai,af,noise_model in zip(aux[:-1],aux[1:],self.noise_model_list):
|
||||
gradients += [noise_model._gradients(partial[ai:af])]
|
||||
return np.hstack(gradients)
|
||||
403
GPy/likelihoods/laplace.py
Normal file
403
GPy/likelihoods/laplace.py
Normal file
|
|
@ -0,0 +1,403 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
#
|
||||
#Parts of this file were influenced by the Matlab GPML framework written by
|
||||
#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
|
||||
#
|
||||
#The GPML code is released under the FreeBSD License.
|
||||
#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
|
||||
#
|
||||
#The code and associated documentation is available from
|
||||
#http://gaussianprocess.org/gpml/code.
|
||||
|
||||
import numpy as np
|
||||
import scipy as sp
|
||||
from likelihood import likelihood
|
||||
from ..util.linalg import mdot, jitchol, pddet, dpotrs
|
||||
from functools import partial as partial_func
|
||||
import warnings
|
||||
|
||||
class Laplace(likelihood):
|
||||
"""Laplace approximation to a posterior"""
|
||||
|
||||
def __init__(self, data, noise_model, extra_data=None):
|
||||
"""
|
||||
Laplace Approximation
|
||||
|
||||
Find the moments \hat{f} and the hessian at this point
|
||||
(using Newton-Raphson) of the unnormalised posterior
|
||||
|
||||
Compute the GP variables (i.e. generate some Y^{squiggle} and
|
||||
z^{squiggle} which makes a gaussian the same as the laplace
|
||||
approximation to the posterior, but normalised
|
||||
|
||||
Arguments
|
||||
---------
|
||||
|
||||
:param data: array of data the likelihood function is approximating
|
||||
:type data: NxD
|
||||
:param noise_model: likelihood function - subclass of noise_model
|
||||
:type noise_model: noise_model
|
||||
:param extra_data: additional data used by some likelihood functions,
|
||||
"""
|
||||
self.data = data
|
||||
self.noise_model = noise_model
|
||||
self.extra_data = extra_data
|
||||
|
||||
#Inital values
|
||||
self.N, self.D = self.data.shape
|
||||
self.is_heteroscedastic = True
|
||||
self.Nparams = 0
|
||||
self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
|
||||
|
||||
self.restart()
|
||||
likelihood.__init__(self)
|
||||
|
||||
def restart(self):
|
||||
"""
|
||||
Reset likelihood variables to their defaults
|
||||
"""
|
||||
#Initial values for the GP variables
|
||||
self.Y = np.zeros((self.N, 1))
|
||||
self.covariance_matrix = np.eye(self.N)
|
||||
self.precision = np.ones(self.N)[:, None]
|
||||
self.Z = 0
|
||||
self.YYT = None
|
||||
|
||||
self.old_Ki_f = None
|
||||
self.bad_fhat = False
|
||||
|
||||
def predictive_values(self,mu,var,full_cov,**noise_args):
|
||||
if full_cov:
|
||||
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
|
||||
return self.noise_model.predictive_values(mu,var,**noise_args)
|
||||
|
||||
def log_predictive_density(self, y_test, mu_star, var_star):
|
||||
"""
|
||||
Calculation of the log predictive density
|
||||
|
||||
.. math:
|
||||
p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
|
||||
|
||||
:param y_test: test observations (y_{*})
|
||||
:type y_test: (Nx1) array
|
||||
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type mu_star: (Nx1) array
|
||||
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type var_star: (Nx1) array
|
||||
"""
|
||||
return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
|
||||
|
||||
def _get_params(self):
|
||||
return np.asarray(self.noise_model._get_params())
|
||||
|
||||
def _get_param_names(self):
|
||||
return self.noise_model._get_param_names()
|
||||
|
||||
def _set_params(self, p):
|
||||
return self.noise_model._set_params(p)
|
||||
|
||||
def _shared_gradients_components(self):
|
||||
d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
|
||||
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
|
||||
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
|
||||
return dL_dfhat, I_KW_i
|
||||
|
||||
def _Kgradients(self):
|
||||
"""
|
||||
Gradients with respect to prior kernel parameters dL_dK to be chained
|
||||
with dK_dthetaK to give dL_dthetaK
|
||||
:returns: dL_dK matrix
|
||||
:rtype: Matrix (1 x num_kernel_params)
|
||||
"""
|
||||
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
||||
dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
|
||||
|
||||
#Explicit
|
||||
#expl_a = np.dot(self.Ki_f, self.Ki_f.T)
|
||||
#expl_b = self.Wi_K_i
|
||||
#expl = 0.5*expl_a - 0.5*expl_b
|
||||
#dL_dthetaK_exp = dK_dthetaK(expl, X)
|
||||
|
||||
#Implicit
|
||||
impl = mdot(dlp, dL_dfhat, I_KW_i)
|
||||
|
||||
#No longer required as we are computing these in the gp already
|
||||
#otherwise we would take them away and add them back
|
||||
#dL_dthetaK_imp = dK_dthetaK(impl, X)
|
||||
#dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
|
||||
#dL_dK = expl + impl
|
||||
|
||||
#No need to compute explicit as we are computing dZ_dK to account
|
||||
#for the difference between the K gradients of a normal GP,
|
||||
#and the K gradients including the implicit part
|
||||
dL_dK = impl
|
||||
return dL_dK
|
||||
|
||||
def _gradients(self, partial):
|
||||
"""
|
||||
Gradients with respect to likelihood parameters (dL_dthetaL)
|
||||
|
||||
:param partial: Not needed by this likelihood
|
||||
:type partial: lambda function
|
||||
:rtype: array of derivatives (1 x num_likelihood_params)
|
||||
"""
|
||||
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
||||
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
|
||||
|
||||
#len(dlik_dthetaL)
|
||||
num_params = len(self._get_param_names())
|
||||
# make space for one derivative for each likelihood parameter
|
||||
dL_dthetaL = np.zeros(num_params)
|
||||
for thetaL_i in range(num_params):
|
||||
#Explicit
|
||||
dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
|
||||
#- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
|
||||
+ np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
|
||||
)
|
||||
|
||||
#Implicit
|
||||
dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
|
||||
dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
|
||||
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
|
||||
|
||||
return dL_dthetaL
|
||||
|
||||
def _compute_GP_variables(self):
|
||||
"""
|
||||
Generate data Y which would give the normal distribution identical
|
||||
to the laplace approximation to the posterior, but normalised
|
||||
|
||||
GPy expects a likelihood to be gaussian, so need to caluclate
|
||||
the data Y^{\tilde} that makes the posterior match that found
|
||||
by a laplace approximation to a non-gaussian likelihood but with
|
||||
a gaussian likelihood
|
||||
|
||||
Firstly,
|
||||
The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
|
||||
i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
|
||||
we wish to find the hessian \Sigma^{\tilde}
|
||||
that has the same curvature but using our new simulated data Y^{\tilde}
|
||||
i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
|
||||
and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
|
||||
We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
|
||||
|
||||
Secondly,
|
||||
GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
|
||||
So we can suck up any differences between that and our log marginal likelihood approximation
|
||||
p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
|
||||
which we want to optimize instead, by equating them and rearranging, the difference is added onto
|
||||
the log p(y) that GPy optimizes by default
|
||||
|
||||
Thirdly,
|
||||
Since we have gradients that depend on how we move f^{\hat}, we have implicit components
|
||||
aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
|
||||
gp.py code
|
||||
"""
|
||||
Wi = 1.0/self.W
|
||||
self.Sigma_tilde = np.diagflat(Wi)
|
||||
|
||||
Y_tilde = Wi*self.Ki_f + self.f_hat
|
||||
|
||||
self.Wi_K_i = self.W12BiW12
|
||||
ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
|
||||
lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
|
||||
y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
|
||||
|
||||
Z_tilde = (+ lik
|
||||
- 0.5*self.ln_B_det
|
||||
+ 0.5*ln_det_Wi_K
|
||||
- 0.5*self.f_Ki_f
|
||||
+ 0.5*y_Wi_K_i_y
|
||||
+ self.NORMAL_CONST
|
||||
)
|
||||
|
||||
#Convert to float as its (1, 1) and Z must be a scalar
|
||||
self.Z = np.float64(Z_tilde)
|
||||
self.Y = Y_tilde
|
||||
self.YYT = np.dot(self.Y, self.Y.T)
|
||||
self.covariance_matrix = self.Sigma_tilde
|
||||
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
|
||||
|
||||
#Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
|
||||
self.dZ_dK = self._Kgradients()
|
||||
#+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
|
||||
|
||||
def fit_full(self, K):
|
||||
"""
|
||||
The laplace approximation algorithm, find K and expand hessian
|
||||
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
|
||||
|
||||
:param K: Prior covariance matrix evaluated at locations X
|
||||
:type K: NxN matrix
|
||||
"""
|
||||
self.K = K.copy()
|
||||
|
||||
#Find mode
|
||||
self.f_hat = self.rasm_mode(self.K)
|
||||
|
||||
#Compute hessian and other variables at mode
|
||||
self._compute_likelihood_variables()
|
||||
|
||||
#Compute fake variables replicating laplace approximation to posterior
|
||||
self._compute_GP_variables()
|
||||
|
||||
def _compute_likelihood_variables(self):
|
||||
"""
|
||||
Compute the variables required to compute gaussian Y variables
|
||||
"""
|
||||
#At this point get the hessian matrix (or vector as W is diagonal)
|
||||
self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
|
||||
|
||||
if not self.noise_model.log_concave:
|
||||
#print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
|
||||
self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||
|
||||
self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
|
||||
|
||||
self.Ki_f = self.Ki_f
|
||||
self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
|
||||
self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
|
||||
|
||||
def _compute_B_statistics(self, K, W, a):
|
||||
"""
|
||||
Rasmussen suggests the use of a numerically stable positive definite matrix B
|
||||
Which has a positive diagonal element and can be easyily inverted
|
||||
|
||||
:param K: Prior Covariance matrix evaluated at locations X
|
||||
:type K: NxN matrix
|
||||
:param W: Negative hessian at a point (diagonal matrix)
|
||||
:type W: Vector of diagonal values of hessian (1xN)
|
||||
:param a: Matrix to calculate W12BiW12a
|
||||
:type a: Matrix NxN
|
||||
:returns: (W12BiW12, ln_B_det)
|
||||
"""
|
||||
if not self.noise_model.log_concave:
|
||||
#print "Under 1e-10: {}".format(np.sum(W < 1e-6))
|
||||
W[W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
||||
# To cause the posterior to become less certain than the prior and likelihood,
|
||||
# This is a property only held by non-log-concave likelihoods
|
||||
|
||||
|
||||
#W is diagonal so its sqrt is just the sqrt of the diagonal elements
|
||||
W_12 = np.sqrt(W)
|
||||
B = np.eye(self.N) + W_12*K*W_12.T
|
||||
L = jitchol(B)
|
||||
|
||||
W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
|
||||
ln_B_det = 2*np.sum(np.log(np.diag(L)))
|
||||
return W12BiW12a, ln_B_det
|
||||
|
||||
def rasm_mode(self, K, MAX_ITER=40):
|
||||
"""
|
||||
Rasmussen's numerically stable mode finding
|
||||
For nomenclature see Rasmussen & Williams 2006
|
||||
Influenced by GPML (BSD) code, all errors are our own
|
||||
|
||||
:param K: Covariance matrix evaluated at locations X
|
||||
:type K: NxD matrix
|
||||
:param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
|
||||
:type MAX_ITER: scalar
|
||||
:returns: f_hat, mode on which to make laplace approxmiation
|
||||
:rtype: NxD matrix
|
||||
"""
|
||||
#old_Ki_f = np.zeros((self.N, 1))
|
||||
|
||||
#Start f's at zero originally of if we have gone off track, try restarting
|
||||
if self.old_Ki_f is None or self.bad_fhat:
|
||||
old_Ki_f = np.random.rand(self.N, 1)/50.0
|
||||
#old_Ki_f = self.Y
|
||||
f = np.dot(K, old_Ki_f)
|
||||
else:
|
||||
#Start at the old best point
|
||||
old_Ki_f = self.old_Ki_f.copy()
|
||||
f = self.f_hat.copy()
|
||||
|
||||
new_obj = -np.inf
|
||||
old_obj = np.inf
|
||||
|
||||
def obj(Ki_f, f):
|
||||
return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
|
||||
|
||||
difference = np.inf
|
||||
epsilon = 1e-7
|
||||
#step_size = 1
|
||||
#rs = 0
|
||||
i = 0
|
||||
|
||||
while difference > epsilon and i < MAX_ITER:
|
||||
W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)
|
||||
|
||||
W_f = W*f
|
||||
grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)
|
||||
|
||||
b = W_f + grad
|
||||
W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
|
||||
|
||||
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
|
||||
full_step_Ki_f = b - W12BiW12Kb
|
||||
dKi_f = full_step_Ki_f - old_Ki_f
|
||||
|
||||
f_old = f.copy()
|
||||
def inner_obj(step_size, old_Ki_f, dKi_f, K):
|
||||
Ki_f = old_Ki_f + step_size*dKi_f
|
||||
f = np.dot(K, Ki_f)
|
||||
# This is nasty, need to set something within an optimization though
|
||||
self.tmp_Ki_f = Ki_f.copy()
|
||||
self.tmp_f = f.copy()
|
||||
return -obj(Ki_f, f)
|
||||
|
||||
i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
|
||||
#Find the stepsize that minimizes the objective function using a brent line search
|
||||
#The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
|
||||
#steps than get this exact then make a step, if B was bigger it might be the other way around though
|
||||
#new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
|
||||
new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
|
||||
f = self.tmp_f.copy()
|
||||
Ki_f = self.tmp_Ki_f.copy()
|
||||
|
||||
#Optimize without linesearch
|
||||
#f_old = f.copy()
|
||||
#update_passed = False
|
||||
#while not update_passed:
|
||||
#Ki_f = old_Ki_f + step_size*dKi_f
|
||||
#f = np.dot(K, Ki_f)
|
||||
|
||||
#old_obj = new_obj
|
||||
#new_obj = obj(Ki_f, f)
|
||||
#difference = new_obj - old_obj
|
||||
##print "difference: ",difference
|
||||
#if difference < 0:
|
||||
##print "Objective function rose", np.float(difference)
|
||||
##If the objective function isn't rising, restart optimization
|
||||
#step_size *= 0.8
|
||||
##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
|
||||
##objective function isn't increasing, try reducing step size
|
||||
#f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
|
||||
#old_obj = new_obj
|
||||
#rs += 1
|
||||
#else:
|
||||
#update_passed = True
|
||||
|
||||
#old_Ki_f = self.Ki_f.copy()
|
||||
|
||||
#difference = abs(new_obj - old_obj)
|
||||
#old_obj = new_obj.copy()
|
||||
difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
|
||||
#difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
|
||||
old_Ki_f = Ki_f.copy()
|
||||
i += 1
|
||||
|
||||
self.old_Ki_f = old_Ki_f.copy()
|
||||
|
||||
#Warn of bad fits
|
||||
if difference > epsilon:
|
||||
self.bad_fhat = True
|
||||
warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
|
||||
elif self.bad_fhat:
|
||||
self.bad_fhat = False
|
||||
warnings.warn("f_hat now perfect again")
|
||||
|
||||
self.Ki_f = Ki_f
|
||||
return f
|
||||
|
|
@ -1,7 +1,8 @@
|
|||
import numpy as np
|
||||
import copy
|
||||
from ..core.parameterized import Parameterized
|
||||
|
||||
class likelihood:
|
||||
class likelihood(Parameterized):
|
||||
"""
|
||||
The atom for a likelihood class
|
||||
|
||||
|
|
@ -9,17 +10,20 @@ class likelihood:
|
|||
(Gaussian) inherits directly from this, as does the EP algorithm
|
||||
|
||||
Some things must be defined for this to work properly:
|
||||
self.Y : the effective Gaussian target of the GP
|
||||
self.N, self.D : Y.shape
|
||||
self.covariance_matrix : the effective (noise) covariance of the GP targets
|
||||
self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
|
||||
self.is_heteroscedastic : enables significant computational savings in GP
|
||||
self.precision : a scalar or vector representation of the effective target precision
|
||||
self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
|
||||
self.V : self.precision * self.Y
|
||||
|
||||
- self.Y : the effective Gaussian target of the GP
|
||||
- self.N, self.D : Y.shape
|
||||
- self.covariance_matrix : the effective (noise) covariance of the GP targets
|
||||
- self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
|
||||
- self.is_heteroscedastic : enables significant computational savings in GP
|
||||
- self.precision : a scalar or vector representation of the effective target precision
|
||||
- self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
|
||||
- self.V : self.precision * self.Y
|
||||
|
||||
"""
|
||||
def __init__(self,data):
|
||||
raise ValueError, "this class is not to be instantiated"
|
||||
def __init__(self):
|
||||
Parameterized.__init__(self)
|
||||
self.dZ_dK = 0
|
||||
|
||||
def _get_params(self):
|
||||
raise NotImplementedError
|
||||
|
|
@ -30,8 +34,17 @@ class likelihood:
|
|||
def _set_params(self, x):
|
||||
raise NotImplementedError
|
||||
|
||||
def fit(self):
|
||||
raise NotImplementedError
|
||||
def fit_full(self, K):
|
||||
"""
|
||||
No approximations needed by default
|
||||
"""
|
||||
pass
|
||||
|
||||
def restart(self):
|
||||
"""
|
||||
No need to restart if not an approximation
|
||||
"""
|
||||
pass
|
||||
|
||||
def _gradients(self, partial):
|
||||
raise NotImplementedError
|
||||
|
|
@ -39,6 +52,18 @@ class likelihood:
|
|||
def predictive_values(self, mu, var):
|
||||
raise NotImplementedError
|
||||
|
||||
def copy(self):
|
||||
""" Returns a (deep) copy of the current likelihood """
|
||||
return copy.deepcopy(self)
|
||||
def log_predictive_density(self, y_test, mu_star, var_star):
|
||||
"""
|
||||
Calculation of the predictive density
|
||||
|
||||
.. math:
|
||||
p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
|
||||
|
||||
:param y_test: test observations (y_{*})
|
||||
:type y_test: (Nx1) array
|
||||
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type mu_star: (Nx1) array
|
||||
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type var_star: (Nx1) array
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
|
|
|||
|
|
@ -1,166 +0,0 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
import scipy as sp
|
||||
import pylab as pb
|
||||
from ..util.plot import gpplot
|
||||
from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import link_functions
|
||||
|
||||
class LikelihoodFunction(object):
|
||||
"""
|
||||
Likelihood class for doing Expectation propagation
|
||||
|
||||
:param Y: observed output (Nx1 numpy.darray)
|
||||
..Note:: Y values allowed depend on the LikelihoodFunction used
|
||||
"""
|
||||
def __init__(self,link):
|
||||
if link == self._analytical:
|
||||
self.moments_match = self._moments_match_analytical
|
||||
else:
|
||||
assert isinstance(link,link_functions.LinkFunction)
|
||||
self.link = link
|
||||
self.moments_match = self._moments_match_numerical
|
||||
|
||||
def _preprocess_values(self,Y):
|
||||
return Y
|
||||
|
||||
def _product(self,gp,obs,mu,sigma):
|
||||
return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._distribution(gp,obs)
|
||||
|
||||
def _nlog_product(self,gp,obs,mu,sigma):
|
||||
return -(-.5*(gp-mu)**2/sigma**2 + self._log_distribution(gp,obs))
|
||||
|
||||
def _locate(self,obs,mu,sigma):
|
||||
"""
|
||||
Golden Search to find the mode in the _product function (cavity x exact likelihood) and define a grid around it for numerical integration
|
||||
"""
|
||||
golden_A = -1 if obs == 0 else np.array([np.log(obs),mu]).min() #Lower limit
|
||||
golden_B = np.array([np.log(obs),mu]).max() #Upper limit
|
||||
return sp.optimize.golden(self._nlog_product, args=(obs,mu,sigma), brack=(golden_A,golden_B)) #Better to work with _nlog_product than with _product
|
||||
|
||||
def _moments_match_numerical(self,obs,tau,v):
|
||||
"""
|
||||
Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input.
|
||||
"""
|
||||
mu = v/tau
|
||||
sigma = np.sqrt(1./tau)
|
||||
opt = self._locate(obs,mu,sigma)
|
||||
width = 3./np.log(max(obs,2))
|
||||
A = opt - width #Grid's lower limit
|
||||
B = opt + width #Grid's Upper limit
|
||||
K = 10*int(np.log(max(obs,150))) #Number of points in the grid
|
||||
h = (B-A)/K # length of the intervals
|
||||
grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
|
||||
x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
|
||||
_aux1 = self._product(A,obs,mu,sigma)
|
||||
_aux2 = self._product(B,obs,mu,sigma)
|
||||
_aux3 = 4*self._product(grid_x[range(1,K,2)],obs,mu,sigma)
|
||||
_aux4 = 2*self._product(grid_x[range(2,K-1,2)],obs,mu,sigma)
|
||||
zeroth = np.hstack((_aux1,_aux2,_aux3,_aux4)) # grid of points (Y axis) rearranged
|
||||
first = zeroth*x
|
||||
second = first*x
|
||||
Z_hat = sum(zeroth)*h/3 # Zero-th moment
|
||||
mu_hat = sum(first)*h/(3*Z_hat) # First moment
|
||||
m2 = sum(second)*h/(3*Z_hat) # Second moment
|
||||
sigma2_hat = m2 - mu_hat**2 # Second central moment
|
||||
return float(Z_hat), float(mu_hat), float(sigma2_hat)
|
||||
|
||||
class Binomial(LikelihoodFunction):
|
||||
"""
|
||||
Probit likelihood
|
||||
Y is expected to take values in {-1,1}
|
||||
-----
|
||||
$$
|
||||
L(x) = \\Phi (Y_i*f_i)
|
||||
$$
|
||||
"""
|
||||
def __init__(self,link=None):
|
||||
self._analytical = link_functions.Probit
|
||||
if not link:
|
||||
link = self._analytical
|
||||
super(Binomial, self).__init__(link)
|
||||
|
||||
def _distribution(self,gp,obs):
|
||||
pass
|
||||
|
||||
def _log_distribution(self,gp,obs):
|
||||
pass
|
||||
|
||||
def _preprocess_values(self,Y):
|
||||
"""
|
||||
Check if the values of the observations correspond to the values
|
||||
assumed by the likelihood function.
|
||||
|
||||
..Note:: Binary classification algorithm works better with classes {-1,1}
|
||||
"""
|
||||
Y_prep = Y.copy()
|
||||
Y1 = Y[Y.flatten()==1].size
|
||||
Y2 = Y[Y.flatten()==0].size
|
||||
assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
|
||||
Y_prep[Y.flatten() == 0] = -1
|
||||
return Y_prep
|
||||
|
||||
def _moments_match_analytical(self,data_i,tau_i,v_i):
|
||||
"""
|
||||
Moments match of the marginal approximation in EP algorithm
|
||||
|
||||
:param i: number of observation (int)
|
||||
:param tau_i: precision of the cavity distribution (float)
|
||||
:param v_i: mean/variance of the cavity distribution (float)
|
||||
"""
|
||||
z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
|
||||
Z_hat = std_norm_cdf(z)
|
||||
phi = std_norm_pdf(z)
|
||||
mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
|
||||
sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
|
||||
return Z_hat, mu_hat, sigma2_hat
|
||||
|
||||
def predictive_values(self,mu,var):
|
||||
"""
|
||||
Compute mean, variance and conficence interval (percentiles 5 and 95) of the prediction
|
||||
:param mu: mean of the latent variable
|
||||
:param var: variance of the latent variable
|
||||
"""
|
||||
mu = mu.flatten()
|
||||
var = var.flatten()
|
||||
mean = stats.norm.cdf(mu/np.sqrt(1+var))
|
||||
norm_025 = [stats.norm.ppf(.025,m,v) for m,v in zip(mu,var)]
|
||||
norm_975 = [stats.norm.ppf(.975,m,v) for m,v in zip(mu,var)]
|
||||
p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var))
|
||||
p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
|
||||
return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var
|
||||
|
||||
class Poisson(LikelihoodFunction):
|
||||
"""
|
||||
Poisson likelihood
|
||||
Y is expected to take values in {0,1,2,...}
|
||||
-----
|
||||
$$
|
||||
L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
|
||||
$$
|
||||
"""
|
||||
def __init__(self,link=None):
|
||||
self._analytical = None
|
||||
if not link:
|
||||
link = link_functions.Log()
|
||||
super(Poisson, self).__init__(link)
|
||||
|
||||
def _distribution(self,gp,obs):
|
||||
return stats.poisson.pmf(obs,self.link.inv_transf(gp))
|
||||
|
||||
def _log_distribution(self,gp,obs):
|
||||
return - self.link.inv_transf(gp) + obs * self.link.log_inv_transf(gp)
|
||||
|
||||
def predictive_values(self,mu,var):
|
||||
"""
|
||||
Compute mean, and conficence interval (percentiles 5 and 95) of the prediction
|
||||
"""
|
||||
mean = self.link.transf(mu)#np.exp(mu*self.scale + self.location)
|
||||
tmp = stats.poisson.ppf(np.array([.025,.975]),mean)
|
||||
p_025 = tmp[:,0]
|
||||
p_975 = tmp[:,1]
|
||||
return mean,np.nan*mean,p_025,p_975 # better variance here TODO
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
import scipy as sp
|
||||
import pylab as pb
|
||||
from ..util.plot import gpplot
|
||||
from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
|
||||
class LinkFunction(object):
|
||||
"""
|
||||
Link function class for doing non-Gaussian likelihoods approximation
|
||||
|
||||
:param Y: observed output (Nx1 numpy.darray)
|
||||
..Note:: Y values allowed depend on the likelihood_function used
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
class Probit(LinkFunction):
|
||||
"""
|
||||
Probit link function: Squashes a likelihood between 0 and 1
|
||||
"""
|
||||
def transf(self,mu):
|
||||
pass
|
||||
|
||||
def inv_transf(self,f):
|
||||
pass
|
||||
|
||||
def log_inv_transf(self,f):
|
||||
pass
|
||||
121
GPy/likelihoods/noise_model_constructors.py
Normal file
121
GPy/likelihoods/noise_model_constructors.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import noise_models
|
||||
|
||||
def bernoulli(gp_link=None):
|
||||
"""
|
||||
Construct a bernoulli likelihood
|
||||
|
||||
:param gp_link: a GPy gp_link function
|
||||
"""
|
||||
if gp_link is None:
|
||||
gp_link = noise_models.gp_transformations.Probit()
|
||||
#else:
|
||||
# assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
|
||||
|
||||
if isinstance(gp_link,noise_models.gp_transformations.Probit):
|
||||
analytical_mean = True
|
||||
analytical_variance = False
|
||||
|
||||
elif isinstance(gp_link,noise_models.gp_transformations.Heaviside):
|
||||
analytical_mean = True
|
||||
analytical_variance = True
|
||||
|
||||
else:
|
||||
analytical_mean = False
|
||||
analytical_variance = False
|
||||
|
||||
return noise_models.bernoulli_noise.Bernoulli(gp_link,analytical_mean,analytical_variance)
|
||||
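# --- Illustrative usage sketch (added commentary, not part of the original
# diff): how these constructors are meant to be called. Only names defined or
# imported in this file are used; the model that consumes the returned noise
# object (EP, Laplace, ...) is omitted.
if __name__ == "__main__":
    # Bernoulli noise with the default Probit link
    bern = bernoulli()
    print bern.log_concave  # True for Probit/Heaviside links
    # Bernoulli noise with an explicit Heaviside link
    heavi = bernoulli(noise_models.gp_transformations.Heaviside())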
|
||||
def exponential(gp_link=None):
|
||||
|
||||
"""
|
||||
Construct an exponential likelihood
|
||||
|
||||
:param gp_link: a GPy gp_link function
|
||||
"""
|
||||
if gp_link is None:
|
||||
gp_link = noise_models.gp_transformations.Log_ex_1()
|
||||
|
||||
analytical_mean = False
|
||||
analytical_variance = False
|
||||
return noise_models.exponential_noise.Exponential(gp_link,analytical_mean,analytical_variance)
|
||||
|
||||
def gaussian_ep(gp_link=None,variance=1.):
|
||||
"""
|
||||
Construct a gaussian likelihood
|
||||
|
||||
:param gp_link: a GPy gp_link function
|
||||
:param variance: scalar
|
||||
"""
|
||||
if gp_link is None:
|
||||
gp_link = noise_models.gp_transformations.Identity()
|
||||
#else:
|
||||
# assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
|
||||
|
||||
analytical_mean = False
|
||||
analytical_variance = False
|
||||
return noise_models.gaussian_noise.Gaussian(gp_link,analytical_mean,analytical_variance,variance)
|
||||
|
||||
def poisson(gp_link=None):
|
||||
"""
|
||||
Construct a Poisson likelihood
|
||||
|
||||
:param gp_link: a GPy gp_link function
|
||||
"""
|
||||
if gp_link is None:
|
||||
gp_link = noise_models.gp_transformations.Log_ex_1()
|
||||
#else:
|
||||
# assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
|
||||
analytical_mean = False
|
||||
analytical_variance = False
|
||||
return noise_models.poisson_noise.Poisson(gp_link,analytical_mean,analytical_variance)
|
||||
|
||||
def gamma(gp_link=None,beta=1.):
|
||||
"""
|
||||
Construct a Gamma likelihood
|
||||
|
||||
:param gp_link: a GPy gp_link function
|
||||
:param beta: scalar
|
||||
"""
|
||||
if gp_link is None:
|
||||
gp_link = noise_models.gp_transformations.Log_ex_1()
|
||||
analytical_mean = False
|
||||
analytical_variance = False
|
||||
return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta)
|
||||
|
||||
def gaussian(gp_link=None, variance=2, D=None, N=None):
|
||||
"""
|
||||
Construct a Gaussian likelihood
|
||||
|
||||
:param gp_link: a GPy gp_link function
|
||||
:param variance: variance
|
||||
:type variance: scalar
|
||||
:returns: Gaussian noise model:
|
||||
"""
|
||||
if gp_link is None:
|
||||
gp_link = noise_models.gp_transformations.Identity()
|
||||
analytical_mean = True
|
||||
analytical_variance = True # ?
|
||||
return noise_models.gaussian_noise.Gaussian(gp_link, analytical_mean,
|
||||
analytical_variance, variance=variance, D=D, N=N)
|
||||
|
||||
def student_t(gp_link=None, deg_free=5, sigma2=2):
|
||||
"""
|
||||
Construct a Student t likelihood
|
||||
|
||||
:param gp_link: a GPy gp_link function
|
||||
:param deg_free: degrees of freedom of student-t
|
||||
:type deg_free: scalar
|
||||
:param sigma2: variance
|
||||
:type sigma2: scalar
|
||||
:returns: Student-T noise model
|
||||
"""
|
||||
if gp_link is None:
|
||||
gp_link = noise_models.gp_transformations.Identity()
|
||||
analytical_mean = True
|
||||
analytical_variance = True
|
||||
return noise_models.student_t_noise.StudentT(gp_link, analytical_mean,
|
||||
analytical_variance,deg_free, sigma2)
|
||||
8
GPy/likelihoods/noise_models/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
|||
import noise_distributions
|
||||
import bernoulli_noise
|
||||
import exponential_noise
|
||||
import gaussian_noise
|
||||
import gamma_noise
|
||||
import poisson_noise
|
||||
import student_t_noise
|
||||
import gp_transformations
|
||||
222
GPy/likelihoods/noise_models/bernoulli_noise.py
Normal file
@@ -0,0 +1,222 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import gp_transformations
|
||||
from noise_distributions import NoiseDistribution
|
||||
|
||||
class Bernoulli(NoiseDistribution):
|
||||
"""
|
||||
Bernoulli likelihood
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{1-y_{i}}
|
||||
|
||||
.. Note::
|
||||
Y is expected to take values in {-1,1}
|
||||
A probit link function is usually used
|
||||
"""
|
||||
def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
|
||||
super(Bernoulli, self).__init__(gp_link,analytical_mean,analytical_variance)
|
||||
if isinstance(gp_link , (gp_transformations.Heaviside, gp_transformations.Probit)):
|
||||
self.log_concave = True
|
||||
|
||||
def _preprocess_values(self,Y):
|
||||
"""
|
||||
Check if the values of the observations correspond to the values
|
||||
assumed by the likelihood function.
|
||||
|
||||
..Note:: Binary classification algorithm works better with classes {-1,1}
|
||||
"""
|
||||
Y_prep = Y.copy()
|
||||
Y1 = Y[Y.flatten()==1].size
|
||||
Y2 = Y[Y.flatten()==0].size
|
||||
assert Y1 + Y2 == Y.size, 'Bernoulli likelihood is meant to be used only with outputs in {0,1}.'
|
||||
Y_prep[Y.flatten() == 0] = -1
|
||||
return Y_prep
|
||||
|
||||
def _moments_match_analytical(self,data_i,tau_i,v_i):
|
||||
"""
|
||||
Moments match of the marginal approximation in EP algorithm
|
||||
|
||||
:param i: number of observation (int)
|
||||
:param tau_i: precision of the cavity distribution (float)
|
||||
:param v_i: mean/variance of the cavity distribution (float)
|
||||
"""
|
||||
if data_i == 1:
|
||||
sign = 1.
|
||||
elif data_i == 0:
|
||||
sign = -1
|
||||
else:
|
||||
raise ValueError("bad value for Bernouilli observation (0,1)")
|
||||
if isinstance(self.gp_link,gp_transformations.Probit):
|
||||
z = sign*v_i/np.sqrt(tau_i**2 + tau_i)
|
||||
Z_hat = std_norm_cdf(z)
|
||||
phi = std_norm_pdf(z)
|
||||
mu_hat = v_i/tau_i + sign*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
|
||||
sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
|
||||
|
||||
elif isinstance(self.gp_link,gp_transformations.Heaviside):
|
||||
a = sign*v_i/np.sqrt(tau_i)
|
||||
Z_hat = std_norm_cdf(a)
|
||||
N = std_norm_pdf(a)
|
||||
mu_hat = v_i/tau_i + sign*N/Z_hat/np.sqrt(tau_i)
|
||||
sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
|
||||
if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
raise ValueError("Heaviside moment matching produced NaN values")
|
||||
else:
|
||||
raise ValueError("Exact moment matching not available for link {}".format(self.gp_link.gp_transformations.__name__))
|
||||
|
||||
return Z_hat, mu_hat, sigma2_hat
|
||||
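# --- Illustrative sketch (added commentary, not GPy code): the closed-form
# probit moments above can be cross-checked by one-dimensional quadrature.
# The cavity is N(f; v_i/tau_i, 1/tau_i) and the site is Phi(sign*f):
#
#   import numpy as np
#   from scipy import stats
#   from scipy.integrate import quad
#   tau_i, v_i, sign = 2.0, 0.5, 1.
#   tilted = lambda f: stats.norm.cdf(sign*f)*stats.norm.pdf(f, v_i/tau_i, np.sqrt(1./tau_i))
#   Z = quad(tilted, -np.inf, np.inf)[0]
#   m = quad(lambda f: f*tilted(f), -np.inf, np.inf)[0]/Z
#   s2 = quad(lambda f: f**2*tilted(f), -np.inf, np.inf)[0]/Z - m**2
#   # Z, m and s2 agree with Z_hat, mu_hat and sigma2_hat for these inputs.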
|
||||
def _predictive_mean_analytical(self,mu,variance):
|
||||
|
||||
if isinstance(self.gp_link,gp_transformations.Probit):
|
||||
return stats.norm.cdf(mu/np.sqrt(1+variance))
|
||||
|
||||
elif isinstance(self.gp_link,gp_transformations.Heaviside):
|
||||
return stats.norm.cdf(mu/np.sqrt(variance))
|
||||
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def _predictive_variance_analytical(self,mu,variance, pred_mean):
|
||||
|
||||
if isinstance(self.gp_link,gp_transformations.Heaviside):
|
||||
return 0.
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def pdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{1-y_{i}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in bernoulli
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
|
||||
.. Note:
|
||||
Each y_i must be in {0,1}
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
objective = (link_f**y) * ((1.-link_f)**(1.-y))
|
||||
return np.exp(np.sum(np.log(objective)))
|
||||
|
||||
def logpdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Log Likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-\\lambda(f_{i}))
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in bernoulli
|
||||
:returns: log likelihood evaluated at points link(f)
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
#objective = y*np.log(link_f) + (1.-y)*np.log(link_f)
|
||||
objective = np.where(y==1, np.log(link_f), np.log(1-link_f))
|
||||
return np.sum(objective)
|
||||
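# --- Illustrative sketch (added commentary, not GPy code): evaluating the
# expression above directly. With link_f = [0.9, 0.2] and y = [1, 0] the log
# likelihood is log(0.9) + log(0.8) (approximately -0.3285), since the terms
# factorise over data points:
#
#   import numpy as np
#   link_f = np.array([[0.9], [0.2]])
#   y = np.array([[1.], [0.]])
#   logp = np.sum(np.where(y == 1, np.log(link_f), np.log(1. - link_f)))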
|
||||
def dlogpdf_dlink(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the pdf at y, given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in bernoulli
|
||||
:returns: gradient of log likelihood evaluated at points link(f)
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
grad = (y/link_f) - (1.-y)/(1-link_f)
|
||||
return grad
|
||||
|
||||
def d2logpdf_dlink2(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
|
||||
i.e. second derivative logpdf at y given link(f_i) link(f_j) w.r.t link(f_i) and link(f_j)
|
||||
|
||||
|
||||
.. math::
|
||||
\\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in bernoulli
|
||||
:returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
|
||||
:rtype: Nx1 array
|
||||
|
||||
.. Note::
|
||||
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2)
|
||||
return d2logpdf_dlink2
|
||||
|
||||
def d3logpdf_dlink3(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i})}{(1-\\lambda(f))^{3}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in bernoulli
|
||||
:returns: third derivative of log likelihood evaluated at points link(f)
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3))
|
||||
return d3logpdf_dlink3
|
||||
|
||||
def _mean(self,gp):
|
||||
"""
|
||||
Expected value of y under the Mass (or density) function p(y|f)
|
||||
"""
|
||||
return self.gp_link.transf(gp)
|
||||
|
||||
def _variance(self,gp):
|
||||
"""
|
||||
Variance of y under the Mass (or density) function p(y|f)
|
||||
"""
|
||||
p = self.gp_link.transf(gp)
|
||||
return p*(1.-p)
|
||||
|
||||
def samples(self, gp):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
:param gp: latent variable
|
||||
"""
|
||||
orig_shape = gp.shape
|
||||
gp = gp.flatten()
|
||||
ns = np.ones_like(gp, dtype=int)
|
||||
Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
|
||||
return Ysim.reshape(orig_shape)
|
||||
156
GPy/likelihoods/noise_models/exponential_noise.py
Normal file
@@ -0,0 +1,156 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import gp_transformations
|
||||
from noise_distributions import NoiseDistribution
|
||||
|
||||
class Exponential(NoiseDistribution):
|
||||
"""
|
||||
Exponential likelihood
Y is expected to take non-negative real values
-----
$$
L(x) = \lambda(f_i) * \exp(-\lambda(f_i) * Y_i)
$$
|
||||
"""
|
||||
def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
|
||||
super(Exponential, self).__init__(gp_link,analytical_mean,analytical_variance)
|
||||
|
||||
def _preprocess_values(self,Y):
|
||||
return Y
|
||||
|
||||
def pdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})\\exp (-y\\lambda(f_{i}))
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in exponential distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
log_objective = link_f*np.exp(-y*link_f)
|
||||
return np.exp(np.sum(np.log(log_objective)))
|
||||
#return np.exp(np.sum(-y/link_f - np.log(link_f) ))
|
||||
|
||||
def logpdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Log Likelihood Function given link(f)
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\lambda(f_{i})) = \\ln \\lambda(f_{i}) - y_{i}\\lambda(f_{i})
|
||||
|
||||
:param link_f: latent variables (link(f))
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in exponential distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
log_objective = np.log(link_f) - y*link_f
|
||||
#logpdf_link = np.sum(-np.log(link_f) - y/link_f)
|
||||
return np.sum(log_objective)
|
||||
|
||||
def dlogpdf_dlink(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\lambda(f)} - y_{i}
|
||||
|
||||
:param link_f: latent variables (f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in exponential distribution
|
||||
:returns: gradient of likelihood evaluated at points
|
||||
:rtype: Nx1 array
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
grad = 1./link_f - y
|
||||
#grad = y/(link_f**2) - 1./link_f
|
||||
return grad
|
||||
|
||||
def d2logpdf_dlink2(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Hessian at y, given link(f), w.r.t link(f)
|
||||
i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
|
||||
The hessian will be 0 unless i == j
|
||||
|
||||
.. math::
|
||||
\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\frac{1}{\\lambda(f_{i})^{2}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in exponential distribution
|
||||
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
|
||||
:rtype: Nx1 array
|
||||
|
||||
.. Note::
|
||||
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
hess = -1./(link_f**2)
|
||||
#hess = -2*y/(link_f**3) + 1/(link_f**2)
|
||||
return hess
|
||||
|
||||
def d3logpdf_dlink3(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2}{\\lambda(f_{i})^{3}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in exponential distribution
|
||||
:returns: third derivative of likelihood evaluated at points f
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
d3lik_dlink3 = 2./(link_f**3)
|
||||
#d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
|
||||
return d3lik_dlink3
|
||||
|
||||
def _mean(self,gp):
|
||||
"""
|
||||
Mass (or density) function
|
||||
"""
|
||||
return self.gp_link.transf(gp)
|
||||
|
||||
def _variance(self,gp):
|
||||
"""
|
||||
Mass (or density) function
|
||||
"""
|
||||
return self.gp_link.transf(gp)**2
|
||||
|
||||
def samples(self, gp):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
:param gp: latent variable
|
||||
"""
|
||||
orig_shape = gp.shape
|
||||
gp = gp.flatten()
|
||||
Ysim = np.random.exponential(1.0/self.gp_link.transf(gp))
|
||||
return Ysim.reshape(orig_shape)
|
||||
155
GPy/likelihoods/noise_models/gamma_noise.py
Normal file
@@ -0,0 +1,155 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import gp_transformations
|
||||
from noise_distributions import NoiseDistribution
|
||||
|
||||
class Gamma(NoiseDistribution):
|
||||
"""
|
||||
Gamma likelihood
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
|
||||
\\alpha_{i} = \\beta \\lambda(f_{i})
|
||||
|
||||
"""
|
||||
def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
|
||||
self.beta = beta
|
||||
super(Gamma, self).__init__(gp_link,analytical_mean,analytical_variance)
|
||||
|
||||
def _preprocess_values(self,Y):
|
||||
return Y
|
||||
|
||||
def pdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
|
||||
\\alpha_{i} = \\beta y_{i}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in poisson distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
#return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
|
||||
alpha = link_f*self.beta
|
||||
objective = (y**(alpha - 1.) * np.exp(-self.beta*y) * self.beta**alpha)/ special.gamma(alpha)
|
||||
return np.exp(np.sum(np.log(objective)))
|
||||
|
||||
def logpdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Log Likelihood Function given link(f)
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
|
||||
\\alpha_{i} = \\beta y_{i}
|
||||
|
||||
:param link_f: latent variables (link(f))
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in poisson distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
#alpha = self.gp_link.transf(gp)*self.beta
|
||||
#return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
|
||||
alpha = link_f*self.beta
|
||||
log_objective = alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
|
||||
return np.sum(log_objective)
|
||||
|
||||
def dlogpdf_dlink(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\beta (\\log \\beta y_{i}) - \\Psi(\\alpha_{i})\\beta\\\\
|
||||
\\alpha_{i} = \\beta y_{i}
|
||||
|
||||
:param link_f: latent variables (f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in gamma distribution
|
||||
:returns: gradient of likelihood evaluated at points
|
||||
:rtype: Nx1 array
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
grad = self.beta*np.log(self.beta*y) - special.psi(self.beta*link_f)*self.beta
|
||||
#old
|
||||
#return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
|
||||
return grad
|
||||
|
||||
def d2logpdf_dlink2(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Hessian at y, given link(f), w.r.t link(f)
|
||||
i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
|
||||
The hessian will be 0 unless i == j
|
||||
|
||||
.. math::
|
||||
\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
|
||||
\\alpha_{i} = \\beta y_{i}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in gamma distribution
|
||||
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
|
||||
:rtype: Nx1 array
|
||||
|
||||
.. Note::
|
||||
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
hess = -special.polygamma(1, self.beta*link_f)*(self.beta**2)
|
||||
#old
|
||||
#return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
|
||||
return hess
|
||||
|
||||
def d3logpdf_dlink3(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}^{2}}\\\\
|
||||
\\alpha_{i} = \\beta y_{i}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in gamma distribution
|
||||
:returns: third derivative of likelihood evaluated at points f
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
d3lik_dlink3 = -special.polygamma(2, self.beta*link_f)*(self.beta**3)
|
||||
return d3lik_dlink3
|
||||
|
||||
def _mean(self,gp):
|
||||
"""
|
||||
Expected value of y under the Mass (or density) function p(y|f)
|
||||
"""
|
||||
return self.gp_link.transf(gp)
|
||||
|
||||
def _variance(self,gp):
|
||||
"""
|
||||
Variance of y under the Mass (or density) function p(y|f)
|
||||
"""
|
||||
return self.gp_link.transf(gp)/self.beta
|
||||
300
GPy/likelihoods/noise_models/gaussian_noise.py
Normal file
@@ -0,0 +1,300 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import gp_transformations
|
||||
from noise_distributions import NoiseDistribution
|
||||
|
||||
class Gaussian(NoiseDistribution):
|
||||
"""
|
||||
Gaussian likelihood
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
|
||||
|
||||
:param variance: variance value of the Gaussian distribution
|
||||
:param N: Number of data points
|
||||
:type N: int
|
||||
"""
|
||||
def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1., D=None, N=None):
|
||||
self.variance = variance
|
||||
self.N = N
|
||||
self._set_params(np.asarray(variance))
|
||||
super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)
|
||||
if isinstance(gp_link , gp_transformations.Identity):
|
||||
self.log_concave = True
|
||||
|
||||
def _get_params(self):
|
||||
return np.array([self.variance])
|
||||
|
||||
def _get_param_names(self):
|
||||
return ['noise_model_variance']
|
||||
|
||||
def _set_params(self, p):
|
||||
self.variance = float(p)
|
||||
self.I = np.eye(self.N)
|
||||
self.covariance_matrix = self.I * self.variance
|
||||
self.Ki = self.I*(1.0 / self.variance)
|
||||
#self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
|
||||
self.ln_det_K = self.N*np.log(self.variance)
|
||||
|
||||
def _gradients(self,partial):
|
||||
return np.zeros(1)
|
||||
#return np.sum(partial)
|
||||
|
||||
def _preprocess_values(self,Y):
|
||||
"""
|
||||
Check if the values of the observations correspond to the values
|
||||
assumed by the likelihood function.
|
||||
"""
|
||||
return Y
|
||||
|
||||
def _moments_match_analytical(self,data_i,tau_i,v_i):
|
||||
"""
|
||||
Moments match of the marginal approximation in EP algorithm
|
||||
|
||||
:param i: number of observation (int)
|
||||
:param tau_i: precision of the cavity distribution (float)
|
||||
:param v_i: mean/variance of the cavity distribution (float)
|
||||
"""
|
||||
sigma2_hat = 1./(1./self.variance + tau_i)
|
||||
mu_hat = sigma2_hat*(data_i/self.variance + v_i)
|
||||
sum_var = self.variance + 1./tau_i
|
||||
Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var)
|
||||
return Z_hat, mu_hat, sigma2_hat
|
||||
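# --- Illustrative sketch (added commentary, not GPy code): with a Gaussian
# site the zeroth moment has a closed form, so Z_hat above is simply the
# density of data_i under N(v_i/tau_i, variance + 1/tau_i):
#
#   import numpy as np
#   from scipy import stats
#   variance, data_i, tau_i, v_i = 0.5, 1.2, 2.0, 0.8
#   Z_ref = stats.norm.pdf(data_i, v_i/tau_i, np.sqrt(variance + 1./tau_i))
#   # Z_ref reproduces Z_hat computed from the expressions above.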
|
||||
def _predictive_mean_analytical(self,mu,sigma):
|
||||
new_sigma2 = self.predictive_variance(mu,sigma)
|
||||
return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)
|
||||
|
||||
def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
|
||||
return 1./(1./self.variance + 1./sigma**2)
|
||||
|
||||
def _mass(self, link_f, y, extra_data=None):
|
||||
NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
|
||||
Please negate your function and use pdf in noise_model.py, if implementing a likelihood\
|
||||
rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
|
||||
its derivatives")
|
||||
def _nlog_mass(self, link_f, y, extra_data=None):
|
||||
NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
|
||||
Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
|
||||
rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
|
||||
its derivatives")
|
||||
|
||||
def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
|
||||
NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
|
||||
Please negate your function and use dlogpdf_df in noise_model.py, if implementing a likelihood\
|
||||
rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
|
||||
its derivatives")
|
||||
|
||||
def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
|
||||
NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
|
||||
Please negate your function and use d2logpdf_df2 in noise_model.py, if implementing a likelihood\
|
||||
rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
|
||||
its derivatives")
|
||||
|
||||
def pdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
#Assumes no covariance, exp, sum, log for numerical stability
|
||||
return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
|
||||
|
||||
def logpdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Log likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: log likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.asarray(link_f).shape == np.asarray(y).shape
|
||||
return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
|
||||
|
||||
def dlogpdf_dlink(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the pdf at y, given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: gradient of log likelihood evaluated at points link(f)
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.asarray(link_f).shape == np.asarray(y).shape
|
||||
s2_i = (1.0/self.variance)
|
||||
grad = s2_i*y - s2_i*link_f
|
||||
return grad
|
||||
|
||||
def d2logpdf_dlink2(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Hessian at y, given link_f, w.r.t link_f.
|
||||
i.e. second derivative logpdf at y given link(f_i) link(f_j) w.r.t link(f_i) and link(f_j)
|
||||
|
||||
The hessian will be 0 unless i == j
|
||||
|
||||
.. math::
|
||||
\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
|
||||
:rtype: Nx1 array
|
||||
|
||||
.. Note::
|
||||
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
|
||||
"""
|
||||
assert np.asarray(link_f).shape == np.asarray(y).shape
|
||||
hess = -(1.0/self.variance)*np.ones((self.N, 1))
|
||||
return hess
|
||||
|
||||
def d3logpdf_dlink3(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = 0
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: third derivative of log likelihood evaluated at points link(f)
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.asarray(link_f).shape == np.asarray(y).shape
|
||||
d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None]
|
||||
return d3logpdf_dlink3
|
||||
|
||||
def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
|
||||
|
||||
.. math::
|
||||
\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = -\\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.asarray(link_f).shape == np.asarray(y).shape
|
||||
e = y - link_f
|
||||
s_4 = 1.0/(self.variance**2)
|
||||
dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.sum(np.square(e))
|
||||
return np.sum(dlik_dsigma) # Sure about this sum?
|
||||
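# --- Illustrative sketch (added commentary, not GPy code): the variance
# gradient above can be verified with a finite difference of the Gaussian
# log density -0.5*(sum((y-f)**2)/s2 + N*log(2*pi*s2)):
#
#   import numpy as np
#   y, f = np.array([0.3, -1.2]), np.array([0.1, -0.9])
#   s2, N, eps = 0.7, 2, 1e-6
#   logp = lambda v: -0.5*(np.sum((y - f)**2)/v + N*np.log(2.*np.pi*v))
#   fd = (logp(s2 + eps) - logp(s2 - eps))/(2.*eps)
#   analytic = -0.5*N/s2 + 0.5*np.sum((y - f)**2)/s2**2
#   # fd and analytic agree to roughly 1e-6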
|
||||
def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Derivative of the dlogpdf_dlink w.r.t variance parameter (noise_variance)
|
||||
|
||||
.. math::
|
||||
\\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)}) = \\frac{1}{\\sigma^{4}}(-y_{i} + \\lambda(f_{i}))
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.asarray(link_f).shape == np.asarray(y).shape
|
||||
s_4 = 1.0/(self.variance**2)
|
||||
dlik_grad_dsigma = -s_4*y + s_4*link_f
|
||||
return dlik_grad_dsigma
|
||||
|
||||
def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (noise_variance)
|
||||
|
||||
.. math::
|
||||
\\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)}) = \\frac{1}{\\sigma^{4}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data not used in gaussian
|
||||
:returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.asarray(link_f).shape == np.asarray(y).shape
|
||||
s_4 = 1.0/(self.variance**2)
|
||||
d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
|
||||
return d2logpdf_dlink2_dvar
|
||||
|
||||
def dlogpdf_link_dtheta(self, f, y, extra_data=None):
|
||||
dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
|
||||
return np.asarray([[dlogpdf_dvar]])
|
||||
|
||||
def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
|
||||
dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
|
||||
return dlogpdf_dlink_dvar
|
||||
|
||||
def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
|
||||
d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
|
||||
return d2logpdf_dlink2_dvar
|
||||
|
||||
def _mean(self,gp):
|
||||
"""
|
||||
Expected value of y under the Mass (or density) function p(y|f)
|
||||
|
||||
.. math::
|
||||
E_{p(y|f)}[y]
|
||||
"""
|
||||
return self.gp_link.transf(gp)
|
||||
|
||||
def _variance(self,gp):
|
||||
"""
|
||||
Variance of y under the Mass (or density) function p(y|f)
|
||||
|
||||
.. math::
|
||||
Var_{p(y|f)}[y]
|
||||
"""
|
||||
return self.variance
|
||||
|
||||
def samples(self, gp):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
:param gp: latent variable
|
||||
"""
|
||||
orig_shape = gp.shape
|
||||
gp = gp.flatten()
|
||||
Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
|
||||
return Ysim.reshape(orig_shape)
|
||||
159
GPy/likelihoods/noise_models/gp_transformations.py
Normal file
@@ -0,0 +1,159 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
import scipy as sp
|
||||
import pylab as pb
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf
|
||||
|
||||
class GPTransformation(object):
|
||||
"""
|
||||
Link function class for non-Gaussian likelihood approximations
|
||||
|
||||
:param Y: observed output (Nx1 numpy.darray)
|
||||
|
||||
.. note:: Y values allowed depend on the likelihood_function used
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def transf(self,f):
|
||||
"""
|
||||
Gaussian process transformation function, latent space -> output space
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def dtransf_df(self,f):
|
||||
"""
|
||||
derivative of transf(f) w.r.t. f
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
"""
|
||||
second derivative of transf(f) w.r.t. f
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def d3transf_df3(self,f):
|
||||
"""
|
||||
third derivative of transf(f) w.r.t. f
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
class Identity(GPTransformation):
|
||||
"""
|
||||
.. math::
|
||||
|
||||
g(f) = f
|
||||
|
||||
"""
|
||||
def transf(self,f):
|
||||
return f
|
||||
|
||||
def dtransf_df(self,f):
|
||||
return np.ones_like(f)
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
return np.zeros_like(f)
|
||||
|
||||
def d3transf_df3(self,f):
|
||||
return np.zeros_like(f)
|
||||
|
||||
|
||||
class Probit(GPTransformation):
|
||||
"""
|
||||
.. math::
|
||||
|
||||
g(f) = \\Phi^{-1} (mu)
|
||||
|
||||
"""
|
||||
def transf(self,f):
|
||||
return std_norm_cdf(f)
|
||||
|
||||
def dtransf_df(self,f):
|
||||
return std_norm_pdf(f)
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
#FIXME
|
||||
return -f * std_norm_pdf(f)
|
||||
|
||||
def d3transf_df3(self,f):
|
||||
#FIXME
|
||||
f2 = f**2
|
||||
return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
|
||||
|
||||
class Log(GPTransformation):
|
||||
"""
|
||||
.. math::
|
||||
|
||||
g(f) = \\log(\\mu)
|
||||
|
||||
"""
|
||||
def transf(self,f):
|
||||
return np.exp(f)
|
||||
|
||||
def dtransf_df(self,f):
|
||||
return np.exp(f)
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
return np.exp(f)
|
||||
|
||||
def d3transf_df3(self,f):
|
||||
return np.exp(f)
|
||||
|
||||
class Log_ex_1(GPTransformation):
|
||||
"""
|
||||
.. math::
|
||||
|
||||
g(f) = \\log(\\exp(\\mu) - 1)
|
||||
|
||||
"""
|
||||
def transf(self,f):
|
||||
return np.log(1.+np.exp(f))
|
||||
|
||||
def dtransf_df(self,f):
|
||||
return np.exp(f)/(1.+np.exp(f))
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
aux = np.exp(f)/(1.+np.exp(f))
|
||||
return aux*(1.-aux)
|
||||
|
||||
def d3transf_df3(self,f):
|
||||
aux = np.exp(f)/(1.+np.exp(f))
|
||||
daux_df = aux*(1.-aux)
|
||||
return daux_df - (2.*aux*daux_df)
|
||||
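# --- Illustrative sketch (added commentary, not GPy code): a quick finite
# difference check of dtransf_df for this softplus-style transformation:
#
#   import numpy as np
#   f, eps = np.linspace(-3., 3., 7), 1e-6
#   transf = lambda x: np.log(1. + np.exp(x))
#   fd = (transf(f + eps) - transf(f - eps))/(2.*eps)
#   analytic = np.exp(f)/(1. + np.exp(f))
#   # np.allclose(fd, analytic, atol=1e-6) holds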
|
||||
class Reciprocal(GPTransformation):
|
||||
def transf(self,f):
|
||||
return 1./f
|
||||
|
||||
def dtransf_df(self,f):
|
||||
return -1./(f**2)
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
return 2./(f**3)
|
||||
|
||||
def d3transf_df3(self,f):
|
||||
return -6./(f**4)
|
||||
|
||||
class Heaviside(GPTransformation):
|
||||
"""
|
||||
|
||||
.. math::
|
||||
|
||||
g(f) = I_{x \\in A}
|
||||
|
||||
"""
|
||||
def transf(self,f):
|
||||
#transformation goes here
|
||||
return np.where(f>0, 1, 0)
|
||||
|
||||
def dtransf_df(self,f):
|
||||
raise NotImplementedError, "This function is not differentiable!"
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
raise NotImplementedError, "This function is not differentiable!"
|
||||
433
GPy/likelihoods/noise_models/noise_distributions.py
Normal file
@@ -0,0 +1,433 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
import pylab as pb
|
||||
from GPy.util.plot import gpplot
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import gp_transformations
|
||||
from GPy.util.misc import chain_1, chain_2, chain_3
|
||||
from scipy.integrate import quad
|
||||
import warnings
|
||||
|
||||
class NoiseDistribution(object):
|
||||
"""
|
||||
Likelihood class for doing approximations
|
||||
"""
|
||||
def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
|
||||
assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
|
||||
self.gp_link = gp_link
|
||||
self.analytical_mean = analytical_mean
|
||||
self.analytical_variance = analytical_variance
|
||||
if self.analytical_mean:
|
||||
self.moments_match = self._moments_match_analytical
|
||||
self.predictive_mean = self._predictive_mean_analytical
|
||||
else:
|
||||
self.moments_match = self._moments_match_numerical
|
||||
self.predictive_mean = self._predictive_mean_numerical
|
||||
if self.analytical_variance:
|
||||
self.predictive_variance = self._predictive_variance_analytical
|
||||
else:
|
||||
self.predictive_variance = self._predictive_variance_numerical
|
||||
|
||||
self.log_concave = False
|
||||
|
||||
def _get_params(self):
|
||||
return np.zeros(0)
|
||||
|
||||
def _get_param_names(self):
|
||||
return []
|
||||
|
||||
def _set_params(self,p):
|
||||
pass
|
||||
|
||||
def _gradients(self,partial):
|
||||
return np.zeros(0)
|
||||
|
||||
def _preprocess_values(self,Y):
|
||||
"""
|
||||
In case it is needed, this function assesses the output values or applies any pertinent transformation to them.
|
||||
|
||||
:param Y: observed output
|
||||
:type Y: Nx1 numpy.darray
|
||||
|
||||
"""
|
||||
return Y
|
||||
|
||||
def _moments_match_analytical(self,obs,tau,v):
|
||||
"""
|
||||
If available, this function computes the moments analytically.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def log_predictive_density(self, y_test, mu_star, var_star):
|
||||
"""
|
||||
Calculation of the log predictive density
|
||||
|
||||
.. math:
|
||||
p(y_{*}|D) = \\int p(y_{*}|f_{*})p(f_{*}|\\mu_{*},\\sigma^{2}_{*}) df_{*}
|
||||
|
||||
:param y_test: test observations (y_{*})
|
||||
:type y_test: (Nx1) array
|
||||
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type mu_star: (Nx1) array
|
||||
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
|
||||
:type var_star: (Nx1) array
|
||||
"""
|
||||
assert y_test.shape==mu_star.shape
|
||||
assert y_test.shape==var_star.shape
|
||||
assert y_test.shape[1] == 1
|
||||
def integral_generator(y, m, v):
|
||||
"""Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
|
||||
def f(f_star):
|
||||
return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
|
||||
return f
|
||||
|
||||
scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
|
||||
scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
|
||||
p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
|
||||
return np.log(p_ystar)
|
||||
|
||||
def _moments_match_numerical(self,obs,tau,v):
|
||||
"""
|
||||
Calculation of moments using quadrature
|
||||
|
||||
:param obs: observed output
|
||||
:param tau: cavity distribution 1st natural parameter (precision)
|
||||
:param v: cavity distribution 2nd natural parameter (mu*precision)
|
||||
"""
|
||||
#Compute first integral for zeroth moment.
|
||||
#NOTE constant np.sqrt(2*pi/tau) added at the end of the function
|
||||
mu = v/tau
|
||||
def int_1(f):
|
||||
return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
|
||||
z_scaled, accuracy = quad(int_1, -np.inf, np.inf)
|
||||
|
||||
#Compute second integral for first moment
|
||||
def int_2(f):
|
||||
return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
|
||||
mean, accuracy = quad(int_2, -np.inf, np.inf)
|
||||
mean /= z_scaled
|
||||
|
||||
#Compute integral for variance
|
||||
def int_3(f):
|
||||
return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
|
||||
Ef2, accuracy = quad(int_3, -np.inf, np.inf)
|
||||
Ef2 /= z_scaled
|
||||
variance = Ef2 - mean**2
|
||||
|
||||
#Add constant to the zeroth moment
|
||||
#NOTE: this constant is not needed in the other moments because it cancels out.
|
||||
z = z_scaled/np.sqrt(2*np.pi/tau)
|
||||
|
||||
return z, mean, variance
|
||||
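# --- Illustrative sketch (added commentary, not GPy code): the same
# quadrature scheme in standalone form, with a Gaussian "likelihood" so the
# result can be compared against the exact Gaussian-times-Gaussian moments:
#
#   import numpy as np
#   from scipy import stats
#   from scipy.integrate import quad
#   obs, tau, v, lik_var = 0.7, 1.5, 0.3, 0.4
#   mu = v/tau
#   tilted = lambda f: stats.norm.pdf(obs, f, np.sqrt(lik_var))*np.exp(-0.5*tau*(mu - f)**2)
#   z_scaled = quad(tilted, -np.inf, np.inf)[0]
#   mean = quad(lambda f: f*tilted(f), -np.inf, np.inf)[0]/z_scaled
#   var = quad(lambda f: f**2*tilted(f), -np.inf, np.inf)[0]/z_scaled - mean**2
#   z = z_scaled/np.sqrt(2*np.pi/tau)
#   # exact reference: z = N(obs | mu, lik_var + 1/tau),
#   # var = 1/(tau + 1/lik_var), mean = var*(obs/lik_var + v)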
|
||||
def _predictive_mean_analytical(self,mu,sigma):
|
||||
"""
|
||||
Predictive mean
|
||||
.. math::
|
||||
E(Y^{*}|Y) = E( E(Y^{*}|f^{*}, Y) )
|
||||
|
||||
If available, this function computes the predictive mean analytically.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _predictive_variance_analytical(self,mu,sigma):
|
||||
"""
|
||||
Predictive variance
|
||||
.. math::
|
||||
V(Y^{*}| Y) = E( V(Y^{*}|f^{*}, Y) ) + V( E(Y^{*}|f^{*}, Y) )
|
||||
|
||||
If available, this function computes the predictive variance analytically.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _predictive_mean_numerical(self,mu,variance):
|
||||
"""
|
||||
Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
|
||||
|
||||
:param mu: mean of posterior
|
||||
:param sigma: standard deviation of posterior
|
||||
|
||||
"""
|
||||
def int_mean(f,m,v):
|
||||
return self._mean(f)*np.exp(-(0.5/v)*np.square(f - m))
|
||||
scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
|
||||
mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
|
||||
|
||||
return mean
|
||||
|
||||
def _predictive_variance_numerical(self,mu,variance,predictive_mean=None):
|
||||
"""
|
||||
Numerical approximation to the predictive variance: V(Y_star)
|
||||
|
||||
The following variance decomposition is used:
|
||||
V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
|
||||
|
||||
:param mu: mean of posterior
|
||||
:param sigma: standard deviation of posterior
|
||||
:predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
|
||||
|
||||
"""
|
||||
#sigma2 = sigma**2
|
||||
normalizer = np.sqrt(2*np.pi*variance)
|
||||
|
||||
# E( V(Y_star|f_star) )
|
||||
def int_var(f,m,v):
|
||||
return self._variance(f)*np.exp(-(0.5/v)*np.square(f - m))
|
||||
scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
|
||||
exp_var = np.array(scaled_exp_variance)[:,None] / normalizer
|
||||
|
||||
#V( E(Y_star|f_star) ) = E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
|
||||
|
||||
#E( E(Y_star|f_star) )**2
|
||||
if predictive_mean is None:
|
||||
predictive_mean = self.predictive_mean(mu,variance)
|
||||
predictive_mean_sq = predictive_mean**2
|
||||
|
||||
#E( E(Y_star|f_star)**2 )
|
||||
def int_pred_mean_sq(f,m,v,predictive_mean_sq):
|
||||
return self._mean(f)**2*np.exp(-(0.5/v)*np.square(f - m))
|
||||
scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
|
||||
exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
|
||||
|
||||
var_exp = exp_exp2 - predictive_mean_sq
|
||||
|
||||
# V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
|
||||
return exp_var + var_exp
|
||||
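# --- Illustrative sketch (added commentary, not GPy code): the decomposition
# V(Y*) = E(V(Y*|f*)) + V(E(Y*|f*)) checked by Monte Carlo for a toy model
# with E(Y|f) = f and V(Y|f) = 1, i.e. Y = f + unit-variance noise:
#
#   import numpy as np
#   np.random.seed(0)
#   mu, var = 0.5, 2.0
#   f = np.random.normal(mu, np.sqrt(var), 200000)
#   y = f + np.random.normal(0., 1., f.shape)
#   # np.var(y) is close to 1.0 + var, i.e. E(V(Y|f)) + V(E(Y|f))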
|
||||
def pdf_link(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def logpdf_link(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def dlogpdf_dlink(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def d2logpdf_dlink2(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def d3logpdf_dlink3(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def d2logpdf_dlink2_dtheta(self, link_f, y, extra_data=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def pdf(self, f, y, extra_data=None):
|
||||
"""
|
||||
Evaluates the link function link(f) then computes the likelihood (pdf) using it
|
||||
|
||||
.. math:
|
||||
p(y|\\lambda(f))
|
||||
|
||||
:param f: latent variables f
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
link_f = self.gp_link.transf(f)
|
||||
return self.pdf_link(link_f, y, extra_data=extra_data)
|
||||
|
||||
def logpdf(self, f, y, extra_data=None):
|
||||
"""
|
||||
Evaluates the link function link(f) then computes the log likelihood (log pdf) using it
|
||||
|
||||
.. math:
|
||||
\\log p(y|\\lambda(f))
|
||||
|
||||
:param f: latent variables f
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||
:returns: log likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
link_f = self.gp_link.transf(f)
|
||||
return self.logpdf_link(link_f, y, extra_data=extra_data)
|
||||
|
||||
def dlogpdf_df(self, f, y, extra_data=None):
|
||||
"""
|
||||
Evaluates the link function link(f) then computes the derivative of log likelihood using it
|
||||
Uses the Faa di Bruno's formula for the chain rule
|
||||
|
||||
.. math::
|
||||
\\frac{d\\log p(y|\\lambda(f))}{df} = \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d\\lambda(f)}{df}
|
||||
|
||||
:param f: latent variables f
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||
:returns: derivative of log likelihood evaluated for this point
|
||||
:rtype: 1xN array
|
||||
"""
|
||||
link_f = self.gp_link.transf(f)
|
||||
dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
|
||||
dlink_df = self.gp_link.dtransf_df(f)
|
||||
return chain_1(dlogpdf_dlink, dlink_df)
|
||||
|
||||
def d2logpdf_df2(self, f, y, extra_data=None):
|
||||
"""
|
||||
Evaluates the link function link(f) then computes the second derivative of log likelihood using it
|
||||
Uses the Faa di Bruno's formula for the chain rule
|
||||
|
||||
.. math::
|
||||
\\frac{d^{2}\\log p(y|\\lambda(f))}{df^{2}} = \\frac{d^{2}\\log p(y|\\lambda(f))}{d^{2}\\lambda(f)}\\left(\\frac{d\\lambda(f)}{df}\\right)^{2} + \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d^{2}\\lambda(f)}{df^{2}}
|
||||
|
||||
:param f: latent variables f
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||
:returns: second derivative of log likelihood evaluated for this point (diagonal only)
|
||||
:rtype: 1xN array
|
||||
"""
|
||||
link_f = self.gp_link.transf(f)
|
||||
d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
|
||||
dlink_df = self.gp_link.dtransf_df(f)
|
||||
dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
|
||||
d2link_df2 = self.gp_link.d2transf_df2(f)
|
||||
return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
|
||||
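# --- Illustrative sketch (added commentary, not GPy code): assuming
# GPy.util.misc.chain_2 implements the elementwise second-order Faa di Bruno
# formula written above, it behaves like
#
#   def chain_2_sketch(d2f_dg2, dg_dx, df_dg, d2g_dx2):
#       # d2(f o g)/dx2 = f''(g)*g'(x)**2 + f'(g)*g''(x)
#       return d2f_dg2*dg_dx**2 + df_dg*d2g_dx2
#
# with f = log p(y | .) and g = the link transformation lambda(.).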
|
||||
def d3logpdf_df3(self, f, y, extra_data=None):
|
||||
"""
|
||||
Evaluates the link function link(f) then computes the third derivative of log likelihood using it
|
||||
Uses the Faa di Bruno's formula for the chain rule
|
||||
|
||||
.. math::
|
||||
\\frac{d^{3}\\log p(y|\\lambda(f))}{df^{3}} = \\frac{d^{3}\\log p(y|\\lambda(f))}{d\\lambda(f)^{3}}\\left(\\frac{d\\lambda(f)}{df}\\right)^{3} + 3\\frac{d^{2}\\log p(y|\\lambda(f))}{d\\lambda(f)^{2}}\\frac{d\\lambda(f)}{df}\\frac{d^{2}\\lambda(f)}{df^{2}} + \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d^{3}\\lambda(f)}{df^{3}}
|
||||
|
||||
:param f: latent variables f
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||
:returns: third derivative of log likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
link_f = self.gp_link.transf(f)
|
||||
d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, extra_data=extra_data)
|
||||
dlink_df = self.gp_link.dtransf_df(f)
|
||||
d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
|
||||
d2link_df2 = self.gp_link.d2transf_df2(f)
|
||||
dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
|
||||
d3link_df3 = self.gp_link.d3transf_df3(f)
|
||||
return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
|
||||
|
||||
def dlogpdf_dtheta(self, f, y, extra_data=None):
|
||||
"""
|
||||
TODO: Doc strings
|
||||
"""
|
||||
if len(self._get_param_names()) > 0:
|
||||
link_f = self.gp_link.transf(f)
|
||||
return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
|
||||
else:
|
||||
#Is no parameters so return an empty array for its derivatives
|
||||
return np.empty([1, 0])
|
||||
|
||||
def dlogpdf_df_dtheta(self, f, y, extra_data=None):
|
||||
"""
|
||||
TODO: Doc strings
|
||||
"""
|
||||
if len(self._get_param_names()) > 0:
|
||||
link_f = self.gp_link.transf(f)
|
||||
dlink_df = self.gp_link.dtransf_df(f)
|
||||
dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
|
||||
return chain_1(dlogpdf_dlink_dtheta, dlink_df)
|
||||
else:
|
||||
#Is no parameters so return an empty array for its derivatives
|
||||
return np.empty([f.shape[0], 0])
|
||||
|
||||
def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
|
||||
"""
|
||||
TODO: Doc strings
|
||||
"""
|
||||
if len(self._get_param_names()) > 0:
|
||||
link_f = self.gp_link.transf(f)
|
||||
dlink_df = self.gp_link.dtransf_df(f)
|
||||
d2link_df2 = self.gp_link.d2transf_df2(f)
|
||||
d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
|
||||
dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
|
||||
return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
|
||||
else:
|
||||
#Is no parameters so return an empty array for its derivatives
|
||||
return np.empty([f.shape[0], 0])
|
||||
|
||||
def _laplace_gradients(self, f, y, extra_data=None):
|
||||
dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data)
|
||||
dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, extra_data=extra_data)
|
||||
d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, extra_data=extra_data)
|
||||
|
||||
#Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
|
||||
# ensure we have gradients for every parameter we want to optimize
|
||||
assert dlogpdf_dtheta.shape[1] == len(self._get_param_names())
|
||||
assert dlogpdf_df_dtheta.shape[1] == len(self._get_param_names())
|
||||
assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
|
||||
return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
|
||||
|
||||
def predictive_values(self, mu, var, full_cov=False, sampling=False, num_samples=10000):
|
||||
"""
|
||||
Compute mean, variance and confidence interval (percentiles 2.5 and 97.5) of the prediction.
|
||||
|
||||
:param mu: mean of the latent variable, f, of posterior
|
||||
:param var: variance of the latent variable, f, of posterior
|
||||
:param full_cov: whether to use the full covariance or just the diagonal
|
||||
:type full_cov: Boolean
|
||||
:param num_samples: number of samples to use in computing quantiles and
|
||||
possibly mean variance
|
||||
:type num_samples: integer
|
||||
:param sampling: Whether to use samples for mean and variances anyway
|
||||
:type sampling: Boolean
|
||||
|
||||
"""
|
||||
|
||||
if sampling:
|
||||
#Get gp_samples f* using posterior mean and variance
|
||||
if not full_cov:
|
||||
gp_samples = np.random.multivariate_normal(mu.flatten(), np.diag(var.flatten()),
|
||||
size=num_samples).T
|
||||
else:
|
||||
gp_samples = np.random.multivariate_normal(mu.flatten(), var,
|
||||
size=num_samples).T
|
||||
#Push gp samples (f*) through likelihood to give p(y*|f*)
|
||||
samples = self.samples(gp_samples)
|
||||
axis=-1
|
||||
|
||||
#Calculate mean, variance and percentiles from samples
|
||||
print "WARNING: Using sampling to calculate mean, variance and predictive quantiles."
|
||||
pred_mean = np.mean(samples, axis=axis)[:,None]
|
||||
pred_var = np.var(samples, axis=axis)[:,None]
|
||||
q1 = np.percentile(samples, 2.5, axis=axis)[:,None]
|
||||
q3 = np.percentile(samples, 97.5, axis=axis)[:,None]
|
||||
|
||||
else:
|
||||
|
||||
pred_mean = self.predictive_mean(mu, var)
|
||||
pred_var = self.predictive_variance(mu, var, pred_mean)
|
||||
print "WARNING: Predictive quantiles are only computed when sampling."
|
||||
q1 = np.repeat(np.nan,pred_mean.size)[:,None]
|
||||
q3 = q1.copy()
|
||||
|
||||
return pred_mean, pred_var, q1, q3
|
||||
|
||||
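# Hypothetical usage of predictive_values above (names and shapes are illustrative;
# mu/var would come from the posterior over f* of a fitted GPy model):
#     mu = np.zeros((5, 1)); var = 0.1 * np.ones((5, 1))
#     mean, variance, q2_5, q97_5 = noise_model.predictive_values(mu, var, sampling=True, num_samples=5000)
# With sampling=False the quantiles are returned as NaN, as noted in the warning above.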
def samples(self, gp):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
:param gp: latent variable
|
||||
"""
|
||||
raise NotImplementedError
|
||||
152
GPy/likelihoods/noise_models/poisson_noise.py
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
from __future__ import division
|
||||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import gp_transformations
|
||||
from noise_distributions import NoiseDistribution
|
||||
|
||||
class Poisson(NoiseDistribution):
|
||||
"""
|
||||
Poisson likelihood
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}
|
||||
|
||||
.. Note::
|
||||
Y is expected to take values in {0,1,2,...}
|
||||
"""
|
||||
def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
|
||||
super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
|
||||
|
||||
def _preprocess_values(self,Y): #TODO
|
||||
return Y
|
||||
|
||||
def pdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in poisson distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
return np.prod(stats.poisson.pmf(y,link_f))
|
||||
|
||||
def logpdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Log Likelihood Function given link(f)
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\lambda(f_{i})) = -\\lambda(f_{i}) + y_{i}\\log \\lambda(f_{i}) - \\log y_{i}!
|
||||
|
||||
:param link_f: latent variables (link(f))
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in poisson distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))
|
||||
|
||||
def dlogpdf_dlink(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - 1
|
||||
|
||||
:param link_f: latent variables (f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in poisson distribution
|
||||
:returns: gradient of likelihood evaluated at points
|
||||
:rtype: Nx1 array
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
return y/link_f - 1
|
||||
|
||||
def d2logpdf_dlink2(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Hessian at y, given link(f), w.r.t link(f)
|
||||
i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
|
||||
The hessian will be 0 unless i == j
|
||||
|
||||
.. math::
|
||||
\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{-y_{i}}{\\lambda(f_{i})^{2}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in poisson distribution
|
||||
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
|
||||
:rtype: Nx1 array
|
||||
|
||||
.. Note::
|
||||
Will return diagonal of hessian, since everywhere else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
hess = -y/(link_f**2)
|
||||
return hess
|
||||
#d2_df = self.gp_link.d2transf_df2(gp)
|
||||
#transf = self.gp_link.transf(gp)
|
||||
#return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
|
||||
|
||||
def d3logpdf_dlink3(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f_{i})^{3}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in poisson distribution
|
||||
:returns: third derivative of likelihood evaluated at points f
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
d3lik_dlink3 = 2*y/(link_f)**3
|
||||
return d3lik_dlink3
|
||||
|
||||
def _mean(self,gp):
|
||||
"""
|
||||
Expected value of y given the latent function: for the Poisson this is the rate lambda(f)
|
||||
"""
|
||||
return self.gp_link.transf(gp)
|
||||
|
||||
def _variance(self,gp):
|
||||
"""
|
||||
Variance of y given the latent function: for the Poisson this equals the rate lambda(f)
|
||||
"""
|
||||
return self.gp_link.transf(gp)
|
||||
|
||||
def samples(self, gp):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
:param gp: latent variable
|
||||
"""
|
||||
orig_shape = gp.shape
|
||||
gp = gp.flatten()
|
||||
Ysim = np.random.poisson(self.gp_link.transf(gp))
|
||||
return Ysim.reshape(orig_shape)
|
||||
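# Standalone numerical check (illustrative only, not part of the library) that the
# analytic gradient in dlogpdf_dlink above matches a finite-difference estimate of
# the Poisson log-pmf gradient with respect to the rate lambda:
if __name__ == '__main__':
    lam, yv, eps = 2.5, 3.0, 1e-6
    analytic = yv / lam - 1.0
    numeric = (stats.poisson.logpmf(yv, lam + eps)
               - stats.poisson.logpmf(yv, lam - eps)) / (2 * eps)
    print analytic, numeric  # the two values should agree to several decimal places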
277
GPy/likelihoods/noise_models/student_t_noise.py
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
# Copyright (c) 2012, 2013 Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats, special
|
||||
import scipy as sp
|
||||
import gp_transformations
|
||||
from noise_distributions import NoiseDistribution
|
||||
from scipy import stats, integrate
|
||||
from scipy.special import gammaln, gamma
|
||||
|
||||
class StudentT(NoiseDistribution):
|
||||
"""
|
||||
Student T likelihood
|
||||
|
||||
For nomenclature see Bayesian Data Analysis 2003 p576
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
|
||||
|
||||
"""
|
||||
def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2):
|
||||
self.v = deg_free
|
||||
self.sigma2 = sigma2
|
||||
|
||||
self._set_params(np.asarray(sigma2))
|
||||
super(StudentT, self).__init__(gp_link,analytical_mean,analytical_variance)
|
||||
self.log_concave = False
|
||||
|
||||
def _get_params(self):
|
||||
return np.asarray(self.sigma2)
|
||||
|
||||
def _get_param_names(self):
|
||||
return ["t_noise_std2"]
|
||||
|
||||
def _set_params(self, x):
|
||||
self.sigma2 = float(x)
|
||||
|
||||
@property
|
||||
def variance(self, extra_data=None):
|
||||
return (self.v / float(self.v - 2)) * self.sigma2
|
||||
|
||||
def pdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Likelihood function given link(f)
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
#Careful gamma(big_number) is infinity!
|
||||
objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
|
||||
/ (np.sqrt(self.v * np.pi * self.sigma2)))
|
||||
* ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
|
||||
)
|
||||
return np.prod(objective)
|
||||
|
||||
def logpdf_link(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Log Likelihood Function given link(f)
|
||||
|
||||
.. math::
|
||||
\\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
|
||||
|
||||
:param link_f: latent variables (link(f))
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
objective = (+ gammaln((self.v + 1) * 0.5)
|
||||
- gammaln(self.v * 0.5)
|
||||
- 0.5*np.log(self.sigma2 * self.v * np.pi)
|
||||
- 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
|
||||
)
|
||||
return np.sum(objective)
|
||||
|
||||
def dlogpdf_dlink(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}
|
||||
|
||||
:param link_f: latent variables (f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: gradient of likelihood evaluated at points
|
||||
:rtype: Nx1 array
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
|
||||
return grad
|
||||
|
||||
def d2logpdf_dlink2(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Hessian at y, given link(f), w.r.t link(f)
|
||||
i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
|
||||
The hessian will be 0 unless i == j
|
||||
|
||||
.. math::
|
||||
\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
|
||||
:rtype: Nx1 array
|
||||
|
||||
.. Note::
|
||||
Will return diagonal of hessian, since everywhere else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
|
||||
return hess
|
||||
|
||||
def d3logpdf_dlink3(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
|
||||
|
||||
.. math::
|
||||
\\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: third derivative of likelihood evaluated at points f
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
|
||||
((e**2 + self.sigma2*self.v)**3)
|
||||
)
|
||||
return d3lik_dlink3
|
||||
|
||||
def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
|
||||
|
||||
.. math::
|
||||
\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
|
||||
return np.sum(dlogpdf_dvar)
|
||||
|
||||
def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
|
||||
|
||||
.. math::
|
||||
\\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}
|
||||
|
||||
:param link_f: latent variables link_f
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
|
||||
return dlogpdf_dlink_dvar
|
||||
|
||||
def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
|
||||
"""
|
||||
Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
|
||||
|
||||
.. math::
|
||||
\\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}
|
||||
|
||||
:param link_f: latent variables link(f)
|
||||
:type link_f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param extra_data: extra_data which is not used in student t distribution
|
||||
:returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
|
||||
e = y - link_f
|
||||
d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
|
||||
/ ((self.sigma2*self.v + (e**2))**3)
|
||||
)
|
||||
return d2logpdf_dlink2_dvar
|
||||
|
||||
def dlogpdf_link_dtheta(self, f, y, extra_data=None):
|
||||
dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
|
||||
return np.asarray([[dlogpdf_dvar]])
|
||||
|
||||
def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
|
||||
dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
|
||||
return dlogpdf_dlink_dvar
|
||||
|
||||
def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
|
||||
d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
|
||||
return d2logpdf_dlink2_dvar
|
||||
|
||||
def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
|
||||
"""
|
||||
Compute predictive variance of student_t*normal p(y*|f*)p(f*)
|
||||
|
||||
Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
|
||||
(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
|
||||
*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
|
||||
"""
|
||||
|
||||
#FIXME: Not correct
|
||||
#We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
|
||||
#Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
|
||||
#Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
|
||||
#Which was also given to us as (var)
|
||||
#We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
|
||||
#However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
|
||||
true_var = 1/(1/sigma**2 + 1/self.variance)
|
||||
|
||||
return true_var
|
||||
|
||||
def _predictive_mean_analytical(self, mu, sigma):
|
||||
"""
|
||||
Compute mean of the prediction
|
||||
"""
|
||||
#FIXME: Not correct
|
||||
return mu
|
||||
|
||||
def samples(self, gp):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
:param gp: latent variable
|
||||
"""
|
||||
orig_shape = gp.shape
|
||||
gp = gp.flatten()
|
||||
#FIXME: Very slow as we are computing a new random variable per input!
|
||||
#Can't get it to sample all at the same time
|
||||
#student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
|
||||
dfs = np.ones_like(gp)*self.v
|
||||
scales = np.ones_like(gp)*np.sqrt(self.sigma2)
|
||||
student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
|
||||
scale=scales)
|
||||
return student_t_samples.reshape(orig_shape)
|
||||
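# Standalone numerical check (illustrative only, not part of the library) that the
# analytic gradient in dlogpdf_dlink above matches a finite-difference gradient of
# scipy's Student-t log-pdf with respect to the location parameter:
if __name__ == '__main__':
    v, sigma2, fv, yv, eps = 5.0, 2.0, 0.3, 1.7, 1e-6
    analytic = (v + 1) * (yv - fv) / ((yv - fv)**2 + sigma2 * v)
    s = np.sqrt(sigma2)
    numeric = (stats.t.logpdf(yv, v, loc=fv + eps, scale=s)
               - stats.t.logpdf(yv, v, loc=fv - eps, scale=s)) / (2 * eps)
    print analytic, numeric  # the two values should agree to several decimal places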
7
GPy/mappings/__init__.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernel import Kernel
|
||||
from linear import Linear
|
||||
from mlp import MLP
|
||||
#from rbf import RBF
|
||||
60
GPy/mappings/kernel.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ..core.mapping import Mapping
|
||||
import GPy
|
||||
|
||||
class Kernel(Mapping):
|
||||
"""
|
||||
Mapping based on a kernel/covariance function.
|
||||
|
||||
.. math::
|
||||
|
||||
f(\mathbf{x}*) = \mathbf{A}\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b}
|
||||
|
||||
:param X: input observations containing :math:`\mathbf{X}`
|
||||
:type X: ndarray
|
||||
:param output_dim: dimension of output.
|
||||
:type output_dim: int
|
||||
:param kernel: a GPy kernel, defaults to GPy.kern.rbf
|
||||
:type kernel: GPy.kern.kern
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, output_dim=1, kernel=None):
|
||||
Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim)
|
||||
if kernel is None:
|
||||
kernel = GPy.kern.rbf(self.input_dim)
|
||||
self.kern = kernel
|
||||
self.X = X
|
||||
self.num_data = X.shape[0]
|
||||
self.num_params = self.output_dim*(self.num_data + 1)
|
||||
self.A = np.empty((self.num_data, self.output_dim)) # placeholder; filled in by randomize()
|
||||
self.bias = np.empty(self.output_dim) # placeholder; filled in by randomize()
|
||||
self.randomize()
|
||||
self.name = 'kernel'
|
||||
def _get_param_names(self):
|
||||
return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.A.flatten(), self.bias))
|
||||
|
||||
def _set_params(self, x):
|
||||
self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
|
||||
self.bias = x[self.num_data*self.output_dim:].copy()
|
||||
|
||||
def randomize(self):
|
||||
self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
|
||||
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1)
|
||||
|
||||
def f(self, X):
|
||||
return np.dot(self.kern.K(X, self.X),self.A) + self.bias
|
||||
|
||||
def df_dtheta(self, dL_df, X):
|
||||
self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
|
||||
self._df_dbias = (dL_df.sum(0))
|
||||
return np.hstack((self._df_dA.flatten(), self._df_dbias))
|
||||
|
||||
def df_dX(self, dL_df, X):
|
||||
return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
|
||||
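# Hypothetical usage of the Kernel mapping above (requires a working GPy install;
# the basis points and dimensions below are made up for illustration):
#     basis = np.random.randn(20, 2)        # the fixed X the map is built on
#     mapping = Kernel(X=basis, output_dim=3)
#     Xnew = np.random.randn(5, 2)
#     print mapping.f(Xnew).shape           # -> (5, 3): K(Xnew, basis) A + bias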
53
GPy/mappings/linear.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ..core.mapping import Mapping
|
||||
|
||||
class Linear(Mapping):
|
||||
"""
|
||||
Mapping based on a linear model.
|
||||
|
||||
.. math::
|
||||
|
||||
f(\mathbf{x}*) = \mathbf{W}\mathbf{x}^* + \mathbf{b}
|
||||
|
||||
:param X: input observations
|
||||
:type X: ndarray
|
||||
:param output_dim: dimension of output.
|
||||
:type output_dim: int
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim=1, output_dim=1):
|
||||
self.name = 'linear'
|
||||
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
|
||||
self.num_params = self.output_dim*(self.input_dim + 1)
|
||||
self.W = np.empty((self.input_dim, self.output_dim)) # placeholder; filled in by randomize()
|
||||
self.bias = np.empty(self.output_dim) # placeholder; filled in by randomize()
|
||||
self.randomize()
|
||||
|
||||
def _get_param_names(self):
|
||||
return sum([['W_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.input_dim)], []) + ['bias_%i' % d for d in range(self.output_dim)]
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.W.flatten(), self.bias))
|
||||
|
||||
def _set_params(self, x):
|
||||
self.W = x[:self.input_dim * self.output_dim].reshape(self.input_dim, self.output_dim).copy()
|
||||
self.bias = x[self.input_dim*self.output_dim:].copy()
|
||||
def randomize(self):
|
||||
self.W = np.random.randn(self.input_dim, self.output_dim)/np.sqrt(self.input_dim + 1)
|
||||
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.input_dim + 1)
|
||||
|
||||
def f(self, X):
|
||||
return np.dot(X,self.W) + self.bias
|
||||
|
||||
def df_dtheta(self, dL_df, X):
|
||||
self._df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
|
||||
self._df_dbias = (dL_df.sum(0))
|
||||
return np.hstack((self._df_dW.flatten(), self._df_dbias))
|
||||
|
||||
def df_dX(self, dL_df, X):
|
||||
return (dL_df[:, None, :]*self.W[None, :, :]).sum(2)
|
||||
|
||||
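# Standalone sketch (illustrative) of the chain rule implemented by df_dX above:
# for a linear map f(X) = XW + b, the gradient dL/dX equals dL_df dot W^T.
if __name__ == '__main__':
    N, Q, D = 4, 3, 2
    W = np.random.randn(Q, D)
    dL_df = np.random.randn(N, D)
    broadcast_version = (dL_df[:, None, :] * W[None, :, :]).sum(2)
    dot_version = np.dot(dL_df, W.T)
    print np.allclose(broadcast_version, dot_version)  # True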
130
GPy/mappings/mlp.py
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ..core.mapping import Mapping
|
||||
|
||||
class MLP(Mapping):
|
||||
"""
|
||||
Mapping based on a multi-layer perceptron neural network model.
|
||||
|
||||
.. math::
|
||||
|
||||
f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0
|
||||
|
||||
where
|
||||
|
||||
.. math::
|
||||
|
||||
\\phi(\\cdot) = \\text{tanh}(\\cdot)
|
||||
|
||||
:param X: input observations
|
||||
:type X: ndarray
|
||||
:param output_dim: dimension of output.
|
||||
:type output_dim: int
|
||||
:param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as many hidden layers as the length of the list, each with the given number of hidden nodes in it.
|
||||
:type hidden_dim: int or list of ints.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
|
||||
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
|
||||
self.name = 'mlp'
|
||||
if isinstance(hidden_dim, int):
|
||||
hidden_dim = [hidden_dim]
|
||||
self.hidden_dim = hidden_dim
|
||||
self.activation = [None]*len(self.hidden_dim)
|
||||
self.W = []
|
||||
self._dL_dW = []
|
||||
self.bias = []
|
||||
self._dL_dbias = []
|
||||
self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
|
||||
self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
|
||||
self.bias.append(np.zeros(self.hidden_dim[0]))
|
||||
self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
|
||||
self.num_params = self.hidden_dim[0]*(self.input_dim+1)
|
||||
for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
|
||||
self.W.append(np.zeros((h0, h1)))
|
||||
self._dL_dW.append(np.zeros((h0, h1)))
|
||||
self.bias.append(np.zeros(h1))
|
||||
self._dL_dbias.append(np.zeros(h1))
|
||||
self.num_params += h1*(h0+1)
|
||||
self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
|
||||
self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
|
||||
self.bias.append(np.zeros(self.output_dim))
|
||||
self._dL_dbias.append(np.zeros(self.output_dim))
|
||||
self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
|
||||
self.randomize()
|
||||
|
||||
def _get_param_names(self):
|
||||
return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], [])
|
||||
|
||||
def _get_params(self):
|
||||
param = np.array([])
|
||||
for W, bias in zip(self.W, self.bias):
|
||||
param = np.hstack((param, W.flatten(), bias))
|
||||
return param
|
||||
|
||||
def _set_params(self, x):
|
||||
start = 0
|
||||
for W, bias in zip(self.W, self.bias):
|
||||
end = W.shape[0]*W.shape[1]+start
|
||||
W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
|
||||
start = end
|
||||
end = W.shape[1]+end
|
||||
bias[:] = x[start:end].copy()
|
||||
start = end
|
||||
|
||||
def randomize(self):
|
||||
for W, bias in zip(self.W, self.bias):
|
||||
W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
|
||||
bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)
|
||||
|
||||
def f(self, X):
|
||||
self._f_computations(X)
|
||||
return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1]
|
||||
|
||||
def _f_computations(self, X):
|
||||
W = self.W[0]
|
||||
bias = self.bias[0]
|
||||
self.activation[0] = np.dot(X,W) + bias
|
||||
for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
|
||||
self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias
|
||||
|
||||
def df_dtheta(self, dL_df, X):
|
||||
self._df_computations(dL_df, X)
|
||||
g = np.array([])
|
||||
for gW, gbias in zip(self._dL_dW, self._dL_dbias):
|
||||
g = np.hstack((g, gW.flatten(), gbias))
|
||||
return g
|
||||
|
||||
def _df_computations(self, dL_df, X):
|
||||
self._f_computations(X)
|
||||
a0 = self.activation[-1]
|
||||
W = self.W[-1]
|
||||
self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
|
||||
dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2)
|
||||
self._dL_dbias[-1] = (dL_df.sum(0))
|
||||
for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
|
||||
self._dL_dbias[-2:0:-1],
|
||||
self.W[-2:0:-1],
|
||||
self.bias[-2:0:-1],
|
||||
self.activation[-2::-1],
|
||||
self.activation[-1:0:-1]):
|
||||
ta = np.tanh(a1)
|
||||
dL_da = dL_dta*(1-ta*ta)
|
||||
dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
|
||||
dL_dbias[:] = (dL_da.sum(0))
|
||||
dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
|
||||
ta = np.tanh(self.activation[0])
|
||||
dL_da = dL_dta*(1-ta*ta)
|
||||
W = self.W[0]
|
||||
self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
|
||||
self._dL_dbias[0] = (dL_da.sum(0))
|
||||
self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
|
||||
|
||||
|
||||
def df_dX(self, dL_df, X):
|
||||
self._df_computations(dL_df, X)
|
||||
return self._dL_dX
|
||||
|
||||
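# Standalone sketch (illustrative) of the single-hidden-layer forward pass that
# f() above implements: f(X) = tanh(X W1 + b1) W0 + b0, with random placeholder weights.
if __name__ == '__main__':
    X = np.random.randn(6, 2)
    W1, b1 = np.random.randn(2, 3), np.random.randn(3)
    W0, b0 = np.random.randn(3, 1), np.random.randn(1)
    out = np.dot(np.tanh(np.dot(X, W1) + b1), W0) + b0
    print out.shape  # -> (6, 1)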
31
GPy/models.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
'''
|
||||
GPy Models
|
||||
==========
|
||||
|
||||
Implementations for common models used in GP regression and classification.
|
||||
The different models can be viewed in :mod:`GPy.models_modules`, which holds
|
||||
detailed explanations for the different models.
|
||||
|
||||
:warning: This module is a convenience module for end users. For developers
|
||||
see :mod:`GPy.models_modules`, which holds the implementations for each model.
|
||||
'''
|
||||
|
||||
__updated__ = '2013-11-28'
|
||||
|
||||
from models_modules.bayesian_gplvm import BayesianGPLVM
|
||||
from models_modules.gp_regression import GPRegression
|
||||
from models_modules.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification
|
||||
from models_modules.sparse_gp_regression import SparseGPRegression#; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression
|
||||
from models_modules.svigp_regression import SVIGPRegression#; _svigp_regression = svigp_regression ; del svigp_regression
|
||||
from models_modules.sparse_gp_classification import SparseGPClassification#; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification
|
||||
from models_modules.fitc_classification import FITCClassification#; _fitc_classification = fitc_classification ; del fitc_classification
|
||||
from models_modules.gplvm import GPLVM#; _gplvm = gplvm ; del gplvm
|
||||
from models_modules.bcgplvm import BCGPLVM#; _bcgplvm = bcgplvm; del bcgplvm
|
||||
from models_modules.sparse_gplvm import SparseGPLVM#; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm
|
||||
from models_modules.warped_gp import WarpedGP#; _warped_gp = warped_gp ; del warped_gp
|
||||
from models_modules.bayesian_gplvm import BayesianGPLVM#; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm
|
||||
from models_modules.mrd import MRD#; _mrd = mrd; del mrd
|
||||
from models_modules.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker
|
||||
from models_modules.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression
|
||||
from models_modules.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression
|
||||
from models_modules.gradient_checker import GradientChecker
|
||||
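# Hypothetical end-user usage of this convenience module (data and kernel below
# are made up; the model classes are those imported above):
#     import numpy as np, GPy
#     X = np.random.randn(20, 1)
#     Y = np.sin(X) + 0.1 * np.random.randn(20, 1)
#     m = GPy.models.GPRegression(X, Y)
#     m.ensure_default_constraints()
#     m.optimize()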
|
|
@ -1,13 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from gp_regression import GPRegression
|
||||
from gp_classification import GPClassification
|
||||
from sparse_gp_regression import SparseGPRegression
|
||||
from svigp_regression import SVIGPRegression
|
||||
from sparse_gp_classification import SparseGPClassification
|
||||
from fitc_classification import FITCClassification
|
||||
from gplvm import GPLVM
|
||||
from warped_gp import WarpedGP
|
||||
from bayesian_gplvm import BayesianGPLVM
|
||||
from mrd import MRD
|
||||
|
|
@ -1,583 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ..core import SparseGP
|
||||
from ..likelihoods import Gaussian
|
||||
from .. import kern
|
||||
import itertools
|
||||
from matplotlib.colors import colorConverter
|
||||
from GPy.inference.optimization import SCG
|
||||
from GPy.util import plot_latent
|
||||
from GPy.models.gplvm import GPLVM
|
||||
|
||||
class BayesianGPLVM(SparseGP, GPLVM):
|
||||
"""
|
||||
Bayesian Gaussian Process Latent Variable Model
|
||||
|
||||
:param Y: observed data (np.ndarray) or GPy.likelihood
|
||||
:type Y: np.ndarray| GPy.likelihood instance
|
||||
:param input_dim: latent dimensionality
|
||||
:type input_dim: int
|
||||
:param init: initialisation method for the latent space
|
||||
:type init: 'PCA'|'random'
|
||||
|
||||
"""
|
||||
def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
|
||||
Z=None, kernel=None, oldpsave=10, _debug=False,
|
||||
**kwargs):
|
||||
if type(likelihood_or_Y) is np.ndarray:
|
||||
likelihood = Gaussian(likelihood_or_Y)
|
||||
else:
|
||||
likelihood = likelihood_or_Y
|
||||
|
||||
if X == None:
|
||||
X = self.initialise_latent(init, input_dim, likelihood.Y)
|
||||
self.init = init
|
||||
|
||||
if X_variance is None:
|
||||
X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)
|
||||
|
||||
if Z is None:
|
||||
Z = np.random.permutation(X.copy())[:num_inducing]
|
||||
assert Z.shape[1] == X.shape[1]
|
||||
|
||||
if kernel is None:
|
||||
kernel = kern.rbf(input_dim) + kern.white(input_dim)
|
||||
|
||||
self.oldpsave = oldpsave
|
||||
self._oldps = []
|
||||
self._debug = _debug
|
||||
|
||||
if self._debug:
|
||||
self.f_call = 0
|
||||
self._count = itertools.count()
|
||||
self._savedklll = []
|
||||
self._savedparams = []
|
||||
self._savedgradients = []
|
||||
self._savederrors = []
|
||||
self._savedpsiKmm = []
|
||||
self._savedABCD = []
|
||||
|
||||
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
|
||||
self.ensure_default_constraints()
|
||||
|
||||
@property
|
||||
def oldps(self):
|
||||
return self._oldps
|
||||
@oldps.setter
|
||||
def oldps(self, p):
|
||||
if len(self._oldps) == (self.oldpsave + 1):
|
||||
self._oldps.pop()
|
||||
# if len(self._oldps) == 0 or not np.any([np.any(np.abs(p - op) > 1e-5) for op in self._oldps]):
|
||||
self._oldps.insert(0, p.copy())
|
||||
|
||||
def _get_param_names(self):
|
||||
X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
||||
S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
||||
return (X_names + S_names + SparseGP._get_param_names(self))
|
||||
|
||||
def _get_params(self):
|
||||
"""
|
||||
Horizontally stacks the parameters in order to present them to the optimizer.
|
||||
The resulting 1-D array has this structure:
|
||||
|
||||
===============================================================
|
||||
| mu | S | Z | theta | beta |
|
||||
===============================================================
|
||||
|
||||
"""
|
||||
x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
|
||||
return x
|
||||
|
||||
def _clipped(self, x):
|
||||
return x # np.clip(x, -1e300, 1e300)
|
||||
|
||||
def _set_params(self, x, save_old=True, save_count=0):
|
||||
# try:
|
||||
x = self._clipped(x)
|
||||
N, input_dim = self.num_data, self.input_dim
|
||||
self.X = x[:self.X.size].reshape(N, input_dim).copy()
|
||||
self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
|
||||
SparseGP._set_params(self, x[(2 * N * input_dim):])
|
||||
# self.oldps = x
|
||||
# except (LinAlgError, FloatingPointError, ZeroDivisionError):
|
||||
# print "\rWARNING: Caught LinAlgError, continueing without setting "
|
||||
# if self._debug:
|
||||
# self._savederrors.append(self.f_call)
|
||||
# if save_count > 10:
|
||||
# raise
|
||||
# self._set_params(self.oldps[-1], save_old=False, save_count=save_count + 1)
|
||||
|
||||
def dKL_dmuS(self):
|
||||
dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
|
||||
dKL_dmu = self.X
|
||||
return dKL_dmu, dKL_dS
|
||||
|
||||
def dL_dmuS(self):
|
||||
dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance)
|
||||
dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance)
|
||||
dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance)
|
||||
dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
|
||||
dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
|
||||
|
||||
return dL_dmu, dL_dS
|
||||
|
||||
def KL_divergence(self):
|
||||
var_mean = np.square(self.X).sum()
|
||||
var_S = np.sum(self.X_variance - np.log(self.X_variance))
|
||||
return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data
|
||||
|
||||
def log_likelihood(self):
|
||||
ll = SparseGP.log_likelihood(self)
|
||||
kl = self.KL_divergence()
|
||||
|
||||
# if ll < -2E4:
|
||||
# ll = -2E4 + np.random.randn()
|
||||
# if kl > 5E4:
|
||||
# kl = 5E4 + np.random.randn()
|
||||
|
||||
if self._debug:
|
||||
self.f_call = self._count.next()
|
||||
if self.f_call % 1 == 0:
|
||||
self._savedklll.append([self.f_call, ll, kl])
|
||||
self._savedparams.append([self.f_call, self._get_params()])
|
||||
self._savedgradients.append([self.f_call, self._log_likelihood_gradients()])
|
||||
self._savedpsiKmm.append([self.f_call, [self.Kmm, self.dL_dKmm]])
|
||||
# sf2 = self.scale_factor ** 2
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
A = -0.5 * self.num_data * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.V * self.likelihood.Y)
|
||||
# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A) * sf2)
|
||||
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
|
||||
else:
|
||||
A = -0.5 * self.num_data * self.input_dim * (np.log(2.*np.pi) + np.log(self.likelihood._variance)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
|
||||
# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A) * sf2)
|
||||
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
|
||||
C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
|
||||
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
|
||||
self._savedABCD.append([self.f_call, A, B, C, D])
|
||||
|
||||
# print "\nkl:", kl, "ll:", ll
|
||||
return ll - kl
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
dKL_dmu, dKL_dS = self.dKL_dmuS()
|
||||
dL_dmu, dL_dS = self.dL_dmuS()
|
||||
# TODO: find way to make faster
|
||||
|
||||
d_dmu = (dL_dmu - dKL_dmu).flatten()
|
||||
d_dS = (dL_dS - dKL_dS).flatten()
|
||||
# TEST KL: ====================
|
||||
# d_dmu = (dKL_dmu).flatten()
|
||||
# d_dS = (dKL_dS).flatten()
|
||||
# ========================
|
||||
# TEST L: ====================
|
||||
# d_dmu = (dL_dmu).flatten()
|
||||
# d_dS = (dL_dS).flatten()
|
||||
# ========================
|
||||
self.dbound_dmuS = np.hstack((d_dmu, d_dS))
|
||||
self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
|
||||
return self._clipped(np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta)))
|
||||
|
||||
def plot_latent(self, *args, **kwargs):
|
||||
return plot_latent.plot_latent_indices(self, *args, **kwargs)
|
||||
|
||||
def do_test_latents(self, Y):
|
||||
"""
|
||||
Compute the latent representation for a set of new points Y
|
||||
|
||||
Notes:
|
||||
This will only work with a univariate Gaussian likelihood (for now)
|
||||
"""
|
||||
assert not self.likelihood.is_heteroscedastic
|
||||
N_test = Y.shape[0]
|
||||
input_dim = self.Z.shape[1]
|
||||
means = np.zeros((N_test, input_dim))
|
||||
covars = np.zeros((N_test, input_dim))
|
||||
|
||||
dpsi0 = -0.5 * self.input_dim * self.likelihood.precision
|
||||
dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
|
||||
V = self.likelihood.precision * Y
|
||||
dpsi1 = np.dot(self.Cpsi1V, V.T)
|
||||
|
||||
start = np.zeros(self.input_dim * 2)
|
||||
|
||||
for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
|
||||
args = (self.kern, self.Z, dpsi0, dpsi1_n, dpsi2)
|
||||
xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
|
||||
|
||||
mu, log_S = xopt.reshape(2, 1, -1)
|
||||
means[n] = mu[0].copy()
|
||||
covars[n] = np.exp(log_S[0]).copy()
|
||||
|
||||
return means, covars
|
||||
|
||||
|
||||
def plot_X_1d(self, fignum=None, ax=None, colors=None):
|
||||
"""
|
||||
Plot latent space X in 1D:
|
||||
|
||||
-if fig is given, create input_dim subplots in fig and plot in these
|
||||
-if ax is given plot input_dim 1D latent space plots of X into each `axis`
|
||||
-if neither fig nor ax is given create a figure with fignum and plot in there
|
||||
|
||||
colors:
|
||||
colors of different latent space dimensions input_dim
|
||||
"""
|
||||
import pylab
|
||||
if ax is None:
|
||||
fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1]))))
|
||||
if colors is None:
|
||||
colors = pylab.gca()._get_lines.color_cycle
|
||||
pylab.clf()
|
||||
else:
|
||||
colors = iter(colors)
|
||||
plots = []
|
||||
x = np.arange(self.X.shape[0])
|
||||
for i in range(self.X.shape[1]):
|
||||
if ax is None:
|
||||
a = fig.add_subplot(self.X.shape[1], 1, i + 1)
|
||||
elif isinstance(ax, (tuple, list)):
|
||||
a = ax[i]
|
||||
else:
|
||||
raise ValueError("Need one ax per latent dimnesion input_dim")
|
||||
a.plot(self.X, c='k', alpha=.3)
|
||||
plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
|
||||
a.fill_between(x,
|
||||
self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
|
||||
self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
|
||||
facecolor=plots[-1].get_color(),
|
||||
alpha=.3)
|
||||
a.legend(borderaxespad=0.)
|
||||
a.set_xlim(x.min(), x.max())
|
||||
if i < self.X.shape[1] - 1:
|
||||
a.set_xticklabels('')
|
||||
pylab.draw()
|
||||
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
|
||||
return fig
|
||||
|
||||
def __getstate__(self):
|
||||
return (self.likelihood, self.input_dim, self.X, self.X_variance,
|
||||
self.init, self.num_inducing, self.Z, self.kern,
|
||||
self.oldpsave, self._debug)
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.__init__(*state)
|
||||
|
||||
def _debug_filter_params(self, x):
|
||||
start, end = 0, self.X.size,
|
||||
X = x[start:end].reshape(self.num_data, self.input_dim)
|
||||
start, end = end, end + self.X_variance.size
|
||||
X_v = x[start:end].reshape(self.num_data, self.input_dim)
|
||||
start, end = end, end + (self.num_inducing * self.input_dim)
|
||||
Z = x[start:end].reshape(self.num_inducing, self.input_dim)
|
||||
start, end = end, end + self.input_dim
|
||||
theta = x[start:]
|
||||
return X, X_v, Z, theta
|
||||
|
||||
|
||||
def _debug_get_axis(self, figs):
|
||||
if figs[-1].axes:
|
||||
ax1 = figs[-1].axes[0]
|
||||
ax1.cla()
|
||||
else:
|
||||
ax1 = figs[-1].add_subplot(111)
|
||||
return ax1
|
||||
|
||||
def _debug_plot(self):
|
||||
assert self._debug, "must enable _debug, to debug-plot"
|
||||
import pylab
|
||||
# from mpl_toolkits.mplot3d import Axes3D
|
||||
figs = [pylab.figure('BGPLVM DEBUG', figsize=(12, 4))]
|
||||
# fig.clf()
|
||||
|
||||
# log like
|
||||
# splotshape = (6, 4)
|
||||
# ax1 = pylab.subplot2grid(splotshape, (0, 0), 1, 4)
|
||||
ax1 = self._debug_get_axis(figs)
|
||||
ax1.text(.5, .5, "Optimization", alpha=.3, transform=ax1.transAxes,
|
||||
ha='center', va='center')
|
||||
kllls = np.array(self._savedklll)
|
||||
LL, = ax1.plot(kllls[:, 0], kllls[:, 1] - kllls[:, 2], '-', label=r'$\log p(\mathbf{Y})$', mew=1.5)
|
||||
KL, = ax1.plot(kllls[:, 0], kllls[:, 2], '-', label=r'$\mathcal{KL}(p||q)$', mew=1.5)
|
||||
L, = ax1.plot(kllls[:, 0], kllls[:, 1], '-', label=r'$L$', mew=1.5) # \mathds{E}_{q(\mathbf{X})}[p(\mathbf{Y|X})\frac{p(\mathbf{X})}{q(\mathbf{X})}]
|
||||
|
||||
param_dict = dict(self._savedparams)
|
||||
gradient_dict = dict(self._savedgradients)
|
||||
# kmm_dict = dict(self._savedpsiKmm)
|
||||
iters = np.array(param_dict.keys())
|
||||
ABCD_dict = np.array(self._savedABCD)
|
||||
self.showing = 0
|
||||
|
||||
# ax2 = pylab.subplot2grid(splotshape, (1, 0), 2, 4)
|
||||
figs.append(pylab.figure("BGPLVM DEBUG X", figsize=(12, 4)))
|
||||
ax2 = self._debug_get_axis(figs)
|
||||
ax2.text(.5, .5, r"$\mathbf{X}$", alpha=.5, transform=ax2.transAxes,
|
||||
ha='center', va='center')
|
||||
figs[-1].canvas.draw()
|
||||
figs[-1].tight_layout(rect=(0, 0, 1, .86))
|
||||
# ax3 = pylab.subplot2grid(splotshape, (3, 0), 2, 4, sharex=ax2)
|
||||
figs.append(pylab.figure("BGPLVM DEBUG S", figsize=(12, 4)))
|
||||
ax3 = self._debug_get_axis(figs)
|
||||
ax3.text(.5, .5, r"$\mathbf{S}$", alpha=.5, transform=ax3.transAxes,
|
||||
ha='center', va='center')
|
||||
figs[-1].canvas.draw()
|
||||
figs[-1].tight_layout(rect=(0, 0, 1, .86))
|
||||
# ax4 = pylab.subplot2grid(splotshape, (5, 0), 2, 2)
|
||||
figs.append(pylab.figure("BGPLVM DEBUG Z", figsize=(6, 4)))
|
||||
ax4 = self._debug_get_axis(figs)
|
||||
ax4.text(.5, .5, r"$\mathbf{Z}$", alpha=.5, transform=ax4.transAxes,
|
||||
ha='center', va='center')
|
||||
figs[-1].canvas.draw()
|
||||
figs[-1].tight_layout(rect=(0, 0, 1, .86))
|
||||
# ax5 = pylab.subplot2grid(splotshape, (5, 2), 2, 2)
|
||||
figs.append(pylab.figure("BGPLVM DEBUG theta", figsize=(6, 4)))
|
||||
ax5 = self._debug_get_axis(figs)
|
||||
ax5.text(.5, .5, r"${\theta}$", alpha=.5, transform=ax5.transAxes,
|
||||
ha='center', va='center')
|
||||
figs[-1].canvas.draw()
|
||||
figs[-1].tight_layout(rect=(.15, 0, 1, .86))
|
||||
# figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
|
||||
# fig = figs[-1]
|
||||
# ax6 = fig.add_subplot(121)
|
||||
# ax6.text(.5, .5, r"${\mathbf{K}_{mm}}$", color='magenta', alpha=.5, transform=ax6.transAxes,
|
||||
# ha='center', va='center')
|
||||
# ax7 = fig.add_subplot(122)
|
||||
# ax7.text(.5, .5, r"${\frac{dL}{dK_{mm}}}$", color='magenta', alpha=.5, transform=ax7.transAxes,
|
||||
# ha='center', va='center')
|
||||
figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
|
||||
fig = figs[-1]
|
||||
ax8 = fig.add_subplot(121)
|
||||
ax8.text(.5, .5, r"${\mathbf{A,B,C,input_dim}}$", color='k', alpha=.5, transform=ax8.transAxes,
|
||||
ha='center', va='center')
|
||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 1], label='A')
|
||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 2], label='B')
|
||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 3], label='C')
|
||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 4], label='input_dim')
|
||||
ax8.legend()
|
||||
figs[-1].canvas.draw()
|
||||
figs[-1].tight_layout(rect=(.15, 0, 1, .86))
|
||||
|
||||
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
|
||||
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
|
||||
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
|
||||
|
||||
quiver_units = 'xy'
|
||||
quiver_scale = 1
|
||||
quiver_scale_units = 'xy'
|
||||
Xlatentplts = ax2.plot(X, ls="-", marker="x")
|
||||
colors = colorConverter.to_rgba_array([p.get_color() for p in Xlatentplts], .4)
|
||||
Ulatent = np.zeros_like(X)
|
||||
xlatent = np.tile(np.arange(0, X.shape[0])[:, None], X.shape[1])
|
||||
Xlatentgrads = ax2.quiver(xlatent, X, Ulatent, Xg, color=colors,
|
||||
units=quiver_units, scale_units=quiver_scale_units,
|
||||
scale=quiver_scale)
|
||||
|
||||
Slatentplts = ax3.plot(S, ls="-", marker="x")
|
||||
Slatentgrads = ax3.quiver(xlatent, S, Ulatent, Sg, color=colors,
|
||||
units=quiver_units, scale_units=quiver_scale_units,
|
||||
scale=quiver_scale)
|
||||
ax3.set_ylim(0, 1.)
|
||||
|
||||
xZ = np.tile(np.arange(0, Z.shape[0])[:, None], Z.shape[1])
|
||||
UZ = np.zeros_like(Z)
|
||||
Zplts = ax4.plot(Z, ls="-", marker="x")
|
||||
Zgrads = ax4.quiver(xZ, Z, UZ, Zg, color=colors,
|
||||
units=quiver_units, scale_units=quiver_scale_units,
|
||||
scale=quiver_scale)
|
||||
|
||||
xtheta = np.arange(len(theta))
|
||||
Utheta = np.zeros_like(theta)
|
||||
thetaplts = ax5.bar(xtheta - .4, theta, color=colors)
|
||||
thetagrads = ax5.quiver(xtheta, theta, Utheta, thetag, color=colors,
|
||||
units=quiver_units, scale_units=quiver_scale_units,
|
||||
scale=quiver_scale,
|
||||
edgecolors=('k',), linewidths=[1])
|
||||
pylab.setp(thetaplts, zorder=0)
|
||||
pylab.setp(thetagrads, zorder=10)
|
||||
ax5.set_xticks(np.arange(len(theta)))
|
||||
ax5.set_xticklabels(self._get_param_names()[-len(theta):], rotation=17)
|
||||
|
||||
# imkmm = ax6.imshow(kmm_dict[self.showing][0])
|
||||
# from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||
# divider = make_axes_locatable(ax6)
|
||||
# caxkmm = divider.append_axes("right", "5%", pad="1%")
|
||||
# cbarkmm = pylab.colorbar(imkmm, cax=caxkmm)
|
||||
#
|
||||
# imkmmdl = ax7.imshow(kmm_dict[self.showing][1])
|
||||
# divider = make_axes_locatable(ax7)
|
||||
# caxkmmdl = divider.append_axes("right", "5%", pad="1%")
|
||||
# cbarkmmdl = pylab.colorbar(imkmmdl, cax=caxkmmdl)
|
||||
|
||||
# input_dimleg = ax1.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
||||
# loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.15, 1, 1.15),
|
||||
# borderaxespad=0, mode="expand")
|
||||
ax2.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
||||
borderaxespad=0, mode="expand")
|
||||
ax3.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
||||
borderaxespad=0, mode="expand")
|
||||
ax4.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
||||
borderaxespad=0, mode="expand")
|
||||
ax5.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
||||
borderaxespad=0, mode="expand")
|
||||
Lleg = ax1.legend()
|
||||
Lleg.draggable()
|
||||
# ax1.add_artist(input_dimleg)
|
||||
|
||||
indicatorKL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 2], 'o', c=KL.get_color())
|
||||
indicatorLL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1] - kllls[self.showing, 2], 'o', c=LL.get_color())
|
||||
indicatorL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1], 'o', c=L.get_color())
|
||||
# for err in self._savederrors:
|
||||
# if err < kllls.shape[0]:
|
||||
# ax1.scatter(kllls[err, 0], kllls[err, 2], s=50, marker=(5, 2), c=KL.get_color())
|
||||
# ax1.scatter(kllls[err, 0], kllls[err, 1] - kllls[err, 2], s=50, marker=(5, 2), c=LL.get_color())
|
||||
# ax1.scatter(kllls[err, 0], kllls[err, 1], s=50, marker=(5, 2), c=L.get_color())
|
||||
|
||||
# try:
|
||||
# for f in figs:
|
||||
# f.canvas.draw()
|
||||
# f.tight_layout(box=(0, .15, 1, .9))
|
||||
# # pylab.draw()
|
||||
# # pylab.tight_layout(box=(0, .1, 1, .9))
|
||||
# except:
|
||||
# pass
|
||||
|
||||
# parameter changes
|
||||
# ax2 = pylab.subplot2grid((4, 1), (1, 0), 3, 1, projection='3d')
|
||||
button_options = [0, 0] # [0]: clicked -- [1]: dragged
|
||||
|
||||
def update_plots(event):
|
||||
if button_options[0] and not button_options[1]:
|
||||
# event.button, event.x, event.y, event.xdata, event.ydata)
|
||||
tmp = np.abs(iters - event.xdata)
|
||||
closest_hit = iters[tmp == tmp.min()][0]
|
||||
|
||||
if closest_hit != self.showing:
|
||||
self.showing = closest_hit
|
||||
# print closest_hit, iters, event.xdata
|
||||
|
||||
indicatorLL.set_data(self.showing, kllls[self.showing, 1] - kllls[self.showing, 2])
|
||||
indicatorKL.set_data(self.showing, kllls[self.showing, 2])
|
||||
indicatorL.set_data(self.showing, kllls[self.showing, 1])
|
||||
|
||||
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
|
||||
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
|
||||
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
|
||||
|
||||
for i, Xlatent in enumerate(Xlatentplts):
|
||||
Xlatent.set_ydata(X[:, i])
|
||||
Xlatentgrads.set_offsets(np.array([xlatent.ravel(), X.ravel()]).T)
|
||||
Xlatentgrads.set_UVC(Ulatent, Xg)
|
||||
|
||||
for i, Slatent in enumerate(Slatentplts):
|
||||
Slatent.set_ydata(S[:, i])
|
||||
Slatentgrads.set_offsets(np.array([xlatent.ravel(), S.ravel()]).T)
|
||||
Slatentgrads.set_UVC(Ulatent, Sg)
|
||||
|
||||
for i, Zlatent in enumerate(Zplts):
|
||||
Zlatent.set_ydata(Z[:, i])
|
||||
Zgrads.set_offsets(np.array([xZ.ravel(), Z.ravel()]).T)
|
||||
Zgrads.set_UVC(UZ, Zg)
|
||||
|
||||
for p, t in zip(thetaplts, theta):
|
||||
p.set_height(t)
|
||||
thetagrads.set_offsets(np.array([xtheta.ravel(), theta.ravel()]).T)
|
||||
thetagrads.set_UVC(Utheta, thetag)
|
||||
|
||||
# imkmm.set_data(kmm_dict[self.showing][0])
|
||||
# imkmm.autoscale()
|
||||
# cbarkmm.update_normal(imkmm)
|
||||
#
|
||||
# imkmmdl.set_data(kmm_dict[self.showing][1])
|
||||
# imkmmdl.autoscale()
|
||||
# cbarkmmdl.update_normal(imkmmdl)
|
||||
|
||||
ax2.relim()
|
||||
# ax3.relim()
|
||||
ax4.relim()
|
||||
ax5.relim()
|
||||
ax2.autoscale()
|
||||
# ax3.autoscale()
|
||||
ax4.autoscale()
|
||||
ax5.autoscale()
|
||||
|
||||
[fig.canvas.draw() for fig in figs]
|
||||
button_options[0] = 0
|
||||
button_options[1] = 0
|
||||
|
||||
def onclick(event):
|
||||
if event.inaxes is ax1 and event.button == 1:
|
||||
button_options[0] = 1
|
||||
def motion(event):
|
||||
if button_options[0]:
|
||||
button_options[1] = 1
|
||||
|
||||
cidr = figs[0].canvas.mpl_connect('button_release_event', update_plots)
|
||||
cidp = figs[0].canvas.mpl_connect('button_press_event', onclick)
|
||||
cidd = figs[0].canvas.mpl_connect('motion_notify_event', motion)
|
||||
|
||||
return ax1, ax2, ax3, ax4, ax5 # , ax6, ax7
|
||||
|
||||
|
||||
|
||||
|
||||
def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
    objective function for fitting the latent variables for test points
    (negative log-likelihood: should be minimised!)
    """
    mu, log_S = mu_S.reshape(2, 1, -1)
    S = np.exp(log_S)

    psi0 = kern.psi0(Z, mu, S)
    psi1 = kern.psi1(Z, mu, S)
    psi2 = kern.psi2(Z, mu, S)

    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)

    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)

    dmu = mu0 + mu1 + mu2 - mu
    # dS = S0 + S1 + S2 - 0.5 + .5/S
    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
    return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))


def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
    objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
    This is the same as latent_cost_and_grad but only for the objective
    """
    mu, log_S = mu_S.reshape(2, 1, -1)
    S = np.exp(log_S)

    psi0 = kern.psi0(Z, mu, S)
    psi1 = kern.psi1(Z, mu, S)
    psi2 = kern.psi2(Z, mu, S)

    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
    return -float(lik)


def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
    This is the same as latent_cost_and_grad but only for the grad
    """
    mu, log_S = mu_S.reshape(2, 1, -1)
    S = np.exp(log_S)

    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)

    dmu = mu0 + mu1 + mu2 - mu
    # dS = S0 + S1 + S2 - 0.5 + .5/S
    dlnS = S * (S0 + S1 + S2 - 0.5) + .5

    return -np.hstack((dmu.flatten(), dlnS.flatten()))
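Since latent_cost and latent_grad must stay consistent with latent_cost_and_grad, a finite-difference comparison is a cheap sanity test. The helper below is a generic sketch (function and variable names are ours, not GPy's); the commented call indicates how it could be pointed at the functions above, given a kernel with psi-statistics, inducing inputs Z and the dL_dpsi* terms.

import numpy as np

def finite_difference_check(f, g, x0, eps=1e-6):
    """Return the largest absolute gap between g(x0) and central differences of f."""
    x0 = np.asarray(x0, dtype=float)
    numerical = np.zeros_like(x0)
    for i in range(x0.size):
        xp, xm = x0.copy(), x0.copy()
        xp[i] += eps
        xm[i] -= eps
        numerical[i] = (f(xp) - f(xm)) / (2. * eps)
    return np.max(np.abs(numerical - g(x0)))

# Hypothetical usage against the helpers above, for Q latent dimensions:
# err = finite_difference_check(
#     lambda v: latent_cost(v, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2),
#     lambda v: latent_grad(v, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2),
#     np.zeros(2 * Q))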
@ -1,67 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
import pylab as pb
import sys, pdb
from .. import kern
from ..core import Model
from ..util.linalg import pdinv, PCA
from ..core import GP
from ..likelihoods import Gaussian
from .. import util
from GPy.util import plot_latent


class GPLVM(GP):
    """
    Gaussian Process Latent Variable Model

    :param Y: observed data
    :type Y: np.ndarray
    :param input_dim: latent dimensionality
    :type input_dim: int
    :param init: initialisation method for the latent space
    :type init: 'PCA'|'random'

    """
    def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False):
        if X is None:
            X = self.initialise_latent(init, input_dim, Y)
        if kernel is None:
            kernel = kern.rbf(input_dim, ARD=input_dim > 1) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2))
        likelihood = Gaussian(Y, normalize=normalize_Y)
        GP.__init__(self, X, likelihood, kernel, normalize_X=False)
        self.ensure_default_constraints()

    def initialise_latent(self, init, input_dim, Y):
        if init == 'PCA':
            return PCA(Y, input_dim)[0]
        else:
            return np.random.randn(Y.shape[0], input_dim)

    def _get_param_names(self):
        return sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], []) + GP._get_param_names(self)

    def _get_params(self):
        return np.hstack((self.X.flatten(), GP._get_params(self)))

    def _set_params(self, x):
        self.X = x[:self.num_data * self.input_dim].reshape(self.num_data, self.input_dim).copy()
        GP._set_params(self, x[self.X.size:])

    def _log_likelihood_gradients(self):
        dL_dX = 2. * self.kern.dK_dX(self.dL_dK, self.X)

        return np.hstack((dL_dX.flatten(), GP._log_likelihood_gradients(self)))

    def plot(self):
        assert self.likelihood.Y.shape[1] == 2
        pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet)
        Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
        mu, var, upper, lower = self.predict(Xnew)
        pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)

    def plot_latent(self, *args, **kwargs):
        return util.plot_latent.plot_latent(self, *args, **kwargs)
19
GPy/models_modules/__init__.py
Normal file

@ -0,0 +1,19 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

# from gp_regression import GPRegression; _gp_regression = gp_regression ; del gp_regression
# from gp_classification import GPClassification; _gp_classification = gp_classification ; del gp_classification
# from sparse_gp_regression import SparseGPRegression; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression
# from svigp_regression import SVIGPRegression; _svigp_regression = svigp_regression ; del svigp_regression
# from sparse_gp_classification import SparseGPClassification; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification
# from fitc_classification import FITCClassification; _fitc_classification = fitc_classification ; del fitc_classification
# from gplvm import GPLVM; _gplvm = gplvm ; del gplvm
# from bcgplvm import BCGPLVM; _bcgplvm = bcgplvm; del bcgplvm
# from sparse_gplvm import SparseGPLVM; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm
# from warped_gp import WarpedGP; _warped_gp = warped_gp ; del warped_gp
# from bayesian_gplvm import BayesianGPLVM; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm
# from mrd import MRD; _mrd = mrd ; del mrd
# from gradient_checker import GradientChecker; _gradient_checker = gradient_checker ; del gradient_checker
# from gp_multioutput_regression import GPMultioutputRegression; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression
# from sparse_gp_multioutput_regression import SparseGPMultioutputRegression; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression
396
GPy/models_modules/bayesian_gplvm.py
Normal file

@ -0,0 +1,396 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
from ..core.sparse_gp import SparseGP
from ..likelihoods import Gaussian
from .. import kern
import itertools
from matplotlib.colors import colorConverter
from GPy.inference.optimization import SCG
from GPy.util import plot_latent, linalg
from .gplvm import GPLVM
from GPy.util.plot_latent import most_significant_input_dimensions
from matplotlib import pyplot
from GPy.core.model import Model

class BayesianGPLVM(SparseGP, GPLVM):
    """
    Bayesian Gaussian Process Latent Variable Model

    :param Y: observed data (np.ndarray) or GPy.likelihood
    :type Y: np.ndarray | GPy.likelihood instance
    :param input_dim: latent dimensionality
    :type input_dim: int
    :param init: initialisation method for the latent space
    :type init: 'PCA'|'random'

    """
    def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
                 Z=None, kernel=None, **kwargs):
        if type(likelihood_or_Y) is np.ndarray:
            likelihood = Gaussian(likelihood_or_Y)
        else:
            likelihood = likelihood_or_Y

        if X == None:
            X = self.initialise_latent(init, input_dim, likelihood.Y)
        self.init = init

        if X_variance is None:
            X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)

        if Z is None:
            Z = np.random.permutation(X.copy())[:num_inducing]
        assert Z.shape[1] == X.shape[1]

        if kernel is None:
            kernel = kern.rbf(input_dim)  # + kern.white(input_dim)

        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
        self.ensure_default_constraints()

    def _get_param_names(self):
        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
        return (X_names + S_names + SparseGP._get_param_names(self))

    # def _get_print_names(self):
    #     return SparseGP._get_print_names(self)

    def _get_params(self):
        """
        Horizontally stacks the parameters in order to present them to the optimizer.
        The resulting 1-D array has this structure:

        ===============================================================
        |    mu    |    S    |    Z    |    theta    |    beta    |
        ===============================================================

        """
        x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
        return x

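    # Note: the packed vector built by _get_params (and unpacked again by _set_params
    # below) is laid out as N*Q latent means, then N*Q latent variances, then the
    # inducing inputs Z, kernel parameters theta and likelihood parameters handled
    # by SparseGP.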
    def _set_params(self, x, save_old=True, save_count=0):
        N, input_dim = self.num_data, self.input_dim
        self.X = x[:self.X.size].reshape(N, input_dim).copy()
        self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
        SparseGP._set_params(self, x[(2 * N * input_dim):])

    def dKL_dmuS(self):
        dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
        dKL_dmu = self.X
        return dKL_dmu, dKL_dS

    def dL_dmuS(self):
        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance)
        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance)
        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance)
        dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
        dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2

        return dL_dmu, dL_dS

    def KL_divergence(self):
        var_mean = np.square(self.X).sum()
        var_S = np.sum(self.X_variance - np.log(self.X_variance))
        return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data

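    # For the unit-Gaussian prior p(X) = N(0, I), the quantity computed by
    # KL_divergence equals 0.5 * sum_{n,q} (mu_nq^2 + S_nq - log S_nq - 1),
    # which log_likelihood below subtracts from the sparse GP bound.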
    def log_likelihood(self):
        ll = SparseGP.log_likelihood(self)
        kl = self.KL_divergence()
        return ll - kl

    def _log_likelihood_gradients(self):
        dKL_dmu, dKL_dS = self.dKL_dmuS()
        dL_dmu, dL_dS = self.dL_dmuS()
        d_dmu = (dL_dmu - dKL_dmu).flatten()
        d_dS = (dL_dS - dKL_dS).flatten()
        self.dbound_dmuS = np.hstack((d_dmu, d_dS))
        self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
        return np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta))

    def plot_latent(self, plot_inducing=True, *args, **kwargs):
        return plot_latent.plot_latent(self, plot_inducing=plot_inducing, *args, **kwargs)

    def do_test_latents(self, Y):
        """
        Compute the latent representation for a set of new points Y

        Notes:
        This will only work with a univariate Gaussian likelihood (for now)
        """
        assert not self.likelihood.is_heteroscedastic
        N_test = Y.shape[0]
        input_dim = self.Z.shape[1]
        means = np.zeros((N_test, input_dim))
        covars = np.zeros((N_test, input_dim))

        dpsi0 = -0.5 * self.input_dim * self.likelihood.precision
        dpsi2 = self.dL_dpsi2[0][None, :, :]  # TODO: this may change if we ignore het. likelihoods
        V = self.likelihood.precision * Y

        # compute Cpsi1V
        if self.Cpsi1V is None:
            psi1V = np.dot(self.psi1.T, self.likelihood.V)
            tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
            tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
            self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)

        dpsi1 = np.dot(self.Cpsi1V, V.T)

        start = np.zeros(self.input_dim * 2)

        for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
            args = (self.kern, self.Z, dpsi0, dpsi1_n.T, dpsi2)
            xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)

            mu, log_S = xopt.reshape(2, 1, -1)
            means[n] = mu[0].copy()
            covars[n] = np.exp(log_S[0]).copy()

        return means, covars

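    # Hypothetical usage (names assumed, not part of the original file): for a trained
    # model `m` and held-out observations `Y_new` with the same output dimensionality,
    # `mu_new, S_new = m.do_test_latents(Y_new)` returns the variational means and
    # variances of the corresponding test latent points.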
    def dmu_dX(self, Xnew):
        """
        Calculate the gradient of the prediction at Xnew w.r.t Xnew.
        """
        dmu_dX = np.zeros_like(Xnew)
        for i in range(self.Z.shape[0]):
            dmu_dX += self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
        return dmu_dX

    def dmu_dXnew(self, Xnew):
        """
        Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
        """
        dK_dX = np.zeros((Xnew.shape[0], self.num_inducing))
        ones = np.ones((1, 1))
        for i in range(self.Z.shape[0]):
            dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
        return np.dot(dK_dX, self.Cpsi1Vf)

    def plot_steepest_gradient_map(self, fignum=None, ax=None, which_indices=None, labels=None, data_labels=None, data_marker='o', data_s=40, resolution=20, aspect='auto', updates=False, **kwargs):
        input_1, input_2 = significant_dims = most_significant_input_dimensions(self, which_indices)

        X = np.zeros((resolution ** 2, self.input_dim))
        indices = np.r_[:X.shape[0]]
        if labels is None:
            labels = range(self.output_dim)

        def plot_function(x):
            X[:, significant_dims] = x
            dmu_dX = self.dmu_dXnew(X)
            argmax = np.argmax(dmu_dX, 1)
            return dmu_dX[indices, argmax], np.array(labels)[argmax]

        if ax is None:
            fig = pyplot.figure(num=fignum)
            ax = fig.add_subplot(111)

        if data_labels is None:
            data_labels = np.ones(self.num_data)
        ulabels = []
        for lab in data_labels:
            if not lab in ulabels:
                ulabels.append(lab)
        marker = itertools.cycle(list(data_marker))
        from GPy.util import Tango
        for i, ul in enumerate(ulabels):
            if type(ul) is np.string_:
                this_label = ul
            elif type(ul) is np.int64:
                this_label = 'class %i' % ul
            else:
                this_label = 'class %i' % i
            m = marker.next()
            index = np.nonzero(data_labels == ul)[0]
            x = self.X[index, input_1]
            y = self.X[index, input_2]
            ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)

        ax.set_xlabel('latent dimension %i' % input_1)
        ax.set_ylabel('latent dimension %i' % input_2)

        from matplotlib.cm import get_cmap
        from GPy.util.latent_space_visualizations.controllers.imshow_controller import ImAnnotateController
        controller = ImAnnotateController(ax,
                                          plot_function,
                                          tuple(self.X.min(0)[:, significant_dims]) + tuple(self.X.max(0)[:, significant_dims]),
                                          resolution=resolution,
                                          aspect=aspect,
                                          cmap=get_cmap('jet'),
                                          **kwargs)
        ax.legend()
        ax.figure.tight_layout()
        if updates:
            pyplot.show()
            clear = raw_input('Enter to continue')
            if clear.lower() in 'yes' or clear == '':
                controller.deactivate()
        return controller.view

    def plot_X_1d(self, fignum=None, ax=None, colors=None):
        """
        Plot latent space X in 1D:

        - if fig is given, create input_dim subplots in fig and plot in these
        - if ax is given, plot the input_dim 1D latent space plots of X into the given axes
        - if neither fig nor ax is given, create a figure with fignum and plot in there

        colors:
            colors for the input_dim latent space dimensions

        """
        import pylab
        if ax is None:
            fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1]))))
            if colors is None:
                colors = pylab.gca()._get_lines.color_cycle
            pylab.clf()
        else:
            colors = iter(colors)
        plots = []
        x = np.arange(self.X.shape[0])
        for i in range(self.X.shape[1]):
            if ax is None:
                a = fig.add_subplot(self.X.shape[1], 1, i + 1)
            elif isinstance(ax, (tuple, list)):
                a = ax[i]
            else:
                raise ValueError("Need one ax per latent dimension input_dim")
            a.plot(self.X, c='k', alpha=.3)
            plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
            a.fill_between(x,
                           self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
                           self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
                           facecolor=plots[-1].get_color(),
                           alpha=.3)
            a.legend(borderaxespad=0.)
            a.set_xlim(x.min(), x.max())
            if i < self.X.shape[1] - 1:
                a.set_xticklabels('')
        pylab.draw()
        fig.tight_layout(h_pad=.01)  # , rect=(0, 0, 1, .95))
        return fig

    def getstate(self):
        """
        Get the current state of the class,
        here just all the indices, rest can get recomputed
        """
        return SparseGP.getstate(self) + [self.init]

    def setstate(self, state):
        self._const_jitter = None
        self.init = state.pop()
        SparseGP.setstate(self, state)

class BayesianGPLVMWithMissingData(Model):
    """
    Bayesian Gaussian Process Latent Variable Model with missing data support.
    NOTE: Missing data is assumed to be missing at random!

    This extension carries a large memory and computing time overhead.
    Use it only if the fraction of data missing at random is above 60%.
    Otherwise, try filtering the data before using this extension.

    Y can hold missing data as given by `missing`, standard is :class:`~numpy.nan`.

    If a likelihood is given for Y, the likelihood itself will be discarded, but its
    parameters will be taken, and every effort is made to create an equivalent likelihood.

    :param likelihood_or_Y: observed data (np.ndarray) or GPy.likelihood
    :type likelihood_or_Y: :class:`~numpy.ndarray` | :class:`~GPy.likelihoods.likelihood.likelihood` instance
    :param int input_dim: latent dimensionality
    :param init: initialisation method for the latent space
    :type init: 'PCA' | 'random'
    """
    def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
                 Z=None, kernel=None, missing=np.nan, **kwargs):
        if type(likelihood_or_Y) is np.ndarray:
            likelihood = Gaussian(likelihood_or_Y)
        else:
            likelihood = likelihood_or_Y

        if X == None:
            X = self.initialise_latent(init, input_dim, likelihood.Y)
        self.init = init

        if X_variance is None:
            X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)

        if Z is None:
            Z = np.random.permutation(X.copy())[:num_inducing]
        assert Z.shape[1] == X.shape[1]

        if kernel is None:
            kernel = kern.rbf(input_dim)  # + kern.white(input_dim)

        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
        self.ensure_default_constraints()

    def _get_param_names(self):
        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
        return (X_names + S_names + SparseGP._get_param_names(self))

    pass

def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
    objective function for fitting the latent variables for test points
    (negative log-likelihood: should be minimised!)
    """
    mu, log_S = mu_S.reshape(2, 1, -1)
    S = np.exp(log_S)

    psi0 = kern.psi0(Z, mu, S)
    psi1 = kern.psi1(Z, mu, S)
    psi2 = kern.psi2(Z, mu, S)

    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)

    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)

    dmu = mu0 + mu1 + mu2 - mu
    # dS = S0 + S1 + S2 - 0.5 + .5/S
    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
    return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))

def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
    objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
    This is the same as latent_cost_and_grad but only for the objective
    """
    mu, log_S = mu_S.reshape(2, 1, -1)
    S = np.exp(log_S)

    psi0 = kern.psi0(Z, mu, S)
    psi1 = kern.psi1(Z, mu, S)
    psi2 = kern.psi2(Z, mu, S)

    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
    return -float(lik)

def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
    This is the same as latent_cost_and_grad but only for the grad
    """
    mu, log_S = mu_S.reshape(2, 1, -1)
    S = np.exp(log_S)

    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)

    dmu = mu0 + mu1 + mu2 - mu
    # dS = S0 + S1 + S2 - 0.5 + .5/S
    dlnS = S * (S0 + S1 + S2 - 0.5) + .5

    return -np.hstack((dmu.flatten(), dlnS.flatten()))
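For orientation, a minimal usage sketch for the class added above. It assumes the model is exposed as GPy.models.BayesianGPLVM and that a plain np.ndarray is passed for Y; the toy data and settings are illustrative only, not taken from the repository:

import numpy as np
import GPy

np.random.seed(0)
Y = np.random.randn(100, 5)        # toy data: 100 observations, 5 output dimensions
m = GPy.models.BayesianGPLVM(Y, input_dim=2, num_inducing=10)
m.optimize()                       # maximise the variational lower bound
m.plot_latent()                    # scatter the 2-D variational means of X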
Some files were not shown because too many files have changed in this diff.