Merge branch 'devel' of https://github.com/SheffieldML/GPy into devel

This commit is contained in:
mzwiessele 2015-04-08 08:24:59 +02:00
commit 970df9b88e
52 changed files with 2183 additions and 772 deletions

View file

@ -5,6 +5,7 @@ import numpy as np
import sys
from .. import kern
from model import Model
from mapping import Mapping
from parameterization import ObsAr
from .. import likelihoods
from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
@ -34,7 +35,7 @@ class GP(Model):
"""
def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
super(GP, self).__init__(name)
assert X.ndim == 2
@ -75,6 +76,15 @@ class GP(Model):
assert isinstance(likelihood, likelihoods.Likelihood)
self.likelihood = likelihood
#handle the mean function
self.mean_function = mean_function
if mean_function is not None:
assert isinstance(self.mean_function, Mapping)
assert mean_function.input_dim == self.input_dim
assert mean_function.output_dim == self.output_dim
self.link_parameter(mean_function)
#find a sensible inference method
logger.info("initializing inference method")
if inference_method is None:
@ -153,9 +163,11 @@ class GP(Model):
This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
this method yourself, there may be unexpected consequences.
"""
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
if self.mean_function is not None:
self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X)
def log_likelihood(self):
"""
@ -192,6 +204,10 @@ class GP(Model):
#force mu to be a column vector
if len(mu.shape)==1: mu = mu[:,None]
#add the mean function in
if not self.mean_function is None:
mu += self.mean_function.f(_Xnew)
return mu, var
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@ -241,12 +257,14 @@ class GP(Model):
def predictive_gradients(self, Xnew):
"""
Compute the derivatives of the latent function with respect to X*
Compute the derivatives of the predicted latent function with respect to X*
Given a set of points at which to predict X* (size [N*,Q]), compute the
derivatives of the mean and variance. Resulting arrays are sized:
dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
Note that this is not the same as computing the mean and variance of the derivative of the function!
dv_dX* -- [N*, Q], (since all outputs have the same variance)
:param X: The points at which to get the predictive gradients
:type X: np.ndarray (Xnew x self.input_dim)

View file

@ -1,4 +1,5 @@
# Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
@ -7,7 +8,7 @@ import numpy as np
class Mapping(Parameterized):
"""
Base model for shared behavior between models that can act like a mapping.
Base model for shared mapping behaviours
"""
def __init__(self, input_dim, output_dim, name='mapping'):
@ -18,49 +19,12 @@ class Mapping(Parameterized):
def f(self, X):
raise NotImplementedError
def df_dX(self, dL_df, X):
"""Evaluate derivatives of mapping outputs with respect to inputs.
:param dL_df: gradient of the objective with respect to the function.
:type dL_df: ndarray (num_data x output_dim)
:param X: the input locations where derivatives are to be evaluated.
:type X: ndarray (num_data x input_dim)
:returns: matrix containing gradients of the function with respect to the inputs.
"""
def gradients_X(self, dL_dF, X):
raise NotImplementedError
def df_dtheta(self, dL_df, X):
"""The gradient of the outputs of the mapping with respect to each of the parameters.
:param dL_df: gradient of the objective with respect to the function.
:type dL_df: ndarray (num_data x output_dim)
:param X: input locations where the function is evaluated.
:type X: ndarray (num_data x input_dim)
:returns: Matrix containing gradients with respect to parameters of each output for each input data.
:rtype: ndarray (num_params length)
"""
def update_gradients(self, dL_dF, X):
raise NotImplementedError
def plot(self, *args):
"""
Plots the mapping associated with the model.
- In one dimension, the function is plotted.
- In two dimensions, a contour-plot shows the function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions
using which_data and which_functions
This is a convenience function: arguments are passed to
GPy.plotting.matplot_dep.models_plots.plot_mapping
"""
if "matplotlib" in sys.modules:
from ..plotting.matplot_dep import models_plots
mapping_plots.plot_mapping(self,*args)
else:
raise NameError, "matplotlib package has not been imported."
class Bijective_mapping(Mapping):
"""

View file

@ -50,31 +50,29 @@ class SpikeAndSlabPrior(VariationalPrior):
def KL_divergence(self, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
gamma,gamma1 = variational_posterior.gamma_probabilities()
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
gamma = variational_posterior.gamma.values
if len(self.pi.shape)==2:
idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
pi = self.pi[idx]
else:
pi = self.pi
var_mean = np.square(mu)/self.variance
var_S = (S/self.variance - np.log(S))
var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
def update_gradients_KL(self, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
gamma,gamma1 = variational_posterior.gamma_probabilities()
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
gamma = variational_posterior.gamma.values
if len(self.pi.shape)==2:
idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
pi = self.pi[idx]
else:
pi = self.pi
variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
mu.gradient -= gamma*mu/self.variance
S.gradient -= (1./self.variance - 1./S) * gamma /2.
if self.learnPi:
@ -161,24 +159,8 @@ class SpikeAndSlabPosterior(VariationalPosterior):
binary_prob : the probability of the distribution on the slab part.
"""
super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
self.gamma = Param("binary_prob",binary_prob)
self.gamma = Param("binary_prob",binary_prob,Logistic(0.,1.))
self.link_parameter(self.gamma)
@Cache_this(limit=5)
def gamma_probabilities(self):
prob = np.zeros_like(param_to_array(self.gamma))
prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
prob1 = -np.zeros_like(param_to_array(self.gamma))
prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
return prob, prob1
@Cache_this(limit=5)
def gamma_log_prob(self):
loggamma = param_to_array(self.gamma).copy()
loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
loggamma1 = -param_to_array(self.gamma).copy()
loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
return loggamma,loggamma1
def set_gradients(self, grad):
self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad

View file

@ -19,7 +19,7 @@ class SparseGP(GP):
This model allows (approximate) inference using variational DTC or FITC
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
these.
This is not for missing data, as the implementation for missing data involves
some inefficient optimization routine decisions.
See the missing data SparseGP implementation in :py:class:`~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch`.
@ -39,7 +39,7 @@ class SparseGP(GP):
"""
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False):
#pick a sensible inference method
if inference_method is None:
@ -53,7 +53,7 @@ class SparseGP(GP):
self.Z = Param('inducing inputs', Z)
self.num_inducing = Z.shape[0]
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
logger.info("Adding Z as parameter")
self.link_parameter(self.Z, index=0)
@ -61,7 +61,7 @@ class SparseGP(GP):
def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior)
def set_Z(self, Z, trigger_update=True):
if trigger_update: self.update_model(False)
self.unlink_parameter(self.Z)
@ -110,8 +110,8 @@ class SparseGP(GP):
def _raw_predict(self, Xnew, full_cov=False, kern=None):
"""
Make a prediction for the latent function values.
Make a prediction for the latent function values.
For certain inputs we give back a full_cov of shape NxN,
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
we take only the diagonal elements across N.
@ -136,6 +136,9 @@ class SparseGP(GP):
else:
Kxx = kern.Kdiag(Xnew)
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
#add in the mean function
if self.mean_function is not None:
mu += self.mean_function.f(Xnew)
else:
psi0_star = self.kern.psi0(self.Z, Xnew)
psi1_star = self.kern.psi1(self.Z, Xnew)
@ -165,4 +168,5 @@ class SparseGP(GP):
var[i] = var_
else:
var[i] = np.diag(var_)+p0-t2
return mu, var

View file

@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf
class SVGP(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
"""
Stochastic Variational GP.
@ -38,7 +38,7 @@ class SVGP(SparseGP):
#create the SVI inference method
inf_method = svgp_inf()
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
name=name, Y_metadata=Y_metadata, normalizer=False)
self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@ -48,7 +48,7 @@ class SVGP(SparseGP):
self.link_parameter(self.m)
def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
#update the kernel gradients
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
@ -65,6 +65,13 @@ class SVGP(SparseGP):
self.m.gradient = self.grad_dict['dL_dm']
self.chol.gradient = self.grad_dict['dL_dchol']
if self.mean_function is not None:
self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
g = self.mean_function.gradient[:].copy()
self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
self.mean_function.gradient[:] += g
self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
def set_data(self, X, Y):
"""
Set the data without calling parameters_changed to avoid wasted computation

View file

@ -505,3 +505,48 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
print m
return m
def simple_mean_function(max_iters=100, optimize=True, plot=True):
    """
    Demonstrate GP regression with the simplest possible mean function:
    a fixed, parameter-free sinusoid wrapped in a Mapping object.
    """
    # Parameter-free mean function: f(X) = sin(X); nothing to update, so the
    # gradient hook is a no-op.
    mf = GPy.core.Mapping(1, 1)
    mf.f = np.sin
    mf.update_gradients = lambda a, b: None

    # Toy 1-D dataset: a sinusoid plus a faster harmonic and some noise.
    X = np.linspace(0, 10, 50).reshape(-1, 1)
    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)

    m = GPy.core.GP(X, Y,
                    kernel=GPy.kern.RBF(1),
                    likelihood=GPy.likelihoods.Gaussian(),
                    mean_function=mf)
    if optimize:
        m.optimize(max_iters=max_iters)
    if plot:
        m.plot(plot_limits=(-10, 15))
    return m
def parametric_mean_function(max_iters=100, optimize=True, plot=True):
    """
    A linear mean function with parameters that we'll learn alongside the kernel.

    The data contain a linear trend (3*X) on top of a noisy sinusoid; the
    linear mapping's weights are optimized together with the kernel and
    likelihood parameters.
    """
    # Toy data: sinusoid + harmonic + noise, superimposed on a linear trend.
    X = np.linspace(0, 10, 50).reshape(-1, 1)
    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X

    # Linear mapping whose parameters are learned during optimization.
    # (The previous version first built an unused GPy.core.Mapping with
    # f = np.sin and immediately overwrote it — that dead code is removed.)
    mf = GPy.mappings.Linear(1, 1)

    k = GPy.kern.RBF(1)
    lik = GPy.likelihoods.Gaussian()
    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
    if optimize:
        m.optimize(max_iters=max_iters)
    if plot:
        m.plot()
    return m

View file

@ -50,19 +50,19 @@ class InferenceMethodList(LatentFunctionInference, list):
def on_optimization_end(self):
for inf in self:
inf.on_optimization_end()
def __getstate__(self):
state = []
for inf in self:
state.append(inf)
return state
def __setstate__(self, state):
for inf in state:
self.append(inf)
from exact_gaussian_inference import ExactGaussianInference
from laplace import Laplace
from laplace import Laplace, LaplaceBlock
from GPy.inference.latent_function_inference.var_dtc import VarDTC
from expectation_propagation import EP
from expectation_propagation_dtc import EPDTC
@ -78,9 +78,9 @@ from svgp import SVGP
# class EMLikeLatentFunctionInference(LatentFunctionInference):
# def update_approximation(self):
# """
# This function gets called when the
# This function gets called when the
# """
#
#
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """
# Do inference on the latent functions given a covariance function `kern`,
@ -88,7 +88,7 @@ from svgp import SVGP
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
# """
# raise NotImplementedError, "Abstract base class for full inference"
#
#
# class VariationalLatentFunctionInference(LatentFunctionInference):
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """

View file

@ -20,7 +20,8 @@ class DTC(LatentFunctionInference):
def __init__(self):
self.const_jitter = 1e-6
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
assert mean_function is None, "inference with a mean function not implemented"
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
num_inducing, _ = Z.shape
@ -88,7 +89,8 @@ class vDTC(object):
def __init__(self):
self.const_jitter = 1e-6
def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
assert mean_function is None, "inference with a mean function not implemented"
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
num_inducing, _ = Z.shape

View file

@ -36,11 +36,18 @@ class ExactGaussianInference(LatentFunctionInference):
#print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
return Y
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
"""
Returns a Posterior class containing essential quantities of the posterior
"""
YYT_factor = self.get_YYTfactor(Y)
if mean_function is None:
m = 0
else:
m = mean_function.f(X)
YYT_factor = self.get_YYTfactor(Y-m)
K = kern.K(X)
@ -56,4 +63,4 @@ class ExactGaussianInference(LatentFunctionInference):
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)
return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}

View file

@ -33,15 +33,19 @@ class EP(LatentFunctionInference):
# TODO: update approximation in the end as well? Maybe even with a switch?
pass
def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
assert mean_function is None, "inference with a mean function not implemented"
num_data, output_dim = Y.shape
assert output_dim ==1, "ep in 1D only (for now!)"
K = kern.K(X)
if self._ep_approximation is None:
#if we don't yet have the results of running EP, run EP and store the computed factors in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
else:
#if we've already run EP, just use the existing approximation stored in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))

View file

@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference):
self.old_mutilde, self.old_vtilde = None, None
self._ep_approximation = None
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
assert mean_function is None, "inference with a mean function not implemented"
num_data, output_dim = Y.shape
assert output_dim ==1, "ep in 1D only (for now!)"

View file

@ -18,7 +18,8 @@ class FITC(LatentFunctionInference):
"""
const_jitter = 1e-6
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
assert mean_function is None, "inference with a mean function not implemented"
num_inducing, _ = Z.shape
num_data, output_dim = Y.shape

View file

@ -39,10 +39,11 @@ class Laplace(LatentFunctionInference):
self.first_run = True
self._previous_Ki_fhat = None
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
"""
Returns a Posterior class containing essential quantities of the posterior
"""
assert mean_function is None, "inference with a mean function not implemented"
# Compute K
K = kern.K(X)
@ -50,21 +51,25 @@ class Laplace(LatentFunctionInference):
#Find mode
if self.bad_fhat or self.first_run:
Ki_f_init = np.zeros_like(Y)
first_run = False
self.first_run = False
else:
Ki_f_init = self._previous_Ki_fhat
Ki_f_init = np.zeros_like(Y)# FIXME: take this out
f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
self.f_hat = f_hat
self.Ki_fhat = Ki_fhat
self.K = K.copy()
#self.Ki_fhat = Ki_fhat
#self.K = K.copy()
#Compute hessian and other variables at mode
log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)
self._previous_Ki_fhat = Ki_fhat.copy()
return Posterior(woodbury_vector=Ki_fhat, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
"""
Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
@ -89,7 +94,12 @@ class Laplace(LatentFunctionInference):
#define the objective function (to be maximised)
def obj(Ki_f, f):
return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
if np.isnan(ll):
return -np.inf
else:
return ll
difference = np.inf
iteration = 0
@ -104,7 +114,7 @@ class Laplace(LatentFunctionInference):
W_f = W*f
b = W_f + grad # R+W p46 line 6.
W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
W12BiW12, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)
W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
@ -121,7 +131,9 @@ class Laplace(LatentFunctionInference):
step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
Ki_f_new = Ki_f + step*dKi_f
f_new = np.dot(K, Ki_f_new)
#print "new {} vs old {}".format(obj(Ki_f_new, f_new), obj(Ki_f, f))
if obj(Ki_f_new, f_new) < obj(Ki_f, f):
raise ValueError("Shouldn't happen, brent optimization failing")
difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
Ki_f = Ki_f_new
f = f_new
@ -152,14 +164,10 @@ class Laplace(LatentFunctionInference):
if np.any(np.isnan(W)):
raise ValueError('One or more element(s) of W is NaN')
K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
#compute vital matrices
C = np.dot(LiW12, K)
Ki_W_i = K - C.T.dot(C)
K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
#compute the log marginal
log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - np.sum(np.log(np.diag(L)))
log_marginal = -0.5*np.sum(np.dot(Ki_f.T, f_hat)) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*logdet_I_KW
# Compute matrices for derivatives
dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
@ -196,23 +204,23 @@ class Laplace(LatentFunctionInference):
dL_dthetaL = np.zeros(num_params)
for thetaL_i in range(num_params):
#Explicit
dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i,:, :])
# The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
+ 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
+ 0.5*np.sum(np.diag(Ki_W_i)*np.squeeze(dlik_hess_dthetaL[thetaL_i, :, :]))
)
#Implicit
dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
#dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[thetaL_i, :, :])
#dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[thetaL_i, :, :])
dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
dL_dthetaL[thetaL_i] = np.sum(dL_dthetaL_exp + dL_dthetaL_imp)
else:
dL_dthetaL = np.zeros(likelihood.size)
return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
def _compute_B_statistics(self, K, W, log_concave):
def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
"""
Rasmussen suggests the use of a numerically stable positive definite matrix B
Which has a positive diagonal elements and can be easily inverted
@ -225,7 +233,7 @@ class Laplace(LatentFunctionInference):
"""
if not log_concave:
#print "Under 1e-10: {}".format(np.sum(W < 1e-6))
W[W<1e-6] = 1e-6
W = np.clip(W, 1e-6, 1e+30)
# NOTE: when setting a parameter inside parameters_changed it will allways come to closed update circles!!!
#W.__setitem__(W < 1e-6, 1e-6, update=False) # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
# If the likelihood is non-log-concave, we want to say that there is a negative variance
@ -247,5 +255,160 @@ class Laplace(LatentFunctionInference):
#K_Wi_i_2 , _= dpotri(L2)
#symmetrify(K_Wi_i_2)
return K_Wi_i, L, LiW12
#compute vital matrices
C = np.dot(LiW12, K)
Ki_W_i = K - C.T.dot(C)
I_KW_i = np.eye(K.shape[0]) - np.dot(K, K_Wi_i)
logdet_I_KW = 2*np.sum(np.log(np.diag(L)))
return K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i
class LaplaceBlock(Laplace):
    """
    Laplace approximation in which the negative Hessian of the likelihood,
    W = -d2 log p(y|f) / df2, is handled as a full NxN matrix rather than a
    vector of its diagonal (cf. the diagonal-W parent class `Laplace`).
    """

    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
        """
        Mode finding for the Laplace approximation with a dense W,
        parameterized in Ki_f = K^{-1} f for numerical stability
        (cf. Rasmussen & Williams 2006, algorithm 3.1).

        :param K: prior covariance matrix evaluated at the inputs X (NxN)
        :param Y: observed outputs
        :param likelihood: likelihood providing logpdf_sum and its derivatives
        :param Ki_f_init: initial value for K^{-1} f
        :returns: (f, Ki_f) at the (approximate) posterior mode
        """
        Ki_f = Ki_f_init.copy()
        f = np.dot(K, Ki_f)

        #define the objective function (to be maximised)
        def obj(Ki_f, f):
            ll = -0.5*np.dot(Ki_f.T, f) + np.sum(likelihood.logpdf_sum(f, Y, Y_metadata=Y_metadata))
            if np.isnan(ll):
                # guard the line search against NaNs from extreme trial values of f
                return -np.inf
            else:
                return ll

        difference = np.inf
        iteration = 0
        I = np.eye(K.shape[0])
        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
            W = -likelihood.d2logpdf_df2(f, Y, Y_metadata=Y_metadata)
            # clip the diagonal of W to keep B = I + KW well conditioned
            W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
            W_f = np.dot(W, f)
            grad = likelihood.dlogpdf_df(f, Y, Y_metadata=Y_metadata)
            b = W_f + grad # R+W p46 line 6.
            K_Wi_i, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)

            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
            #a = (I - (K+Wi)i*K)*b
            full_step_Ki_f = np.dot(I - np.dot(K_Wi_i, K), b)
            dKi_f = full_step_Ki_f - Ki_f

            #define an objective for the line search (minimize this one)
            def inner_obj(step_size):
                Ki_f_trial = Ki_f + step_size*dKi_f
                f_trial = np.dot(K, Ki_f_trial)
                return -obj(Ki_f_trial, f_trial)

            #use scipy for the line search, then compute new values of f, Ki_f
            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
            Ki_f_new = Ki_f + step*dKi_f
            f_new = np.dot(K, Ki_f_new)
            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
            Ki_f = Ki_f_new
            f = f_new
            iteration += 1

        #Warn of bad fits
        if difference > self._mode_finding_tolerance:
            if not self.bad_fhat:
                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
            self._previous_Ki_fhat = np.zeros_like(Y)
            self.bad_fhat = True
        elif self.bad_fhat:
            self.bad_fhat = False
            warnings.warn("f_hat now fine again")

        if iteration > self._mode_finding_max_iter:
            warnings.warn("didn't find the best")

        return f, Ki_f

    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata):
        """
        Compute, at the mode f_hat, the log marginal likelihood and the
        gradients dL_dK and dL_dthetaL needed by the model.

        :returns: (log_marginal, K_Wi_i, dL_dK, dL_dthetaL)
        """
        #At this point get the hessian matrix (dense NxN here, unlike the diagonal-W parent)
        W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
        W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
        K_Wi_i, log_B_det, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)

        #compute the log marginal
        #FIXME: The determinant should be output_dim*0.5 I think, gradients may now no longer check
        log_marginal = -0.5*np.dot(f_hat.T, Ki_f) + np.sum(likelihood.logpdf_sum(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*log_B_det

        #Compute vital matrices for derivatives
        dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
        #dL_dfhat = np.zeros((f_hat.shape[0]))
        #for i in range(f_hat.shape[0]):
            #dL_dfhat[i] = -0.5*np.trace(np.dot(Ki_W_i, dW_df[:,:,i]))
        # einsum form of the commented trace loop above
        dL_dfhat = -0.5*np.einsum('ij,ijk->k', Ki_W_i, dW_df)
        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, Y_metadata=Y_metadata)

        ####################
        #compute dL_dK#
        ####################
        if kern.size > 0 and not kern.is_fixed:
            #Explicit
            explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
            #Implicit
            implicit_part = woodbury_vector.dot(dL_dfhat[None,:]).dot(I_KW_i)
            #implicit_part = Ki_f.dot(dL_dfhat[None,:]).dot(I_KW_i)
            dL_dK = explicit_part + implicit_part
        else:
            dL_dK = np.zeros_like(K)

        ####################
        #compute dL_dthetaL#
        ####################
        if likelihood.size > 0 and not likelihood.is_fixed:
            # likelihood-parameter gradients are not implemented for block W
            raise NotImplementedError
        else:
            dL_dthetaL = np.zeros(likelihood.size)

        #self.K_Wi_i = K_Wi_i
        #self.Ki_W_i = Ki_W_i
        #self.W = W
        #self.K = K
        #self.dL_dfhat = dL_dfhat
        #self.explicit_part = explicit_part
        #self.implicit_part = implicit_part

        return log_marginal, K_Wi_i, dL_dK, dL_dthetaL

    def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
        """
        Rasmussen suggests the use of a numerically stable positive definite matrix B
        which has positive diagonal elements and can be easily inverted.

        :param K: Prior Covariance matrix evaluated at locations X
        :type K: NxN matrix
        :param W: Negative hessian at a point (dense NxN matrix here, not a diagonal vector)
        :type W: NxN matrix
        :returns: (K_Wi_i, logdet(B), Bi, Ki_W_i)
        """
        #w = GPy.util.diag.view(W)
        #W[:] = np.where(w<1e-6, 1e-6, w)
        #B = I + KW
        B = np.eye(K.shape[0]) + np.dot(K, W)
        #Bi, L, Li, logdetB = pdinv(B)
        Bi = np.linalg.inv(B)
        #K_Wi_i = np.eye(K.shape[0]) - mdot(W, Bi, K)
        # (K + W^{-1})^{-1} = W (I + KW)^{-1}
        K_Wi_i = np.dot(W, Bi)
        #self.K_Wi_i_brute = np.linalg.inv(K + np.linalg.inv(W))
        #self.B = B
        #self.Bi = Bi
        Ki_W_i = np.dot(Bi, K)
        # slogdet avoids overflow in the determinant of B
        sign, logdetB = np.linalg.slogdet(B)
        return K_Wi_i, sign*logdetB, Bi, Ki_W_i

View file

@ -15,7 +15,7 @@ class Posterior(object):
the function at any new point x_* by integrating over this posterior.
"""
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
"""
woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@ -67,6 +67,7 @@ class Posterior(object):
#option 2:
self._mean = mean
self._covariance = cov
self._prior_mean = prior_mean
#compute this lazily
self._precision = None
@ -175,7 +176,7 @@ class Posterior(object):
$$
"""
if self._woodbury_vector is None:
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
return self._woodbury_vector
@property

View file

@ -6,7 +6,8 @@ from posterior import Posterior
class SVGP(LatentFunctionInference):
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
num_inducing = Z.shape[0]
num_data, num_outputs = Y.shape
@ -22,6 +23,15 @@ class SVGP(LatentFunctionInference):
#S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
#Si, Lnew, _,_ = linalg.pdinv(S)
#compute mean function stuff
if mean_function is not None:
prior_mean_u = mean_function.f(Z)
prior_mean_f = mean_function.f(X)
else:
prior_mean_u = np.zeros((num_inducing, num_outputs))
prior_mean_f = np.zeros((num_data, num_outputs))
#compute kernel related stuff
Kmm = kern.K(Z)
Knm = kern.K(X, Z)
@ -30,29 +40,43 @@ class SVGP(LatentFunctionInference):
#compute the marginal means and variances of q(f)
A = np.dot(Knm, Kmmi)
mu = np.dot(A, q_u_mean)
mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
#compute the KL term
Kmmim = np.dot(Kmmi, q_u_mean)
KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
KL = KLs.sum()
dKL_dm = Kmmim
#gradient of the KL term (assuming zero mean function)
dKL_dm = Kmmim.copy()
dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
if mean_function is not None:
#adjust KL term for mean function
Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
KL += -np.sum(q_u_mean*Kmmi_mfZ)
KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
#adjust gradient for mean function
dKL_dm -= Kmmi_mfZ
dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
#compute gradients for mean_function
dKL_dmfZ = Kmmi_mfZ - Kmmim
#quadrature for the likelihood
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
#rescale the F term if working on a batch
F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
if dF_dthetaL is not None:
dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale
#derivatives of expected likelihood
#derivatives of expected likelihood, assuming zero mean function
Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
Admu = A.T.dot(dF_dmu)
#AdvA = np.einsum('ijk,jl->ilk', Adv, A)
#AdvA = np.dot(A.T, Adv).swapaxes(0,1)
AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@ -62,6 +86,14 @@ class SVGP(LatentFunctionInference):
dF_dm = Admu
dF_dS = AdvA
#adjust gradient to account for mean function
if mean_function is not None:
dF_dmfX = dF_dmu.copy()
dF_dmfZ = -Admu
dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
dF_dKmm += Admu.dot(Kmmi_mfZ.T)
#sum (gradients of) expected likelihood and KL part
log_marginal = F.sum() - KL
dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
@ -69,4 +101,8 @@ class SVGP(LatentFunctionInference):
dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
dL_dchol = choleskies.triang_to_flat(dL_dchol)
return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
if mean_function is not None:
grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
grad_dict['dL_dmfX'] = dF_dmfX
return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict

View file

@ -169,11 +169,13 @@ class VarDTC_minibatch(LatentFunctionInference):
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
Lm = jitchol(Kmm, maxtries=100)
if not np.isfinite(Kmm).all():
print Kmm
Lm = jitchol(Kmm)
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
LL = jitchol(Lambda, maxtries=100)
LL = jitchol(Lambda)
logdet_L = 2.*np.sum(np.log(np.diag(LL)))
b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
bbt = np.square(b).sum()

View file

@ -16,5 +16,6 @@ from _src.poly import Poly
from _src.eq_ode2 import EQ_ODE2
from _src.trunclinear import TruncLinear,TruncLinear_inf
from _src.splitKern import SplitKern,DiffGenomeKern
from _src.splitKern import SplitKern,DEtime
from _src.splitKern import DEtime as DiffGenomeKern

View file

@ -6,6 +6,20 @@ from kern import CombinationKernel
from ...util.caching import Cache_this
import itertools
def numpy_invalid_op_as_exception(func):
    """
    A decorator that allows catching numpy invalid operations
    as exceptions (the default behaviour is raising warnings).

    Uses ``np.errstate`` so that the caller's floating-point error
    settings are restored even if *func* raises, instead of being
    unconditionally clobbered back to ``'warn'``.
    """
    def func_wrapper(*args, **kwargs):
        # errstate restores the previous 'invalid' handling on exit
        # (including on exception), whereas the seterr/seterr pair it
        # replaces leaked the 'raise' setting if func threw, and
        # discarded whatever setting was active before the call.
        with np.errstate(invalid='raise'):
            return func(*args, **kwargs)
    return func_wrapper
class Prod(CombinationKernel):
"""
Computes the product of 2 kernels
@ -46,18 +60,20 @@ class Prod(CombinationKernel):
self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
else:
k = self.K(X,X2)*dL_dK
for p in self.parts:
p.update_gradients_full(k/p.K(X,X2),X,X2)
for combination in itertools.combinations(self.parts, len(self.parts) - 1):
prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
to_update = list(set(self.parts) - set(combination))[0]
to_update.update_gradients_full(dL_dK * prod, X, X2)
def update_gradients_diag(self, dL_dKdiag, X):
if len(self.parts)==2:
self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
else:
k = self.Kdiag(X)*dL_dKdiag
for p in self.parts:
p.update_gradients_diag(k/p.Kdiag(X),X)
for combination in itertools.combinations(self.parts, len(self.parts) - 1):
prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
to_update = list(set(self.parts) - set(combination))[0]
to_update.update_gradients_diag(dL_dKdiag * prod, X)
def gradients_X(self, dL_dK, X, X2=None):
target = np.zeros(X.shape)
@ -65,9 +81,10 @@ class Prod(CombinationKernel):
target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
else:
k = self.K(X,X2)*dL_dK
for p in self.parts:
target += p.gradients_X(k/p.K(X,X2),X,X2)
for combination in itertools.combinations(self.parts, len(self.parts) - 1):
prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
to_update = list(set(self.parts) - set(combination))[0]
target += to_update.gradients_X(dL_dK * prod, X, X2)
return target
def gradients_X_diag(self, dL_dKdiag, X):
@ -80,3 +97,5 @@ class Prod(CombinationKernel):
for p in self.parts:
target += p.gradients_X_diag(k/p.Kdiag(X),X)
return target

View file

@ -37,11 +37,11 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati
# Compute for psi0 and psi1
mu2S = np.square(mu)+S
dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu)
dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance)
dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu)
dL_dvar += (dL_dpsi0[:,None]*gamma*mu2S).sum(axis=0) + (dL_dpsi1.T.dot(gamma*mu)*Z).sum(axis=0)
dL_dgamma += dL_dpsi0[:,None]*variance*mu2S+ dL_dpsi1.dot(Z)*mu*variance
dL_dmu += dL_dpsi0[:,None]*2.*variance*gamma*mu + dL_dpsi1.dot(Z)*gamma*variance
dL_dS += dL_dpsi0[:,None]*variance*gamma
dL_dZ += dL_dpsi1.T.dot(gamma*mu)*variance
return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma
@ -64,29 +64,23 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
gamma2 = np.square(gamma)
variance2 = np.square(variance)
mu2S = mu2+S # NxQ
gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance)
common_sum = np.einsum('nq,mq->nm',gvm,Z)
# common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
gvm = gamma*mu*variance
common_sum = gvm.dot(Z.T)
Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
Z_expect_var2 = Z_expect*variance2
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z)
common_expect = np.einsum('mq,nm->nq',tmp,common_sum)
# common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum)
Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z)
common_expect = common_sum.dot(dL_dpsi2T).dot(Z)
Z2_expect = common_sum.dot(dL_dpsi2T)
Z1_expect = dL_dpsi2T.dot(Z)
dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
np.einsum('nq,nq,nq->q',common_expect,gamma,mu)
dL_dvar = variance*Z_expect*2.*(gamma*mu2S-gamma2*mu2).sum(axis=0)+(common_expect*gamma*mu).sum(axis=0)
dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
np.einsum('nq,q,nq->nq',common_expect,variance,mu)
dL_dgamma = Z_expect_var2*(mu2S-2.*gamma*mu2)+common_expect*mu*variance
dL_dmu = Z_expect_var2*mu*2.*(gamma-gamma2) + common_expect*gamma*variance
dL_dS = gamma*Z_expect_var2
dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2)
# dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect)
dL_dZ = (gamma*(mu2S-gamma*mu2)).sum(axis=0)*variance2*Z1_expect+ Z2_expect.T.dot(gamma*mu)*variance
return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ

View file

@ -22,12 +22,14 @@ try:
# _psi1 NxM
mu = variational_posterior.mean
S = variational_posterior.variance
gamma = variational_posterior.binary_prob
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
l2 = np.square(lengthscale)
log_denom1 = np.log(S/l2+1)
log_denom2 = np.log(2*S/l2+1)
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
log_gamma = np.log(gamma)
log_gamma1 = np.log(1.-gamma)
variance = float(variance)
psi0 = np.empty(N)
psi0[:] = variance
@ -37,6 +39,7 @@ try:
from ....util.misc import param_to_array
S = param_to_array(S)
mu = param_to_array(mu)
gamma = param_to_array(gamma)
Z = param_to_array(Z)
support_code = """
@ -79,7 +82,7 @@ try:
}
}
"""
weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
psi2 = psi2n.sum(axis=0)
return psi0,psi1,psi2,psi2n
@ -94,12 +97,13 @@ try:
mu = variational_posterior.mean
S = variational_posterior.variance
gamma = variational_posterior.binary_prob
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
l2 = np.square(lengthscale)
log_denom1 = np.log(S/l2+1)
log_denom2 = np.log(2*S/l2+1)
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
gamma, gamma1 = variational_posterior.gamma_probabilities()
log_gamma = np.log(gamma)
log_gamma1 = np.log(1.-gamma)
variance = float(variance)
dvar = np.zeros(1)
@ -113,6 +117,7 @@ try:
from ....util.misc import param_to_array
S = param_to_array(S)
mu = param_to_array(mu)
gamma = param_to_array(gamma)
Z = param_to_array(Z)
support_code = """
@ -130,7 +135,6 @@ try:
double Zm1q = Z(m1,q);
double Zm2q = Z(m2,q);
double gnq = gamma(n,q);
double g1nq = gamma1(n,q);
double mu_nq = mu(n,q);
if(m2==0) {
@ -156,7 +160,7 @@ try:
dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
}
@ -184,7 +188,7 @@ try:
dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
}
@ -192,7 +196,7 @@ try:
}
}
"""
weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
dl *= 2.*lengthscale
if not ARD:

View file

@ -7,7 +7,7 @@ from kern import Kern,CombinationKernel
from .independent_outputs import index_to_slices
import itertools
class DiffGenomeKern(Kern):
class DEtime(Kern):
def __init__(self, kernel, idx_p, Xp, index_dim=-1, name='DiffGenomeKern'):
self.idx_p = idx_p

View file

@ -60,7 +60,10 @@ class White(Static):
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.trace(dL_dK)
if X2 is None:
self.variance.gradient = np.trace(dL_dK)
else:
self.variance.gradient = 0.
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = dL_dKdiag.sum()

View file

@ -296,6 +296,8 @@ class Exponential(Stationary):
return -0.5*self.K_of_r(r)
class OU(Stationary):
"""
OU kernel:

View file

@ -77,7 +77,7 @@ class Bernoulli(Likelihood):
return Z_hat, mu_hat, sigma2_hat
def variational_expectations(self, Y, m, v, gh_points=None):
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
if isinstance(self.gp_link, link_functions.Probit):
if gh_points is None:

View file

@ -34,7 +34,9 @@ class Gaussian(Likelihood):
if gp_link is None:
gp_link = link_functions.Identity()
assert isinstance(gp_link, link_functions.Identity), "the likelihood only implemented for the identity link"
if not isinstance(gp_link, link_functions.Identity):
print "Warning, Exact inference is not implemeted for non-identity link functions,\
if you are not already, ensure Laplace inference_method is used"
super(Gaussian, self).__init__(gp_link, name=name)
@ -263,16 +265,19 @@ class Gaussian(Likelihood):
return d2logpdf_dlink2_dvar
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
return dlogpdf_dvar
dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dtheta[0,:,:] = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
return dlogpdf_dtheta
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
return dlogpdf_dlink_dvar
dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dlink_dtheta[0, :, :]= self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
return dlogpdf_dlink_dtheta
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
return d2logpdf_dlink2_dvar
d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
d2logpdf_dlink2_dtheta[0, :, :] = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
return d2logpdf_dlink2_dtheta
def _mean(self, gp):
"""
@ -305,18 +310,17 @@ class Gaussian(Likelihood):
Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
return Ysim.reshape(orig_shape)
def log_predictive_density(self, y_test, mu_star, var_star):
def log_predictive_density(self, y_test, mu_star, var_star, Y_metadata=None):
"""
assumes independence
"""
v = var_star + self.variance
return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v
def variational_expectations(self, Y, m, v, gh_points=None):
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
lik_var = float(self.variance)
F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
dF_dmu = (Y - m)/lik_var
dF_dv = np.ones_like(v)*(-0.5/lik_var)
dF_dlik_var = np.sum(-0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2))
dF_dtheta = [dF_dlik_var]
return F, dF_dmu, dF_dv, dF_dtheta
dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)
return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1])

View file

@ -5,7 +5,7 @@ import numpy as np
from scipy import stats,special
import scipy as sp
import link_functions
from ..util.misc import chain_1, chain_2, chain_3
from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
from scipy.integrate import quad
import warnings
from ..core.parameterization import Parameterized
@ -39,6 +39,7 @@ class Likelihood(Parameterized):
assert isinstance(gp_link,link_functions.GPTransformation), "gp_link is not a valid GPTransformation."
self.gp_link = gp_link
self.log_concave = False
self.not_block_really = False
def _gradients(self,partial):
return np.zeros(0)
@ -177,7 +178,12 @@ class Likelihood(Parameterized):
if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
stop
dF_dtheta = None # Not yet implemented
if self.size:
dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None]) # Ntheta x (orig size) x N_{quad_points}
dF_dtheta = np.dot(dF_dtheta, gh_w)
dF_dtheta = dF_dtheta.reshape(self.size, shape[0], shape[1])
else:
dF_dtheta = None # Not yet implemented
return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta
def predictive_mean(self, mu, variance, Y_metadata=None):
@ -189,20 +195,27 @@ class Likelihood(Parameterized):
"""
#conditional_mean: the expected value of y given some f, under this likelihood
fmin = -np.inf
fmax = np.inf
def int_mean(f,m,v):
p = np.exp(-(0.5/v)*np.square(f - m))
exponent = -(0.5/v)*np.square(f - m)
#If exponent is under -30 then exp(exponent) will be very small, so don't exp it!)
#If p is zero then conditional_mean will overflow
assert v.all() > 0
p = safe_exp(exponent)
#If p is zero then conditional_variance will overflow
if p < 1e-10:
return 0.
else:
return self.conditional_mean(f)*p
scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
scaled_mean = [quad(int_mean, fmin, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
return mean
def _conditional_mean(self, f):
"""Quadrature calculation of the conditional mean: E(Y_star|f)"""
"""Quadrature calculation of the conditional mean: E(Y_star|f_star)"""
raise NotImplementedError, "implement this function to make predictions"
def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
@ -210,7 +223,7 @@ class Likelihood(Parameterized):
Approximation to the predictive variance: V(Y_star)
The following variance decomposition is used:
V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
V(Y_star) = E( V(Y_star|f_star)**2 ) + V( E(Y_star|f_star) )**2
:param mu: mean of posterior
:param sigma: standard deviation of posterior
@ -220,15 +233,22 @@ class Likelihood(Parameterized):
#sigma2 = sigma**2
normalizer = np.sqrt(2*np.pi*variance)
fmin_v = -np.inf
fmin_m = np.inf
fmin = -np.inf
fmax = np.inf
from ..util.misc import safe_exp
# E( V(Y_star|f_star) )
def int_var(f,m,v):
p = np.exp(-(0.5/v)*np.square(f - m))
exponent = -(0.5/v)*np.square(f - m)
p = safe_exp(exponent)
#If p is zero then conditional_variance will overflow
if p < 1e-10:
return 0.
else:
return self.conditional_variance(f)*p
scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
scaled_exp_variance = [quad(int_var, fmin_v, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
exp_var = np.array(scaled_exp_variance)[:,None] / normalizer
#V( E(Y_star|f_star) ) = E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
@ -240,14 +260,15 @@ class Likelihood(Parameterized):
#E( E(Y_star|f_star)**2 )
def int_pred_mean_sq(f,m,v,predictive_mean_sq):
p = np.exp(-(0.5/v)*np.square(f - m))
exponent = -(0.5/v)*np.square(f - m)
p = np.exp(exponent)
#If p is zero then conditional_mean**2 will overflow
if p < 1e-10:
return 0.
else:
return self.conditional_mean(f)**2*p
scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
scaled_exp_exp2 = [quad(int_pred_mean_sq, fmin_m, fmax,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
var_exp = exp_exp2 - predictive_mean_sq
@ -295,8 +316,18 @@ class Likelihood(Parameterized):
:returns: likelihood evaluated for this point
:rtype: float
"""
inv_link_f = self.gp_link.transf(f)
return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
if isinstance(self.gp_link, link_functions.Identity):
return self.pdf_link(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
def logpdf_sum(self, f, y, Y_metadata=None):
"""
Convenience function that can overridden for functions where this could
be computed more efficiently
"""
return np.sum(self.logpdf(f, y, Y_metadata=Y_metadata))
def logpdf(self, f, y, Y_metadata=None):
"""
@ -313,8 +344,11 @@ class Likelihood(Parameterized):
:returns: log likelihood evaluated for this point
:rtype: float
"""
inv_link_f = self.gp_link.transf(f)
return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
if isinstance(self.gp_link, link_functions.Identity):
return self.logpdf_link(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
def dlogpdf_df(self, f, y, Y_metadata=None):
"""
@ -332,11 +366,15 @@ class Likelihood(Parameterized):
:returns: derivative of log likelihood evaluated for this point
:rtype: 1xN array
"""
inv_link_f = self.gp_link.transf(f)
dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
dlink_df = self.gp_link.dtransf_df(f)
return chain_1(dlogpdf_dlink, dlink_df)
if isinstance(self.gp_link, link_functions.Identity):
return self.dlogpdf_dlink(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
dlink_df = self.gp_link.dtransf_df(f)
return chain_1(dlogpdf_dlink, dlink_df)
@blockify_hessian
def d2logpdf_df2(self, f, y, Y_metadata=None):
"""
Evaluates the link function link(f) then computes the second derivative of log likelihood using it
@ -353,13 +391,18 @@ class Likelihood(Parameterized):
:returns: second derivative of log likelihood evaluated for this point (diagonal only)
:rtype: 1xN array
"""
inv_link_f = self.gp_link.transf(f)
d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
dlink_df = self.gp_link.dtransf_df(f)
dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
d2link_df2 = self.gp_link.d2transf_df2(f)
return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
if isinstance(self.gp_link, link_functions.Identity):
d2logpdf_df2 = self.d2logpdf_dlink2(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
dlink_df = self.gp_link.dtransf_df(f)
dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
d2link_df2 = self.gp_link.d2transf_df2(f)
d2logpdf_df2 = chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
return d2logpdf_df2
@blockify_third
def d3logpdf_df3(self, f, y, Y_metadata=None):
"""
Evaluates the link function link(f) then computes the third derivative of log likelihood using it
@ -376,64 +419,96 @@ class Likelihood(Parameterized):
:returns: third derivative of log likelihood evaluated for this point
:rtype: float
"""
inv_link_f = self.gp_link.transf(f)
d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
dlink_df = self.gp_link.dtransf_df(f)
d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
d2link_df2 = self.gp_link.d2transf_df2(f)
dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
d3link_df3 = self.gp_link.d3transf_df3(f)
return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
if isinstance(self.gp_link, link_functions.Identity):
d3logpdf_df3 = self.d3logpdf_dlink3(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
dlink_df = self.gp_link.dtransf_df(f)
d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
d2link_df2 = self.gp_link.d2transf_df2(f)
dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
d3link_df3 = self.gp_link.d3transf_df3(f)
d3logpdf_df3 = chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
return d3logpdf_df3
def dlogpdf_dtheta(self, f, y, Y_metadata=None):
"""
TODO: Doc strings
"""
if self.size > 0:
inv_link_f = self.gp_link.transf(f)
return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
if self.not_block_really:
raise NotImplementedError("Need to make a decorator for this!")
if isinstance(self.gp_link, link_functions.Identity):
return self.dlogpdf_link_dtheta(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
else:
# There are no parameters so return an empty array for derivatives
return np.zeros([1, 0])
return np.zeros((0, f.shape[0], f.shape[1]))
def dlogpdf_df_dtheta(self, f, y, Y_metadata=None):
"""
TODO: Doc strings
"""
if self.size > 0:
inv_link_f = self.gp_link.transf(f)
dlink_df = self.gp_link.dtransf_df(f)
dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
return chain_1(dlogpdf_dlink_dtheta, dlink_df)
if self.not_block_really:
raise NotImplementedError("Need to make a decorator for this!")
if isinstance(self.gp_link, link_functions.Identity):
return self.dlogpdf_dlink_dtheta(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
dlink_df = self.gp_link.dtransf_df(f)
dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
dlogpdf_df_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
#Chain each parameter of the likelihood separately
for p in range(self.size):
dlogpdf_df_dtheta[p, :, :] = chain_1(dlogpdf_dlink_dtheta[p,:,:], dlink_df)
return dlogpdf_df_dtheta
#return chain_1(dlogpdf_dlink_dtheta, dlink_df)
else:
# There are no parameters so return an empty array for derivatives
return np.zeros([f.shape[0], 0])
return np.zeros((0, f.shape[0], f.shape[1]))
def d2logpdf_df2_dtheta(self, f, y, Y_metadata=None):
"""
TODO: Doc strings
"""
if self.size > 0:
inv_link_f = self.gp_link.transf(f)
dlink_df = self.gp_link.dtransf_df(f)
d2link_df2 = self.gp_link.d2transf_df2(f)
d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
if self.not_block_really:
raise NotImplementedError("Need to make a decorator for this!")
if isinstance(self.gp_link, link_functions.Identity):
return self.d2logpdf_dlink2_dtheta(f, y, Y_metadata=Y_metadata)
else:
inv_link_f = self.gp_link.transf(f)
dlink_df = self.gp_link.dtransf_df(f)
d2link_df2 = self.gp_link.d2transf_df2(f)
d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
d2logpdf_df2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
#Chain each parameter of the likelihood separately
for p in range(self.size):
d2logpdf_df2_dtheta[p, :, :] = chain_2(d2logpdf_dlink2_dtheta[p,:,:], dlink_df, dlogpdf_dlink_dtheta[p,:,:], d2link_df2)
return d2logpdf_df2_dtheta
#return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
else:
# There are no parameters so return an empty array for derivatives
return np.zeros([f.shape[0], 0])
return np.zeros((0, f.shape[0], f.shape[1]))
def _laplace_gradients(self, f, y, Y_metadata=None):
dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, Y_metadata=Y_metadata).sum(axis=0)
dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, Y_metadata=Y_metadata)
dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, Y_metadata=Y_metadata)
d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, Y_metadata=Y_metadata)
#Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
# ensure we have gradients for every parameter we want to optimize
assert len(dlogpdf_dtheta) == self.size #1 x num_param array
assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix
assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix
assert dlogpdf_dtheta.shape[0] == self.size #f, d x num_param array
assert dlogpdf_df_dtheta.shape[0] == self.size #f x d x num_param matrix or just f x num_param
assert d2logpdf_df2_dtheta.shape[0] == self.size #f x num_param matrix or f x d x num_param matrix, f x f x num_param or f x f x d x num_param
return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
@ -454,19 +529,98 @@ class Likelihood(Parameterized):
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
#compute the quantiles by sampling!!!
N_samp = 1000
N_samp = 500
s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
#ss_f = s.flatten()
#ss_y = self.samples(ss_f, Y_metadata)
#ss_y = self.samples(s, Y_metadata, samples=100)
ss_y = self.samples(s, Y_metadata)
#ss_y = ss_y.reshape(mu.shape[0], N_samp)
return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
def samples(self, gp, Y_metadata=None):
def samples(self, gp, Y_metadata=None, samples=1):
"""
Returns a set of samples of observations based on a given value of the latent variable.
:param gp: latent variable
:param samples: number of samples to take for each f location
"""
raise NotImplementedError
raise NotImplementedError("""May be possible to use MCMC with user-tuning, see
MCMC_pdf_samples in likelihood.py and write samples function
using this, beware this is a simple implementation
of Metropolis and will not work well for all likelihoods""")
def MCMC_pdf_samples(self, fNew, num_samples=1000, starting_loc=None, stepsize=0.1, burn_in=1000, Y_metadata=None):
    """
    Simple implementation of the Metropolis sampling algorithm.

    Runs one parallel chain per input location (treats each f independently),
    thus assumes f*_1 independent of f*_2 etc.

    :param fNew: f at which to sample around
    :param num_samples: number of samples to take
    :param starting_loc: starting locations of the independent chains (usually
        will be conditional_mean of likelihood), often link_f
    :param stepsize: stepsize for the normal proposal distribution (will need modifying)
    :param burn_in: number of samples to use for burn-in (will need modifying)
    :param Y_metadata: Y_metadata for pdf
    :returns: (par_chains x num_samples) array of post burn-in chain values
    """
    print "Warning, using MCMC for sampling y*, needs to be tuned!"
    if starting_loc is None:
        starting_loc = fNew
    from functools import partial
    # f is held fixed at fNew; each chain samples y* with target
    # density proportional to self.pdf(y*|f).
    logpdf = partial(self.logpdf, f=fNew, Y_metadata=Y_metadata)
    pdf = lambda y_star: np.exp(logpdf(y=y_star[:, None]))
    #Should be the link function of f is a good starting point
    #(i.e. the point before you corrupt it with the likelihood)
    par_chains = starting_loc.shape[0]
    chain_values = np.zeros((par_chains, num_samples))
    chain_values[:, 0][:,None] = starting_loc
    #Use same stepsize for all par_chains
    stepsize = np.ones(par_chains)*stepsize
    accepted = np.zeros((par_chains, num_samples+burn_in))
    accept_ratio = np.zeros(num_samples+burn_in)
    #Whilst burning in, only need to keep the previous lot
    burnin_cache = np.zeros(par_chains)
    burnin_cache[:] = starting_loc.flatten()
    burning_in = True
    for i in xrange(burn_in+num_samples):
        # next_ind is negative while burning in; becomes a valid column
        # index into chain_values once i >= burn_in.
        next_ind = i-burn_in
        if burning_in:
            old_y = burnin_cache
        else:
            old_y = chain_values[:,next_ind-1]
        old_lik = pdf(old_y)
        #Propose new y from Gaussian proposal
        new_y = np.random.normal(loc=old_y, scale=stepsize)
        new_lik = pdf(new_y)
        #Accept using Metropolis (not hastings) acceptance
        #Always accepts if new_lik > old_lik
        accept_probability = np.minimum(1, new_lik/old_lik)
        u = np.random.uniform(0,1,par_chains)
        #print "Accept prob: ", accept_probability
        accepts = u < accept_probability
        if burning_in:
            burnin_cache[accepts] = new_y[accepts]
            burnin_cache[~accepts] = old_y[~accepts]
            if i == burn_in:
                # Burn-in finished: seed the recorded chain with the
                # last burn-in state and start storing samples.
                burning_in = False
                chain_values[:,0] = burnin_cache
        else:
            #If it was accepted then new_y becomes the latest sample
            chain_values[accepts, next_ind] = new_y[accepts]
            #Otherwise use old y as the sample
            chain_values[~accepts, next_ind] = old_y[~accepts]
        # Track per-iteration acceptance across all parallel chains
        # (over both burn-in and sampling phases).
        accepted[~accepts, i] = 0
        accepted[accepts, i] = 1
        accept_ratio[i] = np.sum(accepted[:,i])/float(par_chains)
        #Show progress
        # NOTE(review): int((burn_in+num_samples)*0.1) is 0 when
        # burn_in+num_samples < 10, which would raise ZeroDivisionError
        # here — confirm callers always use larger sample counts.
        if i % int((burn_in+num_samples)*0.1) == 0:
            print "{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i)
            print "Last run accept ratio: ", accept_ratio[i]
            print "Average accept ratio: ", np.mean(accept_ratio)
    return chain_values

View file

@ -35,8 +35,8 @@ class StudentT(Likelihood):
self.log_concave = False
def parameters_changed(self):
self.variance = (self.v / float(self.v - 2)) * self.sigma2
#def parameters_changed(self):
#self.variance = (self.v / float(self.v - 2)) * self.sigma2
def update_gradients(self, grads):
"""
@ -180,7 +180,8 @@ class StudentT(Likelihood):
:rtype: float
"""
e = y - inv_link_f
dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
e2 = np.square(e)
dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2))
return dlogpdf_dvar
def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
@ -226,17 +227,18 @@ class StudentT(Likelihood):
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
dlogpdf_dv = np.zeros_like(dlogpdf_dvar) #FIXME: Not done yet
return np.hstack((dlogpdf_dvar, dlogpdf_dv))
return np.array((dlogpdf_dvar, dlogpdf_dv))
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
dlogpdf_dlink_dv = np.zeros_like(dlogpdf_dlink_dvar) #FIXME: Not done yet
return np.hstack((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
return np.array((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet
return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
return np.array((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
def predictive_mean(self, mu, sigma, Y_metadata=None):
# The comment here confuses mean and median.

View file

@ -4,4 +4,5 @@
from kernel import Kernel
from linear import Linear
from mlp import MLP
#from rbf import RBF
from additive import Additive
from compound import Compound

View file

@ -2,8 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
import GPy
from ..core import Mapping
class Additive(Mapping):
"""
@ -17,45 +16,23 @@ class Additive(Mapping):
:type mapping1: GPy.mappings.Mapping
:param mapping2: second mapping to add together.
:type mapping2: GPy.mappings.Mapping
:param tensor: whether or not to use the tensor product of input spaces
:type tensor: bool
"""
def __init__(self, mapping1, mapping2, tensor=False):
if tensor:
input_dim = mapping1.input_dim + mapping2.input_dim
else:
input_dim = mapping1.input_dim
assert(mapping1.input_dim==mapping2.input_dim)
def __init__(self, mapping1, mapping2):
assert(mapping1.input_dim==mapping2.input_dim)
assert(mapping1.output_dim==mapping2.output_dim)
output_dim = mapping1.output_dim
input_dim, output_dim = mapping1.input_dim, mapping1.output_dim
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
self.mapping1 = mapping1
self.mapping2 = mapping2
self.num_params = self.mapping1.num_params + self.mapping2.num_params
self.name = self.mapping1.name + '+' + self.mapping2.name
def _get_param_names(self):
return self.mapping1._get_param_names + self.mapping2._get_param_names
def _get_params(self):
return np.hstack((self.mapping1._get_params(), self.mapping2._get_params()))
def _set_params(self, x):
self.mapping1._set_params(x[:self.mapping1.num_params])
self.mapping2._set_params(x[self.mapping1.num_params:])
def randomize(self):
self.mapping1._randomize()
self.mapping2._randomize()
def f(self, X):
return self.mapping1.f(X) + self.mapping2.f(X)
def df_dtheta(self, dL_df, X):
self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
self._df_dbias = (dL_df.sum(0))
return np.hstack((self._df_dA.flatten(), self._df_dbias))
def update_gradients(self, dL_dF, X):
self.mapping1.update_gradients(dL_dF, X)
self.mapping2.update_gradients(dL_dF, X)
def df_dX(self, dL_df, X):
return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
def gradients_X(self, dL_dF, X):
return self.mapping1.gradients_X(dL_dF, X) + self.mapping2.gradients_X(dL_dF, X)

39
GPy/mappings/compound.py Normal file
View file

@ -0,0 +1,39 @@
# Copyright (c) 2015, James Hensman and Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from ..core import Mapping
class Compound(Mapping):
    """
    Composition of two mappings: the output of the first is fed to the second.

    .. math::
       f(\mathbf{x}) = f_2(f_1(\mathbf{x}))

    :param mapping1: inner mapping, applied first
    :type mapping1: GPy.mappings.Mapping
    :param mapping2: outer mapping, applied to the output of mapping1
    :type mapping2: GPy.mappings.Mapping
    """
    def __init__(self, mapping1, mapping2):
        # the inner mapping's output must match the outer mapping's input
        assert mapping1.output_dim == mapping2.input_dim
        Mapping.__init__(self,
                         input_dim=mapping1.input_dim,
                         output_dim=mapping2.output_dim)
        self.mapping1 = mapping1
        self.mapping2 = mapping2
        self.link_parameters(self.mapping1, self.mapping2)

    def f(self, X):
        """Evaluate the composition f_2(f_1(X))."""
        inner = self.mapping1.f(X)
        return self.mapping2.f(inner)

    def update_gradients(self, dL_dF, X):
        """Backpropagate dL_dF through both mappings, setting their gradients."""
        inner = self.mapping1.f(X)
        self.mapping2.update_gradients(dL_dF, inner)
        # chain rule: gradient of the loss w.r.t. the inner mapping's output
        dL_dinner = self.mapping2.gradients_X(dL_dF, inner)
        self.mapping1.update_gradients(dL_dinner, X)

    def gradients_X(self, dL_dF, X):
        """Gradient of the loss w.r.t. the inputs X, via the chain rule."""
        inner = self.mapping1.f(X)
        dL_dinner = self.mapping2.gradients_X(dL_dF, inner)
        return self.mapping1.gradients_X(dL_dinner, X)

26
GPy/mappings/identity.py Normal file
View file

@ -0,0 +1,26 @@
# Copyright (c) 2015, James Hensman
from ..core.mapping import Mapping
from ..core import Param
class Identity(Mapping):
    """
    The identity mapping: returns its input unchanged.

    An explicit do-nothing mapping lets code that expects a Mapping be used
    without applying any transformation to the inputs.
    """
    def __init__(self, input_dim, output_dim, name='identity'):
        Mapping.__init__(self, input_dim, output_dim, name)

    def f(self, X):
        """Return X unchanged."""
        return X

    def update_gradients(self, dL_dF, X):
        """There are no parameters, so there is nothing to update."""
        pass

    def gradients_X(self, dL_dF, X):
        """The Jacobian is the identity, so the gradient passes straight through."""
        return dL_dF

View file

@ -1,9 +1,10 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
import GPy
from ..core import Param
class Kernel(Mapping):
"""
@ -11,50 +12,41 @@ class Kernel(Mapping):
.. math::
f(\mathbf{x}*) = \mathbf{A}\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b}
f(\mathbf{x}) = \sum_i \alpha_i k(\mathbf{z}_i, \mathbf{x})
:param X: input observations containing :math:`\mathbf{X}`
:type X: ndarray
or for multple outputs
.. math::
f_i(\mathbf{x}) = \sum_j \alpha_{i,j} k(\mathbf{z}_i, \mathbf{x})
:param input_dim: dimension of input.
:type input_dim: int
:param output_dim: dimension of output.
:type output_dim: int
:param Z: input observations containing :math:`\mathbf{Z}`
:type Z: ndarray
:param kernel: a GPy kernel, defaults to GPy.kern.RBF
:type kernel: GPy.kern.kern
"""
def __init__(self, X, output_dim=1, kernel=None):
Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim)
if kernel is None:
kernel = GPy.kern.RBF(self.input_dim)
def __init__(self, input_dim, output_dim, Z, kernel, name='kernmap'):
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
self.kern = kernel
self.X = X
self.num_data = X.shape[0]
self.num_params = self.output_dim*(self.num_data + 1)
self.A = np.array((self.num_data, self.output_dim))
self.bias = np.array(self.output_dim)
self.randomize()
self.name = 'kernel'
def _get_param_names(self):
return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
def _get_params(self):
return np.hstack((self.A.flatten(), self.bias))
def _set_params(self, x):
self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
self.bias = x[self.num_data*self.output_dim:].copy()
def randomize(self):
self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1)
self.Z = Z
self.num_bases, Zdim = Z.shape
assert Zdim == self.input_dim
self.A = Param('A', np.random.randn(self.num_bases, self.output_dim))
self.link_parameter(self.A)
def f(self, X):
return np.dot(self.kern.K(X, self.X),self.A) + self.bias
return np.dot(self.kern.K(X, self.Z), self.A)
def df_dtheta(self, dL_df, X):
self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
self._df_dbias = (dL_df.sum(0))
return np.hstack((self._df_dA.flatten(), self._df_dbias))
def update_gradients(self, dL_dF, X):
self.kern.update_gradients_full(np.dot(dL_dF, self.A.T), X, self.Z)
self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)
def df_dX(self, dL_df, X):
return self.kern.gradients_X((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
def gradients_X(self, dL_dF, X):
return self.kern.gradients_X(np.dot(dL_dF, self.A.T), X, self.Z)

View file

@ -1,43 +1,39 @@
# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Bijective_mapping
from ..core.mapping import Mapping
from ..core.parameterization import Param
class Linear(Bijective_mapping):
class Linear(Mapping):
"""
Mapping based on a linear model.
A Linear mapping.
.. math::
f(\mathbf{x}*) = \mathbf{W}\mathbf{x}^* + \mathbf{b}
F(\mathbf{x}) = \mathbf{A} \mathbf{x})
:param X: input observations
:type X: ndarray
:param input_dim: dimension of input.
:type input_dim: int
:param output_dim: dimension of output.
:type output_dim: int
:param kernel: a GPy kernel, defaults to GPy.kern.RBF
:type kernel: GPy.kern.kern
"""
def __init__(self, input_dim=1, output_dim=1, name='linear'):
Bijective_mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
self.W = Param('W',np.array((self.input_dim, self.output_dim)))
self.bias = Param('bias',np.array(self.output_dim))
self.link_parameters(self.W, self.bias)
def __init__(self, input_dim, output_dim, name='linmap'):
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
self.A = Param('A', np.random.randn(self.input_dim, self.output_dim))
self.link_parameter(self.A)
def f(self, X):
return np.dot(X,self.W) + self.bias
return np.dot(X, self.A)
def g(self, f):
V = np.linalg.solve(np.dot(self.W.T, self.W), W.T)
return np.dot(f-self.bias, V)
def update_gradients(self, dL_dF, X):
self.A.gradient = np.dot( X.T, dL_dF)
def df_dtheta(self, dL_df, X):
df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
df_dbias = (dL_df.sum(0))
return np.hstack((df_dW.flatten(), df_dbias))
def dL_dX(self, partial, X):
"""The gradient of L with respect to the inputs to the mapping, where L is a function that is dependent on the output of the mapping, f."""
return (partial[:, None, :]*self.W[None, :, :]).sum(2)
def gradients_X(self, dL_dF, X):
return np.dot(dL_dF, self.A.T)

View file

@ -3,128 +3,53 @@
import numpy as np
from ..core.mapping import Mapping
from ..core import Param
class MLP(Mapping):
"""
Mapping based on a multi-layer perceptron neural network model.
.. math::
f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0
where
.. math::
\\phi(\\cdot) = \\text{tanh}(\\cdot)
:param X: input observations
:type X: ndarray
:param output_dim: dimension of output.
:type output_dim: int
:param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it.
:type hidden_dim: int or list of ints.
Mapping based on a multi-layer perceptron neural network model, with a single hidden layer
"""
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
self.name = 'mlp'
if isinstance(hidden_dim, int):
hidden_dim = [hidden_dim]
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
super(MLP, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name)
self.hidden_dim = hidden_dim
self.activation = [None]*len(self.hidden_dim)
self.W = []
self._dL_dW = []
self.bias = []
self._dL_dbias = []
self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
self.bias.append(np.zeros(self.hidden_dim[0]))
self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
self.num_params = self.hidden_dim[0]*(self.input_dim+1)
for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
self.W.append(np.zeros((h0, h1)))
self._dL_dW.append(np.zeros((h0, h1)))
self.bias.append(np.zeros(h1))
self._dL_dbias.append(np.zeros(h1))
self.num_params += h1*(h0+1)
self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
self.bias.append(np.zeros(self.output_dim))
self._dL_dbias.append(np.zeros(self.output_dim))
self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
self.randomize()
self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
self.b1 = Param('b1', np.random.randn(self.hidden_dim))
self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
self.b2 = Param('b2', np.random.randn(self.output_dim))
self.link_parameters(self.W1, self.b1, self.W2, self.b2)
def _get_param_names(self):
return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], [])
def _get_params(self):
param = np.array([])
for W, bias in zip(self.W, self.bias):
param = np.hstack((param, W.flatten(), bias))
return param
def _set_params(self, x):
start = 0
for W, bias in zip(self.W, self.bias):
end = W.shape[0]*W.shape[1]+start
W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
start = end
end = W.shape[1]+end
bias[:] = x[start:end].copy()
start = end
def randomize(self):
for W, bias in zip(self.W, self.bias):
W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)
def f(self, X):
self._f_computations(X)
return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1]
layer1 = np.dot(X, self.W1) + self.b1
activations = np.tanh(layer1)
return np.dot(activations, self.W2) + self.b2
def _f_computations(self, X):
W = self.W[0]
bias = self.bias[0]
self.activation[0] = np.dot(X,W) + bias
for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias
def update_gradients(self, dL_dF, X):
layer1 = np.dot(X,self.W1) + self.b1
activations = np.tanh(layer1)
#Evaluate second-layer gradients.
self.W2.gradient = np.dot(activations.T, dL_dF)
self.b2.gradient = np.sum(dL_dF, 0)
# Backpropagation to hidden layer.
dL_dact = np.dot(dL_dF, self.W2.T)
dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
# Finally, evaluate the first-layer gradients.
self.W1.gradient = np.dot(X.T,dL_dlayer1)
self.b1.gradient = np.sum(dL_dlayer1, 0)
def gradients_X(self, dL_dF, X):
layer1 = np.dot(X,self.W1) + self.b1
activations = np.tanh(layer1)
# Backpropagation to hidden layer.
dL_dact = np.dot(dL_dF, self.W2.T)
dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
return np.dot(dL_dlayer1, self.W1.T)
def df_dtheta(self, dL_df, X):
self._df_computations(dL_df, X)
g = np.array([])
for gW, gbias in zip(self._dL_dW, self._dL_dbias):
g = np.hstack((g, gW.flatten(), gbias))
return g
def _df_computations(self, dL_df, X):
self._f_computations(X)
a0 = self.activation[-1]
W = self.W[-1]
self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2)
self._dL_dbias[-1] = (dL_df.sum(0))
for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
self._dL_dbias[-2:0:-1],
self.W[-2:0:-1],
self.bias[-2:0:-1],
self.activation[-2::-1],
self.activation[-1:0:-1]):
ta = np.tanh(a1)
dL_da = dL_dta*(1-ta*ta)
dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
dL_dbias[:] = (dL_da.sum(0))
dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
ta = np.tanh(self.activation[0])
dL_da = dL_dta*(1-ta*ta)
W = self.W[0]
self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
self._dL_dbias[0] = (dL_da.sum(0))
self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
def df_dX(self, dL_df, X):
self._df_computations(dL_df, X)
return self._dL_dX

View file

@ -0,0 +1,94 @@
from GPy.core.mapping import Mapping
from GPy.core import Param
import numpy as np
class PiecewiseLinear(Mapping):
    """
    A piecewise-linear mapping (scalar input to scalar output).

    The parameters of this mapping are the positions of the breakpoints
    (self.breaks) and the function values at those breakpoints (self.values).
    Between consecutive breakpoints the function interpolates linearly;
    outside the range of the breakpoints it is assumed to have gradient 1.

    :param input_dim: dimension of the input (must be 1)
    :param output_dim: dimension of the output (must be 1)
    :param values: function values at the breakpoints
    :param breaks: positions of the breakpoints
    :param name: name of the mapping
    """
    def __init__(self, input_dim, output_dim, values, breaks, name='piecewise_linear'):
        # only the scalar case is implemented
        assert input_dim == 1
        assert output_dim == 1
        Mapping.__init__(self, input_dim, output_dim, name)
        values, breaks = np.array(values).flatten(), np.array(breaks).flatten()
        # one value per breakpoint
        assert values.size == breaks.size
        self.values = Param('values', values)
        self.breaks = Param('breaks', breaks)
        self.link_parameter(self.values)
        self.link_parameter(self.breaks)

    def parameters_changed(self):
        # Cache a sorted view of the breakpoints (and the matching values) so
        # that f and the gradient methods can scan the intervals left-to-right
        # regardless of the order in which the parameters are stored.
        self.order = np.argsort(self.breaks) * 1
        self.reverse_order = np.zeros_like(self.order)
        self.reverse_order[self.order] = np.arange(self.order.size)
        self.sorted_breaks = self.breaks[self.order]
        self.sorted_values = self.values[self.order]
        # slope of each linear segment between consecutive breakpoints
        self.grads = np.diff(self.sorted_values) / np.diff(self.sorted_breaks)

    def f(self, X):
        """Evaluate the piecewise-linear function at X (returned as a column)."""
        x = X.flatten()
        y = x.copy()
        # first adjust the points below the first breakpoint (unit gradient)
        below = x < self.sorted_breaks[0]
        y[below] = x[below] + self.sorted_values[0] - self.sorted_breaks[0]
        # now all the points at or past the last breakpoint (unit gradient).
        # Using >= (rather than the former strict >) means a point exactly on
        # the last breakpoint maps to its value instead of being left at y=x.
        above = x >= self.sorted_breaks[-1]
        y[above] = x[above] + self.sorted_values[-1] - self.sorted_breaks[-1]
        # loop through the pairs of breakpoints, interpolating linearly.
        # The half-open interval [low, up) ensures every x is covered exactly
        # once (the original strict inequalities skipped points that fell
        # exactly on a breakpoint, incorrectly leaving y = x there).
        for low, up, g, v in zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
            i = np.logical_and(x >= low, x < up)
            y[i] = v + (x[i] - low) * g
        return y.reshape(-1, 1)

    def update_gradients(self, dL_dF, X):
        """Accumulate the gradients of the loss into self.breaks and self.values."""
        x = X.flatten()
        dL_dF = dL_dF.flatten()
        dL_db = np.zeros(self.sorted_breaks.size)
        dL_dv = np.zeros(self.sorted_values.size)
        # loop across each interval, computing the gradient for each of the 4
        # parameters (two breakpoints and two values) that define it
        for i, (low, up, g, v) in enumerate(zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1])):
            # half-open interval [low, up), consistent with f
            index = np.logical_and(x >= low, x < up)
            xx = x[index]
            grad = dL_dF[index]
            span = up - low
            dL_dv[i] += np.sum(grad * ((low - xx) / span + 1))
            dL_dv[i + 1] += np.sum(grad * (xx - low) / span)
            dL_db[i] += np.sum(grad * g * (xx - up) / span)
            dL_db[i + 1] += np.sum(grad * g * (low - xx) / span)
        # now the flat end parts (masks consistent with f: < first, >= last)
        below = x < self.sorted_breaks[0]
        above = x >= self.sorted_breaks[-1]
        dL_db[0] -= np.sum(dL_dF[below])
        dL_db[-1] -= np.sum(dL_dF[above])
        dL_dv[0] += np.sum(dL_dF[below])
        dL_dv[-1] += np.sum(dL_dF[above])
        # now put the gradients back into the (unsorted) parameter order!
        self.breaks.gradient = dL_db[self.reverse_order]
        self.values.gradient = dL_dv[self.reverse_order]

    def gradients_X(self, dL_dF, X):
        """Gradient of the loss with respect to the inputs X."""
        x = X.flatten()
        # flatten so the boolean indexing below lines up with x (the original
        # indexed the 2d dL_dF with a 1d mask while dL_dX was flattened,
        # producing a shape mismatch on assignment)
        dL_dF = dL_dF.flatten()
        # outside the range of the breakpoints, the function is just offset by
        # a constant, so the partial derivative is 1
        dL_dX = dL_dF.copy()
        # inside the breakpoints, the partial derivative is the segment slope
        for low, up, g, v in zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
            i = np.logical_and(x >= low, x < up)
            dL_dX[i] = dL_dF[i] * g
        return dL_dX.reshape(-1, 1)

View file

@ -43,10 +43,11 @@ class SparseGPMiniBatch(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False,
missing_data=False, stochastic=False, batchsize=1):
#pick a sensible inference method
# pick a sensible inference method
if inference_method is None:
if isinstance(likelihood, likelihoods.Gaussian):
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
else:
#inference_method = ??
raise NotImplementedError, "what to do what to do?"

View file

@ -39,7 +39,10 @@ class SSGPLVM(SparseGP_MPI):
X_variance = np.random.uniform(0,.1,X.shape)
if Gamma is None:
gamma = np.random.randn(X.shape[0], input_dim)
gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
gamma[gamma>1.-1e-9] = 1.-1e-9
gamma[gamma<1e-9] = 1e-9
else:
gamma = Gamma.copy()

View file

@ -1,7 +1,6 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..util.warping_functions import *
from ..core import GP
@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d
from GPy import kern
class WarpedGP(GP):
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
if kernel is None:
kernel = kern.rbf(X.shape[1])
kernel = kern.RBF(X.shape[1])
if warping_function == None:
self.warping_function = TanhWarpingFunction_d(warping_terms)
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
else:
self.warping_function = warping_function
self.scale_data = False
if self.scale_data:
@ -25,10 +26,10 @@ class WarpedGP(GP):
self.has_uncertain_inputs = False
self.Y_untransformed = Y.copy()
self.predict_in_warped_space = False
likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
likelihood = likelihoods.Gaussian()
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())
GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
self.link_parameter(self.warping_function)
def _scale_data(self, Y):
self._Ymax = Y.max()
@ -38,62 +39,55 @@ class WarpedGP(GP):
def _unscale_data(self, Y):
return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
def _set_params(self, x):
self.warping_params = x[:self.warping_function.num_parameters]
Y = self.transform_data()
self.likelihood.set_data(Y)
GP._set_params(self, x[self.warping_function.num_parameters:].copy())
def parameters_changed(self):
self.Y[:] = self.transform_data()
super(WarpedGP, self).parameters_changed()
def _get_params(self):
return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
Kiy = self.posterior.woodbury_vector.flatten()
def _get_param_names(self):
warping_names = self.warping_function._get_param_names()
param_names = GP._get_param_names(self)
return warping_names + param_names
def transform_data(self):
Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
return Y
def log_likelihood(self):
ll = GP.log_likelihood(self)
jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
return ll + np.log(jacobian).sum()
def _log_likelihood_gradients(self):
ll_grads = GP._log_likelihood_gradients(self)
alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
warping_grads = self.warping_function_gradients(alpha)
warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
def warping_function_gradients(self, Kiy):
grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
return_covar_chain=True)
djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
return -dquad_dpsi + djac_dpsi
warping_grads = -dquad_dpsi + djac_dpsi
self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
self.warping_function.d.gradient[:] = warping_grads[0, -1]
def transform_data(self):
Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
return Y
def log_likelihood(self):
ll = GP.log_likelihood(self)
jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
return ll + np.log(jacobian).sum()
def plot_warping(self):
self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
def predict(self, Xnew, which_parts='all', pred_init=None):
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
# Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
mu, var = GP._raw_predict(self, Xnew)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
mean, var = self.likelihood.predictive_values(mu, var)
if self.predict_in_warped_space:
mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
var = self.warping_function.f_inv(var, self.warping_params)
mean = self.warping_function.f_inv(mean, y=pred_init)
var = self.warping_function.f_inv(var)
if self.scale_data:
mean = self._unscale_data(mean)
return mean, var, _025pm, _975pm
return mean, var
if __name__ == '__main__':
X = np.random.randn(100, 1)
Y = np.sin(X) + np.random.randn(100, 1)*0.05
m = WarpedGP(X, Y)

View file

@ -6,7 +6,11 @@ try:
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
#from matplotlib import cm
pb.ion()
try:
__IPYTHON__
pb.ion()
except NameError:
pass
except:
pass
import re

View file

@ -11,39 +11,38 @@ import GPy
class InferenceXTestCase(unittest.TestCase):
def genData(self):
D1,D2,N = 12,12,50
np.random.seed(1234)
x = np.linspace(0, 4 * np.pi, N)[:, None]
s1 = np.vectorize(lambda x: np.sin(x))
s2 = np.vectorize(lambda x: np.cos(x)**2)
s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
sS = np.vectorize(lambda x: np.cos(x))
s1 = s1(x)
s2 = s2(x)
s3 = s3(x)
sS = sS(x)
s1 -= s1.mean(); s1 /= s1.std(0)
s2 -= s2.mean(); s2 /= s2.std(0)
s3 -= s3.mean(); s3 /= s3.std(0)
sS -= sS.mean(); sS /= sS.std(0)
S1 = np.hstack([s1, sS])
S2 = np.hstack([s3, sS])
P1 = np.random.randn(S1.shape[1], D1)
P2 = np.random.randn(S2.shape[1], D2)
Y1 = S1.dot(P1)
Y2 = S2.dot(P2)
Y1 += .01 * np.random.randn(*Y1.shape)
Y2 += .01 * np.random.randn(*Y2.shape)
Y1 -= Y1.mean(0)
Y2 -= Y2.mean(0)
Y1 /= Y1.std(0)
@ -52,33 +51,34 @@ class InferenceXTestCase(unittest.TestCase):
slist = [s1, s2, s3, sS]
slist_names = ["s1", "s2", "s3", "sS"]
Ylist = [Y1, Y2]
return Ylist
def test_inferenceX_BGPLVM(self):
Ys = self.genData()
m = GPy.models.BayesianGPLVM(Ys[0],5,kernel=GPy.kern.Linear(5,ARD=True))
x,mi = m.infer_newX(m.Y, optimize=False)
self.assertTrue(mi.checkgrad())
m.optimize(max_iters=10000)
x,mi = m.infer_newX(m.Y)
self.assertTrue(np.allclose(m.X.mean, mi.X.mean))
self.assertTrue(np.allclose(m.X.variance, mi.X.variance))
m.optimize(max_iters=10000)
x, mi = m.infer_newX(m.Y)
print m.X.mean - mi.X.mean
self.assertTrue(np.allclose(m.X.mean, mi.X.mean, rtol=1e-4, atol=1e-4))
self.assertTrue(np.allclose(m.X.variance, mi.X.variance, rtol=1e-4, atol=1e-4))
def test_inferenceX_GPLVM(self):
Ys = self.genData()
m = GPy.models.GPLVM(Ys[0],3,kernel=GPy.kern.RBF(3,ARD=True))
x,mi = m.infer_newX(m.Y, optimize=False)
self.assertTrue(mi.checkgrad())
# m.optimize(max_iters=10000)
# x,mi = m.infer_newX(m.Y)
# self.assertTrue(np.allclose(m.X, x))
if __name__ == "__main__":
unittest.main()

View file

@ -256,13 +256,23 @@ class KernelGradientTestsContinuous(unittest.TestCase):
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Prod1(self):
k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D)
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Prod2(self):
k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Prod3(self):
k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) * GPy.kern.Bias(self.D)
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Prod4(self):
k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) * GPy.kern.Matern32(2, active_dims=[0,1])
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
@ -401,11 +411,27 @@ class Coregionalize_weave_test(unittest.TestCase):
GPy.util.config.config.set('weave', 'working', 'False')
class KernelTestsProductWithZeroValues(unittest.TestCase):
def setUp(self):
self.X = np.array([[0,1],[1,0]])
self.k = GPy.kern.Linear(2) * GPy.kern.Bias(2)
def test_zero_valued_kernel_full(self):
self.k.update_gradients_full(1, self.X)
self.assertFalse(np.isnan(self.k['linear.variances'].gradient),
"Gradient resulted in NaN")
def test_zero_valued_kernel_gradients_X(self):
target = self.k.gradients_X(1, self.X)
self.assertFalse(np.any(np.isnan(target)),
"Gradient resulted in NaN")
if __name__ == "__main__":
print "Running unit tests, please be (very) patient..."
unittest.main()
# np.random.seed(0)
# N0 = 3
# N1 = 9

View file

@ -10,7 +10,7 @@ from GPy.likelihoods import link_functions
from GPy.core.parameterization import Param
from functools import partial
#np.random.seed(300)
#np.random.seed(7)
#np.random.seed(4)
#np.seterr(divide='raise')
def dparam_partial(inst_func, *args):
@ -52,8 +52,17 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None,
zipped_params = zip(params, params_names)
for param_ind, (param_val, param_name) in enumerate(zipped_params):
#Check one parameter at a time, make sure it is 2d (as some gradients only return arrays) then strip out the parameter
fnum = np.atleast_2d(partial_f(param_val, param_name))[:, param_ind].shape[0]
dfnum = np.atleast_2d(partial_df(param_val, param_name))[:, param_ind].shape[0]
f_ = partial_f(param_val, param_name)
df_ = partial_df(param_val, param_name)
#Reshape it so that we have a 3d matrix; that is, we want it (?, N, D) regardless of whether ? is num_params or not
f_ = f_.reshape(-1, f_.shape[0], f_.shape[1])
df_ = df_.reshape(-1, f_.shape[0], f_.shape[1])
#Get the number of f and number of dimensions
fnum = f_.shape[-2]
fdim = f_.shape[-1]
dfnum = df_.shape[-2]
for fixed_val in range(dfnum):
#dlik and dlik_dvar gives back 1 value for each
f_ind = min(fnum, fixed_val+1) - 1
@ -61,9 +70,13 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None,
#Make grad checker with this param moving, note that set_params is NOT being called
#The parameter is being set directly with __setattr__
#Check only the parameter and function value we wish to check at a time
grad = GradientChecker(lambda p_val: np.atleast_2d(partial_f(p_val, param_name))[f_ind, param_ind],
lambda p_val: np.atleast_2d(partial_df(p_val, param_name))[fixed_val, param_ind],
param_val, [param_name])
#func = lambda p_val, fnum, fdim, param_ind, f_ind, param_ind: partial_f(p_val, param_name).reshape(-1, fnum, fdim)[param_ind, f_ind, :]
#dfunc_dparam = lambda d_val, fnum, fdim, param_ind, fixed_val: partial_df(d_val, param_name).reshape(-1, fnum, fdim)[param_ind, fixed_val, :]
#First we reshape the output such that it is (num_params, N, D), then we pull out the relevant parameter/f-index and checkgrad just this index at a time
func = lambda p_val: partial_f(p_val, param_name).reshape(-1, fnum, fdim)[param_ind, f_ind, :]
dfunc_dparam = lambda d_val: partial_df(d_val, param_name).reshape(-1, fnum, fdim)[param_ind, fixed_val, :]
grad = GradientChecker(func, dfunc_dparam, param_val, [param_name])
if constraints is not None:
for constrain_param, constraint in constraints:
@ -104,37 +117,9 @@ class TestNoiseModels(object):
self.var = 0.2
self.var = np.random.rand(1)
#Make a bigger step as lower bound can be quite curved
self.step = 1e-4
def tearDown(self):
self.Y = None
self.f = None
self.X = None
def test_scale2_models(self):
self.setUp()
####################################################
# Constraint wrappers so we can just list them off #
####################################################
def constrain_fixed(regex, model):
model[regex].constrain_fixed()
def constrain_negative(regex, model):
model[regex].constrain_negative()
def constrain_positive(regex, model):
model[regex].constrain_positive()
def constrain_bounded(regex, model, lower, upper):
"""
Used like: partial(constrain_bounded, lower=0, upper=1)
"""
model[regex].constrain_bounded(lower, upper)
"""
Dictionary where we nest models we would like to check
Name: {
@ -149,136 +134,170 @@ class TestNoiseModels(object):
"link_f_constraints": [constraint_wrappers, listed_here]
}
"""
noise_models = {"Student_t_default": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [self.var],
"constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
#"constraints": [("t_scale2", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))]
},
"laplace": True
},
"Student_t_1_var": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [1.0],
"constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
},
"laplace": True
},
"Student_t_small_deg_free": {
"model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [self.var],
"constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
},
"laplace": True
},
"Student_t_small_var": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [0.001],
"constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
},
"laplace": True
},
"Student_t_large_var": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [10.0],
"constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
},
"laplace": True
},
"Student_t_approx_gauss": {
"model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [self.var],
"constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
},
"laplace": True
},
"Student_t_log": {
"model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [self.var],
"constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
},
"laplace": True
},
"Gaussian_default": {
"model": GPy.likelihoods.Gaussian(variance=self.var),
"grad_params": {
"names": [".*variance"],
"vals": [self.var],
"constraints": [(".*variance", constrain_positive)]
},
"laplace": True,
"ep": False # FIXME: Should be True when we have it working again
},
#"Gaussian_log": {
#"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log(), variance=self.var, D=self.D, N=self.N),
#"grad_params": {
#"names": ["noise_model_variance"],
#"vals": [self.var],
#"constraints": [constrain_positive]
#},
#"laplace": True
#},
#"Gaussian_probit": {
#"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
#"grad_params": {
#"names": ["noise_model_variance"],
#"vals": [self.var],
#"constraints": [constrain_positive]
#},
#"laplace": True
#},
#"Gaussian_log_ex": {
#"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
#"grad_params": {
#"names": ["noise_model_variance"],
#"vals": [self.var],
#"constraints": [constrain_positive]
#},
#"laplace": True
#},
"Bernoulli_default": {
"model": GPy.likelihoods.Bernoulli(),
"link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
"laplace": True,
"Y": self.binary_Y,
"ep": False # FIXME: Should be True when we have it working again
},
"Exponential_default": {
"model": GPy.likelihoods.Exponential(),
"link_f_constraints": [constrain_positive],
"Y": self.positive_Y,
"laplace": True,
},
"Poisson_default": {
"model": GPy.likelihoods.Poisson(),
"link_f_constraints": [constrain_positive],
"Y": self.integer_Y,
"laplace": True,
"ep": False #Should work though...
}#,
#GAMMA needs some work!"Gamma_default": {
#"model": GPy.likelihoods.Gamma(),
#"link_f_constraints": [constrain_positive],
#"Y": self.positive_Y,
#"laplace": True
#}
}
self.noise_models = {"Student_t_default": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [self.var],
"constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
},
"laplace": True
},
"Student_t_1_var": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [1.0],
"constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
},
"laplace": True
},
"Student_t_small_deg_free": {
"model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [self.var],
"constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
},
"laplace": True
},
"Student_t_small_var": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [0.001],
"constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
},
"laplace": True
},
"Student_t_large_var": {
"model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [10.0],
"constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
},
"laplace": True
},
"Student_t_approx_gauss": {
"model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var),
"grad_params": {
"names": [".*t_scale2"],
"vals": [self.var],
"constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
},
"laplace": True
},
#"Student_t_log": {
#"model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var),
#"grad_params": {
#"names": [".*t_noise"],
#"vals": [self.var],
#"constraints": [(".*t_noise", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
#},
#"laplace": True
#},
"Gaussian_default": {
"model": GPy.likelihoods.Gaussian(variance=self.var),
"grad_params": {
"names": [".*variance"],
"vals": [self.var],
"constraints": [(".*variance", self.constrain_positive)]
},
"laplace": True,
"ep": False # FIXME: Should be True when we have it working again
},
"Gaussian_log": {
"model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var),
"grad_params": {
"names": [".*variance"],
"vals": [self.var],
"constraints": [(".*variance", self.constrain_positive)]
},
"laplace": True
},
#"Gaussian_probit": {
#"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
#"grad_params": {
#"names": ["noise_model_variance"],
#"vals": [self.var],
#"constraints": [constrain_positive]
#},
#"laplace": True
#},
#"Gaussian_log_ex": {
#"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
#"grad_params": {
#"names": ["noise_model_variance"],
#"vals": [self.var],
#"constraints": [constrain_positive]
#},
#"laplace": True
#},
"Bernoulli_default": {
"model": GPy.likelihoods.Bernoulli(),
"link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)],
"laplace": True,
"Y": self.binary_Y,
"ep": False # FIXME: Should be True when we have it working again
},
"Exponential_default": {
"model": GPy.likelihoods.Exponential(),
"link_f_constraints": [self.constrain_positive],
"Y": self.positive_Y,
"laplace": True,
},
"Poisson_default": {
"model": GPy.likelihoods.Poisson(),
"link_f_constraints": [self.constrain_positive],
"Y": self.integer_Y,
"laplace": True,
"ep": False #Should work though...
},
#,
#GAMMA needs some work!"Gamma_default": {
#"model": GPy.likelihoods.Gamma(),
#"link_f_constraints": [constrain_positive],
#"Y": self.positive_Y,
#"laplace": True
#}
}
for name, attributes in noise_models.iteritems():
####################################################
# Constraint wrappers so we can just list them off #
####################################################
def constrain_fixed(self, regex, model):
model[regex].constrain_fixed()
def constrain_negative(self, regex, model):
model[regex].constrain_negative()
def constrain_positive(self, regex, model):
model[regex].constrain_positive()
def constrain_fixed_below(self, regex, model, up_to):
model[regex][0:up_to].constrain_fixed()
def constrain_fixed_above(self, regex, model, above):
model[regex][above:].constrain_fixed()
def constrain_bounded(self, regex, model, lower, upper):
"""
Used like: partial(constrain_bounded, lower=0, upper=1)
"""
model[regex].constrain_bounded(lower, upper)
def tearDown(self):
self.Y = None
self.f = None
self.X = None
def test_scale2_models(self):
self.setUp()
for name, attributes in self.noise_models.iteritems():
model = attributes["model"]
if "grad_params" in attributes:
params = attributes["grad_params"]
@ -290,7 +309,7 @@ class TestNoiseModels(object):
param_vals = []
param_names = []
constrain_positive = []
param_constraints = [] # ??? TODO: Saul to Fix.
param_constraints = []
if "link_f_constraints" in attributes:
link_f_constraints = attributes["link_f_constraints"]
else:
@ -303,6 +322,10 @@ class TestNoiseModels(object):
f = attributes["f"].copy()
else:
f = self.f.copy()
if "Y_metadata" in attributes:
Y_metadata = attributes["Y_metadata"].copy()
else:
Y_metadata = None
if "laplace" in attributes:
laplace = attributes["laplace"]
else:
@ -317,30 +340,30 @@ class TestNoiseModels(object):
#Required by all
#Normal derivatives
yield self.t_logpdf, model, Y, f
yield self.t_dlogpdf_df, model, Y, f
yield self.t_d2logpdf_df2, model, Y, f
yield self.t_logpdf, model, Y, f, Y_metadata
yield self.t_dlogpdf_df, model, Y, f, Y_metadata
yield self.t_d2logpdf_df2, model, Y, f, Y_metadata
#Link derivatives
yield self.t_dlogpdf_dlink, model, Y, f, link_f_constraints
yield self.t_d2logpdf_dlink2, model, Y, f, link_f_constraints
yield self.t_dlogpdf_dlink, model, Y, f, Y_metadata, link_f_constraints
yield self.t_d2logpdf_dlink2, model, Y, f, Y_metadata, link_f_constraints
if laplace:
#Laplace only derivatives
yield self.t_d3logpdf_df3, model, Y, f
yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
yield self.t_d3logpdf_df3, model, Y, f, Y_metadata
yield self.t_d3logpdf_dlink3, model, Y, f, Y_metadata, link_f_constraints
#Params
yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_names, param_constraints
yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_names, param_constraints
yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_names, param_constraints
yield self.t_dlogpdf_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
yield self.t_dlogpdf_df_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
yield self.t_d2logpdf2_df2_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
#Link params
yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_names, param_constraints
yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_names, param_constraints
yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_names, param_constraints
yield self.t_dlogpdf_link_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
yield self.t_dlogpdf_dlink_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
#laplace likelihood gradcheck
yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
if ep:
#ep likelihood gradcheck
yield self.t_ep_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
self.tearDown()
@ -349,41 +372,41 @@ class TestNoiseModels(object):
# dpdf_df's #
#############
@with_setup(setUp, tearDown)
def t_logpdf(self, model, Y, f):
def t_logpdf(self, model, Y, f, Y_metadata):
print "\n{}".format(inspect.stack()[0][3])
print model
#print model._get_params()
np.testing.assert_almost_equal(
model.pdf(f.copy(), Y.copy()).prod(),
np.exp(model.logpdf(f.copy(), Y.copy()).sum())
model.pdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).prod(),
np.exp(model.logpdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).sum())
)
@with_setup(setUp, tearDown)
def t_dlogpdf_df(self, model, Y, f):
def t_dlogpdf_df(self, model, Y, f, Y_metadata):
print "\n{}".format(inspect.stack()[0][3])
self.description = "\n{}".format(inspect.stack()[0][3])
logpdf = functools.partial(np.sum(model.logpdf), y=Y)
dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
logpdf = functools.partial(np.sum(model.logpdf), y=Y, Y_metadata=Y_metadata)
dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g')
grad.randomize()
print model
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
def t_d2logpdf_df2(self, model, Y, f):
def t_d2logpdf_df2(self, model, Y, f, Y_metadata):
print "\n{}".format(inspect.stack()[0][3])
dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata)
d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g')
grad.randomize()
print model
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
def t_d3logpdf_df3(self, model, Y, f):
def t_d3logpdf_df3(self, model, Y, f, Y_metadata):
print "\n{}".format(inspect.stack()[0][3])
d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y)
d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata)
d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g')
grad.randomize()
print model
@ -393,32 +416,32 @@ class TestNoiseModels(object):
# df_dparams #
##############
@with_setup(setUp, tearDown)
def t_dlogpdf_dparams(self, model, Y, f, params, params_names, param_constraints):
def t_dlogpdf_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
print "\n{}".format(inspect.stack()[0][3])
print model
assert (
dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
params, params_names, args=(f, Y), constraints=param_constraints,
params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
def t_dlogpdf_df_dparams(self, model, Y, f, params, params_names, param_constraints):
def t_dlogpdf_df_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
print "\n{}".format(inspect.stack()[0][3])
print model
assert (
dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
params, params_names, args=(f, Y), constraints=param_constraints,
params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
def t_d2logpdf2_df2_dparams(self, model, Y, f, params, params_names, param_constraints):
def t_d2logpdf2_df2_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
print "\n{}".format(inspect.stack()[0][3])
print model
assert (
dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
params, params_names, args=(f, Y), constraints=param_constraints,
params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@ -426,10 +449,10 @@ class TestNoiseModels(object):
# dpdf_dlink's #
################
@with_setup(setUp, tearDown)
def t_dlogpdf_dlink(self, model, Y, f, link_f_constraints):
def t_dlogpdf_dlink(self, model, Y, f, Y_metadata, link_f_constraints):
print "\n{}".format(inspect.stack()[0][3])
logpdf = functools.partial(model.logpdf_link, y=Y)
dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
logpdf = functools.partial(model.logpdf_link, y=Y, Y_metadata=Y_metadata)
dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g')
#Apply constraints to link_f values
@ -442,10 +465,10 @@ class TestNoiseModels(object):
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
def t_d2logpdf_dlink2(self, model, Y, f, link_f_constraints):
def t_d2logpdf_dlink2(self, model, Y, f, Y_metadata, link_f_constraints):
print "\n{}".format(inspect.stack()[0][3])
dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata)
d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g')
#Apply constraints to link_f values
@ -458,10 +481,10 @@ class TestNoiseModels(object):
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
def t_d3logpdf_dlink3(self, model, Y, f, link_f_constraints):
def t_d3logpdf_dlink3(self, model, Y, f, Y_metadata, link_f_constraints):
print "\n{}".format(inspect.stack()[0][3])
d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y)
d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata)
d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g')
#Apply constraints to link_f values
@ -477,32 +500,32 @@ class TestNoiseModels(object):
# dlink_dparams #
#################
@with_setup(setUp, tearDown)
def t_dlogpdf_link_dparams(self, model, Y, f, params, param_names, param_constraints):
def t_dlogpdf_link_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
print "\n{}".format(inspect.stack()[0][3])
print model
assert (
dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
params, param_names, args=(f, Y), constraints=param_constraints,
params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_names, param_constraints):
def t_dlogpdf_dlink_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
print "\n{}".format(inspect.stack()[0][3])
print model
assert (
dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
params, param_names, args=(f, Y), constraints=param_constraints,
params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_names, param_constraints):
def t_d2logpdf2_dlink2_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
print "\n{}".format(inspect.stack()[0][3])
print model
assert (
dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
params, param_names, args=(f, Y), constraints=param_constraints,
params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@ -510,14 +533,15 @@ class TestNoiseModels(object):
# laplace test #
################
@with_setup(setUp, tearDown)
def t_laplace_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
def t_laplace_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints):
print "\n{}".format(inspect.stack()[0][3])
#Normalize
Y = Y/Y.max()
white_var = 1e-6
white_var = 1e-5
kernel = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
laplace_likelihood = GPy.inference.latent_function_inference.Laplace()
m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, inference_method=laplace_likelihood)
m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, Y_metadata=Y_metadata, inference_method=laplace_likelihood)
m['.*white'].constrain_fixed(white_var)
#Set constraints
@ -526,6 +550,7 @@ class TestNoiseModels(object):
print m
m.randomize()
m.randomize()
#Set params
for param_num in range(len(param_names)):
@ -545,14 +570,15 @@ class TestNoiseModels(object):
# EP test #
###########
@with_setup(setUp, tearDown)
def t_ep_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
def t_ep_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints):
print "\n{}".format(inspect.stack()[0][3])
#Normalize
Y = Y/Y.max()
white_var = 1e-6
kernel = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
ep_inf = GPy.inference.latent_function_inference.EP()
m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, inference_method=ep_inf)
m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, Y_metadata=Y_metadata, inference_method=ep_inf)
m['.*white'].constrain_fixed(white_var)
for param_num in range(len(param_names)):
@ -571,8 +597,8 @@ class LaplaceTests(unittest.TestCase):
"""
def setUp(self):
self.N = 5
self.D = 3
self.N = 15
self.D = 1
self.X = np.random.rand(self.N, self.D)*10
self.real_std = 0.1
@ -636,20 +662,20 @@ class LaplaceTests(unittest.TestCase):
exact_inf = GPy.inference.latent_function_inference.ExactGaussianInference()
m1 = GPy.core.GP(X, Y.copy(), kernel=kernel1, likelihood=gauss_distr1, inference_method=exact_inf)
m1['.*white'].constrain_fixed(1e-6)
m1['.*rbf.variance'] = initial_var_guess
m1['.*rbf.variance'].constrain_bounded(1e-4, 10)
m1['.*Gaussian_noise.variance'].constrain_bounded(1e-4, 10)
m1.randomize()
gauss_distr2 = GPy.likelihoods.Gaussian(variance=initial_var_guess)
laplace_inf = GPy.inference.latent_function_inference.Laplace()
m2 = GPy.core.GP(X, Y.copy(), kernel=kernel2, likelihood=gauss_distr2, inference_method=laplace_inf)
m2['.*white'].constrain_fixed(1e-6)
m2['.*rbf.variance'].constrain_bounded(1e-4, 10)
m2['.*Gaussian_noise.variance'].constrain_bounded(1e-4, 10)
m2.randomize()
if debug:
print m1
print m2
optimizer = 'scg'
print "Gaussian"
m1.optimize(optimizer, messages=debug, ipython_notebook=False)
@ -687,8 +713,6 @@ class LaplaceTests(unittest.TestCase):
pb.scatter(X, m1.likelihood.Y, c='g')
pb.scatter(X, m2.likelihood.Y, c='r', marker='x')
#Check Y's are the same
np.testing.assert_almost_equal(m1.Y, m2.Y, decimal=5)
#Check marginals are the same

View file

@ -0,0 +1,72 @@
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy
class MappingGradChecker(GPy.core.Model):
    """
    A helper model for gradient-checking a mapping.

    The (pseudo) log-likelihood is a fixed random weighted sum of the
    mapping's outputs, so `checkgrad` can verify the gradients with
    respect to both the mapping's parameters and the input X.
    """
    def __init__(self, mapping, X, name='map_grad_check'):
        super(MappingGradChecker, self).__init__(name)
        self.mapping = mapping
        self.link_parameter(self.mapping)
        self.X = GPy.core.Param('X', X)
        self.link_parameter(self.X)
        # fixed random weights that define the scalar objective
        self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)

    def log_likelihood(self):
        # weighted sum of the mapped outputs
        weighted_outputs = self.mapping.f(self.X) * self.dL_dY
        return np.sum(weighted_outputs)

    def parameters_changed(self):
        # push the fixed weights back through the mapping to get gradients
        self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
        self.mapping.update_gradients(self.dL_dY, self.X)
class MappingTests(unittest.TestCase):
    """Gradient checks for the various mapping classes."""

    def _assert_grads_ok(self, mapping, X):
        # every test reduces to a checkgrad on the shared helper model
        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())

    def test_kernelmapping(self):
        inputs = np.random.randn(100, 3)
        inducing = np.random.randn(10, 3)
        self._assert_grads_ok(GPy.mappings.Kernel(3, 2, inducing, GPy.kern.RBF(3)), inputs)

    def test_linearmapping(self):
        mapping = GPy.mappings.Linear(3, 2)
        inputs = np.random.randn(100, 3)
        self._assert_grads_ok(mapping, inputs)

    def test_mlpmapping(self):
        mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
        inputs = np.random.randn(100, 3)
        self._assert_grads_ok(mapping, inputs)

    def test_addmapping(self):
        # sum of an MLP and a linear map
        first = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
        second = GPy.mappings.Linear(input_dim=3, output_dim=2)
        mapping = GPy.mappings.Additive(first, second)
        inputs = np.random.randn(100, 3)
        self._assert_grads_ok(mapping, inputs)

    def test_compoundmapping(self):
        # composition: MLP followed by a kernel mapping
        first = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
        inducing = np.random.randn(10, 2)
        second = GPy.mappings.Kernel(2, 4, inducing, GPy.kern.RBF(2))
        mapping = GPy.mappings.Compound(first, second)
        inputs = np.random.randn(100, 3)
        self._assert_grads_ok(mapping, inputs)
if __name__ == "__main__":
    # Allow running this test module directly; the full suite can be slow.
    print "Running unit tests, please be (very) patient..."
    unittest.main()

View file

@ -0,0 +1,56 @@
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy
class MFtests(unittest.TestCase):
    """Gradient checks for GPs equipped with mean functions."""

    def test_simple_mean_function(self):
        """
        The simplest possible mean function. No parameters, just a simple Sinusoid.
        """
        # FIX: the original was `def simple_mean_function():` -- it lacked `self`
        # (calling it raised TypeError, since the body uses self.assertTrue) and
        # the `test_` prefix (so unittest never discovered or ran it).
        #create simple mean function
        mf = GPy.core.Mapping(1, 1)
        mf.f = np.sin
        mf.update_gradients = lambda a, b: None  # no parameters to update

        X = np.linspace(0, 10, 50).reshape(-1, 1)
        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
        k = GPy.kern.RBF(1)
        lik = GPy.likelihoods.Gaussian()
        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
        self.assertTrue(m.checkgrad())

    def test_parametric_mean_function(self):
        """
        A linear mean function with parameters that we'll learn alongside the kernel
        """
        X = np.linspace(0, 10, 50).reshape(-1, 1)
        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
        mf = GPy.mappings.Linear(1, 1)
        k = GPy.kern.RBF(1)
        lik = GPy.likelihoods.Gaussian()
        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
        self.assertTrue(m.checkgrad())

    def test_svgp_mean_function(self):
        # an instance of the SVGP with a mean function
        X = np.linspace(0, 10, 500).reshape(-1, 1)
        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
        Y = np.where(Y > 0, 1, 0)  # make a classification problem
        mf = GPy.mappings.Linear(1, 1)
        Z = np.linspace(0, 10, 50).reshape(-1, 1)
        lik = GPy.likelihoods.Bernoulli()
        k = GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
        m = GPy.core.SVGP(X, Y, Z=Z, kernel=k, likelihood=lik, mean_function=mf)
        self.assertTrue(m.checkgrad())

18
GPy/testing/misc_tests.py Normal file
View file

@ -0,0 +1,18 @@
import numpy as np
import scipy as sp
import GPy
class MiscTests(np.testing.TestCase):
    """
    Testing some utilities of misc
    """
    def setUp(self):
        # largest representable float64 and the largest exponent exp() accepts
        self._lim_val = np.finfo(np.float64).max
        self._lim_val_exp = np.log(self._lim_val)

    def test_safe_exp_upper(self):
        over_limit = self._lim_val_exp + 1
        # plain exp overflows above the limit, safe_exp must stay finite
        assert np.exp(over_limit) == np.inf
        assert GPy.util.misc.safe_exp(over_limit) < np.inf

    def test_safe_exp_lower(self):
        # small arguments pass through without overflow
        assert GPy.util.misc.safe_exp(1e-10) < np.inf

54
GPy/testing/svgp_tests.py Normal file
View file

@ -0,0 +1,54 @@
import numpy as np
import scipy as sp
import GPy
class SVGP_nonconvex(np.testing.TestCase):
    """
    Inference in the SVGP with a student-T likelihood
    """
    def setUp(self):
        inputs = np.linspace(0, 10, 100).reshape(-1, 1)
        inducing = np.linspace(0, 10, 10).reshape(-1, 1)
        targets = np.sin(inputs) + np.random.randn(*inputs.shape)*0.1
        targets[50] += 3  # inject an outlier; the Student-t should be robust to it
        lik = GPy.likelihoods.StudentT(deg_free=2)
        kern = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
        self.m = GPy.core.SVGP(inputs, targets, Z=inducing, likelihood=lik, kernel=kern)

    def test_grad(self):
        # gradient check of the variational objective
        assert self.m.checkgrad(step=1e-4)
class SVGP_classification(np.testing.TestCase):
    """
    Inference in the SVGP with a Bernoulli likelihood
    """
    def setUp(self):
        inputs = np.linspace(0, 10, 100).reshape(-1, 1)
        inducing = np.linspace(0, 10, 10).reshape(-1, 1)
        # threshold a noisy sinusoid into binary labels
        labels = np.where((np.sin(inputs) + np.random.randn(*inputs.shape)*0.1) > 0, 1, 0)
        lik = GPy.likelihoods.Bernoulli()
        kern = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
        self.m = GPy.core.SVGP(inputs, labels, Z=inducing, likelihood=lik, kernel=kern)

    def test_grad(self):
        # gradient check of the variational objective
        assert self.m.checkgrad(step=1e-4)
class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
    """
    Inference in the SVGP with a Poisson likelihood and a linear mean function
    """
    def setUp(self):
        X = np.linspace(0,10,100).reshape(-1,1)
        Z = np.linspace(0,10,10).reshape(-1,1)
        # NOTE(review): `0.1*X * 0.05*X**2` multiplies the two terms (== 0.005*X**3);
        # possibly `0.1*X + 0.05*X**2` was intended -- confirm.
        latent_f = np.exp(0.1*X * 0.05*X**2)
        # draw Poisson counts with the latent function as the rate
        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
        mf = GPy.mappings.Linear(1,1)
        lik = GPy.likelihoods.Poisson()
        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
    def test_grad(self):
        # gradient check of the variational objective
        assert self.m.checkgrad(step=1e-4)

View file

@ -17,6 +17,54 @@ def get_blocks(A, blocksizes):
count_i += i
return B
def get_block_shapes(B):
    """Return the size (rows) of each diagonal block of the block matrix B."""
    assert B.dtype is np.dtype('object'), "Must be a block matrix"
    sizes = []
    for diag_ind in range(0, B.shape[0]):
        sizes.append(B[diag_ind, diag_ind].shape[0])
    return sizes
def unblock(B):
    """Reassemble a block matrix (object-dtype array of blocks) into one dense array."""
    assert B.dtype is np.dtype('object'), "Must be a block matrix"
    sizes = get_block_shapes(B)
    total = np.sum(sizes)
    A = np.empty(shape=(total, total))
    # copy each block into its offset position in the dense output
    row_off = 0
    for bi, rows in enumerate(sizes):
        col_off = 0
        for bj, cols in enumerate(sizes):
            A[row_off:row_off + rows, col_off:col_off + cols] = B[bi, bj]
            col_off += cols
        row_off += rows
    return A
def block_dot(A, B):
    """
    Element wise dot product on block matrices

    +------+------+   +------+------+   +-------+-------+
    |      |      |   |      |      |   |A11.B11|A12.B12|
    | A11  | A12  |   | B11  | B12  |   |       |       |
    +------+------+ o +------+------| = +-------+-------+
    |      |      |   |      |      |   |A21.B21|A22.B22|
    | A21  | A22  |   | B21  | B22  |   |       |       |
    +-------------+   +------+------+   +-------+-------+

    ..Note
        If either (A or B) of the diagonal matrices are stored as vectors then a more
        efficient dot product using numpy broadcasting will be used, i.e. A11*B11
    """
    #Must have same number of blocks and be a block matrix
    assert A.dtype is np.dtype('object'), "Must be a block matrix"
    assert B.dtype is np.dtype('object'), "Must be a block matrix"
    Ashape = A.shape
    Bshape = B.shape
    assert Ashape == Bshape
    # NOTE(review): `f` tests Ashape/Bshape -- the shape of the *container* of
    # blocks, not of the individual blocks it receives, so for a square block
    # layout every pair is multiplied elementwise. The note above suggests the
    # per-block shapes were intended here -- confirm.
    def f(A,B):
        if Ashape[0] == Ashape[1] or Bshape[0] == Bshape[1]:
            #FIXME: Careful if one is transpose of other, would make a matrix
            return A*B
        else:
            return np.dot(A,B)
    # NOTE(review): `np.object` is removed in NumPy >= 1.24; plain `object` is
    # the forward-compatible spelling.
    dot = np.vectorize(f, otypes = [np.object])
    return dot(A,B)
if __name__=='__main__':
@ -24,3 +72,5 @@ if __name__=='__main__':
B = get_blocks(A,[2,3])
B[0,0] += 7
print B
assert np.all(unblock(B) == A)

View file

@ -96,16 +96,21 @@ def jitchol(A, maxtries=5):
num_tries = 1
while num_tries <= maxtries and np.isfinite(jitter):
try:
print jitter
L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter))
return L
except:
jitter *= 10
finally:
num_tries += 1
raise linalg.LinAlgError, "not positive definite, even with jitter."
import traceback
logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter),
' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
raise linalg.LinAlgError, "not positive definite, even with jitter."
try: raise
except:
logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
import ipdb;ipdb.set_trace()
return L
# def dtrtri(L, lower=1):
# """

View file

@ -4,6 +4,16 @@
import numpy as np
from config import *
_lim_val = np.finfo(np.float64).max
_lim_val_exp = np.log(_lim_val)
_lim_val_square = np.sqrt(_lim_val)
_lim_val_cube = np.power(_lim_val, -3)
def safe_exp(f):
clip_f = np.clip(f, -np.inf, _lim_val_exp)
return np.exp(clip_f)
def chain_1(df_dg, dg_dx):
    """
    Generic chaining function for first derivative

    .. math::
        \\frac{d(f . g)}{dx} = \\frac{df}{dg} \\frac{dg}{dx}

    :param df_dg: outer derivative evaluated at g(x)
    :param dg_dx: inner derivative
    """
    if np.all(dg_dx == 1.):
        # Identity inner transform: the chain rule collapses to df/dg.
        return df_dg
    if len(df_dg) > 1 and df_dg.shape[-1] > 1:
        # Leftover `import ipdb; ipdb.set_trace()` breakpoint removed here.
        raise NotImplementedError('Not implemented for matricies yet')
    return df_dg * dg_dx
def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
    """
    Generic chaining function for second derivative

    .. math::
        \\frac{d^{2}(f . g)}{dx^{2}} = \\frac{d^{2}f}{dg^{2}}(\\frac{dg}{dx})^{2} + \\frac{df}{dg}\\frac{d^{2}g}{dx^{2}}

    :param d2f_dg2: second outer derivative evaluated at g(x)
    :param dg_dx: first inner derivative
    :param df_dg: first outer derivative evaluated at g(x)
    :param d2g_dx2: second inner derivative
    """
    if np.all(dg_dx == 1.) and np.all(d2g_dx2 == 0):
        # Identity inner transform: only the first term survives, dg_dx == 1.
        return d2f_dg2
    if len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1:
        raise NotImplementedError('Not implemented for matricies yet')
    dg_dx_2 = dg_dx**2
    return d2f_dg2*(dg_dx_2) + df_dg*d2g_dx2
def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
    """
    Generic chaining function for third derivative

    .. math::
        \\frac{d^{3}(f . g)}{dx^{3}} = \\frac{d^{3}f}{dg^{3}}(\\frac{dg}{dx})^{3} + 3\\frac{d^{2}f}{dg^{2}}\\frac{dg}{dx}\\frac{d^{2}g}{dx^{2}} + \\frac{df}{dg}\\frac{d^{3}g}{dx^{3}}

    :param d3f_dg3: third outer derivative evaluated at g(x)
    :param dg_dx: first inner derivative
    :param d2f_dg2: second outer derivative evaluated at g(x)
    :param d2g_dx2: second inner derivative
    :param df_dg: first outer derivative evaluated at g(x)
    :param d3g_dx3: third inner derivative
    """
    if np.all(dg_dx == 1.) and np.all(d2g_dx2 == 0) and np.all(d3g_dx3 == 0):
        # Identity inner transform: only the first term survives, dg_dx == 1.
        return d3f_dg3
    if ((len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1)
            or (len(d3f_dg3) > 1 and d3f_dg3.shape[-1] > 1)):
        raise NotImplementedError('Not implemented for matricies yet')
    dg_dx_3 = dg_dx**3
    return d3f_dg3*(dg_dx_3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
def opt_wrapper(m, **kwargs):
    """
    This function just wraps the optimization procedure of a GPy
    object so that optimize() is pickleable (necessary for multiprocessing).

    :param m: object exposing an ``optimize`` method (e.g. a GPy model)
    :param kwargs: forwarded verbatim to ``m.optimize``
    """
    m.optimize(**kwargs)
@ -96,3 +124,47 @@ from :class:ndarray)"""
if len(param) == 1:
return param[0].view(np.ndarray)
return [x.view(np.ndarray) for x in param]
def blockify_hessian(func):
    """
    Decorator: when the instance flags itself as not truly block-structured
    (``self.not_block_really``) and *func* returned a non-square (per-datum)
    array, expand it into a square diagonal matrix via ``np.diagflat``.
    Otherwise the return value is passed through untouched.
    """
    from functools import wraps

    @wraps(func)  # preserve the wrapped method's name/docstring
    def wrapper_func(self, *args, **kwargs):
        # Invoke the wrapped function first
        retval = func(self, *args, **kwargs)
        # Now do something here with retval and/or action
        if self.not_block_really and (retval.shape[0] != retval.shape[1]):
            return np.diagflat(retval)
        else:
            return retval
    return wrapper_func
def blockify_third(func):
    """
    Decorator: when the instance flags itself as not truly block-structured
    (``self.not_block_really``) and *func* returned fewer than 3 dimensions,
    place the per-datum values on the super-diagonal of an (N, N, N) tensor.
    Otherwise the return value is passed through untouched.
    """
    from functools import wraps

    @wraps(func)  # preserve the wrapped method's name/docstring
    def wrapper_func(self, *args, **kwargs):
        # Invoke the wrapped function first
        retval = func(self, *args, **kwargs)
        # Now do something here with retval and/or action
        if self.not_block_really and (len(retval.shape) < 3):
            num_data = retval.shape[0]
            d3_block_cache = np.zeros((num_data, num_data, num_data))
            diag_slice = np.arange(num_data)
            d3_block_cache[diag_slice, diag_slice, diag_slice] = np.squeeze(retval)
            return d3_block_cache
        else:
            return retval
    return wrapper_func
def blockify_dhess_dtheta(func):
    """
    Decorator: when the instance flags itself as not truly block-structured
    (``self.not_block_really``) and *func* returned a 2-D (N, num_params)
    array, expand each parameter's column onto the diagonal of an
    (N, N, num_params) tensor. Otherwise the return value passes through.
    """
    from functools import wraps

    @wraps(func)  # preserve the wrapped method's name/docstring
    def wrapper_func(self, *args, **kwargs):
        # Invoke the wrapped function first
        retval = func(self, *args, **kwargs)
        # Now do something here with retval and/or action
        if self.not_block_really and (len(retval.shape) < 3):
            num_data = retval.shape[0]
            num_params = retval.shape[-1]
            dhess_dtheta = np.zeros((num_data, num_data, num_params))
            diag_slice = np.arange(num_data)
            for param_ind in range(num_params):
                dhess_dtheta[diag_slice, diag_slice, param_ind] = np.squeeze(retval[:, param_ind])
            return dhess_dtheta
        else:
            return retval
    return wrapper_func

View file

@ -1,17 +1,18 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from GPy.core.parameterization import Parameterized, Param
from ..core.parameterization.transformations import Logexp
class WarpingFunction(object):
class WarpingFunction(Parameterized):
"""
abstract function for warping
z = f(y)
"""
def __init__(self):
raise NotImplementedError
def __init__(self, name):
super(WarpingFunction, self).__init__(name=name)
def f(self,y,psi):
"""function transformation
@ -34,9 +35,10 @@ class WarpingFunction(object):
def _get_param_names(self):
raise NotImplementedError
def plot(self, psi, xmin, xmax):
def plot(self, xmin, xmax):
psi = self.psi
y = np.arange(xmin, xmax, 0.01)
f_y = self.f(y, psi)
f_y = self.f(y)
from matplotlib import pyplot as plt
plt.figure()
plt.plot(y, f_y)
@ -50,6 +52,7 @@ class TanhWarpingFunction(WarpingFunction):
"""n_terms specifies the number of tanh terms to be used"""
self.n_terms = n_terms
self.num_parameters = 3 * self.n_terms
super(TanhWarpingFunction, self).__init__(name='warp_tanh')
def f(self,y,psi):
"""
@ -163,8 +166,18 @@ class TanhWarpingFunction_d(WarpingFunction):
"""n_terms specifies the number of tanh terms to be used"""
self.n_terms = n_terms
self.num_parameters = 3 * self.n_terms + 1
self.psi = np.ones((self.n_terms, 3))
def f(self,y,psi):
super(TanhWarpingFunction_d, self).__init__(name='warp_tanh')
self.psi = Param('psi', self.psi)
self.psi[:, :2].constrain_positive()
self.d = Param('%s' % ('d'), 1.0, Logexp())
self.link_parameter(self.psi)
self.link_parameter(self.d)
def f(self,y):
"""
Transform y with f using parameter vector psi
psi = [[a,b,c]]
@ -175,9 +188,9 @@ class TanhWarpingFunction_d(WarpingFunction):
#1. check that number of params is consistent
# assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
# assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
mpsi = psi.copy()
d = psi[-1]
mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
d = self.d
mpsi = self.psi
#3. transform data
z = d*y.copy()
@ -187,7 +200,7 @@ class TanhWarpingFunction_d(WarpingFunction):
return z
def f_inv(self, z, psi, max_iterations=1000, y=None):
def f_inv(self, z, max_iterations=1000, y=None):
"""
calculate the numerical inverse of f
@ -198,12 +211,12 @@ class TanhWarpingFunction_d(WarpingFunction):
z = z.copy()
if y is None:
y = np.ones_like(z)
it = 0
update = np.inf
while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
update = (self.f(y, psi) - z)/self.fgrad_y(y, psi)
update = (self.f(y) - z)/self.fgrad_y(y)
y -= update
it += 1
if it == max_iterations:
@ -212,7 +225,7 @@ class TanhWarpingFunction_d(WarpingFunction):
return y
def fgrad_y(self, y, psi, return_precalc = False):
def fgrad_y(self, y,return_precalc = False):
"""
gradient of f w.r.t to y ([N x 1])
@ -221,9 +234,8 @@ class TanhWarpingFunction_d(WarpingFunction):
"""
mpsi = psi.copy()
d = psi[-1]
mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
d = self.d
mpsi = self.psi
# vectorized version
@ -240,7 +252,7 @@ class TanhWarpingFunction_d(WarpingFunction):
return GRAD
def fgrad_y_psi(self, y, psi, return_covar_chain = False):
def fgrad_y_psi(self, y, return_covar_chain = False):
"""
gradient of f w.r.t to y and psi
@ -248,10 +260,10 @@ class TanhWarpingFunction_d(WarpingFunction):
"""
mpsi = psi.copy()
mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
mpsi = self.psi
w, s, r, d = self.fgrad_y(y, return_precalc = True)
gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
for i in range(len(mpsi)):