diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 368ecd9f..bbd3939b 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -6,6 +6,7 @@ import sys from .. import kern from .model import Model from .parameterization import ObsAr +from .mapping import Mapping from .. import likelihoods from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation from .parameterization.variational import VariationalPosterior @@ -34,7 +35,7 @@ class GP(Model): """ - def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False): + def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False): super(GP, self).__init__(name) assert X.ndim == 2 @@ -75,6 +76,15 @@ class GP(Model): assert isinstance(likelihood, likelihoods.Likelihood) self.likelihood = likelihood + #handle the mean function + self.mean_function = mean_function + if mean_function is not None: + assert isinstance(self.mean_function, Mapping) + assert mean_function.input_dim == self.input_dim + assert mean_function.output_dim == self.output_dim + self.link_parameter(mean_function) + + #find a sensible inference method logger.info("initializing inference method") if inference_method is None: @@ -153,9 +163,11 @@ class GP(Model): This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call this method yourself, there may be unexpected consequences. """ - self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata) + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata) self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X) + if self.mean_function is not None: + self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X) def log_likelihood(self): """ @@ -192,6 +204,10 @@ class GP(Model): #force mu to be a column vector if len(mu.shape)==1: mu = mu[:,None] + + #add the mean function in + if self.mean_function is not None: + mu += self.mean_function.f(_Xnew) return mu, var def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None): @@ -241,12 +257,14 @@ class GP(Model): def predictive_gradients(self, Xnew): """ - Compute the derivatives of the latent function with respect to X* + Compute the derivatives of the predicted latent function with respect to X* Given a set of points at which to predict X* (size [N*,Q]), compute the derivatives of the mean and variance. Resulting arrays are sized: dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one). + Note that this is not the same as computing the mean and variance of the derivative of the function!
+ dv_dX* -- [N*, Q], (since all outputs have the same variance) :param X: The points at which to get the predictive gradients :type X: np.ndarray (Xnew x self.input_dim) diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py index dd45a26e..30614384 100644 --- a/GPy/core/mapping.py +++ b/GPy/core/mapping.py @@ -3,7 +3,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import sys -from parameterization import Parameterized +from .parameterization import Parameterized import numpy as np class Mapping(Parameterized): diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index 842183fb..257b683f 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -50,31 +50,29 @@ class SpikeAndSlabPrior(VariationalPrior): def KL_divergence(self, variational_posterior): mu = variational_posterior.mean S = variational_posterior.variance - gamma,gamma1 = variational_posterior.gamma_probabilities() - log_gamma,log_gamma1 = variational_posterior.gamma_log_prob() + gamma = variational_posterior.gamma.values if len(self.pi.shape)==2: - idx = np.unique(gamma._raveled_index()/gamma.shape[-1]) + idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1]) pi = self.pi[idx] else: pi = self.pi var_mean = np.square(mu)/self.variance var_S = (S/self.variance - np.log(S)) - var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum() + var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum() return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2. def update_gradients_KL(self, variational_posterior): mu = variational_posterior.mean S = variational_posterior.variance - gamma,gamma1 = variational_posterior.gamma_probabilities() - log_gamma,log_gamma1 = variational_posterior.gamma_log_prob() + gamma = variational_posterior.gamma.values if len(self.pi.shape)==2: - idx = np.unique(gamma._raveled_index()/gamma.shape[-1]) + idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1]) pi = self.pi[idx] else: pi = self.pi - variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1 + variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2. mu.gradient -= gamma*mu/self.variance S.gradient -= (1./self.variance - 1./S) * gamma /2. if self.learnPi: @@ -162,24 +160,8 @@ class SpikeAndSlabPosterior(VariationalPosterior): binary_prob : the probability of the distribution on the slab part. 
""" super(SpikeAndSlabPosterior, self).__init__(means, variances, name) - self.gamma = Param("binary_prob",binary_prob) + self.gamma = Param("binary_prob",binary_prob,Logistic(0.,1.)) self.link_parameter(self.gamma) - - @Cache_this(limit=5) - def gamma_probabilities(self): - prob = np.zeros_like(param_to_array(self.gamma)) - prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710])) - prob1 = -np.zeros_like(param_to_array(self.gamma)) - prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710])) - return prob, prob1 - - @Cache_this(limit=5) - def gamma_log_prob(self): - loggamma = param_to_array(self.gamma).copy() - loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40])) - loggamma1 = -param_to_array(self.gamma).copy() - loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40])) - return loggamma,loggamma1 def set_gradients(self, grad): self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index b2db4222..35644bfe 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -19,7 +19,7 @@ class SparseGP(GP): This model allows (approximate) inference using variational DTC or FITC (Gaussian likelihoods) as well as non-conjugate sparse methods based on these. - + This is not for missing data, as the implementation for missing data involves some inefficient optimization routine decisions. See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'. @@ -39,7 +39,7 @@ class SparseGP(GP): """ - def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, + def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False): #pick a sensible inference method if inference_method is None: @@ -53,7 +53,7 @@ class SparseGP(GP): self.Z = Param('inducing inputs', Z) self.num_inducing = Z.shape[0] - GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer) + GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer) logger.info("Adding Z as parameter") self.link_parameter(self.Z, index=0) @@ -61,7 +61,7 @@ class SparseGP(GP): def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) - + def set_Z(self, Z, trigger_update=True): if trigger_update: self.update_model(False) self.unlink_parameter(self.Z) @@ -110,8 +110,8 @@ class SparseGP(GP): def _raw_predict(self, Xnew, full_cov=False, kern=None): """ - Make a prediction for the latent function values. - + Make a prediction for the latent function values. + For certain inputs we give back a full_cov of shape NxN, if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of, we take only the diagonal elements across N. 
@@ -136,6 +136,9 @@ class SparseGP(GP): else: Kxx = kern.Kdiag(Xnew) var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T + #add in the mean function + if self.mean_function is not None: + mu += self.mean_function.f(Xnew) else: psi0_star = self.kern.psi0(self.Z, Xnew) psi1_star = self.kern.psi1(self.Z, Xnew) @@ -165,4 +168,5 @@ class SparseGP(GP): var[i] = var_ else: var[i] = np.diag(var_)+p0-t2 + return mu, var diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py index 8330cc7e..fd48a7ab 100644 --- a/GPy/core/svgp.py +++ b/GPy/core/svgp.py @@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf class SVGP(SparseGP): - def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None): + def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None): """ Stochastic Variational GP. @@ -38,7 +38,7 @@ class SVGP(SparseGP): #create the SVI inference method inf_method = svgp_inf() - SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method, + SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method, name=name, Y_metadata=Y_metadata, normalizer=False) self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1]))) @@ -48,7 +48,7 @@ class SVGP(SparseGP): self.link_parameter(self.m) def parameters_changed(self): - self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0])) + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0])) #update the kernel gradients self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z) @@ -65,6 +65,13 @@ class SVGP(SparseGP): self.m.gradient = self.grad_dict['dL_dm'] self.chol.gradient = self.grad_dict['dL_dchol'] + if self.mean_function is not None: + self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X) + g = self.mean_function.gradient[:].copy() + self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z) + self.mean_function.gradient[:] += g + self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z) + def set_data(self, X, Y): """ Set the data without calling parameters_changed to avoid wasted computation diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index cf8205f9..267c6d1e 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -505,3 +505,48 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True): print(m) return m + +def simple_mean_function(max_iters=100, optimize=True, plot=True): + """ + The simplest possible mean function. No parameters, just a simple Sinusoid. 
+ """ + #create simple mean function + mf = GPy.core.Mapping(1,1) + mf.f = np.sin + mf.update_gradients = lambda a,b: None + + X = np.linspace(0,10,50).reshape(-1,1) + Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + + k =GPy.kern.RBF(1) + lik = GPy.likelihoods.Gaussian() + m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf) + if optimize: + m.optimize(max_iters=max_iters) + if plot: + m.plot(plot_limits=(-10,15)) + return m + +def parametric_mean_function(max_iters=100, optimize=True, plot=True): + """ + A linear mean function with parameters that we'll learn alongside the kernel + """ + #create simple mean function + mf = GPy.core.Mapping(1,1) + mf.f = np.sin + + X = np.linspace(0,10,50).reshape(-1,1) + Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X + + mf = GPy.mappings.Linear(1,1) + + k =GPy.kern.RBF(1) + lik = GPy.likelihoods.Gaussian() + m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf) + if optimize: + m.optimize(max_iters=max_iters) + if plot: + m.plot() + return m + + diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py index 95600a91..0aa990c1 100644 --- a/GPy/inference/latent_function_inference/dtc.py +++ b/GPy/inference/latent_function_inference/dtc.py @@ -20,7 +20,8 @@ class DTC(LatentFunctionInference): def __init__(self): self.const_jitter = 1e-6 - def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): + def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None): + assert mean_function is None, "inference with a mean function not implemented" assert X_variance is None, "cannot use X_variance with DTC. Try varDTC." num_inducing, _ = Z.shape @@ -88,7 +89,8 @@ class vDTC(object): def __init__(self): self.const_jitter = 1e-6 - def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata): + def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None): + assert mean_function is None, "inference with a mean function not implemented" assert X_variance is None, "cannot use X_variance with DTC. Try varDTC." num_inducing, _ = Z.shape diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index ebf59eaa..2a0a2592 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -36,11 +36,18 @@ class ExactGaussianInference(LatentFunctionInference): #print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!" 
return Y - def inference(self, kern, X, likelihood, Y, Y_metadata=None): + def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None): """ Returns a Posterior class containing essential quantities of the posterior """ - YYT_factor = self.get_YYTfactor(Y) + + if mean_function is None: + m = 0 + else: + m = mean_function.f(X) + + + YYT_factor = self.get_YYTfactor(Y-m) K = kern.K(X) @@ -56,4 +63,4 @@ class ExactGaussianInference(LatentFunctionInference): dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata) - return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL} + return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha} diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index a097c73d..85841a33 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -33,7 +33,8 @@ class EP(LatentFunctionInference): # TODO: update approximation in the end as well? Maybe even with a switch? pass - def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None): + def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None): + assert mean_function is None, "inference with a mean function not implemented" num_data, output_dim = Y.shape assert output_dim ==1, "ep in 1D only (for now!)" diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py index 86dcb691..e182c9f7 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py +++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py @@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference): self.old_mutilde, self.old_vtilde = None, None self._ep_approximation = None - def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): + def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None): + assert mean_function is None, "inference with a mean function not implemented" num_data, output_dim = Y.shape assert output_dim ==1, "ep in 1D only (for now!)" diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py index 7011aef8..f38eb52b 100644 --- a/GPy/inference/latent_function_inference/fitc.py +++ b/GPy/inference/latent_function_inference/fitc.py @@ -18,7 +18,8 @@ class FITC(LatentFunctionInference): """ const_jitter = 1e-6 - def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): + def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None): + assert mean_function is None, "inference with a mean function not implemented" num_inducing, _ = Z.shape num_data, output_dim = Y.shape diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py index 3410c798..c6921f57 100644 --- a/GPy/inference/latent_function_inference/laplace.py +++ b/GPy/inference/latent_function_inference/laplace.py @@ -39,10 +39,12 @@ class Laplace(LatentFunctionInference): self.first_run = True self._previous_Ki_fhat = None - def inference(self, kern, X, likelihood, Y, Y_metadata=None): + def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None): """ Returns a 
Posterior class containing essential quantities of the posterior """ + assert mean_function is None, "inference with a mean function not implemented" + # Compute K K = kern.K(X) diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py index ea608cce..fbd72f57 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -15,7 +15,7 @@ class Posterior(object): the function at any new point x_* by integrating over this posterior. """ - def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None): + def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0): """ woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M @@ -67,6 +67,7 @@ class Posterior(object): #option 2: self._mean = mean self._covariance = cov + self._prior_mean = prior_mean #compute this lazily self._precision = None @@ -175,7 +176,7 @@ class Posterior(object): $$ """ if self._woodbury_vector is None: - self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean) + self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean) return self._woodbury_vector @property diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py index 521ef20b..bad73a71 100644 --- a/GPy/inference/latent_function_inference/svgp.py +++ b/GPy/inference/latent_function_inference/svgp.py @@ -6,7 +6,8 @@ from .posterior import Posterior class SVGP(LatentFunctionInference): - def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0): + def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0): + num_inducing = Z.shape[0] num_data, num_outputs = Y.shape @@ -22,6 +23,15 @@ class SVGP(LatentFunctionInference): #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S)) #Si, Lnew, _,_ = linalg.pdinv(S) + #compute mean function stuff + if mean_function is not None: + prior_mean_u = mean_function.f(Z) + prior_mean_f = mean_function.f(X) + else: + prior_mean_u = np.zeros((num_inducing, num_outputs)) + prior_mean_f = np.zeros((num_data, num_outputs)) + + #compute kernel related stuff Kmm = kern.K(Z) Knm = kern.K(X, Z) @@ -30,17 +40,31 @@ class SVGP(LatentFunctionInference): #compute the marginal means and variances of q(f) A = np.dot(Knm, Kmmi) - mu = np.dot(A, q_u_mean) + mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u) v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1) #compute the KL term Kmmim = np.dot(Kmmi, q_u_mean) KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0) KL = KLs.sum() - dKL_dm = Kmmim + #gradient of the KL term (assuming zero mean function) + dKL_dm = Kmmim.copy() dKL_dS = 0.5*(Kmmi[:,:,None] - Si) dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T) + if mean_function is not None: + #adjust KL term for mean function + Kmmi_mfZ = np.dot(Kmmi, prior_mean_u) + KL += -np.sum(q_u_mean*Kmmi_mfZ) + KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u) + + #adjust gradient for mean fucntion + dKL_dm -= Kmmi_mfZ 
+ dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T) + dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T) + + #compute gradients for mean_function + dKL_dmfZ = Kmmi_mfZ - Kmmim #quadrature for the likelihood F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata) @@ -50,11 +74,9 @@ if dF_dthetaL is not None: dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale - #derivatives of expected likelihood + #derivatives of expected likelihood, assuming zero mean function Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal Admu = A.T.dot(dF_dmu) - #AdvA = np.einsum('ijk,jl->ilk', Adv, A) - #AdvA = np.dot(A.T, Adv).swapaxes(0,1) AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)]) tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi) dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T @@ -64,6 +86,14 @@ dF_dm = Admu dF_dS = AdvA + #adjust gradient to account for mean function + if mean_function is not None: + dF_dmfX = dF_dmu.copy() + dF_dmfZ = -Admu + dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T) + dF_dKmm += Admu.dot(Kmmi_mfZ.T) + + #sum (gradients of) expected likelihood and KL part log_marginal = F.sum() - KL dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn @@ -71,4 +101,8 @@ dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)]) dL_dchol = choleskies.triang_to_flat(dL_dchol) - return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL} + grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL} + if mean_function is not None: + grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ + grad_dict['dL_dmfX'] = dF_dmfX + return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index 6f98668f..c546a4a1 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -169,11 +169,13 @@ class VarDTC_minibatch(LatentFunctionInference): Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) - Lm = jitchol(Kmm, maxtries=100) + if not np.isfinite(Kmm).all(): + print(Kmm) + Lm = jitchol(Kmm) LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT - LL = jitchol(Lambda, maxtries=100) + LL = jitchol(Lambda) logdet_L = 2.*np.sum(np.log(np.diag(LL))) b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0] bbt = np.square(b).sum() diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py index 27a15aab..ff7cf140 100644 --- a/GPy/kern/_src/prod.py +++ b/GPy/kern/_src/prod.py @@ -7,6 +7,20 @@ from ...util.caching import Cache_this import itertools from functools import reduce + +def numpy_invalid_op_as_exception(func): + """ + A decorator that allows catching numpy invalid operations + as exceptions (the default behaviour is raising warnings).
+ """ + def func_wrapper(*args, **kwargs): + np.seterr(invalid='raise') + result = func(*args, **kwargs) + np.seterr(invalid='warn') + return result + return func_wrapper + + class Prod(CombinationKernel): """ Computes the product of 2 kernels @@ -47,18 +61,20 @@ class Prod(CombinationKernel): self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2) self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2) else: - k = self.K(X,X2)*dL_dK - for p in self.parts: - p.update_gradients_full(k/p.K(X,X2),X,X2) + for combination in itertools.combinations(self.parts, len(self.parts) - 1): + prod = reduce(np.multiply, [p.K(X, X2) for p in combination]) + to_update = list(set(self.parts) - set(combination))[0] + to_update.update_gradients_full(dL_dK * prod, X, X2) def update_gradients_diag(self, dL_dKdiag, X): if len(self.parts)==2: self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X) self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X) else: - k = self.Kdiag(X)*dL_dKdiag - for p in self.parts: - p.update_gradients_diag(k/p.Kdiag(X),X) + for combination in itertools.combinations(self.parts, len(self.parts) - 1): + prod = reduce(np.multiply, [p.Kdiag(X) for p in combination]) + to_update = list(set(self.parts) - set(combination))[0] + to_update.update_gradients_diag(dL_dKdiag * prod, X) def gradients_X(self, dL_dK, X, X2=None): target = np.zeros(X.shape) @@ -66,9 +82,10 @@ class Prod(CombinationKernel): target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2) target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2) else: - k = self.K(X,X2)*dL_dK - for p in self.parts: - target += p.gradients_X(k/p.K(X,X2),X,X2) + for combination in itertools.combinations(self.parts, len(self.parts) - 1): + prod = reduce(np.multiply, [p.K(X, X2) for p in combination]) + to_update = list(set(self.parts) - set(combination))[0] + target += to_update.gradients_X(dL_dK * prod, X, X2) return target def gradients_X_diag(self, dL_dKdiag, X): @@ -81,3 +98,5 @@ class Prod(CombinationKernel): for p in self.parts: target += p.gradients_X_diag(k/p.Kdiag(X),X) return target + + diff --git a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py index 5f261785..d431cd61 100644 --- a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py +++ b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py @@ -37,11 +37,11 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati # Compute for psi0 and psi1 mu2S = np.square(mu)+S - dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) - dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu) - dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z) - dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance) - dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu) + dL_dvar += (dL_dpsi0[:,None]*gamma*mu2S).sum(axis=0) + (dL_dpsi1.T.dot(gamma*mu)*Z).sum(axis=0) + dL_dgamma += dL_dpsi0[:,None]*variance*mu2S+ dL_dpsi1.dot(Z)*mu*variance + dL_dmu += dL_dpsi0[:,None]*2.*variance*gamma*mu + dL_dpsi1.dot(Z)*gamma*variance + dL_dS += dL_dpsi0[:,None]*variance*gamma + dL_dZ += dL_dpsi1.T.dot(gamma*mu)*variance return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma @@ -64,29 +64,23 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma): gamma2 = np.square(gamma) variance2 = 
np.square(variance) mu2S = mu2+S # NxQ - gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance) - common_sum = np.einsum('nq,mq->nm',gvm,Z) -# common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM - Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z) + gvm = gamma*mu*variance + common_sum = gvm.dot(Z.T) + Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0) + Z_expect_var2 = Z_expect*variance2 dL_dpsi2T = dL_dpsi2+dL_dpsi2.T - tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z) - common_expect = np.einsum('mq,nm->nq',tmp,common_sum) -# common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum) - Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum) - Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z) + common_expect = common_sum.dot(dL_dpsi2T).dot(Z) + Z2_expect = common_sum.dot(dL_dpsi2T) + Z1_expect = dL_dpsi2T.dot(Z) - dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\ - np.einsum('nq,nq,nq->q',common_expect,gamma,mu) + dL_dvar = variance*Z_expect*2.*(gamma*mu2S-gamma2*mu2).sum(axis=0)+(common_expect*gamma*mu).sum(axis=0) - dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\ - np.einsum('nq,q,nq->nq',common_expect,variance,mu) + dL_dgamma = Z_expect_var2*(mu2S-2.*gamma*mu2)+common_expect*mu*variance + + dL_dmu = Z_expect_var2*mu*2.*(gamma-gamma2) + common_expect*gamma*variance + + dL_dS = gamma*Z_expect_var2 - dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\ - np.einsum('nq,nq,q->nq',common_expect,gamma,variance) - - dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2) - -# dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum)) - dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect) + dL_dZ = (gamma*(mu2S-gamma*mu2)).sum(axis=0)*variance2*Z1_expect+ Z2_expect.T.dot(gamma*mu)*variance return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index 18a4d751..f6a24c86 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -22,12 +22,14 @@ try: # _psi1 NxM mu = variational_posterior.mean S = variational_posterior.variance + gamma = variational_posterior.binary_prob N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] l2 = np.square(lengthscale) log_denom1 = np.log(S/l2+1) log_denom2 = np.log(2*S/l2+1) - log_gamma,log_gamma1 = variational_posterior.gamma_log_prob() + log_gamma = np.log(gamma) + log_gamma1 = np.log(1.-gamma) variance = float(variance) psi0 = np.empty(N) psi0[:] = variance @@ -37,6 +39,7 @@ try: from ....util.misc import param_to_array S = param_to_array(S) mu = param_to_array(mu) + gamma = param_to_array(gamma) Z = param_to_array(Z) support_code = """ @@ -79,7 +82,7 @@ try: } } """ - weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz) + weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz) psi2 = psi2n.sum(axis=0) return psi0,psi1,psi2,psi2n @@ -94,12 +97,13 @@ try: mu = variational_posterior.mean S = variational_posterior.variance + gamma = variational_posterior.binary_prob N,M,Q = 
mu.shape[0],Z.shape[0],mu.shape[1] l2 = np.square(lengthscale) log_denom1 = np.log(S/l2+1) log_denom2 = np.log(2*S/l2+1) - log_gamma,log_gamma1 = variational_posterior.gamma_log_prob() - gamma, gamma1 = variational_posterior.gamma_probabilities() + log_gamma = np.log(gamma) + log_gamma1 = np.log(1.-gamma) variance = float(variance) dvar = np.zeros(1) @@ -113,6 +117,7 @@ try: from ....util.misc import param_to_array S = param_to_array(S) mu = param_to_array(mu) + gamma = param_to_array(gamma) Z = param_to_array(Z) support_code = """ @@ -130,7 +135,6 @@ try: double Zm1q = Z(m1,q); double Zm2q = Z(m2,q); double gnq = gamma(n,q); - double g1nq = gamma1(n,q); double mu_nq = mu(n,q); if(m2==0) { @@ -156,7 +160,7 @@ try: dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum); dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.; - dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum; + dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum); dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum; } @@ -184,7 +188,7 @@ try: dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum; dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum; - dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum; + dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum; dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; } @@ -192,7 +196,7 @@ try: } } """ - weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz) + weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz) dl *= 2.*lengthscale if not ARD: diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index 0465a556..6bc4b304 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -301,6 +301,8 @@ class Exponential(Stationary): return -0.5*self.K_of_r(r) + + class OU(Stationary): """ OU kernel: diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index b4e07af7..9ecf7dbf 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -35,8 +35,8 @@ class Gaussian(Likelihood): gp_link = link_functions.Identity() if not isinstance(gp_link, link_functions.Identity): - print "Warning, Exact inference is not implemeted for non-identity link functions,\ - if you are not already, ensure Laplace inference_method is used" + print("Warning, Exact inference is not implemeted for non-identity link functions,\ + if you are not already, ensure Laplace inference_method is used") super(Gaussian, self).__init__(gp_link, name=name) diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py index ff1e3118..955b34b8 100644 --- a/GPy/likelihoods/likelihood.py +++ b/GPy/likelihoods/likelihood.py @@ -565,7 +565,7 @@ class Likelihood(Parameterized): :param burnin: number of samples to use for burnin (will need modifying) :param Y_metadata: Y_metadata for pdf """ - print "Warning, using MCMC for sampling y*, needs to be tuned!" 
+ print("Warning, using MCMC for sampling y*, needs to be tuned!") if starting_loc is None: starting_loc = fNew from functools import partial @@ -619,8 +619,8 @@ class Likelihood(Parameterized): #Show progress if i % int((burn_in+num_samples)*0.1) == 0: - print "{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i) - print "Last run accept ratio: ", accept_ratio[i] + print("{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i)) + print("Last run accept ratio: ", accept_ratio[i]) - print "Average accept ratio: ", np.mean(accept_ratio) + print("Average accept ratio: ", np.mean(accept_ratio)) return chain_values diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py index d9c13ad0..5193a232 100644 --- a/GPy/mappings/__init__.py +++ b/GPy/mappings/__init__.py @@ -4,4 +4,6 @@ from .kernel import Kernel from .linear import Linear from .mlp import MLP -#from rbf import RBF +from .additive import Additive +from .compound import Compound + diff --git a/GPy/mappings/additive.py b/GPy/mappings/additive.py index 4e7c545d..1c86b680 100644 --- a/GPy/mappings/additive.py +++ b/GPy/mappings/additive.py @@ -2,8 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ..core.mapping import Mapping -import GPy +from ..core import Mapping class Additive(Mapping): """ @@ -27,8 +26,6 @@ class Additive(Mapping): Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim) self.mapping1 = mapping1 self.mapping2 = mapping2 - self.num_params = self.mapping1.num_params + self.mapping2.num_params - self.name = self.mapping1.name + '+' + self.mapping2.name def f(self, X): return self.mapping1.f(X) + self.mapping2.f(X) diff --git a/GPy/mappings/compound.py b/GPy/mappings/compound.py new file mode 100644 index 00000000..5a1e8dd1 --- /dev/null +++ b/GPy/mappings/compound.py @@ -0,0 +1,39 @@ +# Copyright (c) 2015, James Hensman and Alan Saul +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from ..core import Mapping + +class Compound(Mapping): + """ + Mapping based on passing one mapping through another + + .. 
math:: + + f(\mathbf{x}) = f_2(f_1(\mathbf{x})) + + :param mapping1: first mapping + :type mapping1: GPy.mappings.Mapping + :param mapping2: second mapping + :type mapping2: GPy.mappings.Mapping + + """ + + def __init__(self, mapping1, mapping2): + assert(mapping1.output_dim==mapping2.input_dim) + input_dim, output_dim = mapping1.input_dim, mapping2.output_dim + Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim) + self.mapping1 = mapping1 + self.mapping2 = mapping2 + self.link_parameters(self.mapping1, self.mapping2) + + def f(self, X): + return self.mapping2.f(self.mapping1.f(X)) + + def update_gradients(self, dL_dF, X): + hidden = self.mapping1.f(X) + self.mapping2.update_gradients(dL_dF, hidden) + self.mapping1.update_gradients(self.mapping2.gradients_X(dL_dF, hidden), X) + + def gradients_X(self, dL_dF, X): + hidden = self.mapping1.f(X) + return self.mapping1.gradients_X(self.mapping2.gradients_X(dL_dF, hidden), X) diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py index 3bfcd388..ea1720db 100644 --- a/GPy/mappings/kernel.py +++ b/GPy/mappings/kernel.py @@ -36,16 +36,16 @@ class Kernel(Mapping): Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) self.kern = kernel self.Z = Z - self.num_bases, Zdim = X.shape + self.num_bases, Zdim = Z.shape assert Zdim == self.input_dim - self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim)) - self.add_parameter(self.A) + self.A = Param('A', np.random.randn(self.num_bases, self.output_dim)) + self.link_parameter(self.A) def f(self, X): return np.dot(self.kern.K(X, self.Z), self.A) def update_gradients(self, dL_dF, X): - self.kern.update_gradients_full(np.dot(dL_dF, self.A.T)) + self.kern.update_gradients_full(np.dot(dL_dF, self.A.T), X, self.Z) self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF) def gradients_X(self, dL_dF, X): diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py index 6fc91944..ee464694 100644 --- a/GPy/mappings/linear.py +++ b/GPy/mappings/linear.py @@ -26,8 +26,8 @@ class Linear(Mapping): def __init__(self, input_dim, output_dim, name='linmap'): Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) - self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim)) - self.add_parameter(self.A) + self.A = Param('A', np.random.randn(self.input_dim, self.output_dim)) + self.link_parameter(self.A) def f(self, X): return np.dot(X, self.A) diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py index f22fc07f..4afc2fa1 100644 --- a/GPy/mappings/mlp.py +++ b/GPy/mappings/mlp.py @@ -11,32 +11,45 @@ class MLP(Mapping): """ def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'): - super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) + super(MLP, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name) self.hidden_dim = hidden_dim self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim)) self.b1 = Param('b1', np.random.randn(self.hidden_dim)) self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim)) self.b2 = Param('b2', np.random.randn(self.output_dim)) + self.link_parameters(self.W1, self.b1, self.W2, self.b2) def f(self, X): - N, D = X.shape - activations = np.tanh(np.dot(X,self.W1) + self.b1) - self.out = np.dot(self.activations,self.W2) + self.b2 - return self.output_fn(self.out) + layer1 = np.dot(X, self.W1) + self.b1 + activations = np.tanh(layer1) + return np.dot(activations, self.W2) + self.b2 def update_gradients(self, 
dL_dF, X): - activations = np.tanh(np.dot(X,self.W1) + self.b1) - + layer1 = np.dot(X,self.W1) + self.b1 + activations = np.tanh(layer1) #Evaluate second-layer gradients. self.W2.gradient = np.dot(activations.T, dL_dF) self.b2.gradient = np.sum(dL_dF, 0) # Backpropagation to hidden layer. - delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2) + dL_dact = np.dot(dL_dF, self.W2.T) + dL_dlayer1 = dL_dact / np.square(np.cosh(layer1)) # Finally, evaluate the first-layer gradients. - self.W1.gradients = np.dot(X.T,delta_hid) - self.b1.gradients = np.sum(delta_hid, 0) + self.W1.gradient = np.dot(X.T,dL_dlayer1) + self.b1.gradient = np.sum(dL_dlayer1, 0) + + def gradients_X(self, dL_dF, X): + layer1 = np.dot(X,self.W1) + self.b1 + activations = np.tanh(layer1) + + # Backpropagation to hidden layer. + dL_dact = np.dot(dL_dF, self.W2.T) + dL_dlayer1 = dL_dact / np.square(np.cosh(layer1)) + + return np.dot(dL_dlayer1, self.W1.T) + + diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 10c54d49..ad62043a 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -44,10 +44,11 @@ class SparseGPMiniBatch(SparseGP): def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False, missing_data=False, stochastic=False, batchsize=1): - #pick a sensible inference method + + # pick a sensible inference method if inference_method is None: if isinstance(likelihood, likelihoods.Gaussian): - inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1]) + inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1]) else: #inference_method = ?? raise NotImplementedError("what to do what to do?") diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index b8e1c72d..0f3b8fdd 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -39,7 +39,10 @@ class SSGPLVM(SparseGP_MPI): X_variance = np.random.uniform(0,.1,X.shape) if Gamma is None: - gamma = np.random.randn(X.shape[0], input_dim) + gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation + gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim) + gamma[gamma>1.-1e-9] = 1.-1e-9 + gamma[gamma<1e-9] = 1e-9 else: gamma = Gamma.copy() diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py index 4b982ed2..5bc9a417 100644 --- a/GPy/models/warped_gp.py +++ b/GPy/models/warped_gp.py @@ -1,7 +1,6 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) - import numpy as np from ..util.warping_functions import * from ..core import GP @@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d from GPy import kern class WarpedGP(GP): - def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False): + def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3): if kernel is None: - kernel = kern.rbf(X.shape[1]) + kernel = kern.RBF(X.shape[1]) if warping_function == None: self.warping_function = TanhWarpingFunction_d(warping_terms) self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1) + else: + self.warping_function = warping_function self.scale_data = False if self.scale_data: @@ -25,10 +26,10 @@ class WarpedGP(GP): self.has_uncertain_inputs = False self.Y_untransformed = Y.copy() self.predict_in_warped_space = False - likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y) + likelihood = likelihoods.Gaussian() - GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) - self._set_params(self._get_params()) + GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel) + self.link_parameter(self.warping_function) def _scale_data(self, Y): self._Ymax = Y.max() @@ -38,62 +39,55 @@ class WarpedGP(GP): def _unscale_data(self, Y): return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin - def _set_params(self, x): - self.warping_params = x[:self.warping_function.num_parameters] - Y = self.transform_data() - self.likelihood.set_data(Y) - GP._set_params(self, x[self.warping_function.num_parameters:].copy()) + def parameters_changed(self): + self.Y[:] = self.transform_data() + super(WarpedGP, self).parameters_changed() - def _get_params(self): - return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy())) + Kiy = self.posterior.woodbury_vector.flatten() - def _get_param_names(self): - warping_names = self.warping_function._get_param_names() - param_names = GP._get_param_names(self) - return warping_names + param_names - - def transform_data(self): - Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy() - return Y - - def log_likelihood(self): - ll = GP.log_likelihood(self) - jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params) - return ll + np.log(jacobian).sum() - - def _log_likelihood_gradients(self): - ll_grads = GP._log_likelihood_gradients(self) - alpha = np.dot(self.Ki, self.likelihood.Y.flatten()) - warping_grads = self.warping_function_gradients(alpha) - - warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1]) - return np.hstack((warping_grads.flatten(), ll_grads.flatten())) - - def warping_function_gradients(self, Kiy): - grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params) - grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params, + grad_y = self.warping_function.fgrad_y(self.Y_untransformed) + grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, return_covar_chain=True) djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0) dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0) - return -dquad_dpsi + djac_dpsi + warping_grads = -dquad_dpsi + djac_dpsi + + self.warping_function.psi.gradient[:] = warping_grads[:, :-1] + 
self.warping_function.d.gradient[:] = warping_grads[0, -1] + + + def transform_data(self): + Y = self.warping_function.f(self.Y_untransformed.copy()).copy() + return Y + + def log_likelihood(self): + ll = GP.log_likelihood(self) + jacobian = self.warping_function.fgrad_y(self.Y_untransformed) + return ll + np.log(jacobian).sum() def plot_warping(self): - self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max()) + self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max()) - def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None): + def predict(self, Xnew, which_parts='all', pred_init=None): # normalize X values - Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale - mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts) + # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale + mu, var = GP._raw_predict(self, Xnew) # now push through likelihood - mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov) + mean, var = self.likelihood.predictive_values(mu, var) if self.predict_in_warped_space: - mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init) - var = self.warping_function.f_inv(var, self.warping_params) + mean = self.warping_function.f_inv(mean, y=pred_init) + var = self.warping_function.f_inv(var) if self.scale_data: mean = self._unscale_data(mean) - - return mean, var, _025pm, _975pm + + return mean, var + +if __name__ == '__main__': + X = np.random.randn(100, 1) + Y = np.sin(X) + np.random.randn(100, 1)*0.05 + + m = WarpedGP(X, Y) diff --git a/GPy/plotting/matplot_dep/maps.py b/GPy/plotting/matplot_dep/maps.py index bc69ddc4..f38c5d93 100644 --- a/GPy/plotting/matplot_dep/maps.py +++ b/GPy/plotting/matplot_dep/maps.py @@ -6,7 +6,11 @@ try: from matplotlib.patches import Polygon from matplotlib.collections import PatchCollection #from matplotlib import cm - pb.ion() + try: + __IPYTHON__ + pb.ion() + except NameError: + pass except: pass import re diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py index d5039049..e09df1fe 100644 --- a/GPy/testing/inference_tests.py +++ b/GPy/testing/inference_tests.py @@ -64,7 +64,7 @@ class InferenceXTestCase(unittest.TestCase): m.optimize(max_iters=10000) x, mi = m.infer_newX(m.Y) - print m.X.mean - mi.X.mean + print(m.X.mean - mi.X.mean) self.assertTrue(np.allclose(m.X.mean, mi.X.mean, rtol=1e-4, atol=1e-4)) self.assertTrue(np.allclose(m.X.variance, mi.X.variance, rtol=1e-4, atol=1e-4)) diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index b51034d9..f3d82216 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -255,13 +255,23 @@ class KernelGradientTestsContinuous(unittest.TestCase): k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + def test_Prod1(self): + k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) + k.randomize() + self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + def test_Prod2(self): - k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)) + k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) def test_Prod3(self): - k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)) + k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) * GPy.kern.Bias(self.D) + k.randomize() + 
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + + def test_Prod4(self): + k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) * GPy.kern.Matern32(2, active_dims=[0,1]) k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) @@ -400,11 +410,27 @@ class Coregionalize_weave_test(unittest.TestCase): GPy.util.config.config.set('weave', 'working', 'False') +class KernelTestsProductWithZeroValues(unittest.TestCase): + + def setUp(self): + self.X = np.array([[0,1],[1,0]]) + self.k = GPy.kern.Linear(2) * GPy.kern.Bias(2) + + def test_zero_valued_kernel_full(self): + self.k.update_gradients_full(1, self.X) + self.assertFalse(np.isnan(self.k['linear.variances'].gradient), + "Gradient resulted in NaN") + + def test_zero_valued_kernel_gradients_X(self): + target = self.k.gradients_X(1, self.X) + self.assertFalse(np.any(np.isnan(target)), + "Gradient resulted in NaN") if __name__ == "__main__": print("Running unit tests, please be (very) patient...") unittest.main() + # np.random.seed(0) # N0 = 3 # N1 = 9 diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py index 7b6164c1..7fa5886f 100644 --- a/GPy/testing/likelihood_tests.py +++ b/GPy/testing/likelihood_tests.py @@ -27,9 +27,9 @@ def dparam_partial(inst_func, *args): param """ def param_func(param_val, param_name, inst_func, args): - #inst_func.im_self._set_params(param) - #inst_func.im_self.add_parameter(Param(param_name, param_val)) - inst_func.im_self[param_name] = param_val + #inst_func.__self__._set_params(param) + #inst_func.__self__.add_parameter(Param(param_name, param_val)) + inst_func.__self__[param_name] = param_val return inst_func(*args) return functools.partial(param_func, inst_func=inst_func, args=args) @@ -44,8 +44,8 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, The number of parameters and N is the number of data Need to take a slice out from f and a slice out of df """ - print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__, - func.__name__, dfunc.__name__) + print("\n{} likelihood: {} vs {}".format(func.__self__.__class__.__name__, + func.__name__, dfunc.__name__)) partial_f = dparam_partial(func, *args) partial_df = dparam_partial(dfunc, *args) gradchecking = True @@ -66,7 +66,7 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, for fixed_val in range(dfnum): #dlik and dlik_dvar gives back 1 value for each f_ind = min(fnum, fixed_val+1) - 1 - print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val) + print("fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)) #Make grad checker with this param moving, note that set_params is NOT being called #The parameter is being set directly with __setattr__ #Check only the parameter and function value we wish to check at a time @@ -83,12 +83,12 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, if grad.grep_param_names(constrain_param): constraint(constrain_param, grad) else: - print "parameter didn't exist" - print constrain_param, " ", constraint + print("parameter didn't exist") + print(constrain_param, " ", constraint) if randomize: grad.randomize() if verbose: - print grad + print(grad) grad.checkgrad(verbose=1) if not grad.checkgrad(verbose=True): gradchecking = False @@ -297,7 +297,7 @@ class TestNoiseModels(object): def test_scale2_models(self): self.setUp() - for name, 
attributes in self.noise_models.iteritems(): + for name, attributes in self.noise_models.items(): model = attributes["model"] if "grad_params" in attributes: params = attributes["grad_params"] @@ -373,8 +373,8 @@ class TestNoiseModels(object): ############# @with_setup(setUp, tearDown) def t_logpdf(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) #print model._get_params() np.testing.assert_almost_equal( model.pdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).prod(), @@ -383,33 +383,33 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_dlogpdf_df(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) self.description = "\n{}".format(inspect.stack()[0][3]) logpdf = functools.partial(np.sum(model.logpdf), y=Y, Y_metadata=Y_metadata) dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g') grad.randomize() - print model + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d2logpdf_df2(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata) d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g') grad.randomize() - print model + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d3logpdf_df3(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata) d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g') grad.randomize() - print model + print(model) assert grad.checkgrad(verbose=1) ############## @@ -417,8 +417,8 @@ class TestNoiseModels(object): ############## @with_setup(setUp, tearDown) def t_dlogpdf_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta, params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -427,8 +427,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_dlogpdf_df_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta, params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -437,8 +437,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_d2logpdf2_df2_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta, params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -450,7 +450,7 @@ class TestNoiseModels(object): ################ @with_setup(setUp, tearDown) def 
t_dlogpdf_dlink(self, model, Y, f, Y_metadata, link_f_constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) logpdf = functools.partial(model.logpdf_link, y=Y, Y_metadata=Y_metadata) dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g') @@ -460,13 +460,13 @@ class TestNoiseModels(object): constraint('g', grad) grad.randomize() - print grad - print model + print(grad) + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d2logpdf_dlink2(self, model, Y, f, Y_metadata, link_f_constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata) d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g') @@ -476,13 +476,13 @@ class TestNoiseModels(object): constraint('g', grad) grad.randomize() - print grad - print model + print(grad) + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d3logpdf_dlink3(self, model, Y, f, Y_metadata, link_f_constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata) d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g') @@ -492,8 +492,8 @@ class TestNoiseModels(object): constraint('g', grad) grad.randomize() - print grad - print model + print(grad) + print(model) assert grad.checkgrad(verbose=1) ################# @@ -501,8 +501,8 @@ class TestNoiseModels(object): ################# @with_setup(setUp, tearDown) def t_dlogpdf_link_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta, params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -511,8 +511,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_dlogpdf_dlink_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta, params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -521,8 +521,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_d2logpdf2_dlink2_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta, params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -534,7 +534,7 @@ class TestNoiseModels(object): ################ @with_setup(setUp, tearDown) def t_laplace_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) #Normalize Y = Y/Y.max() white_var = 1e-5 @@ -548,7 +548,7 @@ class 
TestNoiseModels(object): for constrain_param, constraint in constraints: constraint(constrain_param, m) - print m + print(m) m.randomize() m.randomize() @@ -558,7 +558,7 @@ class TestNoiseModels(object): m[name] = param_vals[param_num] #m.optimize(max_iters=8) - print m + print(m) #if not m.checkgrad(step=step): #m.checkgrad(verbose=1, step=step) #NOTE this test appears to be stochastic for some likelihoods (student t?) @@ -571,7 +571,7 @@ class TestNoiseModels(object): ########### @with_setup(setUp, tearDown) def t_ep_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) #Normalize Y = Y/Y.max() white_var = 1e-6 @@ -587,7 +587,7 @@ class TestNoiseModels(object): constraints[param_num](name, m) m.randomize() - print m + print(m) assert m.checkgrad(verbose=1, step=step) @@ -624,7 +624,7 @@ class LaplaceTests(unittest.TestCase): self.X = None def test_gaussian_d2logpdf_df2_2(self): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) self.Y = None self.N = 2 @@ -673,17 +673,17 @@ class LaplaceTests(unittest.TestCase): m2.randomize() if debug: - print m1 - print m2 + print(m1) + print(m2) optimizer = 'scg' - print "Gaussian" + print("Gaussian") m1.optimize(optimizer, messages=debug, ipython_notebook=False) - print "Laplace Gaussian" + print("Laplace Gaussian") m2.optimize(optimizer, messages=debug, ipython_notebook=False) if debug: - print m1 - print m2 + print(m1) + print(m2) m2[:] = m1[:] @@ -730,5 +730,5 @@ class LaplaceTests(unittest.TestCase): self.assertTrue(m2.checkgrad(verbose=True)) if __name__ == "__main__": - print "Running unit tests" + print("Running unit tests") unittest.main() diff --git a/GPy/testing/mapping_tests.py b/GPy/testing/mapping_tests.py new file mode 100644 index 00000000..2e32dad3 --- /dev/null +++ b/GPy/testing/mapping_tests.py @@ -0,0 +1,72 @@ +# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import unittest +import numpy as np +import GPy + +class MappingGradChecker(GPy.core.Model): + """ + This class has everything we need to check the gradient of a mapping. It + implements a simple likelihood which is a weighted sum of the outputs of the + mapping. The gradients are checked against the parameters of the mapping + and the input.
+ """ + def __init__(self, mapping, X, name='map_grad_check'): + super(MappingGradChecker, self).__init__(name) + self.mapping = mapping + self.link_parameter(self.mapping) + self.X = GPy.core.Param('X',X) + self.link_parameter(self.X) + self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim) + def log_likelihood(self): + return np.sum(self.mapping.f(self.X) * self.dL_dY) + def parameters_changed(self): + self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X) + self.mapping.update_gradients(self.dL_dY, self.X) + + + + + + + +class MappingTests(unittest.TestCase): + + def test_kernelmapping(self): + X = np.random.randn(100,3) + Z = np.random.randn(10,3) + mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3)) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_linearmapping(self): + mapping = GPy.mappings.Linear(3, 2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_mlpmapping(self): + mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_addmapping(self): + m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2) + m2 = GPy.mappings.Linear(input_dim=3, output_dim=2) + mapping = GPy.mappings.Additive(m1, m2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_compoundmapping(self): + m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2) + Z = np.random.randn(10,2) + m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2)) + mapping = GPy.mappings.Compound(m1, m2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + + + +if __name__ == "__main__": + print "Running unit tests, please be (very) patient..." + unittest.main() diff --git a/GPy/testing/meanfunc_tests.py b/GPy/testing/meanfunc_tests.py new file mode 100644 index 00000000..1d875377 --- /dev/null +++ b/GPy/testing/meanfunc_tests.py @@ -0,0 +1,56 @@ +# Copyright (c) 2015, James Hensman +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import unittest +import numpy as np +import GPy + +class MFtests(unittest.TestCase): + def simple_mean_function(): + """ + The simplest possible mean function. No parameters, just a simple Sinusoid. 
+ """ + #create simple mean function + mf = GPy.core.Mapping(1,1) + mf.f = np.sin + mf.update_gradients = lambda a,b: None + + X = np.linspace(0,10,50).reshape(-1,1) + Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + + k =GPy.kern.RBF(1) + lik = GPy.likelihoods.Gaussian() + m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf) + self.assertTrue(m.checkgrad()) + + def test_parametric_mean_function(self): + """ + A linear mean function with parameters that we'll learn alongside the kernel + """ + + X = np.linspace(0,10,50).reshape(-1,1) + Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X + + mf = GPy.mappings.Linear(1,1) + + k =GPy.kern.RBF(1) + lik = GPy.likelihoods.Gaussian() + m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf) + self.assertTrue(m.checkgrad()) + + def test_svgp_mean_function(self): + + # an instance of the SVIGOP with a men function + X = np.linspace(0,10,500).reshape(-1,1) + Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + Y = np.where(Y>0, 1,0) # make aclassificatino problem + + mf = GPy.mappings.Linear(1,1) + Z = np.linspace(0,10,50).reshape(-1,1) + lik = GPy.likelihoods.Bernoulli() + k =GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4) + m = GPy.core.SVGP(X, Y,Z=Z, kernel=k, likelihood=lik, mean_function=mf) + self.assertTrue(m.checkgrad()) + + + diff --git a/GPy/testing/svgp_tests.py b/GPy/testing/svgp_tests.py index 6dc0fa56..beb9c00d 100644 --- a/GPy/testing/svgp_tests.py +++ b/GPy/testing/svgp_tests.py @@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase): self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k) def test_grad(self): assert self.m.checkgrad(step=1e-4) + +class SVGP_Poisson_with_meanfunction(np.testing.TestCase): + """ + Inference in the SVGP with a Bernoulli likelihood + """ + def setUp(self): + X = np.linspace(0,10,100).reshape(-1,1) + Z = np.linspace(0,10,10).reshape(-1,1) + latent_f = np.exp(0.1*X * 0.05*X**2) + Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1) + + mf = GPy.mappings.Linear(1,1) + + lik = GPy.likelihoods.Poisson() + k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6) + self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf) + def test_grad(self): + assert self.m.checkgrad(step=1e-4) + + diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index 2813a30a..ec83810f 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -102,16 +102,21 @@ def jitchol(A, maxtries=5): num_tries = 1 while num_tries <= maxtries and np.isfinite(jitter): try: + print jitter L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True) - logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter)) return L except: jitter *= 10 + finally: num_tries += 1 + raise linalg.LinAlgError, "not positive definite, even with jitter." 
import traceback - logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter), - ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]])) - raise linalg.LinAlgError("not positive definite, even with jitter.") + logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter), + ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]])) + raise linalg.LinAlgError("not positive definite, even with jitter.") # def dtrtri(L, lower=1): # """ diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py index 70804c67..8f9d232f 100644 --- a/GPy/util/warping_functions.py +++ b/GPy/util/warping_functions.py @@ -1,17 +1,18 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) - import numpy as np +from GPy.core.parameterization import Parameterized, Param +from ..core.parameterization.transformations import Logexp -class WarpingFunction(object): +class WarpingFunction(Parameterized): """ abstract function for warping z = f(y) """ - def __init__(self): - raise NotImplementedError + def __init__(self, name): + super(WarpingFunction, self).__init__(name=name) def f(self,y,psi): """function transformation @@ -34,9 +35,10 @@ class WarpingFunction(object): def _get_param_names(self): raise NotImplementedError - def plot(self, psi, xmin, xmax): + def plot(self, xmin, xmax): + psi = self.psi y = np.arange(xmin, xmax, 0.01) - f_y = self.f(y, psi) + f_y = self.f(y) from matplotlib import pyplot as plt plt.figure() plt.plot(y, f_y) @@ -50,6 +52,7 @@ class TanhWarpingFunction(WarpingFunction): """n_terms specifies the number of tanh terms to be used""" self.n_terms = n_terms self.num_parameters = 3 * self.n_terms + super(TanhWarpingFunction, self).__init__(name='warp_tanh') def f(self,y,psi): """ @@ -163,8 +166,18 @@ class TanhWarpingFunction_d(WarpingFunction): """n_terms specifies the number of tanh terms to be used""" self.n_terms = n_terms self.num_parameters = 3 * self.n_terms + 1 + self.psi = np.ones((self.n_terms, 3)) - def f(self,y,psi): + super(TanhWarpingFunction_d, self).__init__(name='warp_tanh') + self.psi = Param('psi', self.psi) + self.psi[:, :2].constrain_positive() + + self.d = Param('d', 1.0, Logexp()) + self.link_parameter(self.psi) + self.link_parameter(self.d) + + + def f(self,y): """ Transform y with f using parameter vector psi psi = [[a,b,c]] @@ -175,9 +188,9 @@ class TanhWarpingFunction_d(WarpingFunction): #1. check that number of params is consistent # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions' # assert psi.shape[1] == 4, 'inconsistent parameter dimensions' - mpsi = psi.copy() - d = psi[-1] - mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3) + + d = self.d + mpsi = self.psi #3.
transform data z = d*y.copy() @@ -187,7 +200,7 @@ class TanhWarpingFunction_d(WarpingFunction): return z - def f_inv(self, z, psi, max_iterations=1000, y=None): + def f_inv(self, z, max_iterations=1000, y=None): """ calculate the numerical inverse of f @@ -198,12 +211,12 @@ class TanhWarpingFunction_d(WarpingFunction): z = z.copy() if y is None: y = np.ones_like(z) - + it = 0 update = np.inf while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations): - update = (self.f(y, psi) - z)/self.fgrad_y(y, psi) + update = (self.f(y) - z)/self.fgrad_y(y) y -= update it += 1 if it == max_iterations: @@ -212,7 +225,7 @@ class TanhWarpingFunction_d(WarpingFunction): return y - def fgrad_y(self, y, psi, return_precalc = False): + def fgrad_y(self, y,return_precalc = False): """ gradient of f w.r.t to y ([N x 1]) @@ -221,9 +234,8 @@ class TanhWarpingFunction_d(WarpingFunction): """ - mpsi = psi.copy() - d = psi[-1] - mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3) + d = self.d + mpsi = self.psi # vectorized version @@ -240,7 +252,7 @@ class TanhWarpingFunction_d(WarpingFunction): return GRAD - def fgrad_y_psi(self, y, psi, return_covar_chain = False): + def fgrad_y_psi(self, y, return_covar_chain = False): """ gradient of f w.r.t to y and psi @@ -248,10 +260,10 @@ class TanhWarpingFunction_d(WarpingFunction): """ - mpsi = psi.copy() - mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3) - w, s, r, d = self.fgrad_y(y, psi, return_precalc = True) + mpsi = self.psi + + w, s, r, d = self.fgrad_y(y, return_precalc = True) gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4)) for i in range(len(mpsi)):
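For review, a minimal usage sketch of the re-parameterized warping function changed above. This is illustrative only: the n_terms keyword comes from the constructor docstring in the patch, and the toy data and printed output are assumptions rather than part of the change. Since psi and d are now Param objects linked to the warping function itself, f, fgrad_y and f_inv are called without passing an explicit psi vector.

import numpy as np
from GPy.util.warping_functions import TanhWarpingFunction_d

# sketch only: n_terms keyword and toy data are assumptions, not part of the patch
warp = TanhWarpingFunction_d(n_terms=3)
y = np.random.randn(20, 1)

z = warp.f(y)              # warp the targets, using warp.psi and warp.d internally
dz_dy = warp.fgrad_y(y)    # gradient of the warp with respect to y
y_rec = warp.f_inv(z)      # numerical (Newton) inverse, should approximately recover y

# the parameter table now lists psi (n_terms x 3, first two columns positive)
# and d (positive via the Logexp transform)
print(warp)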