From c14aef948ea4f1e2dd663a70dfa2cd30ae89bf6a Mon Sep 17 00:00:00 2001
From: Nicolo Fusi
Date: Wed, 3 Dec 2014 17:18:22 -0800
Subject: [PATCH 01/22] fixed minor bug in sparse gp minibatch

---
 GPy/models/sparse_gp_minibatch.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index ec2e28f5..f5119e48 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -47,10 +47,11 @@ Created on 3 Nov 2014
     def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False,
                  missing_data=False, stochastic=False, batchsize=1):
-        #pick a sensible inference method
+
+        # pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
+                inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
             else:
                 #inference_method = ??
                 raise NotImplementedError, "what to do what to do?"

From ecf463e88631bcc3ef3c4eb608fdedd0c81edbcc Mon Sep 17 00:00:00 2001
From: Zhenwen Dai
Date: Thu, 4 Dec 2014 14:21:50 +0000
Subject: [PATCH 02/22] implement update_gradients_diag for MLP kernel

---
 GPy/kern/_src/mlp.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/_src/mlp.py b/GPy/kern/_src/mlp.py
index badbd60d..16e84363 100644
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@@ -79,8 +79,14 @@ class MLP(Kern):
                                        + 2*self.bias_variance + 2.))*base_cov_grad).sum()

     def update_gradients_diag(self, dL_dKdiag, X):
-        raise NotImplementedError, "TODO"
-
+        self._K_diag_computations(X)
+        self.variance.gradient = np.sum(self._K_diag_dvar*dL_dKdiag)
+
+        base = four_over_tau*self.variance/np.sqrt(1-self._K_diag_asin_arg*self._K_diag_asin_arg)
+        base_cov_grad = base*dL_dKdiag/np.square(self._K_diag_denom)
+
+        self.weight_variance.gradient = (base_cov_grad*np.square(X).sum(axis=1)).sum()
+        self.bias_variance.gradient = base_cov_grad.sum()

     def gradients_X(self, dL_dK, X, X2):
         """Derivative of the covariance matrix with respect to X"""

From bd1fb56e6c58eaf348322df4283e2ad8bfafad04 Mon Sep 17 00:00:00 2001
From: Nicolo Fusi
Date: Fri, 2 Jan 2015 15:07:19 -0800
Subject: [PATCH 03/22] re-implemented warpedGP for new release of GPy

---
 GPy/kern/__init__.py          |  2 +-
 GPy/models/warped_gp.py       | 90 ++++++++++++++++-------------------
 GPy/util/warping_functions.py | 54 +++++++++++++--------
 3 files changed, 76 insertions(+), 70 deletions(-)

diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index c400277c..7a7c7ad8 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -1,7 +1,7 @@
 from _src.kern import Kern
 from _src.rbf import RBF
 from _src.linear import Linear, LinearFull
-from _src.static import Bias, White
+from _src.static import Bias, White, Fixed
 from _src.brownian import Brownian
 from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
 from _src.mlp import MLP
diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 4b982ed2..5bc9a417 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
 import numpy as np
 from ..util.warping_functions import *
 from ..core import GP
@@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d
 from GPy import kern

 class WarpedGP(GP):
-    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
+    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
         if kernel is None:
-            kernel = kern.rbf(X.shape[1])
+            kernel = kern.RBF(X.shape[1])

         if warping_function == None:
             self.warping_function = TanhWarpingFunction_d(warping_terms)
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
+        else:
+            self.warping_function = warping_function

         self.scale_data = False
         if self.scale_data:
@@ -25,10 +26,10 @@ class WarpedGP(GP):
         self.has_uncertain_inputs = False
         self.Y_untransformed = Y.copy()
         self.predict_in_warped_space = False
-        likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
+        likelihood = likelihoods.Gaussian()

-        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
+        self.link_parameter(self.warping_function)

     def _scale_data(self, Y):
         self._Ymax = Y.max()
@@ -38,62 +39,55 @@ class WarpedGP(GP):
     def _unscale_data(self, Y):
         return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin

-    def _set_params(self, x):
-        self.warping_params = x[:self.warping_function.num_parameters]
-        Y = self.transform_data()
-        self.likelihood.set_data(Y)
-        GP._set_params(self, x[self.warping_function.num_parameters:].copy())
+    def parameters_changed(self):
+        self.Y[:] = self.transform_data()
+        super(WarpedGP, self).parameters_changed()

-    def _get_params(self):
-        return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
+        Kiy = self.posterior.woodbury_vector.flatten()

-    def _get_param_names(self):
-        warping_names = self.warping_function._get_param_names()
-        param_names = GP._get_param_names(self)
-        return warping_names + param_names
-
-    def transform_data(self):
-        Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
-        return Y
-
-    def log_likelihood(self):
-        ll = GP.log_likelihood(self)
-        jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        return ll + np.log(jacobian).sum()
-
-    def _log_likelihood_gradients(self):
-        ll_grads = GP._log_likelihood_gradients(self)
-        alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
-        warping_grads = self.warping_function_gradients(alpha)
-
-        warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
-        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
-
-    def warping_function_gradients(self, Kiy):
-        grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
+        grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
+        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
                                                                  return_covar_chain=True)
         djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
         dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-        return -dquad_dpsi + djac_dpsi
+        warping_grads = -dquad_dpsi + djac_dpsi
+
+        self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
+        self.warping_function.d.gradient[:] = warping_grads[0, -1]
+
+
+    def transform_data(self):
+        Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
+        return Y
+
+    def log_likelihood(self):
+        ll = GP.log_likelihood(self)
+        jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
+        return ll + np.log(jacobian).sum()

     def plot_warping(self):
-        self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
+        self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())

-    def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
+    def predict(self, Xnew, which_parts='all', pred_init=None):
         # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
+        # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        mu, var = GP._raw_predict(self, Xnew)

         # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+        mean, var = self.likelihood.predictive_values(mu, var)

         if self.predict_in_warped_space:
-            mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
-            var = self.warping_function.f_inv(var, self.warping_params)
+            mean = self.warping_function.f_inv(mean, y=pred_init)
+            var = self.warping_function.f_inv(var)

         if self.scale_data:
             mean = self._unscale_data(mean)
-
-        return mean, var, _025pm, _975pm
+
+        return mean, var
+
+if __name__ == '__main__':
+    X = np.random.randn(100, 1)
+    Y = np.sin(X) + np.random.randn(100, 1)*0.05
+
+    m = WarpedGP(X, Y)
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index a0a385e0..a7547be6 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -1,17 +1,18 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
 import numpy as np
+from GPy.core.parameterization import Parameterized, Param
+from ..core.parameterization.transformations import Logexp

-class WarpingFunction(object):
+class WarpingFunction(Parameterized):
     """
     abstract function for warping
     z = f(y)
     """

-    def __init__(self):
-        raise NotImplementedError
+    def __init__(self, name):
+        super(WarpingFunction, self).__init__(name=name)

     def f(self,y,psi):
         """function transformation
@@ -34,9 +35,10 @@
     def _get_param_names(self):
         raise NotImplementedError

-    def plot(self, psi, xmin, xmax):
+    def plot(self, xmin, xmax):
+        psi = self.psi
         y = np.arange(xmin, xmax, 0.01)
-        f_y = self.f(y, psi)
+        f_y = self.f(y)
         from matplotlib import pyplot as plt
         plt.figure()
         plt.plot(y, f_y)
@@ -50,6 +52,7 @@ class TanhWarpingFunction(WarpingFunction):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms
+        super(TanhWarpingFunction, self).__init__(name='warp_tanh')

     def f(self,y,psi):
         """
@@ -163,8 +166,18 @@ class TanhWarpingFunction_d(WarpingFunction):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
+        self.psi = np.ones((self.n_terms, 3))

-    def f(self,y,psi):
+        super(TanhWarpingFunction_d, self).__init__(name='warp_tanh')
+        self.psi = Param('psi', self.psi)
+        self.psi[:, :2].constrain_positive()
+
+        self.d = Param('%s' % ('d'), 1.0, Logexp())
+        self.link_parameter(self.psi)
+        self.link_parameter(self.d)
+
+
+    def f(self,y):
         """
         Transform y with f using parameter vector psi
         psi = [[a,b,c]]
@@ -175,9 +188,9 @@
         #1. check that number of params is consistent
        # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
        # assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
-        mpsi = psi.copy()
-        d = psi[-1]
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
+
+        d = self.d
+        mpsi = self.psi
         #3. transform data
         z = d*y.copy()
@@ -187,7 +200,7 @@

         return z

-    def f_inv(self, z, psi, max_iterations=1000, y=None):
+    def f_inv(self, z, max_iterations=1000, y=None):
         """
         calculate the numerical inverse of f

@@ -198,12 +211,12 @@
         z = z.copy()
         if y is None:
             y = np.ones_like(z)
-
+
         it = 0
         update = np.inf
         while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
-            update = (self.f(y, psi) - z)/self.fgrad_y(y, psi)
+            update = (self.f(y) - z)/self.fgrad_y(y)
             y -= update
             it += 1
         if it == max_iterations:
@@ -212,7 +225,7 @@

         return y

-    def fgrad_y(self, y, psi, return_precalc = False):
+    def fgrad_y(self, y,return_precalc = False):
         """
         gradient of f w.r.t to y ([N x 1])

@@ -221,9 +234,8 @@

         """

-        mpsi = psi.copy()
-        d = psi[-1]
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
+        d = self.d
+        mpsi = self.psi

         # vectorized version
@@ -240,7 +252,7 @@

         return GRAD

-    def fgrad_y_psi(self, y, psi, return_covar_chain = False):
+    def fgrad_y_psi(self, y, return_covar_chain = False):
         """
         gradient of f w.r.t to y and psi

@@ -248,10 +260,10 @@

         """

-        mpsi = psi.copy()
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
-        w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
+        mpsi = self.psi
+
+        w, s, r, d = self.fgrad_y(y, return_precalc = True)

         gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
         for i in range(len(mpsi)):
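A note on the objective being re-implemented above: this follows the standard warped-GP construction, where a monotonic warp z = f(y; psi) is learned jointly with the GP and a change-of-variables Jacobian enters the likelihood. A sketch of what the re-implemented `log_likelihood` computes, with K the kernel matrix:

    \log p(\mathbf{y}) = \log \mathcal{N}\big(f(\mathbf{y};\psi)\,\big|\,\mathbf{0},\,K\big) + \sum_n \log \frac{\partial f(y_n;\psi)}{\partial y_n}

which is why the code adds `np.log(jacobian).sum()` to the ordinary GP marginal likelihood, and why `parameters_changed` accumulates both a quadratic term (through `Kiy`) and a Jacobian term (`djac_dpsi`) into the warp-parameter gradients.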
From 1d2cbfe44a9f78a1206fdd2366a36e7cad562bd1 Mon Sep 17 00:00:00 2001
From: Daniel Beck
Date: Fri, 6 Feb 2015 19:39:46 +1100
Subject: [PATCH 04/22] first attempt

---
 GPy/kern/_src/prod.py       | 15 +++++++++++++--
 GPy/testing/kernel_tests.py | 15 +++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index dd9a5fe4..e3776838 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -5,6 +5,7 @@ import numpy as np
 from kern import CombinationKernel
 from ...util.caching import Cache_this
 import itertools
+import operator

 class Prod(CombinationKernel):
     """
@@ -42,9 +43,19 @@ class Prod(CombinationKernel):
         return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))

     def update_gradients_full(self, dL_dK, X, X2=None):
+        np.seterr(invalid='raise')
         k = self.K(X,X2)*dL_dK
-        for p in self.parts:
-            p.update_gradients_full(k/p.K(X,X2),X,X2)
+        try:
+            for p in self.parts:
+                p.update_gradients_full(k/p.K(X,X2),X,X2)
+        except FloatingPointError:
+            np.seterr(invalid='warn')
+            print "Gradient warning: falling back to slow version due to zero-valued kernel"
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(operator.mul, [p.K(X, X2) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                to_update.update_gradients_full(dL_dK * prod, X, X2)
+

     def update_gradients_diag(self, dL_dKdiag, X):
         k = self.Kdiag(X)*dL_dKdiag
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index c1bb9265..387047b6 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -400,12 +400,27 @@ class Coregionalize_weave_test(unittest.TestCase):
         #reset the weave state for any other tests
         GPy.util.config.config.set('weave', 'working', 'False')

+class KernelTestsProductWithZeroValues(unittest.TestCase):
+
+    def test_zero_valued_kernel(self):
+        X = np.array([[0,1],[1,0]])
+        Y = np.array([[1],[10]])
+        lin = GPy.kern.Linear(2)
+        bias = GPy.kern.Bias(2)
+        k = lin * bias
+        #k = lin
+        m = GPy.models.GPRegression(X, Y, kernel=k)
+        #m['mul.bias.variance'].constrain_fixed(0)
+        m.optimize(messages=False)

 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."
     unittest.main()
+
+    #suite = unittest.TestLoader().loadTestsFromTestCase(KernelTestsProductWithZeroValues)
+    #unittest.TextTestRunner().run(suite)
+
 #    np.random.seed(0)
 #    N0 = 3
 #    N1 = 9

From 8b4274339ad034aeede9b926ac47bad89ae2f397 Mon Sep 17 00:00:00 2001
From: Daniel Beck
Date: Mon, 9 Feb 2015 09:28:53 +1100
Subject: [PATCH 05/22] added decorator that changes numpy invalid op warning
 to exception

---
 GPy/kern/_src/prod.py       | 20 +++++++++++++++++---
 GPy/testing/kernel_tests.py |  7 ++++---
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index e3776838..4f9f5ea6 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -7,6 +7,19 @@ from ...util.caching import Cache_this
 import itertools
 import operator

+
+def numpy_invalid_op_as_exception(func):
+    """
+    A decorator that allows catching numpy invalid operations
+    as exceptions (the default behaviour is raising warnings).
+    """
+    def func_wrapper(*args, **kwargs):
+        np.seterr(invalid='raise')
+        func(*args, **kwargs)
+        np.seterr(invalid='warn')
+    return func_wrapper
+
+
 class Prod(CombinationKernel):
     """
     Computes the product of 2 kernels
@@ -42,15 +55,14 @@ class Prod(CombinationKernel):
             which_parts = self.parts
         return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))

+    @numpy_invalid_op_as_exception
     def update_gradients_full(self, dL_dK, X, X2=None):
-        np.seterr(invalid='raise')
         k = self.K(X,X2)*dL_dK
         try:
             for p in self.parts:
                 p.update_gradients_full(k/p.K(X,X2),X,X2)
         except FloatingPointError:
-            np.seterr(invalid='warn')
-            print "Gradient warning: falling back to slow version due to zero-valued kernel"
+            #print "WARNING: gradient calculation falling back to slow version due to zero-valued kernel"
             for combination in itertools.combinations(self.parts, len(self.parts) - 1):
                 prod = reduce(operator.mul, [p.K(X, X2) for p in combination])
                 to_update = list(set(self.parts) - set(combination))[0]
@@ -75,3 +87,5 @@
         for p in self.parts:
             target += p.gradients_X_diag(k/p.Kdiag(X),X)
         return target
+
+
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 387047b6..ac6d7ab4 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -408,10 +408,11 @@ class KernelTestsProductWithZeroValues(unittest.TestCase):
         lin = GPy.kern.Linear(2)
         bias = GPy.kern.Bias(2)
         k = lin * bias
-        #k = lin
         m = GPy.models.GPRegression(X, Y, kernel=k)
-        #m['mul.bias.variance'].constrain_fixed(0)
-        m.optimize(messages=False)
+        try:
+            m.optimize()
+        except np.linalg.LinAlgError:
+            self.fail("Zero-valued kernel raised exception!")
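A note on the decorator introduced in PATCH 05: wrapping the gradient methods this way turns numpy's invalid-operation warnings (e.g. 0/0 in `k/p.K(X,X2)`) into catchable FloatingPointError exceptions, and restores the default afterwards. A minimal standalone sketch of the pattern; note that the wrapper as committed here drops the wrapped function's return value, which matters once the decorator is applied to a method that returns something (this is corrected in PATCH 08 below):

    import numpy as np

    def numpy_invalid_op_as_exception(func):
        def func_wrapper(*args, **kwargs):
            np.seterr(invalid='raise')    # 0/0 etc. now raise FloatingPointError
            try:
                return func(*args, **kwargs)
            finally:
                np.seterr(invalid='warn') # restore numpy's default behaviour
        return func_wrapper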
From d6a56a6f0bf234f4c8c9f9f1b595ff5c9305bed0 Mon Sep 17 00:00:00 2001
From: Daniel Beck
Date: Mon, 9 Feb 2015 09:35:32 +1100
Subject: [PATCH 06/22] changed operator.mul to np.multiply for consistency

---
 GPy/kern/_src/prod.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index 4f9f5ea6..241c2448 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -5,7 +5,6 @@ import numpy as np
 from kern import CombinationKernel
 from ...util.caching import Cache_this
 import itertools
-import operator

 def numpy_invalid_op_as_exception(func):
@@ -64,7 +63,7 @@
         except FloatingPointError:
             #print "WARNING: gradient calculation falling back to slow version due to zero-valued kernel"
             for combination in itertools.combinations(self.parts, len(self.parts) - 1):
-                prod = reduce(operator.mul, [p.K(X, X2) for p in combination])
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
                 to_update = list(set(self.parts) - set(combination))[0]
                 to_update.update_gradients_full(dL_dK * prod, X, X2)

From fc8705104b05cadc772307536f06f6de803c72bc Mon Sep 17 00:00:00 2001
From: Daniel Beck
Date: Mon, 9 Feb 2015 09:41:21 +1100
Subject: [PATCH 07/22] a cleaner test

---
 GPy/kern/_src/prod.py       | 1 -
 GPy/testing/kernel_tests.py | 8 +++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index 241c2448..5e4c0d29 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -61,7 +61,6 @@
             for p in self.parts:
                 p.update_gradients_full(k/p.K(X,X2),X,X2)
         except FloatingPointError:
-            #print "WARNING: gradient calculation falling back to slow version due to zero-valued kernel"
             for combination in itertools.combinations(self.parts, len(self.parts) - 1):
                 prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
                 to_update = list(set(self.parts) - set(combination))[0]
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index ac6d7ab4..f9d90607 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -408,11 +408,9 @@ class KernelTestsProductWithZeroValues(unittest.TestCase):
         lin = GPy.kern.Linear(2)
         bias = GPy.kern.Bias(2)
         k = lin * bias
-        m = GPy.models.GPRegression(X, Y, kernel=k)
-        try:
-            m.optimize()
-        except np.linalg.LinAlgError:
-            self.fail("Zero-valued kernel raised exception!")
+        k.update_gradients_full(1, X)
+        self.assertFalse(np.isnan(k['linear.variances'].gradient),
+                         "Gradient resulted in NaN")
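The fall-back logic being refined across these patches comes from the product-kernel gradient identity: for a product kernel K = \prod_i K_i, the gradient with respect to the parameters of part i requires

    \frac{\partial L}{\partial K_i} = \frac{\partial L}{\partial K} \odot \prod_{j \neq i} K_j

The fast path computes this as (dL_dK \odot K) / K_i, which is invalid wherever K_i has zeros (e.g. a Linear kernel at the origin); the slow path rebuilds \prod_{j \neq i} K_j explicitly for each part, at the cost of one extra pass over the parts.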
From 98c743d157f2954226b0ef5b6d3d1817f28e67f6 Mon Sep 17 00:00:00 2001
From: Daniel Beck
Date: Mon, 9 Feb 2015 10:02:26 +1100
Subject: [PATCH 08/22] test + code change in gradients_X

---
 GPy/kern/_src/prod.py       | 15 +++++++++++----
 GPy/testing/kernel_tests.py | 22 ++++++++++++----------
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index 5e4c0d29..a3b49973 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -14,8 +14,9 @@ def numpy_invalid_op_as_exception(func):
     """
     def func_wrapper(*args, **kwargs):
         np.seterr(invalid='raise')
-        func(*args, **kwargs)
+        result = func(*args, **kwargs)
         np.seterr(invalid='warn')
+        return result
     return func_wrapper

@@ -66,17 +67,23 @@ class Prod(CombinationKernel):
                 to_update = list(set(self.parts) - set(combination))[0]
                 to_update.update_gradients_full(dL_dK * prod, X, X2)
-
     def update_gradients_diag(self, dL_dKdiag, X):
         k = self.Kdiag(X)*dL_dKdiag
         for p in self.parts:
             p.update_gradients_diag(k/p.Kdiag(X),X)

+    @numpy_invalid_op_as_exception
     def gradients_X(self, dL_dK, X, X2=None):
         target = np.zeros(X.shape)
         k = self.K(X,X2)*dL_dK
-        for p in self.parts:
-            target += p.gradients_X(k/p.K(X,X2),X,X2)
+        try:
+            for p in self.parts:
+                target += p.gradients_X(k/p.K(X,X2),X,X2)
+        except FloatingPointError:
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                target += to_update.gradients_X(dL_dK * prod, X, X2)
         return target

     def gradients_X_diag(self, dL_dKdiag, X):
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index f9d90607..415cc7eb 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -400,25 +400,27 @@ class Coregionalize_weave_test(unittest.TestCase):
         #reset the weave state for any other tests
         GPy.util.config.config.set('weave', 'working', 'False')

+
 class KernelTestsProductWithZeroValues(unittest.TestCase):

-    def test_zero_valued_kernel(self):
-        X = np.array([[0,1],[1,0]])
-        lin = GPy.kern.Linear(2)
-        bias = GPy.kern.Bias(2)
-        k = lin * bias
-        k.update_gradients_full(1, X)
-        self.assertFalse(np.isnan(k['linear.variances'].gradient),
+    def setUp(self):
+        self.X = np.array([[0,1],[1,0]])
+        self.k = GPy.kern.Linear(2) * GPy.kern.Bias(2)
+
+    def test_zero_valued_kernel_full(self):
+        self.k.update_gradients_full(1, self.X)
+        self.assertFalse(np.isnan(self.k['linear.variances'].gradient),
                          "Gradient resulted in NaN")

+    def test_zero_valued_kernel_gradients_X(self):
+        target = self.k.gradients_X(1, self.X)
+        self.assertFalse(np.any(np.isnan(target)),
+                         "Gradient resulted in NaN")

 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."
     unittest.main()

-    #suite = unittest.TestLoader().loadTestsFromTestCase(KernelTestsProductWithZeroValues)
-    #unittest.TextTestRunner().run(suite)
-
 #    np.random.seed(0)
 #    N0 = 3
From 952851de88c2a0054502c2fa0b98109ee867ecde Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Mon, 9 Feb 2015 19:35:46 +0000
Subject: [PATCH 09/22] Bug in linalg jitchol!!!

---
 GPy/testing/linalg_test.py | 35 +++++++++++++++++++++++++++++++++++
 GPy/util/linalg.py         | 12 ++++++------
 2 files changed, 41 insertions(+), 6 deletions(-)
 create mode 100644 GPy/testing/linalg_test.py

diff --git a/GPy/testing/linalg_test.py b/GPy/testing/linalg_test.py
new file mode 100644
index 00000000..b734f6af
--- /dev/null
+++ b/GPy/testing/linalg_test.py
@@ -0,0 +1,35 @@
+import numpy as np
+import scipy as sp
+from ..util.linalg import jitchol
+
+class LinalgTests(np.testing.TestCase):
+    def setUp(self):
+        #Create PD matrix
+        A = np.random.randn(20,100)
+        self.A = A.dot(A.T)
+        #compute Eigdecomp
+        vals, vectors = np.linalg.eig(self.A)
+        #Set smallest eigenval to be negative with 5 rounds worth of jitter
+        vals[vals.argmin()] = 0
+        default_jitter = 1e-6*np.mean(vals)
+        vals[vals.argmin()] = -default_jitter*(10**3.5)
+        self.A_corrupt = (vectors * vals).dot(vectors.T)
+
+    def test_jitchol_success(self):
+        """
+        Expect 5 rounds of jitter to be added and for the recovered matrix to be
+        identical to the corrupted matrix apart from the jitter added to the diagonal
+        """
+        L = jitchol(self.A_corrupt, maxtries=5)
+        A_new = L.dot(L.T)
+        diff = A_new - self.A_corrupt
+        np.testing.assert_allclose(diff, np.eye(A_new.shape[0])*np.diag(diff).mean(), atol=1e-13)
+
+    def test_jitchol_failure(self):
+        try:
+            """ Expecting an exception to be thrown as we expect it to require
+            5 rounds of jitter to be added to enforce PDness"""
+            jitchol(self.A_corrupt, maxtries=4)
+            return False
+        except sp.linalg.LinAlgError:
+            return True
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index dffd438a..2c02357c 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -82,6 +82,7 @@ def force_F_ordered(A):
 #         return jitchol(A+np.eye(A.shape[0])*jitter, maxtries-1)

+
 def jitchol(A, maxtries=5):
     A = np.ascontiguousarray(A)
     L, info = lapack.dpotrf(A, lower=1)
@@ -92,13 +93,16 @@
         if np.any(diagA <= 0.):
            raise linalg.LinAlgError, "not pd: non-positive diagonal elements"
         jitter = diagA.mean() * 1e-6
-        while maxtries > 0 and np.isfinite(jitter):
+        num_tries = 0
+        while num_tries < maxtries and np.isfinite(jitter):
             try:
+                print jitter
                 L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
+                return L
             except:
                 jitter *= 10
             finally:
-                maxtries -= 1
+                num_tries += 1
         raise linalg.LinAlgError, "not positive definite, even with jitter."
     import traceback
     try: raise
@@ -108,10 +112,6 @@
         import ipdb;ipdb.set_trace()
     return L

-
-
-
-
 # def dtrtri(L, lower=1):
 #     """
 #     Wrapper for lapack dtrtri function
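The jitchol bug fixed above: the old loop never returned the Cholesky factor on success — it kept iterating, decrementing maxtries, and then fell through to the unconditional raise, so any matrix that genuinely needed jitter was still reported as "not positive definite, even with jitter." A condensed sketch of the corrected control flow, following the diff (debug print and Python-2 raise syntax trimmed; `linalg` is scipy.linalg as in the module):

    jitter = diagA.mean() * 1e-6
    num_tries = 0
    while num_tries < maxtries and np.isfinite(jitter):
        try:
            # success: return immediately with the jittered factor
            return linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
        except linalg.LinAlgError:
            jitter *= 10          # escalate the diagonal jitter and retry
        finally:
            num_tries += 1
    raise linalg.LinAlgError("not positive definite, even with jitter.")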
From ae5d70b063536cf41452892b8e8adc13b01cdab4 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai
Date: Sun, 15 Feb 2015 19:24:51 +0000
Subject: [PATCH 10/22] add mcmc into inference import

---
 GPy/inference/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/inference/__init__.py b/GPy/inference/__init__.py
index f1ffd595..7b1307e3 100644
--- a/GPy/inference/__init__.py
+++ b/GPy/inference/__init__.py
@@ -1,2 +1,3 @@
 import latent_function_inference
-import optimization
\ No newline at end of file
+import optimization
+import mcmc

From c5c8b8341c1908b62a93e144143a91ad2cb10f08 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai
Date: Tue, 17 Feb 2015 10:48:26 +0000
Subject: [PATCH 11/22] A temporary fix for the problem of the model sometimes
 not being updated.

---
 GPy/core/gp.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 25066381..3252ac08 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -124,6 +124,7 @@ class GP(Model):
         else:
             self.X = ObsAr(X)
         self.update_model(True)
+        self._trigger_params_changed()

     def set_X(self,X):
         """

From 7ad275ce8a81a0b6b61f3ae0c17090ce58f6b731 Mon Sep 17 00:00:00 2001
From: mellorjc
Date: Thu, 19 Feb 2015 11:31:46 +0000
Subject: [PATCH 12/22] matplotlib interactive mode only in IPython

Enable interactive mode only in IPython, so that scripts that plot
behave normally when run from plain Python.

---
 GPy/plotting/matplot_dep/maps.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GPy/plotting/matplot_dep/maps.py b/GPy/plotting/matplot_dep/maps.py
index fcb03b38..eef72a6a 100644
--- a/GPy/plotting/matplot_dep/maps.py
+++ b/GPy/plotting/matplot_dep/maps.py
@@ -6,7 +6,11 @@ try:
     from matplotlib.patches import Polygon
     from matplotlib.collections import PatchCollection
     #from matplotlib import cm
-    pb.ion()
+    try:
+        __IPYTHON__
+        pb.ion()
+    except:
+        pass
 except:
     pass
 import re

From f25797cd617655f73b828f803e38ccc3e7144e60 Mon Sep 17 00:00:00 2001
From: mellorjc
Date: Thu, 19 Feb 2015 11:45:57 +0000
Subject: [PATCH 13/22] catch only a specific error

Catch only NameError, rather than everything.

---
 GPy/plotting/matplot_dep/maps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/plotting/matplot_dep/maps.py b/GPy/plotting/matplot_dep/maps.py
index eef72a6a..a651f34d 100644
--- a/GPy/plotting/matplot_dep/maps.py
+++ b/GPy/plotting/matplot_dep/maps.py
@@ -9,7 +9,7 @@ try:
     try:
         __IPYTHON__
         pb.ion()
-    except:
+    except NameError:
         pass
 except:
     pass
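The two plotting patches above settle on a small detection idiom: IPython injects the name `__IPYTHON__` into builtins, so a bare reference to it raises NameError under a plain Python interpreter. In isolation (with `pb` the pylab import used in maps.py):

    try:
        __IPYTHON__          # defined only inside IPython sessions
        pb.ion()             # interactive plotting is safe there
    except NameError:
        pass                 # plain scripts keep matplotlib's blocking default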
From fa801bf46c3c6104ab787cde2ab4e71f2bfabeea Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Mon, 23 Mar 2015 14:47:49 +0000
Subject: [PATCH 14/22] mean functions in place

---
 GPy/inference/latent_function_inference/dtc.py             | 6 ++++--
 .../latent_function_inference/exact_gaussian_inference.py  | 4 +++-
 .../latent_function_inference/expectation_propagation.py   | 3 ++-
 .../expectation_propagation_dtc.py                         | 3 ++-
 GPy/inference/latent_function_inference/fitc.py            | 3 ++-
 GPy/inference/latent_function_inference/laplace.py         | 3 ++-
 GPy/inference/latent_function_inference/svgp.py            | 3 ++-
 GPy/mappings/linear.py                                     | 2 +-
 8 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py
index 5590a079..a12726e2 100644
--- a/GPy/inference/latent_function_inference/dtc.py
+++ b/GPy/inference/latent_function_inference/dtc.py
@@ -20,7 +20,8 @@ class DTC(LatentFunctionInference):
     def __init__(self):
         self.const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

         num_inducing, _ = Z.shape
@@ -88,7 +89,8 @@ class vDTC(object):
     def __init__(self):
         self.const_jitter = 1e-6

-    def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

         num_inducing, _ = Z.shape
diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 1312d36a..312855f7 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -36,10 +36,12 @@ class ExactGaussianInference(LatentFunctionInference):
             #print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
             return Y

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
+        assert mean_function is None, "inference with a mean function not implemented"
+
         YYT_factor = self.get_YYTfactor(Y)

         K = kern.K(X)
diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py
index 26144974..ba920569 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@@ -33,7 +33,8 @@ class EP(LatentFunctionInference):
        # TODO: update approximation in the end as well? Maybe even with a switch?
        pass

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_data, output_dim = Y.shape
         assert output_dim ==1, "ep in 1D only (for now!)"
diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
index 35b1b7dc..466cbbb2 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
@@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference):
             self.old_mutilde, self.old_vtilde = None, None
         self._ep_approximation = None

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_data, output_dim = Y.shape
         assert output_dim ==1, "ep in 1D only (for now!)"
diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py
index a184c6c4..f99b35ff 100644
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@@ -18,7 +18,8 @@ class FITC(LatentFunctionInference):
     """
     const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_inducing, _ = Z.shape
         num_data, output_dim = Y.shape
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 05711b0b..4e6ece11 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -39,10 +39,11 @@ class Laplace(LatentFunctionInference):
         self.first_run = True
         self._previous_Ki_fhat = None

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
+        assert mean_function is None, "inference with a mean function not implemented"
         # Compute K
         K = kern.K(X)
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
index 1974991b..23d36a14 100644
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -6,7 +6,8 @@ from posterior import Posterior

 class SVGP(LatentFunctionInference):

-    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_inducing = Z.shape[0]
         num_data, num_outputs = Y.shape
diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py
index e172b4e2..6fc91944 100644
--- a/GPy/mappings/linear.py
+++ b/GPy/mappings/linear.py
@@ -33,7 +33,7 @@ class Linear(Mapping):
         return np.dot(X, self.A)

     def update_gradients(self, dL_dF, X):
-        self.A.gradient = np.dot( X.T dL_dF)
+        self.A.gradient = np.dot( X.T, dL_dF)

     def gradients_X(self, dL_dF, X):
         return np.dot(dL_dF, self.A.T)
From 2d312099c00d21511b5fa06f534aa074733dd43a Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Mon, 23 Mar 2015 14:56:19 +0000
Subject: [PATCH 15/22] added parsing of mean func to gp.py

---
 GPy/core/gp.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 38a7bb3d..732db7e2 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -34,7 +34,7 @@ class GP(Model):

     """

-    def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
+    def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
         super(GP, self).__init__(name)

         assert X.ndim == 2
@@ -75,6 +75,15 @@ class GP(Model):
         assert isinstance(likelihood, likelihoods.Likelihood)
         self.likelihood = likelihood

+        #handle the mean function
+        self.mean_function = mean_function
+        if mean_function is not None:
+            assert isinstance(self.mean_function, Mapping)
+            assert mean_function.input_dim == self.input_dim
+            assert mean_function.output_dim == self.output_dim
+            self.add_parameter(mean_function)
+
+
         #find a sensible inference method
         logger.info("initializing inference method")
         if inference_method is None:
@@ -153,7 +162,7 @@ class GP(Model):
         This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call this method yourself, there may be unexpected consequences.
         """
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
         self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
         self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
From e97b6e59aab6243e8440475078ef0300eaae8639 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Mon, 23 Mar 2015 14:59:08 +0000
Subject: [PATCH 16/22] added mean function into the prediction

---
 GPy/core/gp.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 732db7e2..d415d995 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -5,6 +5,7 @@ import numpy as np
 import sys
 from .. import kern
 from model import Model
+from mapping import Mapping
 from parameterization import ObsAr
 from .. import likelihoods
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
@@ -201,6 +202,10 @@ class GP(Model):
         #force mu to be a column vector
         if len(mu.shape)==1:
             mu = mu[:,None]
+
+        #add the mean function in
+        if not self.mean_function is None:
+            mu += self.mean_function.f(_Xnew)

         return mu, var

     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
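This patch and the next one together implement the usual conjugate treatment of a deterministic mean function m(·): inference works on the residuals, and prediction adds the mean back in. In equations, for a Gaussian likelihood with noise variance \sigma^2:

    \alpha = (K + \sigma^2 I)^{-1}\big(\mathbf{y} - m(X)\big), \qquad \mu_* = m(X_*) + K_*^{\top} \alpha

which corresponds to `YYT_factor = self.get_YYTfactor(Y-m)` in the exact inference below and `mu += self.mean_function.f(_Xnew)` in `_raw_predict` above.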
+ """ + #create simple mean function + mf = GPy.core.Mapping(1,1) + mf.f = np.sin + mf.update_gradients = lambda a,b: None + + X = np.linspace(0,10,50).reshape(-1,1) + Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + + k =GPy.kern.RBF(1) + lik = GPy.likelihoods.Gaussian() + m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf) + if optimize: + m.optimize(max_iters=max_iters) + if plot: + m.plot(plot_limits=(-10,15)) + return m + +def parametric_mean_function(max_iters=100, optimize=True, plot=True): + """ + A linear mean function with parameters that we'll learn alongside the kernel + """ + #create simple mean function + mf = GPy.core.Mapping(1,1) + mf.f = np.sin + + X = np.linspace(0,10,50).reshape(-1,1) + Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X + + mf = GPy.mappings.Linear(1,1) + + k =GPy.kern.RBF(1) + lik = GPy.likelihoods.Gaussian() + m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf) + if optimize: + m.optimize(max_iters=max_iters) + if plot: + m.plot() + return m + + diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index 4c6b70df..b2f1b7d0 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -63,4 +63,4 @@ class ExactGaussianInference(LatentFunctionInference): dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata) - return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL} + return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha} diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py index 6fc91944..ee464694 100644 --- a/GPy/mappings/linear.py +++ b/GPy/mappings/linear.py @@ -26,8 +26,8 @@ class Linear(Mapping): def __init__(self, input_dim, output_dim, name='linmap'): Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) - self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim)) - self.add_parameter(self.A) + self.A = Param('A', np.random.randn(self.input_dim, self.output_dim)) + self.link_parameter(self.A) def f(self, X): return np.dot(X, self.A) From 624117eaac16a0674201471119d940ccd81b1771 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 26 Mar 2015 11:27:20 +0000 Subject: [PATCH 19/22] mappings, including tests --- GPy/mappings/__init__.py | 3 +- GPy/mappings/additive.py | 5 +-- GPy/mappings/compound.py | 39 +++++++++++++++++++ GPy/mappings/kernel.py | 8 ++-- GPy/mappings/mlp.py | 33 ++++++++++++----- GPy/testing/mapping_tests.py | 72 ++++++++++++++++++++++++++++++++++++ 6 files changed, 141 insertions(+), 19 deletions(-) create mode 100644 GPy/mappings/compound.py create mode 100644 GPy/testing/mapping_tests.py diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py index d331c678..b1cb194b 100644 --- a/GPy/mappings/__init__.py +++ b/GPy/mappings/__init__.py @@ -4,4 +4,5 @@ from kernel import Kernel from linear import Linear from mlp import MLP -#from rbf import RBF +from additive import Additive +from compound import Compound diff --git a/GPy/mappings/additive.py b/GPy/mappings/additive.py index 4e7c545d..1c86b680 100644 --- a/GPy/mappings/additive.py +++ b/GPy/mappings/additive.py @@ -2,8 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np 
From 624117eaac16a0674201471119d940ccd81b1771 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Thu, 26 Mar 2015 11:27:20 +0000
Subject: [PATCH 19/22] mappings, including tests

---
 GPy/mappings/__init__.py     |  3 +-
 GPy/mappings/additive.py     |  5 +--
 GPy/mappings/compound.py     | 39 +++++++++++++++++++
 GPy/mappings/kernel.py       |  8 ++--
 GPy/mappings/mlp.py          | 33 ++++++++++++-----
 GPy/testing/mapping_tests.py | 72 ++++++++++++++++++++++++++++++++++++
 6 files changed, 141 insertions(+), 19 deletions(-)
 create mode 100644 GPy/mappings/compound.py
 create mode 100644 GPy/testing/mapping_tests.py

diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py
index d331c678..b1cb194b 100644
--- a/GPy/mappings/__init__.py
+++ b/GPy/mappings/__init__.py
@@ -4,4 +4,5 @@ from kernel import Kernel
 from linear import Linear
 from mlp import MLP

-#from rbf import RBF
+from additive import Additive
+from compound import Compound
diff --git a/GPy/mappings/additive.py b/GPy/mappings/additive.py
index 4e7c545d..1c86b680 100644
--- a/GPy/mappings/additive.py
+++ b/GPy/mappings/additive.py
@@ -2,8 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from ..core.mapping import Mapping
-import GPy
+from ..core import Mapping

 class Additive(Mapping):
     """
@@ -27,8 +26,6 @@ class Additive(Mapping):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
         self.mapping1 = mapping1
         self.mapping2 = mapping2
-        self.num_params = self.mapping1.num_params + self.mapping2.num_params
-        self.name = self.mapping1.name + '+' + self.mapping2.name

     def f(self, X):
         return self.mapping1.f(X) + self.mapping2.f(X)
diff --git a/GPy/mappings/compound.py b/GPy/mappings/compound.py
new file mode 100644
index 00000000..5a1e8dd1
--- /dev/null
+++ b/GPy/mappings/compound.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2015, James Hensman and Alan Saul
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from ..core import Mapping
+
+class Compound(Mapping):
+    """
+    Mapping based on passing one mapping through another
+
+    .. math::
+
+       f(\mathbf{x}) = f_2(f_1(\mathbf{x}))
+
+    :param mapping1: first mapping
+    :type mapping1: GPy.mappings.Mapping
+    :param mapping2: second mapping
+    :type mapping2: GPy.mappings.Mapping
+
+    """
+
+    def __init__(self, mapping1, mapping2):
+        assert(mapping1.output_dim==mapping2.input_dim)
+        input_dim, output_dim = mapping1.input_dim, mapping2.output_dim
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
+        self.mapping1 = mapping1
+        self.mapping2 = mapping2
+        self.link_parameters(self.mapping1, self.mapping2)
+
+    def f(self, X):
+        return self.mapping2.f(self.mapping1.f(X))
+
+    def update_gradients(self, dL_dF, X):
+        hidden = self.mapping1.f(X)
+        self.mapping2.update_gradients(dL_dF, hidden)
+        self.mapping1.update_gradients(self.mapping2.gradients_X(dL_dF, hidden), X)
+
+    def gradients_X(self, dL_dF, X):
+        hidden = self.mapping1.f(X)
+        return self.mapping1.gradients_X(self.mapping2.gradients_X(dL_dF, hidden), X)
diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py
index 3bfcd388..ea1720db 100644
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@@ -36,16 +36,16 @@ class Kernel(Mapping):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
         self.kern = kernel
         self.Z = Z
-        self.num_bases, Zdim = X.shape
+        self.num_bases, Zdim = Z.shape
         assert Zdim == self.input_dim
-        self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim))
-        self.add_parameter(self.A)
+        self.A = Param('A', np.random.randn(self.num_bases, self.output_dim))
+        self.link_parameter(self.A)

     def f(self, X):
         return np.dot(self.kern.K(X, self.Z), self.A)

     def update_gradients(self, dL_dF, X):
-        self.kern.update_gradients_full(np.dot(dL_dF, self.A.T))
+        self.kern.update_gradients_full(np.dot(dL_dF, self.A.T), X, self.Z)
         self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)

     def gradients_X(self, dL_dF, X):
diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py
index f22fc07f..f0fe21e5 100644
--- a/GPy/mappings/mlp.py
+++ b/GPy/mappings/mlp.py
@@ -11,32 +11,45 @@ class MLP(Mapping):
     """
     def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
-        super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
+        super(MLP, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name)
         self.hidden_dim = hidden_dim
         self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
         self.b1 = Param('b1', np.random.randn(self.hidden_dim))
         self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
         self.b2 = Param('b2', np.random.randn(self.output_dim))
+        self.link_parameters(self.W1, self.b1, self.W2, self.b2)

     def f(self, X):
-        N, D = X.shape
-        activations = np.tanh(np.dot(X,self.W1) + self.b1)
-        self.out = np.dot(self.activations,self.W2) + self.b2
-        return self.output_fn(self.out)
+        layer1 = np.dot(X, self.W1) + self.b1
+        activations = np.tanh(layer1)
+        return np.dot(activations, self.W2) + self.b2

     def update_gradients(self, dL_dF, X):
-        activations = np.tanh(np.dot(X,self.W1) + self.b1)
-
+        layer1 = np.dot(X,self.W1) + self.b1
+        activations = np.tanh(layer1)
         #Evaluate second-layer gradients.
         self.W2.gradient = np.dot(activations.T, dL_dF)
         self.b2.gradient = np.sum(dL_dF, 0)

         # Backpropagation to hidden layer.
-        delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2)
+        dL_dact = np.dot(dL_dF, self.W2.T)
+        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))

         # Finally, evaluate the first-layer gradients.
-        self.W1.gradients = np.dot(X.T,delta_hid)
-        self.b1.gradients = np.sum(delta_hid, 0)
+        self.W1.gradient = np.dot(X.T,dL_dlayer1)
+        self.b1.gradient = np.sum(dL_dlayer1, 0)
+
+    def gradients_X(self, dL_dF, X):
+        layer1 = np.dot(X,self.W1) + self.b1
+        activations = np.tanh(layer1)
+
+        # Backpropagation to hidden layer.
+        dL_dact = np.dot(dL_dF, self.W2.T)
+        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
+
+        return np.dot(dL_dlayer1, self.W1.T)
+
+
diff --git a/GPy/testing/mapping_tests.py b/GPy/testing/mapping_tests.py
new file mode 100644
index 00000000..2e32dad3
--- /dev/null
+++ b/GPy/testing/mapping_tests.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+
+class MappingGradChecker(GPy.core.Model):
+    """
+    This class has everything we need to check the gradient of a mapping. It
+    implements a simple likelihood which is a weighted sum of the outputs of
+    the mapping. The gradients are checked against the parameters of the
+    mapping and the input.
+ """ + def __init__(self, mapping, X, name='map_grad_check'): + super(MappingGradChecker, self).__init__(name) + self.mapping = mapping + self.link_parameter(self.mapping) + self.X = GPy.core.Param('X',X) + self.link_parameter(self.X) + self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim) + def log_likelihood(self): + return np.sum(self.mapping.f(self.X) * self.dL_dY) + def parameters_changed(self): + self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X) + self.mapping.update_gradients(self.dL_dY, self.X) + + + + + + + +class MappingTests(unittest.TestCase): + + def test_kernelmapping(self): + X = np.random.randn(100,3) + Z = np.random.randn(10,3) + mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3)) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_linearmapping(self): + mapping = GPy.mappings.Linear(3, 2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_mlpmapping(self): + mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_addmapping(self): + m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2) + m2 = GPy.mappings.Linear(input_dim=3, output_dim=2) + mapping = GPy.mappings.Additive(m1, m2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + def test_compoundmapping(self): + m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2) + Z = np.random.randn(10,2) + m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2)) + mapping = GPy.mappings.Compound(m1, m2) + X = np.random.randn(100,3) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + + + + +if __name__ == "__main__": + print "Running unit tests, please be (very) patient..." 
From 02f2bb5c76b1c0c7c35d43731f2fccd927d713a7 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Thu, 26 Mar 2015 11:45:18 +0000
Subject: [PATCH 20/22] fixed up product kernel tests

---
 GPy/kern/_src/prod.py       |  1 +
 GPy/testing/kernel_tests.py | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index b29c85eb..63b23f45 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -80,6 +80,7 @@ class Prod(CombinationKernel):
         if len(self.parts)==2:
             target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
             target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
+        else:
             for combination in itertools.combinations(self.parts, len(self.parts) - 1):
                 prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
                 to_update = list(set(self.parts) - set(combination))[0]
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 415cc7eb..458f5cd8 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -256,13 +256,23 @@ class KernelGradientTestsContinuous(unittest.TestCase):
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

+    def test_Prod1(self):
+        k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
     def test_Prod2(self):
-        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

     def test_Prod3(self):
-        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) * GPy.kern.Bias(self.D)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_Prod4(self):
+        k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) * GPy.kern.Matern32(2, active_dims=[0,1])
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

From 72de607199f7645db206521168d423a35363652f Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Thu, 26 Mar 2015 16:20:17 +0000
Subject: [PATCH 21/22] mean functions now working for svgp. with tests

---
 GPy/core/sparse_gp.py                          | 16 +++++---
 GPy/core/svgp.py                               | 13 +++--
 .../latent_function_inference/posterior.py     |  5 +-
 .../latent_function_inference/svgp.py          | 47 ++++++++++++++++---
 GPy/testing/svgp_tests.py                      | 20 ++++++++
 5 files changed, 83 insertions(+), 18 deletions(-)

diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 4fcade79..a81b77fa 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -19,7 +19,7 @@ class SparseGP(GP):

     This model allows (approximate) inference using variational DTC or FITC
     (Gaussian likelihoods) as well as non-conjugate sparse methods based on these.
-
+
     This is not for missing data, as the implementation for missing data involves
     some inefficient optimization routine decisions.
     See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
@@ -39,7 +39,7 @@ class SparseGP(GP):

     """

-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False):
         #pick a sensible inference method
         if inference_method is None:
@@ -53,7 +53,7 @@ class SparseGP(GP):
         self.Z = Param('inducing inputs', Z)
         self.num_inducing = Z.shape[0]

-        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+        GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)

         logger.info("Adding Z as parameter")
         self.link_parameter(self.Z, index=0)
@@ -61,7 +61,7 @@
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)
-
+
     def set_Z(self, Z, trigger_update=True):
         if trigger_update: self.update_model(False)
         self.unlink_parameter(self.Z)
@@ -110,8 +110,8 @@ class SparseGP(GP):

     def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
-        Make a prediction for the latent function values.
-
+        Make a prediction for the latent function values.
+
         For certain inputs we give back a full_cov of shape NxN,
         if there is missing data, each dimension has its own full_cov of shape NxNxD,
         and if full_cov is off, we take only the diagonal elements across N.
@@ -136,6 +136,9 @@ class SparseGP(GP):
             else:
                 Kxx = kern.Kdiag(Xnew)
                 var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+            #add in the mean function
+            if self.mean_function is not None:
+                mu += self.mean_function.f(Xnew)
         else:
             psi0_star = self.kern.psi0(self.Z, Xnew)
             psi1_star = self.kern.psi1(self.Z, Xnew)
@@ -165,4 +168,5 @@
                     var[i] = var_
                 else:
                     var[i] = np.diag(var_)+p0-t2
+
         return mu, var
diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py
index 1966dbef..7783f3b1 100644
--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf

 class SVGP(SparseGP):

-    def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
         """
         Stochastic Variational GP.
@@ -38,7 +38,7 @@ class SVGP(SparseGP):
 
         #create the SVI inference method
         inf_method = svgp_inf()
 
-        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
+        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
                 name=name, Y_metadata=Y_metadata, normalizer=False)
 
         self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@@ -48,7 +48,7 @@ class SVGP(SparseGP):
         self.link_parameter(self.m)
 
     def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
 
         #update the kernel gradients
         self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
@@ -65,6 +65,13 @@ class SVGP(SparseGP):
         self.m.gradient = self.grad_dict['dL_dm']
         self.chol.gradient = self.grad_dict['dL_dchol']
 
+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
+            g = self.mean_function.gradient[:].copy()
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
+            self.mean_function.gradient[:] += g
+            self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
+
     def set_data(self, X, Y):
         """
         Set the data without calling parameters_changed to avoid wasted computation
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 34f0b3bb..a1d42c74 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -15,7 +15,7 @@ class Posterior(object):
     the function at any new point x_* by integrating over this posterior.
     """
-    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
+    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
         """
         woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
         woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@@ -67,6 +67,7 @@ class Posterior(object):
         #option 2:
         self._mean = mean
         self._covariance = cov
+        self._prior_mean = prior_mean
 
         #compute this lazily
         self._precision = None
@@ -175,7 +176,7 @@ class Posterior(object):
         $$
         """
         if self._woodbury_vector is None:
-            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
+            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
         return self._woodbury_vector
 
     @property
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
index 48763426..da003793 100644
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -7,7 +7,7 @@ from posterior import Posterior
 
 class SVGP(LatentFunctionInference):
     def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
-        assert mean_function is None, "inference with a mean function not implemented"
+
         num_inducing = Z.shape[0]
         num_data, num_outputs = Y.shape
 
@@ -23,6 +23,15 @@ class SVGP(LatentFunctionInference):
         #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
         #Si, Lnew, _,_ = linalg.pdinv(S)
 
+        #compute mean function stuff
+        if mean_function is not None:
+            prior_mean_u = mean_function.f(Z)
+            prior_mean_f = mean_function.f(X)
+        else:
+            prior_mean_u = np.zeros((num_inducing, num_outputs))
+            prior_mean_f = np.zeros((num_data, num_outputs))
+
+
         #compute kernel related stuff
         Kmm = kern.K(Z)
         Knm = kern.K(X, Z)
@@ -31,17 +40,31 @@ class SVGP(LatentFunctionInference):
 
         #compute the marginal means and variances of q(f)
         A = np.dot(Knm, Kmmi)
-        mu = np.dot(A, q_u_mean)
+        mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
         v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
 
         #compute the KL term
         Kmmim = np.dot(Kmmi, q_u_mean)
         KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
         KL = KLs.sum()
-        dKL_dm = Kmmim
+
+        #gradient of the KL term (assuming zero mean function)
+        dKL_dm = Kmmim.copy()
         dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
         dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
 
+        if mean_function is not None:
+            #adjust KL term for mean function
+            Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
+            KL += -np.sum(q_u_mean*Kmmi_mfZ)
+            KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
+
+            #adjust gradient for mean function
+            dKL_dm -= Kmmi_mfZ
+            dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
+            dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
+
+            #compute gradients for mean_function
+            dKL_dmfZ = Kmmi_mfZ - Kmmim
 
         #quadrature for the likelihood
         F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
@@ -51,11 +74,9 @@ class SVGP(LatentFunctionInference):
         if dF_dthetaL is not None:
             dF_dthetaL = dF_dthetaL.sum(1)*batch_scale
 
-        #derivatives of expected likelihood
+        #derivatives of expected likelihood, assuming zero mean function
         Adv = A.T[:,:,None]*dF_dv[None,:,:] # as if dF_dv is diagonal
         Admu = A.T.dot(dF_dmu)
-
         #AdvA = np.einsum('ijk,jl->ilk', Adv, A)
-        #AdvA = np.dot(A.T, Adv).swapaxes(0,1)
         AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
         tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
         dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@@ -65,6 +86,14 @@ class SVGP(LatentFunctionInference):
         dF_dm = Admu
         dF_dS = AdvA
 
+        #adjust gradient to account for mean function
+        if mean_function is not None:
+            dF_dmfX = dF_dmu.copy()
+            dF_dmfZ = -Admu
+            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
+            dF_dKmm += Admu.dot(Kmmi_mfZ.T)
+
+
         #sum (gradients of) expected likelihood and KL part
         log_marginal = F.sum() - KL
         dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
 
@@ -72,4 +101,8 @@ class SVGP(LatentFunctionInference):
         dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
         dL_dchol = choleskies.triang_to_flat(dL_dchol)
 
-        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        if mean_function is not None:
+            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
+            grad_dict['dL_dmfX'] = dF_dmfX
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
diff --git a/GPy/testing/svgp_tests.py b/GPy/testing/svgp_tests.py
index 6dc0fa56..beb9c00d 100644
--- a/GPy/testing/svgp_tests.py
+++ b/GPy/testing/svgp_tests.py
@@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase):
         self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
     def test_grad(self):
         assert self.m.checkgrad(step=1e-4)
+
+class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
+    """
+    Inference in the SVGP with a Poisson likelihood and a linear mean function
+    """
+    def setUp(self):
+        X = np.linspace(0,10,100).reshape(-1,1)
+        Z = np.linspace(0,10,10).reshape(-1,1)
+        latent_f = np.exp(0.1*X * 0.05*X**2)
+        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
+
+        mf = GPy.mappings.Linear(1,1)
+
+        lik = GPy.likelihoods.Poisson()
+        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
+    def test_grad(self):
+        assert self.m.checkgrad(step=1e-4)
+
+

From 09c8d5a56769c974e9d6c417d0332237a23c9182 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Wed, 1 Apr 2015 09:14:03 +0100
Subject: [PATCH 22/22] whitespace

---
 GPy/mappings/mlp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py
index f0fe21e5..4afc2fa1 100644
--- a/GPy/mappings/mlp.py
+++ b/GPy/mappings/mlp.py
@@ -48,7 +48,7 @@ class MLP(Mapping):
 
         # Backpropagation to hidden layer.
         dL_dact = np.dot(dL_dF, self.W2.T)
         dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
-
+
         return np.dot(dL_dlayer1, self.W1.T)
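
With [PATCH 21/22] applied, an SVGP can carry a parametric prior mean whose gradients
flow through both the KL term and the expected likelihood; [PATCH 22/22] is purely
cosmetic. For orientation, a minimal usage sketch of the new mean_function argument
follows, mirroring the test added in GPy/testing/svgp_tests.py. The toy data, the
quadratic log-rate, and the optimizer settings are illustrative assumptions of this
sketch, not part of the patches:

    import numpy as np
    import GPy

    # toy count data whose log-rate grows roughly quadratically in x
    X = np.linspace(0, 10, 100).reshape(-1, 1)
    Z = np.linspace(0, 10, 10).reshape(-1, 1)   # inducing inputs
    rate = np.exp(0.1 * X + 0.05 * X**2)
    Y = np.random.poisson(rate)

    # a linear mapping acts as the parametric prior mean m(x); [PATCH 21/22]
    # threads it from SVGP.__init__ through SparseGP into the SVGP inference
    mf = GPy.mappings.Linear(1, 1)
    lik = GPy.likelihoods.Poisson()
    kern = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)

    m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=kern, mean_function=mf)
    assert m.checkgrad(step=1e-4)   # exercises dL_dmfX, dL_dmfZ and the KL adjustment
    m.optimize(max_iters=200)

A numerical gradient check is the right smoke test here, because the mean function
touches three separate code paths: the KL adjustment for a non-zero prior mean in the
SVGP inference, the new dL_dmfX/dL_dmfZ entries in grad_dict, and the prior_mean
handling in Posterior.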