diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index fc76ad68..ad082b3c 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -205,7 +205,7 @@ class GP(Model):
         if kern is None:
             kern = self.kern
 
-        Kx = kern.K(self.X, Xnew)
+        Kx = kern.K(self._predictive_variable, Xnew)
         mu = np.dot(Kx.T, self.posterior.woodbury_vector)
         if len(mu.shape)==1:
             mu = mu.reshape(-1,1)
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index e227625d..9d2d6068 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -49,7 +49,7 @@ class SparseGP(GP):
             else:
                 #inference_method = ??
                 raise NotImplementedError("what to do what to do?")
-            print("defaulting to ", inference_method, "for latent function inference")
+            print("defaulting to {} for latent function inference".format(inference_method))
 
         self.Z = Param('inducing inputs', Z)
         self.num_inducing = Z.shape[0]
@@ -128,29 +128,8 @@ class SparseGP(GP):
         if kern is None: kern = self.kern
 
         if not isinstance(Xnew, VariationalPosterior):
-            Kx = kern.K(self._predictive_variable, Xnew)
-            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
-            if full_cov:
-                Kxx = kern.K(Xnew)
-                if self.posterior.woodbury_inv.ndim == 2:
-                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
-                    for i in range(var.shape[2]):
-                        var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
-                var = var
-            else:
-                Kxx = kern.Kdiag(Xnew)
-                if self.posterior.woodbury_inv.ndim == 2:
-                    var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
-                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
-                    for i in range(var.shape[1]):
-                        var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
-                var = var
-            #add in the mean function
-            if self.mean_function is not None:
-                mu += self.mean_function.f(Xnew)
+            # Xnew is not a VariationalPosterior: reuse the base-class _raw_predict rather than duplicating it here.
+            mu, var = super(SparseGP, self)._raw_predict(Xnew, full_cov, kern)
         else:
             psi0_star = kern.psi0(self._predictive_variable, Xnew)
             psi1_star = kern.psi1(self._predictive_variable, Xnew)
@@ -159,7 +138,7 @@ class SparseGP(GP):
             mu = np.dot(psi1_star, la) # TODO: dimensions?
 
             if full_cov:
-                raise NotImplementedError, "Full covariance for Sparse GP predicted with uncertain inputs not implemented yet."
+                raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
                 var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
                 di = np.diag_indices(la.shape[1])
             else:
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 00a2c2b0..2f089141 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -171,7 +171,5 @@ class Laplace(LatentFunctionInference):
         #define the objective function (to be maximised)
         def obj(Ki_f, f):
             ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
-            print ll
             if np.isnan(ll):
-                import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
                 return -np.inf
diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py
index 0fc488a2..902c4290 100644
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@@ -40,7 +40,7 @@ class SparseGPMissing(StochasticStorage):
         bdict = {}
         #For N > 1000 array2string default crops
         opt = np.get_printoptions()
-        np.set_printoptions(threshold='nan')
+        np.set_printoptions(threshold=np.inf)
         for d in range(self.Y.shape[1]):
             inan = np.isnan(self.Y)[:, d]
             arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
@@ -74,7 +74,7 @@ class SparseGPStochastics(StochasticStorage):
             bdict = {}
             if self.missing_data:
                 opt = np.get_printoptions()
-                np.set_printoptions(threshold='nan')
+                np.set_printoptions(threshold=np.inf)
                 for d in self.d:
                     inan = np.isnan(self.Y[:, d])
                     arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'})
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 424a7f5a..e1299f73 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -48,7 +48,7 @@ class Gaussian(Likelihood):
 
     def betaY(self,Y,Y_metadata=None):
         #TODO: ~Ricardo this does not live here
-        raise RuntimeError, "Please notify the GPy developers, this should not happen"
+        raise RuntimeError("Please notify the GPy developers, this should not happen")
         return Y/self.gaussian_variance(Y_metadata)
 
     def gaussian_variance(self, Y_metadata=None):
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
index 3d753395..30ad32ad 100644
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@@ -67,7 +67,7 @@ class Probit(GPTransformation):
     .. math::
 
         g(f) = \\Phi^{-1} (mu)
-    
+
     """
     def transf(self,f):
         return std_norm_cdf(f)
diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py
index 7266ae92..157c0dc8 100644
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@@ -26,12 +26,12 @@ class GPRegression(GP):
 
     """
 
-    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1.):
+    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1., mean_function=None):
 
         if kernel is None:
             kernel = kern.RBF(X.shape[1])
-	
+
         likelihood = likelihoods.Gaussian(variance=noise_var)
 
-        super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer)
+        super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)
 
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 87ffd740..3a5a01d2 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 from . import Tango
-from base_plots import gpplot, x_frame1D, x_frame2D,gperrors
+from .base_plots import gpplot, x_frame1D, x_frame2D,gperrors
 from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
 from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from scipy import sparse
@@ -186,8 +186,6 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         #optionally plot some samples
         if samples: #NOTE not tested with fixed_inputs
             Ysim = model.posterior_samples(Xgrid, samples, Y_metadata=Y_metadata)
-            print Ysim.shape
-            print Xnew.shape
             for yi in Ysim.T:
                 plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], '#3300FF', linewidth=0.25)
                 #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
diff --git a/GPy/testing/cacher_tests.py b/GPy/testing/cacher_tests.py
new file mode 100644
index 00000000..60f79ba2
--- /dev/null
+++ b/GPy/testing/cacher_tests.py
@@ -0,0 +1,44 @@
+'''
+Created on 4 Sep 2015
+
+@author: maxz
+'''
+import unittest
+from GPy.util.caching import Cacher
+from pickle import PickleError
+
+
+class Test(unittest.TestCase):
+    """Unit tests for pickling, copying, resetting and naming of a Cacher."""
+
+    def setUp(self):
+        # Wrap the identity function in a Cacher shared by every test.
+        def op(x):
+            return x
+        self.cache = Cacher(op, 1)
+
+    def test_pickling(self):
+        """__getstate__ and __setstate__ must both raise PickleError."""
+        self.assertRaises(PickleError, self.cache.__getstate__)
+        self.assertRaises(PickleError, self.cache.__setstate__)
+
+    def test_copy(self):
+        """__deepcopy__ must share the wrapped operation and keep the limit."""
+        tmp = self.cache.__deepcopy__()
+        self.assertIs(tmp.operation, self.cache.operation)
+        self.assertEqual(tmp.limit, self.cache.limit)
+
+    def test_reset(self):
+        """reset() must leave the three bookkeeping dictionaries empty."""
+        self.cache.reset()
+        self.assertDictEqual(self.cache.cached_input_ids, {})
+        self.assertDictEqual(self.cache.cached_outputs, {})
+        self.assertDictEqual(self.cache.inputs_changed, {})
+
+    def test_name(self):
+        """The cacher must expose the __name__ of the wrapped operation."""
+        self.assertEqual(self.cache.__name__, self.cache.operation.__name__)
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
diff --git a/GPy/testing/cython_tests.py b/GPy/testing/cython_tests.py
index 30e27fbb..8cdb08be 100644
--- a/GPy/testing/cython_tests.py
+++ b/GPy/testing/cython_tests.py
@@ -6,7 +6,7 @@ from ..util.config import config
 import unittest
 
 try:
-    from . import linalg_cython
+    from ..util import linalg_cython
     config.set('cython', 'working', 'True')
 except ImportError:
     config.set('cython', 'working', 'False')
diff --git a/GPy/testing/gp_tests.py b/GPy/testing/gp_tests.py
new file mode 100644
index 00000000..07aa31a3
--- /dev/null
+++ b/GPy/testing/gp_tests.py
@@ -0,0 +1,106 @@
+'''
+Created on 4 Sep 2015
+
+@author: maxz
+'''
+import unittest
+import numpy as np, GPy
+from GPy.core.parameterization.variational import NormalPosterior
+
+class Test(unittest.TestCase):
+    """Tests for set_XY round trips and for GP regression with a mean function."""
+
+    def setUp(self):
+        # Fixed seed so the sampled data are the same on every run.
+        np.random.seed(12345)
+        self.N = 20
+        self.N_new = 50
+        self.D = 1
+        self.X = np.random.uniform(-3., 3., (self.N, 1))
+        self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
+        self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
+
+    def test_setxy_bgplvm(self):
+        """BayesianGPLVM: set_XY to the first 10 points and back keeps predictions."""
+        k = GPy.kern.RBF(1)
+        m = GPy.models.BayesianGPLVM(self.Y, 2, kernel=k)
+        mu, var = m.predict(m.X)
+        X = m.X.copy()
+        Xnew = NormalPosterior(m.X.mean[:10].copy(), m.X.variance[:10].copy())
+        m.set_XY(Xnew, m.Y[:10])
+        self.assertTrue(m.checkgrad())
+        m.set_XY(X, self.Y)
+        mu2, var2 = m.predict(m.X)
+        np.testing.assert_allclose(mu, mu2)
+        np.testing.assert_allclose(var, var2)
+
+    def test_setxy_gplvm(self):
+        """GPLVM: set_XY to the first 10 points and back keeps predictions."""
+        k = GPy.kern.RBF(1)
+        m = GPy.models.GPLVM(self.Y, 2, kernel=k)
+        mu, var = m.predict(m.X)
+        X = m.X.copy()
+        Xnew = X[:10].copy()
+        m.set_XY(Xnew, m.Y[:10])
+        self.assertTrue(m.checkgrad())
+        m.set_XY(X, self.Y)
+        mu2, var2 = m.predict(m.X)
+        np.testing.assert_allclose(mu, mu2)
+        np.testing.assert_allclose(var, var2)
+
+    def test_setxy_gp(self):
+        """GPRegression: set_XY to the first 10 points and back keeps predictions."""
+        k = GPy.kern.RBF(1)
+        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
+        mu, var = m.predict(m.X)
+        X = m.X.copy()
+        m.set_XY(m.X[:10], m.Y[:10])
+        self.assertTrue(m.checkgrad())
+        m.set_XY(X, self.Y)
+        mu2, var2 = m.predict(m.X)
+        np.testing.assert_allclose(mu, mu2)
+        np.testing.assert_allclose(var, var2)
+
+    def test_mean_function(self):
+        """GPRegression with a polynomial mean function passes checkgrad and predicts."""
+        from GPy.core.parameterization.param import Param
+        from GPy.core.mapping import Mapping
+        class Parabola(Mapping):
+            """Polynomial mean function; `variance` holds the degree+1 coefficients."""
+            def __init__(self, variance, degree=2, name='parabola'):
+                super(Parabola, self).__init__(1, 1, name)
+                self.variance = Param('variance', np.ones(degree+1) * variance)
+                self.degree = degree
+                self.link_parameter(self.variance)
+
+            def f(self, X):
+                p = self.variance[0] * np.ones(X.shape)
+                for i in range(1, self.degree+1):
+                    p += self.variance[i] * X**(i)
+                return p
+
+            def gradients_X(self, dL_dF, X):
+                # NOTE(review): dL_dF is not used here -- confirm this is intended.
+                grad = np.zeros(X.shape)
+                for i in range(1, self.degree+1):
+                    grad += (i) * self.variance[i] * X**(i-1)
+                return grad
+
+            def update_gradients(self, dL_dF, X):
+                for i in range(self.degree+1):
+                    self.variance.gradient[i] = (dL_dF * X**(i)).sum(0)
+        X = np.linspace(-2, 2, 100)[:, None]
+        k = GPy.kern.RBF(1)
+        k.randomize()
+        p = Parabola(.3)
+        p.randomize()
+        Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X))[:,None] + np.random.normal(0, .1, (X.shape[0], 1))
+        m = GPy.models.GPRegression(X, Y, mean_function=p)
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+        # Smoke test: predicting with a mean function attached must not raise.
+        m.predict(m.X)
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index ec005b6c..50a5aed8 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -11,7 +11,7 @@ from ..util.config import config
 verbose = 0
 
 try:
-    from . import linalg_cython
+    from ..util import linalg_cython
     config.set('cython', 'working', 'True')
 except ImportError:
     config.set('cython', 'working', 'False')
diff --git a/GPy/testing/misc_tests.py b/GPy/testing/misc_tests.py
index caf98874..a0e2d949 100644
--- a/GPy/testing/misc_tests.py
+++ b/GPy/testing/misc_tests.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import numpy as np
 import scipy as sp
 import GPy
@@ -18,8 +19,8 @@ class MiscTests(np.testing.TestCase):
             assert np.isinf(np.exp(self._lim_val_exp + 1))
             assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))
 
-            print w
-            print len(w)
+            print(w)
+            print(len(w))
             assert len(w)==1 # should have one overflow warning
 
     def test_safe_exp_lower(self):
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 648e1174..75165c0e 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -55,13 +55,44 @@ class MiscTests(unittest.TestCase):
         np.testing.assert_allclose(mu1, (mu2*std)+mu)
         np.testing.assert_allclose(var1, var2)
 
+        q50n = m.predict_quantiles(m.X, (50,))
+        q50 = m2.predict_quantiles(m2.X, (50,))
+        np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu)
+
+    def check_jacobian(self):
+        """Compare RBF gradients_X / gradients_XX with autograd derivatives of a hand-written RBF."""
+        try:
+            import autograd.numpy as np, autograd as ag, GPy
+        except ImportError:
+            self.skipTest("autograd not available to check gradients")
+        def k(X, X2, alpha=1., lengthscale=None):
+            if lengthscale is None:
+                lengthscale = np.ones(X.shape[1])
+            exp = 0.
+            for q in range(X.shape[1]):
+                exp += ((X[:, [q]] - X2[:, [q]].T)/lengthscale[q])**2
+            return alpha * np.exp(-.5*exp)
+        # ke is defined below; the lambda only looks it up when dk / dkdk are called.
+        dk = ag.elementwise_grad(lambda x, x2: k(x, x2, alpha=ke.variance.values, lengthscale=ke.lengthscale.values))
+        dkdk = ag.elementwise_grad(dk, argnum=1)
+
+        ke = GPy.kern.RBF(1, ARD=True)
+        ke.variance = .2#.randomize()
+        ke.lengthscale[:] = .5
+        ke.randomize()
+        X = np.linspace(-1, 1, 1000)[:,None]
+        X2 = np.array([[0.]]).T
+        np.testing.assert_allclose(ke.gradients_X([[1.]], X, X), dk(X, X))
+        np.testing.assert_allclose(ke.gradients_XX([[1.]], X, X).sum(0), dkdk(X, X))
+        np.testing.assert_allclose(ke.gradients_X([[1.]], X, X2), dk(X, X2))
+        np.testing.assert_allclose(ke.gradients_XX([[1.]], X, X2).sum(0), dkdk(X, X2))
+
 
     def test_sparse_raw_predict(self):
         k = GPy.kern.RBF(1)
         m = GPy.models.SparseGPRegression(self.X, self.Y, kernel=k)
         m.randomize()
         Z = m.Z[:]
-        X = self.X[:]
 
         # Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
         Kinv = m.posterior.woodbury_inv
@@ -147,11 +178,24 @@ class MiscTests(unittest.TestCase):
         m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
                           kernel=k, missing_data=True)
         assert(m.checkgrad())
+        mul, varl = m.predict(m.X)
 
         k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
-        m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
+        m2 = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
                           kernel=k, missing_data=True)
         assert(m.checkgrad())
+        m2.kern.rbf.lengthscale[:] = 1e6
+        m2.X[:] = m.X.param_array
+        m2.likelihood[:] = m.likelihood[:]
+        m2.kern.white[:] = m.kern.white[:]
+        mu, var = m.predict(m.X)
+        np.testing.assert_allclose(mul, mu)
+        np.testing.assert_allclose(varl, var)
+
+        q50 = m.predict_quantiles(m.X, (50,))
+        np.testing.assert_allclose(mul, q50[0])
+
+
 
     def test_likelihood_replicate_kern(self):
         m = GPy.models.GPRegression(self.X, self.Y)
diff --git a/GPy/testing/run_coverage.sh b/GPy/testing/run_coverage.sh
index 6b6e8cb2..f2e52230 100755
--- a/GPy/testing/run_coverage.sh
+++ b/GPy/testing/run_coverage.sh
@@ -1 +1 @@
-nosetests . --with-coverage --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase
+nosetests . --with-coverage --logging-level=INFO --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase