diff --git a/.travis.yml b/.travis.yml
index 093fc49a..a866cb5a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,7 +17,7 @@ before_install:
   - sudo ln -s /run/shm /dev/shm
 
 install:
-  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.9 scipy=0.16 matplotlib nose sphinx pip nose
   #- pip install . 
   - python setup.py build_ext --inplace
   #--use-mirrors
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 12fb3d27..ddef0647 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -183,7 +183,7 @@ class GP(Model):
         """
         return self._log_marginal_likelihood
 
-    def _raw_predict(self, _Xnew, full_cov=False, kern=None):
+    def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
         For making predictions, does not account for normalization or likelihood
 
@@ -199,24 +199,30 @@ class GP(Model):
         if kern is None:
             kern = self.kern
 
-        Kx = kern.K(_Xnew, self.X).T
-        WiKx = np.dot(self.posterior.woodbury_inv, Kx)
+        Kx = kern.K(self.X, Xnew)
         mu = np.dot(Kx.T, self.posterior.woodbury_vector)
         if full_cov:
-            Kxx = kern.K(_Xnew)
-            var = Kxx - np.dot(Kx.T, WiKx)
+            Kxx = kern.K(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
+            elif self.posterior.woodbury_inv.ndim == 3:
+                var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
+                for i in range(var.shape[2]):
+                    var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
+            var = var
         else:
-            Kxx = kern.Kdiag(_Xnew)
-            var = Kxx - np.sum(WiKx*Kx, 0)
-            var = var.reshape(-1, 1)
-            var[var<0.] = 0.
+            Kxx = kern.Kdiag(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
+            elif self.posterior.woodbury_inv.ndim == 3:
+                var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
+                for i in range(var.shape[1]):
+                    var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+            var = var
+        #add in the mean function
+        if self.mean_function is not None:
+            mu += self.mean_function.f(Xnew)
 
-        #force mu to be a column vector
-        if len(mu.shape)==1: mu = mu[:,None]
-
-        #add the mean function in
-        if not self.mean_function is None:
-            mu += self.mean_function.f(_Xnew)
         return mu, var
 
     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@@ -249,7 +255,7 @@ class GP(Model):
         mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
         return mean, var
 
-    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None):
         """
         Get the predictive quantiles around the prediction at X
 
@@ -257,10 +263,12 @@ class GP(Model):
         :type X: np.ndarray (Xnew x self.input_dim)
         :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
         :type quantiles: tuple
+        :param kern: optional kernel to use for prediction
+        :type predict_kw: dict
         :returns: list of quantiles for each X and predictive quantiles for interval combination
         :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
         """
-        m, v = self._raw_predict(X,  full_cov=False)
+        m, v = self._raw_predict(X,  full_cov=False, kern=kern)
         if self.normalizer is not None:
             m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index f927aba4..7bdd6ff2 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -69,7 +69,7 @@ from .expectation_propagation_dtc import EPDTC
 from .dtc import DTC
 from .fitc import FITC
 from .var_dtc_parallel import VarDTC_minibatch
-#from .svgp import SVGP
+from .var_gauss import VarGauss
 
 # class FullLatentFunctionData(object):
 #
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 902d5fff..00a2c2b0 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -171,7 +171,9 @@ class Laplace(LatentFunctionInference):
         #define the objective function (to be maximised)
         def obj(Ki_f, f):
             ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+            print ll
             if np.isnan(ll):
+                import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
                 return -np.inf
             else:
                 return ll
diff --git a/GPy/inference/latent_function_inference/var_gauss.py b/GPy/inference/latent_function_inference/var_gauss.py
new file mode 100644
index 00000000..e71416b4
--- /dev/null
+++ b/GPy/inference/latent_function_inference/var_gauss.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2015, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as np
+from ...util.linalg import pdinv
+from .posterior import Posterior
+from . import LatentFunctionInference
+log_2_pi = np.log(2*np.pi)
+
+class VarGauss(LatentFunctionInference):
+    """
+    The Variational Gaussian Approximation revisited
+
+    @article{Opper:2009,
+        title = {The Variational Gaussian Approximation Revisited},
+        author = {Opper, Manfred and Archambeau, C{\'e}dric},
+        journal = {Neural Comput.},
+        year = {2009},
+        pages = {786--792},
+    }
+    """
+    def __init__(self, alpha, beta):
+        """
+        :param alpha: GPy.core.Param varational parameter
+        :param beta: GPy.core.Param varational parameter
+        """
+        self.alpha, self.beta = alpha, beta
+
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+        if mean_function is not None:
+            raise NotImplementedError
+        num_data, output_dim = Y.shape
+        assert output_dim ==1, "Only one output supported"
+
+        K = kern.K(X)
+        m = K.dot(self.alpha)
+        KB = K*self.beta[:, None]
+        BKB = KB*self.beta[None, :]
+        A = np.eye(num_data) + BKB
+        Ai, LA, _, Alogdet = pdinv(A)
+        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior coavairance: need full matrix for gradients
+        var = np.diag(Sigma).reshape(-1,1)
+
+        F, dF_dm, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, m, var, Y_metadata=Y_metadata)
+        if dF_dthetaL is not None:
+            dL_dthetaL = dF_dthetaL.sum(1).sum(1)
+        else:
+            dL_dthetaL = np.array([])
+        dF_da = np.dot(K, dF_dm)
+        SigmaB = Sigma*self.beta
+        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
+        KL = 0.5*(Alogdet + np.trace(Ai) - num_data + np.sum(m*self.alpha))
+        dKL_da = m
+        A_A2 = Ai - Ai.dot(Ai)
+        dKL_db = np.diag(np.dot(KB.T, A_A2))
+        log_marginal = F.sum() - KL
+        self.alpha.gradient = dF_da - dKL_da
+        self.beta.gradient = dF_db - dKL_db
+
+        # K-gradients
+        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
+        tmp = Ai*self.beta[:, None]/self.beta[None, :]
+        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
+
+        return Posterior(mean=m, cov=Sigma ,K=K),\
+               log_marginal,\
+               {'dL_dK':dF_dK-dKL_dK, 'dL_dthetaL':dL_dthetaL}
diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py
index 3d71c99f..b8e1e139 100644
--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@@ -94,7 +94,7 @@ class Coregionalize(Kern):
             dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
 
 
-        dkappa = np.diag(dL_dK_small)
+        dkappa = np.diag(dL_dK_small).copy()
         dL_dK_small += dL_dK_small.T
         dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
 
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 3a7d32a5..167daee8 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -85,6 +85,7 @@ class Bernoulli(Likelihood):
                 gh_x, gh_w = gh_points
 
 
+            gh_w = gh_w / np.sqrt(np.pi)
             shape = m.shape
             m,v,Y = m.flatten(), v.flatten(), Y.flatten()
             Ysign = np.where(Y==1,1,-1)
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 9abb8cde..86aad330 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -315,6 +315,9 @@ class Gaussian(Likelihood):
         return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v
 
     def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
+        if not isinstance(self.gp_link, link_functions.Identity):
+            return super(Gaussian, self).variational_expectations(Y=Y, m=m, v=v, gh_points=gh_points, Y_metadata=Y_metadata)
+
         lik_var = float(self.variance)
         F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
         dF_dmu = (Y - m)/lik_var
diff --git a/GPy/models/gp_var_gauss.py b/GPy/models/gp_var_gauss.py
index 729b6bb8..ca68ed14 100644
--- a/GPy/models/gp_var_gauss.py
+++ b/GPy/models/gp_var_gauss.py
@@ -2,19 +2,16 @@
 # Distributed under the terms of the GNU General public License, see LICENSE.txt
 
 import numpy as np
-from scipy import stats
-from scipy.special import erf
-from ..core.model import Model
+from ..core import GP
 from ..core.parameterization import ObsAr
 from .. import kern
 from ..core.parameterization.param import Param
-from ..util.linalg import pdinv
-from ..likelihoods import Gaussian
+from ..inference.latent_function_inference import VarGauss
 
 log_2_pi = np.log(2*np.pi)
 
 
-class GPVariationalGaussianApproximation(Model):
+class GPVariationalGaussianApproximation(GP):
     """
     The Variational Gaussian Approximation revisited
 
@@ -26,70 +23,15 @@ class GPVariationalGaussianApproximation(Model):
         pages = {786--792},
     }
     """
-    def __init__(self, X, Y, kernel, likelihood=None, Y_metadata=None):
-        Model.__init__(self,'Variational GP')
-        if likelihood is None:
-            likelihood = Gaussian()
-        # accept the construction arguments
-        self.X = ObsAr(X)
-        self.Y = Y
-        self.num_data, self.input_dim = self.X.shape
-        self.Y_metadata = Y_metadata
+    def __init__(self, X, Y, kernel, likelihood, Y_metadata=None):
 
-        self.kern = kernel
-        self.likelihood = likelihood
-        self.link_parameter(self.kern)
-        self.link_parameter(self.likelihood)
+        num_data = Y.shape[0]
+        self.alpha = Param('alpha', np.zeros((num_data,1))) # only one latent fn for now.
+        self.beta = Param('beta', np.ones(num_data))
+
+        inf = VarGauss(self.alpha, self.beta)
+        super(GPVariationalGaussianApproximation, self).__init__(X, Y, kernel, likelihood, name='VarGP', inference_method=inf)
 
-        self.alpha = Param('alpha', np.zeros((self.num_data,1))) # only one latent fn for now.
-        self.beta = Param('beta', np.ones(self.num_data))
         self.link_parameter(self.alpha)
         self.link_parameter(self.beta)
 
-    def log_likelihood(self):
-        return self._log_lik
-
-    def parameters_changed(self):
-        K = self.kern.K(self.X)
-        m = K.dot(self.alpha)
-        KB = K*self.beta[:, None]
-        BKB = KB*self.beta[None, :]
-        A = np.eye(self.num_data) + BKB
-        Ai, LA, _, Alogdet = pdinv(A)
-        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior coavairance: need full matrix for gradients
-        var = np.diag(Sigma).reshape(-1,1)
-
-        F, dF_dm, dF_dv, dF_dthetaL = self.likelihood.variational_expectations(self.Y, m, var, Y_metadata=self.Y_metadata)
-        self.likelihood.gradient = dF_dthetaL.sum(1).sum(1)
-        dF_da = np.dot(K, dF_dm)
-        SigmaB = Sigma*self.beta
-        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
-        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + np.sum(m*self.alpha))
-        dKL_da = m
-        A_A2 = Ai - Ai.dot(Ai)
-        dKL_db = np.diag(np.dot(KB.T, A_A2))
-        self._log_lik = F.sum() - KL
-        self.alpha.gradient = dF_da - dKL_da
-        self.beta.gradient = dF_db - dKL_db
-
-        # K-gradients
-        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
-        tmp = Ai*self.beta[:, None]/self.beta[None, :]
-        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
-        self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
-
-    def _raw_predict(self, Xnew):
-        """
-        Predict the function(s) at the new point(s) Xnew.
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        """
-        Wi, _, _, _ = pdinv(self.kern.K(self.X) + np.diag(self.beta**-2))
-        Kux = self.kern.K(self.X, Xnew)
-        mu = np.dot(Kux.T, self.alpha)
-        WiKux = np.dot(Wi, Kux)
-        Kxx = self.kern.Kdiag(Xnew)
-        var = Kxx - np.sum(WiKux*Kux, 0)
-
-        return mu, var.reshape(-1,1)
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 78c80cb5..d0f6c952 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -110,7 +110,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
                 else:
                     Y_metadata['output_index'] = extra_data
             m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
-            lower, upper = model.predict_quantiles(Xgrid, Y_metadata=Y_metadata)
+            fmu, fv = model._raw_predict(Xgrid, full_cov=False, **predict_kw)
+            lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=Y_metadata)
 
 
         for d in which_data_ycols:
diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py
index cd85235d..c1fce8b9 100644
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@@ -8,7 +8,7 @@ The test cases for various inference algorithms
 import unittest, itertools
 import numpy as np
 import GPy
-
+#np.seterr(invalid='raise')
 
 class InferenceXTestCase(unittest.TestCase):
 
diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py
index 27b27892..79bbdc36 100644
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@@ -9,8 +9,7 @@ import inspect
 from GPy.likelihoods import link_functions
 from GPy.core.parameterization import Param
 from functools import partial
-#np.random.seed(300)
-#np.random.seed(4)
+fixed_seed = 7
 
 #np.seterr(divide='raise')
 def dparam_partial(inst_func, *args):
@@ -105,6 +104,7 @@ class TestNoiseModels(object):
     Generic model checker
     """
     def setUp(self):
+        np.random.seed(fixed_seed)
         self.N = 15
         self.D = 3
         self.X = np.random.rand(self.N, self.D)*10
@@ -218,7 +218,8 @@ class TestNoiseModels(object):
                     "constraints": [(".*variance", self.constrain_positive)]
                 },
                 "laplace": True,
-                "ep": False # FIXME: Should be True when we have it working again
+                "ep": False, # FIXME: Should be True when we have it working again
+                "variational_expectations": True,
             },
             "Gaussian_log": {
                 "model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var),
@@ -227,7 +228,8 @@ class TestNoiseModels(object):
                     "vals": [self.var],
                     "constraints": [(".*variance", self.constrain_positive)]
                 },
-                "laplace": True
+                "laplace": True,
+                "variational_expectations": True
             },
             #"Gaussian_probit": {
             #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
@@ -252,7 +254,8 @@ class TestNoiseModels(object):
                 "link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)],
                 "laplace": True,
                 "Y": self.binary_Y,
-                "ep": False # FIXME: Should be True when we have it working again
+                "ep": False, # FIXME: Should be True when we have it working again
+                "variational_expectations": True
             },
             "Exponential_default": {
                 "model": GPy.likelihoods.Exponential(),
@@ -347,6 +350,10 @@ class TestNoiseModels(object):
                 ep = attributes["ep"]
             else:
                 ep = False
+            if "variational_expectations" in attributes:
+                var_exp = attributes["variational_expectations"]
+            else:
+                var_exp = False
 
             #if len(param_vals) > 1:
                 #raise NotImplementedError("Cannot support multiple params in likelihood yet!")
@@ -377,6 +384,11 @@ class TestNoiseModels(object):
             if ep:
                 #ep likelihood gradcheck
                 yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
+            if var_exp:
+                #Need to specify mu and var!
+                yield self.t_varexp, model, Y, Y_metadata
+                yield self.t_dexp_dmu, model, Y, Y_metadata
+                yield self.t_dexp_dvar, model, Y, Y_metadata
 
 
         self.tearDown()
@@ -603,6 +615,87 @@ class TestNoiseModels(object):
         print(m)
         assert m.checkgrad(verbose=1, step=step)
 
+    ################
+    # variational expectations #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_varexp(self, model, Y, Y_metadata):
+        #Test that the analytic implementation (if it exists) matches the generic gauss
+        #hermite implementation
+        print("\n{}".format(inspect.stack()[0][3]))
+        #Make mu and var (marginal means and variances of q(f)) draws from a GP
+        k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
+        L = GPy.util.linalg.jitchol(k)
+        mu = L.dot(np.random.randn(*Y.shape))
+        #Variance must be positive
+        var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
+
+        expectation = model.variational_expectations(Y=Y, m=mu, v=var, gh_points=None, Y_metadata=Y_metadata)[0]
+
+        #Implementation of gauss hermite integration
+        shape = mu.shape
+        gh_x, gh_w= np.polynomial.hermite.hermgauss(50)
+        m,v,Y = mu.flatten(), var.flatten(), Y.flatten()
+        #make a grid of points
+        X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
+        #evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
+        # broadcast needs to be handled carefully.
+        logp = model.logpdf(X, Y[:,None], Y_metadata=Y_metadata)
+        #average over the gird to get derivatives of the Gaussian's parameters
+        #division by pi comes from fact that for each quadrature we need to scale by 1/sqrt(pi)
+        expectation_gh  = np.dot(logp, gh_w)/np.sqrt(np.pi)
+        expectation_gh = expectation_gh.reshape(*shape)
+
+        np.testing.assert_almost_equal(expectation, expectation_gh, decimal=5)
+
+    @with_setup(setUp, tearDown)
+    def t_dexp_dmu(self, model, Y, Y_metadata):
+        print("\n{}".format(inspect.stack()[0][3]))
+        #Make mu and var (marginal means and variances of q(f)) draws from a GP
+        k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
+        L = GPy.util.linalg.jitchol(k)
+        mu = L.dot(np.random.randn(*Y.shape))
+        #Variance must be positive
+        var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
+        expectation = functools.partial(model.variational_expectations, Y=Y, v=var, gh_points=None, Y_metadata=Y_metadata)
+
+        #Function to get the nth returned value
+        def F(mu):
+            return expectation(m=mu)[0]
+        def dmu(mu):
+            return expectation(m=mu)[1]
+
+        grad = GradientChecker(F, dmu, mu.copy(), 'm')
+
+        grad.randomize()
+        print(grad)
+        print(model)
+        assert grad.checkgrad(verbose=1)
+
+    @with_setup(setUp, tearDown)
+    def t_dexp_dvar(self, model, Y, Y_metadata):
+        print("\n{}".format(inspect.stack()[0][3]))
+        #Make mu and var (marginal means and variances of q(f)) draws from a GP
+        k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
+        L = GPy.util.linalg.jitchol(k)
+        mu = L.dot(np.random.randn(*Y.shape))
+        #Variance must be positive
+        var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
+        expectation = functools.partial(model.variational_expectations, Y=Y, m=mu, gh_points=None, Y_metadata=Y_metadata)
+
+        #Function to get the nth returned value
+        def F(var):
+            return expectation(v=var)[0]
+        def dvar(var):
+            return expectation(v=var)[2]
+
+        grad = GradientChecker(F, dvar, var.copy(), 'v')
+
+        self.constrain_positive('v', grad)
+        #grad.randomize()
+        print(grad)
+        print(model)
+        assert grad.checkgrad(verbose=1)
 
 class LaplaceTests(unittest.TestCase):
     """
@@ -610,6 +703,7 @@ class LaplaceTests(unittest.TestCase):
     """
 
     def setUp(self):
+        np.random.seed(fixed_seed)
         self.N = 15
         self.D = 1
         self.X = np.random.rand(self.N, self.D)*10
diff --git a/GPy/testing/misc_tests.py b/GPy/testing/misc_tests.py
index cbb74ca2..caf98874 100644
--- a/GPy/testing/misc_tests.py
+++ b/GPy/testing/misc_tests.py
@@ -13,10 +13,13 @@ class MiscTests(np.testing.TestCase):
 
     def test_safe_exp_upper(self):
         with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')  # always print
             assert np.isfinite(np.exp(self._lim_val_exp))
             assert np.isinf(np.exp(self._lim_val_exp + 1))
             assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))
 
+            print w
+            print len(w)
             assert len(w)==1 # should have one overflow warning
 
     def test_safe_exp_lower(self):
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index f7cacb13..06e197f8 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -509,7 +509,8 @@ class GradientTests(np.testing.TestCase):
         X = X[:, None]
         Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
         kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
-        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
+        lik = GPy.likelihoods.Gaussian()
+        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kernel=kern, likelihood=lik)
         self.assertTrue(m.checkgrad())