merging last master

This commit is contained in:
beckdaniel 2015-09-17 14:43:00 +01:00
commit 1a02c65a61
133 changed files with 13282 additions and 9562 deletions

View file

@ -17,7 +17,7 @@ before_install:
- sudo ln -s /run/shm /dev/shm - sudo ln -s /run/shm /dev/shm
install: install:
- conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.9 scipy=0.16 matplotlib nose sphinx pip nose
#- pip install . #- pip install .
- python setup.py build_ext --inplace - python setup.py build_ext --inplace
#--use-mirrors #--use-mirrors

View file

@ -1,7 +1 @@
James Hensman See contributors.
Nicolo Fusi
Ricardo Andrade
Nicolas Durrande
Alan Saul
Max Zwiessele
Neil D. Lawrence

View file

@ -21,16 +21,18 @@ from . import plotting
from .core import Model from .core import Model
from .core.parameterization import Param, Parameterized, ObsAr from .core.parameterization import Param, Parameterized, ObsAr
from .__version__ import __version__
#@nottest #@nottest
try: try:
#Get rid of nose dependency by only ignoring if you have nose installed #Get rid of nose dependency by only ignoring if you have nose installed
from nose.tools import nottest from nose.tools import nottest
@nottest @nottest
def tests(): def tests(verbose=10):
Tester(testing).test(verbose=10) Tester(testing).test(verbose=verbose)
except: except:
def tests(): def tests(verbose=10):
Tester(testing).test(verbose=10) Tester(testing).test(verbose=verbose)
def load(file_path): def load(file_path):
""" """

1
GPy/__version__.py Normal file
View file

@ -0,0 +1 @@
__version__ = "0.8.8"

View file

@ -7,6 +7,6 @@ from .parameterization.param import Param, ParamConcatenation
from .parameterization.observable_array import ObsAr from .parameterization.observable_array import ObsAr
from .gp import GP from .gp import GP
#from .svgp import SVGP from .svgp import SVGP
from .sparse_gp import SparseGP from .sparse_gp import SparseGP
from .mapping import * from .mapping import *

View file

@ -60,9 +60,11 @@ class GP(Model):
self.normalizer.scale_by(Y) self.normalizer.scale_by(Y)
self.Y_normalized = ObsAr(self.normalizer.normalize(Y)) self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
self.Y = Y self.Y = Y
else: elif isinstance(Y, np.ndarray):
self.Y = ObsAr(Y) self.Y = ObsAr(Y)
self.Y_normalized = self.Y self.Y_normalized = self.Y
else:
self.Y = Y
if Y.shape[0] != self.num_data: if Y.shape[0] != self.num_data:
#There can be cases where we want inputs than outputs, for example if we have multiple latent #There can be cases where we want inputs than outputs, for example if we have multiple latent
@ -104,8 +106,23 @@ class GP(Model):
self.link_parameter(self.likelihood) self.link_parameter(self.likelihood)
self.posterior = None self.posterior = None
# The predictive variable to be used to predict using the posterior object's
# woodbury_vector and woodbury_inv is defined as predictive_variable
# as long as the posterior has the right woodbury entries.
# It is the input variable used for the covariance between
# X_star and the posterior of the GP.
# This is usually just a link to self.X (full GP) or self.Z (sparse GP).
# Make sure to name this variable and the predict functions will "just work"
# In maths the predictive variable is:
# K_{xx} - K_{xp}W_{pp}^{-1}K_{px}
# W_{pp} := \texttt{Woodbury inv}
# p := _predictive_variable
def set_XY(self, X=None, Y=None, trigger_update=True): @property
def _predictive_variable(self):
return self.X
def set_XY(self, X=None, Y=None):
""" """
Set the input / output data of the model Set the input / output data of the model
This is useful if we wish to change our existing data but maintain the same model This is useful if we wish to change our existing data but maintain the same model
@ -115,7 +132,7 @@ class GP(Model):
:param Y: output observations :param Y: output observations
:type Y: np.ndarray :type Y: np.ndarray
""" """
if trigger_update: self.update_model(False) self.update_model(False)
if Y is not None: if Y is not None:
if self.normalizer is not None: if self.normalizer is not None:
self.normalizer.scale_by(Y) self.normalizer.scale_by(Y)
@ -131,34 +148,33 @@ class GP(Model):
assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!" assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!"
self.unlink_parameter(self.X) self.unlink_parameter(self.X)
self.X = X self.X = X
self.link_parameters(self.X) self.link_parameter(self.X)
else: else:
self.unlink_parameter(self.X) self.unlink_parameter(self.X)
from ..core import Param from ..core import Param
self.X = Param('latent mean',X) self.X = Param('latent mean',X)
self.link_parameters(self.X) self.link_parameter(self.X)
else: else:
self.X = ObsAr(X) self.X = ObsAr(X)
if trigger_update: self.update_model(True) self.update_model(True)
if trigger_update: self._trigger_params_changed()
def set_X(self,X, trigger_update=True): def set_X(self,X):
""" """
Set the input data of the model Set the input data of the model
:param X: input observations :param X: input observations
:type X: np.ndarray :type X: np.ndarray
""" """
self.set_XY(X=X, trigger_update=trigger_update) self.set_XY(X=X)
def set_Y(self,Y, trigger_update=True): def set_Y(self,Y):
""" """
Set the output data of the model Set the output data of the model
:param X: output observations :param X: output observations
:type X: np.ndarray :type X: np.ndarray
""" """
self.set_XY(Y=Y, trigger_update=trigger_update) self.set_XY(Y=Y)
def parameters_changed(self): def parameters_changed(self):
""" """
@ -181,7 +197,7 @@ class GP(Model):
""" """
return self._log_marginal_likelihood return self._log_marginal_likelihood
def _raw_predict(self, _Xnew, full_cov=False, kern=None): def _raw_predict(self, Xnew, full_cov=False, kern=None):
""" """
For making predictions, does not account for normalization or likelihood For making predictions, does not account for normalization or likelihood
@ -197,24 +213,33 @@ class GP(Model):
if kern is None: if kern is None:
kern = self.kern kern = self.kern
Kx = kern.K(_Xnew, self.X).T Kx = kern.K(self._predictive_variable, Xnew)
WiKx = np.dot(self.posterior.woodbury_inv, Kx)
mu = np.dot(Kx.T, self.posterior.woodbury_vector) mu = np.dot(Kx.T, self.posterior.woodbury_vector)
if len(mu.shape)==1:
mu = mu.reshape(-1,1)
if full_cov: if full_cov:
Kxx = kern.K(_Xnew) Kxx = kern.K(Xnew)
var = Kxx - np.dot(Kx.T, WiKx) if self.posterior.woodbury_inv.ndim == 2:
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
elif self.posterior.woodbury_inv.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
from ..util.linalg import mdot
for i in range(var.shape[2]):
var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
var = var
else: else:
Kxx = kern.Kdiag(_Xnew) Kxx = kern.Kdiag(Xnew)
var = Kxx - np.sum(WiKx*Kx, 0) if self.posterior.woodbury_inv.ndim == 2:
var = var.reshape(-1, 1) var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
var[var<0.] = 0. elif self.posterior.woodbury_inv.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
for i in range(var.shape[1]):
var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
var = var
#add in the mean function
if self.mean_function is not None:
mu += self.mean_function.f(Xnew)
#force mu to be a column vector
if len(mu.shape)==1: mu = mu[:,None]
#add the mean function in
if not self.mean_function is None:
mu += self.mean_function.f(_Xnew)
return mu, var return mu, var
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None): def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@ -247,7 +272,7 @@ class GP(Model):
mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata) mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
return mean, var return mean, var
def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None): def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None):
""" """
Get the predictive quantiles around the prediction at X Get the predictive quantiles around the prediction at X
@ -255,10 +280,12 @@ class GP(Model):
:type X: np.ndarray (Xnew x self.input_dim) :type X: np.ndarray (Xnew x self.input_dim)
:param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
:type quantiles: tuple :type quantiles: tuple
:param kern: optional kernel to use for prediction
:type predict_kw: dict
:returns: list of quantiles for each X and predictive quantiles for interval combination :returns: list of quantiles for each X and predictive quantiles for interval combination
:rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)] :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
""" """
m, v = self._raw_predict(X, full_cov=False) m, v = self._raw_predict(X, full_cov=False, kern=kern)
if self.normalizer is not None: if self.normalizer is not None:
m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata) return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
@ -292,6 +319,120 @@ class GP(Model):
return dmu_dX, dv_dX return dmu_dX, dv_dX
def predict_jacobian(self, Xnew, kern=None, full_cov=True):
"""
Compute the derivatives of the posterior of the GP.
Given a set of points at which to predict X* (size [N*,Q]), compute the
mean and variance of the derivative. Resulting arrays are sized:
dL_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
Note that this is the mean and variance of the derivative,
not the derivative of the mean and variance! (See predictive_gradients for that)
dv_dX* -- [N*, Q], (since all outputs have the same variance)
If there is missing data, it is not implemented for now, but
there will be one output variance per output dimension.
:param X: The points at which to get the predictive gradients.
:type X: np.ndarray (Xnew x self.input_dim)
:param kern: The kernel to compute the jacobian for.
:param boolean full_cov: whether to return the full covariance of the jacobian.
:returns: dmu_dX, dv_dX
:rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q,(D)) ]
Note: We always return sum in input_dim gradients, as the off-diagonals
in the input_dim are not needed for further calculations.
This is a compromise for increase in speed. Mathematically the jacobian would
have another dimension in Q.
"""
if kern is None:
kern = self.kern
mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
for i in range(self.output_dim):
mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)
dK_dXnew_full = np.empty((self._predictive_variable.shape[0], Xnew.shape[0], Xnew.shape[1]))
for i in range(self._predictive_variable.shape[0]):
dK_dXnew_full[i] = kern.gradients_X([[1.]], Xnew, self._predictive_variable[[i]])
if full_cov:
dK2_dXdX = kern.gradients_XX([[1.]], Xnew)
else:
dK2_dXdX = kern.gradients_XX_diag([[1.]], Xnew)
def compute_cov_inner(wi):
if full_cov:
# full covariance gradients:
var_jac = dK2_dXdX - np.einsum('qnm,miq->niq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
else:
var_jac = dK2_dXdX - np.einsum('qim,miq->iq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
return var_jac
if self.posterior.woodbury_inv.ndim == 3: # Missing data:
if full_cov:
var_jac = np.empty((Xnew.shape[0],Xnew.shape[0],Xnew.shape[1],self.output_dim))
for d in range(self.posterior.woodbury_inv.shape[2]):
var_jac[:, :, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
else:
var_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
for d in range(self.posterior.woodbury_inv.shape[2]):
var_jac[:, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
else:
var_jac = compute_cov_inner(self.posterior.woodbury_inv)
return mean_jac, var_jac
def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
"""
Predict the wishard embedding G of the GP. This is the density of the
input of the GP defined by the probabilistic function mapping f.
G = J_mean.T*J_mean + output_dim*J_cov.
:param array-like Xnew: The points at which to evaluate the magnification.
:param :py:class:`~GPy.kern.Kern` kern: The kernel to use for the magnification.
Supplying only a part of the learning kernel gives insights into the density
of the specific kernel part of the input function. E.g. one can see how dense the
linear part of a kernel is compared to the non-linear part etc.
"""
if kern is None:
kern = self.kern
mu_jac, var_jac = self.predict_jacobian(Xnew, kern, full_cov=False)
mumuT = np.einsum('iqd,ipd->iqp', mu_jac, mu_jac)
Sigma = np.zeros(mumuT.shape)
if var_jac.ndim == 3:
Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = var_jac.sum(-1)
else:
Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = self.output_dim*var_jac
G = 0.
if mean:
G += mumuT
if covariance:
G += Sigma
return G
def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
"""
Predict the magnification factor as
sqrt(det(G))
for each point N in Xnew
"""
G = self.predict_wishard_embedding(Xnew, kern, mean, covariance)
from ..util.linalg import jitchol
mag = np.empty(Xnew.shape[0])
for n in range(Xnew.shape[0]):
try:
mag[n] = np.sqrt(np.exp(2*np.sum(np.log(np.diag(jitchol(G[n, :, :]))))))
except:
mag[n] = np.sqrt(np.linalg.det(G[n, :, :]))
return mag
def posterior_samples_f(self,X,size=10, full_cov=True): def posterior_samples_f(self,X,size=10, full_cov=True):
""" """
Samples the posterior GP at the points X. Samples the posterior GP at the points X.
@ -395,8 +536,8 @@ class GP(Model):
def plot(self, plot_limits=None, which_data_rows='all', def plot(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[], which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None, levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=False, plot_raw=False, linecol=None,fillcol=None, Y_metadata=None,
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx', predict_kw=None): data_symbol='kx', predict_kw=None, plot_training_data=True, samples_y=0, apply_link=False):
""" """
Plot the posterior of the GP. Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations. - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -419,7 +560,7 @@ class GP(Model):
:param levels: number of levels to plot in a contour plot. :param levels: number of levels to plot in a contour plot.
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
:type levels: int :type levels: int
:param samples: the number of a posteriori samples to plot :param samples: the number of a posteriori samples to plot, p(f*|y)
:type samples: int :type samples: int
:param fignum: figure to plot on. :param fignum: figure to plot on.
:type fignum: figure number :type fignum: figure number
@ -433,6 +574,12 @@ class GP(Model):
:type Y_metadata: dict :type Y_metadata: dict
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx') :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib. :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
:param plot_training_data: whether or not to plot the training points
:type plot_training_data: boolean
:param samples_y: the number of a posteriori samples to plot, p(y*|y)
:type samples_y: int
:param apply_link: if there is a link function of the likelihood, plot the link(f*) rather than f*, when plotting posterior samples f
:type apply_link: boolean
""" """
assert "matplotlib" in sys.modules, "matplotlib package has not been imported." assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots from ..plotting.matplot_dep import models_plots
@ -445,7 +592,103 @@ class GP(Model):
which_data_ycols, fixed_inputs, which_data_ycols, fixed_inputs,
levels, samples, fignum, ax, resolution, levels, samples, fignum, ax, resolution,
plot_raw=plot_raw, Y_metadata=Y_metadata, plot_raw=plot_raw, Y_metadata=Y_metadata,
data_symbol=data_symbol, predict_kw=predict_kw, **kw) data_symbol=data_symbol, predict_kw=predict_kw,
plot_training_data=plot_training_data, samples_y=samples_y, apply_link=apply_link, **kw)
def plot_data(self, which_data_rows='all',
which_data_ycols='all', visible_dims=None,
fignum=None, ax=None, data_symbol='kx'):
"""
Plot the training data
- For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
Can plot only part of the data
using which_data_rows and which_data_ycols.
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
:type plot_limits: np.array
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
:param which_data_ycols: when the data has several columns (independant outputs), only plot these
:type which_data_ycols: 'all' or a list of integers
:param visible_dims: an array specifying the input dimensions to plot (maximum two)
:type visible_dims: a numpy array
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
:type levels: int
:param samples: the number of a posteriori samples to plot, p(f*|y)
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
:type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
:param fillcol: color of fill [Tango.colorsHex['lightBlue']]
:type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots
kw = {}
return models_plots.plot_data(self, which_data_rows,
which_data_ycols, visible_dims,
fignum, ax, data_symbol, **kw)
def errorbars_trainset(self, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[], fignum=None, ax=None,
linecol=None, data_symbol='kx', predict_kw=None, plot_training_data=True,lw=None):
"""
Plot the posterior error bars corresponding to the training data
- For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
Can plot only part of the data
using which_data_rows and which_data_ycols.
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
:param which_data_ycols: when the data has several columns (independant outputs), only plot these
:type which_data_rows: 'all' or a list of integers
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param plot_training_data: whether or not to plot the training points
:type plot_training_data: boolean
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots
kw = {}
if lw is not None:
kw['lw'] = lw
return models_plots.errorbars_trainset(self, which_data_rows, which_data_ycols, fixed_inputs,
fignum, ax, linecol, data_symbol,
predict_kw, plot_training_data, **kw)
def plot_magnification(self, labels=None, which_indices=None,
resolution=50, ax=None, marker='o', s=40,
fignum=None, legend=True,
plot_limits=None,
aspect='auto', updates=False, plot_inducing=True, kern=None, **kwargs):
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import dim_reduction_plots
return dim_reduction_plots.plot_magnification(self, labels, which_indices,
resolution, ax, marker, s,
fignum, plot_inducing, legend,
plot_limits, aspect, updates, **kwargs)
def input_sensitivity(self, summarize=True): def input_sensitivity(self, summarize=True):
""" """

View file

@ -32,7 +32,7 @@ class Bijective_mapping(Mapping):
also back from f to X. The inverse mapping is called g(). also back from f to X. The inverse mapping is called g().
""" """
def __init__(self, input_dim, output_dim, name='bijective_mapping'): def __init__(self, input_dim, output_dim, name='bijective_mapping'):
super(Bijective_apping, self).__init__(name=name) super(Bijective_mapping, self).__init__(name=name)
def g(self, f): def g(self, f):
"""Inverse mapping from output domain of the function to the inputs.""" """Inverse mapping from output domain of the function to the inputs."""

View file

@ -42,7 +42,7 @@ class Param(Parameterizable, ObsAr):
Multilevel indexing (e.g. self[:2][1:]) is not supported and might lead to unexpected behaviour. Multilevel indexing (e.g. self[:2][1:]) is not supported and might lead to unexpected behaviour.
Try to index in one go, using boolean indexing or the numpy builtin Try to index in one go, using boolean indexing or the numpy builtin
np.index function. np.index function.
See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc. See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.
""" """
@ -180,6 +180,7 @@ class Param(Parameterizable, ObsAr):
import copy import copy
Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo)) Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
return s return s
def _setup_observers(self): def _setup_observers(self):
""" """
Setup the default observers Setup the default observers

View file

@ -74,7 +74,7 @@ class Parameterized(Parameterizable):
# Metaclass for parameters changed after init. # Metaclass for parameters changed after init.
# This makes sure, that parameters changed will always be called after __init__ # This makes sure, that parameters changed will always be called after __init__
# **Never** call parameters_changed() yourself # **Never** call parameters_changed() yourself
#This is ignored in Python 3 -- you need to put the meta class in the function definition. #This is ignored in Python 3 -- you need to put the meta class in the function definition.
#__metaclass__ = ParametersChangedMeta #__metaclass__ = ParametersChangedMeta
#The six module is used to support both Python 2 and 3 simultaneously #The six module is used to support both Python 2 and 3 simultaneously
#=========================================================================== #===========================================================================
@ -197,9 +197,10 @@ class Parameterized(Parameterizable):
raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))) raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))
start = sum([p.size for p in self.parameters[:param._parent_index_]]) start = sum([p.size for p in self.parameters[:param._parent_index_]])
self._remove_parameter_name(param)
self.size -= param.size self.size -= param.size
del self.parameters[param._parent_index_] del self.parameters[param._parent_index_]
self._remove_parameter_name(param)
param._disconnect_parent() param._disconnect_parent()
param.remove_observer(self, self._pass_through_notify_observers) param.remove_observer(self, self._pass_through_notify_observers)
@ -315,7 +316,7 @@ class Parameterized(Parameterizable):
param[:] = val; return param[:] = val; return
except AttributeError: except AttributeError:
pass pass
object.__setattr__(self, name, val); return object.__setattr__(self, name, val);
#=========================================================================== #===========================================================================
# Pickling # Pickling

View file

@ -366,6 +366,7 @@ class InverseGamma(Gamma):
def rvs(self, n): def rvs(self, n):
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n) return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
class DGPLVM_KFDA(Prior): class DGPLVM_KFDA(Prior):
""" """
Implementation of the Discriminative Gaussian Process Latent Variable function using Implementation of the Discriminative Gaussian Process Latent Variable function using
@ -512,6 +513,7 @@ class DGPLVM_KFDA(Prior):
self.A = self.compute_A(lst_ni) self.A = self.compute_A(lst_ni)
self.x_shape = x_shape self.x_shape = x_shape
class DGPLVM(Prior): class DGPLVM(Prior):
""" """
Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel. Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@ -669,7 +671,7 @@ class DGPLVM(Prior):
M_i = self.compute_Mi(cls) M_i = self.compute_Mi(cls)
Sb = self.compute_Sb(cls, M_i, M_0) Sb = self.compute_Sb(cls, M_i, M_0)
Sw = self.compute_Sw(cls, M_i) Sw = self.compute_Sw(cls, M_i)
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) # sb_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
@ -903,7 +905,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0] #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0] Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw)) return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
# This function calculates derivative of the log of prior function # This function calculates derivative of the log of prior function
@ -927,7 +929,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0] #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0] Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
Sb_inv_N_trans = np.transpose(Sb_inv_N) Sb_inv_N_trans = np.transpose(Sb_inv_N)
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
Sw_trans = np.transpose(Sw) Sw_trans = np.transpose(Sw)
@ -1198,6 +1200,7 @@ class DGPLVM_T(Prior):
class HalfT(Prior): class HalfT(Prior):
""" """
Implementation of the half student t probability function, coupled with random variables. Implementation of the half student t probability function, coupled with random variables.
@ -1208,15 +1211,17 @@ class HalfT(Prior):
""" """
domain = _POSITIVE domain = _POSITIVE
_instances = [] _instances = []
def __new__(cls, A, nu): # Singleton:
def __new__(cls, A, nu): # Singleton:
if cls._instances: if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()] cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances: for instance in cls._instances:
if instance().A == A and instance().nu == nu: if instance().A == A and instance().nu == nu:
return instance() return instance()
o = super(Prior, cls).__new__(cls, A, nu) o = super(Prior, cls).__new__(cls, A, nu)
cls._instances.append(weakref.ref(o)) cls._instances.append(weakref.ref(o))
return cls._instances[-1]() return cls._instances[-1]()
def __init__(self, A, nu): def __init__(self, A, nu):
self.A = float(A) self.A = float(A)
self.nu = float(nu) self.nu = float(nu)
@ -1225,37 +1230,81 @@ class HalfT(Prior):
def __str__(self): def __str__(self):
return "hT({:.2g}, {:.2g})".format(self.A, self.nu) return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
def lnpdf(self,theta): def lnpdf(self, theta):
return (theta>0) * ( self.constant -.5*(self.nu+1) * np.log( 1.+ (1./self.nu) * (theta/self.A)**2 ) ) return (theta > 0) * (self.constant - .5*(self.nu + 1) * np.log(1. + (1./self.nu) * (theta/self.A)**2))
#theta = theta if isinstance(theta,np.ndarray) else np.array([theta]) # theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
#lnpdfs = np.zeros_like(theta) # lnpdfs = np.zeros_like(theta)
#theta = np.array([theta]) # theta = np.array([theta])
#above_zero = theta.flatten()>1e-6 # above_zero = theta.flatten()>1e-6
#v = self.nu # v = self.nu
#sigma2=self.A # sigma2=self.A
#stop # stop
#lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5) # lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
# - gammaln(v * 0.5) # - gammaln(v * 0.5)
# - 0.5*np.log(sigma2 * v * np.pi) # - 0.5*np.log(sigma2 * v * np.pi)
# - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2)) # - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
#) # )
#return lnpdfs # return lnpdfs
def lnpdf_grad(self,theta): def lnpdf_grad(self, theta):
theta = theta if isinstance(theta,np.ndarray) else np.array([theta]) theta = theta if isinstance(theta, np.ndarray) else np.array([theta])
grad = np.zeros_like(theta) grad = np.zeros_like(theta)
above_zero = theta>1e-6 above_zero = theta > 1e-6
v = self.nu v = self.nu
sigma2=self.A sigma2 = self.A
grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2) grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
return grad return grad
def rvs(self, n): def rvs(self, n):
#return np.random.randn(n) * self.sigma + self.mu # return np.random.randn(n) * self.sigma + self.mu
from scipy.stats import t from scipy.stats import t
#[np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)]) # [np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
ret = t.rvs(self.nu,loc=0,scale=self.A, size=n) ret = t.rvs(self.nu, loc=0, scale=self.A, size=n)
ret[ret<0] = 0 ret[ret < 0] = 0
return ret return ret
class Exponential(Prior):
"""
Implementation of the Exponential probability function,
coupled with random variables.
:param l: shape parameter
"""
domain = _POSITIVE
_instances = []
def __new__(cls, l): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().l == l:
return instance()
o = super(Exponential, cls).__new__(cls, l)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, l):
self.l = l
def __str__(self):
return "Exp({:.2g})".format(self.l)
def summary(self):
ret = {"E[x]": 1. / self.l,
"E[ln x]": np.nan,
"var[x]": 1. / self.l**2,
"Entropy": 1. - np.log(self.l),
"Mode": 0.}
return ret
def lnpdf(self, x):
return np.log(self.l) - self.l * x
def lnpdf_grad(self, x):
return - self.l
def rvs(self, n):
return np.random.exponential(scale=self.l, size=n)

View file

@ -62,7 +62,7 @@ class Transformation(object):
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from ...plotting.matplot_dep import base_plots from ...plotting.matplot_dep import base_plots
x = np.linspace(-8,8) x = np.linspace(-8,8)
base_plots.meanplot(x, self.f(x),axes=axes*args,**kw) base_plots.meanplot(x, self.f(x), *args, ax=axes, **kw)
axes = plt.gca() axes = plt.gca()
axes.set_xlabel(xlabel) axes.set_xlabel(xlabel)
axes.set_ylabel(ylabel) axes.set_ylabel(ylabel)

View file

@ -49,7 +49,7 @@ class SparseGP(GP):
else: else:
#inference_method = ?? #inference_method = ??
raise NotImplementedError("what to do what to do?") raise NotImplementedError("what to do what to do?")
print("defaulting to ", inference_method, "for latent function inference") print(("defaulting to ", inference_method, "for latent function inference"))
self.Z = Param('inducing inputs', Z) self.Z = Param('inducing inputs', Z)
self.num_inducing = Z.shape[0] self.num_inducing = Z.shape[0]
@ -60,6 +60,10 @@ class SparseGP(GP):
self.link_parameter(self.Z, index=0) self.link_parameter(self.Z, index=0)
self.posterior = None self.posterior = None
@property
def _predictive_variable(self):
return self.Z
def has_uncertain_inputs(self): def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior) return isinstance(self.X, VariationalPosterior)
@ -114,63 +118,66 @@ class SparseGP(GP):
Make a prediction for the latent function values. Make a prediction for the latent function values.
For certain inputs we give back a full_cov of shape NxN, For certain inputs we give back a full_cov of shape NxN,
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of, if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
we take only the diagonal elements across N. we take only the diagonal elements across N.
For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we For uncertain inputs, the SparseGP bound produces cannot predict the full covariance matrix full_cov for now.
return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD). The implementation of that will follow. However, for each dimension the
This is for both with and without missing data. See for missing data SparseGP implementation py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'. covariance changes, so if full_cov is False (standard), we return the variance
for each dimension [NxD].
""" """
if kern is None: kern = self.kern if kern is None: kern = self.kern
if not isinstance(Xnew, VariationalPosterior): if not isinstance(Xnew, VariationalPosterior):
Kx = kern.K(self.Z, Xnew) # Kx = kern.K(self._predictive_variable, Xnew)
mu = np.dot(Kx.T, self.posterior.woodbury_vector) # mu = np.dot(Kx.T, self.posterior.woodbury_vector)
if full_cov: # if full_cov:
Kxx = kern.K(Xnew) # Kxx = kern.K(Xnew)
if self.posterior.woodbury_inv.ndim == 2: # if self.posterior.woodbury_inv.ndim == 2:
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx)) # var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
elif self.posterior.woodbury_inv.ndim == 3: # elif self.posterior.woodbury_inv.ndim == 3:
var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2])) # var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
for i in range(var.shape[2]): # for i in range(var.shape[2]):
var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx)) # var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
var = var # var = var
else: # else:
Kxx = kern.Kdiag(Xnew) # Kxx = kern.Kdiag(Xnew)
if self.posterior.woodbury_inv.ndim == 2: # if self.posterior.woodbury_inv.ndim == 2:
var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None] # var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
elif self.posterior.woodbury_inv.ndim == 3: # elif self.posterior.woodbury_inv.ndim == 3:
var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2])) # var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
for i in range(var.shape[1]): # for i in range(var.shape[1]):
var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0))) # var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
var = var # var = var
#add in the mean function # #add in the mean function
if self.mean_function is not None: # if self.mean_function is not None:
mu += self.mean_function.f(Xnew) # mu += self.mean_function.f(Xnew)
mu, var = super(SparseGP, self)._raw_predict(Xnew, full_cov, kern)
else: else:
psi0_star = kern.psi0(self.Z, Xnew) psi0_star = kern.psi0(self._predictive_variable, Xnew)
psi1_star = kern.psi1(self.Z, Xnew) psi1_star = kern.psi1(self._predictive_variable, Xnew)
#psi2_star = kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code. #psi2_star = kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
la = self.posterior.woodbury_vector la = self.posterior.woodbury_vector
mu = np.dot(psi1_star, la) # TODO: dimensions? mu = np.dot(psi1_star, la) # TODO: dimensions?
if full_cov: if full_cov:
raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1])) var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
di = np.diag_indices(la.shape[1]) di = np.diag_indices(la.shape[1])
else: else:
var = np.empty((Xnew.shape[0], la.shape[1])) var = np.empty((Xnew.shape[0], la.shape[1]))
for i in range(Xnew.shape[0]): for i in range(Xnew.shape[0]):
_mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]] _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
psi2_star = kern.psi2(self.Z, NormalPosterior(_mu, _var)) psi2_star = kern.psi2(self._predictive_variable, NormalPosterior(_mu, _var))
tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]])) tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
var_ = mdot(la.T, tmp, la) var_ = mdot(la.T, tmp, la)
p0 = psi0_star[i] p0 = psi0_star[i]
t = np.atleast_3d(self.posterior.woodbury_inv) t = np.atleast_3d(self.posterior.woodbury_inv)
t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2) t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
if full_cov: if full_cov:
var_[di] += p0 var_[di] += p0
var_[di] += -t2 var_[di] += -t2

View file

@ -34,7 +34,7 @@ class SparseGP_MPI(SparseGP):
""" """
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False): def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp', Y_metadata=None, mpi_comm=None, normalizer=False):
self._IN_OPTIMIZATION_ = False self._IN_OPTIMIZATION_ = False
if mpi_comm != None: if mpi_comm != None:
if inference_method is None: if inference_method is None:

View file

@ -1,11 +1,11 @@
# Copyright (c) 2014, James Hensman, Alex Matthews # Copyright (c) 2014, James Hensman, Alex Matthews
# Distributed under the terms of the GNU General public License, see LICENSE.txt # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ..util import choleskies from ..util import choleskies
from .sparse_gp import SparseGP from .sparse_gp import SparseGP
from .parameterization.param import Param from .parameterization.param import Param
from ..inference.latent_function_inference import SVGP as svgp_inf from ..inference.latent_function_inference.svgp import SVGP as svgp_inf
class SVGP(SparseGP): class SVGP(SparseGP):

View file

@ -24,7 +24,6 @@ class VerboseOptimization(object):
self.model.add_observer(self, self.print_status) self.model.add_observer(self, self.print_status)
self.status = 'running' self.status = 'running'
self.clear = clear_after_finish self.clear = clear_after_finish
self.deltat = .2
self.update() self.update()
@ -80,6 +79,7 @@ class VerboseOptimization(object):
def __enter__(self): def __enter__(self):
self.start = time.time() self.start = time.time()
self._time = self.start
return self return self
def print_out(self, seconds): def print_out(self, seconds):
@ -143,12 +143,12 @@ class VerboseOptimization(object):
def print_status(self, me, which=None): def print_status(self, me, which=None):
self.update() self.update()
seconds = time.time()-self.start t = time.time()
seconds = t-self.start
#sys.stdout.write(" "*len(self.message)) #sys.stdout.write(" "*len(self.message))
self.deltat += seconds if t-self._time > .3 or seconds < .3:
if self.deltat > .2:
self.print_out(seconds) self.print_out(seconds)
self.deltat = 0 self._time = t
self.iteration += 1 self.iteration += 1

View file

@ -3,7 +3,7 @@
import numpy as np import numpy as np
try: try:
import pylab as pb from matplotlib import pyplot as pb
except: except:
pass pass
import GPy import GPy

View file

@ -77,7 +77,7 @@ def student_t_approx(optimize=True, plot=True):
debug=True debug=True
if debug: if debug:
m4.optimize(messages=1) m4.optimize(messages=1)
import pylab as pb from matplotlib import pyplot as pb
pb.plot(m4.X, m4.inference_method.f_hat) pb.plot(m4.X, m4.inference_method.f_hat)
pb.plot(m4.X, m4.Y, 'rx') pb.plot(m4.X, m4.Y, 'rx')
m4.plot() m4.plot()

View file

@ -5,7 +5,7 @@
Gaussian Processes regression examples Gaussian Processes regression examples
""" """
try: try:
import pylab as pb from matplotlib import pyplot as pb
except: except:
pass pass
import numpy as np import numpy as np

View file

@ -69,7 +69,7 @@ from .expectation_propagation_dtc import EPDTC
from .dtc import DTC from .dtc import DTC
from .fitc import FITC from .fitc import FITC
from .var_dtc_parallel import VarDTC_minibatch from .var_dtc_parallel import VarDTC_minibatch
#from .svgp import SVGP from .var_gauss import VarGauss
# class FullLatentFunctionData(object): # class FullLatentFunctionData(object):
# #

View file

@ -4,6 +4,8 @@
import numpy as np import numpy as np
from ...core import Model from ...core import Model
from ...core.parameterization import variational from ...core.parameterization import variational
from ...util.linalg import tdot
from GPy.core.parameterization.variational import VariationalPosterior
def infer_newX(model, Y_new, optimize=True, init='L2'): def infer_newX(model, Y_new, optimize=True, init='L2'):
""" """
@ -60,18 +62,19 @@ class InferenceX(Model):
# self.kern.GPU(True) # self.kern.GPU(True)
from copy import deepcopy from copy import deepcopy
self.posterior = deepcopy(model.posterior) self.posterior = deepcopy(model.posterior)
if hasattr(model, 'variational_prior'): from ...core.parameterization.variational import VariationalPosterior
if isinstance(model.X, VariationalPosterior):
self.uncertain_input = True self.uncertain_input = True
from ...models.ss_gplvm import IBPPrior from ...models.ss_gplvm import IBPPrior
from ...models.ss_mrd import IBPPrior_SSMRD from ...models.ss_mrd import IBPPrior_SSMRD
if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD): if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD):
from ...core.parameterization.variational import SpikeAndSlabPrior from ...core.parameterization.variational import SpikeAndSlabPrior
self.variational_prior = SpikeAndSlabPrior(pi=05,learnPi=False, group_spike=False) self.variational_prior = SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False)
else: else:
self.variational_prior = model.variational_prior.copy() self.variational_prior = model.variational_prior.copy()
else: else:
self.uncertain_input = False self.uncertain_input = False
if hasattr(model, 'inducing_inputs'): if hasattr(model, 'Z'):
self.sparse_gp = True self.sparse_gp = True
self.Z = model.Z.copy() self.Z = model.Z.copy()
else: else:
@ -125,13 +128,13 @@ class InferenceX(Model):
wv = wv[:,self.valid_dim] wv = wv[:,self.valid_dim]
output_dim = self.valid_dim.sum() output_dim = self.valid_dim.sum()
if self.ninan is not None: if self.ninan is not None:
self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - np.einsum('md,od->mo',wv, wv)[:, :, None]).sum(-1) self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - tdot(wv)[:, :, None]).sum(-1)
else: else:
self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv)) self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - tdot(wv))
self.dL_dpsi1 = beta*np.dot(self.Y[:,self.valid_dim], wv.T) self.dL_dpsi1 = beta*np.dot(self.Y[:,self.valid_dim], wv.T)
self.dL_dpsi0 = - beta/2.* np.ones(self.Y.shape[0]) self.dL_dpsi0 = - beta/2.* np.ones(self.Y.shape[0])
else: else:
self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))/2. self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - tdot(wv))/2. #np.einsum('md,od->mo',wv, wv)
self.dL_dpsi1 = beta*np.dot(self.Y, wv.T) self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0]) self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0])

View file

@ -172,6 +172,7 @@ class Laplace(LatentFunctionInference):
def obj(Ki_f, f): def obj(Ki_f, f):
ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata)) ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
if np.isnan(ll): if np.isnan(ll):
import ipdb; ipdb.set_trace() # XXX BREAKPOINT
return -np.inf return -np.inf
else: else:
return ll return ll

View file

@ -64,9 +64,7 @@ class VarDTC(LatentFunctionInference):
def get_VVTfactor(self, Y, prec): def get_VVTfactor(self, Y, prec):
return Y * prec # TODO chache this, and make it effective return Y * prec # TODO chache this, and make it effective
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None):
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
_, output_dim = Y.shape _, output_dim = Y.shape
uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_inputs = isinstance(X, VariationalPosterior)
@ -95,17 +93,28 @@ class VarDTC(LatentFunctionInference):
# The rather complex computations of A, and the psi stats # The rather complex computations of A, and the psi stats
if uncertain_inputs: if uncertain_inputs:
psi0 = kern.psi0(Z, X) if psi0 is None:
psi1 = kern.psi1(Z, X) psi0 = kern.psi0(Z, X)
if psi1 is None:
psi1 = kern.psi1(Z, X)
if het_noise: if het_noise:
psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0) if psi2 is None:
assert len(psi2.shape) == 3 # Need to have not summed out N
#FIXME: Need testing
psi2_beta = np.sum([psi2[X[i:i+1,:], :, :] * beta_i for i,beta_i in enumerate(beta)],0)
else:
psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
else: else:
psi2_beta = kern.psi2(Z,X) * beta if psi2 is None:
psi2 = kern.psi2(Z,X)
psi2_beta = psi2 * beta
LmInv = dtrtri(Lm) LmInv = dtrtri(Lm)
A = LmInv.dot(psi2_beta.dot(LmInv.T)) A = LmInv.dot(psi2_beta.dot(LmInv.T))
else: else:
psi0 = kern.Kdiag(X) if psi0 is None:
psi1 = kern.K(X, Z) psi0 = kern.Kdiag(X)
if psi1 is None:
psi1 = kern.K(X, Z)
if het_noise: if het_noise:
tmp = psi1 * (np.sqrt(beta)) tmp = psi1 * (np.sqrt(beta))
else: else:

View file

@ -172,18 +172,23 @@ class VarDTC_minibatch(LatentFunctionInference):
if not np.isfinite(Kmm).all(): if not np.isfinite(Kmm).all():
print(Kmm) print(Kmm)
Lm = jitchol(Kmm) Lm = jitchol(Kmm)
LmInv = dtrtri(Lm)
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') LmInvPsi2LmInvT = LmInv.dot(psi2_full.dot(LmInv.T))
Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
LL = jitchol(Lambda) LL = jitchol(Lambda)
LLInv = dtrtri(LL)
logdet_L = 2.*np.sum(np.log(np.diag(LL))) logdet_L = 2.*np.sum(np.log(np.diag(LL)))
b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0] LmLLInv = LLInv.dot(LmInv)
b = psi1Y_full.dot(LmLLInv.T)
bbt = np.square(b).sum() bbt = np.square(b).sum()
v = dtrtrs(Lm,dtrtrs(LL,b,trans=1)[0],trans=1)[0] v = b.dot(LmLLInv).T
LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
tmp = -backsub_both_sides(LL, tdot(b)+output_dim*np.eye(input_dim), transpose='left')
dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim), transpose='left')/2. tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.
# Cache intermediate results # Cache intermediate results
self.midRes['dL_dpsi2R'] = dL_dpsi2R self.midRes['dL_dpsi2R'] = dL_dpsi2R
self.midRes['v'] = v self.midRes['v'] = v
@ -201,7 +206,7 @@ class VarDTC_minibatch(LatentFunctionInference):
# Compute dL_dKmm # Compute dL_dKmm
#====================================================================== #======================================================================
dL_dKmm = dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT, transpose='left')/2. dL_dKmm = dL_dpsi2R - output_dim*LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.
#====================================================================== #======================================================================
# Compute the Posterior distribution of inducing points p(u|Y) # Compute the Posterior distribution of inducing points p(u|Y)

View file

@ -0,0 +1,69 @@
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ...util.linalg import pdinv
from .posterior import Posterior
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class VarGauss(LatentFunctionInference):
"""
The Variational Gaussian Approximation revisited
@article{Opper:2009,
title = {The Variational Gaussian Approximation Revisited},
author = {Opper, Manfred and Archambeau, C{\'e}dric},
journal = {Neural Comput.},
year = {2009},
pages = {786--792},
}
"""
def __init__(self, alpha, beta):
"""
:param alpha: GPy.core.Param varational parameter
:param beta: GPy.core.Param varational parameter
"""
self.alpha, self.beta = alpha, beta
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
if mean_function is not None:
raise NotImplementedError
num_data, output_dim = Y.shape
assert output_dim ==1, "Only one output supported"
K = kern.K(X)
m = K.dot(self.alpha)
KB = K*self.beta[:, None]
BKB = KB*self.beta[None, :]
A = np.eye(num_data) + BKB
Ai, LA, _, Alogdet = pdinv(A)
Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :] # posterior coavairance: need full matrix for gradients
var = np.diag(Sigma).reshape(-1,1)
F, dF_dm, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, m, var, Y_metadata=Y_metadata)
if dF_dthetaL is not None:
dL_dthetaL = dF_dthetaL.sum(1).sum(1)
else:
dL_dthetaL = np.array([])
dF_da = np.dot(K, dF_dm)
SigmaB = Sigma*self.beta
#dF_db_ = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
dF_db = -2*np.sum(Sigma**2 * (dF_dv * self.beta), 0)
#assert np.allclose(dF_db, dF_db_)
KL = 0.5*(Alogdet + np.trace(Ai) - num_data + np.sum(m*self.alpha))
dKL_da = m
A_A2 = Ai - Ai.dot(Ai)
dKL_db = np.diag(np.dot(KB.T, A_A2))
log_marginal = F.sum() - KL
self.alpha.gradient = dF_da - dKL_da
self.beta.gradient = dF_db - dKL_db
# K-gradients
dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
tmp = Ai*self.beta[:, None]/self.beta[None, :]
dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
return Posterior(mean=m, cov=Sigma ,K=K),\
log_marginal,\
{'dL_dK':dF_dK-dKL_dK, 'dL_dthetaL':dL_dthetaL}

View file

@ -1 +1,2 @@
from .hmc import HMC from .hmc import HMC
from .samplers import *

View file

@ -1,14 +1,10 @@
# ## Copyright (c) 2014, Zhenwen Dai # ## Copyright (c) 2014, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from __future__ import print_function
import numpy as np import numpy as np
from scipy import linalg, optimize
import Tango
import sys import sys
import re
import numdifftools as ndt
import pdb
try: try:
#In Python 2, cPickle is faster. It does not exist in Python 3 but the underlying code is always used #In Python 2, cPickle is faster. It does not exist in Python 3 but the underlying code is always used
@ -22,11 +18,11 @@ class Metropolis_Hastings:
def __init__(self,model,cov=None): def __init__(self,model,cov=None):
"""Metropolis Hastings, with tunings according to Gelman et al. """ """Metropolis Hastings, with tunings according to Gelman et al. """
self.model = model self.model = model
current = self.model._get_params_transformed() current = self.model.optimizer_array
self.D = current.size self.D = current.size
self.chains = [] self.chains = []
if cov is None: if cov is None:
self.cov = model.Laplace_covariance() self.cov = np.eye(self.D)
else: else:
self.cov = cov self.cov = cov
self.scale = 2.4/np.sqrt(self.D) self.scale = 2.4/np.sqrt(self.D)
@ -37,20 +33,20 @@ class Metropolis_Hastings:
if start is None: if start is None:
self.model.randomize() self.model.randomize()
else: else:
self.model._set_params_transformed(start) self.model.optimizer_array = start
def sample(self, Ntotal=10000, Nburn=1000, Nthin=10, tune=True, tune_throughout=False, tune_interval=400):
current = self.model.optimizer_array
def sample(self, Ntotal, Nburn, Nthin, tune=True, tune_throughout=False, tune_interval=400): fcurrent = self.model.log_likelihood() + self.model.log_prior() + \
current = self.model._get_params_transformed() self.model._log_det_jacobian()
fcurrent = self.model.log_likelihood() + self.model.log_prior()
accepted = np.zeros(Ntotal,dtype=np.bool) accepted = np.zeros(Ntotal,dtype=np.bool)
for it in range(Ntotal): for it in range(Ntotal):
print("sample %d of %d\r"%(it,Ntotal), end=' ') print("sample %d of %d\r"%(it,Ntotal),end="\t")
sys.stdout.flush() sys.stdout.flush()
prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale) prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
self.model._set_params_transformed(prop) self.model.optimizer_array = prop
fprop = self.model.log_likelihood() + self.model.log_prior() fprop = self.model.log_likelihood() + self.model.log_prior() + \
self.model._log_det_jacobian()
if fprop>fcurrent:#sample accepted, going 'uphill' if fprop>fcurrent:#sample accepted, going 'uphill'
accepted[it] = True accepted[it] = True
@ -78,10 +74,11 @@ class Metropolis_Hastings:
def predict(self,function,args): def predict(self,function,args):
"""Make a prediction for the function, to which we will pass the additional arguments""" """Make a prediction for the function, to which we will pass the additional arguments"""
param = self.model._get_params() param = self.model.param_array
fs = [] fs = []
for p in self.chain: for p in self.chain:
self.model._set_params(p) self.model.param_array = p
fs.append(function(*args)) fs.append(function(*args))
self.model._set_params(param)# reset model to starting state # reset model to starting state
self.model.param_array = param
return fs return fs

View file

@ -5,7 +5,7 @@ class StochasticStorage(object):
''' '''
This is a container for holding the stochastic parameters, This is a container for holding the stochastic parameters,
such as subset indices or step length and so on. such as subset indices or step length and so on.
self.d has to be a list of lists: self.d has to be a list of lists:
[dimension indices, nan indices for those dimensions] [dimension indices, nan indices for those dimensions]
so that the minibatches can be used as efficiently as possible.10 so that the minibatches can be used as efficiently as possible.10
@ -38,16 +38,17 @@ class SparseGPMissing(StochasticStorage):
import numpy as np import numpy as np
self.Y = model.Y_normalized self.Y = model.Y_normalized
bdict = {} bdict = {}
#For N > 1000 array2string default crops
opt = np.get_printoptions()
np.set_printoptions(threshold=np.inf)
for d in range(self.Y.shape[1]): for d in range(self.Y.shape[1]):
inan = np.isnan(self.Y[:, d]) inan = np.isnan(self.Y)[:, d]
arr_str = np.array2string(inan, arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
np.inf, 0,
True, '',
formatter={'bool':lambda x: '1' if x else '0'})
try: try:
bdict[arr_str][0].append(d) bdict[arr_str][0].append(d)
except: except:
bdict[arr_str] = [[d], ~inan] bdict[arr_str] = [[d], ~inan]
np.set_printoptions(**opt)
self.d = bdict.values() self.d = bdict.values()
class SparseGPStochastics(StochasticStorage): class SparseGPStochastics(StochasticStorage):
@ -55,32 +56,36 @@ class SparseGPStochastics(StochasticStorage):
For the sparse gp we need to store the dimension we are in, For the sparse gp we need to store the dimension we are in,
and the indices corresponding to those and the indices corresponding to those
""" """
def __init__(self, model, batchsize=1): def __init__(self, model, batchsize=1, missing_data=True):
self.batchsize = batchsize self.batchsize = batchsize
self.output_dim = model.Y.shape[1] self.output_dim = model.Y.shape[1]
self.Y = model.Y_normalized self.Y = model.Y_normalized
self.missing_data = missing_data
self.reset() self.reset()
self.do_stochastics() self.do_stochastics()
def do_stochastics(self): def do_stochastics(self):
import numpy as np
if self.batchsize == 1: if self.batchsize == 1:
self.current_dim = (self.current_dim+1)%self.output_dim self.current_dim = (self.current_dim+1)%self.output_dim
self.d = [[[self.current_dim], np.isnan(self.Y[:, self.d])]] self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
else: else:
import numpy as np
self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False) self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
bdict = {} bdict = {}
for d in self.d: if self.missing_data:
inan = np.isnan(self.Y[:, d]) opt = np.get_printoptions()
arr_str = int(np.array2string(inan, np.set_printoptions(threshold=np.inf)
np.inf, 0, for d in self.d:
True, '', inan = np.isnan(self.Y[:, d])
formatter={'bool':lambda x: '1' if x else '0'}), 2) arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'})
try: try:
bdict[arr_str][0].append(d) bdict[arr_str][0].append(d)
except: except:
bdict[arr_str] = [[d], ~inan] bdict[arr_str] = [[d], ~inan]
self.d = bdict.values() np.set_printoptions(**opt)
self.d = bdict.values()
else:
self.d = [[self.d, None]]
def reset(self): def reset(self):
self.current_dim = -1 self.current_dim = -1

View file

@ -6,6 +6,7 @@ from ._src.brownian import Brownian
from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
from ._src.mlp import MLP from ._src.mlp import MLP
from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
from ._src.standard_periodic import StdPeriodic
from ._src.independent_outputs import IndependentOutputs, Hierarchical from ._src.independent_outputs import IndependentOutputs, Hierarchical
from ._src.coregionalize import Coregionalize from ._src.coregionalize import Coregionalize
from ._src.ODE_UY import ODE_UY from ._src.ODE_UY import ODE_UY
@ -17,7 +18,7 @@ from ._src.eq_ode2 import EQ_ODE2
from ._src.trunclinear import TruncLinear,TruncLinear_inf from ._src.trunclinear import TruncLinear,TruncLinear_inf
from ._src.splitKern import SplitKern,DEtime from ._src.splitKern import SplitKern,DEtime
from ._src.splitKern import DEtime as DiffGenomeKern from ._src.splitKern import DEtime as DiffGenomeKern
from ._src.spline import Spline
from ._src.eq_ode2 import EQ_ODE2
from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel

View file

@ -14,7 +14,7 @@ class Add(CombinationKernel):
This kernel will take over the active dims of it's subkernels passed in. This kernel will take over the active dims of it's subkernels passed in.
""" """
def __init__(self, subkerns, name='add'): def __init__(self, subkerns, name='sum'):
for i, kern in enumerate(subkerns[:]): for i, kern in enumerate(subkerns[:]):
if isinstance(kern, Add): if isinstance(kern, Add):
del subkerns[i] del subkerns[i]
@ -71,16 +71,29 @@ class Add(CombinationKernel):
target = np.zeros(X.shape) target = np.zeros(X.shape)
[target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts] [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
return target return target
@Cache_this(limit=2, force_kwargs=['which_parts']) def gradients_XX(self, dL_dK, X, X2):
if X2 is None:
target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
else:
target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
[target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts]
return target
def gradients_XX_diag(self, dL_dKdiag, X):
target = np.zeros(X.shape)
[target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
return target
@Cache_this(limit=1, force_kwargs=['which_parts'])
def psi0(self, Z, variational_posterior): def psi0(self, Z, variational_posterior):
return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts)) return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
@Cache_this(limit=2, force_kwargs=['which_parts']) @Cache_this(limit=1, force_kwargs=['which_parts'])
def psi1(self, Z, variational_posterior): def psi1(self, Z, variational_posterior):
return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts)) return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
@Cache_this(limit=2, force_kwargs=['which_parts']) @Cache_this(limit=1, force_kwargs=['which_parts'])
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts)) psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
#return psi2 #return psi2
@ -115,6 +128,41 @@ class Add(CombinationKernel):
raise NotImplementedError("psi2 cannot be computed for this kernel") raise NotImplementedError("psi2 cannot be computed for this kernel")
return psi2 return psi2
@Cache_this(limit=1, force_kwargs=['which_parts'])
def psi2n(self, Z, variational_posterior):
psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))
#return psi2
# compute the "cross" terms
from .static import White, Bias
from .rbf import RBF
#from rbf_inv import RBFInv
from .linear import Linear
#ffrom fixed import Fixed
for p1, p2 in itertools.combinations(self.parts, 2):
# i1, i2 = p1.active_dims, p2.active_dims
# white doesn;t combine with anything
if isinstance(p1, White) or isinstance(p2, White):
pass
# rbf X bias
#elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
tmp = p2.psi1(Z, variational_posterior).sum(axis=0)
psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
#elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
tmp = p1.psi1(Z, variational_posterior).sum(axis=0)
psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
tmp1 = p1.psi1(Z, variational_posterior)
tmp2 = p2.psi1(Z, variational_posterior)
psi2 += np.einsum('nm,no->nmo',tmp1,tmp2)+np.einsum('nm,no->nmo',tmp2,tmp1)
#(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
else:
raise NotImplementedError("psi2 cannot be computed for this kernel")
return psi2
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
from .static import White, Bias from .static import White, Bias
for p1 in self.parts: for p1 in self.parts:
@ -126,9 +174,9 @@ class Add(CombinationKernel):
if isinstance(p2, White): if isinstance(p2, White):
continue continue
elif isinstance(p2, Bias): elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2. eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2. eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
@ -143,9 +191,9 @@ class Add(CombinationKernel):
if isinstance(p2, White): if isinstance(p2, White):
continue continue
elif isinstance(p2, Bias): elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2. eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
else: else:
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2. eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
return target return target
@ -161,9 +209,9 @@ class Add(CombinationKernel):
if isinstance(p2, White): if isinstance(p2, White):
continue continue
elif isinstance(p2, Bias): elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2. eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
else: else:
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2. eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
[np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))] [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
return target_grads return target_grads

View file

@ -11,7 +11,7 @@ class BasisFuncKernel(Kern):
def __init__(self, input_dim, variance=1., active_dims=None, ARD=False, name='basis func kernel'): def __init__(self, input_dim, variance=1., active_dims=None, ARD=False, name='basis func kernel'):
""" """
Abstract superclass for kernels with explicit basis functions for use in GPy. Abstract superclass for kernels with explicit basis functions for use in GPy.
This class does NOT automatically add an offset to the design matrix phi! This class does NOT automatically add an offset to the design matrix phi!
""" """
super(BasisFuncKernel, self).__init__(input_dim, active_dims, name) super(BasisFuncKernel, self).__init__(input_dim, active_dims, name)
@ -23,24 +23,24 @@ class BasisFuncKernel(Kern):
variance = np.array(variance) variance = np.array(variance)
self.variance = Param('variance', variance, Logexp()) self.variance = Param('variance', variance, Logexp())
self.link_parameter(self.variance) self.link_parameter(self.variance)
def parameters_changed(self): def parameters_changed(self):
self.alpha = np.sqrt(self.variance) self.alpha = np.sqrt(self.variance)
self.beta = 1./self.variance self.beta = 1./self.variance
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def phi(self, X): def phi(self, X):
return self._phi(X) return self._phi(X)
def _phi(self, X): def _phi(self, X):
raise NotImplementedError('Overwrite this _phi function, which maps the input X into the higher dimensional space and returns the design matrix Phi') raise NotImplementedError('Overwrite this _phi function, which maps the input X into the higher dimensional space and returns the design matrix Phi')
def K(self, X, X2=None): def K(self, X, X2=None):
return self._K(X, X2) return self._K(X, X2)
def Kdiag(self, X, X2=None): def Kdiag(self, X, X2=None):
return np.diag(self._K(X, X2)) return np.diag(self._K(X, X2))
def update_gradients_full(self, dL_dK, X, X2=None): def update_gradients_full(self, dL_dK, X, X2=None):
if self.ARD: if self.ARD:
phi1 = self.phi(X) phi1 = self.phi(X)
@ -51,22 +51,22 @@ class BasisFuncKernel(Kern):
self.variance.gradient = np.einsum('ij,iq,jq->q', dL_dK, phi1, phi2) self.variance.gradient = np.einsum('ij,iq,jq->q', dL_dK, phi1, phi2)
else: else:
self.variance.gradient = np.einsum('ij,ij', dL_dK, self._K(X, X2)) * self.beta self.variance.gradient = np.einsum('ij,ij', dL_dK, self._K(X, X2)) * self.beta
def update_gradients_diag(self, dL_dKdiag, X): def update_gradients_diag(self, dL_dKdiag, X):
if self.ARD: if self.ARD:
phi1 = self.phi(X) phi1 = self.phi(X)
self.variance.gradient = np.einsum('i,iq,iq->q', dL_dKdiag, phi1, phi1) self.variance.gradient = np.einsum('i,iq,iq->q', dL_dKdiag, phi1, phi1)
else: else:
self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.Kdiag(X)) * self.beta self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.Kdiag(X)) * self.beta
def concatenate_offset(self, X): def concatenate_offset(self, X):
return np.c_[np.ones((X.shape[0], 1)), X] return np.c_[np.ones((X.shape[0], 1)), X]
def posterior_inf(self, X=None, posterior=None): def posterior_inf(self, X=None, posterior=None):
""" """
Do the posterior inference on the parameters given this kernels functions Do the posterior inference on the parameters given this kernels functions
and the model posterior, which has to be a GPy posterior, usually found at m.posterior, if m is a GPy model. and the model posterior, which has to be a GPy posterior, usually found at m.posterior, if m is a GPy model.
If not given we search for the the highest parent to be a model, containing the posterior, and for X accordingly. If not given we search for the the highest parent to be a model, containing the posterior, and for X accordingly.
""" """
if X is None: if X is None:
try: try:
@ -80,7 +80,7 @@ class BasisFuncKernel(Kern):
raise RuntimeError("This kernel is not part of a model and cannot be used for posterior inference") raise RuntimeError("This kernel is not part of a model and cannot be used for posterior inference")
phi_alpha = self.phi(X) * self.variance phi_alpha = self.phi(X) * self.variance
return (phi_alpha).T.dot(posterior.woodbury_vector), (np.eye(phi_alpha.shape[1])*self.variance - mdot(phi_alpha.T, posterior.woodbury_inv, phi_alpha)) return (phi_alpha).T.dot(posterior.woodbury_vector), (np.eye(phi_alpha.shape[1])*self.variance - mdot(phi_alpha.T, posterior.woodbury_inv, phi_alpha))
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def _K(self, X, X2): def _K(self, X, X2):
if X2 is None or X is X2: if X2 is None or X is X2:
@ -95,35 +95,35 @@ class BasisFuncKernel(Kern):
phi1 = phi1[:, None] phi1 = phi1[:, None]
phi2 = phi2[:, None] phi2 = phi2[:, None]
return phi1.dot(phi2.T) return phi1.dot(phi2.T)
class LinearSlopeBasisFuncKernel(BasisFuncKernel): class LinearSlopeBasisFuncKernel(BasisFuncKernel):
def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='linear_segment'): def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='linear_segment'):
""" """
A linear segment transformation. The segments start at start, \ A linear segment transformation. The segments start at start, \
are then linear to stop and constant again. The segments are are then linear to stop and constant again. The segments are
normalized, so that they have exactly as much mass above normalized, so that they have exactly as much mass above
as below the origin. as below the origin.
Start and stop can be tuples or lists of starts and stops. Start and stop can be tuples or lists of starts and stops.
Behaviour of start stop is as np.where(X<start) would do. Behaviour of start stop is as np.where(X<start) would do.
""" """
self.start = np.array(start) self.start = np.array(start)
self.stop = np.array(stop) self.stop = np.array(stop)
super(LinearSlopeBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name) super(LinearSlopeBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def _phi(self, X): def _phi(self, X):
phi = np.where(X < self.start, self.start, X) phi = np.where(X < self.start, self.start, X)
phi = np.where(phi > self.stop, self.stop, phi) phi = np.where(phi > self.stop, self.stop, phi)
return ((phi-(self.stop+self.start)/2.))#/(.5*(self.stop-self.start)))-1. return ((phi-(self.stop+self.start)/2.))#/(.5*(self.stop-self.start)))-1.
class ChangePointBasisFuncKernel(BasisFuncKernel): class ChangePointBasisFuncKernel(BasisFuncKernel):
def __init__(self, input_dim, changepoint, variance=1., active_dims=None, ARD=False, name='changepoint'): def __init__(self, input_dim, changepoint, variance=1., active_dims=None, ARD=False, name='changepoint'):
self.changepoint = np.array(changepoint) self.changepoint = np.array(changepoint)
super(ChangePointBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name) super(ChangePointBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def _phi(self, X): def _phi(self, X):
return np.where((X < self.changepoint), -1, 1) return np.where((X < self.changepoint), -1, 1)
@ -131,7 +131,7 @@ class ChangePointBasisFuncKernel(BasisFuncKernel):
class DomainKernel(LinearSlopeBasisFuncKernel): class DomainKernel(LinearSlopeBasisFuncKernel):
def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='constant_domain'): def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='constant_domain'):
super(DomainKernel, self).__init__(input_dim, start, stop, variance, active_dims, ARD, name) super(DomainKernel, self).__init__(input_dim, start, stop, variance, active_dims, ARD, name)
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def _phi(self, X): def _phi(self, X):
phi = np.where((X>self.start)*(X<self.stop), 1, 0) phi = np.where((X>self.start)*(X<self.stop), 1, 0)
@ -147,7 +147,7 @@ class LogisticBasisFuncKernel(BasisFuncKernel):
self.slope = Param('slope', slope, Logexp()) self.slope = Param('slope', slope, Logexp())
super(LogisticBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name) super(LogisticBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
self.link_parameter(self.slope) self.link_parameter(self.slope)
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def _phi(self, X): def _phi(self, X):
import scipy as sp import scipy as sp
@ -156,7 +156,7 @@ class LogisticBasisFuncKernel(BasisFuncKernel):
def parameters_changed(self): def parameters_changed(self):
BasisFuncKernel.parameters_changed(self) BasisFuncKernel.parameters_changed(self)
def update_gradients_full(self, dL_dK, X, X2=None): def update_gradients_full(self, dL_dK, X, X2=None):
super(LogisticBasisFuncKernel, self).update_gradients_full(dL_dK, X, X2) super(LogisticBasisFuncKernel, self).update_gradients_full(dL_dK, X, X2)
if X2 is None or X is X2: if X2 is None or X is X2:

View file

@ -6,7 +6,11 @@ import numpy as np
from ...core.parameterization import Param from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp from ...core.parameterization.transformations import Logexp
from ...util.config import config # for assesing whether to use cython from ...util.config import config # for assesing whether to use cython
from . import coregionalize_cython try:
from . import coregionalize_cython
config.set('cython', 'working', 'True')
except ImportError:
config.set('cython', 'working', 'False')
class Coregionalize(Kern): class Coregionalize(Kern):
""" """
@ -94,7 +98,7 @@ class Coregionalize(Kern):
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2) dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
dkappa = np.diag(dL_dK_small) dkappa = np.diag(dL_dK_small).copy()
dL_dK_small += dL_dK_small.T dL_dK_small += dL_dK_small.T
dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0) dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
@ -111,7 +115,7 @@ class Coregionalize(Kern):
return dL_dK_small return dL_dK_small
def _gradient_reduce_cython(self, dL_dK, index, index2): def _gradient_reduce_cython(self, dL_dK, index, index2):
index, index2 = index[:,0], index2[:,0] index, index2 = np.int64(index[:,0]), np.int64(index2[:,0])
return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2) return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
@ -126,4 +130,3 @@ class Coregionalize(Kern):
def gradients_X_diag(self, dL_dKdiag, X): def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape) return np.zeros(X.shape)

File diff suppressed because it is too large Load diff

View file

@ -1,33 +1,37 @@
#cython: boundscheck=True #cython: boundscheck=False
#cython: wraparound=True #cython: wraparound=False
#cython: nonecheck=False
import cython import cython
import numpy as np import numpy as np
cimport numpy as np cimport numpy as np
def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X): def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
cdef int N = X.size cdef int N = X.size
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, N)) cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, N))
for n in range(N): with nogil:
for m in range(N): for n in range(N):
K[n,m] = B[X[n],X[m]] for m in range(N):
K[n, m] = B[X[n], X[m]]
return K return K
def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2): def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
cdef int N = X.size cdef int N = X.size
cdef int M = X2.size cdef int M = X2.size
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, M)) cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, M))
for n in range(N): with nogil:
for m in range(M): for n in range(N):
K[n,m] = B[X[n],X2[m]] for m in range(M):
K[n, m] = B[X[n], X2[m]]
return K return K
def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2): def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
cdef np.ndarray[np.double_t, ndim=2] dL_dK_small = np.zeros((D, D)) cdef np.ndarray[np.double_t, ndim=2, mode='c'] dL_dK_small = np.zeros((D, D))
cdef int N = index.size cdef int N = index.size
cdef int M = index2.size cdef int M = index2.size
for i in range(N): with nogil:
for j in range(M): for i in range(N):
dL_dK_small[index2[j],index[i]] += dL_dK[i,j]; for j in range(M):
dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
return dL_dK_small return dL_dK_small

View file

@ -105,7 +105,7 @@ class IndependentOutputs(CombinationKernel):
if X2 is None: if X2 is None:
# TODO: make use of index_to_slices # TODO: make use of index_to_slices
# FIXME: Broken as X is already sliced out # FIXME: Broken as X is already sliced out
print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!") # print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
values = np.unique(X[:,self.index_dim]) values = np.unique(X[:,self.index_dim])
slices = [X[:,self.index_dim]==i for i in values] slices = [X[:,self.index_dim]==i for i in values]
[target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None)) [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))

View file

@ -58,20 +58,9 @@ class Kern(Parameterized):
self._sliced_X = 0 self._sliced_X = 0
self.useGPU = self._support_GPU and useGPU self.useGPU = self._support_GPU and useGPU
self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
@property from .psi_comp import PSICOMP_GH
def return_psi2_n(self): self.psicomp = PSICOMP_GH()
"""
Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
"""
return self._return_psi2_n_flag[0]
@return_psi2_n.setter
def return_psi2_n(self, val):
def visit(self):
if isinstance(self, Kern):
self._return_psi2_n_flag[0]=val
self.traverse(visit)
@Cache_this(limit=20) @Cache_this(limit=20)
def _slice_X(self, X): def _slice_X(self, X):
@ -81,6 +70,9 @@ class Kern(Parameterized):
""" """
Compute the kernel function. Compute the kernel function.
.. math::
K_{ij} = k(X_i, X_j)
:param X: the first set of inputs to the kernel :param X: the first set of inputs to the kernel
:param X2: (optional) the second set of arguments to the kernel. If X2 :param X2: (optional) the second set of arguments to the kernel. If X2
is None, this is passed throgh to the 'part' object, which is None, this is passed throgh to the 'part' object, which
@ -88,16 +80,64 @@ class Kern(Parameterized):
""" """
raise NotImplementedError raise NotImplementedError
def Kdiag(self, X): def Kdiag(self, X):
"""
The diagonal of the kernel matrix K
.. math::
Kdiag_{i} = k(X_i, X_i)
"""
raise NotImplementedError raise NotImplementedError
def psi0(self, Z, variational_posterior): def psi0(self, Z, variational_posterior):
raise NotImplementedError """
.. math::
\psi_0 = \sum_{i=0}^{n}E_{q(X)}[k(X_i, X_i)]
"""
return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
def psi1(self, Z, variational_posterior): def psi1(self, Z, variational_posterior):
raise NotImplementedError """
.. math::
\psi_1^{n,m} = E_{q(X)}[k(X_n, Z_m)]
"""
return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
raise NotImplementedError """
.. math::
\psi_2^{m,m'} = \sum_{i=0}^{n}E_{q(X)}[ k(Z_m, X_i) k(X_i, Z_{m'})]
"""
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
def psi2n(self, Z, variational_posterior):
"""
.. math::
\psi_2^{n,m,m'} = E_{q(X)}[ k(Z_m, X_n) k(X_n, Z_{m'})]
Thus, we do not sum out n, compared to psi2
"""
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
def gradients_X(self, dL_dK, X, X2): def gradients_X(self, dL_dK, X, X2):
"""
.. math::
\\frac{\partial L}{\partial X} = \\frac{\partial L}{\partial K}\\frac{\partial K}{\partial X}
"""
raise NotImplementedError raise NotImplementedError
def gradients_X_X2(self, dL_dK, X, X2):
return self.gradients_X(dL_dK, X, X2), self.gradients_X(dL_dK.T, X2, X)
def gradients_XX(self, dL_dK, X, X2):
"""
.. math::
\\frac{\partial^2 L}{\partial X\partial X_2} = \\frac{\partial L}{\partial K}\\frac{\partial^2 K}{\partial X\partial X_2}
"""
raise(NotImplementedError, "This is the second derivative of K wrt X and X2, and not implemented for this kernel")
def gradients_XX_diag(self, dL_dKdiag, X):
"""
The diagonal of the second derivative w.r.t. X and X2
"""
raise(NotImplementedError, "This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel")
def gradients_X_diag(self, dL_dKdiag, X): def gradients_X_diag(self, dL_dKdiag, X):
"""
The diagonal of the derivative w.r.t. X
"""
raise NotImplementedError raise NotImplementedError
def update_gradients_diag(self, dL_dKdiag, X): def update_gradients_diag(self, dL_dKdiag, X):
@ -113,27 +153,35 @@ class Kern(Parameterized):
Set the gradients of all parameters when doing inference with Set the gradients of all parameters when doing inference with
uncertain inputs, using expectations of the kernel. uncertain inputs, using expectations of the kernel.
The esential maths is The essential maths is
dL_d{theta_i} = dL_dpsi0 * dpsi0_d{theta_i} + .. math::
dL_dpsi1 * dpsi1_d{theta_i} +
dL_dpsi2 * dpsi2_d{theta_i} \\frac{\partial L}{\partial \\theta_i} & = \\frac{\partial L}{\partial \psi_0}\\frac{\partial \psi_0}{\partial \\theta_i}\\
& \quad + \\frac{\partial L}{\partial \psi_1}\\frac{\partial \psi_1}{\partial \\theta_i}\\
& \quad + \\frac{\partial L}{\partial \psi_2}\\frac{\partial \psi_2}{\partial \\theta_i}
Thus, we push the different derivatives through the gradients of the psi
statistics. Be sure to set the gradients for all kernel
parameters here.
""" """
raise NotImplementedError dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
self.gradient[:] = dtheta
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None):
""" """
Returns the derivative of the objective wrt Z, using the chain rule Returns the derivative of the objective wrt Z, using the chain rule
through the expectation variables. through the expectation variables.
""" """
raise NotImplementedError return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
""" """
Compute the gradients wrt the parameters of the variational Compute the gradients wrt the parameters of the variational
distruibution q(X), chain-ruling via the expectations of the kernel distruibution q(X), chain-ruling via the expectations of the kernel
""" """
raise NotImplementedError return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]
def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs): def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
""" """
@ -172,7 +220,7 @@ class Kern(Parameterized):
def __iadd__(self, other): def __iadd__(self, other):
return self.add(other) return self.add(other)
def add(self, other, name='add'): def add(self, other, name='sum'):
""" """
Add another kernel to this one. Add another kernel to this one.
@ -208,8 +256,6 @@ class Kern(Parameterized):
:param other: the other kernel to be added :param other: the other kernel to be added
:type other: GPy.kern :type other: GPy.kern
:param tensor: whether or not to use the tensor space (default is false).
:type tensor: bool
""" """
assert isinstance(other, Kern), "only kernels can be multiplied to kernels..." assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."

View file

@ -1,7 +1,11 @@
''' '''
Created on 11 Mar 2014 Created on 11 Mar 2014
@author: maxz @author: @mzwiessele
This module provides a meta class for the kernels. The meta class is for
slicing the inputs (X, X2) for the kernels, before K (or any other method involving X)
gets calls. The `active_dims` of a kernel decide which dimensions the kernel works on.
''' '''
from ...core.parameterization.parameterized import ParametersChangedMeta from ...core.parameterization.parameterized import ParametersChangedMeta
import numpy as np import numpy as np
@ -19,20 +23,27 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
put_clean(dct, 'update_gradients_full', _slice_update_gradients_full) put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag) put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
put_clean(dct, 'gradients_X', _slice_gradients_X) put_clean(dct, 'gradients_X', _slice_gradients_X)
put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
put_clean(dct, 'gradients_XX', _slice_gradients_XX)
put_clean(dct, 'gradients_XX_diag', _slice_gradients_X_diag)
put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag) put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
put_clean(dct, 'psi0', _slice_psi) put_clean(dct, 'psi0', _slice_psi)
put_clean(dct, 'psi1', _slice_psi) put_clean(dct, 'psi1', _slice_psi)
put_clean(dct, 'psi2', _slice_psi) put_clean(dct, 'psi2', _slice_psi)
put_clean(dct, 'psi2n', _slice_psi)
put_clean(dct, 'update_gradients_expectations', _slice_update_gradients_expectations) put_clean(dct, 'update_gradients_expectations', _slice_update_gradients_expectations)
put_clean(dct, 'gradients_Z_expectations', _slice_gradients_Z_expectations) put_clean(dct, 'gradients_Z_expectations', _slice_gradients_Z_expectations)
put_clean(dct, 'gradients_qX_expectations', _slice_gradients_qX_expectations) put_clean(dct, 'gradients_qX_expectations', _slice_gradients_qX_expectations)
return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct) return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)
class _Slice_wrap(object): class _Slice_wrap(object):
def __init__(self, k, X, X2=None): def __init__(self, k, X, X2=None, ret_shape=None):
self.k = k self.k = k
self.shape = X.shape if ret_shape is None:
self.shape = X.shape
else:
self.shape = ret_shape
assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape) assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
if X2 is not None: if X2 is not None:
assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape) assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
@ -54,7 +65,10 @@ class _Slice_wrap(object):
def handle_return_array(self, return_val): def handle_return_array(self, return_val):
if self.ret: if self.ret:
ret = np.zeros(self.shape) ret = np.zeros(self.shape)
ret[:, self.k.active_dims] = return_val if len(self.shape) == 2:
ret[:, self.k.active_dims] = return_val
elif len(self.shape) == 3:
ret[:, :, self.k.active_dims] = return_val
return ret return ret
return return_val return return_val
@ -98,6 +112,19 @@ def _slice_gradients_X(f):
return ret return ret
return wrap return wrap
def _slice_gradients_XX(f):
@wraps(f)
def wrap(self, dL_dK, X, X2=None):
if X2 is None:
N, M = X.shape[0], X.shape[0]
else:
N, M = X.shape[0], X2.shape[0]
with _Slice_wrap(self, X, X2, ret_shape=(N, M, X.shape[1])) as s:
#with _Slice_wrap(self, X, X2, ret_shape=None) as s:
ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
return ret
return wrap
def _slice_gradients_X_diag(f): def _slice_gradients_X_diag(f):
@wraps(f) @wraps(f)
def wrap(self, dL_dKdiag, X): def wrap(self, dL_dKdiag, X):
@ -124,7 +151,8 @@ def _slice_update_gradients_expectations(f):
def _slice_gradients_Z_expectations(f): def _slice_gradients_Z_expectations(f):
@wraps(f) @wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
with _Slice_wrap(self, Z, variational_posterior) as s: with _Slice_wrap(self, Z, variational_posterior) as s:
ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)) ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
return ret return ret
@ -132,7 +160,8 @@ def _slice_gradients_Z_expectations(f):
def _slice_gradients_qX_expectations(f): def _slice_gradients_qX_expectations(f):
@wraps(f) @wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
with _Slice_wrap(self, variational_posterior, Z) as s: with _Slice_wrap(self, variational_posterior, Z) as s:
ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X)) ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X))
r2 = ret[:2] r2 = ret[:2]

View file

@ -17,7 +17,7 @@ class Linear(Kern):
.. math:: .. math::
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i x_iy_i k(x,y) = \sum_{i=1}^{\\text{input_dim}} \sigma^2_i x_iy_i
:param input_dim: the number of input dimensions :param input_dim: the number of input dimensions
:type input_dim: int :type input_dim: int
@ -100,6 +100,12 @@ class Linear(Kern):
#return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) #return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK) return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
def gradients_XX(self, dL_dK, X, X2=None):
if X2 is None:
return 2*np.ones(X.shape)*self.variances
else:
return np.ones(X.shape)*self.variances
def gradients_X_diag(self, dL_dKdiag, X): def gradients_X_diag(self, dL_dKdiag, X):
return 2.*self.variances*dL_dKdiag[:,None]*X return 2.*self.variances*dL_dKdiag[:,None]*X
@ -111,26 +117,29 @@ class Linear(Kern):
#---------------------------------------# #---------------------------------------#
def psi0(self, Z, variational_posterior): def psi0(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0] return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
def psi1(self, Z, variational_posterior): def psi1(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1] return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2] return self.psicomp.psicomputations(self, Z, variational_posterior)[2]
def psi2n(self, Z, variational_posterior):
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0] dL_dvar = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
if self.ARD: if self.ARD:
self.variances.gradient = dL_dvar self.variances.gradient = dL_dvar
else: else:
self.variances.gradient = dL_dvar.sum() self.variances.gradient = dL_dvar.sum()
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1] return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:] return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]
class LinearFull(Kern): class LinearFull(Kern):
def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'): def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):

View file

@ -5,6 +5,8 @@ from .kern import Kern
from ...core.parameterization import Param from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp from ...core.parameterization.transformations import Logexp
import numpy as np import numpy as np
from ...util.linalg import tdot
from ...util.caching import Cache_this
four_over_tau = 2./np.pi four_over_tau = 2./np.pi
class MLP(Kern): class MLP(Kern):
@ -31,105 +33,116 @@ class MLP(Kern):
""" """
def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=100., active_dims=None, name='mlp'): def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=1., ARD=False, active_dims=None, name='mlp'):
super(MLP, self).__init__(input_dim, active_dims, name) super(MLP, self).__init__(input_dim, active_dims, name)
self.variance = Param('variance', variance, Logexp()) self.variance = Param('variance', variance, Logexp())
self.ARD= ARD
if ARD:
wv = np.empty((input_dim,))
wv[:] = weight_variance
weight_variance = wv
self.weight_variance = Param('weight_variance', weight_variance, Logexp()) self.weight_variance = Param('weight_variance', weight_variance, Logexp())
self.bias_variance = Param('bias_variance', bias_variance, Logexp()) self.bias_variance = Param('bias_variance', bias_variance, Logexp())
self.link_parameters(self.variance, self.weight_variance, self.bias_variance) self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
@Cache_this(limit=20, ignore_args=())
def K(self, X, X2=None): def K(self, X, X2=None):
self._K_computations(X, X2) if X2 is None:
return self.variance*self._K_dvar X_denom = np.sqrt(self._comp_prod(X)+1.)
X2_denom = X_denom
X2 = X
else:
X_denom = np.sqrt(self._comp_prod(X)+1.)
X2_denom = np.sqrt(self._comp_prod(X2)+1.)
XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
return self.variance*four_over_tau*np.arcsin(XTX)
@Cache_this(limit=20, ignore_args=())
def Kdiag(self, X): def Kdiag(self, X):
"""Compute the diagonal of the covariance matrix for X.""" """Compute the diagonal of the covariance matrix for X."""
self._K_diag_computations(X) X_prod = self._comp_prod(X)
return self.variance*self._K_diag_dvar return self.variance*four_over_tau*np.arcsin(X_prod/(X_prod+1.))
def update_gradients_full(self, dL_dK, X, X2=None): def update_gradients_full(self, dL_dK, X, X2=None):
"""Derivative of the covariance with respect to the parameters.""" """Derivative of the covariance with respect to the parameters."""
self._K_computations(X, X2) dvar, dw, db = self._comp_grads(dL_dK, X, X2)[:3]
self.variance.gradient = np.sum(self._K_dvar*dL_dK) self.variance.gradient = dvar
self.weight_variance.gradient = dw
denom3 = self._K_denom**3 self.bias_variance.gradient = db
base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
base_cov_grad = base*dL_dK
if X2 is None:
vec = np.diag(self._K_inner_prod)
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
-.5*self._K_numer/denom3
*(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
+np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
self.bias_variance.gradient = ((1./self._K_denom
-.5*self._K_numer/denom3
*((vec[None, :]+vec[:, None])*self.weight_variance
+2.*self.bias_variance + 2.))*base_cov_grad).sum()
else:
vec1 = (X*X).sum(1)
vec2 = (X2*X2).sum(1)
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
-.5*self._K_numer/denom3
*(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
self.bias_variance.gradient = ((1./self._K_denom
-.5*self._K_numer/denom3
*((vec1[:, None]+vec2[None, :])*self.weight_variance
+ 2*self.bias_variance + 2.))*base_cov_grad).sum()
def update_gradients_diag(self, dL_dKdiag, X): def update_gradients_diag(self, dL_dKdiag, X):
self._K_diag_computations(X) dvar, dw, db = self._comp_grads_diag(dL_dKdiag, X)[:3]
self.variance.gradient = np.sum(self._K_diag_dvar*dL_dKdiag) self.variance.gradient = dvar
self.weight_variance.gradient = dw
self.bias_variance.gradient = db
base = four_over_tau*self.variance/np.sqrt(1-self._K_diag_asin_arg*self._K_diag_asin_arg)
base_cov_grad = base*dL_dKdiag/np.square(self._K_diag_denom)
self.weight_variance.gradient = (base_cov_grad*np.square(X).sum(axis=1)).sum()
self.bias_variance.gradient = base_cov_grad.sum()
def gradients_X(self, dL_dK, X, X2): def gradients_X(self, dL_dK, X, X2):
"""Derivative of the covariance matrix with respect to X""" """Derivative of the covariance matrix with respect to X"""
self._K_computations(X, X2) return self._comp_grads(dL_dK, X, X2)[3]
arg = self._K_asin_arg
numer = self._K_numer def gradients_X_X2(self, dL_dK, X, X2):
denom = self._K_denom """Derivative of the covariance matrix with respect to X"""
denom3 = denom*denom*denom return self._comp_grads(dL_dK, X, X2)[3:]
if X2 is not None:
vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
return four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
else:
vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
return 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
def gradients_X_diag(self, dL_dKdiag, X): def gradients_X_diag(self, dL_dKdiag, X):
"""Gradient of diagonal of covariance with respect to X""" """Gradient of diagonal of covariance with respect to X"""
self._K_diag_computations(X) return self._comp_grads_diag(dL_dKdiag, X)[3]
arg = self._K_diag_asin_arg
denom = self._K_diag_denom
#numer = self._K_diag_numer
return four_over_tau*2.*self.weight_variance*self.variance*X*(1./denom*(1. - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
@Cache_this(limit=50, ignore_args=())
def _K_computations(self, X, X2): def _comp_prod(self, X, X2=None):
"""Pre-computations for the covariance matrix (used for computing the covariance and its gradients."""
if X2 is None: if X2 is None:
self._K_inner_prod = np.dot(X,X.T) return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
vec = np.diag(self._K_numer) + 1.
self._K_denom = np.sqrt(np.outer(vec,vec))
else: else:
self._K_inner_prod = np.dot(X,X2.T) return (X*self.weight_variance).dot(X2.T)+self.bias_variance
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1. @Cache_this(limit=20, ignore_args=(1,))
vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1. def _comp_grads(self, dL_dK, X, X2=None):
self._K_denom = np.sqrt(np.outer(vec1,vec2)) var,w,b = self.variance, self.weight_variance, self.bias_variance
self._K_asin_arg = self._K_numer/self._K_denom K = self.K(X, X2)
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg) dvar = (dL_dK*K).sum()/var
X_prod = self._comp_prod(X)
def _K_diag_computations(self, X): X2_prod = self._comp_prod(X2) if X2 is not None else X_prod
"""Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients).""" XTX = self._comp_prod(X,X2) if X2 is not None else self._comp_prod(X, X)
self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance common = var*four_over_tau/np.sqrt((X_prod[:,None]+1.)*(X2_prod[None,:]+1.)-np.square(XTX))*dL_dK
self._K_diag_denom = self._K_diag_numer+1. if self.ARD:
self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom if X2 is not None:
self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg) XX2 = X[:,None,:]*X2[None,:,:] if X2 is not None else X[:,None,:]*X[None,:,:]
XX = np.square(X)
X2X2 = np.square(X2)
Q = self.weight_variance.shape[0]
common_XTX = common*XTX
dw = np.dot(common.flat,XX2.reshape(-1,Q)) -( (common_XTX.sum(1)/(X_prod+1.)).T.dot(XX)+(common_XTX.sum(0)/(X2_prod+1.)).dot(X2X2))/2
else:
XX2 = X[:,None,:]*X[None,:,:]
XX = np.square(X)
Q = self.weight_variance.shape[0]
common_XTX = common*XTX
dw = np.dot(common.flat,XX2.reshape(-1,Q)) - ((common_XTX.sum(0)+common_XTX.sum(1))/(X_prod+1.)).dot(XX)/2
else:
dw = (common*((XTX-b)/w-XTX*(((X_prod-b)/(w*(X_prod+1.)))[:,None]+((X2_prod-b)/(w*(X2_prod+1.)))[None,:])/2.)).sum()
db = (common*(1.-XTX*(1./(X_prod[:,None]+1.)+1./(X2_prod[None,:]+1.))/2.)).sum()
if X2 is None:
common = common+common.T
dX = common.dot(X)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:,None]*X*w
dX2 = dX
else:
dX = common.dot(X2)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:,None]*X*w
dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
return dvar, dw, db, dX, dX2
@Cache_this(limit=20, ignore_args=(1,))
def _comp_grads_diag(self, dL_dKdiag, X):
var,w,b = self.variance, self.weight_variance, self.bias_variance
K = self.Kdiag(X)
dvar = (dL_dKdiag*K).sum()/var
X_prod = self._comp_prod(X)
common = var*four_over_tau/(np.sqrt(1-np.square(X_prod/(X_prod+1)))*np.square(X_prod+1))*dL_dKdiag
if self.ARD:
XX = np.square(X)
dw = np.dot(common,XX)
else:
dw = (common*(X_prod-b)).sum()/w
db = common.sum()
dX = common[:,None]*X*w*2
return dvar, dw, db, dX

View file

@ -27,8 +27,6 @@ class Prod(CombinationKernel):
:param k1, k2: the kernels to multiply :param k1, k2: the kernels to multiply
:type k1, k2: Kern :type k1, k2: Kern
:param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
:type tensor: Boolean
:rtype: kernel object :rtype: kernel object
""" """

View file

@ -9,18 +9,34 @@ from . import ssrbf_psi_comp
from . import sslinear_psi_comp from . import sslinear_psi_comp
from . import linear_psi_comp from . import linear_psi_comp
class PSICOMP_RBF(Pickleable):
@Cache_this(limit=2, ignore_args=(0,)) class PSICOMP(Pickleable):
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
def psicomputations(self, kern, Z, qX, return_psi2_n=False):
raise NotImplementedError("Abstract method!")
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
raise NotImplementedError("Abstract method!")
def _setup_observers(self):
pass
from .gaussherm import PSICOMP_GH
class PSICOMP_RBF(PSICOMP):
@Cache_this(limit=5, ignore_args=(0,))
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
variance, lengthscale = kern.variance, kern.lengthscale
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=return_psi2_n)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
else: else:
raise ValueError("unknown distriubtion received for psi-statistics") raise ValueError("unknown distriubtion received for psi-statistics")
@Cache_this(limit=2, ignore_args=(0,1,2,3)) @Cache_this(limit=5, ignore_args=(0,2,3,4))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
variance, lengthscale = kern.variance, kern.lengthscale
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
@ -28,28 +44,26 @@ class PSICOMP_RBF(Pickleable):
else: else:
raise ValueError("unknown distriubtion received for psi-statistics") raise ValueError("unknown distriubtion received for psi-statistics")
def _setup_observers(self): class PSICOMP_Linear(PSICOMP):
pass
class PSICOMP_Linear(Pickleable): @Cache_this(limit=5, ignore_args=(0,))
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
@Cache_this(limit=2, ignore_args=(0,)) variances = kern.variances
def psicomputations(self, variance, Z, variational_posterior):
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
return linear_psi_comp.psicomputations(variance, Z, variational_posterior) return linear_psi_comp.psicomputations(variances, Z, variational_posterior, return_psi2_n=return_psi2_n)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior) return sslinear_psi_comp.psicomputations(variances, Z, variational_posterior)
else: else:
raise ValueError("unknown distriubtion received for psi-statistics") raise ValueError("unknown distriubtion received for psi-statistics")
@Cache_this(limit=2, ignore_args=(0,1,2,3)) @Cache_this(limit=2, ignore_args=(0,2,3,4))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
variances = kern.variances
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variances, Z, variational_posterior)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variances, Z, variational_posterior)
else: else:
raise ValueError("unknown distriubtion received for psi-statistics") raise ValueError("unknown distriubtion received for psi-statistics")
def _setup_observers(self):
pass

View file

@ -0,0 +1,100 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
An approximated psi-statistics implementation based on Gauss-Hermite Quadrature
"""
import numpy as np
from ....core.parameterization import Param
from GPy.util.caching import Cache_this
from ....util.linalg import tdot
from . import PSICOMP
class PSICOMP_GH(PSICOMP):
"""
TODO: support Psi2 with shape NxMxM
"""
def __init__(self, degree=5, cache_K=True):
self.degree = degree
self.cache_K = cache_K
self.locs, self.weights = np.polynomial.hermite.hermgauss(degree)
self.locs *= np.sqrt(2.)
self.weights*= 1./np.sqrt(np.pi)
self.Xs = None
def _setup_observers(self):
pass
@Cache_this(limit=10, ignore_args=(0,))
def comp_K(self, Z, qX):
if self.Xs is None or self.Xs.shape != qX.mean.shape:
from ....core.parameterization import ObsAr
self.Xs = ObsAr(np.empty((self.degree,)+qX.mean.shape))
mu, S = qX.mean.values, qX.variance.values
S_sq = np.sqrt(S)
for i in xrange(self.degree):
self.Xs[i] = self.locs[i]*S_sq+mu
return self.Xs
@Cache_this(limit=10, ignore_args=(0,))
def psicomputations(self, kern, Z, qX, return_psi2_n=False):
mu, S = qX.mean.values, qX.variance.values
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
if self.cache_K: Xs = self.comp_K(Z, qX)
else: S_sq = np.sqrt(S)
psi0 = np.zeros((N,))
psi1 = np.zeros((N,M))
psi2 = np.zeros((M,M))
for i in xrange(self.degree):
if self.cache_K:
X = Xs[i]
else:
X = self.locs[i]*S_sq+mu
psi0 += self.weights[i]* kern.Kdiag(X)
Kfu = kern.K(X,Z)
psi1 += self.weights[i]* Kfu
psi2 += self.weights[i]* tdot(Kfu.T)
return psi0, psi1, psi2
@Cache_this(limit=10, ignore_args=(0, 2,3,4))
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
mu, S = qX.mean.values, qX.variance.values
if self.cache_K: Xs = self.comp_K(Z, qX)
S_sq = np.sqrt(S)
dtheta_old = kern.gradient.copy()
dtheta = np.zeros_like(kern.gradient)
if isinstance(Z, Param):
dZ = np.zeros_like(Z.values)
else:
dZ = np.zeros_like(Z)
dmu = np.zeros_like(mu)
dS = np.zeros_like(S)
for i in xrange(self.degree):
if self.cache_K:
X = Xs[i]
else:
X = self.locs[i]*S_sq+mu
dL_dpsi0_i = dL_dpsi0*self.weights[i]
kern.update_gradients_diag(dL_dpsi0_i, X)
dtheta += kern.gradient
dX = kern.gradients_X_diag(dL_dpsi0_i, X)
Kfu = kern.K(X,Z)
dL_dkfu = (dL_dpsi1+ 2.*Kfu.dot(dL_dpsi2))*self.weights[i]
kern.update_gradients_full(dL_dkfu, X, Z)
dtheta += kern.gradient
dX_i, dZ_i = kern.gradients_X_X2(dL_dkfu, X, Z)
dX += dX_i
dZ += dZ_i
dmu += dX
dS += dX*self.locs[i]/(2.*S_sq)
kern.gradient[:] = dtheta_old
return dtheta, dZ, dmu, dS

View file

@ -8,7 +8,7 @@ The package for the Psi statistics computation of the linear kernel for Bayesian
import numpy as np import numpy as np
from ....util.linalg import tdot from ....util.linalg import tdot
def psicomputations(variance, Z, variational_posterior): def psicomputations(variance, Z, variational_posterior, return_psi2_n=False):
""" """
Compute psi-statistics for ss-linear kernel Compute psi-statistics for ss-linear kernel
""" """
@ -21,8 +21,12 @@ def psicomputations(variance, Z, variational_posterior):
S = variational_posterior.variance S = variational_posterior.variance
psi0 = (variance*(np.square(mu)+S)).sum(axis=1) psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
psi1 = np.dot(mu,(variance*Z).T) Zv = variance * Z
psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T) psi1 = np.dot(mu,Zv.T)
if return_psi2_n:
psi2 = psi1[:,:,None] * psi1[:,None,:] + np.dot(S[:,None,:] * Zv[None,:,:], Zv.T)
else:
psi2 = np.dot(S.sum(axis=0) * Zv, Zv.T) + tdot(psi1.T)
return psi0, psi1, psi2 return psi0, psi1, psi2
@ -40,7 +44,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati
dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance
dL_dS += dL_dpsi0_var dL_dS += dL_dpsi0_var
dL_dZ += dL_dpsi1_mu*variance dL_dZ += dL_dpsi1_mu*variance
return dL_dvar, dL_dZ, dL_dmu, dL_dS return dL_dvar, dL_dZ, dL_dmu, dL_dS
def _psi2computations(dL_dpsi2, variance, Z, mu, S): def _psi2computations(dL_dpsi2, variance, Z, mu, S):
@ -56,22 +60,42 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
# _psi2_dZ MxQ # _psi2_dZ MxQ
# _psi2_dmu NxQ # _psi2_dmu NxQ
# _psi2_dS NxQ # _psi2_dS NxQ
variance2 = np.square(variance) variance2 = np.square(variance)
common_sum = np.dot(mu,(variance*Z).T) common_sum = np.dot(mu,(variance*Z).T)
Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0) if len(dL_dpsi2.shape)==2:
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z)) dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
Z2_expect = np.inner(common_sum,dL_dpsi2T) common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
Z1_expect = np.dot(dL_dpsi2T,Z) Z2_expect = np.inner(common_sum,dL_dpsi2T)
Z1_expect = np.dot(dL_dpsi2T,Z)
dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
dL_dmu = common_expect*variance
dL_dS = np.empty(S.shape) dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
dL_dS[:] = Z_expect*variance2
dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu) dL_dmu = common_expect*variance
dL_dS = np.empty(S.shape)
dL_dS[:] = Z_expect*variance2
dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
else:
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
dL_dpsi2_ = dL_dpsi2.sum(axis=0)
Z_expect = (np.dot(dL_dpsi2.reshape(N*M,M),Z).reshape(N,M,Q)*Z[None,:,:]).sum(axis=1)
dL_dpsi2T = dL_dpsi2_+dL_dpsi2_.T
dL_dpsi2T_ = dL_dpsi2+np.swapaxes(dL_dpsi2, 1, 2)
common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
common_expect_ = (common_sum[:,:,None]*np.dot(dL_dpsi2T_.reshape(N*M,M),Z).reshape(N,M,Q)).sum(axis=1)
Z2_expect = (common_sum[:,:,None]*dL_dpsi2T_).sum(axis=1)
Z1_expect = np.dot(dL_dpsi2T_.reshape(N*M,M),Z).reshape(N,M,Q)
dL_dvar = 2.*variance*(S*Z_expect).sum(axis=0)+(common_expect_*mu).sum(axis=0)
dL_dmu = common_expect_*variance
dL_dS = np.empty(S.shape)
dL_dS[:] = variance2* Z_expect
dL_dZ = variance2*(S[:,None,:]*Z1_expect).sum(axis=0)+np.dot(Z2_expect.T,variance*mu)
return dL_dvar, dL_dmu, dL_dS, dL_dZ return dL_dvar, dL_dmu, dL_dS, dL_dZ

View file

@ -5,13 +5,7 @@ The module for psi-statistics for RBF kernel
import numpy as np import numpy as np
from GPy.util.caching import Cacher from GPy.util.caching import Cacher
def psicomputations(variance, lengthscale, Z, variational_posterior): def psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=False):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi0, psi1 and psi2 # here are the "statistics" for psi0, psi1 and psi2
# Produced intermediate results: # Produced intermediate results:
# _psi1 NxM # _psi1 NxM
@ -21,16 +15,11 @@ def psicomputations(variance, lengthscale, Z, variational_posterior):
psi0 = np.empty(mu.shape[0]) psi0 = np.empty(mu.shape[0])
psi0[:] = variance psi0[:] = variance
psi1 = _psi1computations(variance, lengthscale, Z, mu, S) psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0) psi2 = _psi2computations(variance, lengthscale, Z, mu, S)
if not return_psi2_n: psi2 = psi2.sum(axis=0)
return psi0, psi1, psi2 return psi0, psi1, psi2
def __psi1computations(variance, lengthscale, Z, mu, S): def __psi1computations(variance, lengthscale, Z, mu, S):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1 # here are the "statistics" for psi1
# Produced intermediate results: # Produced intermediate results:
# _psi1 NxM # _psi1 NxM
@ -45,26 +34,19 @@ def __psi1computations(variance, lengthscale, Z, mu, S):
return _psi1 return _psi1
def __psi2computations(variance, lengthscale, Z, mu, S): def __psi2computations(variance, lengthscale, Z, mu, S):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi2 # here are the "statistics" for psi2
# Produced intermediate results: # Produced intermediate results:
# _psi2 MxM # _psi2 MxM
N,M,Q = mu.shape[0], Z.shape[0], mu.shape[1]
lengthscale2 = np.square(lengthscale) lengthscale2 = np.square(lengthscale)
_psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
_psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
denom = 1./(2.*S+lengthscale2) denom = 1./(2.*S+lengthscale2)
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom) _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+(2*(mu*denom).dot(Z_hat.reshape(M*M,Q).T) - denom.dot(np.square(Z_hat).reshape(M*M,Q).T)).reshape(N,M,M)
_psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
return _psi2 return _psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
@ -86,13 +68,6 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
"""
dL_dpsi1 - NxM
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1 # here are the "statistics" for psi1
# Produced intermediate results: dL_dparams w.r.t. psi1 # Produced intermediate results: dL_dparams w.r.t. psi1
# _dL_dvariance 1 # _dL_dvariance 1
@ -118,13 +93,6 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
dL_dpsi2 - MxM
"""
# here are the "statistics" for psi2 # here are the "statistics" for psi2
# Produced the derivatives w.r.t. psi2: # Produced the derivatives w.r.t. psi2:
# _dL_dvariance 1 # _dL_dvariance 1
@ -157,5 +125,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
_psi1computations = Cacher(__psi1computations, limit=1) _psi1computations = Cacher(__psi1computations, limit=5)
_psi2computations = Cacher(__psi2computations, limit=1) _psi2computations = Cacher(__psi2computations, limit=5)

View file

@ -7,13 +7,6 @@ from ....util.caching import Cache_this
from . import PSICOMP_RBF from . import PSICOMP_RBF
from ....util import gpu_init from ....util import gpu_init
try:
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from ....util.linalg_gpu import sum_axis
except:
pass
gpu_code = """ gpu_code = """
// define THREADNUM // define THREADNUM
@ -241,7 +234,11 @@ gpu_code = """
class PSICOMP_RBF_GPU(PSICOMP_RBF): class PSICOMP_RBF_GPU(PSICOMP_RBF):
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False): def __init__(self, threadnum=256, blocknum=30, GPU_direct=False):
from pycuda.compiler import SourceModule
from ....util.gpu_init import initGPU
initGPU()
self.GPU_direct = GPU_direct self.GPU_direct = GPU_direct
self.gpuCache = None self.gpuCache = None
@ -264,7 +261,8 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
memo[id(self)] = s memo[id(self)] = s
return s return s
def _initGPUCache(self, N, M, Q): def _initGPUCache(self, N, M, Q):
import pycuda.gpuarray as gpuarray
if self.gpuCache == None: if self.gpuCache == None:
self.gpuCache = { self.gpuCache = {
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'), 'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
@ -320,13 +318,14 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
def get_dimensions(self, Z, variational_posterior): def get_dimensions(self, Z, variational_posterior):
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1] return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
@Cache_this(limit=1, ignore_args=(0,)) @Cache_this(limit=5, ignore_args=(0,))
def psicomputations(self, variance, lengthscale, Z, variational_posterior): def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
""" """
Z - MxQ Z - MxQ
mu - NxQ mu - NxQ
S - NxQ S - NxQ
""" """
variance, lengthscale = kern.variance, kern.lengthscale
N,M,Q = self.get_dimensions(Z, variational_posterior) N,M,Q = self.get_dimensions(Z, variational_posterior)
self._initGPUCache(N,M,Q) self._initGPUCache(N,M,Q)
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance) self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
@ -355,8 +354,10 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
else: else:
return psi0, psi1_gpu.get(), psi2_gpu.get() return psi0, psi1_gpu.get(), psi2_gpu.get()
@Cache_this(limit=1, ignore_args=(0,1,2,3)) @Cache_this(limit=5, ignore_args=(0,2,3,4))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
variance, lengthscale = kern.variance, kern.lengthscale
from ....util.linalg_gpu import sum_axis
ARD = (len(lengthscale)!=1) ARD = (len(lengthscale)!=1)
N,M,Q = self.get_dimensions(Z, variational_posterior) N,M,Q = self.get_dimensions(Z, variational_posterior)

View file

@ -9,7 +9,7 @@ from ....util.linalg import tdot
import numpy as np import numpy as np
def psicomputations(variance, Z, variational_posterior): def psicomputations(variance, Z, variational_posterior, return_psi2_n=False):
""" """
Compute psi-statistics for ss-linear kernel Compute psi-statistics for ss-linear kernel
""" """

View file

@ -9,7 +9,7 @@ import numpy as np
try: try:
from scipy import weave from scipy import weave
def _psicomputations(variance, lengthscale, Z, variational_posterior): def _psicomputations(variance, lengthscale, Z, variational_posterior):
""" """
Z - MxQ Z - MxQ
@ -23,7 +23,7 @@ try:
mu = variational_posterior.mean mu = variational_posterior.mean
S = variational_posterior.variance S = variational_posterior.variance
gamma = variational_posterior.binary_prob gamma = variational_posterior.binary_prob
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
l2 = np.square(lengthscale) l2 = np.square(lengthscale)
log_denom1 = np.log(S/l2+1) log_denom1 = np.log(S/l2+1)
@ -35,13 +35,13 @@ try:
psi0[:] = variance psi0[:] = variance
psi1 = np.empty((N,M)) psi1 = np.empty((N,M))
psi2n = np.empty((N,M,M)) psi2n = np.empty((N,M,M))
from ....util.misc import param_to_array from ....util.misc import param_to_array
S = param_to_array(S) S = param_to_array(S)
mu = param_to_array(mu) mu = param_to_array(mu)
gamma = param_to_array(gamma) gamma = param_to_array(gamma)
Z = param_to_array(Z) Z = param_to_array(Z)
support_code = """ support_code = """
#include <math.h> #include <math.h>
""" """
@ -56,11 +56,11 @@ try:
double lq = l2(q); double lq = l2(q);
double Zm1q = Z(m1,q); double Zm1q = Z(m1,q);
double Zm2q = Z(m2,q); double Zm2q = Z(m2,q);
if(m2==0) { if(m2==0) {
// Compute Psi_1 // Compute Psi_1
double muZ = mu(n,q)-Z(m1,q); double muZ = mu(n,q)-Z(m1,q);
double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.; double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.;
double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq); double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq);
log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2)); log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2));
@ -69,10 +69,10 @@ try:
double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.; double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.;
double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q; double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q;
double dZ = Zm1q - Zm2q; double dZ = Zm1q - Zm2q;
double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q); double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq); double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq);
log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2)); log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2));
} }
double exp_psi2_n = exp(log_psi2_n); double exp_psi2_n = exp(log_psi2_n);
psi2n(n,m1,m2) = variance*variance*exp_psi2_n; psi2n(n,m1,m2) = variance*variance*exp_psi2_n;
@ -83,18 +83,18 @@ try:
} }
""" """
weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz) weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
psi2 = psi2n.sum(axis=0) psi2 = psi2n.sum(axis=0)
return psi0,psi1,psi2,psi2n return psi0,psi1,psi2,psi2n
from GPy.util.caching import Cacher from GPy.util.caching import Cacher
psicomputations = Cacher(_psicomputations, limit=1) psicomputations = Cacher(_psicomputations, limit=1)
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1) ARD = (len(lengthscale)!=1)
_,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior) _,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior)
mu = variational_posterior.mean mu = variational_posterior.mean
S = variational_posterior.variance S = variational_posterior.variance
gamma = variational_posterior.binary_prob gamma = variational_posterior.binary_prob
@ -105,7 +105,7 @@ try:
log_gamma = np.log(gamma) log_gamma = np.log(gamma)
log_gamma1 = np.log(1.-gamma) log_gamma1 = np.log(1.-gamma)
variance = float(variance) variance = float(variance)
dvar = np.zeros(1) dvar = np.zeros(1)
dmu = np.zeros((N,Q)) dmu = np.zeros((N,Q))
dS = np.zeros((N,Q)) dS = np.zeros((N,Q))
@ -113,13 +113,13 @@ try:
dl = np.zeros(Q) dl = np.zeros(Q)
dZ = np.zeros((M,Q)) dZ = np.zeros((M,Q))
dvar += np.sum(dL_dpsi0) dvar += np.sum(dL_dpsi0)
from ....util.misc import param_to_array from ....util.misc import param_to_array
S = param_to_array(S) S = param_to_array(S)
mu = param_to_array(mu) mu = param_to_array(mu)
gamma = param_to_array(gamma) gamma = param_to_array(gamma)
Z = param_to_array(Z) Z = param_to_array(Z)
support_code = """ support_code = """
#include <math.h> #include <math.h>
""" """
@ -136,16 +136,16 @@ try:
double Zm2q = Z(m2,q); double Zm2q = Z(m2,q);
double gnq = gamma(n,q); double gnq = gamma(n,q);
double mu_nq = mu(n,q); double mu_nq = mu(n,q);
if(m2==0) { if(m2==0) {
// Compute Psi_1 // Compute Psi_1
double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1); double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1);
if(q==0) {dvar(0) += lpsi1/variance;} if(q==0) {dvar(0) += lpsi1/variance;}
double Zmu = Zm1q - mu_nq; double Zmu = Zm1q - mu_nq;
double denom = Snq+lq; double denom = Snq+lq;
double Zmu2_denom = Zmu*Zmu/denom; double Zmu2_denom = Zmu*Zmu/denom;
double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.); double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.);
double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq); double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq);
double d_exp1,d_exp2; double d_exp1,d_exp2;
@ -157,7 +157,7 @@ try:
d_exp2 = 1.; d_exp2 = 1.;
} }
double exp_sum = d_exp1+d_exp2; double exp_sum = d_exp1+d_exp2;
dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum); dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.; dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
@ -167,13 +167,13 @@ try:
// Compute Psi_2 // Compute Psi_2
double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2); double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2);
if(q==0) {dvar(0) += lpsi2*2/variance;} if(q==0) {dvar(0) += lpsi2*2/variance;}
double dZm1m2 = Zm1q - Zm2q; double dZm1m2 = Zm1q - Zm2q;
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q; double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
double muZhat = mu_nq - (Zm1q + Zm2q)/2.; double muZhat = mu_nq - (Zm1q + Zm2q)/2.;
double denom = 2.*Snq+lq; double denom = 2.*Snq+lq;
double muZhat2_denom = muZhat*muZhat/denom; double muZhat2_denom = muZhat*muZhat/denom;
double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q); double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
double exp2 = log_gamma1(n,q) - Z2/(2.*lq); double exp2 = log_gamma1(n,q) - Z2/(2.*lq);
double d_exp1,d_exp2; double d_exp1,d_exp2;
@ -185,23 +185,23 @@ try:
d_exp2 = 1.; d_exp2 = 1.;
} }
double exp_sum = d_exp1+d_exp2; double exp_sum = d_exp1+d_exp2;
dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum; dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum; dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum; dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
} }
} }
} }
} }
""" """
weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz) weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
dl *= 2.*lengthscale dl *= 2.*lengthscale
if not ARD: if not ARD:
dl = dl.sum() dl = dl.sum()
return dvar, dl, dZ, dmu, dS, dgamma return dvar, dl, dZ, dmu, dS, dgamma
except: except:
@ -219,13 +219,13 @@ except:
mu = variational_posterior.mean mu = variational_posterior.mean
S = variational_posterior.variance S = variational_posterior.variance
gamma = variational_posterior.binary_prob gamma = variational_posterior.binary_prob
psi0 = np.empty(mu.shape[0]) psi0 = np.empty(mu.shape[0])
psi0[:] = variance psi0[:] = variance
psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
return psi0, psi1, psi2 return psi0, psi1, psi2
def _psi1computations(variance, lengthscale, Z, mu, S, gamma): def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
""" """
Z - MxQ Z - MxQ
@ -236,9 +236,9 @@ except:
# here are the "statistics" for psi1 # here are the "statistics" for psi1
# Produced intermediate results: # Produced intermediate results:
# _psi1 NxM # _psi1 NxM
lengthscale2 = np.square(lengthscale) lengthscale2 = np.square(lengthscale)
# psi1 # psi1
_psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ
_psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ
@ -251,9 +251,9 @@ except:
_psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
_psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
_psi1 = variance * np.exp(_psi1_exp_sum) # NxM _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
return _psi1 return _psi1
def _psi2computations(variance, lengthscale, Z, mu, S, gamma): def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
""" """
Z - MxQ Z - MxQ
@ -264,14 +264,14 @@ except:
# here are the "statistics" for psi2 # here are the "statistics" for psi2
# Produced intermediate results: # Produced intermediate results:
# _psi2 MxM # _psi2 MxM
lengthscale2 = np.square(lengthscale) lengthscale2 = np.square(lengthscale)
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
# psi2 # psi2
_psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ
_psi2_denom_sqrt = np.sqrt(_psi2_denom) _psi2_denom_sqrt = np.sqrt(_psi2_denom)
@ -284,28 +284,28 @@ except:
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
return _psi2 return _psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1) ARD = (len(lengthscale)!=1)
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
dL_dlengscale = dl_psi1 + dl_psi2 dL_dlengscale = dl_psi1 + dl_psi2
if not ARD: if not ARD:
dL_dlengscale = dL_dlengscale.sum() dL_dlengscale = dL_dlengscale.sum()
dL_dgamma = dgamma_psi1 + dgamma_psi2 dL_dgamma = dgamma_psi1 + dgamma_psi2
dL_dmu = dmu_psi1 + dmu_psi2 dL_dmu = dmu_psi1 + dmu_psi2
dL_dS = dS_psi1 + dS_psi2 dL_dS = dS_psi1 + dS_psi2
dL_dZ = dZ_psi1 + dZ_psi2 dL_dZ = dZ_psi1 + dZ_psi2
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
""" """
dL_dpsi1 - NxM dL_dpsi1 - NxM
@ -322,9 +322,9 @@ except:
# _dL_dgamma NxQ # _dL_dgamma NxQ
# _dL_dmu NxQ # _dL_dmu NxQ
# _dL_dS NxQ # _dL_dS NxQ
lengthscale2 = np.square(lengthscale) lengthscale2 = np.square(lengthscale)
# psi1 # psi1
_psi1_denom = S / lengthscale2 + 1. # NxQ _psi1_denom = S / lengthscale2 + 1. # NxQ
_psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
@ -346,9 +346,9 @@ except:
_dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
_dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
_dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
""" """
Z - MxQ Z - MxQ
@ -365,14 +365,14 @@ except:
# _dL_dgamma NxQ # _dL_dgamma NxQ
# _dL_dmu NxQ # _dL_dmu NxQ
# _dL_dS NxQ # _dL_dS NxQ
lengthscale2 = np.square(lengthscale) lengthscale2 = np.square(lengthscale)
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
# psi2 # psi2
_psi2_denom = 2.*S / lengthscale2 + 1. # NxQ _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
_psi2_denom_sqrt = np.sqrt(_psi2_denom) _psi2_denom_sqrt = np.sqrt(_psi2_denom)
@ -384,7 +384,7 @@ except:
_psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
_psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
_psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
_psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
@ -394,5 +394,5 @@ except:
_dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
_dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
_dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma

View file

@ -6,14 +6,7 @@ The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM
import numpy as np import numpy as np
from ....util.caching import Cache_this from ....util.caching import Cache_this
from . import PSICOMP_RBF from . import PSICOMP_RBF
from ....util import gpu_init
try:
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from ....util.linalg_gpu import sum_axis
except:
pass
gpu_code = """ gpu_code = """
// define THREADNUM // define THREADNUM
@ -292,6 +285,11 @@ gpu_code = """
class PSICOMP_SSRBF_GPU(PSICOMP_RBF): class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False): def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
from pycuda.compiler import SourceModule
from ....util.gpu_init import initGPU
initGPU()
self.GPU_direct = GPU_direct self.GPU_direct = GPU_direct
self.gpuCache = None self.gpuCache = None
@ -314,7 +312,8 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
memo[id(self)] = s memo[id(self)] = s
return s return s
def _initGPUCache(self, N, M, Q): def _initGPUCache(self, N, M, Q):
import pycuda.gpuarray as gpuarray
if self.gpuCache == None: if self.gpuCache == None:
self.gpuCache = { self.gpuCache = {
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'), 'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
@ -377,12 +376,13 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1] return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
@Cache_this(limit=1, ignore_args=(0,)) @Cache_this(limit=1, ignore_args=(0,))
def psicomputations(self, variance, lengthscale, Z, variational_posterior): def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
""" """
Z - MxQ Z - MxQ
mu - NxQ mu - NxQ
S - NxQ S - NxQ
""" """
variance, lengthscale = kern.variance, kern.lengthscale
N,M,Q = self.get_dimensions(Z, variational_posterior) N,M,Q = self.get_dimensions(Z, variational_posterior)
self._initGPUCache(N,M,Q) self._initGPUCache(N,M,Q)
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
@ -409,8 +409,10 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
else: else:
return psi0, psi1_gpu.get(), psi2_gpu.get() return psi0, psi1_gpu.get(), psi2_gpu.get()
@Cache_this(limit=1, ignore_args=(0,1,2,3)) @Cache_this(limit=1, ignore_args=(0,2,3,4))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
variance, lengthscale = kern.variance, kern.lengthscale
from ....util.linalg_gpu import sum_axis
ARD = (len(lengthscale)!=1) ARD = (len(lengthscale)!=1)
N,M,Q = self.get_dimensions(Z, variational_posterior) N,M,Q = self.get_dimensions(Z, variational_posterior)

View file

@ -31,6 +31,9 @@ class RBF(Stationary):
def dK_dr(self, r): def dK_dr(self, r):
return -r*self.K_of_r(r) return -r*self.K_of_r(r)
def dK2_drdr(self, r):
return (r**2-1)*self.K_of_r(r)
def __getstate__(self): def __getstate__(self):
dc = super(RBF, self).__getstate__() dc = super(RBF, self).__getstate__()
if self.useGPU: if self.useGPU:
@ -50,22 +53,25 @@ class RBF(Stationary):
#---------------------------------------# #---------------------------------------#
def psi0(self, Z, variational_posterior): def psi0(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[0] return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
def psi1(self, Z, variational_posterior): def psi1(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1] return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2] return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
def psi2n(self, Z, variational_posterior):
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2] dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[:2]
self.variance.gradient = dL_dvar self.variance.gradient = dL_dvar
self.lengthscale.gradient = dL_dlengscale self.lengthscale.gradient = dL_dlengscale
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2] return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2]
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:] return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[3:]

52
GPy/kern/_src/spline.py Normal file
View file

@ -0,0 +1,52 @@
# Copyright (c) 2015, Thomas Hornung
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Spline(Kern):
    """
    Linear spline kernel. You need to specify 2 parameters: the variance and c.
    The variance is defined in powers of 10. Thus specifying -2 means 10^-2.
    The parameter c allows to define the stiffness of the spline fit. A very stiff
    spline equals linear regression.

    Inputs are shifted and scaled as (x + 8)/16 before the spline basis terms
    are evaluated.
    # NOTE(review): the hard-coded shift/scale implies inputs are expected in
    # roughly [-8, 8] -- confirm against how callers scale their data.

    See https://www.youtube.com/watch?v=50Vgw11qn0o starting at minute 1:17:28

    Lit: Wahba, 1990
    """

    def __init__(self, input_dim, variance=1., c=1., active_dims=None, name='spline'):
        """
        :param input_dim: number of input dimensions
        :type input_dim: int
        :param variance: signal variance parameter (constrained positive)
        :type variance: float
        :param c: stiffness of the spline fit; a very stiff spline equals linear regression
        :type c: float
        :param active_dims: indices of the input dimensions the kernel acts on
        :param name: name of the kernel instance
        :type name: str
        """
        super(Spline, self).__init__(input_dim, active_dims, name)
        self.variance = Param('variance', variance, Logexp())
        self.c = Param('c', c)
        self.link_parameters(self.variance, self.c)

    def _spline_terms(self, X, X2):
        """Return the three spline basis terms shared by K and update_gradients_full."""
        term1 = (X + 8.) * (X2.T + 8.) / 16.
        term2 = abs((X - X2.T) / 16.) ** 3
        term3 = ((X + 8.) / 16.) ** 3 + ((X2.T + 8.) / 16.) ** 3
        return term1, term2, term3

    def K(self, X, X2=None):
        """Compute the covariance matrix between X (Nx1) and X2 (Mx1)."""
        if X2 is None: X2 = X
        term1, term2, term3 = self._spline_terms(X, X2)
        return (self.variance**2 * (1. + (1. + self.c) * term1 + self.c / 3. * (term2 - term3)))

    def Kdiag(self, X):
        """Compute the diagonal of K(X, X) without forming the full matrix."""
        # On the diagonal (X == X2) term2 vanishes and term1/term3 simplify.
        # Fixed: the original called np.square(X+8., X+8.), which passes the
        # temporary as NumPy's positional `out=` argument -- same value, wrong idiom.
        term1 = np.square(X + 8.) / 16.
        term3 = 2. * ((X + 8.) / 16.) ** 3
        return (self.variance**2 * (1. + (1. + self.c) * term1 - self.c / 3. * term3))[:, 0]

    def update_gradients_full(self, dL_dK, X, X2=None):
        """Gradients of the likelihood w.r.t. variance and c, given dL/dK."""
        if X2 is None: X2 = X
        term1, term2, term3 = self._spline_terms(X, X2)
        # d/dvariance of variance**2 * (...) is 2*variance * (...)
        self.variance.gradient = np.sum(dL_dK * (2*self.variance * (1. + (1.+self.c) * term1 + self.c/3. * ( term2 - term3))))
        self.c.gradient = np.sum(dL_dK * (self.variance**2* (term1 + 1./3.*(term2 - term3))))

    def update_gradients_diag(self, dL_dKdiag, X):
        raise NotImplementedError

    def gradients_X(self, dL_dK, X, X2=None):
        raise NotImplementedError

    def gradients_X_diag(self, dL_dKdiag, X):
        raise NotImplementedError

View file

@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
The standard periodic kernel which mentioned in:
[1] Gaussian Processes for Machine Learning, C. E. Rasmussen, C. K. I. Williams.
The MIT Press, 2005.
[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
"""
from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
class StdPeriodic(Kern):
    """
    Standart periodic kernel

    .. math::

       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\theta_1` in the formula above
    :type variance: float
    :param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
    :type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
    :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
    :param ARD1: Auto Relevance Determination with respect to wavelength.
        If equal to "False" one single wavelength parameter :math:`\lambda_i` for
        each dimension is assumed, otherwise there is one lengthscale
        parameter per dimension.
    :type ARD1: Boolean
    :param ARD2: Auto Relevance Determination with respect to lengthscale.
        If equal to "False" one single wavelength parameter :math:`l_i` for
        each dimension is assumed, otherwise there is one lengthscale
        parameter per dimension.
    :type ARD2: Boolean
    :param active_dims: indices of dimensions which are used in the computation of the kernel
    :type active_dims: array or list of the appropriate size
    :param name: Name of the kernel for output
    :type name: String
    :param useGPU: whether of not use GPU
    :type useGPU: Boolean
    """

    def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
        super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
        self.ARD1 = ARD1 # correspond to wavelengths
        self.ARD2 = ARD2 # correspond to lengthscales
        self.name = name

        # Validate/initialize the wavelengths: one shared value when ARD1 is
        # off, one value per input dimension when ARD1 is on.
        if self.ARD1 == False:
            if wavelength is not None:
                wavelength = np.asarray(wavelength)
                assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
            else:
                wavelength = np.ones(1)
        else:
            if wavelength is not None:
                wavelength = np.asarray(wavelength)
                assert wavelength.size == input_dim, "bad number of wavelengths"
            else:
                wavelength = np.ones(input_dim)

        # Same scheme for the lengthscales, governed by ARD2.
        if self.ARD2 == False:
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
                assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
            else:
                lengthscale = np.ones(1)
        else:
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
                assert lengthscale.size == input_dim, "bad number of lengthscales"
            else:
                lengthscale = np.ones(input_dim)

        self.variance = Param('variance', variance, Logexp())
        assert self.variance.size==1, "Variance size must be one"
        self.wavelengths = Param('wavelengths', wavelength, Logexp())
        self.lengthscales = Param('lengthscales', lengthscale, Logexp())

        self.link_parameters(self.variance,  self.wavelengths, self.lengthscales)

    def parameters_changed(self):
        """
        This functions deals as a callback for each optimization iteration.
        If one optimization step was successfull and the parameters
        this callback function will be called to be able to update any
        precomputations for the kernel.
        """
        pass

    def K(self, X, X2=None):
        """Compute the covariance matrix between X and X2."""
        if X2 is None:
            X2 = X

        # base has shape (N, M, Q): pi * pairwise differences / wavelengths.
        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
        # Sum of squared sines over the input dimensions (axis -1) gives NxM.
        exp_dist = np.exp( -0.5* np.sum( np.square(  np.sin( base ) / self.lengthscales ), axis = -1 ) )

        return self.variance * exp_dist

    def Kdiag(self, X):
        """Compute the diagonal of the covariance matrix associated to X."""
        # The exponent is zero on the diagonal, so k(x, x) = variance.
        ret = np.empty(X.shape[0])
        ret[:] = self.variance
        return ret

    def update_gradients_full(self, dL_dK, X, X2=None):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None:
            X2 = X

        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths

        sin_base = np.sin( base )
        exp_dist = np.exp( -0.5* np.sum( np.square(  sin_base / self.lengthscales ), axis = -1 ) )

        # Per-dimension partials (NxMxQ) before reduction with dL_dK.
        dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)

        dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)

        self.variance.gradient = np.sum(exp_dist * dL_dK)

        #target[0] += np.sum( exp_dist * dL_dK)

        if self.ARD1: # different wavelengths
            self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
        else:  # same wavelengths
            self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)

        if self.ARD2: # different lengthscales
            self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
        else: # same lengthscales
            self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)

    def update_gradients_diag(self, dL_dKdiag, X):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        # Only the variance appears on the diagonal; the periodic terms cancel.
        self.variance.gradient = np.sum(dL_dKdiag)
        self.wavelengths.gradient = 0
        self.lengthscales.gradient = 0

#    def gradients_X(self, dL_dK, X, X2=None):
#        """derivative of the covariance matrix with respect to X."""
#
#        raise NotImplemented("Periodic kernel: dK_dX not implemented")
#
#    def gradients_X_diag(self, dL_dKdiag, X):
#
#        raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")

View file

@ -24,6 +24,13 @@ class Static(Kern):
def gradients_X_diag(self, dL_dKdiag, X): def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape) return np.zeros(X.shape)
def gradients_XX(self, dL_dK, X, X2):
if X2 is None:
X2 = X
return np.zeros((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
def gradients_XX_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return np.zeros(Z.shape) return np.zeros(Z.shape)
@ -59,6 +66,9 @@ class White(Static):
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64) return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
def psi2n(self, Z, variational_posterior):
return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_full(self, dL_dK, X, X2=None): def update_gradients_full(self, dL_dK, X, X2=None):
if X2 is None: if X2 is None:
self.variance.gradient = np.trace(dL_dK) self.variance.gradient = np.trace(dL_dK)
@ -92,6 +102,11 @@ class Bias(Static):
ret[:] = self.variance*self.variance*variational_posterior.shape[0] ret[:] = self.variance*self.variance*variational_posterior.shape[0]
return ret return ret
def psi2n(self, Z, variational_posterior):
ret = np.empty((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
ret[:] = self.variance*self.variance
return ret
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0] self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0]
@ -120,6 +135,9 @@ class Fixed(Static):
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64) return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
def psi2n(self, Z, variational_posterior):
return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum() self.variance.gradient = dL_dpsi0.sum()

View file

@ -15,7 +15,7 @@ from ...util.caching import Cache_this
try: try:
from . import stationary_cython from . import stationary_cython
except ImportError: except ImportError:
print('warning in sationary: failed to import cython module: falling back to numpy') print('warning in stationary: failed to import cython module: falling back to numpy')
config.set('cython', 'working', 'false') config.set('cython', 'working', 'false')
@ -25,13 +25,16 @@ class Stationary(Kern):
Stationary covariance fucntion depend only on r, where r is defined as Stationary covariance fucntion depend only on r, where r is defined as
r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 } .. math::
r(x, x') = \\sqrt{ \\sum_{q=1}^Q (x_q - x'_q)^2 }
The covariance function k(x, x' can then be written k(r). The covariance function k(x, x' can then be written k(r).
In this implementation, r is scaled by the lengthscales parameter(s): In this implementation, r is scaled by the lengthscales parameter(s):
r = \sqrt{ \sum_{q=1}^Q \frac{(x_q - x'_q)^2}{\ell_q^2} }. .. math::
r(x, x') = \\sqrt{ \\sum_{q=1}^Q \\frac{(x_q - x'_q)^2}{\ell_q^2} }.
By default, there's only one lengthscale: seaprate lengthscales for each By default, there's only one lengthscale: seaprate lengthscales for each
dimension can be enables by setting ARD=True. dimension can be enables by setting ARD=True.
@ -39,11 +42,12 @@ class Stationary(Kern):
To implement a stationary covariance function using this class, one need To implement a stationary covariance function using this class, one need
only define the covariance function k(r), and it derivative. only define the covariance function k(r), and it derivative.
... ```
def K_of_r(self, r): def K_of_r(self, r):
return foo return foo
def dK_dr(self, r): def dK_dr(self, r):
return bar return bar
```
The lengthscale(s) and variance parameters are added to the structure automatically. The lengthscale(s) and variance parameters are added to the structure automatically.
@ -77,6 +81,10 @@ class Stationary(Kern):
def dK_dr(self, r): def dK_dr(self, r):
raise NotImplementedError("implement derivative of the covariance function wrt r to use this class") raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")
@Cache_this(limit=20, ignore_args=())
def dK2_drdr(self, r):
raise NotImplementedError("implement second derivative of covariance wrt r to use this method")
@Cache_this(limit=5, ignore_args=()) @Cache_this(limit=5, ignore_args=())
def K(self, X, X2=None): def K(self, X, X2=None):
""" """
@ -89,11 +97,16 @@ class Stationary(Kern):
r = self._scaled_dist(X, X2) r = self._scaled_dist(X, X2)
return self.K_of_r(r) return self.K_of_r(r)
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=20, ignore_args=())
def dK_dr_via_X(self, X, X2): def dK_dr_via_X(self, X, X2):
#a convenience function, so we can cache dK_dr #a convenience function, so we can cache dK_dr
return self.dK_dr(self._scaled_dist(X, X2)) return self.dK_dr(self._scaled_dist(X, X2))
@Cache_this(limit=3, ignore_args=())
def dK2_drdr_via_X(self, X, X2):
#a convenience function, so we can cache dK_dr
return self.dK2_drdr(self._scaled_dist(X, X2))
def _unscaled_dist(self, X, X2=None): def _unscaled_dist(self, X, X2=None):
""" """
Compute the Euclidean distance between each row of X and X2, or between Compute the Euclidean distance between each row of X and X2, or between
@ -114,12 +127,13 @@ class Stationary(Kern):
r2 = np.clip(r2, 0, np.inf) r2 = np.clip(r2, 0, np.inf)
return np.sqrt(r2) return np.sqrt(r2)
@Cache_this(limit=5, ignore_args=()) @Cache_this(limit=20, ignore_args=())
def _scaled_dist(self, X, X2=None): def _scaled_dist(self, X, X2=None):
""" """
Efficiently compute the scaled distance, r. Efficiently compute the scaled distance, r.
r = \sqrt( \sum_{q=1}^Q (x_q - x'q)^2/l_q^2 ) ..math::
r = \sqrt( \sum_{q=1}^Q (x_q - x'q)^2/l_q^2 )
Note that if thre is only one lengthscale, l comes outside the sum. In Note that if thre is only one lengthscale, l comes outside the sum. In
this case we compute the unscaled distance first (in a separate this case we compute the unscaled distance first (in a separate
@ -201,6 +215,59 @@ class Stationary(Kern):
else: else:
return self._gradients_X_pure(dL_dK, X, X2) return self._gradients_X_pure(dL_dK, X, X2)
def gradients_XX(self, dL_dK, X, X2=None):
"""
Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
..math:
\frac{\partial^2 K}{\partial X\partial X2}
..returns:
dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
Thus, we return the second derivative in X2.
"""
# The off diagonals in Q are always zero, this should also be true for the Linear kernel...
# According to multivariable chain rule, we can chain the second derivative through r:
# d2K_dXdX2 = dK_dr*d2r_dXdX2 + d2K_drdr * dr_dX * dr_dX2:
invdist = self._inv_dist(X, X2)
invdist2 = invdist**2
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
tmp1 = dL_dr * invdist
dL_drdr = self.dK2_drdr_via_X(X, X2) * dL_dK
tmp2 = dL_drdr * invdist2
l2 = np.ones(X.shape[1]) * self.lengthscale**2
if X2 is None:
X2 = X
tmp1 -= np.eye(X.shape[0])*self.variance
else:
tmp1[X==X2.T] -= self.variance
grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
#grad = np.empty(X.shape, dtype=np.float64)
for q in range(self.input_dim):
tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q]
#grad[:, :, q] = ((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q]
#np.sum(((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q], axis=1, out=grad[:,q])
#np.sum( - (tmp2*(tmpdist**2)), axis=1, out=grad[:,q])
return grad
def gradients_XX_diag(self, dL_dK, X):
"""
Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
..math:
\frac{\partial^2 K}{\partial X\partial X2}
..returns:
dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ]
"""
return np.ones(X.shape) * self.variance/self.lengthscale**2
def _gradients_X_pure(self, dL_dK, X, X2=None): def _gradients_X_pure(self, dL_dK, X, X2=None):
invdist = self._inv_dist(X, X2) invdist = self._inv_dist(X, X2)
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
@ -259,7 +326,7 @@ class OU(Stationary):
.. math:: .. math::
k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^{\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
""" """
@ -279,7 +346,7 @@ class Matern32(Stationary):
.. math:: .. math::
k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
""" """
@ -326,7 +393,7 @@ class Matern52(Stationary):
.. math:: .. math::
k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r) k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
""" """
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'): def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name) super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)

File diff suppressed because it is too large Load diff

View file

@ -4,14 +4,15 @@
import numpy as np import numpy as np
cimport numpy as np cimport numpy as np
from cython.parallel import prange from cython.parallel import prange
cimport cython
ctypedef np.float64_t DTYPE_t ctypedef np.float64_t DTYPE_t
cdef extern from "stationary_utils.h": cdef extern from "stationary_utils.h":
void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) nogil
cdef extern from "stationary_utils.h": cdef extern from "stationary_utils.h":
void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) nogil
def grad_X(int N, int D, int M, def grad_X(int N, int D, int M,
np.ndarray[DTYPE_t, ndim=2] _X, np.ndarray[DTYPE_t, ndim=2] _X,
@ -22,18 +23,18 @@ def grad_X(int N, int D, int M,
cdef double *X2 = <double*> _X2.data cdef double *X2 = <double*> _X2.data
cdef double *tmp = <double*> _tmp.data cdef double *tmp = <double*> _tmp.data
cdef double *grad = <double*> _grad.data cdef double *grad = <double*> _grad.data
_grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place. with nogil:
_grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
@cython.cdivision(True)
def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad): def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad):
cdef int n,d,nd,m cdef int n,d,nd,m
for nd in prange(N*D, nogil=True): for nd in prange(N * D, nogil=True):
n = nd/D n = nd / D
d = nd%D d = nd % D
grad[n,d] = 0.0 grad[n,d] = 0.0
for m in range(M): for m in range(M):
grad[n,d] += tmp[n,m]*(X[n,d]-X2[m,d]) grad[n,d] += tmp[n, m] * (X[n, d] - X2[m, d])
def lengthscale_grads_in_c(int N, int M, int Q, def lengthscale_grads_in_c(int N, int M, int Q,
np.ndarray[DTYPE_t, ndim=2] _tmp, np.ndarray[DTYPE_t, ndim=2] _tmp,
@ -44,16 +45,16 @@ def lengthscale_grads_in_c(int N, int M, int Q,
cdef double *X = <double*> _X.data cdef double *X = <double*> _X.data
cdef double *X2 = <double*> _X2.data cdef double *X2 = <double*> _X2.data
cdef double *grad = <double*> _grad.data cdef double *grad = <double*> _grad.data
_lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place. with nogil:
_lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad): def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad):
cdef int q, n, m cdef int q, n, m
cdef double gradq, dist cdef double gradq, dist
for q in range(Q): with nogil:
grad[q] = 0.0 for q in range(Q):
for n in range(N): grad[q] = 0.0
for m in range(M): for n in range(N):
dist = X[n,q] - X2[m,q] for m in range(M):
grad[q] += tmp[n,m]*dist*dist dist = X[n,q] - X2[m,q]
grad[q] += tmp[n, m] * dist * dist

View file

@ -1,3 +1,5 @@
#ifndef __APPLE__
#include <omp.h> #include <omp.h>
#endif
void _grad_X(int N, int D, int M, double*X, double* X2, double* tmp, double* grad); void _grad_X(int N, int D, int M, double*X, double* X2, double* tmp, double* grad);
void _lengthscale_grads(int N, int D, int M, double* X, double* X2, double* tmp, double* grad); void _lengthscale_grads(int N, int D, int M, double* X, double* X2, double* tmp, double* grad);

View file

@ -15,7 +15,7 @@ class TruncLinear(Kern):
.. math:: .. math::
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \simga_q) k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)
:param input_dim: the number of input dimensions :param input_dim: the number of input dimensions
:type input_dim: int :type input_dim: int
@ -54,7 +54,7 @@ class TruncLinear(Kern):
self.delta = Param('delta', delta) self.delta = Param('delta', delta)
self.add_parameter(self.variances) self.add_parameter(self.variances)
self.add_parameter(self.delta) self.add_parameter(self.delta)
@Cache_this(limit=2) @Cache_this(limit=2)
def K(self, X, X2=None): def K(self, X, X2=None):
XX = self.variances*self._product(X, X2) XX = self.variances*self._product(X, X2)
@ -114,7 +114,7 @@ class TruncLinear_inf(Kern):
.. math:: .. math::
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \simga_q) k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)
:param input_dim: the number of input dimensions :param input_dim: the number of input dimensions
:type input_dim: int :type input_dim: int
@ -148,8 +148,8 @@ class TruncLinear_inf(Kern):
self.variances = Param('variances', variances, Logexp()) self.variances = Param('variances', variances, Logexp())
self.add_parameter(self.variances) self.add_parameter(self.variances)
# @Cache_this(limit=2) # @Cache_this(limit=2)
def K(self, X, X2=None): def K(self, X, X2=None):
tmp = self._product(X, X2) tmp = self._product(X, X2)

View file

@ -1,6 +1,6 @@
from .bernoulli import Bernoulli from .bernoulli import Bernoulli
from .exponential import Exponential from .exponential import Exponential
from .gaussian import Gaussian from .gaussian import Gaussian, HeteroscedasticGaussian
from .gamma import Gamma from .gamma import Gamma
from .poisson import Poisson from .poisson import Poisson
from .student_t import StudentT from .student_t import StudentT

View file

@ -85,6 +85,7 @@ class Bernoulli(Likelihood):
gh_x, gh_w = gh_points gh_x, gh_w = gh_points
gh_w = gh_w / np.sqrt(np.pi)
shape = m.shape shape = m.shape
m,v,Y = m.flatten(), v.flatten(), Y.flatten() m,v,Y = m.flatten(), v.flatten(), Y.flatten()
Ysign = np.where(Y==1,1,-1) Ysign = np.where(Y==1,1,-1)
@ -232,6 +233,17 @@ class Bernoulli(Likelihood):
np.seterr(**state) np.seterr(**state)
return d3logpdf_dlink3 return d3logpdf_dlink3
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
"""
Get the "quantiles" of the binary labels (Bernoulli draws). all the
quantiles must be either 0 or 1, since those are the only values the
draw can take!
"""
p = self.predictive_mean(mu, var)
return [np.asarray(p>(q/100.), dtype=np.int32) for q in quantiles]
def samples(self, gp, Y_metadata=None): def samples(self, gp, Y_metadata=None):
""" """
Returns a set of samples of observations based on a given value of the latent variable. Returns a set of samples of observations based on a given value of the latent variable.

View file

@ -124,7 +124,7 @@ class Exponential(Likelihood):
#d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3) #d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
return d3lik_dlink3 return d3lik_dlink3
def samples(self, gp): def samples(self, gp, Y_metadata=None):
""" """
Returns a set of samples of observations based on a given value of the latent variable. Returns a set of samples of observations based on a given value of the latent variable.

View file

@ -48,6 +48,7 @@ class Gaussian(Likelihood):
def betaY(self,Y,Y_metadata=None): def betaY(self,Y,Y_metadata=None):
#TODO: ~Ricardo this does not live here #TODO: ~Ricardo this does not live here
raise RuntimeError("Please notify the GPy developers, this should not happen")
return Y/self.gaussian_variance(Y_metadata) return Y/self.gaussian_variance(Y_metadata)
def gaussian_variance(self, Y_metadata=None): def gaussian_variance(self, Y_metadata=None):
@ -315,9 +316,44 @@ class Gaussian(Likelihood):
return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None): def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
if not isinstance(self.gp_link, link_functions.Identity):
return super(Gaussian, self).variational_expectations(Y=Y, m=m, v=v, gh_points=gh_points, Y_metadata=Y_metadata)
lik_var = float(self.variance) lik_var = float(self.variance)
F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
dF_dmu = (Y - m)/lik_var dF_dmu = (Y - m)/lik_var
dF_dv = np.ones_like(v)*(-0.5/lik_var) dF_dv = np.ones_like(v)*(-0.5/lik_var)
dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2) dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)
return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1]) return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1])
class HeteroscedasticGaussian(Gaussian):
def __init__(self, Y_metadata, gp_link=None, variance=1., name='het_Gauss'):
if gp_link is None:
gp_link = link_functions.Identity()
if not isinstance(gp_link, link_functions.Identity):
print("Warning, Exact inference is not implemeted for non-identity link functions,\
if you are not already, ensure Laplace inference_method is used")
super(HeteroscedasticGaussian, self).__init__(gp_link, np.ones(Y_metadata['output_index'].shape)*variance, name)
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
return dL_dKdiag[Y_metadata['output_index']]
def gaussian_variance(self, Y_metadata=None):
return self.variance[Y_metadata['output_index'].flatten()]
def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
_s = self.variance[Y_metadata['output_index'].flatten()]
if full_cov:
if var.ndim == 2:
var += np.eye(var.shape[0])*_s
if var.ndim == 3:
var += np.atleast_3d(np.eye(var.shape[0])*_s)
else:
var += _s
return mu, var
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
_s = self.variance[Y_metadata['output_index'].flatten()]
return [stats.norm.ppf(q/100.)*np.sqrt(var + _s) + mu for q in quantiles]

View file

@ -607,7 +607,7 @@ class Likelihood(Parameterized):
pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata) pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata) pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
except NotImplementedError: except NotImplementedError:
print "Finding predictive mean and variance via sampling rather than quadrature" print("Finding predictive mean and variance via sampling rather than quadrature")
Nf_samp = 300 Nf_samp = 300
Ny_samp = 1 Ny_samp = 1
s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
@ -622,7 +622,7 @@ class Likelihood(Parameterized):
Nf_samp = 300 Nf_samp = 300
Ny_samp = 1 Ny_samp = 1
s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
ss_y = self.samples(s, Y_metadata, samples=Ny_samp) ss_y = self.samples(s, Y_metadata)#, samples=Ny_samp)
#ss_y = ss_y.reshape(mu.shape[0], mu.shape[1], Nf_samp*Ny_samp) #ss_y = ss_y.reshape(mu.shape[0], mu.shape[1], Nf_samp*Ny_samp)
pred_quantiles = [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles] pred_quantiles = [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles]

View file

@ -2,6 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
import scipy
from ..util.univariate_Gaussian import std_norm_cdf, std_norm_pdf from ..util.univariate_Gaussian import std_norm_cdf, std_norm_pdf
import scipy as sp import scipy as sp
from ..util.misc import safe_exp, safe_square, safe_cube, safe_quad, safe_three_times from ..util.misc import safe_exp, safe_square, safe_cube, safe_quad, safe_three_times
@ -67,7 +68,7 @@ class Probit(GPTransformation):
.. math:: .. math::
g(f) = \\Phi^{-1} (mu) g(f) = \\Phi^{-1} (mu)
""" """
def transf(self,f): def transf(self,f):
return std_norm_cdf(f) return std_norm_cdf(f)
@ -140,7 +141,7 @@ class Log_ex_1(GPTransformation):
""" """
def transf(self,f): def transf(self,f):
return np.log1p(safe_exp(f)) return scipy.special.log1p(safe_exp(f))
def dtransf_df(self,f): def dtransf_df(self,f):
ef = safe_exp(f) ef = safe_exp(f)

View file

@ -145,5 +145,7 @@ class Poisson(Likelihood):
""" """
orig_shape = gp.shape orig_shape = gp.shape
gp = gp.flatten() gp = gp.flatten()
# Ysim = np.random.poisson(self.gp_link.transf(gp), [samples, gp.size]).T
# return Ysim.reshape(orig_shape+(samples,))
Ysim = np.random.poisson(self.gp_link.transf(gp)) Ysim = np.random.poisson(self.gp_link.transf(gp))
return Ysim.reshape(orig_shape) return Ysim.reshape(orig_shape)

View file

@ -9,6 +9,7 @@ from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_miniba
import logging import logging
from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch
from GPy.core.parameterization.param import Param from GPy.core.parameterization.param import Param
from GPy.core.parameterization.observable_array import ObsAr
class BayesianGPLVMMiniBatch(SparseGPMiniBatch): class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
""" """
@ -80,46 +81,10 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
"""Get the gradients of the posterior distribution of X in its specific form.""" """Get the gradients of the posterior distribution of X in its specific form."""
return X.mean.gradient, X.variance.gradient return X.mean.gradient, X.variance.gradient
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kw): def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None, **kw):
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices, **kw) posterior, log_marginal_likelihood, grad_dict = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm,
psi0=psi0, psi1=psi1, psi2=psi2, **kw)
if self.has_uncertain_inputs(): return posterior, log_marginal_likelihood, grad_dict
current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
variational_posterior=X,
Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
dL_dpsi1=grad_dict['dL_dpsi1'],
dL_dpsi2=grad_dict['dL_dpsi2'])
else:
current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], X, Z)
current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], X)
if subset_indices is not None:
value_indices['Xgrad'] = subset_indices['samples']
kl_fctr = self.kl_factr
if self.has_uncertain_inputs():
if self.missing_data:
d = self.output_dim
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
else:
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
# Subsetting Variational Posterior objects, makes the gradients
# empty. We need them to be 0 though:
X.mean.gradient[:] = 0
X.variance.gradient[:] = 0
self.variational_prior.update_gradients_KL(X)
if self.missing_data:
current_values['meangrad'] += kl_fctr*X.mean.gradient/d
current_values['vargrad'] += kl_fctr*X.variance.gradient/d
else:
current_values['meangrad'] += kl_fctr*X.mean.gradient
current_values['vargrad'] += kl_fctr*X.variance.gradient
if subset_indices is not None:
value_indices['meangrad'] = subset_indices['samples']
value_indices['vargrad'] = subset_indices['samples']
return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
def _outer_values_update(self, full_values): def _outer_values_update(self, full_values):
""" """
@ -128,22 +93,47 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
""" """
super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values)
if self.has_uncertain_inputs(): if self.has_uncertain_inputs():
self.X.mean.gradient = full_values['meangrad'] meangrad_tmp, vargrad_tmp = self.kern.gradients_qX_expectations(
self.X.variance.gradient = full_values['vargrad'] variational_posterior=self.X,
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
dL_dpsi1=full_values['dL_dpsi1'],
dL_dpsi2=full_values['dL_dpsi2'],
psi0=self.psi0, psi1=self.psi1, psi2=self.psi2)
self.X.mean.gradient = meangrad_tmp
self.X.variance.gradient = vargrad_tmp
else: else:
self.X.gradient = full_values['Xgrad'] self.X.gradient = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z)
self.X.gradient += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X)
def _outer_init_full_values(self): def _outer_init_full_values(self):
if self.has_uncertain_inputs(): return super(BayesianGPLVMMiniBatch, self)._outer_init_full_values()
return dict(meangrad=np.zeros(self.X.mean.shape),
vargrad=np.zeros(self.X.variance.shape))
else:
return dict(Xgrad=np.zeros(self.X.shape))
def parameters_changed(self): def parameters_changed(self):
super(BayesianGPLVMMiniBatch,self).parameters_changed() super(BayesianGPLVMMiniBatch,self).parameters_changed()
if isinstance(self.inference_method, VarDTC_minibatch):
return kl_fctr = self.kl_factr
if kl_fctr > 0:
Xgrad = self.X.gradient.copy()
self.X.gradient[:] = 0
self.variational_prior.update_gradients_KL(self.X)
if self.missing_data or not self.stochastics:
self.X.mean.gradient = kl_fctr*self.X.mean.gradient
self.X.variance.gradient = kl_fctr*self.X.variance.gradient
else:
d = self.output_dim
self.X.mean.gradient = kl_fctr*self.X.mean.gradient*self.stochastics.batchsize/d
self.X.variance.gradient = kl_fctr*self.X.variance.gradient*self.stochastics.batchsize/d
self.X.gradient += Xgrad
if self.missing_data or not self.stochastics:
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
elif self.stochastics:
d = self.output_dim
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d
self._Xgrad = self.X.gradient.copy()
def plot_latent(self, labels=None, which_indices=None, def plot_latent(self, labels=None, which_indices=None,
resolution=50, ax=None, marker='o', s=40, resolution=50, ax=None, marker='o', s=40,

View file

@ -1,11 +1,11 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). # Copyright (c) 2015 James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ..core import GP from ..core import GP
from ..models import GPLVM from . import GPLVM
from ..mappings import * from .. import mappings
class BCGPLVM(GPLVM): class BCGPLVM(GPLVM):
@ -16,33 +16,31 @@ class BCGPLVM(GPLVM):
:type Y: np.ndarray :type Y: np.ndarray
:param input_dim: latent dimensionality :param input_dim: latent dimensionality
:type input_dim: int :type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
:param mapping: mapping for back constraint :param mapping: mapping for back constraint
:type mapping: GPy.core.Mapping object :type mapping: GPy.core.Mapping object
""" """
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None): def __init__(self, Y, input_dim, kernel=None, mapping=None):
if mapping is None: if mapping is None:
mapping = Kernel(X=Y, output_dim=input_dim) mapping = mappings.MLP(input_dim=Y.shape[1],
output_dim=input_dim,
hidden_dim=10)
else:
assert mapping.input_dim==Y.shape[1], "mapping input dim does not work for Y dimension"
assert mapping.output_dim==input_dim, "mapping output dim does not work for self.input_dim"
GPLVM.__init__(self, Y, input_dim, X=mapping.f(Y), kernel=kernel, name="bcgplvm")
self.unlink_parameter(self.X)
self.mapping = mapping self.mapping = mapping
GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y) self.link_parameter(self.mapping)
self.X = self.mapping.f(self.likelihood.Y)
def _get_param_names(self): self.X = self.mapping.f(self.Y)
return self.mapping._get_param_names() + GP._get_param_names(self)
def _get_params(self): def parameters_changed(self):
return np.hstack((self.mapping._get_params(), GP._get_params(self))) self.X = self.mapping.f(self.Y)
GP.parameters_changed(self)
Xgradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
self.mapping.update_gradients(Xgradient, self.Y)
def _set_params(self, x):
self.mapping._set_params(x[:self.mapping.num_params])
self.X = self.mapping.f(self.likelihood.Y)
GP._set_params(self, x[self.mapping.num_params:])
def _log_likelihood_gradients(self):
dL_df = self.kern.gradients_X(self.dL_dK, self.X)
dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))

View file

@ -16,6 +16,8 @@ class GPHeteroscedasticRegression(GP):
:param X: input observations :param X: input observations
:param Y: observed values :param Y: observed values
:param kernel: a GPy kernel, defaults to rbf :param kernel: a GPy kernel, defaults to rbf
NB: This model does not make inference on the noise outside the training set
""" """
def __init__(self, X, Y, kernel=None, Y_metadata=None): def __init__(self, X, Y, kernel=None, Y_metadata=None):
@ -30,10 +32,7 @@ class GPHeteroscedasticRegression(GP):
kernel = kern.RBF(X.shape[1]) kernel = kern.RBF(X.shape[1])
#Likelihood #Likelihood
#likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in range(Ny)] likelihood = likelihoods.HeteroscedasticGaussian(Y_metadata)
noise_terms = np.unique(Y_metadata['output_index'].flatten())
likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in noise_terms]
likelihood = likelihoods.MixedNoise(likelihoods_list=likelihoods_list)
super(GPHeteroscedasticRegression, self).__init__(X,Y,kernel,likelihood, Y_metadata=Y_metadata) super(GPHeteroscedasticRegression, self).__init__(X,Y,kernel,likelihood, Y_metadata=Y_metadata)

View file

@ -1,5 +1,5 @@
# Copyright (c) 2014, James Hensman, Alan Saul # Copyright (c) 2014, James Hensman, Alan Saul
# Distributed under the terms of the GNU General public License, see LICENSE.txt # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ..core.model import Model from ..core.model import Model

View file

@ -26,12 +26,12 @@ class GPRegression(GP):
""" """
def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1.): def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1., mean_function=None):
if kernel is None: if kernel is None:
kernel = kern.RBF(X.shape[1]) kernel = kern.RBF(X.shape[1])
likelihood = likelihoods.Gaussian(variance=noise_var) likelihood = likelihoods.Gaussian(variance=noise_var)
super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer) super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)

View file

@ -1,20 +1,17 @@
# Copyright (c) 2014, James Hensman, Alan Saul # Copyright (c) 2014, James Hensman, Alan Saul
# Distributed under the terms of the GNU General public License, see LICENSE.txt # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from scipy import stats from ..core import GP
from scipy.special import erf
from ..core.model import Model
from ..core.parameterization import ObsAr from ..core.parameterization import ObsAr
from .. import kern from .. import kern
from ..core.parameterization.param import Param from ..core.parameterization.param import Param
from ..util.linalg import pdinv from ..inference.latent_function_inference import VarGauss
from ..likelihoods import Gaussian
log_2_pi = np.log(2*np.pi) log_2_pi = np.log(2*np.pi)
class GPVariationalGaussianApproximation(Model): class GPVariationalGaussianApproximation(GP):
""" """
The Variational Gaussian Approximation revisited The Variational Gaussian Approximation revisited
@ -26,70 +23,14 @@ class GPVariationalGaussianApproximation(Model):
pages = {786--792}, pages = {786--792},
} }
""" """
def __init__(self, X, Y, kernel, likelihood=None, Y_metadata=None): def __init__(self, X, Y, kernel, likelihood, Y_metadata=None):
Model.__init__(self,'Variational GP')
if likelihood is None:
likelihood = Gaussian()
# accept the construction arguments
self.X = ObsAr(X)
self.Y = Y
self.num_data, self.input_dim = self.X.shape
self.Y_metadata = Y_metadata
self.kern = kernel num_data = Y.shape[0]
self.likelihood = likelihood self.alpha = Param('alpha', np.zeros((num_data,1))) # only one latent fn for now.
self.link_parameter(self.kern) self.beta = Param('beta', np.ones(num_data))
self.link_parameter(self.likelihood)
inf = VarGauss(self.alpha, self.beta)
super(GPVariationalGaussianApproximation, self).__init__(X, Y, kernel, likelihood, name='VarGP', inference_method=inf)
self.alpha = Param('alpha', np.zeros((self.num_data,1))) # only one latent fn for now.
self.beta = Param('beta', np.ones(self.num_data))
self.link_parameter(self.alpha) self.link_parameter(self.alpha)
self.link_parameter(self.beta) self.link_parameter(self.beta)
def log_likelihood(self):
return self._log_lik
def parameters_changed(self):
K = self.kern.K(self.X)
m = K.dot(self.alpha)
KB = K*self.beta[:, None]
BKB = KB*self.beta[None, :]
A = np.eye(self.num_data) + BKB
Ai, LA, _, Alogdet = pdinv(A)
Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :] # posterior coavairance: need full matrix for gradients
var = np.diag(Sigma).reshape(-1,1)
F, dF_dm, dF_dv, dF_dthetaL = self.likelihood.variational_expectations(self.Y, m, var, Y_metadata=self.Y_metadata)
self.likelihood.gradient = dF_dthetaL.sum(1).sum(1)
dF_da = np.dot(K, dF_dm)
SigmaB = Sigma*self.beta
dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + np.sum(m*self.alpha))
dKL_da = m
A_A2 = Ai - Ai.dot(Ai)
dKL_db = np.diag(np.dot(KB.T, A_A2))
self._log_lik = F.sum() - KL
self.alpha.gradient = dF_da - dKL_da
self.beta.gradient = dF_db - dKL_db
# K-gradients
dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
tmp = Ai*self.beta[:, None]/self.beta[None, :]
dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
def _raw_predict(self, Xnew):
"""
Predict the function(s) at the new point(s) Xnew.
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim
"""
Wi, _, _, _ = pdinv(self.kern.K(self.X) + np.diag(self.beta**-2))
Kux = self.kern.K(self.X, Xnew)
mu = np.dot(Kux.T, self.alpha)
WiKux = np.dot(Wi, Kux)
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(WiKux*Kux, 0)
return mu, var.reshape(-1,1)

View file

@ -36,6 +36,7 @@ class GPLVM(GP):
likelihood = Gaussian() likelihood = Gaussian()
super(GPLVM, self).__init__(X, Y, kernel, likelihood, name='GPLVM') super(GPLVM, self).__init__(X, Y, kernel, likelihood, name='GPLVM')
self.X = Param('latent_mean', X) self.X = Param('latent_mean', X)
self.link_parameter(self.X, index=0) self.link_parameter(self.X, index=0)
@ -43,27 +44,30 @@ class GPLVM(GP):
super(GPLVM, self).parameters_changed() super(GPLVM, self).parameters_changed()
self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None) self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
def jacobian(self,X): #def jacobian(self,X):
J = np.zeros((X.shape[0],X.shape[1],self.output_dim)) # J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
for i in range(self.output_dim): # for i in range(self.output_dim):
J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X) # J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X)
return J # return J
def magnification(self,X): #def magnification(self,X):
target=np.zeros(X.shape[0]) # target=np.zeros(X.shape[0])
#J = np.zeros((X.shape[0],X.shape[1],self.output_dim)) # #J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
J = self.jacobian(X) ## J = self.jacobian(X)
for i in range(X.shape[0]): # for i in range(X.shape[0]):
target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:])))) # target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
return target # return target
def plot(self): def plot(self):
assert self.likelihood.Y.shape[1] == 2 assert self.Y.shape[1] == 2, "too high dimensional to plot. Try plot_latent"
pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet) # @UndefinedVariable from matplotlib import pyplot as plt
plt.scatter(self.Y[:, 0],
self.Y[:, 1],
40, self.X[:, 0].copy(),
linewidth=0, cmap=plt.cm.jet)
Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None] Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
mu, _ = self.predict(Xnew) mu, _ = self.predict(Xnew)
import pylab as pb plt.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
def plot_latent(self, labels=None, which_indices=None, def plot_latent(self, labels=None, which_indices=None,
resolution=50, ax=None, marker='o', s=40, resolution=50, ax=None, marker='o', s=40,
@ -78,6 +82,3 @@ class GPLVM(GP):
resolution, ax, marker, s, resolution, ax, marker, s,
fignum, False, legend, fignum, False, legend,
plot_limits, aspect, updates, **kwargs) plot_limits, aspect, updates, **kwargs)
def plot_magnification(self, *args, **kwargs):
return util.plot_latent.plot_magnification(self, *args, **kwargs)

View file

@ -251,7 +251,7 @@ class HessianChecker(GradientChecker):
print(grad_string) print(grad_string)
if plot: if plot:
import pylab as pb from matplotlib import pyplot as pb
fig, axes = pb.subplots(2, 2) fig, axes = pb.subplots(2, 2)
max_lim = numpy.max(numpy.vstack((analytic_hess, numeric_hess))) max_lim = numpy.max(numpy.vstack((analytic_hess, numeric_hess)))
min_lim = numpy.min(numpy.vstack((analytic_hess, numeric_hess))) min_lim = numpy.min(numpy.vstack((analytic_hess, numeric_hess)))

View file

@ -170,20 +170,19 @@ class MRD(BayesianGPLVMMiniBatch):
self._log_marginal_likelihood += b._log_marginal_likelihood self._log_marginal_likelihood += b._log_marginal_likelihood
self.logger.info('working on im <{}>'.format(hex(id(i)))) self.logger.info('working on im <{}>'.format(hex(id(i))))
self.Z.gradient[:] += b.full_values['Zgrad'] self.Z.gradient[:] += b.Z.gradient#full_values['Zgrad']
grad_dict = b.full_values #grad_dict = b.full_values
if self.has_uncertain_inputs(): if self.has_uncertain_inputs():
self.X.mean.gradient += grad_dict['meangrad'] self.X.gradient += b._Xgrad
self.X.variance.gradient += grad_dict['vargrad']
else: else:
self.X.gradient += grad_dict['Xgrad'] self.X.gradient += b._Xgrad
if self.has_uncertain_inputs(): #if self.has_uncertain_inputs():
# update for the KL divergence # # update for the KL divergence
self.variational_prior.update_gradients_KL(self.X) # self.variational_prior.update_gradients_KL(self.X)
self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X) # self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
pass # pass
def log_likelihood(self): def log_likelihood(self):
return self._log_marginal_likelihood return self._log_marginal_likelihood

View file

@ -44,7 +44,7 @@ class SparseGPMiniBatch(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False, name='sparse gp', Y_metadata=None, normalizer=False,
missing_data=False, stochastic=False, batchsize=1): missing_data=False, stochastic=False, batchsize=1):
# pick a sensible inference method # pick a sensible inference method
if inference_method is None: if inference_method is None:
if isinstance(likelihood, likelihoods.Gaussian): if isinstance(likelihood, likelihoods.Gaussian):
@ -63,10 +63,10 @@ class SparseGPMiniBatch(SparseGP):
if stochastic and missing_data: if stochastic and missing_data:
self.missing_data = True self.missing_data = True
self.stochastics = SparseGPStochastics(self, batchsize) self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data)
elif stochastic and not missing_data: elif stochastic and not missing_data:
self.missing_data = False self.missing_data = False
self.stochastics = SparseGPStochastics(self, batchsize) self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data)
elif missing_data: elif missing_data:
self.missing_data = True self.missing_data = True
self.stochastics = SparseGPMissing(self) self.stochastics = SparseGPMissing(self)
@ -80,7 +80,7 @@ class SparseGPMiniBatch(SparseGP):
def has_uncertain_inputs(self): def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior) return isinstance(self.X, VariationalPosterior)
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kwargs): def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None, **kwargs):
""" """
This is the standard part, which usually belongs in parameters_changed. This is the standard part, which usually belongs in parameters_changed.
@ -99,47 +99,13 @@ class SparseGPMiniBatch(SparseGP):
like them into this dictionary for inner use of the indices inside the like them into this dictionary for inner use of the indices inside the
algorithm. algorithm.
""" """
try: if psi2 is None:
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, **kwargs) psi2_sum_n = None
except:
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata)
current_values = {}
likelihood.update_gradients(grad_dict['dL_dthetaL'])
current_values['likgrad'] = likelihood.gradient.copy()
if subset_indices is None:
subset_indices = {}
if isinstance(X, VariationalPosterior):
#gradients wrt kernel
dL_dKmm = grad_dict['dL_dKmm']
kern.update_gradients_full(dL_dKmm, Z, None)
current_values['kerngrad'] = kern.gradient.copy()
kern.update_gradients_expectations(variational_posterior=X,
Z=Z,
dL_dpsi0=grad_dict['dL_dpsi0'],
dL_dpsi1=grad_dict['dL_dpsi1'],
dL_dpsi2=grad_dict['dL_dpsi2'])
current_values['kerngrad'] += kern.gradient
#gradients wrt Z
current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
current_values['Zgrad'] += kern.gradients_Z_expectations(
grad_dict['dL_dpsi0'],
grad_dict['dL_dpsi1'],
grad_dict['dL_dpsi2'],
Z=Z,
variational_posterior=X)
else: else:
#gradients wrt kernel psi2_sum_n = psi2.sum(axis=0)
kern.update_gradients_diag(grad_dict['dL_dKdiag'], X) posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm,
current_values['kerngrad'] = kern.gradient.copy() dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs)
kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z) return posterior, log_marginal_likelihood, grad_dict
current_values['kerngrad'] += kern.gradient
kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
current_values['kerngrad'] += kern.gradient
#gradients wrt Z
current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None): def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
""" """
@ -173,7 +139,10 @@ class SparseGPMiniBatch(SparseGP):
else: else:
index = slice(None) index = slice(None)
if key in full_values: if key in full_values:
full_values[key][index] += current_values[key] try:
full_values[key][index] += current_values[key]
except:
full_values[key] += current_values[key]
else: else:
full_values[key] = current_values[key] full_values[key] = current_values[key]
@ -192,9 +161,41 @@ class SparseGPMiniBatch(SparseGP):
Here you put the values, which were collected before in the right places. Here you put the values, which were collected before in the right places.
E.g. set the gradients of parameters, etc. E.g. set the gradients of parameters, etc.
""" """
self.likelihood.gradient = full_values['likgrad'] if self.has_uncertain_inputs():
self.kern.gradient = full_values['kerngrad'] #gradients wrt kernel
self.Z.gradient = full_values['Zgrad'] dL_dKmm = full_values['dL_dKmm']
self.kern.update_gradients_full(dL_dKmm, self.Z, None)
kgrad = self.kern.gradient.copy()
self.kern.update_gradients_expectations(
variational_posterior=self.X,
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
dL_dpsi1=full_values['dL_dpsi1'],
dL_dpsi2=full_values['dL_dpsi2'])
self.kern.gradient += kgrad
#gradients wrt Z
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
self.Z.gradient += self.kern.gradients_Z_expectations(
variational_posterior=self.X,
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
dL_dpsi1=full_values['dL_dpsi1'],
dL_dpsi2=full_values['dL_dpsi2'])
else:
#gradients wrt kernel
self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X)
kgrad = self.kern.gradient.copy()
self.kern.update_gradients_full(full_values['dL_dKnm'], self.X, self.Z)
kgrad += self.kern.gradient
self.kern.update_gradients_full(full_values['dL_dKmm'], self.Z, None)
self.kern.gradient += kgrad
#kgrad += self.kern.gradient
#gradients wrt Z
self.Z.gradient = self.kern.gradients_X(full_values['dL_dKmm'], self.Z)
self.Z.gradient += self.kern.gradients_X(full_values['dL_dKnm'].T, self.Z, self.X)
self.likelihood.update_gradients(full_values['dL_dthetaL'])
def _outer_init_full_values(self): def _outer_init_full_values(self):
""" """
@ -209,7 +210,15 @@ class SparseGPMiniBatch(SparseGP):
to initialize the gradients for the mean and the variance in order to to initialize the gradients for the mean and the variance in order to
have the full gradient for indexing) have the full gradient for indexing)
""" """
return {} retd = dict(dL_dKmm=np.zeros((self.Z.shape[0], self.Z.shape[0])))
if self.has_uncertain_inputs():
retd.update(dict(dL_dpsi0=np.zeros(self.X.shape[0]),
dL_dpsi1=np.zeros((self.X.shape[0], self.Z.shape[0])),
dL_dpsi2=np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0]))))
else:
retd.update({'dL_dKdiag': np.zeros(self.X.shape[0]),
'dL_dKnm': np.zeros((self.X.shape[0], self.Z.shape[0]))})
return retd
def _outer_loop_for_missing_data(self): def _outer_loop_for_missing_data(self):
Lm = None Lm = None
@ -231,28 +240,36 @@ class SparseGPMiniBatch(SparseGP):
print(message, end=' ') print(message, end=' ')
for d, ninan in self.stochastics.d: for d, ninan in self.stochastics.d:
if not self.stochastics: if not self.stochastics:
print(' '*(len(message)) + '\r', end=' ') print(' '*(len(message)) + '\r', end=' ')
message = m_f(d) message = m_f(d)
print(message, end=' ') print(message, end=' ')
posterior, log_marginal_likelihood, \ psi0ni = self.psi0[ninan]
grad_dict, current_values, value_indices = self._inner_parameters_changed( psi1ni = self.psi1[ninan]
if self.has_uncertain_inputs():
psi2ni = self.psi2[ninan]
value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, dL_dpsi2=ninan)
else:
psi2ni = None
value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan)
posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed(
self.kern, self.X[ninan], self.kern, self.X[ninan],
self.Z, self.likelihood, self.Z, self.likelihood,
self.Y_normalized[ninan][:, d], self.Y_metadata, self.Y_normalized[ninan][:, d], self.Y_metadata,
Lm, dL_dKmm, Lm, dL_dKmm,
subset_indices=dict(outputs=d, samples=ninan)) psi0=psi0ni, psi1=psi1ni, psi2=psi2ni)
self._inner_take_over_or_update(self.full_values, current_values, value_indices) # Fill out the full values by adding in the apporpriate grad_dict
self._inner_values_update(current_values) # values
self._inner_take_over_or_update(self.full_values, grad_dict, value_indices)
self._inner_values_update(grad_dict) # What is this for? -> MRD
Lm = posterior.K_chol
dL_dKmm = grad_dict['dL_dKmm']
woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None] woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None]
woodbury_vector[:, d] = posterior.woodbury_vector woodbury_vector[:, d] = posterior.woodbury_vector
self._log_marginal_likelihood += log_marginal_likelihood self._log_marginal_likelihood += log_marginal_likelihood
if not self.stochastics: if not self.stochastics:
print('') print('')
@ -260,10 +277,10 @@ class SparseGPMiniBatch(SparseGP):
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol) K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
self._outer_values_update(self.full_values) self._outer_values_update(self.full_values)
if self.has_uncertain_inputs():
self.kern.return_psi2_n = False
def _outer_loop_without_missing_data(self): def _outer_loop_without_missing_data(self):
self._log_marginal_likelihood = 0
if self.posterior is None: if self.posterior is None:
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim)) woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
woodbury_vector = np.zeros((self.num_inducing, self.output_dim)) woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
@ -271,17 +288,16 @@ class SparseGPMiniBatch(SparseGP):
woodbury_inv = self.posterior._woodbury_inv woodbury_inv = self.posterior._woodbury_inv
woodbury_vector = self.posterior._woodbury_vector woodbury_vector = self.posterior._woodbury_vector
d = self.stochastics.d d = self.stochastics.d[0][0]
posterior, log_marginal_likelihood, \ posterior, log_marginal_likelihood, grad_dict= self._inner_parameters_changed(
grad_dict, self.full_values, _ = self._inner_parameters_changed(
self.kern, self.X, self.kern, self.X,
self.Z, self.likelihood, self.Z, self.likelihood,
self.Y_normalized[:, d], self.Y_metadata) self.Y_normalized[:, d], self.Y_metadata)
self.grad_dict = grad_dict self.grad_dict = grad_dict
self._log_marginal_likelihood += log_marginal_likelihood self._log_marginal_likelihood = log_marginal_likelihood
self._outer_values_update(self.full_values) self._outer_values_update(self.grad_dict)
woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None] woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
woodbury_vector[:, d] = posterior.woodbury_vector woodbury_vector[:, d] = posterior.woodbury_vector
@ -290,10 +306,23 @@ class SparseGPMiniBatch(SparseGP):
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol) K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
def parameters_changed(self): def parameters_changed(self):
#Compute the psi statistics for N once, but don't sum out N in psi2
if self.has_uncertain_inputs():
#psi0 = ObsAr(self.kern.psi0(self.Z, self.X))
#psi1 = ObsAr(self.kern.psi1(self.Z, self.X))
#psi2 = ObsAr(self.kern.psi2(self.Z, self.X))
self.psi0 = self.kern.psi0(self.Z, self.X)
self.psi1 = self.kern.psi1(self.Z, self.X)
self.psi2 = self.kern.psi2n(self.Z, self.X)
else:
self.psi0 = self.kern.Kdiag(self.X)
self.psi1 = self.kern.K(self.X, self.Z)
self.psi2 = None
if self.missing_data: if self.missing_data:
self._outer_loop_for_missing_data() self._outer_loop_for_missing_data()
elif self.stochastics: elif self.stochastics:
self._outer_loop_without_missing_data() self._outer_loop_without_missing_data()
else: else:
self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata) self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
self._outer_values_update(self.full_values) self._outer_values_update(self.grad_dict)

View file

@ -4,4 +4,8 @@
try: try:
from . import matplot_dep from . import matplot_dep
except (ImportError, NameError): except (ImportError, NameError):
print('Fail to load GPy.plotting.matplot_dep.') # Matplotlib not available
import warnings
warnings.warn(ImportWarning("Matplotlib not available, install newest version of Matplotlib for plotting"))
#sys.modules['matplotlib'] =
#sys.modules[__name__+'.matplot_dep'] = ImportWarning("Matplotlib not available, install newest version of Matplotlib for plotting")

View file

@ -3,7 +3,7 @@
import matplotlib as mpl import matplotlib as mpl
import pylab as pb from matplotlib import pyplot as pb
import sys import sys
#sys.path.append('/home/james/mlprojects/sitran_cluster/') #sys.path.append('/home/james/mlprojects/sitran_cluster/')
#from switch_pylab_backend import * #from switch_pylab_backend import *
@ -159,7 +159,7 @@ cdict_Alu = {'red' :((0./5,colorsRGB['Aluminium1'][0]/256.,colorsRGB['Aluminium1
# cmap_BGR = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_BGR,256) # cmap_BGR = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_BGR,256)
# cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256) # cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256)
if __name__=='__main__': if __name__=='__main__':
import pylab as pb from matplotlib import pyplot as pb
pb.figure() pb.figure()
pb.pcolor(pb.rand(10,10),cmap=cmap_RB) pb.pcolor(pb.rand(10,10),cmap=cmap_RB)
pb.colorbar() pb.colorbar()

View file

@ -3,8 +3,8 @@
try: try:
import Tango #import Tango
import pylab as pb from matplotlib import pyplot as pb
except: except:
pass pass
import numpy as np import numpy as np
@ -17,11 +17,11 @@ def ax_default(fignum, ax):
fig = ax.figure fig = ax.figure
return fig, ax return fig, ax
def meanplot(x, mu, color=Tango.colorsHex['darkBlue'], ax=None, fignum=None, linewidth=2,**kw): def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
_, axes = ax_default(fignum, ax) _, axes = ax_default(fignum, ax)
return axes.plot(x,mu,color=color,linewidth=linewidth,**kw) return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)
def gpplot(x, mu, lower, upper, edgecol=Tango.colorsHex['darkBlue'], fillcol=Tango.colorsHex['lightBlue'], ax=None, fignum=None, **kwargs): def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
_, axes = ax_default(fignum, ax) _, axes = ax_default(fignum, ax)
mu = mu.flatten() mu = mu.flatten()
@ -47,6 +47,32 @@ def gpplot(x, mu, lower, upper, edgecol=Tango.colorsHex['darkBlue'], fillcol=Tan
return plots return plots
def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
_, axes = ax_default(fignum, ax)
mu = mu.flatten()
x = x.flatten()
lower = lower.flatten()
upper = upper.flatten()
plots = []
if edgecol is None:
edgecol='#3300FF'
if not 'alpha' in kwargs.keys():
kwargs['alpha'] = 1.
if not 'lw' in kwargs.keys():
kwargs['lw'] = 1.
plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
plots[-1][0].remove()
return plots
def removeRightTicks(ax=None): def removeRightTicks(ax=None):
ax = ax or pb.gca() ax = ax or pb.gca()
for i, line in enumerate(ax.get_yticklines()): for i, line in enumerate(ax.get_yticklines()):

View file

@ -9,7 +9,8 @@ import itertools
try: try:
import Tango import Tango
from matplotlib.cm import get_cmap from matplotlib.cm import get_cmap
import pylab as pb from matplotlib import pyplot as pb
from matplotlib import cm
except: except:
pass pass
@ -114,7 +115,7 @@ def plot_latent(model, labels=None, which_indices=None,
# create a function which computes the shading of latent space according to the output variance # create a function which computes the shading of latent space according to the output variance
def plot_function(x): def plot_function(x):
Xtest_full = np.zeros((x.shape[0], model.X.shape[1])) Xtest_full = np.zeros((x.shape[0], X.shape[1]))
Xtest_full[:, [input_1, input_2]] = x Xtest_full[:, [input_1, input_2]] = x
_, var = model.predict(Xtest_full, **predict_kwargs) _, var = model.predict(Xtest_full, **predict_kwargs)
var = var[:, :1] var = var[:, :1]
@ -137,7 +138,7 @@ def plot_latent(model, labels=None, which_indices=None,
view = ImshowController(ax, plot_function, view = ImshowController(ax, plot_function,
(xmin, ymin, xmax, ymax), (xmin, ymin, xmax, ymax),
resolution, aspect=aspect, interpolation='bilinear', resolution, aspect=aspect, interpolation='bilinear',
cmap=pb.cm.binary, **imshow_kwargs) cmap=cm.binary, **imshow_kwargs)
# make sure labels are in order of input: # make sure labels are in order of input:
labels = np.asarray(labels) labels = np.asarray(labels)
@ -192,17 +193,18 @@ def plot_latent(model, labels=None, which_indices=None,
if updates: if updates:
try: try:
ax.figure.canvas.show() fig.canvas.show()
except Exception as e: except Exception as e:
print("Could not invoke show: {}".format(e)) print("Could not invoke show: {}".format(e))
raw_input('Enter to continue') #raw_input('Enter to continue')
view.deactivate() return view
return ax return ax
def plot_magnification(model, labels=None, which_indices=None, def plot_magnification(model, labels=None, which_indices=None,
resolution=60, ax=None, marker='o', s=40, resolution=60, ax=None, marker='o', s=40,
fignum=None, plot_inducing=False, legend=True, fignum=None, plot_inducing=False, legend=True,
aspect='auto', updates=False): plot_limits=None,
aspect='auto', updates=False, mean=True, covariance=True, kern=None):
""" """
:param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc) :param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc)
:param resolution: the resolution of the grid on which to evaluate the predictive variance :param resolution: the resolution of the grid on which to evaluate the predictive variance
@ -210,6 +212,8 @@ def plot_magnification(model, labels=None, which_indices=None,
if ax is None: if ax is None:
fig = pb.figure(num=fignum) fig = pb.figure(num=fignum)
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
else:
fig = ax.figure
Tango.reset() Tango.reset()
if labels is None: if labels is None:
@ -217,19 +221,90 @@ def plot_magnification(model, labels=None, which_indices=None,
input_1, input_2 = most_significant_input_dimensions(model, which_indices) input_1, input_2 = most_significant_input_dimensions(model, which_indices)
# first, plot the output variance as a function of the latent space #fethch the data points X that we'd like to plot
Xtest, xx, yy, xmin, xmax = x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution) X = model.X
Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1])) if isinstance(X, VariationalPosterior):
X = X.mean
else:
X = X
if X.shape[0] > 1000:
print("Warning: subsampling X, as it has more samples then 1000. X.shape={!s}".format(X.shape))
subsample = np.random.choice(X.shape[0], size=1000, replace=False)
X = X[subsample]
labels = labels[subsample]
#=======================================================================
# <<<WORK IN PROGRESS>>>
# <<<DO NOT DELETE>>>
# plt.close('all')
# fig, ax = plt.subplots(1,1)
# from GPy.plotting.matplot_dep.dim_reduction_plots import most_significant_input_dimensions
# import matplotlib.patches as mpatches
# i1, i2 = most_significant_input_dimensions(m, None)
# xmin, xmax = 100, -100
# ymin, ymax = 100, -100
# legend_handles = []
#
# X = m.X.mean[:, [i1, i2]]
# X = m.X.variance[:, [i1, i2]]
#
# xmin = X[:,0].min(); xmax = X[:,0].max()
# ymin = X[:,1].min(); ymax = X[:,1].max()
# range_ = [[xmin, xmax], [ymin, ymax]]
# ul = np.unique(labels)
#
# for i, l in enumerate(ul):
# #cdict = dict(red =[(0., colors[i][0], colors[i][0]), (1., colors[i][0], colors[i][0])],
# # green=[(0., colors[i][0], colors[i][1]), (1., colors[i][1], colors[i][1])],
# # blue =[(0., colors[i][0], colors[i][2]), (1., colors[i][2], colors[i][2])],
# # alpha=[(0., 0., .0), (.5, .5, .5), (1., .5, .5)])
# #cmap = LinearSegmentedColormap('{}'.format(l), cdict)
# cmap = LinearSegmentedColormap.from_list('cmap_{}'.format(str(l)), [colors[i], colors[i]], 255)
# cmap._init()
# #alphas = .5*(1+scipy.special.erf(np.linspace(-2,2, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3))
# alphas = (scipy.special.erf(np.linspace(0,2.4, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3))
# cmap._lut[:, -1] = alphas
# print l
# x, y = X[labels==l].T
#
# heatmap, xedges, yedges = np.histogram2d(x, y, bins=300, range=range_)
# #heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
#
# im = ax.imshow(heatmap, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], cmap=cmap, aspect='auto', interpolation='nearest', label=str(l))
# legend_handles.append(mpatches.Patch(color=colors[i], label=l))
# ax.set_xlim(xmin, xmax)
# ax.set_ylim(ymin, ymax)
# plt.legend(legend_handles, [l.get_label() for l in legend_handles])
# plt.draw()
# plt.show()
#=======================================================================
#Create an IMshow controller that can re-plot the latent space shading at a good resolution
if plot_limits is None:
xmin, ymin = X[:, [input_1, input_2]].min(0)
xmax, ymax = X[:, [input_1, input_2]].max(0)
x_r, y_r = xmax-xmin, ymax-ymin
xmin -= .1*x_r
xmax += .1*x_r
ymin -= .1*y_r
ymax += .1*y_r
else:
try:
xmin, xmax, ymin, ymax = plot_limits
except (TypeError, ValueError) as e:
raise e.__class__("Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits))
def plot_function(x): def plot_function(x):
Xtest_full = np.zeros((x.shape[0], X.shape[1]))
Xtest_full[:, [input_1, input_2]] = x Xtest_full[:, [input_1, input_2]] = x
mf=model.magnification(Xtest_full) mf = model.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance)
return mf return mf
view = ImshowController(ax, plot_function, view = ImshowController(ax, plot_function,
tuple(model.X.min(0)[:, [input_1, input_2]]) + tuple(model.X.max(0)[:, [input_1, input_2]]), (xmin, ymin, xmax, ymax),
resolution, aspect=aspect, interpolation='bilinear', resolution, aspect=aspect, interpolation='bilinear',
cmap=pb.cm.gray) cmap=cm.get_cmap('Greys'))
# make sure labels are in order of input: # make sure labels are in order of input:
ulabels = [] ulabels = []
@ -245,17 +320,17 @@ def plot_magnification(model, labels=None, which_indices=None,
elif type(ul) is np.int64: elif type(ul) is np.int64:
this_label = 'class %i' % ul this_label = 'class %i' % ul
else: else:
this_label = 'class %i' % i this_label = unicode(ul)
m = marker.next() m = marker.next()
index = np.nonzero(labels == ul)[0] index = np.nonzero(labels == ul)[0]
if model.input_dim == 1: if model.input_dim == 1:
x = model.X[index, input_1] x = X[index, input_1]
y = np.zeros(index.size) y = np.zeros(index.size)
else: else:
x = model.X[index, input_1] x = X[index, input_1]
y = model.X[index, input_2] y = X[index, input_2]
ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label) ax.scatter(x, y, marker=m, s=s, c=Tango.nextMedium(), label=this_label, linewidth=.2, edgecolor='k', alpha=.9)
ax.set_xlabel('latent dimension %i' % input_1) ax.set_xlabel('latent dimension %i' % input_1)
ax.set_ylabel('latent dimension %i' % input_2) ax.set_ylabel('latent dimension %i' % input_2)
@ -263,19 +338,29 @@ def plot_magnification(model, labels=None, which_indices=None,
if not np.all(labels == 1.) and legend: if not np.all(labels == 1.) and legend:
ax.legend(loc=0, numpoints=1) ax.legend(loc=0, numpoints=1)
ax.set_xlim(xmin[0], xmax[0]) ax.set_xlim((xmin, xmax))
ax.set_ylim(xmin[1], xmax[1]) ax.set_ylim((ymin, ymax))
ax.grid(b=False) # remove the grid if present, it doesn't look good
ax.set_aspect('auto') # set a nice aspect ratio
if plot_inducing: if plot_inducing and hasattr(model, 'Z'):
ax.plot(model.Z[:, input_1], model.Z[:, input_2], '^w') Z = model.Z
ax.scatter(Z[:, input_1], Z[:, input_2], c='w', s=18, marker="^", edgecolor='k', linewidth=.3, alpha=.7)
try:
fig.canvas.draw()
fig.tight_layout()
fig.canvas.draw()
except Exception as e:
print("Could not invoke tight layout: {}".format(e))
pass
if updates: if updates:
fig.canvas.show() try:
raw_input('Enter to continue') fig.canvas.draw()
fig.canvas.show()
pb.title('Magnification Factor') except Exception as e:
print("Could not invoke show: {}".format(e))
#raw_input('Enter to continue')
return view
return ax return ax
@ -314,8 +399,8 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
this_label = 'class %i' % i this_label = 'class %i' % i
m = marker.next() m = marker.next()
index = np.nonzero(data_labels == ul)[0] index = np.nonzero(data_labels == ul)[0]
x = model.X[index, input_1] x = X[index, input_1]
y = model.X[index, input_2] y = X[index, input_2]
ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label) ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
ax.set_xlabel('latent dimension %i' % input_1) ax.set_xlabel('latent dimension %i' % input_1)
@ -323,7 +408,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
controller = ImAnnotateController(ax, controller = ImAnnotateController(ax,
plot_function, plot_function,
tuple(model.X.min(0)[:, significant_dims]) + tuple(model.X.max(0)[:, significant_dims]), tuple(X.min(0)[:, significant_dims]) + tuple(X.max(0)[:, significant_dims]),
resolution=resolution, resolution=resolution,
aspect=aspect, aspect=aspect,
cmap=get_cmap('jet'), cmap=get_cmap('jet'),

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
try: try:
import pylab as pb from matplotlib import pyplot as pb
except: except:
pass pass
#import numpy as np #import numpy as np

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
import pylab as pb from matplotlib import pyplot as pb
import Tango import Tango
from matplotlib.textpath import TextPath from matplotlib.textpath import TextPath
from matplotlib.transforms import offset_copy from matplotlib.transforms import offset_copy

View file

@ -9,6 +9,9 @@ class AxisEventController(object):
def __init__(self, ax): def __init__(self, ax):
self.ax = ax self.ax = ax
self.activate() self.activate()
def __del__(self):
self.deactivate()
return self
def deactivate(self): def deactivate(self):
for cb_class in self.ax.callbacks.callbacks.values(): for cb_class in self.ax.callbacks.callbacks.values():
for cb_num in cb_class.keys(): for cb_num in cb_class.keys():
@ -81,9 +84,9 @@ class BufferedAxisChangedController(AxisChangedController):
def __init__(self, ax, plot_function, plot_limits, resolution=50, update_lim=None, **kwargs): def __init__(self, ax, plot_function, plot_limits, resolution=50, update_lim=None, **kwargs):
""" """
Buffered axis changed controller. Controls the buffer and handles update events for when the axes changed. Buffered axis changed controller. Controls the buffer and handles update events for when the axes changed.
Updated plotting will be after first reload (first time will be within plot limits, after that the limits will be buffered) Updated plotting will be after first reload (first time will be within plot limits, after that the limits will be buffered)
:param plot_function: :param plot_function:
function to use for creating image for plotting (return ndarray-like) function to use for creating image for plotting (return ndarray-like)
plot_function gets called with (2D!) Xtest grid if replotting required plot_function gets called with (2D!) Xtest grid if replotting required

View file

@ -4,7 +4,7 @@
import numpy as np import numpy as np
try: try:
import Tango import Tango
import pylab as pb from matplotlib import pyplot as pb
except: except:
pass pass
from base_plots import x_frame1D, x_frame2D from base_plots import x_frame1D, x_frame2D

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
try: try:
import pylab as pb from matplotlib import pyplot as pb
from matplotlib.patches import Polygon from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection from matplotlib.collections import PatchCollection
#from matplotlib import cm #from matplotlib import cm

View file

@ -1,25 +1,82 @@
# Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
try:
import Tango
import pylab as pb
except:
pass
import numpy as np import numpy as np
from base_plots import gpplot, x_frame1D, x_frame2D from . import Tango
from .base_plots import gpplot, x_frame1D, x_frame2D,gperrors
from ...models.gp_coregionalized_regression import GPCoregionalizedRegression from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
from ...models.warped_gp import WarpedGP from ...models.warped_gp import WarpedGP
from scipy import sparse from scipy import sparse
from ...core.parameterization.variational import VariationalPosterior from ...core.parameterization.variational import VariationalPosterior
from matplotlib import pyplot as plt
def plot_data(model, which_data_rows='all',
              which_data_ycols='all', visible_dims=None,
              fignum=None, ax=None, data_symbol='kx', mew=1.5):
    """
    Plot the training data of a GP model.

    Only one- and two-dimensional input spaces can be drawn; for models with
    more inputs choose (at most two) dimensions to show via ``visible_dims``.
    Subsets of the data can be selected with which_data_rows and
    which_data_ycols.

    :param which_data_rows: which of the training data to plot (default all)
    :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
    :param which_data_ycols: when the data has several columns (independant outputs), only plot these
    :type which_data_rows: 'all' or a list of integers
    :param visible_dims: an array specifying the input dimensions to plot (maximum two)
    :type visible_dims: a numpy array
    :param fignum: figure to plot on.
    :type fignum: figure number
    :param ax: axes to plot on.
    :type ax: axes handle
    :returns: dict holding the created matplotlib artists under 'dataplot'
    """
    # Resolve the 'all' shorthands into concrete indexers.
    if which_data_rows == 'all':
        which_data_rows = slice(None)
    if which_data_ycols == 'all':
        which_data_ycols = np.arange(model.output_dim)
    if ax is None:
        ax = plt.figure(num=fignum).add_subplot(111)

    # Training data.
    X, Y = model.X, model.Y

    # By default show every input dimension (there must be at most two).
    if visible_dims is None:
        visible_dims = np.arange(model.input_dim)
    assert visible_dims.size <= 2, "Visible inputs cannot be larger than two"
    free_dims = visible_dims

    plots = {}
    n_free = len(free_dims)
    if n_free == 1:
        # One input dimension: plain x-vs-y plot, one pass per output column.
        for col in which_data_ycols:
            plots['dataplot'] = ax.plot(X[which_data_rows, free_dims],
                                        Y[which_data_rows, col],
                                        data_symbol, mew=mew)
    elif n_free == 2:
        # Two input dimensions: scatter in the input plane, coloured by output.
        for col in which_data_ycols:
            plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]],
                                           X[which_data_rows, free_dims[1]], 40,
                                           Y[which_data_rows, col], cmap=plt.cm.jet,
                                           vmin=Y.min(), vmax=Y.max(), linewidth=0.)
    else:
        raise NotImplementedError("Cannot define a frame with more than two input dimensions")
    return plots
def plot_fit(model, plot_limits=None, which_data_rows='all', def plot_fit(model, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[], which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None, levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=False, plot_raw=False,
linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx', linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx',
apply_link=False, samples_f=0, plot_uncertain_inputs=True, predict_kw=None): apply_link=False, samples_y=0, plot_uncertain_inputs=True, predict_kw=None, plot_training_data=True):
""" """
Plot the posterior of the GP. Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations. - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -37,25 +94,32 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
:type which_data_rows: 'all' or a list of integers :type which_data_rows: 'all' or a list of integers
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples :type fixed_inputs: a list of tuples
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:type levels: int :type levels: int
:param samples: the number of a posteriori samples to plot p(y*|y) :param samples: the number of a posteriori samples to plot p(f*|y)
:type samples: int :type samples: int
:param fignum: figure to plot on. :param fignum: figure to plot on.
:type fignum: figure number :type fignum: figure number
:param ax: axes to plot on. :param ax: axes to plot on.
:type ax: axes handle :type ax: axes handle
:type output: integer (first output is 0) :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param plot_raw: Whether to plot the raw function p(f|y)
:type plot_raw: boolean
:param linecol: color of line to plot. :param linecol: color of line to plot.
:type linecol: :type linecol: hex or color
:param fillcol: color of fill :param fillcol: color of fill
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure :type fillcol: hex or color
:param apply_link: apply the link function if plotting f (default false) :param apply_link: apply the link function if plotting f (default false), as well as posterior samples if requested
:type apply_link: boolean :type apply_link: boolean
:param samples_f: the number of posteriori f samples to plot p(f*|y) :param samples_y: the number of posteriori f samples to plot p(y*|y)
:type samples_f: int :type samples_y: int
:param plot_uncertain_inputs: plot the uncertainty of the inputs as error bars if they have uncertainty (BGPLVM etc.)
:type plot_uncertain_inputs: boolean
:param predict_kw: keyword args for _raw_predict and predict functions if required
:type predict_kw: dict
:param plot_training_data: whether or not to plot the training points
:type plot_training_data: boolean
""" """
#deal with optional arguments #deal with optional arguments
if which_data_rows == 'all': if which_data_rows == 'all':
@ -65,7 +129,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#if len(which_data_ycols)==0: #if len(which_data_ycols)==0:
#raise ValueError('No data selected for plotting') #raise ValueError('No data selected for plotting')
if ax is None: if ax is None:
fig = pb.figure(num=fignum) fig = plt.figure(num=fignum)
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs(): if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
@ -117,31 +181,38 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
Y_metadata = {'output_index': extra_data} Y_metadata = {'output_index': extra_data}
else: else:
Y_metadata['output_index'] = extra_data Y_metadata['output_index'] = extra_data
if isinstance(model, WarpedGP): if isinstance(model, WarpedGP):
m, v = model.predict(Xgrid, full_cov=False, median=True, Y_metadata=Y_metadata, **predict_kw) m, v = model.predict(Xgrid, full_cov=False, median=True, Y_metadata=Y_metadata, **predict_kw)
#print np.concatenate((Xgrid, m), axis=1) #print np.concatenate((Xgrid, m), axis=1)
else: else:
m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw) m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
lower, upper = model.predict_quantiles(Xgrid, Y_metadata=Y_metadata) fmu, fv = model._raw_predict(Xgrid, full_cov=False, **predict_kw)
lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=Y_metadata)
for d in which_data_ycols: for d in which_data_ycols:
plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol) plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol)
if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5) #if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
if not plot_raw and plot_training_data:
plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)
#optionally plot some samples #optionally plot some samples
if samples: #NOTE not tested with fixed_inputs if samples: #NOTE not tested with fixed_inputs
Ysim = model.posterior_samples(Xgrid, samples, Y_metadata=Y_metadata) Fsim = model.posterior_samples_f(Xgrid, samples)
print Ysim.shape if apply_link:
print Xnew.shape Fsim = model.likelihood.gp_link.transf(Fsim)
for yi in Ysim.T:
plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
if samples_f: #NOTE not tested with fixed_inputs
Fsim = model.posterior_samples_f(Xgrid, samples_f)
for fi in Fsim.T: for fi in Fsim.T:
plots['posterior_samples_f'] = ax.plot(Xnew, fi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25) plots['posterior_samples'] = ax.plot(Xnew, fi[:,None], '#3300FF', linewidth=0.25)
#ax.plot(Xnew, fi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
if samples_y: #NOTE not tested with fixed_inputs
Ysim = model.posterior_samples(Xgrid, samples_y, Y_metadata=Y_metadata)
for yi in Ysim.T:
plots['posterior_samples_y'] = ax.scatter(Xnew, yi[:,None], s=5, c=Tango.colorsHex['darkBlue'], marker='o', alpha=0.5)
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs. #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
@ -206,8 +277,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw) m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
for d in which_data_ycols: for d in which_data_ycols:
m_d = m[:,d].reshape(resolution, resolution).T m_d = m[:,d].reshape(resolution, resolution).T
plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=plt.cm.jet)
if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) #if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
if not plot_raw and plot_training_data:
plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
#set the limits of the plot to some sensible values #set the limits of the plot to some sensible values
ax.set_xlim(xmin[0], xmax[0]) ax.set_xlim(xmin[0], xmax[0])
@ -272,3 +345,82 @@ def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True, X_
return f_inputs return f_inputs
else: else:
return X return X
def errorbars_trainset(model, which_data_rows='all',
                       which_data_ycols='all', fixed_inputs=None,
                       fignum=None, ax=None,
                       linecol='red', data_symbol='kx',
                       predict_kw=None, plot_training_data=True, **kwargs):
    """
    Plot the posterior error bars corresponding to the training data
      - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.

    Can plot only part of the data
    using which_data_rows and which_data_ycols.

    :param which_data_rows: which of the training data to plot (default all)
    :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
    :param which_data_ycols: when the data has several columns (independant outputs), only plot these
    :type which_data_rows: 'all' or a list of integers
    :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
    :type fixed_inputs: a list of tuples (default None, i.e. no fixed inputs)
    :param fignum: figure to plot on.
    :type fignum: figure number
    :param ax: axes to plot on.
    :type ax: axes handle
    :param linecol: color of the error bars
    :param predict_kw: keyword args passed through to the predict calls
    :type predict_kw: dict
    :param plot_training_data: whether or not to plot the training points
    :type plot_training_data: boolean
    :returns: dict of created plot artists
    """
    #deal with optional arguments
    if which_data_rows == 'all':
        which_data_rows = slice(None)
    if which_data_ycols == 'all':
        which_data_ycols = np.arange(model.output_dim)
    # Default changed from a mutable `[]` literal to a None sentinel to avoid
    # the shared mutable-default-argument pitfall; behaviour is unchanged.
    if fixed_inputs is None:
        fixed_inputs = []

    if ax is None:
        fig = plt.figure(num=fignum)
        ax = fig.add_subplot(111)

    X = model.X
    Y = model.Y

    if predict_kw is None:
        predict_kw = {}

    #work out what the inputs are for plotting (1D or 2D)
    fixed_dims = np.array([i for i, v in fixed_inputs])
    free_dims = np.setdiff1d(np.arange(model.input_dim), fixed_dims)

    plots = {}
    #one dimensional plotting
    if len(free_dims) == 1:
        # Predictive mean and (2.5, 97.5) quantiles at the training inputs.
        m, v = model.predict(X, full_cov=False, Y_metadata=model.Y_metadata, **predict_kw)
        fmu, fv = model._raw_predict(X, full_cov=False, **predict_kw)
        lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=model.Y_metadata)
        for d in which_data_ycols:
            plots['gperrors'] = gperrors(X, m[:, d], lower[:, d], upper[:, d], edgecol=linecol, ax=ax, fignum=fignum, **kwargs)
        if plot_training_data:
            plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
                                          visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)

        #set the limits of the plot to some sensible values
        ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
        ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
        ax.set_xlim(X[:, free_dims].min(), X[:, free_dims].max())
        ax.set_ylim(ymin, ymax)
    elif len(free_dims) == 2:
        raise NotImplementedError("Not implemented yet")
    else:
        raise NotImplementedError("Cannot define a frame with more than two input dimensions")
    return plots

View file

@ -4,7 +4,7 @@
import numpy as np import numpy as np
try: try:
import pylab as pb from matplotlib import pyplot as pb
except: except:
pass pass

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
import pylab as pb from matplotlib import pyplot as pb
def plot(model, ax=None, fignum=None, Z_height=None, **kwargs): def plot(model, ax=None, fignum=None, Z_height=None, **kwargs):

View file

@ -1,4 +1,4 @@
import pylab as pb, numpy as np from matplotlib import pyplot as pb, numpy as np
def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)): def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
""" """

View file

@ -0,0 +1,109 @@
'''
Created on 4 Sep 2015
@author: maxz
'''
import unittest
import numpy as np
import GPy
class BGPLVMTest(unittest.TestCase):
    """Check that BayesianGPLVMMiniBatch reproduces the full BayesianGPLVM.

    For every combination of missing-data handling and stochastic updates
    (with a batch covering the whole data set) the minibatch implementation
    must yield exactly the same likelihood, gradients and predictions as the
    reference full model built in :meth:`setUp`.
    """

    def setUp(self):
        np.random.seed(12345)  # reproducible data and model initialisation
        X, W = np.random.normal(0, 1, (100, 6)), np.random.normal(0, 1, (6, 13))
        Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
        # NOTE: self.inan is unused below, but the binomial draw is kept so the
        # RNG stream (and hence the full model's initialisation) is unchanged.
        self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
        self.X, self.W, self.Y = X, W, Y
        self.Q = 3
        self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)

    def _new_minibatch_model(self, **model_kw):
        """Build a minibatch model with the given settings, copy in the full
        model's parameters and assert likelihood/gradients agree exactly.

        Returns the minibatch model for further checks."""
        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, **model_kw)
        m[:] = self.m_full[:]
        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
        assert m.checkgrad()
        return m

    def test_lik_comparisons_m1_s0(self):
        # missing_data=True, stochastic=False must match the full model.
        self._new_minibatch_model(missing_data=True, stochastic=False)

    def test_lik_comparisons_m0_s0(self):
        # missing_data=False, stochastic=False must match the full model.
        self._new_minibatch_model(missing_data=False, stochastic=False)

    def test_lik_comparisons_m1_s1(self):
        # missing_data=True, stochastic=True (full-data batch) must match.
        self._new_minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])

    def test_lik_comparisons_m0_s1(self):
        # missing_data=False, stochastic=True (full-data batch) must match.
        self._new_minibatch_model(missing_data=False, stochastic=True, batchsize=self.Y.shape[1])

    def test_predict_missing_data(self):
        m = self._new_minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])

        # full_cov=True on the variational posterior input is not supported.
        self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True)

        # Predictions at the (uncertain) training inputs must agree.
        mu1, var1 = m.predict(m.X, full_cov=False)
        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)

        # Predictions at the posterior means, with and without full covariance.
        mu1, var1 = m.predict(m.X.mean, full_cov=True)
        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1[:, :, 0], var2)

        mu1, var1 = m.predict(m.X.mean, full_cov=False)
        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1[:, [0]], var2)

    def test_gradients_missingdata(self):
        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
        assert m.checkgrad()

    def test_gradients_missingdata_stochastics(self):
        # Gradients must check out for small batch sizes as well.
        for batchsize in (1, 4):
            m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=batchsize)
            assert m.checkgrad()

    def test_gradients_stochastics(self):
        for batchsize in (1, 4):
            m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=batchsize)
            assert m.checkgrad()

    def test_predict(self):
        # Same configuration as test_lik_comparisons_m1_s1; kept as its own test name.
        self._new_minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])


if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,37 @@
'''
Created on 4 Sep 2015
@author: maxz
'''
import unittest
from GPy.util.caching import Cacher
from pickle import PickleError
class Test(unittest.TestCase):
    """Unit tests for the ``Cacher`` utility wrapper."""

    def setUp(self):
        # A trivial identity operation is enough to exercise the cache API.
        def identity(value):
            return value
        self.cache = Cacher(identity, 1)

    def test_pickling(self):
        # Cacher instances deliberately refuse to be (un)pickled.
        for method in (self.cache.__getstate__, self.cache.__setstate__):
            self.assertRaises(PickleError, method)

    def test_copy(self):
        # A deep copy shares the operation but is an independent cache.
        clone = self.cache.__deepcopy__()
        assert clone.operation is self.cache.operation
        self.assertEqual(clone.limit, self.cache.limit)

    def test_reset(self):
        # After reset all internal bookkeeping dicts must be empty.
        self.cache.reset()
        for store in (self.cache.cached_input_ids,
                      self.cache.cached_outputs,
                      self.cache.inputs_changed):
            self.assertDictEqual(store, {})

    def test_name(self):
        # The cache proxies the wrapped operation's name.
        assert self.cache.__name__ == self.cache.operation.__name__


if __name__ == "__main__":
    unittest.main()

View file

@ -2,11 +2,21 @@ import numpy as np
import scipy as sp import scipy as sp
from GPy.util import choleskies from GPy.util import choleskies
import GPy import GPy
from ..util.config import config
import unittest
try:
from ..util import linalg_cython
from ..util import choleskies_cython
config.set('cython', 'working', 'True')
except ImportError:
config.set('cython', 'working', 'False')
""" """
These tests make sure that the opure python and cython codes work the same These tests make sure that the pure python and cython codes work the same
""" """
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
class CythonTestChols(np.testing.TestCase): class CythonTestChols(np.testing.TestCase):
def setUp(self): def setUp(self):
self.flat = np.random.randn(45,5) self.flat = np.random.randn(45,5)
@ -20,6 +30,7 @@ class CythonTestChols(np.testing.TestCase):
A2 = choleskies._triang_to_flat_cython(self.triang) A2 = choleskies._triang_to_flat_cython(self.triang)
np.testing.assert_allclose(A1, A2) np.testing.assert_allclose(A1, A2)
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
class test_stationary(np.testing.TestCase): class test_stationary(np.testing.TestCase):
def setUp(self): def setUp(self):
self.k = GPy.kern.RBF(10) self.k = GPy.kern.RBF(10)
@ -49,17 +60,16 @@ class test_stationary(np.testing.TestCase):
g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z) g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
np.testing.assert_allclose(g1, g2) np.testing.assert_allclose(g1, g2)
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
class test_choleskies_backprop(np.testing.TestCase): class test_choleskies_backprop(np.testing.TestCase):
def setUp(self): def setUp(self):
self.dL, self.L = np.random.randn(2, 100, 100) a =np.random.randn(10,12)
A = a.dot(a.T)
self.L = GPy.util.linalg.jitchol(A)
self.dL = np.random.randn(10,10)
def test(self): def test(self):
r1 = GPy.util.choleskies._backprop_gradient_pure(self.dL, self.L) r1 = choleskies._backprop_gradient_pure(self.dL, self.L)
r2 = GPy.util.choleskies.choleskies_cython.backprop_gradient(self.dL, self.L) r2 = choleskies_cython.backprop_gradient(self.dL, self.L)
r3 = choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
np.testing.assert_allclose(r1, r2) np.testing.assert_allclose(r1, r2)
np.testing.assert_allclose(r1, r3)

99
GPy/testing/gp_tests.py Normal file
View file

@ -0,0 +1,99 @@
'''
Created on 4 Sep 2015
@author: maxz
'''
import unittest
import numpy as np, GPy
from GPy.core.parameterization.variational import NormalPosterior
class Test(unittest.TestCase):
def setUp(self):
np.random.seed(12345)
self.N = 20
self.N_new = 50
self.D = 1
self.X = np.random.uniform(-3., 3., (self.N, 1))
self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
def test_setxy_bgplvm(self):
k = GPy.kern.RBF(1)
m = GPy.models.BayesianGPLVM(self.Y, 2, kernel=k)
mu, var = m.predict(m.X)
X = m.X.copy()
Xnew = NormalPosterior(m.X.mean[:10].copy(), m.X.variance[:10].copy())
m.set_XY(Xnew, m.Y[:10])
assert(m.checkgrad())
m.set_XY(X, self.Y)
mu2, var2 = m.predict(m.X)
np.testing.assert_allclose(mu, mu2)
np.testing.assert_allclose(var, var2)
def test_setxy_gplvm(self):
k = GPy.kern.RBF(1)
m = GPy.models.GPLVM(self.Y, 2, kernel=k)
mu, var = m.predict(m.X)
X = m.X.copy()
Xnew = X[:10].copy()
m.set_XY(Xnew, m.Y[:10])
assert(m.checkgrad())
m.set_XY(X, self.Y)
mu2, var2 = m.predict(m.X)
np.testing.assert_allclose(mu, mu2)
np.testing.assert_allclose(var, var2)
def test_setxy_gp(self):
k = GPy.kern.RBF(1)
m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
mu, var = m.predict(m.X)
X = m.X.copy()
m.set_XY(m.X[:10], m.Y[:10])
assert(m.checkgrad())
m.set_XY(X, self.Y)
mu2, var2 = m.predict(m.X)
np.testing.assert_allclose(mu, mu2)
np.testing.assert_allclose(var, var2)
def test_mean_function(self):
from GPy.core.parameterization.param import Param
from GPy.core.mapping import Mapping
class Parabola(Mapping):
def __init__(self, variance, degree=2, name='parabola'):
super(Parabola, self).__init__(1, 1, name)
self.variance = Param('variance', np.ones(degree+1) * variance)
self.degree = degree
self.link_parameter(self.variance)
def f(self, X):
p = self.variance[0] * np.ones(X.shape)
for i in range(1, self.degree+1):
p += self.variance[i] * X**(i)
return p
def gradients_X(self, dL_dF, X):
grad = np.zeros(X.shape)
for i in range(1, self.degree+1):
grad += (i) * self.variance[i] * X**(i-1)
return grad
def update_gradients(self, dL_dF, X):
for i in range(self.degree+1):
self.variance.gradient[i] = (dL_dF * X**(i)).sum(0)
X = np.linspace(-2, 2, 100)[:, None]
k = GPy.kern.RBF(1)
k.randomize()
p = Parabola(.3)
p.randomize()
Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X)+np.eye(X.shape[0])*1e-8)[:,None] + np.random.normal(0, .1, (X.shape[0], 1))
m = GPy.models.GPRegression(X, Y, mean_function=p)
m.randomize()
assert(m.checkgrad())
_ = m.predict(m.X)
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.testName']
unittest.main()

View file

@ -8,11 +8,12 @@ The test cases for various inference algorithms
import unittest, itertools import unittest, itertools
import numpy as np import numpy as np
import GPy import GPy
#np.seterr(invalid='raise')
class InferenceXTestCase(unittest.TestCase): class InferenceXTestCase(unittest.TestCase):
def genData(self): def genData(self):
np.random.seed(1)
D1,D2,N = 12,12,50 D1,D2,N = 12,12,50
x = np.linspace(0, 4 * np.pi, N)[:, None] x = np.linspace(0, 4 * np.pi, N)[:, None]

View file

@ -6,9 +6,16 @@ import numpy as np
import GPy import GPy
import sys import sys
from GPy.core.parameterization.param import Param from GPy.core.parameterization.param import Param
from ..util.config import config
verbose = 0 verbose = 0
try:
from ..util import linalg_cython
config.set('cython', 'working', 'True')
except ImportError:
config.set('cython', 'working', 'False')
class Kern_check_model(GPy.core.Model): class Kern_check_model(GPy.core.Model):
""" """
@ -245,6 +252,11 @@ class KernelGradientTestsContinuous(unittest.TestCase):
continuous_kerns = ['RBF', 'Linear'] continuous_kerns = ['RBF', 'Linear']
self.kernclasses = [getattr(GPy.kern, s) for s in continuous_kerns] self.kernclasses = [getattr(GPy.kern, s) for s in continuous_kerns]
def test_MLP(self):
k = GPy.kern.MLP(self.D,ARD=True)
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Matern32(self): def test_Matern32(self):
k = GPy.kern.Matern32(self.D) k = GPy.kern.Matern32(self.D)
k.randomize() k.randomize()
@ -313,6 +325,11 @@ class KernelGradientTestsContinuous(unittest.TestCase):
k.randomize() k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_standard_periodic(self):
k = GPy.kern.StdPeriodic(self.D, self.D-1)
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
class KernelTestsMiscellaneous(unittest.TestCase): class KernelTestsMiscellaneous(unittest.TestCase):
def setUp(self): def setUp(self):
N, D = 100, 10 N, D = 100, 10
@ -366,6 +383,7 @@ class KernelTestsNonContinuous(unittest.TestCase):
X2 = self.X2[self.X2[:,-1]!=2] X2 = self.X2[self.X2[:,-1]!=2]
self.assertTrue(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1)) self.assertTrue(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1))
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
class Coregionalize_cython_test(unittest.TestCase): class Coregionalize_cython_test(unittest.TestCase):
""" """
Make sure that the coregionalize kernel work with and without cython enabled Make sure that the coregionalize kernel work with and without cython enabled
@ -432,6 +450,104 @@ class KernelTestsProductWithZeroValues(unittest.TestCase):
self.assertFalse(np.any(np.isnan(target)), self.assertFalse(np.any(np.isnan(target)),
"Gradient resulted in NaN") "Gradient resulted in NaN")
class Kernel_Psi_statistics_GradientTests(unittest.TestCase):
def setUp(self):
from GPy.core.parameterization.variational import NormalPosterior
N,M,Q = 100,20,3
X = np.random.randn(N,Q)
X_var = np.random.rand(N,Q)+0.01
self.Z = np.random.randn(M,Q)
self.qX = NormalPosterior(X, X_var)
self.w1 = np.random.randn(N)
self.w2 = np.random.randn(N,M)
self.w3 = np.random.randn(M,M)
self.w3 = self.w3+self.w3.T
self.w3n = np.random.randn(N,M,M)
self.w3n = self.w3n+np.swapaxes(self.w3n, 1,2)
def test_kernels(self):
from GPy.kern import RBF,Linear,MLP
Q = self.Z.shape[1]
kernels = [RBF(Q,ARD=True), Linear(Q,ARD=True)]
for k in kernels:
k.randomize()
self._test_kernel_param(k)
self._test_Z(k)
self._test_qX(k)
self._test_kernel_param(k, psi2n=True)
self._test_Z(k, psi2n=True)
self._test_qX(k, psi2n=True)
def _test_kernel_param(self, kernel, psi2n=False):
def f(p):
kernel.param_array[:] = p
psi0 = kernel.psi0(self.Z, self.qX)
psi1 = kernel.psi1(self.Z, self.qX)
if not psi2n:
psi2 = kernel.psi2(self.Z, self.qX)
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3*psi2).sum()
else:
psi2 = kernel.psi2n(self.Z, self.qX)
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3n*psi2).sum()
def df(p):
kernel.param_array[:] = p
kernel.update_gradients_expectations(self.w1, self.w2, self.w3 if not psi2n else self.w3n, self.Z, self.qX)
return kernel.gradient.copy()
from GPy.models import GradientChecker
m = GradientChecker(f, df, kernel.param_array.copy())
self.assertTrue(m.checkgrad())
def _test_Z(self, kernel, psi2n=False):
def f(p):
psi0 = kernel.psi0(p, self.qX)
psi1 = kernel.psi1(p, self.qX)
psi2 = kernel.psi2(p, self.qX)
if not psi2n:
psi2 = kernel.psi2(p, self.qX)
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3*psi2).sum()
else:
psi2 = kernel.psi2n(p, self.qX)
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3n*psi2).sum()
def df(p):
return kernel.gradients_Z_expectations(self.w1, self.w2, self.w3 if not psi2n else self.w3n, p, self.qX)
from GPy.models import GradientChecker
m = GradientChecker(f, df, self.Z.copy())
self.assertTrue(m.checkgrad())
def _test_qX(self, kernel, psi2n=False):
def f(p):
self.qX.param_array[:] = p
self.qX._trigger_params_changed()
psi0 = kernel.psi0(self.Z, self.qX)
psi1 = kernel.psi1(self.Z, self.qX)
if not psi2n:
psi2 = kernel.psi2(self.Z, self.qX)
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3*psi2).sum()
else:
psi2 = kernel.psi2n(self.Z, self.qX)
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3n*psi2).sum()
def df(p):
self.qX.param_array[:] = p
self.qX._trigger_params_changed()
grad = kernel.gradients_qX_expectations(self.w1, self.w2, self.w3 if not psi2n else self.w3n, self.Z, self.qX)
self.qX.set_gradients(grad)
return self.qX.gradient.copy()
from GPy.models import GradientChecker
m = GradientChecker(f, df, self.qX.param_array.copy())
self.assertTrue(m.checkgrad())
if __name__ == "__main__": if __name__ == "__main__":
print("Running unit tests, please be (very) patient...") print("Running unit tests, please be (very) patient...")

View file

@ -7,10 +7,8 @@ from GPy.models import GradientChecker
import functools import functools
import inspect import inspect
from GPy.likelihoods import link_functions from GPy.likelihoods import link_functions
from GPy.core.parameterization import Param
from functools import partial from functools import partial
#np.random.seed(300) fixed_seed = 7
#np.random.seed(4)
#np.seterr(divide='raise') #np.seterr(divide='raise')
def dparam_partial(inst_func, *args): def dparam_partial(inst_func, *args):
@ -105,6 +103,7 @@ class TestNoiseModels(object):
Generic model checker Generic model checker
""" """
def setUp(self): def setUp(self):
np.random.seed(fixed_seed)
self.N = 15 self.N = 15
self.D = 3 self.D = 3
self.X = np.random.rand(self.N, self.D)*10 self.X = np.random.rand(self.N, self.D)*10
@ -218,7 +217,8 @@ class TestNoiseModels(object):
"constraints": [(".*variance", self.constrain_positive)] "constraints": [(".*variance", self.constrain_positive)]
}, },
"laplace": True, "laplace": True,
"ep": False # FIXME: Should be True when we have it working again "ep": False, # FIXME: Should be True when we have it working again
"variational_expectations": True,
}, },
"Gaussian_log": { "Gaussian_log": {
"model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var), "model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var),
@ -227,7 +227,8 @@ class TestNoiseModels(object):
"vals": [self.var], "vals": [self.var],
"constraints": [(".*variance", self.constrain_positive)] "constraints": [(".*variance", self.constrain_positive)]
}, },
"laplace": True "laplace": True,
"variational_expectations": True
}, },
#"Gaussian_probit": { #"Gaussian_probit": {
#"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N), #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
@ -252,7 +253,8 @@ class TestNoiseModels(object):
"link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)], "link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)],
"laplace": True, "laplace": True,
"Y": self.binary_Y, "Y": self.binary_Y,
"ep": False # FIXME: Should be True when we have it working again "ep": False, # FIXME: Should be True when we have it working again
"variational_expectations": True
}, },
"Exponential_default": { "Exponential_default": {
"model": GPy.likelihoods.Exponential(), "model": GPy.likelihoods.Exponential(),
@ -347,6 +349,10 @@ class TestNoiseModels(object):
ep = attributes["ep"] ep = attributes["ep"]
else: else:
ep = False ep = False
if "variational_expectations" in attributes:
var_exp = attributes["variational_expectations"]
else:
var_exp = False
#if len(param_vals) > 1: #if len(param_vals) > 1:
#raise NotImplementedError("Cannot support multiple params in likelihood yet!") #raise NotImplementedError("Cannot support multiple params in likelihood yet!")
@ -377,6 +383,11 @@ class TestNoiseModels(object):
if ep: if ep:
#ep likelihood gradcheck #ep likelihood gradcheck
yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
if var_exp:
#Need to specify mu and var!
yield self.t_varexp, model, Y, Y_metadata
yield self.t_dexp_dmu, model, Y, Y_metadata
yield self.t_dexp_dvar, model, Y, Y_metadata
self.tearDown() self.tearDown()
@ -603,6 +614,87 @@ class TestNoiseModels(object):
print(m) print(m)
assert m.checkgrad(verbose=1, step=step) assert m.checkgrad(verbose=1, step=step)
################
# variational expectations #
################
@with_setup(setUp, tearDown)
def t_varexp(self, model, Y, Y_metadata):
#Test that the analytic implementation (if it exists) matches the generic gauss
#hermite implementation
print("\n{}".format(inspect.stack()[0][3]))
#Make mu and var (marginal means and variances of q(f)) draws from a GP
k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
L = GPy.util.linalg.jitchol(k)
mu = L.dot(np.random.randn(*Y.shape))
#Variance must be positive
var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
expectation = model.variational_expectations(Y=Y, m=mu, v=var, gh_points=None, Y_metadata=Y_metadata)[0]
#Implementation of gauss hermite integration
shape = mu.shape
gh_x, gh_w= np.polynomial.hermite.hermgauss(50)
m,v,Y = mu.flatten(), var.flatten(), Y.flatten()
#make a grid of points
X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
#evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
# broadcast needs to be handled carefully.
logp = model.logpdf(X, Y[:,None], Y_metadata=Y_metadata)
#average over the gird to get derivatives of the Gaussian's parameters
#division by pi comes from fact that for each quadrature we need to scale by 1/sqrt(pi)
expectation_gh = np.dot(logp, gh_w)/np.sqrt(np.pi)
expectation_gh = expectation_gh.reshape(*shape)
np.testing.assert_almost_equal(expectation, expectation_gh, decimal=5)
@with_setup(setUp, tearDown)
def t_dexp_dmu(self, model, Y, Y_metadata):
print("\n{}".format(inspect.stack()[0][3]))
#Make mu and var (marginal means and variances of q(f)) draws from a GP
k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
L = GPy.util.linalg.jitchol(k)
mu = L.dot(np.random.randn(*Y.shape))
#Variance must be positive
var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
expectation = functools.partial(model.variational_expectations, Y=Y, v=var, gh_points=None, Y_metadata=Y_metadata)
#Function to get the nth returned value
def F(mu):
return expectation(m=mu)[0]
def dmu(mu):
return expectation(m=mu)[1]
grad = GradientChecker(F, dmu, mu.copy(), 'm')
grad.randomize()
print(grad)
print(model)
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
def t_dexp_dvar(self, model, Y, Y_metadata):
print("\n{}".format(inspect.stack()[0][3]))
#Make mu and var (marginal means and variances of q(f)) draws from a GP
k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
L = GPy.util.linalg.jitchol(k)
mu = L.dot(np.random.randn(*Y.shape))
#Variance must be positive
var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
expectation = functools.partial(model.variational_expectations, Y=Y, m=mu, gh_points=None, Y_metadata=Y_metadata)
#Function to get the nth returned value
def F(var):
return expectation(v=var)[0]
def dvar(var):
return expectation(v=var)[2]
grad = GradientChecker(F, dvar, var.copy(), 'v')
self.constrain_positive('v', grad)
#grad.randomize()
print(grad)
print(model)
assert grad.checkgrad(verbose=1)
class LaplaceTests(unittest.TestCase): class LaplaceTests(unittest.TestCase):
""" """
@ -610,6 +702,7 @@ class LaplaceTests(unittest.TestCase):
""" """
def setUp(self): def setUp(self):
np.random.seed(fixed_seed)
self.N = 15 self.N = 15
self.D = 1 self.D = 1
self.X = np.random.rand(self.N, self.D)*10 self.X = np.random.rand(self.N, self.D)*10
@ -705,7 +798,7 @@ class LaplaceTests(unittest.TestCase):
post_mean_approx, post_var_approx, = m2.predict(X) post_mean_approx, post_var_approx, = m2.predict(X)
if debug: if debug:
import pylab as pb from matplotlib import pyplot as pb
pb.figure(5) pb.figure(5)
pb.title('posterior means') pb.title('posterior means')
pb.scatter(X, post_mean, c='g') pb.scatter(X, post_mean, c='g')

View file

@ -1,7 +1,6 @@
import numpy as np import numpy as np
import scipy as sp import scipy as sp
from GPy.util.linalg import jitchol from ..util.linalg import jitchol,trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk
import GPy
class LinalgTests(np.testing.TestCase): class LinalgTests(np.testing.TestCase):
def setUp(self): def setUp(self):
@ -37,18 +36,19 @@ class LinalgTests(np.testing.TestCase):
except sp.linalg.LinAlgError: except sp.linalg.LinAlgError:
return True return True
def test_einsum_ijk_jlk_to_il(self): def test_trace_dot(self):
A = np.random.randn(50, 150, 5) N = 5
B = np.random.randn(150, 100, 5) A = np.random.rand(N,N)
pure = np.einsum('ijk,jlk->il', A, B) B = np.random.rand(N,N)
quick = GPy.util.linalg.ijk_jlk_to_il(A, B) trace = np.trace(A.dot(B))
np.testing.assert_allclose(pure, quick) test_trace = trace_dot(A,B)
np.testing.assert_allclose(trace,test_trace,atol=1e-13)
def test_einsum_ij_jlk_to_ilk(self): def test_einsum_ij_jlk_to_ilk(self):
A = np.random.randn(15, 150, 5) A = np.random.randn(15, 150, 5)
B = np.random.randn(150, 50, 5) B = np.random.randn(150, 50, 5)
pure = np.einsum('ijk,jlk->il', A, B) pure = np.einsum('ijk,jlk->il', A, B)
quick = GPy.util.linalg.ijk_jlk_to_il(A,B) quick = ijk_jlk_to_il(A,B)
np.testing.assert_allclose(pure, quick) np.testing.assert_allclose(pure, quick)
def test_einsum_ijk_ljk_to_ilk(self): def test_einsum_ijk_ljk_to_ilk(self):
@ -56,5 +56,5 @@ class LinalgTests(np.testing.TestCase):
B = np.random.randn(150, 20, 5) B = np.random.randn(150, 20, 5)
#B = A.copy() #B = A.copy()
pure = np.einsum('ijk,ljk->ilk', A, B) pure = np.einsum('ijk,ljk->ilk', A, B)
quick = GPy.util.linalg.ijk_ljk_to_ilk(A,B) quick = ijk_ljk_to_ilk(A,B)
np.testing.assert_allclose(pure, quick) np.testing.assert_allclose(pure, quick)

View file

@ -1,5 +1,5 @@
import numpy as np import numpy as np
import scipy as sp import scipy
from scipy.special import cbrt from scipy.special import cbrt
from GPy.models import GradientChecker from GPy.models import GradientChecker
_lim_val = np.finfo(np.float64).max _lim_val = np.finfo(np.float64).max
@ -79,8 +79,7 @@ class LinkFunctionTests(np.testing.TestCase):
assert np.isinf(np.exp(np.log(self.f_upper_lim))) assert np.isinf(np.exp(np.log(self.f_upper_lim)))
#Check the clipping works #Check the clipping works
np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5) np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
#Need to look at most significant figures here rather than the decimals self.assertTrue(np.isfinite(link.transf(self.f_upper_lim)))
np.testing.assert_approx_equal(link.transf(self.f_upper_lim), _lim_val, significant=5)
self.check_overflow(link, lim_of_inf) self.check_overflow(link, lim_of_inf)
#Check that it would otherwise fail #Check that it would otherwise fail
@ -93,18 +92,18 @@ class LinkFunctionTests(np.testing.TestCase):
link = Log_ex_1() link = Log_ex_1()
lim_of_inf = _lim_val_exp lim_of_inf = _lim_val_exp
np.testing.assert_almost_equal(np.log1p(np.exp(self.mid_f)), link.transf(self.mid_f)) np.testing.assert_almost_equal(scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
assert np.isinf(np.log1p(np.exp(np.log(self.f_upper_lim)))) assert np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim))))
#Check the clipping works #Check the clipping works
np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5) np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
#Need to look at most significant figures here rather than the decimals #Need to look at most significant figures here rather than the decimals
np.testing.assert_approx_equal(link.transf(self.f_upper_lim), np.log1p(_lim_val), significant=5) np.testing.assert_approx_equal(link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5)
self.check_overflow(link, lim_of_inf) self.check_overflow(link, lim_of_inf)
#Check that it would otherwise fail #Check that it would otherwise fail
beyond_lim_of_inf = lim_of_inf + 10.0 beyond_lim_of_inf = lim_of_inf + 10.0
old_err_state = np.seterr(over='ignore') old_err_state = np.seterr(over='ignore')
self.assertTrue(np.isinf(np.log1p(np.exp(beyond_lim_of_inf)))) self.assertTrue(np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf))))
np.seterr(**old_err_state) np.seterr(**old_err_state)

View file

@ -1,6 +1,8 @@
from __future__ import print_function
import numpy as np import numpy as np
import scipy as sp import scipy as sp
import GPy import GPy
import warnings
class MiscTests(np.testing.TestCase): class MiscTests(np.testing.TestCase):
""" """
@ -11,8 +13,15 @@ class MiscTests(np.testing.TestCase):
self._lim_val_exp = np.log(self._lim_val) self._lim_val_exp = np.log(self._lim_val)
def test_safe_exp_upper(self): def test_safe_exp_upper(self):
assert np.exp(self._lim_val_exp + 1) == np.inf with warnings.catch_warnings(record=True) as w:
assert GPy.util.misc.safe_exp(self._lim_val_exp + 1) < np.inf warnings.simplefilter('always') # always print
assert np.isfinite(np.exp(self._lim_val_exp))
assert np.isinf(np.exp(self._lim_val_exp + 1))
assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))
print(w)
print(len(w))
assert len(w)<=1 # should have one overflow warning
def test_safe_exp_lower(self): def test_safe_exp_lower(self):
assert GPy.util.misc.safe_exp(1e-10) < np.inf assert GPy.util.misc.safe_exp(1e-10) < np.inf

View file

@ -15,6 +15,13 @@ class MiscTests(unittest.TestCase):
self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05 self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
self.X_new = np.random.uniform(-3., 3., (self.N_new, 1)) self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
def test_setXY(self):
m = GPy.models.GPRegression(self.X, self.Y)
m.set_XY(np.vstack([self.X, np.random.rand(1,self.X.shape[1])]), np.vstack([self.Y, np.random.rand(1,self.Y.shape[1])]))
m._trigger_params_changed()
self.assertTrue(m.checkgrad())
m.predict(m.X)
def test_raw_predict(self): def test_raw_predict(self):
k = GPy.kern.RBF(1) k = GPy.kern.RBF(1)
m = GPy.models.GPRegression(self.X, self.Y, kernel=k) m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
@ -36,12 +43,78 @@ class MiscTests(unittest.TestCase):
np.testing.assert_almost_equal(np.diag(K_hat)[:, None], var) np.testing.assert_almost_equal(np.diag(K_hat)[:, None], var)
np.testing.assert_almost_equal(mu_hat, mu) np.testing.assert_almost_equal(mu_hat, mu)
def test_normalizer(self):
k = GPy.kern.RBF(1)
Y = self.Y
mu, std = Y.mean(0), Y.std(0)
m = GPy.models.GPRegression(self.X, Y, kernel=k, normalizer=True)
m.optimize()
assert(m.checkgrad())
k = GPy.kern.RBF(1)
m2 = GPy.models.GPRegression(self.X, (Y-mu)/std, kernel=k, normalizer=False)
m2[:] = m[:]
mu1, var1 = m.predict(m.X, full_cov=True)
mu2, var2 = m2.predict(m2.X, full_cov=True)
np.testing.assert_allclose(mu1, (mu2*std)+mu)
np.testing.assert_allclose(var1, var2)
mu1, var1 = m.predict(m.X, full_cov=False)
mu2, var2 = m2.predict(m2.X, full_cov=False)
np.testing.assert_allclose(mu1, (mu2*std)+mu)
np.testing.assert_allclose(var1, var2)
q50n = m.predict_quantiles(m.X, (50,))
q50 = m2.predict_quantiles(m2.X, (50,))
np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu)
def check_jacobian(self):
try:
import autograd.numpy as np, autograd as ag, GPy, matplotlib.pyplot as plt
from GPy.models import GradientChecker, GPRegression
except:
raise self.skipTest("autograd not available to check gradients")
def k(X, X2, alpha=1., lengthscale=None):
if lengthscale is None:
lengthscale = np.ones(X.shape[1])
exp = 0.
for q in range(X.shape[1]):
exp += ((X[:, [q]] - X2[:, [q]].T)/lengthscale[q])**2
#exp = np.sqrt(exp)
return alpha * np.exp(-.5*exp)
dk = ag.elementwise_grad(lambda x, x2: k(x, x2, alpha=ke.variance.values, lengthscale=ke.lengthscale.values))
dkdk = ag.elementwise_grad(dk, argnum=1)
ke = GPy.kern.RBF(1, ARD=True)
#ke.randomize()
ke.variance = .2#.randomize()
ke.lengthscale[:] = .5
ke.randomize()
X = np.linspace(-1, 1, 1000)[:,None]
X2 = np.array([[0.]]).T
np.testing.assert_allclose(ke.gradients_X([[1.]], X, X), dk(X, X))
np.testing.assert_allclose(ke.gradients_XX([[1.]], X, X).sum(0), dkdk(X, X))
np.testing.assert_allclose(ke.gradients_X([[1.]], X, X2), dk(X, X2))
np.testing.assert_allclose(ke.gradients_XX([[1.]], X, X2).sum(0), dkdk(X, X2))
m = GPRegression(self.X, self.Y)
def f(x):
m.X[:] = x
return m.log_likelihood()
def df(x):
m.X[:] = x
return m.kern.gradients_X(m.grad_dict['dL_dK'], X)
def ddf(x):
m.X[:] = x
return m.kern.gradients_XX(m.grad_dict['dL_dK'], X).sum(0)
gc = GradientChecker(f, df, self.X)
gc2 = GradientChecker(df, ddf, self.X)
assert(gc.checkgrad())
assert(gc2.checkgrad())
def test_sparse_raw_predict(self): def test_sparse_raw_predict(self):
k = GPy.kern.RBF(1) k = GPy.kern.RBF(1)
m = GPy.models.SparseGPRegression(self.X, self.Y, kernel=k) m = GPy.models.SparseGPRegression(self.X, self.Y, kernel=k)
m.randomize() m.randomize()
Z = m.Z[:] Z = m.Z[:]
X = self.X[:]
# Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression # Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
Kinv = m.posterior.woodbury_inv Kinv = m.posterior.woodbury_inv
@ -127,11 +200,24 @@ class MiscTests(unittest.TestCase):
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing, m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
kernel=k, missing_data=True) kernel=k, missing_data=True)
assert(m.checkgrad()) assert(m.checkgrad())
mul, varl = m.predict(m.X)
k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q) k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing, m2 = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
kernel=k, missing_data=True) kernel=k, missing_data=True)
assert(m.checkgrad()) assert(m.checkgrad())
m2.kern.rbf.lengthscale[:] = 1e6
m2.X[:] = m.X.param_array
m2.likelihood[:] = m.likelihood[:]
m2.kern.white[:] = m.kern.white[:]
mu, var = m.predict(m.X)
np.testing.assert_allclose(mul, mu)
np.testing.assert_allclose(varl, var)
q50 = m.predict_quantiles(m.X, (50,))
np.testing.assert_allclose(mul, q50[0])
def test_likelihood_replicate_kern(self): def test_likelihood_replicate_kern(self):
m = GPy.models.GPRegression(self.X, self.Y) m = GPy.models.GPRegression(self.X, self.Y)
@ -410,8 +496,8 @@ class GradientTests(np.testing.TestCase):
self.check_model(rbf, model_type='SparseGPRegression', dimension=2) self.check_model(rbf, model_type='SparseGPRegression', dimension=2)
def test_SparseGPRegression_rbf_linear_white_kern_1D(self): def test_SparseGPRegression_rbf_linear_white_kern_1D(self):
''' Testing the sparse GP regression with rbf kernel on 2d data ''' ''' Testing the sparse GP regression with rbf kernel on 1d data '''
rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1) rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1) + GPy.kern.White(1, 1e-5)
self.check_model(rbflin, model_type='SparseGPRegression', dimension=1) self.check_model(rbflin, model_type='SparseGPRegression', dimension=1)
def test_SparseGPRegression_rbf_linear_white_kern_2D(self): def test_SparseGPRegression_rbf_linear_white_kern_2D(self):
@ -419,14 +505,12 @@ class GradientTests(np.testing.TestCase):
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2) rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2) self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)
# @unittest.expectedFailure
def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self): def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs''' ''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2) rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
raise unittest.SkipTest("This is not implemented yet!") raise unittest.SkipTest("This is not implemented yet!")
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1) self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)
# @unittest.expectedFailure
def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self): def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs''' ''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1) rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1)
@ -443,6 +527,16 @@ class GradientTests(np.testing.TestCase):
m = GPy.models.GPLVM(Y, input_dim, kernel=k) m = GPy.models.GPLVM(Y, input_dim, kernel=k)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def test_BCGPLVM_rbf_bias_white_kern_2D(self):
""" Testing GPLVM with rbf + bias kernel """
N, input_dim, D = 50, 1, 2
X = np.random.rand(N, input_dim)
k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
m = GPy.models.BCGPLVM(Y, input_dim, kernel=k)
self.assertTrue(m.checkgrad())
def test_GPLVM_rbf_linear_white_kern_2D(self): def test_GPLVM_rbf_linear_white_kern_2D(self):
""" Testing GPLVM with rbf + bias kernel """ """ Testing GPLVM with rbf + bias kernel """
N, input_dim, D = 50, 1, 2 N, input_dim, D = 50, 1, 2
@ -468,23 +562,8 @@ class GradientTests(np.testing.TestCase):
Z = np.linspace(0, 15, 4)[:, None] Z = np.linspace(0, 15, 4)[:, None]
kernel = GPy.kern.RBF(1) kernel = GPy.kern.RBF(1)
m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, Z=Z) m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, Z=Z)
# distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
# likelihood = GPy.likelihoods.EP(Y, distribution)
# m = GPy.core.SparseGP(X, likelihood, kernel, Z)
# m.ensure_default_constraints()
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
@unittest.expectedFailure
def test_generalized_FITC(self):
N = 20
X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
k = GPy.kern.RBF(1) + GPy.kern.White(1)
Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
m = GPy.models.FITCClassification(X, Y, kernel=k)
m.update_likelihood_approximation()
self.assertTrue(m.checkgrad())
@unittest.expectedFailure
def test_multioutput_regression_1D(self): def test_multioutput_regression_1D(self):
X1 = np.random.rand(50, 1) * 8 X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5 X2 = np.random.rand(30, 1) * 5
@ -494,12 +573,11 @@ class GradientTests(np.testing.TestCase):
Y = np.vstack((Y1, Y2)) Y = np.vstack((Y1, Y2))
k1 = GPy.kern.RBF(1) k1 = GPy.kern.RBF(1)
m = GPy.models.GPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1]) m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
import ipdb;ipdb.set_trace() #import ipdb;ipdb.set_trace()
m.constrain_fixed('.*rbf_var', 1.) #m.constrain_fixed('.*rbf_var', 1.)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
@unittest.expectedFailure
def test_multioutput_sparse_regression_1D(self): def test_multioutput_sparse_regression_1D(self):
X1 = np.random.rand(500, 1) * 8 X1 = np.random.rand(500, 1) * 8
X2 = np.random.rand(300, 1) * 5 X2 = np.random.rand(300, 1) * 5
@ -509,8 +587,7 @@ class GradientTests(np.testing.TestCase):
Y = np.vstack((Y1, Y2)) Y = np.vstack((Y1, Y2))
k1 = GPy.kern.RBF(1) k1 = GPy.kern.RBF(1)
m = GPy.models.SparseGPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1]) m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
m.constrain_fixed('.*rbf_var', 1.)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def test_gp_heteroscedastic_regression(self): def test_gp_heteroscedastic_regression(self):
@ -539,6 +616,7 @@ class GradientTests(np.testing.TestCase):
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def test_gp_kronecker_gaussian(self): def test_gp_kronecker_gaussian(self):
np.random.seed(0)
N1, N2 = 30, 20 N1, N2 = 30, 20
X1 = np.random.randn(N1, 1) X1 = np.random.randn(N1, 1)
X2 = np.random.randn(N2, 1) X2 = np.random.randn(N2, 1)
@ -559,16 +637,16 @@ class GradientTests(np.testing.TestCase):
m.randomize() m.randomize()
mm[:] = m[:] mm[:] = m[:]
assert np.allclose(m.log_likelihood(), mm.log_likelihood()) self.assertTrue(np.allclose(m.log_likelihood(), mm.log_likelihood()))
assert np.allclose(m.gradient, mm.gradient) self.assertTrue(np.allclose(m.gradient, mm.gradient))
X1test = np.random.randn(100, 1) X1test = np.random.randn(100, 1)
X2test = np.random.randn(100, 1) X2test = np.random.randn(100, 1)
mean1, var1 = m.predict(X1test, X2test) mean1, var1 = m.predict(X1test, X2test)
yy, xx = np.meshgrid(X2test, X1test) yy, xx = np.meshgrid(X2test, X1test)
Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
mean2, var2 = mm.predict(Xgrid) mean2, var2 = mm.predict(Xgrid)
assert np.allclose(mean1, mean2) self.assertTrue( np.allclose(mean1, mean2) )
assert np.allclose(var1, var2) self.assertTrue( np.allclose(var1, var2) )
def test_gp_VGPC(self): def test_gp_VGPC(self):
num_obs = 25 num_obs = 25
@ -576,7 +654,8 @@ class GradientTests(np.testing.TestCase):
X = X[:, None] X = X[:, None]
Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None] Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
kern = GPy.kern.Bias(1) + GPy.kern.RBF(1) kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern) lik = GPy.likelihoods.Gaussian()
m = GPy.models.GPVariationalGaussianApproximation(X, Y, kernel=kern, likelihood=lik)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())

View file

@ -248,10 +248,16 @@ class ParameterizedTest(unittest.TestCase):
m.randomize() m.randomize()
self.assertEqual(m.p1, val) self.assertEqual(m.p1, val)
def test_checkgrad(self):
assert(self.testmodel.kern.checkgrad())
assert(self.testmodel.kern.lengthscale.checkgrad())
assert(self.testmodel.likelihood.checkgrad())
def test_printing(self): def test_printing(self):
print(self.test1) print(self.test1)
print(self.param) print(self.param)
print(self.test1['']) print(self.test1[''])
print(self.testmodel.hierarchy_name(False))
if __name__ == "__main__": if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.test_add_parameter'] #import sys;sys.argv = ['', 'Test.test_add_parameter']

View file

@ -20,6 +20,8 @@ from GPy.examples.dimensionality_reduction import mrd_simulation
from GPy.core.parameterization.variational import NormalPosterior from GPy.core.parameterization.variational import NormalPosterior
from GPy.models.gp_regression import GPRegression from GPy.models.gp_regression import GPRegression
from functools import reduce from functools import reduce
from GPy.util.caching import Cacher
from pickle import PicklingError
def toy_model(): def toy_model():
X = np.linspace(0,1,50)[:, None] X = np.linspace(0,1,50)[:, None]
@ -205,23 +207,6 @@ class Test(ListDictTestCase):
def _callback(self, what, which): def _callback(self, what, which):
what.count += 1 what.count += 1
@unittest.skip
def test_add_observer(self):
par = toy_model()
par.name = "original"
par.count = 0
par.add_observer(self, self._callback, 1)
pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
self.assertNotIn(par.observers[0], pcopy.observers)
pcopy = par.copy()
pcopy.name = "copy"
self.assertTrue(par.checkgrad())
self.assertTrue(pcopy.checkgrad())
self.assertTrue(pcopy.kern.checkgrad())
import ipdb;ipdb.set_trace()
self.assertIn(par.observers[0], pcopy.observers)
self.assertEqual(par.count, 3)
self.assertEqual(pcopy.count, 6) # 3 of each call to checkgrad
if __name__ == "__main__": if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.test_parameter_index_operations'] #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']

1
GPy/testing/run_coverage.sh Executable file
View file

@ -0,0 +1 @@
nosetests . --with-coverage --logging-level=INFO --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase

Some files were not shown because too many files have changed in this diff Show more