Mirror of https://github.com/SheffieldML/GPy.git — synced 2026-05-09 12:02:38 +02:00

Commit 1a02c65a61: "merging last master"
133 changed files with 13282 additions and 9562 deletions
@@ -17,7 +17,7 @@ before_install:
   - sudo ln -s /run/shm /dev/shm

 install:
-  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.9 scipy=0.16 matplotlib nose sphinx pip nose
   #- pip install .
   - python setup.py build_ext --inplace
   #--use-mirrors
@@ -1,7 +1 @@
-James Hensman
-Nicolo Fusi
-Ricardo Andrade
-Nicolas Durrande
-Alan Saul
-Max Zwiessele
-Neil D. Lawrence
+See contributors.
@@ -21,16 +21,18 @@ from . import plotting
 from .core import Model
 from .core.parameterization import Param, Parameterized, ObsAr

+from .__version__ import __version__

 #@nottest
 try:
     #Get rid of nose dependency by only ignoring if you have nose installed
     from nose.tools import nottest
     @nottest
-    def tests():
-        Tester(testing).test(verbose=10)
+    def tests(verbose=10):
+        Tester(testing).test(verbose=verbose)
 except:
-    def tests():
-        Tester(testing).test(verbose=10)
+    def tests(verbose=10):
+        Tester(testing).test(verbose=verbose)

 def load(file_path):
     """
GPy/__version__.py (new file, 1 line)
@@ -0,0 +1 @@
+__version__ = "0.8.8"
@@ -7,6 +7,6 @@ from .parameterization.param import Param, ParamConcatenation
 from .parameterization.observable_array import ObsAr

 from .gp import GP
-#from .svgp import SVGP
+from .svgp import SVGP
 from .sparse_gp import SparseGP
 from .mapping import *
GPy/core/gp.py (307 changed lines)
@@ -60,9 +60,11 @@ class GP(Model):
             self.normalizer.scale_by(Y)
             self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
             self.Y = Y
-        else:
+        elif isinstance(Y, np.ndarray):
             self.Y = ObsAr(Y)
             self.Y_normalized = self.Y
+        else:
+            self.Y = Y

         if Y.shape[0] != self.num_data:
             #There can be cases where we want inputs than outputs, for example if we have multiple latent
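For context, the Y normalization handled here (and undone later in predict_quantiles via inverse_mean / inverse_variance) amounts to a mean/variance rescaling of the outputs. A minimal numpy sketch of that idea, with made-up variable names rather than GPy's actual Normalizer class:

import numpy as np

Y = np.random.randn(50, 1) * 3.0 + 7.0       # toy outputs (illustrative)

# scale_by / normalize: centre and whiten the outputs
mean, std = Y.mean(0), Y.std(0)
Y_normalized = (Y - mean) / std

# inverse_mean / inverse_variance: map normalized predictions back
mu_norm, var_norm = Y_normalized.mean(0), Y_normalized.var(0)
mu = mu_norm * std + mean                    # inverse of the mean transform
var = var_norm * std ** 2                    # variances scale with std**2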
@@ -104,8 +106,23 @@ class GP(Model):
         self.link_parameter(self.likelihood)
         self.posterior = None

+        # The predictive variable to be used to predict using the posterior object's
+        # woodbury_vector and woodbury_inv is defined as predictive_variable
+        # as long as the posterior has the right woodbury entries.
+        # It is the input variable used for the covariance between
+        # X_star and the posterior of the GP.
+        # This is usually just a link to self.X (full GP) or self.Z (sparse GP).
+        # Make sure to name this variable and the predict functions will "just work"
+        # In maths the predictive variable is:
+        #   K_{xx} - K_{xp} W_{pp}^{-1} K_{px}
+        #   W_{pp} := \texttt{Woodbury inv}
+        #   p := _predictive_variable

-    def set_XY(self, X=None, Y=None, trigger_update=True):
+    @property
+    def _predictive_variable(self):
+        return self.X
+
+    def set_XY(self, X=None, Y=None):
         """
         Set the input / output data of the model
         This is useful if we wish to change our existing data but maintain the same model
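For reference, the posterior-prediction formula spelled out in the comment above can be sketched in plain numpy. The Woodbury names follow this diff; the toy RBF kernel, data and noise level below are illustrative assumptions, not GPy code.

import numpy as np

def rbf(A, B, variance=1.0, lengthscale=1.0):
    # toy squared-exponential kernel, for illustration only
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return variance * np.exp(-0.5 * d2 / lengthscale ** 2)

X = np.random.randn(20, 1)                       # training inputs (assumed)
Y = np.sin(X) + 0.1 * np.random.randn(20, 1)     # training outputs (assumed)
noise = 0.1

# For a full GP the predictive variable p is X, and W_pp = K_pp + noise*I,
# so woodbury_inv = W_pp^{-1} and woodbury_vector = W_pp^{-1} Y.
W = rbf(X, X) + noise * np.eye(X.shape[0])
woodbury_inv = np.linalg.inv(W)
woodbury_vector = woodbury_inv.dot(Y)

Xstar = np.linspace(-2, 2, 5)[:, None]
Kxp = rbf(Xstar, X)                              # K_{xp}
mu = Kxp.dot(woodbury_vector)                    # posterior mean
cov = rbf(Xstar, Xstar) - Kxp.dot(woodbury_inv).dot(Kxp.T)   # K_xx - K_xp W_pp^{-1} K_px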
@@ -115,7 +132,7 @@ class GP(Model):
         :param Y: output observations
         :type Y: np.ndarray
         """
-        if trigger_update: self.update_model(False)
+        self.update_model(False)
         if Y is not None:
             if self.normalizer is not None:
                 self.normalizer.scale_by(Y)
@@ -131,34 +148,33 @@ class GP(Model):
                 assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!"
                 self.unlink_parameter(self.X)
                 self.X = X
-                self.link_parameters(self.X)
+                self.link_parameter(self.X)
             else:
                 self.unlink_parameter(self.X)
                 from ..core import Param
                 self.X = Param('latent mean',X)
-                self.link_parameters(self.X)
+                self.link_parameter(self.X)
         else:
             self.X = ObsAr(X)
-        if trigger_update: self.update_model(True)
-        if trigger_update: self._trigger_params_changed()
+        self.update_model(True)

-    def set_X(self,X, trigger_update=True):
+    def set_X(self,X):
         """
         Set the input data of the model

         :param X: input observations
         :type X: np.ndarray
         """
-        self.set_XY(X=X, trigger_update=trigger_update)
+        self.set_XY(X=X)

-    def set_Y(self,Y, trigger_update=True):
+    def set_Y(self,Y):
         """
         Set the output data of the model

         :param X: output observations
         :type X: np.ndarray
         """
-        self.set_XY(Y=Y, trigger_update=trigger_update)
+        self.set_XY(Y=Y)

     def parameters_changed(self):
         """
@@ -181,7 +197,7 @@ class GP(Model):
         """
         return self._log_marginal_likelihood

-    def _raw_predict(self, _Xnew, full_cov=False, kern=None):
+    def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
         For making predictions, does not account for normalization or likelihood

@@ -197,24 +213,33 @@ class GP(Model):
        if kern is None:
            kern = self.kern

        Kx = kern.K(_Xnew, self.X).T
        WiKx = np.dot(self.posterior.woodbury_inv, Kx)
        Kx = kern.K(self._predictive_variable, Xnew)
        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
        if len(mu.shape)==1:
            mu = mu.reshape(-1,1)
        if full_cov:
            Kxx = kern.K(_Xnew)
            var = Kxx - np.dot(Kx.T, WiKx)
            Kxx = kern.K(Xnew)
            if self.posterior.woodbury_inv.ndim == 2:
                var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
            elif self.posterior.woodbury_inv.ndim == 3: # Missing data
                var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
                from ..util.linalg import mdot
                for i in range(var.shape[2]):
                    var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
            var = var
        else:
            Kxx = kern.Kdiag(_Xnew)
            var = Kxx - np.sum(WiKx*Kx, 0)
            var = var.reshape(-1, 1)
            var[var<0.] = 0.
            Kxx = kern.Kdiag(Xnew)
            if self.posterior.woodbury_inv.ndim == 2:
                var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
            elif self.posterior.woodbury_inv.ndim == 3: # Missing data
                var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
                for i in range(var.shape[1]):
                    var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
            var = var
        #add in the mean function
        if self.mean_function is not None:
            mu += self.mean_function.f(Xnew)

        #force mu to be a column vector
        if len(mu.shape)==1: mu = mu[:,None]

        #add the mean function in
        if not self.mean_function is None:
            mu += self.mean_function.f(_Xnew)
        return mu, var

    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@@ -247,7 +272,7 @@ class GP(Model):
         mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
         return mean, var

-    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None):
         """
         Get the predictive quantiles around the prediction at X

@@ -255,10 +280,12 @@ class GP(Model):
         :type X: np.ndarray (Xnew x self.input_dim)
         :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
         :type quantiles: tuple
+        :param kern: optional kernel to use for prediction
+        :type predict_kw: dict
         :returns: list of quantiles for each X and predictive quantiles for interval combination
         :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
         """
-        m, v = self._raw_predict(X, full_cov=False)
+        m, v = self._raw_predict(X, full_cov=False, kern=kern)
         if self.normalizer is not None:
             m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
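A minimal sketch of what Gaussian predictive quantiles look like, assuming a Gaussian likelihood (GPy's likelihood.predictive_quantiles handles the general case); the percentile convention (2.5, 97.5) matches the default above, and the mean/variance values are made up.

import numpy as np
from scipy.stats import norm

m = np.array([[0.0], [1.0]])          # predictive means (toy values)
v = np.array([[0.5], [2.0]])          # predictive variances, incl. noise (toy values)

quantiles = (2.5, 97.5)
# each quantile q (in percent) maps to mean + sqrt(var) * Phi^{-1}(q/100)
qs = [m + np.sqrt(v) * norm.ppf(q / 100.) for q in quantiles]
lower, upper = qs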
@@ -292,6 +319,120 @@ class GP(Model):
        return dmu_dX, dv_dX


    def predict_jacobian(self, Xnew, kern=None, full_cov=True):
        """
        Compute the derivatives of the posterior of the GP.

        Given a set of points at which to predict X* (size [N*,Q]), compute the
        mean and variance of the derivative. Resulting arrays are sized:

         dL_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
           Note that this is the mean and variance of the derivative,
           not the derivative of the mean and variance! (See predictive_gradients for that)

         dv_dX* -- [N*, Q], (since all outputs have the same variance)
           If there is missing data, it is not implemented for now, but
           there will be one output variance per output dimension.

        :param X: The points at which to get the predictive gradients.
        :type X: np.ndarray (Xnew x self.input_dim)
        :param kern: The kernel to compute the jacobian for.
        :param boolean full_cov: whether to return the full covariance of the jacobian.

        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q,(D)) ]

        Note: We always return sum in input_dim gradients, as the off-diagonals
            in the input_dim are not needed for further calculations.
            This is a compromise for increase in speed. Mathematically the jacobian would
            have another dimension in Q.
        """
        if kern is None:
            kern = self.kern

        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))

        for i in range(self.output_dim):
            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)

        dK_dXnew_full = np.empty((self._predictive_variable.shape[0], Xnew.shape[0], Xnew.shape[1]))
        for i in range(self._predictive_variable.shape[0]):
            dK_dXnew_full[i] = kern.gradients_X([[1.]], Xnew, self._predictive_variable[[i]])

        if full_cov:
            dK2_dXdX = kern.gradients_XX([[1.]], Xnew)
        else:
            dK2_dXdX = kern.gradients_XX_diag([[1.]], Xnew)

        def compute_cov_inner(wi):
            if full_cov:
                # full covariance gradients:
                var_jac = dK2_dXdX - np.einsum('qnm,miq->niq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
            else:
                var_jac = dK2_dXdX - np.einsum('qim,miq->iq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
            return var_jac

        if self.posterior.woodbury_inv.ndim == 3: # Missing data:
            if full_cov:
                var_jac = np.empty((Xnew.shape[0],Xnew.shape[0],Xnew.shape[1],self.output_dim))
                for d in range(self.posterior.woodbury_inv.shape[2]):
                    var_jac[:, :, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
            else:
                var_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
                for d in range(self.posterior.woodbury_inv.shape[2]):
                    var_jac[:, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
        else:
            var_jac = compute_cov_inner(self.posterior.woodbury_inv)
        return mean_jac, var_jac
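As a sanity check on the quantity computed above: the mean Jacobian of a GP posterior is just dK(x*, X)/dx* times the Woodbury vector. A small numpy sketch under the same toy-RBF assumptions as earlier (not GPy code) verifies it against finite differences.

import numpy as np

def rbf(A, B, ls=1.0):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2 / ls ** 2)

X = np.random.randn(15, 2)
Y = np.sin(X[:, :1])
woodbury_vector = np.linalg.solve(rbf(X, X) + 0.1 * np.eye(15), Y)

def mean(xstar):
    return rbf(xstar[None, :], X).dot(woodbury_vector)[0, 0]

xstar = np.zeros(2)
# analytic jacobian of the posterior mean: dK(x*, x_n)/dx*_q = -(x*_q - x_nq)/ls**2 * K(x*, x_n)
K = rbf(xstar[None, :], X)                       # 1 x N
dK = -(xstar[None, :] - X) * K.T                 # N x Q (ls = 1)
mean_jac = dK.T.dot(woodbury_vector).ravel()     # Q

# central finite differences agree to ~1e-6
eps = 1e-6
fd = np.array([(mean(xstar + eps * e) - mean(xstar - eps * e)) / (2 * eps)
               for e in np.eye(2)])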
    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
        """
        Predict the wishard embedding G of the GP. This is the density of the
        input of the GP defined by the probabilistic function mapping f.
        G = J_mean.T*J_mean + output_dim*J_cov.

        :param array-like Xnew: The points at which to evaluate the magnification.
        :param :py:class:`~GPy.kern.Kern` kern: The kernel to use for the magnification.

        Supplying only a part of the learning kernel gives insights into the density
        of the specific kernel part of the input function. E.g. one can see how dense the
        linear part of a kernel is compared to the non-linear part etc.
        """
        if kern is None:
            kern = self.kern

        mu_jac, var_jac = self.predict_jacobian(Xnew, kern, full_cov=False)
        mumuT = np.einsum('iqd,ipd->iqp', mu_jac, mu_jac)
        Sigma = np.zeros(mumuT.shape)
        if var_jac.ndim == 3:
            Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = var_jac.sum(-1)
        else:
            Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = self.output_dim*var_jac
        G = 0.
        if mean:
            G += mumuT
        if covariance:
            G += Sigma
        return G
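To make G = J_mean.T*J_mean + output_dim*J_cov concrete, here is a small numpy sketch for a single input point with made-up Jacobian values (D outputs, Q inputs); sqrt(det(G)) is what predict_magnification computes below.

import numpy as np

Q, D = 2, 3                              # input and output dimensions (toy values)
J_mean = np.random.randn(Q, D)           # posterior mean Jacobian at one point
J_var = np.abs(np.random.randn(Q))       # per-input-dimension Jacobian variance

G = J_mean.dot(J_mean.T) + D * np.diag(J_var)    # Q x Q metric at this point
magnification = np.sqrt(np.linalg.det(G))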
    def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
        """
        Predict the magnification factor as

        sqrt(det(G))

        for each point N in Xnew
        """
        G = self.predict_wishard_embedding(Xnew, kern, mean, covariance)
        from ..util.linalg import jitchol
        mag = np.empty(Xnew.shape[0])
        for n in range(Xnew.shape[0]):
            try:
                mag[n] = np.sqrt(np.exp(2*np.sum(np.log(np.diag(jitchol(G[n, :, :]))))))
            except:
                mag[n] = np.sqrt(np.linalg.det(G[n, :, :]))
        return mag

    def posterior_samples_f(self,X,size=10, full_cov=True):
        """
        Samples the posterior GP at the points X.
@ -395,8 +536,8 @@ class GP(Model):
|
|||
def plot(self, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', fixed_inputs=[],
|
||||
levels=20, samples=0, fignum=None, ax=None, resolution=None,
|
||||
plot_raw=False,
|
||||
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx', predict_kw=None):
|
||||
plot_raw=False, linecol=None,fillcol=None, Y_metadata=None,
|
||||
data_symbol='kx', predict_kw=None, plot_training_data=True, samples_y=0, apply_link=False):
|
||||
"""
|
||||
Plot the posterior of the GP.
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
|
|
@ -419,7 +560,7 @@ class GP(Model):
|
|||
:param levels: number of levels to plot in a contour plot.
|
||||
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:param samples: the number of a posteriori samples to plot, p(f*|y)
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
|
|
@ -433,6 +574,12 @@ class GP(Model):
|
|||
:type Y_metadata: dict
|
||||
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
|
||||
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
|
||||
:param plot_training_data: whether or not to plot the training points
|
||||
:type plot_training_data: boolean
|
||||
:param samples_y: the number of a posteriori samples to plot, p(y*|y)
|
||||
:type samples_y: int
|
||||
:param apply_link: if there is a link function of the likelihood, plot the link(f*) rather than f*, when plotting posterior samples f
|
||||
:type apply_link: boolean
|
||||
"""
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import models_plots
|
||||
|
|
@ -445,7 +592,103 @@ class GP(Model):
|
|||
which_data_ycols, fixed_inputs,
|
||||
levels, samples, fignum, ax, resolution,
|
||||
plot_raw=plot_raw, Y_metadata=Y_metadata,
|
||||
data_symbol=data_symbol, predict_kw=predict_kw, **kw)
|
||||
data_symbol=data_symbol, predict_kw=predict_kw,
|
||||
plot_training_data=plot_training_data, samples_y=samples_y, apply_link=apply_link, **kw)
|
||||
|
||||
|
||||
def plot_data(self, which_data_rows='all',
|
||||
which_data_ycols='all', visible_dims=None,
|
||||
fignum=None, ax=None, data_symbol='kx'):
|
||||
"""
|
||||
Plot the training data
|
||||
- For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data
|
||||
using which_data_rows and which_data_ycols.
|
||||
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
|
||||
:type plot_limits: np.array
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
|
||||
:param which_data_ycols: when the data has several columns (independant outputs), only plot these
|
||||
:type which_data_ycols: 'all' or a list of integers
|
||||
:param visible_dims: an array specifying the input dimensions to plot (maximum two)
|
||||
:type visible_dims: a numpy array
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot, p(f*|y)
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
|
||||
:type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
|
||||
:param fillcol: color of fill [Tango.colorsHex['lightBlue']]
|
||||
:type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
|
||||
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
|
||||
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
|
||||
"""
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import models_plots
|
||||
kw = {}
|
||||
return models_plots.plot_data(self, which_data_rows,
|
||||
which_data_ycols, visible_dims,
|
||||
fignum, ax, data_symbol, **kw)
|
||||
|
||||
|
||||
def errorbars_trainset(self, which_data_rows='all',
|
||||
which_data_ycols='all', fixed_inputs=[], fignum=None, ax=None,
|
||||
linecol=None, data_symbol='kx', predict_kw=None, plot_training_data=True,lw=None):
|
||||
|
||||
"""
|
||||
Plot the posterior error bars corresponding to the training data
|
||||
- For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data
|
||||
using which_data_rows and which_data_ycols.
|
||||
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
|
||||
:param which_data_ycols: when the data has several columns (independant outputs), only plot these
|
||||
:type which_data_rows: 'all' or a list of integers
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:param plot_training_data: whether or not to plot the training points
|
||||
:type plot_training_data: boolean
|
||||
"""
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import models_plots
|
||||
kw = {}
|
||||
if lw is not None:
|
||||
kw['lw'] = lw
|
||||
return models_plots.errorbars_trainset(self, which_data_rows, which_data_ycols, fixed_inputs,
|
||||
fignum, ax, linecol, data_symbol,
|
||||
predict_kw, plot_training_data, **kw)
|
||||
|
||||
|
||||
def plot_magnification(self, labels=None, which_indices=None,
|
||||
resolution=50, ax=None, marker='o', s=40,
|
||||
fignum=None, legend=True,
|
||||
plot_limits=None,
|
||||
aspect='auto', updates=False, plot_inducing=True, kern=None, **kwargs):
|
||||
|
||||
import sys
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import dim_reduction_plots
|
||||
|
||||
return dim_reduction_plots.plot_magnification(self, labels, which_indices,
|
||||
resolution, ax, marker, s,
|
||||
fignum, plot_inducing, legend,
|
||||
plot_limits, aspect, updates, **kwargs)
|
||||
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
"""
|
||||
|
|
|
|||
|
|
@@ -32,7 +32,7 @@ class Bijective_mapping(Mapping):
     also back from f to X. The inverse mapping is called g().
     """
     def __init__(self, input_dim, output_dim, name='bijective_mapping'):
-        super(Bijective_apping, self).__init__(name=name)
+        super(Bijective_mapping, self).__init__(name=name)

     def g(self, f):
         """Inverse mapping from output domain of the function to the inputs."""
|
|
|
|||
|
|
@ -180,6 +180,7 @@ class Param(Parameterizable, ObsAr):
|
|||
import copy
|
||||
Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
|
||||
return s
|
||||
|
||||
def _setup_observers(self):
|
||||
"""
|
||||
Setup the default observers
|
||||
|
|
|
|||
|
|
@ -197,9 +197,10 @@ class Parameterized(Parameterizable):
|
|||
raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))
|
||||
|
||||
start = sum([p.size for p in self.parameters[:param._parent_index_]])
|
||||
self._remove_parameter_name(param)
|
||||
self.size -= param.size
|
||||
del self.parameters[param._parent_index_]
|
||||
self._remove_parameter_name(param)
|
||||
|
||||
|
||||
param._disconnect_parent()
|
||||
param.remove_observer(self, self._pass_through_notify_observers)
|
||||
|
|
@ -315,7 +316,7 @@ class Parameterized(Parameterizable):
|
|||
param[:] = val; return
|
||||
except AttributeError:
|
||||
pass
|
||||
object.__setattr__(self, name, val);
|
||||
return object.__setattr__(self, name, val);
|
||||
|
||||
#===========================================================================
|
||||
# Pickling
|
||||
|
|
|
|||
|
|
@ -366,6 +366,7 @@ class InverseGamma(Gamma):
|
|||
def rvs(self, n):
|
||||
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
|
||||
|
||||
|
||||
class DGPLVM_KFDA(Prior):
|
||||
"""
|
||||
Implementation of the Discriminative Gaussian Process Latent Variable function using
|
||||
|
|
@ -512,6 +513,7 @@ class DGPLVM_KFDA(Prior):
|
|||
self.A = self.compute_A(lst_ni)
|
||||
self.x_shape = x_shape
|
||||
|
||||
|
||||
class DGPLVM(Prior):
|
||||
"""
|
||||
Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
|
||||
|
|
@ -669,7 +671,7 @@ class DGPLVM(Prior):
|
|||
M_i = self.compute_Mi(cls)
|
||||
Sb = self.compute_Sb(cls, M_i, M_0)
|
||||
Sw = self.compute_Sw(cls, M_i)
|
||||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
# sb_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
|
||||
|
|
@ -903,7 +905,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
|
|||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
|
||||
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
|
||||
|
||||
# This function calculates derivative of the log of prior function
|
||||
|
|
@ -927,7 +929,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
|
|||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
|
||||
Sb_inv_N_trans = np.transpose(Sb_inv_N)
|
||||
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
|
||||
Sw_trans = np.transpose(Sw)
|
||||
|
|
@ -1198,6 +1200,7 @@ class DGPLVM_T(Prior):
|
|||
|
||||
|
||||
|
||||
|
||||
class HalfT(Prior):
|
||||
"""
|
||||
Implementation of the half student t probability function, coupled with random variables.
|
||||
|
|
@ -1208,15 +1211,17 @@ class HalfT(Prior):
|
|||
"""
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
def __new__(cls, A, nu): # Singleton:
|
||||
|
||||
def __new__(cls, A, nu): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().A == A and instance().nu == nu:
|
||||
return instance()
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, A, nu)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, A, nu):
|
||||
self.A = float(A)
|
||||
self.nu = float(nu)
|
||||
|
|
@ -1225,37 +1230,81 @@ class HalfT(Prior):
|
|||
def __str__(self):
|
||||
return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
|
||||
|
||||
def lnpdf(self,theta):
|
||||
return (theta>0) * ( self.constant -.5*(self.nu+1) * np.log( 1.+ (1./self.nu) * (theta/self.A)**2 ) )
|
||||
def lnpdf(self, theta):
|
||||
return (theta > 0) * (self.constant - .5*(self.nu + 1) * np.log(1. + (1./self.nu) * (theta/self.A)**2))
|
||||
|
||||
#theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
|
||||
#lnpdfs = np.zeros_like(theta)
|
||||
#theta = np.array([theta])
|
||||
#above_zero = theta.flatten()>1e-6
|
||||
#v = self.nu
|
||||
#sigma2=self.A
|
||||
#stop
|
||||
#lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
|
||||
# - gammaln(v * 0.5)
|
||||
# - 0.5*np.log(sigma2 * v * np.pi)
|
||||
# - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
|
||||
#)
|
||||
#return lnpdfs
|
||||
# theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
|
||||
# lnpdfs = np.zeros_like(theta)
|
||||
# theta = np.array([theta])
|
||||
# above_zero = theta.flatten()>1e-6
|
||||
# v = self.nu
|
||||
# sigma2=self.A
|
||||
# stop
|
||||
# lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
|
||||
# - gammaln(v * 0.5)
|
||||
# - 0.5*np.log(sigma2 * v * np.pi)
|
||||
# - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
|
||||
# )
|
||||
# return lnpdfs
|
||||
|
||||
def lnpdf_grad(self,theta):
|
||||
theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
|
||||
def lnpdf_grad(self, theta):
|
||||
theta = theta if isinstance(theta, np.ndarray) else np.array([theta])
|
||||
grad = np.zeros_like(theta)
|
||||
above_zero = theta>1e-6
|
||||
above_zero = theta > 1e-6
|
||||
v = self.nu
|
||||
sigma2=self.A
|
||||
sigma2 = self.A
|
||||
grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
|
||||
return grad
|
||||
|
||||
def rvs(self, n):
|
||||
#return np.random.randn(n) * self.sigma + self.mu
|
||||
from scipy.stats import t
|
||||
#[np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
|
||||
ret = t.rvs(self.nu,loc=0,scale=self.A, size=n)
|
||||
ret[ret<0] = 0
|
||||
return ret
|
||||
# return np.random.randn(n) * self.sigma + self.mu
|
||||
from scipy.stats import t
|
||||
# [np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
|
||||
ret = t.rvs(self.nu, loc=0, scale=self.A, size=n)
|
||||
ret[ret < 0] = 0
|
||||
return ret
|
||||
|
||||
|
||||
class Exponential(Prior):
|
||||
"""
|
||||
Implementation of the Exponential probability function,
|
||||
coupled with random variables.
|
||||
|
||||
:param l: shape parameter
|
||||
|
||||
"""
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
|
||||
def __new__(cls, l): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().l == l:
|
||||
return instance()
|
||||
o = super(Exponential, cls).__new__(cls, l)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, l):
|
||||
self.l = l
|
||||
|
||||
def __str__(self):
|
||||
return "Exp({:.2g})".format(self.l)
|
||||
|
||||
def summary(self):
|
||||
ret = {"E[x]": 1. / self.l,
|
||||
"E[ln x]": np.nan,
|
||||
"var[x]": 1. / self.l**2,
|
||||
"Entropy": 1. - np.log(self.l),
|
||||
"Mode": 0.}
|
||||
return ret
|
||||
|
||||
def lnpdf(self, x):
|
||||
return np.log(self.l) - self.l * x
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return - self.l
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.exponential(scale=self.l, size=n)
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ class Transformation(object):
|
|||
import matplotlib.pyplot as plt
|
||||
from ...plotting.matplot_dep import base_plots
|
||||
x = np.linspace(-8,8)
|
||||
base_plots.meanplot(x, self.f(x),axes=axes*args,**kw)
|
||||
base_plots.meanplot(x, self.f(x), *args, ax=axes, **kw)
|
||||
axes = plt.gca()
|
||||
axes.set_xlabel(xlabel)
|
||||
axes.set_ylabel(ylabel)
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ class SparseGP(GP):
|
|||
else:
|
||||
#inference_method = ??
|
||||
raise NotImplementedError("what to do what to do?")
|
||||
print("defaulting to ", inference_method, "for latent function inference")
|
||||
print(("defaulting to ", inference_method, "for latent function inference"))
|
||||
|
||||
self.Z = Param('inducing inputs', Z)
|
||||
self.num_inducing = Z.shape[0]
|
||||
|
|
@ -60,6 +60,10 @@ class SparseGP(GP):
|
|||
self.link_parameter(self.Z, index=0)
|
||||
self.posterior = None
|
||||
|
||||
@property
|
||||
def _predictive_variable(self):
|
||||
return self.Z
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
return isinstance(self.X, VariationalPosterior)
|
||||
|
||||
|
|
@ -117,45 +121,48 @@ class SparseGP(GP):
|
|||
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
|
||||
we take only the diagonal elements across N.
|
||||
|
||||
For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we
|
||||
return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD).
|
||||
This is for both with and without missing data. See for missing data SparseGP implementation py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
|
||||
For uncertain inputs, the SparseGP bound produces cannot predict the full covariance matrix full_cov for now.
|
||||
The implementation of that will follow. However, for each dimension the
|
||||
covariance changes, so if full_cov is False (standard), we return the variance
|
||||
for each dimension [NxD].
|
||||
"""
|
||||
|
||||
if kern is None: kern = self.kern
|
||||
|
||||
if not isinstance(Xnew, VariationalPosterior):
|
||||
Kx = kern.K(self.Z, Xnew)
|
||||
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
Kxx = kern.K(Xnew)
|
||||
if self.posterior.woodbury_inv.ndim == 2:
|
||||
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
|
||||
elif self.posterior.woodbury_inv.ndim == 3:
|
||||
var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
|
||||
for i in range(var.shape[2]):
|
||||
var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
|
||||
var = var
|
||||
else:
|
||||
Kxx = kern.Kdiag(Xnew)
|
||||
if self.posterior.woodbury_inv.ndim == 2:
|
||||
var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
|
||||
elif self.posterior.woodbury_inv.ndim == 3:
|
||||
var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
|
||||
for i in range(var.shape[1]):
|
||||
var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
|
||||
var = var
|
||||
#add in the mean function
|
||||
if self.mean_function is not None:
|
||||
mu += self.mean_function.f(Xnew)
|
||||
# Kx = kern.K(self._predictive_variable, Xnew)
|
||||
# mu = np.dot(Kx.T, self.posterior.woodbury_vector)
|
||||
# if full_cov:
|
||||
# Kxx = kern.K(Xnew)
|
||||
# if self.posterior.woodbury_inv.ndim == 2:
|
||||
# var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
|
||||
# elif self.posterior.woodbury_inv.ndim == 3:
|
||||
# var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
|
||||
# for i in range(var.shape[2]):
|
||||
# var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
|
||||
# var = var
|
||||
# else:
|
||||
# Kxx = kern.Kdiag(Xnew)
|
||||
# if self.posterior.woodbury_inv.ndim == 2:
|
||||
# var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
|
||||
# elif self.posterior.woodbury_inv.ndim == 3:
|
||||
# var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
|
||||
# for i in range(var.shape[1]):
|
||||
# var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
|
||||
# var = var
|
||||
# #add in the mean function
|
||||
# if self.mean_function is not None:
|
||||
# mu += self.mean_function.f(Xnew)
|
||||
mu, var = super(SparseGP, self)._raw_predict(Xnew, full_cov, kern)
|
||||
else:
|
||||
psi0_star = kern.psi0(self.Z, Xnew)
|
||||
psi1_star = kern.psi1(self.Z, Xnew)
|
||||
psi0_star = kern.psi0(self._predictive_variable, Xnew)
|
||||
psi1_star = kern.psi1(self._predictive_variable, Xnew)
|
||||
#psi2_star = kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
|
||||
la = self.posterior.woodbury_vector
|
||||
mu = np.dot(psi1_star, la) # TODO: dimensions?
|
||||
|
||||
if full_cov:
|
||||
raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
|
||||
var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
|
||||
di = np.diag_indices(la.shape[1])
|
||||
else:
|
||||
|
|
@ -163,7 +170,7 @@ class SparseGP(GP):
|
|||
|
||||
for i in range(Xnew.shape[0]):
|
||||
_mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
|
||||
psi2_star = kern.psi2(self.Z, NormalPosterior(_mu, _var))
|
||||
psi2_star = kern.psi2(self._predictive_variable, NormalPosterior(_mu, _var))
|
||||
tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
|
||||
|
||||
var_ = mdot(la.T, tmp, la)
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ class SparseGP_MPI(SparseGP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
if mpi_comm != None:
|
||||
if inference_method is None:
|
||||
|
|
|
|||
|
|
@@ -1,11 +1,11 @@
 # Copyright (c) 2014, James Hensman, Alex Matthews
-# Distributed under the terms of the GNU General public License, see LICENSE.txt
+# Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
 from ..util import choleskies
 from .sparse_gp import SparseGP
 from .parameterization.param import Param
-from ..inference.latent_function_inference import SVGP as svgp_inf
+from ..inference.latent_function_inference.svgp import SVGP as svgp_inf


 class SVGP(SparseGP):
|
|
|||
|
|
@ -24,7 +24,6 @@ class VerboseOptimization(object):
|
|||
self.model.add_observer(self, self.print_status)
|
||||
self.status = 'running'
|
||||
self.clear = clear_after_finish
|
||||
self.deltat = .2
|
||||
|
||||
self.update()
|
||||
|
||||
|
|
@ -80,6 +79,7 @@ class VerboseOptimization(object):
|
|||
|
||||
def __enter__(self):
|
||||
self.start = time.time()
|
||||
self._time = self.start
|
||||
return self
|
||||
|
||||
def print_out(self, seconds):
|
||||
|
|
@ -143,12 +143,12 @@ class VerboseOptimization(object):
|
|||
def print_status(self, me, which=None):
|
||||
self.update()
|
||||
|
||||
seconds = time.time()-self.start
|
||||
t = time.time()
|
||||
seconds = t-self.start
|
||||
#sys.stdout.write(" "*len(self.message))
|
||||
self.deltat += seconds
|
||||
if self.deltat > .2:
|
||||
if t-self._time > .3 or seconds < .3:
|
||||
self.print_out(seconds)
|
||||
self.deltat = 0
|
||||
self._time = t
|
||||
|
||||
self.iteration += 1
|
||||
|
||||
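The timing change above throttles the optimizer status line to at most one refresh every 0.3 s (while still printing freely during the first 0.3 s). A tiny standalone sketch of that pattern, unrelated to GPy's VerboseOptimization class:

import time

class Throttled(object):
    """Print a status line at most every 0.3 s (but always during the first 0.3 s)."""
    def __init__(self):
        self.start = time.time()
        self._time = self.start

    def maybe_print(self, message):
        t = time.time()
        seconds = t - self.start
        if t - self._time > .3 or seconds < .3:
            print("%s (%.1fs elapsed)" % (message, seconds))
            self._time = t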
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
import numpy as np
|
||||
try:
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
except:
|
||||
pass
|
||||
import GPy
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ def student_t_approx(optimize=True, plot=True):
|
|||
debug=True
|
||||
if debug:
|
||||
m4.optimize(messages=1)
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
pb.plot(m4.X, m4.inference_method.f_hat)
|
||||
pb.plot(m4.X, m4.Y, 'rx')
|
||||
m4.plot()
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
Gaussian Processes regression examples
|
||||
"""
|
||||
try:
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
except:
|
||||
pass
|
||||
import numpy as np
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ from .expectation_propagation_dtc import EPDTC
|
|||
from .dtc import DTC
|
||||
from .fitc import FITC
|
||||
from .var_dtc_parallel import VarDTC_minibatch
|
||||
#from .svgp import SVGP
|
||||
from .var_gauss import VarGauss
|
||||
|
||||
# class FullLatentFunctionData(object):
|
||||
#
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@
|
|||
import numpy as np
|
||||
from ...core import Model
|
||||
from ...core.parameterization import variational
|
||||
from ...util.linalg import tdot
|
||||
from GPy.core.parameterization.variational import VariationalPosterior
|
||||
|
||||
def infer_newX(model, Y_new, optimize=True, init='L2'):
|
||||
"""
|
||||
|
|
@ -60,18 +62,19 @@ class InferenceX(Model):
|
|||
# self.kern.GPU(True)
|
||||
from copy import deepcopy
|
||||
self.posterior = deepcopy(model.posterior)
|
||||
if hasattr(model, 'variational_prior'):
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
if isinstance(model.X, VariationalPosterior):
|
||||
self.uncertain_input = True
|
||||
from ...models.ss_gplvm import IBPPrior
|
||||
from ...models.ss_mrd import IBPPrior_SSMRD
|
||||
if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD):
|
||||
from ...core.parameterization.variational import SpikeAndSlabPrior
|
||||
self.variational_prior = SpikeAndSlabPrior(pi=05,learnPi=False, group_spike=False)
|
||||
self.variational_prior = SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False)
|
||||
else:
|
||||
self.variational_prior = model.variational_prior.copy()
|
||||
else:
|
||||
self.uncertain_input = False
|
||||
if hasattr(model, 'inducing_inputs'):
|
||||
if hasattr(model, 'Z'):
|
||||
self.sparse_gp = True
|
||||
self.Z = model.Z.copy()
|
||||
else:
|
||||
|
|
@ -125,13 +128,13 @@ class InferenceX(Model):
|
|||
wv = wv[:,self.valid_dim]
|
||||
output_dim = self.valid_dim.sum()
|
||||
if self.ninan is not None:
|
||||
self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - np.einsum('md,od->mo',wv, wv)[:, :, None]).sum(-1)
|
||||
self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - tdot(wv)[:, :, None]).sum(-1)
|
||||
else:
|
||||
self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))
|
||||
self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - tdot(wv))
|
||||
self.dL_dpsi1 = beta*np.dot(self.Y[:,self.valid_dim], wv.T)
|
||||
self.dL_dpsi0 = - beta/2.* np.ones(self.Y.shape[0])
|
||||
else:
|
||||
self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))/2.
|
||||
self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - tdot(wv))/2. #np.einsum('md,od->mo',wv, wv)
|
||||
self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
|
||||
self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0])
|
||||
|
||||
|
|
|
|||
|
|
@@ -172,6 +172,7 @@ class Laplace(LatentFunctionInference):
         def obj(Ki_f, f):
             ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
             if np.isnan(ll):
+                import ipdb; ipdb.set_trace() # XXX BREAKPOINT
                 return -np.inf
             else:
                 return ll
|
|
|
|||
|
|
@ -64,9 +64,7 @@ class VarDTC(LatentFunctionInference):
|
|||
def get_VVTfactor(self, Y, prec):
|
||||
return Y * prec # TODO chache this, and make it effective
|
||||
|
||||
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None):
|
||||
|
||||
_, output_dim = Y.shape
|
||||
uncertain_inputs = isinstance(X, VariationalPosterior)
|
||||
|
|
@ -95,17 +93,28 @@ class VarDTC(LatentFunctionInference):
|
|||
|
||||
# The rather complex computations of A, and the psi stats
|
||||
if uncertain_inputs:
|
||||
psi0 = kern.psi0(Z, X)
|
||||
psi1 = kern.psi1(Z, X)
|
||||
if psi0 is None:
|
||||
psi0 = kern.psi0(Z, X)
|
||||
if psi1 is None:
|
||||
psi1 = kern.psi1(Z, X)
|
||||
if het_noise:
|
||||
psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
|
||||
if psi2 is None:
|
||||
assert len(psi2.shape) == 3 # Need to have not summed out N
|
||||
#FIXME: Need testing
|
||||
psi2_beta = np.sum([psi2[X[i:i+1,:], :, :] * beta_i for i,beta_i in enumerate(beta)],0)
|
||||
else:
|
||||
psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
|
||||
else:
|
||||
psi2_beta = kern.psi2(Z,X) * beta
|
||||
if psi2 is None:
|
||||
psi2 = kern.psi2(Z,X)
|
||||
psi2_beta = psi2 * beta
|
||||
LmInv = dtrtri(Lm)
|
||||
A = LmInv.dot(psi2_beta.dot(LmInv.T))
|
||||
else:
|
||||
psi0 = kern.Kdiag(X)
|
||||
psi1 = kern.K(X, Z)
|
||||
if psi0 is None:
|
||||
psi0 = kern.Kdiag(X)
|
||||
if psi1 is None:
|
||||
psi1 = kern.K(X, Z)
|
||||
if het_noise:
|
||||
tmp = psi1 * (np.sqrt(beta))
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -172,17 +172,22 @@ class VarDTC_minibatch(LatentFunctionInference):
|
|||
if not np.isfinite(Kmm).all():
|
||||
print(Kmm)
|
||||
Lm = jitchol(Kmm)
|
||||
LmInv = dtrtri(Lm)
|
||||
|
||||
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
|
||||
LmInvPsi2LmInvT = LmInv.dot(psi2_full.dot(LmInv.T))
|
||||
Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
|
||||
LL = jitchol(Lambda)
|
||||
LLInv = dtrtri(LL)
|
||||
logdet_L = 2.*np.sum(np.log(np.diag(LL)))
|
||||
b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
|
||||
bbt = np.square(b).sum()
|
||||
v = dtrtrs(Lm,dtrtrs(LL,b,trans=1)[0],trans=1)[0]
|
||||
LmLLInv = LLInv.dot(LmInv)
|
||||
|
||||
tmp = -backsub_both_sides(LL, tdot(b)+output_dim*np.eye(input_dim), transpose='left')
|
||||
dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim), transpose='left')/2.
|
||||
b = psi1Y_full.dot(LmLLInv.T)
|
||||
bbt = np.square(b).sum()
|
||||
v = b.dot(LmLLInv).T
|
||||
LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
|
||||
|
||||
tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
|
||||
dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.
|
||||
|
||||
# Cache intermediate results
|
||||
self.midRes['dL_dpsi2R'] = dL_dpsi2R
|
||||
|
|
@ -201,7 +206,7 @@ class VarDTC_minibatch(LatentFunctionInference):
|
|||
# Compute dL_dKmm
|
||||
#======================================================================
|
||||
|
||||
dL_dKmm = dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT, transpose='left')/2.
|
||||
dL_dKmm = dL_dpsi2R - output_dim*LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.
|
||||
|
||||
#======================================================================
|
||||
# Compute the Posterior distribution of inducing points p(u|Y)
|
||||
|
|
|
|||
GPy/inference/latent_function_inference/var_gauss.py (new file, 69 lines)
@@ -0,0 +1,69 @@
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ...util.linalg import pdinv
from .posterior import Posterior
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)

class VarGauss(LatentFunctionInference):
    """
    The Variational Gaussian Approximation revisited

    @article{Opper:2009,
        title = {The Variational Gaussian Approximation Revisited},
        author = {Opper, Manfred and Archambeau, C{\'e}dric},
        journal = {Neural Comput.},
        year = {2009},
        pages = {786--792},
    }
    """
    def __init__(self, alpha, beta):
        """
        :param alpha: GPy.core.Param varational parameter
        :param beta: GPy.core.Param varational parameter
        """
        self.alpha, self.beta = alpha, beta

    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
        if mean_function is not None:
            raise NotImplementedError
        num_data, output_dim = Y.shape
        assert output_dim ==1, "Only one output supported"

        K = kern.K(X)
        m = K.dot(self.alpha)
        KB = K*self.beta[:, None]
        BKB = KB*self.beta[None, :]
        A = np.eye(num_data) + BKB
        Ai, LA, _, Alogdet = pdinv(A)
        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :] # posterior coavairance: need full matrix for gradients
        var = np.diag(Sigma).reshape(-1,1)

        F, dF_dm, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, m, var, Y_metadata=Y_metadata)
        if dF_dthetaL is not None:
            dL_dthetaL = dF_dthetaL.sum(1).sum(1)
        else:
            dL_dthetaL = np.array([])
        dF_da = np.dot(K, dF_dm)
        SigmaB = Sigma*self.beta
        #dF_db_ = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
        dF_db = -2*np.sum(Sigma**2 * (dF_dv * self.beta), 0)
        #assert np.allclose(dF_db, dF_db_)

        KL = 0.5*(Alogdet + np.trace(Ai) - num_data + np.sum(m*self.alpha))
        dKL_da = m
        A_A2 = Ai - Ai.dot(Ai)
        dKL_db = np.diag(np.dot(KB.T, A_A2))
        log_marginal = F.sum() - KL
        self.alpha.gradient = dF_da - dKL_da
        self.beta.gradient = dF_db - dKL_db

        # K-gradients
        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
        tmp = Ai*self.beta[:, None]/self.beta[None, :]
        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)

        return Posterior(mean=m, cov=Sigma ,K=K),\
               log_marginal,\
               {'dL_dK':dF_dK-dKL_dK, 'dL_dthetaL':dL_dthetaL}
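The posterior covariance used above, Sigma = diag(beta**-2) - A^{-1}/(beta beta^T) with A = I + BKB, is the Opper & Archambeau form of (K^{-1} + diag(beta**2))^{-1}. A short numpy check of that identity with a toy K and beta (not GPy code):

import numpy as np

n = 5
L = np.random.randn(n, n)
K = L.dot(L.T) + n * np.eye(n)            # toy positive-definite "kernel" matrix
beta = np.abs(np.random.randn(n)) + 0.5   # toy variational parameter

A = np.eye(n) + (K * beta[:, None]) * beta[None, :]           # I + B K B
Ai = np.linalg.inv(A)
Sigma = np.diag(beta ** -2) - Ai / beta[:, None] / beta[None, :]

# same matrix computed directly from the standard form
Sigma_direct = np.linalg.inv(np.linalg.inv(K) + np.diag(beta ** 2))
assert np.allclose(Sigma, Sigma_direct)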
@ -1 +1,2 @@
|
|||
from .hmc import HMC
|
||||
from .samplers import *
|
||||
|
|
|
|||
|
|
@ -1,14 +1,10 @@
|
|||
# ## Copyright (c) 2014, Zhenwen Dai
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg, optimize
|
||||
import Tango
|
||||
import sys
|
||||
import re
|
||||
import numdifftools as ndt
|
||||
import pdb
|
||||
|
||||
|
||||
try:
|
||||
#In Python 2, cPickle is faster. It does not exist in Python 3 but the underlying code is always used
|
||||
|
|
@ -22,11 +18,11 @@ class Metropolis_Hastings:
|
|||
def __init__(self,model,cov=None):
|
||||
"""Metropolis Hastings, with tunings according to Gelman et al. """
|
||||
self.model = model
|
||||
current = self.model._get_params_transformed()
|
||||
current = self.model.optimizer_array
|
||||
self.D = current.size
|
||||
self.chains = []
|
||||
if cov is None:
|
||||
self.cov = model.Laplace_covariance()
|
||||
self.cov = np.eye(self.D)
|
||||
else:
|
||||
self.cov = cov
|
||||
self.scale = 2.4/np.sqrt(self.D)
|
||||
|
|
@ -37,20 +33,20 @@ class Metropolis_Hastings:
|
|||
if start is None:
|
||||
self.model.randomize()
|
||||
else:
|
||||
self.model._set_params_transformed(start)
|
||||
self.model.optimizer_array = start
|
||||
|
||||
|
||||
|
||||
def sample(self, Ntotal, Nburn, Nthin, tune=True, tune_throughout=False, tune_interval=400):
|
||||
current = self.model._get_params_transformed()
|
||||
fcurrent = self.model.log_likelihood() + self.model.log_prior()
|
||||
def sample(self, Ntotal=10000, Nburn=1000, Nthin=10, tune=True, tune_throughout=False, tune_interval=400):
|
||||
current = self.model.optimizer_array
|
||||
fcurrent = self.model.log_likelihood() + self.model.log_prior() + \
|
||||
self.model._log_det_jacobian()
|
||||
accepted = np.zeros(Ntotal,dtype=np.bool)
|
||||
for it in range(Ntotal):
|
||||
print("sample %d of %d\r"%(it,Ntotal), end=' ')
|
||||
print("sample %d of %d\r"%(it,Ntotal),end="\t")
|
||||
sys.stdout.flush()
|
||||
prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
|
||||
self.model._set_params_transformed(prop)
|
||||
fprop = self.model.log_likelihood() + self.model.log_prior()
|
||||
self.model.optimizer_array = prop
|
||||
fprop = self.model.log_likelihood() + self.model.log_prior() + \
|
||||
self.model._log_det_jacobian()
|
||||
|
||||
if fprop>fcurrent:#sample accepted, going 'uphill'
|
||||
accepted[it] = True
|
||||
|
|
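For orientation, the accept/reject rule used in this sampler (propose from a scaled multivariate normal, always accept uphill moves, accept downhill moves with probability exp(fprop - fcurrent)) looks roughly like the following numpy sketch on a toy 2-D Gaussian target. The 2.4/sqrt(D) proposal scale is the Gelman et al. tuning mentioned in the constructor; everything else here is illustrative, not GPy code.

import numpy as np

def log_target(x):
    return -0.5 * np.sum(x ** 2)          # toy log-density (standard normal)

D = 2
scale = 2.4 / np.sqrt(D)                  # Gelman et al. default scaling
cov = np.eye(D)
current = np.zeros(D)
fcurrent = log_target(current)
chain = []

for it in range(1000):
    prop = np.random.multivariate_normal(current, cov * scale * scale)
    fprop = log_target(prop)
    # accept uphill moves always, downhill moves with probability exp(fprop - fcurrent)
    if fprop > fcurrent or np.random.rand() < np.exp(fprop - fcurrent):
        current, fcurrent = prop, fprop
    chain.append(current.copy())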
@ -78,10 +74,11 @@ class Metropolis_Hastings:
|
|||
|
||||
def predict(self,function,args):
|
||||
"""Make a prediction for the function, to which we will pass the additional arguments"""
|
||||
param = self.model._get_params()
|
||||
param = self.model.param_array
|
||||
fs = []
|
||||
for p in self.chain:
|
||||
self.model._set_params(p)
|
||||
self.model.param_array = p
|
||||
fs.append(function(*args))
|
||||
self.model._set_params(param)# reset model to starting state
|
||||
# reset model to starting state
|
||||
self.model.param_array = param
|
||||
return fs
|
||||
|
|
|
|||
|
|
@ -38,16 +38,17 @@ class SparseGPMissing(StochasticStorage):
|
|||
import numpy as np
|
||||
self.Y = model.Y_normalized
|
||||
bdict = {}
|
||||
#For N > 1000 array2string default crops
|
||||
opt = np.get_printoptions()
|
||||
np.set_printoptions(threshold=np.inf)
|
||||
for d in range(self.Y.shape[1]):
|
||||
inan = np.isnan(self.Y[:, d])
|
||||
arr_str = np.array2string(inan,
|
||||
np.inf, 0,
|
||||
True, '',
|
||||
formatter={'bool':lambda x: '1' if x else '0'})
|
||||
inan = np.isnan(self.Y)[:, d]
|
||||
arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
|
||||
try:
|
||||
bdict[arr_str][0].append(d)
|
||||
except:
|
||||
bdict[arr_str] = [[d], ~inan]
|
||||
np.set_printoptions(**opt)
|
||||
self.d = bdict.values()
|
||||
|
||||
class SparseGPStochastics(StochasticStorage):
|
||||
|
|
@ -55,32 +56,36 @@ class SparseGPStochastics(StochasticStorage):
|
|||
For the sparse gp we need to store the dimension we are in,
|
||||
and the indices corresponding to those
|
||||
"""
|
||||
def __init__(self, model, batchsize=1):
|
||||
def __init__(self, model, batchsize=1, missing_data=True):
|
||||
self.batchsize = batchsize
|
||||
self.output_dim = model.Y.shape[1]
|
||||
self.Y = model.Y_normalized
|
||||
self.missing_data = missing_data
|
||||
self.reset()
|
||||
self.do_stochastics()
|
||||
|
||||
def do_stochastics(self):
|
||||
import numpy as np
|
||||
if self.batchsize == 1:
|
||||
self.current_dim = (self.current_dim+1)%self.output_dim
|
||||
self.d = [[[self.current_dim], np.isnan(self.Y[:, self.d])]]
|
||||
self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
|
||||
else:
|
||||
import numpy as np
|
||||
self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
|
||||
bdict = {}
|
||||
for d in self.d:
|
||||
inan = np.isnan(self.Y[:, d])
|
||||
arr_str = int(np.array2string(inan,
|
||||
np.inf, 0,
|
||||
True, '',
|
||||
formatter={'bool':lambda x: '1' if x else '0'}), 2)
|
||||
try:
|
||||
bdict[arr_str][0].append(d)
|
||||
except:
|
||||
bdict[arr_str] = [[d], ~inan]
|
||||
self.d = bdict.values()
|
||||
if self.missing_data:
|
||||
opt = np.get_printoptions()
|
||||
np.set_printoptions(threshold=np.inf)
|
||||
for d in self.d:
|
||||
inan = np.isnan(self.Y[:, d])
|
||||
arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'})
|
||||
try:
|
||||
bdict[arr_str][0].append(d)
|
||||
except:
|
||||
bdict[arr_str] = [[d], ~inan]
|
||||
np.set_printoptions(**opt)
|
||||
self.d = bdict.values()
|
||||
else:
|
||||
self.d = [[self.d, None]]
|
||||
|
||||
def reset(self):
|
||||
self.current_dim = -1
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from ._src.brownian import Brownian
|
|||
from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
|
||||
from ._src.mlp import MLP
|
||||
from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
|
||||
from ._src.standard_periodic import StdPeriodic
|
||||
from ._src.independent_outputs import IndependentOutputs, Hierarchical
|
||||
from ._src.coregionalize import Coregionalize
|
||||
from ._src.ODE_UY import ODE_UY
|
||||
|
|
@ -17,7 +18,7 @@ from ._src.eq_ode2 import EQ_ODE2
|
|||
from ._src.trunclinear import TruncLinear,TruncLinear_inf
|
||||
from ._src.splitKern import SplitKern,DEtime
|
||||
from ._src.splitKern import DEtime as DiffGenomeKern
|
||||
|
||||
|
||||
from ._src.spline import Spline
|
||||
from ._src.eq_ode2 import EQ_ODE2
|
||||
from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ class Add(CombinationKernel):
|
|||
|
||||
This kernel will take over the active dims of it's subkernels passed in.
|
||||
"""
|
||||
def __init__(self, subkerns, name='add'):
|
||||
def __init__(self, subkerns, name='sum'):
|
||||
for i, kern in enumerate(subkerns[:]):
|
||||
if isinstance(kern, Add):
|
||||
del subkerns[i]
|
||||
|
|
@ -72,15 +72,28 @@ class Add(CombinationKernel):
|
|||
[target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
|
||||
return target
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def gradients_XX(self, dL_dK, X, X2):
|
||||
if X2 is None:
|
||||
target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
|
||||
else:
|
||||
target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
|
||||
[target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts]
|
||||
return target
|
||||
|
||||
def gradients_XX_diag(self, dL_dKdiag, X):
|
||||
target = np.zeros(X.shape)
|
||||
[target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
|
||||
return target
|
||||
|
||||
@Cache_this(limit=1, force_kwargs=['which_parts'])
|
||||
def psi0(self, Z, variational_posterior):
|
||||
return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
@Cache_this(limit=1, force_kwargs=['which_parts'])
|
||||
def psi1(self, Z, variational_posterior):
|
||||
return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
@Cache_this(limit=1, force_kwargs=['which_parts'])
|
||||
def psi2(self, Z, variational_posterior):
|
||||
psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
|
||||
#return psi2
|
||||
|
|
@ -115,6 +128,41 @@ class Add(CombinationKernel):
|
|||
raise NotImplementedError("psi2 cannot be computed for this kernel")
|
||||
return psi2
|
||||
|
||||
@Cache_this(limit=1, force_kwargs=['which_parts'])
|
||||
def psi2n(self, Z, variational_posterior):
|
||||
psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))
|
||||
#return psi2
|
||||
# compute the "cross" terms
|
||||
from .static import White, Bias
|
||||
from .rbf import RBF
|
||||
#from rbf_inv import RBFInv
|
||||
from .linear import Linear
|
||||
#from fixed import Fixed
|
||||
|
||||
for p1, p2 in itertools.combinations(self.parts, 2):
|
||||
# i1, i2 = p1.active_dims, p2.active_dims
|
||||
# white doesn't combine with anything
|
||||
if isinstance(p1, White) or isinstance(p2, White):
|
||||
pass
|
||||
# rbf X bias
|
||||
#elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
|
||||
elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
|
||||
tmp = p2.psi1(Z, variational_posterior).sum(axis=0)
|
||||
psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
|
||||
#elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
|
||||
elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
|
||||
tmp = p1.psi1(Z, variational_posterior).sum(axis=0)
|
||||
psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
|
||||
elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
|
||||
assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
|
||||
tmp1 = p1.psi1(Z, variational_posterior)
|
||||
tmp2 = p2.psi1(Z, variational_posterior)
|
||||
psi2 += np.einsum('nm,no->nmo',tmp1,tmp2)+np.einsum('nm,no->nmo',tmp2,tmp1)
|
||||
#(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
|
||||
else:
|
||||
raise NotImplementedError("psi2 cannot be computed for this kernel")
|
||||
return psi2
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
from .static import White, Bias
|
||||
for p1 in self.parts:
|
||||
|
|
@ -126,9 +174,9 @@ class Add(CombinationKernel):
|
|||
if isinstance(p2, White):
|
||||
continue
|
||||
elif isinstance(p2, Bias):
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
|
||||
else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
|
||||
p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
|
||||
|
||||
def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
|
|
@ -143,9 +191,9 @@ class Add(CombinationKernel):
|
|||
if isinstance(p2, White):
|
||||
continue
|
||||
elif isinstance(p2, Bias):
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
|
||||
else:
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
|
||||
target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
|
||||
return target
|
||||
|
||||
|
|
@ -161,9 +209,9 @@ class Add(CombinationKernel):
|
|||
if isinstance(p2, White):
|
||||
continue
|
||||
elif isinstance(p2, Bias):
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
|
||||
else:
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
|
||||
grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
|
||||
[np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
|
||||
return target_grads
|
||||
|
|
|
|||
|
|
@ -6,7 +6,11 @@ import numpy as np
|
|||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
from ...util.config import config # for assessing whether to use cython
|
||||
from . import coregionalize_cython
|
||||
try:
|
||||
from . import coregionalize_cython
|
||||
config.set('cython', 'working', 'True')
|
||||
except ImportError:
|
||||
config.set('cython', 'working', 'False')
|
||||
|
||||
class Coregionalize(Kern):
|
||||
"""
|
||||
|
|
@ -94,7 +98,7 @@ class Coregionalize(Kern):
|
|||
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
|
||||
|
||||
|
||||
dkappa = np.diag(dL_dK_small)
|
||||
dkappa = np.diag(dL_dK_small).copy()
|
||||
dL_dK_small += dL_dK_small.T
|
||||
dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
|
||||
|
||||
|
|
@ -111,7 +115,7 @@ class Coregionalize(Kern):
|
|||
return dL_dK_small
|
||||
|
||||
def _gradient_reduce_cython(self, dL_dK, index, index2):
|
||||
index, index2 = index[:,0], index2[:,0]
|
||||
index, index2 = np.int64(index[:,0]), np.int64(index2[:,0])
|
||||
return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
|
||||
|
||||
|
||||
|
|
@ -126,4 +130,3 @@ class Coregionalize(Kern):
|
|||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@ -1,33 +1,37 @@
|
|||
#cython: boundscheck=True
|
||||
#cython: wraparound=True
|
||||
#cython: boundscheck=False
|
||||
#cython: wraparound=False
|
||||
#cython: nonecheck=False
|
||||
import cython
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
|
||||
cdef int N = X.size
|
||||
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, N))
|
||||
for n in range(N):
|
||||
for m in range(N):
|
||||
K[n,m] = B[X[n],X[m]]
|
||||
cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, N))
|
||||
with nogil:
|
||||
for n in range(N):
|
||||
for m in range(N):
|
||||
K[n, m] = B[X[n], X[m]]
|
||||
return K
|
||||
|
||||
def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
|
||||
cdef int N = X.size
|
||||
cdef int M = X2.size
|
||||
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, M))
|
||||
for n in range(N):
|
||||
for m in range(M):
|
||||
K[n,m] = B[X[n],X2[m]]
|
||||
cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, M))
|
||||
with nogil:
|
||||
for n in range(N):
|
||||
for m in range(M):
|
||||
K[n, m] = B[X[n], X2[m]]
|
||||
return K
|
||||
|
||||
def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
|
||||
cdef np.ndarray[np.double_t, ndim=2] dL_dK_small = np.zeros((D, D))
|
||||
cdef np.ndarray[np.double_t, ndim=2, mode='c'] dL_dK_small = np.zeros((D, D))
|
||||
cdef int N = index.size
|
||||
cdef int M = index2.size
|
||||
for i in range(N):
|
||||
for j in range(M):
|
||||
dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
|
||||
with nogil:
|
||||
for i in range(N):
|
||||
for j in range(M):
|
||||
dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
|
||||
return dL_dK_small
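
For reference, a rough numpy-only equivalent of what these Cython helpers compute; this is an illustrative sketch, not part of this changeset, and the function names are made up:

import numpy as np

def K_asymmetric_numpy(B, X, X2):
    # Illustrative sketch: K[n, m] = B[X[n], X2[m]], the same gather the Cython loop performs
    return B[np.ix_(X, X2)]

def gradient_reduce_numpy(D, dL_dK, index, index2):
    # Illustrative sketch: dL_dK_small[index2[j], index[i]] += dL_dK[i, j], accumulated over all (i, j)
    dL_dK_small = np.zeros((D, D))
    np.add.at(dL_dK_small, (index2[None, :], index[:, None]), dL_dK)
    return dL_dK_small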
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ class IndependentOutputs(CombinationKernel):
|
|||
if X2 is None:
|
||||
# TODO: make use of index_to_slices
|
||||
# FIXME: Broken as X is already sliced out
|
||||
print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
|
||||
# print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
|
||||
values = np.unique(X[:,self.index_dim])
|
||||
slices = [X[:,self.index_dim]==i for i in values]
|
||||
[target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
|
||||
|
|
|
|||
|
|
@ -58,20 +58,9 @@ class Kern(Parameterized):
|
|||
|
||||
self._sliced_X = 0
|
||||
self.useGPU = self._support_GPU and useGPU
|
||||
self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
|
||||
|
||||
@property
|
||||
def return_psi2_n(self):
|
||||
"""
|
||||
Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
|
||||
"""
|
||||
return self._return_psi2_n_flag[0]
|
||||
@return_psi2_n.setter
|
||||
def return_psi2_n(self, val):
|
||||
def visit(self):
|
||||
if isinstance(self, Kern):
|
||||
self._return_psi2_n_flag[0]=val
|
||||
self.traverse(visit)
|
||||
from .psi_comp import PSICOMP_GH
|
||||
self.psicomp = PSICOMP_GH()
|
||||
|
||||
@Cache_this(limit=20)
|
||||
def _slice_X(self, X):
|
||||
|
|
@ -81,6 +70,9 @@ class Kern(Parameterized):
|
|||
"""
|
||||
Compute the kernel function.
|
||||
|
||||
.. math::
|
||||
K_{ij} = k(X_i, X_j)
|
||||
|
||||
:param X: the first set of inputs to the kernel
|
||||
:param X2: (optional) the second set of arguments to the kernel. If X2
|
||||
is None, this is passed through to the 'part' object, which
|
||||
|
|
@ -88,16 +80,64 @@ class Kern(Parameterized):
|
|||
"""
|
||||
raise NotImplementedError
|
||||
def Kdiag(self, X):
|
||||
"""
|
||||
The diagonal of the kernel matrix K
|
||||
|
||||
.. math::
|
||||
Kdiag_{i} = k(X_i, X_i)
|
||||
"""
|
||||
raise NotImplementedError
|
||||
def psi0(self, Z, variational_posterior):
|
||||
raise NotImplementedError
|
||||
"""
|
||||
.. math::
|
||||
\psi_0 = \sum_{i=0}^{n}E_{q(X)}[k(X_i, X_i)]
|
||||
"""
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
|
||||
def psi1(self, Z, variational_posterior):
|
||||
raise NotImplementedError
|
||||
"""
|
||||
.. math::
|
||||
\psi_1^{n,m} = E_{q(X)}[k(X_n, Z_m)]
|
||||
"""
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
|
||||
def psi2(self, Z, variational_posterior):
|
||||
raise NotImplementedError
|
||||
"""
|
||||
.. math::
|
||||
\psi_2^{m,m'} = \sum_{i=0}^{n}E_{q(X)}[ k(Z_m, X_i) k(X_i, Z_{m'})]
|
||||
"""
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
|
||||
def psi2n(self, Z, variational_posterior):
|
||||
"""
|
||||
.. math::
|
||||
\psi_2^{n,m,m'} = E_{q(X)}[ k(Z_m, X_n) k(X_n, Z_{m'})]
|
||||
|
||||
Thus, we do not sum out n, compared to psi2
|
||||
"""
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
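
As an aside, the psi statistics defined above can be sanity-checked by simple Monte Carlo sampling from q(X); the sketch below is illustrative only (not part of this changeset) and assumes a factorised Gaussian posterior with means mu and variances S:

import numpy as np

def psi_stats_mc(kern, Z, mu, S, samples=1000):
    # Illustrative sketch: Monte-Carlo estimates of psi0, psi1, psi2 under q(X) = N(mu, S)
    N, Q = mu.shape
    psi0 = np.zeros(N)
    psi1 = np.zeros((N, Z.shape[0]))
    psi2 = np.zeros((Z.shape[0], Z.shape[0]))
    for _ in range(samples):
        X = mu + np.sqrt(S) * np.random.randn(N, Q)
        Kfu = kern.K(X, Z)
        psi0 += kern.Kdiag(X)
        psi1 += Kfu
        psi2 += Kfu.T.dot(Kfu)  # sums over n, matching psi2 (keep per-point outer products for psi2n)
    return psi0 / samples, psi1 / samples, psi2 / samples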
|
||||
def gradients_X(self, dL_dK, X, X2):
|
||||
"""
|
||||
.. math::
|
||||
|
||||
\\frac{\partial L}{\partial X} = \\frac{\partial L}{\partial K}\\frac{\partial K}{\partial X}
|
||||
"""
|
||||
raise NotImplementedError
|
||||
def gradients_X_X2(self, dL_dK, X, X2):
|
||||
return self.gradients_X(dL_dK, X, X2), self.gradients_X(dL_dK.T, X2, X)
|
||||
def gradients_XX(self, dL_dK, X, X2):
|
||||
"""
|
||||
.. math::
|
||||
|
||||
\\frac{\partial^2 L}{\partial X\partial X_2} = \\frac{\partial L}{\partial K}\\frac{\partial^2 K}{\partial X\partial X_2}
|
||||
"""
|
||||
raise NotImplementedError("This is the second derivative of K wrt X and X2, and not implemented for this kernel")
|
||||
def gradients_XX_diag(self, dL_dKdiag, X):
|
||||
"""
|
||||
The diagonal of the second derivative w.r.t. X and X2
|
||||
"""
|
||||
raise NotImplementedError("This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel")
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
"""
|
||||
The diagonal of the derivative w.r.t. X
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
|
|
@ -113,27 +153,35 @@ class Kern(Parameterized):
|
|||
Set the gradients of all parameters when doing inference with
|
||||
uncertain inputs, using expectations of the kernel.
|
||||
|
||||
The esential maths is
|
||||
The essential maths is
|
||||
|
||||
dL_d{theta_i} = dL_dpsi0 * dpsi0_d{theta_i} +
|
||||
dL_dpsi1 * dpsi1_d{theta_i} +
|
||||
dL_dpsi2 * dpsi2_d{theta_i}
|
||||
.. math::
|
||||
|
||||
\\frac{\partial L}{\partial \\theta_i} & = \\frac{\partial L}{\partial \psi_0}\\frac{\partial \psi_0}{\partial \\theta_i}\\
|
||||
& \quad + \\frac{\partial L}{\partial \psi_1}\\frac{\partial \psi_1}{\partial \\theta_i}\\
|
||||
& \quad + \\frac{\partial L}{\partial \psi_2}\\frac{\partial \psi_2}{\partial \\theta_i}
|
||||
|
||||
Thus, we push the different derivatives through the gradients of the psi
|
||||
statistics. Be sure to set the gradients for all kernel
|
||||
parameters here.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
|
||||
self.gradient[:] = dtheta
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
|
||||
psi0=None, psi1=None, psi2=None):
|
||||
"""
|
||||
Returns the derivative of the objective wrt Z, using the chain rule
|
||||
through the expectation variables.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
"""
|
||||
Compute the gradients wrt the parameters of the variational
|
||||
distruibution q(X), chain-ruling via the expectations of the kernel
|
||||
"""
|
||||
raise NotImplementedError
|
||||
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]
|
||||
|
||||
def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
|
||||
"""
|
||||
|
|
@ -172,7 +220,7 @@ class Kern(Parameterized):
|
|||
def __iadd__(self, other):
|
||||
return self.add(other)
|
||||
|
||||
def add(self, other, name='add'):
|
||||
def add(self, other, name='sum'):
|
||||
"""
|
||||
Add another kernel to this one.
|
||||
|
||||
|
|
@ -208,8 +256,6 @@ class Kern(Parameterized):
|
|||
|
||||
:param other: the other kernel to be added
|
||||
:type other: GPy.kern
|
||||
:param tensor: whether or not to use the tensor space (default is false).
|
||||
:type tensor: bool
|
||||
|
||||
"""
|
||||
assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
|
||||
|
|
|
|||
|
|
@ -1,7 +1,11 @@
|
|||
'''
|
||||
Created on 11 Mar 2014
|
||||
|
||||
@author: maxz
|
||||
@author: @mzwiessele
|
||||
|
||||
This module provides a meta class for the kernels. The meta class is for
|
||||
slicing the inputs (X, X2) for the kernels, before K (or any other method involving X)
|
||||
gets called. The `active_dims` of a kernel decide which dimensions the kernel works on.
|
||||
'''
|
||||
from ...core.parameterization.parameterized import ParametersChangedMeta
|
||||
import numpy as np
|
||||
|
|
@ -19,20 +23,27 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
|
|||
put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
|
||||
put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
|
||||
put_clean(dct, 'gradients_X', _slice_gradients_X)
|
||||
put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
|
||||
put_clean(dct, 'gradients_XX', _slice_gradients_XX)
|
||||
put_clean(dct, 'gradients_XX_diag', _slice_gradients_X_diag)
|
||||
put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
|
||||
|
||||
put_clean(dct, 'psi0', _slice_psi)
|
||||
put_clean(dct, 'psi1', _slice_psi)
|
||||
put_clean(dct, 'psi2', _slice_psi)
|
||||
put_clean(dct, 'psi2n', _slice_psi)
|
||||
put_clean(dct, 'update_gradients_expectations', _slice_update_gradients_expectations)
|
||||
put_clean(dct, 'gradients_Z_expectations', _slice_gradients_Z_expectations)
|
||||
put_clean(dct, 'gradients_qX_expectations', _slice_gradients_qX_expectations)
|
||||
return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)
|
||||
|
||||
class _Slice_wrap(object):
|
||||
def __init__(self, k, X, X2=None):
|
||||
def __init__(self, k, X, X2=None, ret_shape=None):
|
||||
self.k = k
|
||||
self.shape = X.shape
|
||||
if ret_shape is None:
|
||||
self.shape = X.shape
|
||||
else:
|
||||
self.shape = ret_shape
|
||||
assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
|
||||
if X2 is not None:
|
||||
assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
|
||||
|
|
@ -54,7 +65,10 @@ class _Slice_wrap(object):
|
|||
def handle_return_array(self, return_val):
|
||||
if self.ret:
|
||||
ret = np.zeros(self.shape)
|
||||
ret[:, self.k.active_dims] = return_val
|
||||
if len(self.shape) == 2:
|
||||
ret[:, self.k.active_dims] = return_val
|
||||
elif len(self.shape) == 3:
|
||||
ret[:, :, self.k.active_dims] = return_val
|
||||
return ret
|
||||
return return_val
|
||||
|
||||
|
|
@ -98,6 +112,19 @@ def _slice_gradients_X(f):
|
|||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_gradients_XX(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dK, X, X2=None):
|
||||
if X2 is None:
|
||||
N, M = X.shape[0], X.shape[0]
|
||||
else:
|
||||
N, M = X.shape[0], X2.shape[0]
|
||||
with _Slice_wrap(self, X, X2, ret_shape=(N, M, X.shape[1])) as s:
|
||||
#with _Slice_wrap(self, X, X2, ret_shape=None) as s:
|
||||
ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_gradients_X_diag(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dKdiag, X):
|
||||
|
|
@ -124,7 +151,8 @@ def _slice_update_gradients_expectations(f):
|
|||
|
||||
def _slice_gradients_Z_expectations(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
|
||||
psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
|
||||
with _Slice_wrap(self, Z, variational_posterior) as s:
|
||||
ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
|
||||
return ret
|
||||
|
|
@ -132,7 +160,8 @@ def _slice_gradients_Z_expectations(f):
|
|||
|
||||
def _slice_gradients_qX_expectations(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
|
||||
psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
|
||||
with _Slice_wrap(self, variational_posterior, Z) as s:
|
||||
ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X))
|
||||
r2 = ret[:2]
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ class Linear(Kern):
|
|||
|
||||
.. math::
|
||||
|
||||
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i x_iy_i
|
||||
k(x,y) = \sum_{i=1}^{\\text{input_dim}} \sigma^2_i x_iy_i
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
|
|
@ -100,6 +100,12 @@ class Linear(Kern):
|
|||
#return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
|
||||
return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
|
||||
|
||||
def gradients_XX(self, dL_dK, X, X2=None):
|
||||
if X2 is None:
|
||||
return 2*np.ones(X.shape)*self.variances
|
||||
else:
|
||||
return np.ones(X.shape)*self.variances
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return 2.*self.variances*dL_dKdiag[:,None]*X
|
||||
|
||||
|
|
@ -111,26 +117,29 @@ class Linear(Kern):
|
|||
#---------------------------------------#
|
||||
|
||||
def psi0(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0]
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
|
||||
|
||||
def psi1(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1]
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2]
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior)[2]
|
||||
|
||||
def psi2n(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0]
|
||||
dL_dvar = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
|
||||
if self.ARD:
|
||||
self.variances.gradient = dL_dvar
|
||||
else:
|
||||
self.variances.gradient = dL_dvar.sum()
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1]
|
||||
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:]
|
||||
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]
|
||||
|
||||
class LinearFull(Kern):
|
||||
def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ from .kern import Kern
|
|||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
import numpy as np
|
||||
from ...util.linalg import tdot
|
||||
from ...util.caching import Cache_this
|
||||
four_over_tau = 2./np.pi
|
||||
|
||||
class MLP(Kern):
|
||||
|
|
@ -31,105 +33,116 @@ class MLP(Kern):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=100., active_dims=None, name='mlp'):
|
||||
def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=1., ARD=False, active_dims=None, name='mlp'):
|
||||
super(MLP, self).__init__(input_dim, active_dims, name)
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
self.ARD= ARD
|
||||
if ARD:
|
||||
wv = np.empty((input_dim,))
|
||||
wv[:] = weight_variance
|
||||
weight_variance = wv
|
||||
self.weight_variance = Param('weight_variance', weight_variance, Logexp())
|
||||
self.bias_variance = Param('bias_variance', bias_variance, Logexp())
|
||||
self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
|
||||
|
||||
|
||||
@Cache_this(limit=20, ignore_args=())
|
||||
def K(self, X, X2=None):
|
||||
self._K_computations(X, X2)
|
||||
return self.variance*self._K_dvar
|
||||
if X2 is None:
|
||||
X_denom = np.sqrt(self._comp_prod(X)+1.)
|
||||
X2_denom = X_denom
|
||||
X2 = X
|
||||
else:
|
||||
X_denom = np.sqrt(self._comp_prod(X)+1.)
|
||||
X2_denom = np.sqrt(self._comp_prod(X2)+1.)
|
||||
XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
|
||||
return self.variance*four_over_tau*np.arcsin(XTX)
|
||||
|
||||
@Cache_this(limit=20, ignore_args=())
|
||||
def Kdiag(self, X):
|
||||
"""Compute the diagonal of the covariance matrix for X."""
|
||||
self._K_diag_computations(X)
|
||||
return self.variance*self._K_diag_dvar
|
||||
X_prod = self._comp_prod(X)
|
||||
return self.variance*four_over_tau*np.arcsin(X_prod/(X_prod+1.))
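
The closed form implemented by K and Kdiag above is the arcsin (MLP / neural-network) covariance; a compact numpy restatement for the non-ARD case is sketched below, illustrative only and not part of this changeset:

import numpy as np

def mlp_cov(X, X2, variance=1., weight_variance=1., bias_variance=1.):
    # Illustrative sketch:
    # k(x, x') = var * (2/pi) * arcsin( (w x.x' + b) / sqrt((w x.x + b + 1)(w x'.x' + b + 1)) )
    # note that four_over_tau above equals 2/pi, since tau = 2*pi
    prod = lambda A, B: weight_variance * A.dot(B.T) + bias_variance
    denom = np.sqrt(np.diag(prod(X, X)) + 1.)[:, None] * np.sqrt(np.diag(prod(X2, X2)) + 1.)[None, :]
    return variance * (2. / np.pi) * np.arcsin(prod(X, X2) / denom)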
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""Derivative of the covariance with respect to the parameters."""
|
||||
self._K_computations(X, X2)
|
||||
self.variance.gradient = np.sum(self._K_dvar*dL_dK)
|
||||
|
||||
denom3 = self._K_denom**3
|
||||
base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
|
||||
base_cov_grad = base*dL_dK
|
||||
|
||||
if X2 is None:
|
||||
vec = np.diag(self._K_inner_prod)
|
||||
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
|
||||
+np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
|
||||
self.bias_variance.gradient = ((1./self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*((vec[None, :]+vec[:, None])*self.weight_variance
|
||||
+2.*self.bias_variance + 2.))*base_cov_grad).sum()
|
||||
else:
|
||||
vec1 = (X*X).sum(1)
|
||||
vec2 = (X2*X2).sum(1)
|
||||
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
|
||||
self.bias_variance.gradient = ((1./self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*((vec1[:, None]+vec2[None, :])*self.weight_variance
|
||||
+ 2*self.bias_variance + 2.))*base_cov_grad).sum()
|
||||
dvar, dw, db = self._comp_grads(dL_dK, X, X2)[:3]
|
||||
self.variance.gradient = dvar
|
||||
self.weight_variance.gradient = dw
|
||||
self.bias_variance.gradient = db
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
self._K_diag_computations(X)
|
||||
self.variance.gradient = np.sum(self._K_diag_dvar*dL_dKdiag)
|
||||
|
||||
base = four_over_tau*self.variance/np.sqrt(1-self._K_diag_asin_arg*self._K_diag_asin_arg)
|
||||
base_cov_grad = base*dL_dKdiag/np.square(self._K_diag_denom)
|
||||
|
||||
self.weight_variance.gradient = (base_cov_grad*np.square(X).sum(axis=1)).sum()
|
||||
self.bias_variance.gradient = base_cov_grad.sum()
|
||||
dvar, dw, db = self._comp_grads_diag(dL_dKdiag, X)[:3]
|
||||
self.variance.gradient = dvar
|
||||
self.weight_variance.gradient = dw
|
||||
self.bias_variance.gradient = db
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2):
|
||||
"""Derivative of the covariance matrix with respect to X"""
|
||||
self._K_computations(X, X2)
|
||||
arg = self._K_asin_arg
|
||||
numer = self._K_numer
|
||||
denom = self._K_denom
|
||||
denom3 = denom*denom*denom
|
||||
if X2 is not None:
|
||||
vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
|
||||
return four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
|
||||
else:
|
||||
vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
|
||||
return 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
|
||||
return self._comp_grads(dL_dK, X, X2)[3]
|
||||
|
||||
def gradients_X_X2(self, dL_dK, X, X2):
|
||||
"""Derivative of the covariance matrix with respect to X"""
|
||||
return self._comp_grads(dL_dK, X, X2)[3:]
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
"""Gradient of diagonal of covariance with respect to X"""
|
||||
self._K_diag_computations(X)
|
||||
arg = self._K_diag_asin_arg
|
||||
denom = self._K_diag_denom
|
||||
#numer = self._K_diag_numer
|
||||
return four_over_tau*2.*self.weight_variance*self.variance*X*(1./denom*(1. - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
|
||||
return self._comp_grads_diag(dL_dKdiag, X)[3]
|
||||
|
||||
|
||||
def _K_computations(self, X, X2):
|
||||
"""Pre-computations for the covariance matrix (used for computing the covariance and its gradients."""
|
||||
@Cache_this(limit=50, ignore_args=())
|
||||
def _comp_prod(self, X, X2=None):
|
||||
if X2 is None:
|
||||
self._K_inner_prod = np.dot(X,X.T)
|
||||
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
|
||||
vec = np.diag(self._K_numer) + 1.
|
||||
self._K_denom = np.sqrt(np.outer(vec,vec))
|
||||
return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
|
||||
else:
|
||||
self._K_inner_prod = np.dot(X,X2.T)
|
||||
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
|
||||
vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
|
||||
vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
|
||||
self._K_denom = np.sqrt(np.outer(vec1,vec2))
|
||||
self._K_asin_arg = self._K_numer/self._K_denom
|
||||
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
|
||||
return (X*self.weight_variance).dot(X2.T)+self.bias_variance
|
||||
|
||||
def _K_diag_computations(self, X):
|
||||
"""Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients)."""
|
||||
self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
|
||||
self._K_diag_denom = self._K_diag_numer+1.
|
||||
self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
|
||||
self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
|
||||
@Cache_this(limit=20, ignore_args=(1,))
|
||||
def _comp_grads(self, dL_dK, X, X2=None):
|
||||
var,w,b = self.variance, self.weight_variance, self.bias_variance
|
||||
K = self.K(X, X2)
|
||||
dvar = (dL_dK*K).sum()/var
|
||||
X_prod = self._comp_prod(X)
|
||||
X2_prod = self._comp_prod(X2) if X2 is not None else X_prod
|
||||
XTX = self._comp_prod(X,X2) if X2 is not None else self._comp_prod(X, X)
|
||||
common = var*four_over_tau/np.sqrt((X_prod[:,None]+1.)*(X2_prod[None,:]+1.)-np.square(XTX))*dL_dK
|
||||
if self.ARD:
|
||||
if X2 is not None:
|
||||
XX2 = X[:,None,:]*X2[None,:,:] if X2 is not None else X[:,None,:]*X[None,:,:]
|
||||
XX = np.square(X)
|
||||
X2X2 = np.square(X2)
|
||||
Q = self.weight_variance.shape[0]
|
||||
common_XTX = common*XTX
|
||||
dw = np.dot(common.flat,XX2.reshape(-1,Q)) -( (common_XTX.sum(1)/(X_prod+1.)).T.dot(XX)+(common_XTX.sum(0)/(X2_prod+1.)).dot(X2X2))/2
|
||||
else:
|
||||
XX2 = X[:,None,:]*X[None,:,:]
|
||||
XX = np.square(X)
|
||||
Q = self.weight_variance.shape[0]
|
||||
common_XTX = common*XTX
|
||||
dw = np.dot(common.flat,XX2.reshape(-1,Q)) - ((common_XTX.sum(0)+common_XTX.sum(1))/(X_prod+1.)).dot(XX)/2
|
||||
else:
|
||||
dw = (common*((XTX-b)/w-XTX*(((X_prod-b)/(w*(X_prod+1.)))[:,None]+((X2_prod-b)/(w*(X2_prod+1.)))[None,:])/2.)).sum()
|
||||
db = (common*(1.-XTX*(1./(X_prod[:,None]+1.)+1./(X2_prod[None,:]+1.))/2.)).sum()
|
||||
if X2 is None:
|
||||
common = common+common.T
|
||||
dX = common.dot(X)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:,None]*X*w
|
||||
dX2 = dX
|
||||
else:
|
||||
dX = common.dot(X2)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:,None]*X*w
|
||||
dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
|
||||
return dvar, dw, db, dX, dX2
|
||||
|
||||
@Cache_this(limit=20, ignore_args=(1,))
|
||||
def _comp_grads_diag(self, dL_dKdiag, X):
|
||||
var,w,b = self.variance, self.weight_variance, self.bias_variance
|
||||
K = self.Kdiag(X)
|
||||
dvar = (dL_dKdiag*K).sum()/var
|
||||
X_prod = self._comp_prod(X)
|
||||
common = var*four_over_tau/(np.sqrt(1-np.square(X_prod/(X_prod+1)))*np.square(X_prod+1))*dL_dKdiag
|
||||
if self.ARD:
|
||||
XX = np.square(X)
|
||||
dw = np.dot(common,XX)
|
||||
else:
|
||||
dw = (common*(X_prod-b)).sum()/w
|
||||
db = common.sum()
|
||||
dX = common[:,None]*X*w*2
|
||||
return dvar, dw, db, dX
|
||||
|
|
|
|||
|
|
@ -27,8 +27,6 @@ class Prod(CombinationKernel):
|
|||
|
||||
:param k1, k2: the kernels to multiply
|
||||
:type k1, k2: Kern
|
||||
:param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
|
||||
:type tensor: Boolean
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -9,18 +9,34 @@ from . import ssrbf_psi_comp
|
|||
from . import sslinear_psi_comp
|
||||
from . import linear_psi_comp
|
||||
|
||||
class PSICOMP_RBF(Pickleable):
|
||||
@Cache_this(limit=2, ignore_args=(0,))
|
||||
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||
|
||||
class PSICOMP(Pickleable):
|
||||
|
||||
def psicomputations(self, kern, Z, qX, return_psi2_n=False):
|
||||
raise NotImplementedError("Abstract method!")
|
||||
|
||||
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
|
||||
raise NotImplementedError("Abstract method!")
|
||||
|
||||
def _setup_observers(self):
|
||||
pass
|
||||
|
||||
from .gaussherm import PSICOMP_GH
|
||||
|
||||
class PSICOMP_RBF(PSICOMP):
|
||||
@Cache_this(limit=5, ignore_args=(0,))
|
||||
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
|
||||
variance, lengthscale = kern.variance, kern.lengthscale
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
|
||||
return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=return_psi2_n)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
|
||||
else:
|
||||
raise ValueError("unknown distribution received for psi-statistics")
|
||||
|
||||
@Cache_this(limit=2, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
@Cache_this(limit=5, ignore_args=(0,2,3,4))
|
||||
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
variance, lengthscale = kern.variance, kern.lengthscale
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
|
|
@ -28,28 +44,26 @@ class PSICOMP_RBF(Pickleable):
|
|||
else:
|
||||
raise ValueError("unknown distribution received for psi-statistics")
|
||||
|
||||
def _setup_observers(self):
|
||||
pass
|
||||
class PSICOMP_Linear(PSICOMP):
|
||||
|
||||
class PSICOMP_Linear(Pickleable):
|
||||
|
||||
@Cache_this(limit=2, ignore_args=(0,))
|
||||
def psicomputations(self, variance, Z, variational_posterior):
|
||||
@Cache_this(limit=5, ignore_args=(0,))
|
||||
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
|
||||
variances = kern.variances
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
|
||||
return linear_psi_comp.psicomputations(variances, Z, variational_posterior, return_psi2_n=return_psi2_n)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
|
||||
return sslinear_psi_comp.psicomputations(variances, Z, variational_posterior)
|
||||
else:
|
||||
raise ValueError("unknown distriubtion received for psi-statistics")
|
||||
|
||||
@Cache_this(limit=2, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
|
||||
@Cache_this(limit=2, ignore_args=(0,2,3,4))
|
||||
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
variances = kern.variances
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
|
||||
return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variances, Z, variational_posterior)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
|
||||
return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variances, Z, variational_posterior)
|
||||
else:
|
||||
raise ValueError("unknown distribution received for psi-statistics")
|
||||
|
||||
def _setup_observers(self):
|
||||
pass
|
||||
|
||||
|
|
|
|||
100
GPy/kern/_src/psi_comp/gaussherm.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
"""
|
||||
An approximate psi-statistics implementation based on Gauss-Hermite quadrature
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ....core.parameterization import Param
|
||||
from GPy.util.caching import Cache_this
|
||||
from ....util.linalg import tdot
|
||||
from . import PSICOMP
|
||||
|
||||
class PSICOMP_GH(PSICOMP):
|
||||
"""
|
||||
TODO: support Psi2 with shape NxMxM
|
||||
"""
|
||||
|
||||
def __init__(self, degree=5, cache_K=True):
|
||||
self.degree = degree
|
||||
self.cache_K = cache_K
|
||||
self.locs, self.weights = np.polynomial.hermite.hermgauss(degree)
|
||||
self.locs *= np.sqrt(2.)
|
||||
self.weights*= 1./np.sqrt(np.pi)
|
||||
self.Xs = None
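
The quadrature rule set up in this constructor is the standard rescaling of numpy's Hermite nodes to Gaussian expectations; a minimal one-dimensional sketch of the idea, illustrative only and not part of this changeset:

import numpy as np

def gauss_hermite_expectation(f, mu, s2, degree=5):
    # Illustrative sketch: E_{x ~ N(mu, s2)}[f(x)] ~= sum_i w_i f(sqrt(2*s2)*t_i + mu),
    # with t_i, w_i the hermgauss nodes and weights rescaled as in PSICOMP_GH above
    locs, weights = np.polynomial.hermite.hermgauss(degree)
    xs = np.sqrt(2. * s2) * locs + mu
    return np.sum(weights / np.sqrt(np.pi) * f(xs))

# e.g. E[x^2] for x ~ N(1, 0.25) is 1.25:
# print(gauss_hermite_expectation(lambda x: x**2, 1.0, 0.25))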
|
||||
|
||||
def _setup_observers(self):
|
||||
pass
|
||||
|
||||
@Cache_this(limit=10, ignore_args=(0,))
|
||||
def comp_K(self, Z, qX):
|
||||
if self.Xs is None or self.Xs.shape != qX.mean.shape:
|
||||
from ....core.parameterization import ObsAr
|
||||
self.Xs = ObsAr(np.empty((self.degree,)+qX.mean.shape))
|
||||
mu, S = qX.mean.values, qX.variance.values
|
||||
S_sq = np.sqrt(S)
|
||||
for i in xrange(self.degree):
|
||||
self.Xs[i] = self.locs[i]*S_sq+mu
|
||||
return self.Xs
|
||||
|
||||
@Cache_this(limit=10, ignore_args=(0,))
|
||||
def psicomputations(self, kern, Z, qX, return_psi2_n=False):
|
||||
mu, S = qX.mean.values, qX.variance.values
|
||||
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
|
||||
if self.cache_K: Xs = self.comp_K(Z, qX)
|
||||
else: S_sq = np.sqrt(S)
|
||||
|
||||
psi0 = np.zeros((N,))
|
||||
psi1 = np.zeros((N,M))
|
||||
psi2 = np.zeros((M,M))
|
||||
for i in xrange(self.degree):
|
||||
if self.cache_K:
|
||||
X = Xs[i]
|
||||
else:
|
||||
X = self.locs[i]*S_sq+mu
|
||||
psi0 += self.weights[i]* kern.Kdiag(X)
|
||||
Kfu = kern.K(X,Z)
|
||||
psi1 += self.weights[i]* Kfu
|
||||
psi2 += self.weights[i]* tdot(Kfu.T)
|
||||
return psi0, psi1, psi2
|
||||
|
||||
@Cache_this(limit=10, ignore_args=(0, 2,3,4))
|
||||
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
|
||||
mu, S = qX.mean.values, qX.variance.values
|
||||
if self.cache_K: Xs = self.comp_K(Z, qX)
|
||||
S_sq = np.sqrt(S)
|
||||
|
||||
dtheta_old = kern.gradient.copy()
|
||||
dtheta = np.zeros_like(kern.gradient)
|
||||
if isinstance(Z, Param):
|
||||
dZ = np.zeros_like(Z.values)
|
||||
else:
|
||||
dZ = np.zeros_like(Z)
|
||||
dmu = np.zeros_like(mu)
|
||||
dS = np.zeros_like(S)
|
||||
for i in xrange(self.degree):
|
||||
if self.cache_K:
|
||||
X = Xs[i]
|
||||
else:
|
||||
X = self.locs[i]*S_sq+mu
|
||||
dL_dpsi0_i = dL_dpsi0*self.weights[i]
|
||||
kern.update_gradients_diag(dL_dpsi0_i, X)
|
||||
dtheta += kern.gradient
|
||||
dX = kern.gradients_X_diag(dL_dpsi0_i, X)
|
||||
Kfu = kern.K(X,Z)
|
||||
dL_dkfu = (dL_dpsi1+ 2.*Kfu.dot(dL_dpsi2))*self.weights[i]
|
||||
kern.update_gradients_full(dL_dkfu, X, Z)
|
||||
dtheta += kern.gradient
|
||||
dX_i, dZ_i = kern.gradients_X_X2(dL_dkfu, X, Z)
|
||||
dX += dX_i
|
||||
dZ += dZ_i
|
||||
dmu += dX
|
||||
dS += dX*self.locs[i]/(2.*S_sq)
|
||||
kern.gradient[:] = dtheta_old
|
||||
return dtheta, dZ, dmu, dS
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -8,7 +8,7 @@ The package for the Psi statistics computation of the linear kernel for Bayesian
|
|||
import numpy as np
|
||||
from ....util.linalg import tdot
|
||||
|
||||
def psicomputations(variance, Z, variational_posterior):
|
||||
def psicomputations(variance, Z, variational_posterior, return_psi2_n=False):
|
||||
"""
|
||||
Compute psi-statistics for ss-linear kernel
|
||||
"""
|
||||
|
|
@ -21,8 +21,12 @@ def psicomputations(variance, Z, variational_posterior):
|
|||
S = variational_posterior.variance
|
||||
|
||||
psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
|
||||
psi1 = np.dot(mu,(variance*Z).T)
|
||||
psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
|
||||
Zv = variance * Z
|
||||
psi1 = np.dot(mu,Zv.T)
|
||||
if return_psi2_n:
|
||||
psi2 = psi1[:,:,None] * psi1[:,None,:] + np.dot(S[:,None,:] * Zv[None,:,:], Zv.T)
|
||||
else:
|
||||
psi2 = np.dot(S.sum(axis=0) * Zv, Zv.T) + tdot(psi1.T)
|
||||
|
||||
return psi0, psi1, psi2
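
For the linear kernel, the per-point psi2 used above has a simple closed form, psi2_n[m,m'] = Zv_m^T E_q[x_n x_n^T] Zv_m' with Zv = variance*Z; an explicit-loop sketch that mirrors the vectorised expression, illustrative only and not part of this changeset:

import numpy as np

def psi2_n_linear(variance, Z, mu, S):
    # Illustrative sketch: psi2[n, m, m'] = E_q[k(Z_m, x_n) k(x_n, Z_m')]
    # for k(z, x) = sum_q variance_q z_q x_q and q(x_n) = N(mu_n, diag(S_n))
    Zv = variance * Z                                  # M x Q
    N, M = mu.shape[0], Z.shape[0]
    psi2 = np.empty((N, M, M))
    for n in range(N):
        Exx = np.outer(mu[n], mu[n]) + np.diag(S[n])   # E_q[x_n x_n^T]
        psi2[n] = Zv.dot(Exx).dot(Zv.T)
    return psi2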
|
||||
|
||||
|
|
@ -59,19 +63,39 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
|
|||
|
||||
variance2 = np.square(variance)
|
||||
common_sum = np.dot(mu,(variance*Z).T)
|
||||
Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
|
||||
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
|
||||
common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
|
||||
Z2_expect = np.inner(common_sum,dL_dpsi2T)
|
||||
Z1_expect = np.dot(dL_dpsi2T,Z)
|
||||
if len(dL_dpsi2.shape)==2:
|
||||
Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
|
||||
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
|
||||
common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
|
||||
Z2_expect = np.inner(common_sum,dL_dpsi2T)
|
||||
Z1_expect = np.dot(dL_dpsi2T,Z)
|
||||
|
||||
dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
|
||||
dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
|
||||
|
||||
dL_dmu = common_expect*variance
|
||||
dL_dmu = common_expect*variance
|
||||
|
||||
dL_dS = np.empty(S.shape)
|
||||
dL_dS[:] = Z_expect*variance2
|
||||
dL_dS = np.empty(S.shape)
|
||||
dL_dS[:] = Z_expect*variance2
|
||||
|
||||
dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
|
||||
dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
|
||||
else:
|
||||
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
|
||||
dL_dpsi2_ = dL_dpsi2.sum(axis=0)
|
||||
Z_expect = (np.dot(dL_dpsi2.reshape(N*M,M),Z).reshape(N,M,Q)*Z[None,:,:]).sum(axis=1)
|
||||
dL_dpsi2T = dL_dpsi2_+dL_dpsi2_.T
|
||||
dL_dpsi2T_ = dL_dpsi2+np.swapaxes(dL_dpsi2, 1, 2)
|
||||
common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
|
||||
common_expect_ = (common_sum[:,:,None]*np.dot(dL_dpsi2T_.reshape(N*M,M),Z).reshape(N,M,Q)).sum(axis=1)
|
||||
Z2_expect = (common_sum[:,:,None]*dL_dpsi2T_).sum(axis=1)
|
||||
Z1_expect = np.dot(dL_dpsi2T_.reshape(N*M,M),Z).reshape(N,M,Q)
|
||||
|
||||
dL_dvar = 2.*variance*(S*Z_expect).sum(axis=0)+(common_expect_*mu).sum(axis=0)
|
||||
|
||||
dL_dmu = common_expect_*variance
|
||||
|
||||
dL_dS = np.empty(S.shape)
|
||||
dL_dS[:] = variance2* Z_expect
|
||||
|
||||
dL_dZ = variance2*(S[:,None,:]*Z1_expect).sum(axis=0)+np.dot(Z2_expect.T,variance*mu)
|
||||
|
||||
return dL_dvar, dL_dmu, dL_dS, dL_dZ
|
||||
|
|
|
|||
|
|
@ -5,13 +5,7 @@ The module for psi-statistics for RBF kernel
|
|||
import numpy as np
|
||||
from GPy.util.caching import Cacher
|
||||
|
||||
def psicomputations(variance, lengthscale, Z, variational_posterior):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
def psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=False):
|
||||
# here are the "statistics" for psi0, psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# _psi1 NxM
|
||||
|
|
@ -21,16 +15,11 @@ def psicomputations(variance, lengthscale, Z, variational_posterior):
|
|||
psi0 = np.empty(mu.shape[0])
|
||||
psi0[:] = variance
|
||||
psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
|
||||
psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
|
||||
psi2 = _psi2computations(variance, lengthscale, Z, mu, S)
|
||||
if not return_psi2_n: psi2 = psi2.sum(axis=0)
|
||||
return psi0, psi1, psi2
|
||||
|
||||
def __psi1computations(variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1
|
||||
# Produced intermediate results:
|
||||
# _psi1 NxM
|
||||
|
|
@ -45,26 +34,19 @@ def __psi1computations(variance, lengthscale, Z, mu, S):
|
|||
return _psi1
|
||||
|
||||
def __psi2computations(variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi2
|
||||
# Produced intermediate results:
|
||||
# _psi2 MxM
|
||||
|
||||
N,M,Q = mu.shape[0], Z.shape[0], mu.shape[1]
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
_psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
|
||||
_psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
|
||||
Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
|
||||
denom = 1./(2.*S+lengthscale2)
|
||||
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
|
||||
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+(2*(mu*denom).dot(Z_hat.reshape(M*M,Q).T) - denom.dot(np.square(Z_hat).reshape(M*M,Q).T)).reshape(N,M,M)
|
||||
_psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
|
||||
|
||||
|
||||
return _psi2
|
||||
|
||||
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
|
|
@ -86,13 +68,6 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal
|
|||
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
|
||||
|
||||
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
dL_dpsi1 - NxM
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1
|
||||
# Produced intermediate results: dL_dparams w.r.t. psi1
|
||||
# _dL_dvariance 1
|
||||
|
|
@ -118,13 +93,6 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
|
|||
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||
|
||||
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
dL_dpsi2 - MxM
|
||||
"""
|
||||
# here are the "statistics" for psi2
|
||||
# Produced the derivatives w.r.t. psi2:
|
||||
# _dL_dvariance 1
|
||||
|
|
@ -157,5 +125,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
|||
|
||||
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||
|
||||
_psi1computations = Cacher(__psi1computations, limit=1)
|
||||
_psi2computations = Cacher(__psi2computations, limit=1)
|
||||
_psi1computations = Cacher(__psi1computations, limit=5)
|
||||
_psi2computations = Cacher(__psi2computations, limit=5)
|
||||
|
|
|
|||
|
|
@ -7,13 +7,6 @@ from ....util.caching import Cache_this
|
|||
from . import PSICOMP_RBF
|
||||
from ....util import gpu_init
|
||||
|
||||
try:
|
||||
import pycuda.gpuarray as gpuarray
|
||||
from pycuda.compiler import SourceModule
|
||||
from ....util.linalg_gpu import sum_axis
|
||||
except:
|
||||
pass
|
||||
|
||||
gpu_code = """
|
||||
// define THREADNUM
|
||||
|
||||
|
|
@ -241,7 +234,11 @@ gpu_code = """
|
|||
|
||||
class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
||||
|
||||
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
|
||||
def __init__(self, threadnum=256, blocknum=30, GPU_direct=False):
|
||||
from pycuda.compiler import SourceModule
|
||||
from ....util.gpu_init import initGPU
|
||||
initGPU()
|
||||
|
||||
self.GPU_direct = GPU_direct
|
||||
self.gpuCache = None
|
||||
|
||||
|
|
@ -265,6 +262,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
return s
|
||||
|
||||
def _initGPUCache(self, N, M, Q):
|
||||
import pycuda.gpuarray as gpuarray
|
||||
if self.gpuCache == None:
|
||||
self.gpuCache = {
|
||||
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
|
||||
|
|
@ -320,13 +318,14 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
def get_dimensions(self, Z, variational_posterior):
|
||||
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,))
|
||||
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||
@Cache_this(limit=5, ignore_args=(0,))
|
||||
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
"""
|
||||
variance, lengthscale = kern.variance, kern.lengthscale
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
self._initGPUCache(N,M,Q)
|
||||
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
|
||||
|
|
@ -355,8 +354,10 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
else:
|
||||
return psi0, psi1_gpu.get(), psi2_gpu.get()
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
@Cache_this(limit=5, ignore_args=(0,2,3,4))
|
||||
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
variance, lengthscale = kern.variance, kern.lengthscale
|
||||
from ....util.linalg_gpu import sum_axis
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from ....util.linalg import tdot
|
|||
|
||||
import numpy as np
|
||||
|
||||
def psicomputations(variance, Z, variational_posterior):
|
||||
def psicomputations(variance, Z, variational_posterior, return_psi2_n=False):
|
||||
"""
|
||||
Compute psi-statistics for ss-linear kernel
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -6,14 +6,7 @@ The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM
|
|||
import numpy as np
|
||||
from ....util.caching import Cache_this
|
||||
from . import PSICOMP_RBF
|
||||
from ....util import gpu_init
|
||||
|
||||
try:
|
||||
import pycuda.gpuarray as gpuarray
|
||||
from pycuda.compiler import SourceModule
|
||||
from ....util.linalg_gpu import sum_axis
|
||||
except:
|
||||
pass
|
||||
|
||||
gpu_code = """
|
||||
// define THREADNUM
|
||||
|
|
@ -292,6 +285,11 @@ gpu_code = """
|
|||
class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
|
||||
|
||||
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
|
||||
|
||||
from pycuda.compiler import SourceModule
|
||||
from ....util.gpu_init import initGPU
|
||||
initGPU()
|
||||
|
||||
self.GPU_direct = GPU_direct
|
||||
self.gpuCache = None
|
||||
|
||||
|
|
@ -315,6 +313,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
|
|||
return s
|
||||
|
||||
def _initGPUCache(self, N, M, Q):
|
||||
import pycuda.gpuarray as gpuarray
|
||||
if self.gpuCache == None:
|
||||
self.gpuCache = {
|
||||
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
|
||||
|
|
@ -377,12 +376,13 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
|
|||
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,))
|
||||
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
"""
|
||||
variance, lengthscale = kern.variance, kern.lengthscale
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
self._initGPUCache(N,M,Q)
|
||||
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
||||
|
|
@ -409,8 +409,10 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
|
|||
else:
|
||||
return psi0, psi1_gpu.get(), psi2_gpu.get()
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
@Cache_this(limit=1, ignore_args=(0,2,3,4))
|
||||
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
variance, lengthscale = kern.variance, kern.lengthscale
|
||||
from ....util.linalg_gpu import sum_axis
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
|
|
|
|||
|
|
@ -31,6 +31,9 @@ class RBF(Stationary):
|
|||
def dK_dr(self, r):
|
||||
return -r*self.K_of_r(r)
|
||||
|
||||
def dK2_drdr(self, r):
|
||||
return (r**2-1)*self.K_of_r(r)
|
||||
|
||||
def __getstate__(self):
|
||||
dc = super(RBF, self).__getstate__()
|
||||
if self.useGPU:
|
||||
|
|
@ -50,22 +53,25 @@ class RBF(Stationary):
|
|||
#---------------------------------------#
|
||||
|
||||
def psi0(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[0]
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
|
||||
|
||||
def psi1(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2]
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
|
||||
|
||||
def psi2n(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2]
|
||||
dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[:2]
|
||||
self.variance.gradient = dL_dvar
|
||||
self.lengthscale.gradient = dL_dlengscale
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2]
|
||||
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2]
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:]
|
||||
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[3:]
|
||||
|
||||
|
|
|
|||
52
GPy/kern/_src/spline.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
# Copyright (c) 2015, Thomas Hornung
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from .kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
class Spline(Kern):
|
||||
"""
|
||||
Linear spline kernel. You need to specify 2 parameters: the variance and c.
|
||||
The variance is defined in powers of 10. Thus specifying -2 means 10^-2.
|
||||
The parameter c defines the stiffness of the spline fit. A very stiff
|
||||
spline equals linear regression.
|
||||
See https://www.youtube.com/watch?v=50Vgw11qn0o starting at minute 1:17:28
|
||||
Lit: Wahba, 1990
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., c=1., active_dims=None, name='spline'):
|
||||
super(Spline, self).__init__(input_dim, active_dims, name)
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
self.c = Param('c', c)
|
||||
self.link_parameters(self.variance,self.c)
|
||||
|
||||
|
||||
def K(self, X, X2=None):
|
||||
if X2 is None: X2=X
|
||||
term1 = (X+8.)*(X2.T+8.)/16.
|
||||
term2 = abs((X-X2.T)/16.)**3
|
||||
term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
|
||||
return (self.variance**2 * (1. + (1.+self.c) * term1 + self.c/3. * (term2 - term3)))
|
||||
|
||||
def Kdiag(self, X):
|
||||
term1 = np.square(X+8.)/16.
|
||||
term3 = 2. * ((X+8.)/16.)**3
|
||||
return (self.variance**2 * (1. + (1.+self.c) * term1 - self.c/3. * term3))[:,0]
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
if X2 is None: X2=X
|
||||
term1 = (X+8.)*(X2.T+8.)/16.
|
||||
term2 = abs((X-X2.T)/16.)**3
|
||||
term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
|
||||
self.variance.gradient = np.sum(dL_dK * (2*self.variance * (1. + (1.+self.c) * term1 + self.c/3. * ( term2 - term3))))
|
||||
self.c.gradient = np.sum(dL_dK * (self.variance**2* (term1 + 1./3.*(term2 - term3))))
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
raise NotImplementedError
|
||||
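As a quick, hedged usage sketch of the new spline kernel (only the class above is part of this diff; the import path and the GPRegression wiring are assumptions for illustration):
```
# Illustrative only: exercises the Spline kernel defined above on a toy
# 1-D regression. The (X+8)/16 terms above suggest inputs roughly in [-8, 8].
import numpy as np
import GPy
from GPy.kern._src.spline import Spline  # assumed import path for this new file

X = np.linspace(-8., 8., 50)[:, None]
Y = np.sin(X) + 0.1 * np.random.randn(50, 1)

k = Spline(input_dim=1, variance=1., c=1.)
m = GPy.models.GPRegression(X, Y, k)
m.optimize()        # uses update_gradients_full; gradients_X is not implemented
print(m)
```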
GPy/kern/_src/standard_periodic.py (new file, 166 lines)
@@ -0,0 +1,166 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
"""
|
||||
The standard periodic kernel which mentioned in:
|
||||
|
||||
[1] Gaussian Processes for Machine Learning, C. E. Rasmussen, C. K. I. Williams.
|
||||
The MIT Press, 2005.
|
||||
|
||||
|
||||
[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
|
||||
Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
|
||||
"""
|
||||
|
||||
from .kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
|
||||
import numpy as np
|
||||
|
||||
class StdPeriodic(Kern):
|
||||
"""
|
||||
Standard periodic kernel
|
||||
|
||||
.. math::
|
||||
|
||||
k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim} \left( \frac{\sin\left(\frac{\pi}{\lambda_i} (x_i - y_i)\right)}{l_i} \right)^2 \right]
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance :math:`\theta_1` in the formula above
|
||||
:type variance: float
|
||||
:param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
|
||||
:type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
|
||||
:param lengthscale: the vector of lengthscales :math:`l_i`. If None then 1.0 is assumed.
|
||||
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
|
||||
:param ARD1: Auto Relevance Determination with respect to wavelength.
If set to False, a single wavelength parameter :math:`\lambda` is shared
across all dimensions; otherwise there is one wavelength parameter
per dimension.
|
||||
:type ARD1: Boolean
|
||||
:param ARD2: Auto Relevance Determination with respect to lengthscale.
If set to False, a single lengthscale parameter :math:`l` is shared
across all dimensions; otherwise there is one lengthscale parameter
per dimension.
|
||||
:type ARD2: Boolean
|
||||
:param active_dims: indices of dimensions which are used in the computation of the kernel
|
||||
:type active_dims: array or list of the appropriate size
|
||||
:param name: Name of the kernel for output
|
||||
:type name: str
:param useGPU: whether or not to use the GPU
:type useGPU: bool
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
|
||||
super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
|
||||
self.input_dim = input_dim
|
||||
self.ARD1 = ARD1 # correspond to wavelengths
|
||||
self.ARD2 = ARD2 # correspond to lengthscales
|
||||
|
||||
self.name = name
|
||||
|
||||
if self.ARD1 == False:
|
||||
if wavelength is not None:
|
||||
wavelength = np.asarray(wavelength)
|
||||
assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
|
||||
else:
|
||||
wavelength = np.ones(1)
|
||||
else:
|
||||
if wavelength is not None:
|
||||
wavelength = np.asarray(wavelength)
|
||||
assert wavelength.size == input_dim, "bad number of wavelengths"
|
||||
else:
|
||||
wavelength = np.ones(input_dim)
|
||||
|
||||
if self.ARD2 == False:
|
||||
if lengthscale is not None:
|
||||
lengthscale = np.asarray(lengthscale)
|
||||
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
|
||||
else:
|
||||
lengthscale = np.ones(1)
|
||||
else:
|
||||
if lengthscale is not None:
|
||||
lengthscale = np.asarray(lengthscale)
|
||||
assert lengthscale.size == input_dim, "bad number of lengthscales"
|
||||
else:
|
||||
lengthscale = np.ones(input_dim)
|
||||
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
assert self.variance.size==1, "Variance size must be one"
|
||||
self.wavelengths = Param('wavelengths', wavelength, Logexp())
|
||||
self.lengthscales = Param('lengthscales', lengthscale, Logexp())
|
||||
|
||||
self.link_parameters(self.variance, self.wavelengths, self.lengthscales)
|
||||
|
||||
def parameters_changed(self):
|
||||
"""
|
||||
This function acts as a callback for each optimization iteration.
If an optimization step was successful and the parameters have changed,
this callback will be called so that any precomputations for the kernel
can be updated.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
def K(self, X, X2=None):
|
||||
"""Compute the covariance matrix between X and X2."""
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
|
||||
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
|
||||
exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) )
|
||||
|
||||
return self.variance * exp_dist
|
||||
|
||||
|
||||
def Kdiag(self, X):
|
||||
"""Compute the diagonal of the covariance matrix associated to X."""
|
||||
ret = np.empty(X.shape[0])
|
||||
ret[:] = self.variance
|
||||
return ret
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
|
||||
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
|
||||
|
||||
sin_base = np.sin( base )
|
||||
exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) )
|
||||
|
||||
dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
|
||||
|
||||
dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)
|
||||
|
||||
self.variance.gradient = np.sum(exp_dist * dL_dK)
|
||||
#target[0] += np.sum( exp_dist * dL_dK)
|
||||
|
||||
if self.ARD1: # different wavelengths
|
||||
self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
|
||||
else: # same wavelengths
|
||||
self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
|
||||
|
||||
if self.ARD2: # different lengthscales
|
||||
self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
|
||||
else: # same lengthscales
|
||||
self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
|
||||
self.variance.gradient = np.sum(dL_dKdiag)
|
||||
self.wavelengths.gradient = 0
|
||||
self.lengthscales.gradient = 0
|
||||
|
||||
# def gradients_X(self, dL_dK, X, X2=None):
|
||||
# """derivative of the covariance matrix with respect to X."""
|
||||
#
|
||||
# raise NotImplemented("Periodic kernel: dK_dX not implemented")
|
||||
#
|
||||
# def gradients_X_diag(self, dL_dKdiag, X):
|
||||
#
|
||||
# raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")
|
||||
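A small sanity-check sketch of the standard periodic kernel defined above (import path follows the new file location; nothing below is part of the diff):
```
# The covariance should be invariant to shifting one argument by a whole
# wavelength, and its diagonal should equal the variance.
import numpy as np
from GPy.kern._src.standard_periodic import StdPeriodic

k = StdPeriodic(input_dim=1, variance=1.5, wavelength=2.0, lengthscale=0.7)
X = np.linspace(0., 4., 9)[:, None]

K = k.K(X)
assert np.allclose(k.K(X, X + 2.0), K)   # period = wavelength
assert np.allclose(k.Kdiag(X), 1.5)
```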
|
|
@ -24,6 +24,13 @@ class Static(Kern):
|
|||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
def gradients_XX(self, dL_dK, X, X2):
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
return np.zeros((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
|
||||
def gradients_XX_diag(self, dL_dKdiag, X):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return np.zeros(Z.shape)
|
||||
|
||||
|
|
@ -59,6 +66,9 @@ class White(Static):
|
|||
def psi2(self, Z, variational_posterior):
|
||||
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
|
||||
def psi2n(self, Z, variational_posterior):
|
||||
return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
if X2 is None:
|
||||
self.variance.gradient = np.trace(dL_dK)
|
||||
|
|
@ -92,6 +102,11 @@ class Bias(Static):
|
|||
ret[:] = self.variance*self.variance*variational_posterior.shape[0]
|
||||
return ret
|
||||
|
||||
def psi2n(self, Z, variational_posterior):
|
||||
ret = np.empty((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
ret[:] = self.variance*self.variance
|
||||
return ret
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0]
|
||||
|
||||
|
|
@ -120,6 +135,9 @@ class Fixed(Static):
|
|||
def psi2(self, Z, variational_posterior):
|
||||
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
|
||||
def psi2n(self, Z, variational_posterior):
|
||||
return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
self.variance.gradient = dL_dpsi0.sum()
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ from ...util.caching import Cache_this
|
|||
try:
|
||||
from . import stationary_cython
|
||||
except ImportError:
|
||||
print('warning in sationary: failed to import cython module: falling back to numpy')
|
||||
print('warning in stationary: failed to import cython module: falling back to numpy')
|
||||
config.set('cython', 'working', 'false')
|
||||
|
||||
|
||||
|
|
@ -25,13 +25,16 @@ class Stationary(Kern):
|
|||
|
||||
Stationary covariance functions depend only on r, where r is defined as
|
||||
|
||||
r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 }
|
||||
.. math::
|
||||
r(x, x') = \\sqrt{ \\sum_{q=1}^Q (x_q - x'_q)^2 }
|
||||
|
||||
The covariance function k(x, x') can then be written k(r).
|
||||
|
||||
In this implementation, r is scaled by the lengthscales parameter(s):
|
||||
|
||||
r = \sqrt{ \sum_{q=1}^Q \frac{(x_q - x'_q)^2}{\ell_q^2} }.
|
||||
.. math::
|
||||
|
||||
r(x, x') = \\sqrt{ \\sum_{q=1}^Q \\frac{(x_q - x'_q)^2}{\ell_q^2} }.
|
||||
|
||||
By default, there's only one lengthscale: separate lengthscales for each
dimension can be enabled by setting ARD=True.
|
||||
|
|
@ -39,11 +42,12 @@ class Stationary(Kern):
|
|||
To implement a stationary covariance function using this class, one need
|
||||
only define the covariance function k(r), and its derivative.
|
||||
|
||||
...
|
||||
def K_of_r(self, r):
|
||||
return foo
|
||||
def dK_dr(self, r):
|
||||
return bar
|
||||
```
|
||||
def K_of_r(self, r):
|
||||
return foo
|
||||
def dK_dr(self, r):
|
||||
return bar
|
||||
```
|
||||
|
||||
The lengthscale(s) and variance parameters are added to the structure automatically.
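For instance, a minimal subclass following this recipe (essentially re-deriving the OU covariance that appears later in this file) might look like the sketch below; the import path mirrors this file's location and is otherwise an assumption:
```
import numpy as np
from GPy.kern._src.stationary import Stationary

class MyExponential(Stationary):
    """k(r) = variance * exp(-r); lengthscale/variance handling is inherited."""
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False,
                 active_dims=None, name='my_exp'):
        super(MyExponential, self).__init__(input_dim, variance, lengthscale,
                                            ARD, active_dims, name)

    def K_of_r(self, r):
        return self.variance * np.exp(-r)

    def dK_dr(self, r):
        return -self.variance * np.exp(-r)
```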
|
||||
|
||||
|
|
@ -77,6 +81,10 @@ class Stationary(Kern):
|
|||
def dK_dr(self, r):
|
||||
raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")
|
||||
|
||||
@Cache_this(limit=20, ignore_args=())
|
||||
def dK2_drdr(self, r):
|
||||
raise NotImplementedError("implement second derivative of covariance wrt r to use this method")
|
||||
|
||||
@Cache_this(limit=5, ignore_args=())
|
||||
def K(self, X, X2=None):
|
||||
"""
|
||||
|
|
@ -89,11 +97,16 @@ class Stationary(Kern):
|
|||
r = self._scaled_dist(X, X2)
|
||||
return self.K_of_r(r)
|
||||
|
||||
@Cache_this(limit=3, ignore_args=())
|
||||
@Cache_this(limit=20, ignore_args=())
|
||||
def dK_dr_via_X(self, X, X2):
|
||||
#a convenience function, so we can cache dK_dr
|
||||
return self.dK_dr(self._scaled_dist(X, X2))
|
||||
|
||||
@Cache_this(limit=3, ignore_args=())
|
||||
def dK2_drdr_via_X(self, X, X2):
|
||||
#a convenience function, so we can cache dK_dr
|
||||
return self.dK2_drdr(self._scaled_dist(X, X2))
|
||||
|
||||
def _unscaled_dist(self, X, X2=None):
|
||||
"""
|
||||
Compute the Euclidean distance between each row of X and X2, or between
|
||||
|
|
@ -114,12 +127,13 @@ class Stationary(Kern):
|
|||
r2 = np.clip(r2, 0, np.inf)
|
||||
return np.sqrt(r2)
|
||||
|
||||
@Cache_this(limit=5, ignore_args=())
|
||||
@Cache_this(limit=20, ignore_args=())
|
||||
def _scaled_dist(self, X, X2=None):
|
||||
"""
|
||||
Efficiently compute the scaled distance, r.
|
||||
|
||||
r = \sqrt( \sum_{q=1}^Q (x_q - x'q)^2/l_q^2 )
|
||||
.. math::
r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 / \ell_q^2 }
|
||||
|
||||
Note that if there is only one lengthscale, l comes outside the sum. In
|
||||
this case we compute the unscaled distance first (in a separate
|
||||
|
|
@ -201,6 +215,59 @@ class Stationary(Kern):
|
|||
else:
|
||||
return self._gradients_X_pure(dL_dK, X, X2)
|
||||
|
||||
def gradients_XX(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
|
||||
|
||||
.. math::
|
||||
\frac{\partial^2 K}{\partial X\partial X2}
|
||||
|
||||
:returns:
|
||||
dL2_dXdX2: NxMxQ, for X [NxQ] and X2 [MxQ] (X2 is X if X2 is None)
|
||||
Thus, we return the second derivative in X2.
|
||||
"""
|
||||
# The off diagonals in Q are always zero, this should also be true for the Linear kernel...
|
||||
# According to multivariable chain rule, we can chain the second derivative through r:
|
||||
# d2K_dXdX2 = dK_dr*d2r_dXdX2 + d2K_drdr * dr_dX * dr_dX2:
|
||||
invdist = self._inv_dist(X, X2)
|
||||
invdist2 = invdist**2
|
||||
|
||||
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
|
||||
tmp1 = dL_dr * invdist
|
||||
|
||||
dL_drdr = self.dK2_drdr_via_X(X, X2) * dL_dK
|
||||
tmp2 = dL_drdr * invdist2
|
||||
|
||||
l2 = np.ones(X.shape[1]) * self.lengthscale**2
|
||||
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
tmp1 -= np.eye(X.shape[0])*self.variance
|
||||
else:
|
||||
tmp1[X==X2.T] -= self.variance
|
||||
|
||||
grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
|
||||
#grad = np.empty(X.shape, dtype=np.float64)
|
||||
for q in range(self.input_dim):
|
||||
tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
|
||||
grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q]
|
||||
#grad[:, :, q] = ((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q]
|
||||
#np.sum(((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q], axis=1, out=grad[:,q])
|
||||
#np.sum( - (tmp2*(tmpdist**2)), axis=1, out=grad[:,q])
|
||||
return grad
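Written out, the chain rule that the comments above refer to is (a restatement of the code, not new maths):
```
\frac{\partial^2 k}{\partial x_q\,\partial x'_q}
  = \frac{\mathrm{d}k}{\mathrm{d}r}\,\frac{\partial^2 r}{\partial x_q\,\partial x'_q}
  + \frac{\mathrm{d}^2 k}{\mathrm{d}r^2}\,
    \frac{\partial r}{\partial x_q}\,\frac{\partial r}{\partial x'_q},
\qquad
r = \sqrt{\sum_{q=1}^{Q} \frac{(x_q - x'_q)^2}{\ell_q^2}}
```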
|
||||
|
||||
def gradients_XX_diag(self, dL_dK, X):
|
||||
"""
|
||||
Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
|
||||
|
||||
.. math::
|
||||
\frac{\partial^2 K}{\partial X\partial X2}
|
||||
|
||||
:returns:
|
||||
dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ]
|
||||
"""
|
||||
return np.ones(X.shape) * self.variance/self.lengthscale**2
|
||||
|
||||
def _gradients_X_pure(self, dL_dK, X, X2=None):
|
||||
invdist = self._inv_dist(X, X2)
|
||||
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
|
||||
|
|
@ -259,7 +326,7 @@ class OU(Stationary):
|
|||
|
||||
.. math::
|
||||
|
||||
k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
|
||||
k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^{\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
|
||||
|
||||
"""
|
||||
|
||||
|
|
@ -279,7 +346,7 @@ class Matern32(Stationary):
|
|||
|
||||
.. math::
|
||||
|
||||
k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
|
||||
k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
|
||||
|
||||
"""
|
||||
|
||||
|
|
@ -326,7 +393,7 @@ class Matern52(Stationary):
|
|||
.. math::
|
||||
|
||||
k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
|
||||
"""
|
||||
"""
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
|
||||
super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@ -4,14 +4,15 @@
|
|||
import numpy as np
|
||||
cimport numpy as np
|
||||
from cython.parallel import prange
|
||||
cimport cython
|
||||
|
||||
ctypedef np.float64_t DTYPE_t
|
||||
|
||||
cdef extern from "stationary_utils.h":
|
||||
void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad)
|
||||
void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) nogil
|
||||
|
||||
cdef extern from "stationary_utils.h":
|
||||
void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad)
|
||||
void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) nogil
|
||||
|
||||
def grad_X(int N, int D, int M,
|
||||
np.ndarray[DTYPE_t, ndim=2] _X,
|
||||
|
|
@ -22,18 +23,18 @@ def grad_X(int N, int D, int M,
|
|||
cdef double *X2 = <double*> _X2.data
|
||||
cdef double *tmp = <double*> _tmp.data
|
||||
cdef double *grad = <double*> _grad.data
|
||||
_grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
|
||||
with nogil:
|
||||
_grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
|
||||
|
||||
@cython.cdivision(True)
|
||||
def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad):
|
||||
cdef int n,d,nd,m
|
||||
for nd in prange(N*D, nogil=True):
|
||||
n = nd/D
|
||||
d = nd%D
|
||||
for nd in prange(N * D, nogil=True):
|
||||
n = nd / D
|
||||
d = nd % D
|
||||
grad[n,d] = 0.0
|
||||
for m in range(M):
|
||||
grad[n,d] += tmp[n,m]*(X[n,d]-X2[m,d])
|
||||
|
||||
|
||||
grad[n,d] += tmp[n, m] * (X[n, d] - X2[m, d])
|
||||
|
||||
def lengthscale_grads_in_c(int N, int M, int Q,
|
||||
np.ndarray[DTYPE_t, ndim=2] _tmp,
|
||||
|
|
@ -44,16 +45,16 @@ def lengthscale_grads_in_c(int N, int M, int Q,
|
|||
cdef double *X = <double*> _X.data
|
||||
cdef double *X2 = <double*> _X2.data
|
||||
cdef double *grad = <double*> _grad.data
|
||||
_lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
|
||||
with nogil:
|
||||
_lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
|
||||
|
||||
def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad):
|
||||
cdef int q, n, m
|
||||
cdef double gradq, dist
|
||||
for q in range(Q):
|
||||
grad[q] = 0.0
|
||||
for n in range(N):
|
||||
for m in range(M):
|
||||
dist = X[n,q] - X2[m,q]
|
||||
grad[q] += tmp[n,m]*dist*dist
|
||||
|
||||
|
||||
with nogil:
|
||||
for q in range(Q):
|
||||
grad[q] = 0.0
|
||||
for n in range(N):
|
||||
for m in range(M):
|
||||
dist = X[n,q] - X2[m,q]
|
||||
grad[q] += tmp[n, m] * dist * dist
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
#ifndef __APPLE__
|
||||
#include <omp.h>
|
||||
#endif
|
||||
void _grad_X(int N, int D, int M, double*X, double* X2, double* tmp, double* grad);
|
||||
void _lengthscale_grads(int N, int D, int M, double* X, double* X2, double* tmp, double* grad);
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ class TruncLinear(Kern):
|
|||
|
||||
.. math::
|
||||
|
||||
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \simga_q)
|
||||
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
|
|
@ -114,7 +114,7 @@ class TruncLinear_inf(Kern):
|
|||
|
||||
.. math::
|
||||
|
||||
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \simga_q)
|
||||
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
from .bernoulli import Bernoulli
|
||||
from .exponential import Exponential
|
||||
from .gaussian import Gaussian
|
||||
from .gaussian import Gaussian, HeteroscedasticGaussian
|
||||
from .gamma import Gamma
|
||||
from .poisson import Poisson
|
||||
from .student_t import StudentT
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ class Bernoulli(Likelihood):
|
|||
gh_x, gh_w = gh_points
|
||||
|
||||
|
||||
gh_w = gh_w / np.sqrt(np.pi)
|
||||
shape = m.shape
|
||||
m,v,Y = m.flatten(), v.flatten(), Y.flatten()
|
||||
Ysign = np.where(Y==1,1,-1)
|
||||
|
|
@ -232,6 +233,17 @@ class Bernoulli(Likelihood):
|
|||
np.seterr(**state)
|
||||
return d3logpdf_dlink3
|
||||
|
||||
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
|
||||
"""
|
||||
Get the "quantiles" of the binary labels (Bernoulli draws). all the
|
||||
quantiles must be either 0 or 1, since those are the only values the
|
||||
draw can take!
|
||||
"""
|
||||
p = self.predictive_mean(mu, var)
|
||||
return [np.asarray(p>(q/100.), dtype=np.int32) for q in quantiles]
|
||||
|
||||
|
||||
|
||||
def samples(self, gp, Y_metadata=None):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ class Exponential(Likelihood):
|
|||
#d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
|
||||
return d3lik_dlink3
|
||||
|
||||
def samples(self, gp):
|
||||
def samples(self, gp, Y_metadata=None):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ class Gaussian(Likelihood):
|
|||
|
||||
def betaY(self,Y,Y_metadata=None):
|
||||
#TODO: ~Ricardo this does not live here
|
||||
raise RuntimeError("Please notify the GPy developers, this should not happen")
|
||||
return Y/self.gaussian_variance(Y_metadata)
|
||||
|
||||
def gaussian_variance(self, Y_metadata=None):
|
||||
|
|
@ -315,9 +316,44 @@ class Gaussian(Likelihood):
|
|||
return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v
|
||||
|
||||
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
|
||||
if not isinstance(self.gp_link, link_functions.Identity):
|
||||
return super(Gaussian, self).variational_expectations(Y=Y, m=m, v=v, gh_points=gh_points, Y_metadata=Y_metadata)
|
||||
|
||||
lik_var = float(self.variance)
|
||||
F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
|
||||
dF_dmu = (Y - m)/lik_var
|
||||
dF_dv = np.ones_like(v)*(-0.5/lik_var)
|
||||
dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)
|
||||
return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1])
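The closed form used here is the Gaussian expectation of the log-density under q(f) = N(m, v) with an identity link; restating the code above in maths:
```
F = \mathbb{E}_{q(f)}\big[\log \mathcal{N}(y \mid f, \sigma^2)\big]
  = -\tfrac{1}{2}\log(2\pi\sigma^2) - \frac{y^2 + m^2 + v - 2my}{2\sigma^2},
\qquad
\frac{\partial F}{\partial m} = \frac{y - m}{\sigma^2},\quad
\frac{\partial F}{\partial v} = -\frac{1}{2\sigma^2},\quad
\frac{\partial F}{\partial \sigma^2} = -\frac{1}{2\sigma^2}
   + \frac{y^2 + m^2 + v - 2my}{2\sigma^4}
```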
|
||||
|
||||
class HeteroscedasticGaussian(Gaussian):
|
||||
def __init__(self, Y_metadata, gp_link=None, variance=1., name='het_Gauss'):
|
||||
if gp_link is None:
|
||||
gp_link = link_functions.Identity()
|
||||
|
||||
if not isinstance(gp_link, link_functions.Identity):
|
||||
print("Warning, Exact inference is not implemeted for non-identity link functions,\
|
||||
if you are not already, ensure Laplace inference_method is used")
|
||||
|
||||
super(HeteroscedasticGaussian, self).__init__(gp_link, np.ones(Y_metadata['output_index'].shape)*variance, name)
|
||||
|
||||
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
|
||||
return dL_dKdiag[Y_metadata['output_index']]
|
||||
|
||||
def gaussian_variance(self, Y_metadata=None):
|
||||
return self.variance[Y_metadata['output_index'].flatten()]
|
||||
|
||||
def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
|
||||
_s = self.variance[Y_metadata['output_index'].flatten()]
|
||||
if full_cov:
|
||||
if var.ndim == 2:
|
||||
var += np.eye(var.shape[0])*_s
|
||||
if var.ndim == 3:
|
||||
var += np.atleast_3d(np.eye(var.shape[0])*_s)
|
||||
else:
|
||||
var += _s
|
||||
return mu, var
|
||||
|
||||
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
|
||||
_s = self.variance[Y_metadata['output_index'].flatten()]
|
||||
return [stats.norm.ppf(q/100.)*np.sqrt(var + _s) + mu for q in quantiles]
|
||||
|
|
|
|||
|
|
@ -607,7 +607,7 @@ class Likelihood(Parameterized):
|
|||
pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
|
||||
pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
|
||||
except NotImplementedError:
|
||||
print "Finding predictive mean and variance via sampling rather than quadrature"
|
||||
print("Finding predictive mean and variance via sampling rather than quadrature")
|
||||
Nf_samp = 300
|
||||
Ny_samp = 1
|
||||
s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
|
||||
|
|
@ -622,7 +622,7 @@ class Likelihood(Parameterized):
|
|||
Nf_samp = 300
|
||||
Ny_samp = 1
|
||||
s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
|
||||
ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
|
||||
ss_y = self.samples(s, Y_metadata)#, samples=Ny_samp)
|
||||
#ss_y = ss_y.reshape(mu.shape[0], mu.shape[1], Nf_samp*Ny_samp)
|
||||
|
||||
pred_quantiles = [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles]
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import scipy
|
||||
from ..util.univariate_Gaussian import std_norm_cdf, std_norm_pdf
|
||||
import scipy as sp
|
||||
from ..util.misc import safe_exp, safe_square, safe_cube, safe_quad, safe_three_times
|
||||
|
|
@ -140,7 +141,7 @@ class Log_ex_1(GPTransformation):
|
|||
|
||||
"""
|
||||
def transf(self,f):
|
||||
return np.log1p(safe_exp(f))
|
||||
return scipy.special.log1p(safe_exp(f))
|
||||
|
||||
def dtransf_df(self,f):
|
||||
ef = safe_exp(f)
|
||||
|
|
|
|||
|
|
@ -145,5 +145,7 @@ class Poisson(Likelihood):
|
|||
"""
|
||||
orig_shape = gp.shape
|
||||
gp = gp.flatten()
|
||||
# Ysim = np.random.poisson(self.gp_link.transf(gp), [samples, gp.size]).T
|
||||
# return Ysim.reshape(orig_shape+(samples,))
|
||||
Ysim = np.random.poisson(self.gp_link.transf(gp))
|
||||
return Ysim.reshape(orig_shape)
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_miniba
|
|||
import logging
|
||||
from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch
|
||||
from GPy.core.parameterization.param import Param
|
||||
from GPy.core.parameterization.observable_array import ObsAr
|
||||
|
||||
class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
|
||||
"""
|
||||
|
|
@ -80,46 +81,10 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
|
|||
"""Get the gradients of the posterior distribution of X in its specific form."""
|
||||
return X.mean.gradient, X.variance.gradient
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kw):
|
||||
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices, **kw)
|
||||
|
||||
if self.has_uncertain_inputs():
|
||||
current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
|
||||
variational_posterior=X,
|
||||
Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
else:
|
||||
current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], X, Z)
|
||||
current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], X)
|
||||
if subset_indices is not None:
|
||||
value_indices['Xgrad'] = subset_indices['samples']
|
||||
|
||||
kl_fctr = self.kl_factr
|
||||
if self.has_uncertain_inputs():
|
||||
if self.missing_data:
|
||||
d = self.output_dim
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
|
||||
else:
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
|
||||
|
||||
# Subsetting Variational Posterior objects, makes the gradients
|
||||
# empty. We need them to be 0 though:
|
||||
X.mean.gradient[:] = 0
|
||||
X.variance.gradient[:] = 0
|
||||
|
||||
self.variational_prior.update_gradients_KL(X)
|
||||
if self.missing_data:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient/d
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient/d
|
||||
else:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient
|
||||
|
||||
if subset_indices is not None:
|
||||
value_indices['meangrad'] = subset_indices['samples']
|
||||
value_indices['vargrad'] = subset_indices['samples']
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None, **kw):
|
||||
posterior, log_marginal_likelihood, grad_dict = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm,
|
||||
psi0=psi0, psi1=psi1, psi2=psi2, **kw)
|
||||
return posterior, log_marginal_likelihood, grad_dict
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
|
|
@ -128,22 +93,47 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
|
|||
"""
|
||||
super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values)
|
||||
if self.has_uncertain_inputs():
|
||||
self.X.mean.gradient = full_values['meangrad']
|
||||
self.X.variance.gradient = full_values['vargrad']
|
||||
meangrad_tmp, vargrad_tmp = self.kern.gradients_qX_expectations(
|
||||
variational_posterior=self.X,
|
||||
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
|
||||
dL_dpsi1=full_values['dL_dpsi1'],
|
||||
dL_dpsi2=full_values['dL_dpsi2'],
|
||||
psi0=self.psi0, psi1=self.psi1, psi2=self.psi2)
|
||||
|
||||
self.X.mean.gradient = meangrad_tmp
|
||||
self.X.variance.gradient = vargrad_tmp
|
||||
else:
|
||||
self.X.gradient = full_values['Xgrad']
|
||||
self.X.gradient = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z)
|
||||
self.X.gradient += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X)
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
if self.has_uncertain_inputs():
|
||||
return dict(meangrad=np.zeros(self.X.mean.shape),
|
||||
vargrad=np.zeros(self.X.variance.shape))
|
||||
else:
|
||||
return dict(Xgrad=np.zeros(self.X.shape))
|
||||
return super(BayesianGPLVMMiniBatch, self)._outer_init_full_values()
|
||||
|
||||
def parameters_changed(self):
|
||||
super(BayesianGPLVMMiniBatch,self).parameters_changed()
|
||||
if isinstance(self.inference_method, VarDTC_minibatch):
|
||||
return
|
||||
|
||||
kl_fctr = self.kl_factr
|
||||
if kl_fctr > 0:
|
||||
Xgrad = self.X.gradient.copy()
|
||||
self.X.gradient[:] = 0
|
||||
self.variational_prior.update_gradients_KL(self.X)
|
||||
|
||||
if self.missing_data or not self.stochastics:
|
||||
self.X.mean.gradient = kl_fctr*self.X.mean.gradient
|
||||
self.X.variance.gradient = kl_fctr*self.X.variance.gradient
|
||||
else:
|
||||
d = self.output_dim
|
||||
self.X.mean.gradient = kl_fctr*self.X.mean.gradient*self.stochastics.batchsize/d
|
||||
self.X.variance.gradient = kl_fctr*self.X.variance.gradient*self.stochastics.batchsize/d
|
||||
self.X.gradient += Xgrad
|
||||
|
||||
if self.missing_data or not self.stochastics:
|
||||
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
|
||||
elif self.stochastics:
|
||||
d = self.output_dim
|
||||
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d
|
||||
|
||||
self._Xgrad = self.X.gradient.copy()
|
||||
|
||||
def plot_latent(self, labels=None, which_indices=None,
|
||||
resolution=50, ax=None, marker='o', s=40,
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2015 James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from ..core import GP
|
||||
from ..models import GPLVM
|
||||
from ..mappings import *
|
||||
from . import GPLVM
|
||||
from .. import mappings
|
||||
|
||||
|
||||
class BCGPLVM(GPLVM):
|
||||
|
|
@ -16,33 +16,31 @@ class BCGPLVM(GPLVM):
|
|||
:type Y: np.ndarray
|
||||
:param input_dim: latent dimensionality
|
||||
:type input_dim: int
|
||||
:param init: initialisation method for the latent space
|
||||
:type init: 'PCA'|'random'
|
||||
:param mapping: mapping for back constraint
|
||||
:type mapping: GPy.core.Mapping object
|
||||
|
||||
"""
|
||||
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None):
|
||||
def __init__(self, Y, input_dim, kernel=None, mapping=None):
|
||||
|
||||
|
||||
if mapping is None:
|
||||
mapping = Kernel(X=Y, output_dim=input_dim)
|
||||
mapping = mappings.MLP(input_dim=Y.shape[1],
|
||||
output_dim=input_dim,
|
||||
hidden_dim=10)
|
||||
else:
|
||||
assert mapping.input_dim==Y.shape[1], "mapping input dim does not work for Y dimension"
|
||||
assert mapping.output_dim==input_dim, "mapping output dim does not work for self.input_dim"
|
||||
GPLVM.__init__(self, Y, input_dim, X=mapping.f(Y), kernel=kernel, name="bcgplvm")
|
||||
self.unlink_parameter(self.X)
|
||||
self.mapping = mapping
|
||||
GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y)
|
||||
self.X = self.mapping.f(self.likelihood.Y)
|
||||
self.link_parameter(self.mapping)
|
||||
|
||||
def _get_param_names(self):
|
||||
return self.mapping._get_param_names() + GP._get_param_names(self)
|
||||
self.X = self.mapping.f(self.Y)
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.mapping._get_params(), GP._get_params(self)))
|
||||
def parameters_changed(self):
|
||||
self.X = self.mapping.f(self.Y)
|
||||
GP.parameters_changed(self)
|
||||
Xgradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
|
||||
self.mapping.update_gradients(Xgradient, self.Y)
|
||||
|
||||
def _set_params(self, x):
|
||||
self.mapping._set_params(x[:self.mapping.num_params])
|
||||
self.X = self.mapping.f(self.likelihood.Y)
|
||||
GP._set_params(self, x[self.mapping.num_params:])
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
dL_df = self.kern.gradients_X(self.dL_dK, self.X)
|
||||
dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
|
||||
return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))
|
||||
|
||||
|
|
|
|||
|
|
@ -16,6 +16,8 @@ class GPHeteroscedasticRegression(GP):
|
|||
:param X: input observations
|
||||
:param Y: observed values
|
||||
:param kernel: a GPy kernel, defaults to rbf
|
||||
|
||||
NB: This model does not infer the noise variance outside the training set
|
||||
"""
|
||||
def __init__(self, X, Y, kernel=None, Y_metadata=None):
|
||||
|
||||
|
|
@ -30,10 +32,7 @@ class GPHeteroscedasticRegression(GP):
|
|||
kernel = kern.RBF(X.shape[1])
|
||||
|
||||
#Likelihood
|
||||
#likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in range(Ny)]
|
||||
noise_terms = np.unique(Y_metadata['output_index'].flatten())
|
||||
likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in noise_terms]
|
||||
likelihood = likelihoods.MixedNoise(likelihoods_list=likelihoods_list)
|
||||
likelihood = likelihoods.HeteroscedasticGaussian(Y_metadata)
|
||||
|
||||
super(GPHeteroscedasticRegression, self).__init__(X,Y,kernel,likelihood, Y_metadata=Y_metadata)
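A hedged usage sketch of the model after this change; the shape convention for Y_metadata['output_index'] follows the likelihood code above, everything else is illustrative:
```
# One independent Gaussian noise variance per training point.
import numpy as np
import GPy

N = 40
X = np.linspace(0., 10., N)[:, None]
Y = np.sin(X) + np.random.randn(N, 1) * np.linspace(0.05, 0.5, N)[:, None]

Y_metadata = {'output_index': np.arange(N)[:, None]}   # one noise term per point

m = GPy.models.GPHeteroscedasticRegression(X, Y, Y_metadata=Y_metadata)
m.optimize()
mu, var = m.predict(X, Y_metadata=Y_metadata)  # noise is only defined at training inputs
```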
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# Copyright (c) 2014, James Hensman, Alan Saul
|
||||
# Distributed under the terms of the GNU General public License, see LICENSE.txt
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ..core.model import Model
|
||||
|
|
|
|||
|
|
@ -26,12 +26,12 @@ class GPRegression(GP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1.):
|
||||
def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1., mean_function=None):
|
||||
|
||||
if kernel is None:
|
||||
kernel = kern.RBF(X.shape[1])
|
||||
|
||||
likelihood = likelihoods.Gaussian(variance=noise_var)
|
||||
|
||||
super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)
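A short sketch of the new mean_function argument; using GPy.mappings.Linear as the parametric mean is an assumption for illustration:
```
import numpy as np
import GPy

X = np.random.rand(30, 1) * 10.
Y = 2. * X + np.sin(X) + 0.1 * np.random.randn(30, 1)

mf = GPy.mappings.Linear(input_dim=1, output_dim=1)       # learned linear trend
m = GPy.models.GPRegression(X, Y, GPy.kern.RBF(1), mean_function=mf)
m.optimize()
```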
|
||||
|
||||
|
|
|
|||
|
|
@ -1,20 +1,17 @@
|
|||
# Copyright (c) 2014, James Hensman, Alan Saul
|
||||
# Distributed under the terms of the GNU General public License, see LICENSE.txt
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
from scipy.special import erf
|
||||
from ..core.model import Model
|
||||
from ..core import GP
|
||||
from ..core.parameterization import ObsAr
|
||||
from .. import kern
|
||||
from ..core.parameterization.param import Param
|
||||
from ..util.linalg import pdinv
|
||||
from ..likelihoods import Gaussian
|
||||
from ..inference.latent_function_inference import VarGauss
|
||||
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
|
||||
class GPVariationalGaussianApproximation(Model):
|
||||
class GPVariationalGaussianApproximation(GP):
|
||||
"""
|
||||
The Variational Gaussian Approximation revisited
|
||||
|
||||
|
|
@ -26,70 +23,14 @@ class GPVariationalGaussianApproximation(Model):
|
|||
pages = {786--792},
|
||||
}
|
||||
"""
|
||||
def __init__(self, X, Y, kernel, likelihood=None, Y_metadata=None):
|
||||
Model.__init__(self,'Variational GP')
|
||||
if likelihood is None:
|
||||
likelihood = Gaussian()
|
||||
# accept the construction arguments
|
||||
self.X = ObsAr(X)
|
||||
self.Y = Y
|
||||
self.num_data, self.input_dim = self.X.shape
|
||||
self.Y_metadata = Y_metadata
|
||||
def __init__(self, X, Y, kernel, likelihood, Y_metadata=None):
|
||||
|
||||
self.kern = kernel
|
||||
self.likelihood = likelihood
|
||||
self.link_parameter(self.kern)
|
||||
self.link_parameter(self.likelihood)
|
||||
num_data = Y.shape[0]
|
||||
self.alpha = Param('alpha', np.zeros((num_data,1))) # only one latent fn for now.
|
||||
self.beta = Param('beta', np.ones(num_data))
|
||||
|
||||
inf = VarGauss(self.alpha, self.beta)
|
||||
super(GPVariationalGaussianApproximation, self).__init__(X, Y, kernel, likelihood, name='VarGP', inference_method=inf)
|
||||
|
||||
self.alpha = Param('alpha', np.zeros((self.num_data,1))) # only one latent fn for now.
|
||||
self.beta = Param('beta', np.ones(self.num_data))
|
||||
self.link_parameter(self.alpha)
|
||||
self.link_parameter(self.beta)
|
||||
|
||||
def log_likelihood(self):
|
||||
return self._log_lik
|
||||
|
||||
def parameters_changed(self):
|
||||
K = self.kern.K(self.X)
|
||||
m = K.dot(self.alpha)
|
||||
KB = K*self.beta[:, None]
|
||||
BKB = KB*self.beta[None, :]
|
||||
A = np.eye(self.num_data) + BKB
|
||||
Ai, LA, _, Alogdet = pdinv(A)
|
||||
Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :] # posterior covariance: need full matrix for gradients
|
||||
var = np.diag(Sigma).reshape(-1,1)
|
||||
|
||||
F, dF_dm, dF_dv, dF_dthetaL = self.likelihood.variational_expectations(self.Y, m, var, Y_metadata=self.Y_metadata)
|
||||
self.likelihood.gradient = dF_dthetaL.sum(1).sum(1)
|
||||
dF_da = np.dot(K, dF_dm)
|
||||
SigmaB = Sigma*self.beta
|
||||
dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
|
||||
KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + np.sum(m*self.alpha))
|
||||
dKL_da = m
|
||||
A_A2 = Ai - Ai.dot(Ai)
|
||||
dKL_db = np.diag(np.dot(KB.T, A_A2))
|
||||
self._log_lik = F.sum() - KL
|
||||
self.alpha.gradient = dF_da - dKL_da
|
||||
self.beta.gradient = dF_db - dKL_db
|
||||
|
||||
# K-gradients
|
||||
dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
|
||||
tmp = Ai*self.beta[:, None]/self.beta[None, :]
|
||||
dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
|
||||
self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
|
||||
|
||||
def _raw_predict(self, Xnew):
|
||||
"""
|
||||
Predict the function(s) at the new point(s) Xnew.
|
||||
|
||||
:param Xnew: The points at which to make a prediction
|
||||
:type Xnew: np.ndarray, Nnew x self.input_dim
|
||||
"""
|
||||
Wi, _, _, _ = pdinv(self.kern.K(self.X) + np.diag(self.beta**-2))
|
||||
Kux = self.kern.K(self.X, Xnew)
|
||||
mu = np.dot(Kux.T, self.alpha)
|
||||
WiKux = np.dot(Wi, Kux)
|
||||
Kxx = self.kern.Kdiag(Xnew)
|
||||
var = Kxx - np.sum(WiKux*Kux, 0)
|
||||
|
||||
return mu, var.reshape(-1,1)
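In maths, _raw_predict above computes the usual Gaussian conditional with the variational term diag(beta^-2) in place of a homoscedastic noise variance (a restatement of the code, not an addition):
```
\mu_* = K_{*X}\,\alpha,
\qquad
\sigma^2_* = k_{**} \;-\; \operatorname{diag}\!\Big(K_{*X}\,
   \big(K_{XX} + \operatorname{diag}(\beta^{-2})\big)^{-1} K_{X*}\Big)
```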
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ class GPLVM(GP):
|
|||
likelihood = Gaussian()
|
||||
|
||||
super(GPLVM, self).__init__(X, Y, kernel, likelihood, name='GPLVM')
|
||||
|
||||
self.X = Param('latent_mean', X)
|
||||
self.link_parameter(self.X, index=0)
|
||||
|
||||
|
|
@ -43,27 +44,30 @@ class GPLVM(GP):
|
|||
super(GPLVM, self).parameters_changed()
|
||||
self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
|
||||
|
||||
def jacobian(self,X):
|
||||
J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
|
||||
for i in range(self.output_dim):
|
||||
J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X)
|
||||
return J
|
||||
#def jacobian(self,X):
|
||||
# J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
|
||||
# for i in range(self.output_dim):
|
||||
# J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X)
|
||||
# return J
|
||||
|
||||
def magnification(self,X):
|
||||
target=np.zeros(X.shape[0])
|
||||
#J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
|
||||
J = self.jacobian(X)
|
||||
for i in range(X.shape[0]):
|
||||
target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
|
||||
return target
|
||||
#def magnification(self,X):
|
||||
# target=np.zeros(X.shape[0])
|
||||
# #J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
|
||||
## J = self.jacobian(X)
|
||||
# for i in range(X.shape[0]):
|
||||
# target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
|
||||
# return target
|
||||
|
||||
def plot(self):
|
||||
assert self.likelihood.Y.shape[1] == 2
|
||||
pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet) # @UndefinedVariable
|
||||
assert self.Y.shape[1] == 2, "too high dimensional to plot. Try plot_latent"
|
||||
from matplotlib import pyplot as plt
|
||||
plt.scatter(self.Y[:, 0],
|
||||
self.Y[:, 1],
|
||||
40, self.X[:, 0].copy(),
|
||||
linewidth=0, cmap=plt.cm.jet)
|
||||
Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
|
||||
mu, _ = self.predict(Xnew)
|
||||
import pylab as pb
|
||||
pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
|
||||
plt.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
|
||||
|
||||
def plot_latent(self, labels=None, which_indices=None,
|
||||
resolution=50, ax=None, marker='o', s=40,
|
||||
|
|
@ -78,6 +82,3 @@ class GPLVM(GP):
|
|||
resolution, ax, marker, s,
|
||||
fignum, False, legend,
|
||||
plot_limits, aspect, updates, **kwargs)
|
||||
|
||||
def plot_magnification(self, *args, **kwargs):
|
||||
return util.plot_latent.plot_magnification(self, *args, **kwargs)
|
||||
|
|
|
|||
|
|
@ -251,7 +251,7 @@ class HessianChecker(GradientChecker):
|
|||
print(grad_string)
|
||||
|
||||
if plot:
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
fig, axes = pb.subplots(2, 2)
|
||||
max_lim = numpy.max(numpy.vstack((analytic_hess, numeric_hess)))
|
||||
min_lim = numpy.min(numpy.vstack((analytic_hess, numeric_hess)))
|
||||
|
|
|
|||
|
|
@ -170,20 +170,19 @@ class MRD(BayesianGPLVMMiniBatch):
|
|||
self._log_marginal_likelihood += b._log_marginal_likelihood
|
||||
|
||||
self.logger.info('working on im <{}>'.format(hex(id(i))))
|
||||
self.Z.gradient[:] += b.full_values['Zgrad']
|
||||
grad_dict = b.full_values
|
||||
self.Z.gradient[:] += b.Z.gradient#full_values['Zgrad']
|
||||
#grad_dict = b.full_values
|
||||
|
||||
if self.has_uncertain_inputs():
|
||||
self.X.mean.gradient += grad_dict['meangrad']
|
||||
self.X.variance.gradient += grad_dict['vargrad']
|
||||
self.X.gradient += b._Xgrad
|
||||
else:
|
||||
self.X.gradient += grad_dict['Xgrad']
|
||||
self.X.gradient += b._Xgrad
|
||||
|
||||
if self.has_uncertain_inputs():
|
||||
# update for the KL divergence
|
||||
self.variational_prior.update_gradients_KL(self.X)
|
||||
self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
|
||||
pass
|
||||
#if self.has_uncertain_inputs():
|
||||
# # update for the KL divergence
|
||||
# self.variational_prior.update_gradients_KL(self.X)
|
||||
# self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
|
||||
# pass
|
||||
|
||||
def log_likelihood(self):
|
||||
return self._log_marginal_likelihood
|
||||
|
|
|
|||
|
|
@ -63,10 +63,10 @@ class SparseGPMiniBatch(SparseGP):
|
|||
|
||||
if stochastic and missing_data:
|
||||
self.missing_data = True
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data)
|
||||
elif stochastic and not missing_data:
|
||||
self.missing_data = False
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data)
|
||||
elif missing_data:
|
||||
self.missing_data = True
|
||||
self.stochastics = SparseGPMissing(self)
|
||||
|
|
@ -80,7 +80,7 @@ class SparseGPMiniBatch(SparseGP):
|
|||
def has_uncertain_inputs(self):
|
||||
return isinstance(self.X, VariationalPosterior)
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kwargs):
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None, **kwargs):
|
||||
"""
|
||||
This is the standard part, which usually belongs in parameters_changed.
|
||||
|
||||
|
|
@ -99,47 +99,13 @@ class SparseGPMiniBatch(SparseGP):
|
|||
like them into this dictionary for inner use of the indices inside the
|
||||
algorithm.
|
||||
"""
|
||||
try:
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, **kwargs)
|
||||
except:
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata)
|
||||
current_values = {}
|
||||
likelihood.update_gradients(grad_dict['dL_dthetaL'])
|
||||
current_values['likgrad'] = likelihood.gradient.copy()
|
||||
if subset_indices is None:
|
||||
subset_indices = {}
|
||||
if isinstance(X, VariationalPosterior):
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
kern.update_gradients_full(dL_dKmm, Z, None)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_expectations(variational_posterior=X,
|
||||
Z=Z,
|
||||
dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
|
||||
#gradients wrt Z
|
||||
current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
|
||||
current_values['Zgrad'] += kern.gradients_Z_expectations(
|
||||
grad_dict['dL_dpsi0'],
|
||||
grad_dict['dL_dpsi1'],
|
||||
grad_dict['dL_dpsi2'],
|
||||
Z=Z,
|
||||
variational_posterior=X)
|
||||
if psi2 is None:
|
||||
psi2_sum_n = None
|
||||
else:
|
||||
#gradients wrt kernel
|
||||
kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
#gradients wrt Z
|
||||
current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
|
||||
current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
|
||||
psi2_sum_n = psi2.sum(axis=0)
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm,
|
||||
dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs)
|
||||
return posterior, log_marginal_likelihood, grad_dict
|
||||
|
||||
def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
|
||||
"""
|
||||
|
|
@ -173,7 +139,10 @@ class SparseGPMiniBatch(SparseGP):
|
|||
else:
|
||||
index = slice(None)
|
||||
if key in full_values:
|
||||
full_values[key][index] += current_values[key]
|
||||
try:
|
||||
full_values[key][index] += current_values[key]
|
||||
except:
|
||||
full_values[key] += current_values[key]
|
||||
else:
|
||||
full_values[key] = current_values[key]
|
||||
|
||||
|
|
@ -192,9 +161,41 @@ class SparseGPMiniBatch(SparseGP):
|
|||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
self.likelihood.gradient = full_values['likgrad']
|
||||
self.kern.gradient = full_values['kerngrad']
|
||||
self.Z.gradient = full_values['Zgrad']
|
||||
if self.has_uncertain_inputs():
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = full_values['dL_dKmm']
|
||||
self.kern.update_gradients_full(dL_dKmm, self.Z, None)
|
||||
kgrad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_expectations(
|
||||
variational_posterior=self.X,
|
||||
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
|
||||
dL_dpsi1=full_values['dL_dpsi1'],
|
||||
dL_dpsi2=full_values['dL_dpsi2'])
|
||||
self.kern.gradient += kgrad
|
||||
|
||||
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
|
||||
self.Z.gradient += self.kern.gradients_Z_expectations(
|
||||
variational_posterior=self.X,
|
||||
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
|
||||
dL_dpsi1=full_values['dL_dpsi1'],
|
||||
dL_dpsi2=full_values['dL_dpsi2'])
|
||||
else:
|
||||
#gradients wrt kernel
|
||||
self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X)
|
||||
kgrad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(full_values['dL_dKnm'], self.X, self.Z)
|
||||
kgrad += self.kern.gradient
|
||||
self.kern.update_gradients_full(full_values['dL_dKmm'], self.Z, None)
|
||||
self.kern.gradient += kgrad
|
||||
#kgrad += self.kern.gradient
|
||||
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(full_values['dL_dKmm'], self.Z)
|
||||
self.Z.gradient += self.kern.gradients_X(full_values['dL_dKnm'].T, self.Z, self.X)
|
||||
|
||||
self.likelihood.update_gradients(full_values['dL_dthetaL'])
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
"""
|
||||
|
|
@ -209,7 +210,15 @@ class SparseGPMiniBatch(SparseGP):
|
|||
to initialize the gradients for the mean and the variance in order to
|
||||
have the full gradient for indexing)
|
||||
"""
|
||||
return {}
|
||||
retd = dict(dL_dKmm=np.zeros((self.Z.shape[0], self.Z.shape[0])))
|
||||
if self.has_uncertain_inputs():
|
||||
retd.update(dict(dL_dpsi0=np.zeros(self.X.shape[0]),
|
||||
dL_dpsi1=np.zeros((self.X.shape[0], self.Z.shape[0])),
|
||||
dL_dpsi2=np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0]))))
|
||||
else:
|
||||
retd.update({'dL_dKdiag': np.zeros(self.X.shape[0]),
|
||||
'dL_dKnm': np.zeros((self.X.shape[0], self.Z.shape[0]))})
|
||||
return retd
|
||||
|
||||
def _outer_loop_for_missing_data(self):
|
||||
Lm = None
|
||||
|
|
@ -231,28 +240,36 @@ class SparseGPMiniBatch(SparseGP):
|
|||
print(message, end=' ')
|
||||
|
||||
for d, ninan in self.stochastics.d:
|
||||
|
||||
if not self.stochastics:
|
||||
print(' '*(len(message)) + '\r', end=' ')
|
||||
message = m_f(d)
|
||||
print(message, end=' ')
|
||||
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, current_values, value_indices = self._inner_parameters_changed(
|
||||
psi0ni = self.psi0[ninan]
|
||||
psi1ni = self.psi1[ninan]
|
||||
if self.has_uncertain_inputs():
|
||||
psi2ni = self.psi2[ninan]
|
||||
value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, dL_dpsi2=ninan)
|
||||
else:
|
||||
psi2ni = None
|
||||
value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan)
|
||||
|
||||
posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed(
|
||||
self.kern, self.X[ninan],
|
||||
self.Z, self.likelihood,
|
||||
self.Y_normalized[ninan][:, d], self.Y_metadata,
|
||||
Lm, dL_dKmm,
|
||||
subset_indices=dict(outputs=d, samples=ninan))
|
||||
psi0=psi0ni, psi1=psi1ni, psi2=psi2ni)
|
||||
|
||||
self._inner_take_over_or_update(self.full_values, current_values, value_indices)
|
||||
self._inner_values_update(current_values)
|
||||
# Fill out the full values by adding in the apporpriate grad_dict
|
||||
# values
|
||||
self._inner_take_over_or_update(self.full_values, grad_dict, value_indices)
|
||||
self._inner_values_update(grad_dict) # What is this for? -> MRD
|
||||
|
||||
Lm = posterior.K_chol
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None]
|
||||
woodbury_vector[:, d] = posterior.woodbury_vector
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
|
||||
if not self.stochastics:
|
||||
print('')
|
||||
|
||||
|
|
@ -260,10 +277,10 @@ class SparseGPMiniBatch(SparseGP):
|
|||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
self._outer_values_update(self.full_values)
|
||||
if self.has_uncertain_inputs():
|
||||
self.kern.return_psi2_n = False
|
||||
|
||||
def _outer_loop_without_missing_data(self):
|
||||
self._log_marginal_likelihood = 0
|
||||
|
||||
if self.posterior is None:
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
|
|
@ -271,17 +288,16 @@ class SparseGPMiniBatch(SparseGP):
|
|||
woodbury_inv = self.posterior._woodbury_inv
|
||||
woodbury_vector = self.posterior._woodbury_vector
|
||||
|
||||
d = self.stochastics.d
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, self.full_values, _ = self._inner_parameters_changed(
|
||||
d = self.stochastics.d[0][0]
|
||||
posterior, log_marginal_likelihood, grad_dict= self._inner_parameters_changed(
|
||||
self.kern, self.X,
|
||||
self.Z, self.likelihood,
|
||||
self.Y_normalized[:, d], self.Y_metadata)
|
||||
self.grad_dict = grad_dict
|
||||
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
self._log_marginal_likelihood = log_marginal_likelihood
|
||||
|
||||
self._outer_values_update(self.full_values)
|
||||
self._outer_values_update(self.grad_dict)
|
||||
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
|
||||
woodbury_vector[:, d] = posterior.woodbury_vector
|
||||
|
|
@ -290,10 +306,23 @@ class SparseGPMiniBatch(SparseGP):
|
|||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
|
||||
def parameters_changed(self):
|
||||
#Compute the psi statistics for N once, but don't sum out N in psi2
|
||||
if self.has_uncertain_inputs():
|
||||
#psi0 = ObsAr(self.kern.psi0(self.Z, self.X))
|
||||
#psi1 = ObsAr(self.kern.psi1(self.Z, self.X))
|
||||
#psi2 = ObsAr(self.kern.psi2(self.Z, self.X))
|
||||
self.psi0 = self.kern.psi0(self.Z, self.X)
|
||||
self.psi1 = self.kern.psi1(self.Z, self.X)
|
||||
self.psi2 = self.kern.psi2n(self.Z, self.X)
|
||||
else:
|
||||
self.psi0 = self.kern.Kdiag(self.X)
|
||||
self.psi1 = self.kern.K(self.X, self.Z)
|
||||
self.psi2 = None
|
||||
|
||||
if self.missing_data:
|
||||
self._outer_loop_for_missing_data()
|
||||
elif self.stochastics:
|
||||
self._outer_loop_without_missing_data()
|
||||
else:
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self._outer_values_update(self.full_values)
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self._outer_values_update(self.grad_dict)
|
||||
|
|
|
|||
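The parameters_changed hunk above computes the psi statistics once per call: psi0/psi1/psi2n for uncertain inputs, Kdiag/K otherwise. A minimal sketch of calling those kernel expectations directly; the toy sizes are purely illustrative, RBF and NormalPosterior are the standard GPy classes used elsewhere in this changeset:

import numpy as np
import GPy
from GPy.core.parameterization.variational import NormalPosterior

N, M, Q = 50, 10, 3
Z = np.random.randn(M, Q)                          # inducing inputs
qX = NormalPosterior(np.random.randn(N, Q),        # variational means
                     np.random.rand(N, Q) + 0.01)  # variational variances (positive)
kern = GPy.kern.RBF(Q, ARD=True)

psi0 = kern.psi0(Z, qX)   # shape (N,):     E_q[k(x_n, x_n)]
psi1 = kern.psi1(Z, qX)   # shape (N, M):   E_q[k(x_n, Z)]
psi2 = kern.psi2n(Z, qX)  # shape (N, M, M): per-point second moment, not summed over N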
|
|
@ -4,4 +4,8 @@
|
|||
try:
|
||||
from . import matplot_dep
|
||||
except (ImportError, NameError):
|
||||
print('Fail to load GPy.plotting.matplot_dep.')
|
||||
# Matplotlib not available
|
||||
import warnings
|
||||
warnings.warn(ImportWarning("Matplotlib not available, install newest version of Matplotlib for plotting"))
|
||||
#sys.modules['matplotlib'] =
|
||||
#sys.modules[__name__+'.matplot_dep'] = ImportWarning("Matplotlib not available, install newest version of Matplotlib for plotting")
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
|
||||
import matplotlib as mpl
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
import sys
|
||||
#sys.path.append('/home/james/mlprojects/sitran_cluster/')
|
||||
#from switch_pylab_backend import *
|
||||
|
|
@ -159,7 +159,7 @@ cdict_Alu = {'red' :((0./5,colorsRGB['Aluminium1'][0]/256.,colorsRGB['Aluminium1
|
|||
# cmap_BGR = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_BGR,256)
|
||||
# cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256)
|
||||
if __name__=='__main__':
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
pb.figure()
|
||||
pb.pcolor(pb.rand(10,10),cmap=cmap_RB)
|
||||
pb.colorbar()
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@
|
|||
|
||||
|
||||
try:
|
||||
import Tango
|
||||
import pylab as pb
|
||||
#import Tango
|
||||
from matplotlib import pyplot as pb
|
||||
except:
|
||||
pass
|
||||
import numpy as np
|
||||
|
|
@ -17,11 +17,11 @@ def ax_default(fignum, ax):
|
|||
fig = ax.figure
|
||||
return fig, ax
|
||||
|
||||
def meanplot(x, mu, color=Tango.colorsHex['darkBlue'], ax=None, fignum=None, linewidth=2,**kw):
|
||||
def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
|
||||
_, axes = ax_default(fignum, ax)
|
||||
return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)
|
||||
|
||||
def gpplot(x, mu, lower, upper, edgecol=Tango.colorsHex['darkBlue'], fillcol=Tango.colorsHex['lightBlue'], ax=None, fignum=None, **kwargs):
|
||||
def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
|
||||
_, axes = ax_default(fignum, ax)
|
||||
|
||||
mu = mu.flatten()
|
||||
|
|
@ -47,6 +47,32 @@ def gpplot(x, mu, lower, upper, edgecol=Tango.colorsHex['darkBlue'], fillcol=Tan
|
|||
return plots
|
||||
|
||||
|
||||
def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
|
||||
_, axes = ax_default(fignum, ax)
|
||||
|
||||
mu = mu.flatten()
|
||||
x = x.flatten()
|
||||
lower = lower.flatten()
|
||||
upper = upper.flatten()
|
||||
|
||||
plots = []
|
||||
|
||||
if edgecol is None:
|
||||
edgecol='#3300FF'
|
||||
|
||||
if not 'alpha' in kwargs.keys():
|
||||
kwargs['alpha'] = 1.
|
||||
|
||||
|
||||
if not 'lw' in kwargs.keys():
|
||||
kwargs['lw'] = 1.
|
||||
|
||||
|
||||
plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
|
||||
plots[-1][0].remove()
|
||||
return plots
|
||||
|
||||
|
||||
def removeRightTicks(ax=None):
|
||||
ax = ax or pb.gca()
|
||||
for i, line in enumerate(ax.get_yticklines()):
|
||||
|
|
|
|||
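The new gperrors helper above draws vertical error bars from lower to upper around the posterior mean and removes the connecting line. A self-contained usage sketch; the toy data is an illustrative assumption, only the import path and call follow this changeset:

import numpy as np
from matplotlib import pyplot as plt
from GPy.plotting.matplot_dep.base_plots import gperrors

x = np.linspace(0, 1, 20)
mu = np.sin(2 * np.pi * x)
lower, upper = mu - 0.3, mu + 0.3      # e.g. 2.5% / 97.5% predictive quantiles

fig, ax = plt.subplots()
gperrors(x, mu, lower, upper, ax=ax)   # error bars centred on the mean at each x
plt.show()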
|
|
@ -9,7 +9,8 @@ import itertools
|
|||
try:
|
||||
import Tango
|
||||
from matplotlib.cm import get_cmap
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
from matplotlib import cm
|
||||
except:
|
||||
pass
|
||||
|
||||
|
|
@ -114,7 +115,7 @@ def plot_latent(model, labels=None, which_indices=None,
|
|||
|
||||
# create a function which computes the shading of latent space according to the output variance
|
||||
def plot_function(x):
|
||||
Xtest_full = np.zeros((x.shape[0], model.X.shape[1]))
|
||||
Xtest_full = np.zeros((x.shape[0], X.shape[1]))
|
||||
Xtest_full[:, [input_1, input_2]] = x
|
||||
_, var = model.predict(Xtest_full, **predict_kwargs)
|
||||
var = var[:, :1]
|
||||
|
|
@ -137,7 +138,7 @@ def plot_latent(model, labels=None, which_indices=None,
|
|||
view = ImshowController(ax, plot_function,
|
||||
(xmin, ymin, xmax, ymax),
|
||||
resolution, aspect=aspect, interpolation='bilinear',
|
||||
cmap=pb.cm.binary, **imshow_kwargs)
|
||||
cmap=cm.binary, **imshow_kwargs)
|
||||
|
||||
# make sure labels are in order of input:
|
||||
labels = np.asarray(labels)
|
||||
|
|
@ -192,17 +193,18 @@ def plot_latent(model, labels=None, which_indices=None,
|
|||
|
||||
if updates:
|
||||
try:
|
||||
ax.figure.canvas.show()
|
||||
fig.canvas.show()
|
||||
except Exception as e:
|
||||
print("Could not invoke show: {}".format(e))
|
||||
raw_input('Enter to continue')
|
||||
view.deactivate()
|
||||
#raw_input('Enter to continue')
|
||||
return view
|
||||
return ax
|
||||
|
||||
def plot_magnification(model, labels=None, which_indices=None,
|
||||
resolution=60, ax=None, marker='o', s=40,
|
||||
fignum=None, plot_inducing=False, legend=True,
|
||||
aspect='auto', updates=False):
|
||||
plot_limits=None,
|
||||
aspect='auto', updates=False, mean=True, covariance=True, kern=None):
|
||||
"""
|
||||
:param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc)
|
||||
:param resolution: the resolution of the grid on which to evaluate the predictive variance
|
||||
|
|
@ -210,6 +212,8 @@ def plot_magnification(model, labels=None, which_indices=None,
|
|||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
else:
|
||||
fig = ax.figure
|
||||
Tango.reset()
|
||||
|
||||
if labels is None:
|
||||
|
|
@ -217,19 +221,90 @@ def plot_magnification(model, labels=None, which_indices=None,
|
|||
|
||||
input_1, input_2 = most_significant_input_dimensions(model, which_indices)
|
||||
|
||||
# first, plot the output variance as a function of the latent space
|
||||
Xtest, xx, yy, xmin, xmax = x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution)
|
||||
Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
|
||||
#fetch the data points X that we'd like to plot
|
||||
X = model.X
|
||||
if isinstance(X, VariationalPosterior):
|
||||
X = X.mean
|
||||
else:
|
||||
X = X
|
||||
|
||||
if X.shape[0] > 1000:
|
||||
print("Warning: subsampling X, as it has more samples then 1000. X.shape={!s}".format(X.shape))
|
||||
subsample = np.random.choice(X.shape[0], size=1000, replace=False)
|
||||
X = X[subsample]
|
||||
labels = labels[subsample]
|
||||
#=======================================================================
|
||||
# <<<WORK IN PROGRESS>>>
|
||||
# <<<DO NOT DELETE>>>
|
||||
# plt.close('all')
|
||||
# fig, ax = plt.subplots(1,1)
|
||||
# from GPy.plotting.matplot_dep.dim_reduction_plots import most_significant_input_dimensions
|
||||
# import matplotlib.patches as mpatches
|
||||
# i1, i2 = most_significant_input_dimensions(m, None)
|
||||
# xmin, xmax = 100, -100
|
||||
# ymin, ymax = 100, -100
|
||||
# legend_handles = []
|
||||
#
|
||||
# X = m.X.mean[:, [i1, i2]]
|
||||
# X = m.X.variance[:, [i1, i2]]
|
||||
#
|
||||
# xmin = X[:,0].min(); xmax = X[:,0].max()
|
||||
# ymin = X[:,1].min(); ymax = X[:,1].max()
|
||||
# range_ = [[xmin, xmax], [ymin, ymax]]
|
||||
# ul = np.unique(labels)
|
||||
#
|
||||
# for i, l in enumerate(ul):
|
||||
# #cdict = dict(red =[(0., colors[i][0], colors[i][0]), (1., colors[i][0], colors[i][0])],
|
||||
# # green=[(0., colors[i][0], colors[i][1]), (1., colors[i][1], colors[i][1])],
|
||||
# # blue =[(0., colors[i][0], colors[i][2]), (1., colors[i][2], colors[i][2])],
|
||||
# # alpha=[(0., 0., .0), (.5, .5, .5), (1., .5, .5)])
|
||||
# #cmap = LinearSegmentedColormap('{}'.format(l), cdict)
|
||||
# cmap = LinearSegmentedColormap.from_list('cmap_{}'.format(str(l)), [colors[i], colors[i]], 255)
|
||||
# cmap._init()
|
||||
# #alphas = .5*(1+scipy.special.erf(np.linspace(-2,2, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3))
|
||||
# alphas = (scipy.special.erf(np.linspace(0,2.4, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3))
|
||||
# cmap._lut[:, -1] = alphas
|
||||
# print l
|
||||
# x, y = X[labels==l].T
|
||||
#
|
||||
# heatmap, xedges, yedges = np.histogram2d(x, y, bins=300, range=range_)
|
||||
# #heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
|
||||
#
|
||||
# im = ax.imshow(heatmap, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], cmap=cmap, aspect='auto', interpolation='nearest', label=str(l))
|
||||
# legend_handles.append(mpatches.Patch(color=colors[i], label=l))
|
||||
# ax.set_xlim(xmin, xmax)
|
||||
# ax.set_ylim(ymin, ymax)
|
||||
# plt.legend(legend_handles, [l.get_label() for l in legend_handles])
|
||||
# plt.draw()
|
||||
# plt.show()
|
||||
#=======================================================================
|
||||
|
||||
#Create an ImshowController that can re-plot the latent space shading at a good resolution
|
||||
if plot_limits is None:
|
||||
xmin, ymin = X[:, [input_1, input_2]].min(0)
|
||||
xmax, ymax = X[:, [input_1, input_2]].max(0)
|
||||
x_r, y_r = xmax-xmin, ymax-ymin
|
||||
xmin -= .1*x_r
|
||||
xmax += .1*x_r
|
||||
ymin -= .1*y_r
|
||||
ymax += .1*y_r
|
||||
else:
|
||||
try:
|
||||
xmin, xmax, ymin, ymax = plot_limits
|
||||
except (TypeError, ValueError) as e:
|
||||
raise e.__class__("Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits))
|
||||
|
||||
|
||||
def plot_function(x):
|
||||
Xtest_full = np.zeros((x.shape[0], X.shape[1]))
|
||||
Xtest_full[:, [input_1, input_2]] = x
|
||||
mf=model.magnification(Xtest_full)
|
||||
mf = model.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance)
|
||||
return mf
|
||||
|
||||
view = ImshowController(ax, plot_function,
|
||||
tuple(model.X.min(0)[:, [input_1, input_2]]) + tuple(model.X.max(0)[:, [input_1, input_2]]),
|
||||
(xmin, ymin, xmax, ymax),
|
||||
resolution, aspect=aspect, interpolation='bilinear',
|
||||
cmap=pb.cm.gray)
|
||||
cmap=cm.get_cmap('Greys'))
|
||||
|
||||
# make sure labels are in order of input:
|
||||
ulabels = []
|
||||
|
|
@ -245,17 +320,17 @@ def plot_magnification(model, labels=None, which_indices=None,
|
|||
elif type(ul) is np.int64:
|
||||
this_label = 'class %i' % ul
|
||||
else:
|
||||
this_label = 'class %i' % i
|
||||
this_label = unicode(ul)
|
||||
m = marker.next()
|
||||
|
||||
index = np.nonzero(labels == ul)[0]
|
||||
if model.input_dim == 1:
|
||||
x = model.X[index, input_1]
|
||||
x = X[index, input_1]
|
||||
y = np.zeros(index.size)
|
||||
else:
|
||||
x = model.X[index, input_1]
|
||||
y = model.X[index, input_2]
|
||||
ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label)
|
||||
x = X[index, input_1]
|
||||
y = X[index, input_2]
|
||||
ax.scatter(x, y, marker=m, s=s, c=Tango.nextMedium(), label=this_label, linewidth=.2, edgecolor='k', alpha=.9)
|
||||
|
||||
ax.set_xlabel('latent dimension %i' % input_1)
|
||||
ax.set_ylabel('latent dimension %i' % input_2)
|
||||
|
|
@ -263,19 +338,29 @@ def plot_magnification(model, labels=None, which_indices=None,
|
|||
if not np.all(labels == 1.) and legend:
|
||||
ax.legend(loc=0, numpoints=1)
|
||||
|
||||
ax.set_xlim(xmin[0], xmax[0])
|
||||
ax.set_ylim(xmin[1], xmax[1])
|
||||
ax.grid(b=False) # remove the grid if present, it doesn't look good
|
||||
ax.set_aspect('auto') # set a nice aspect ratio
|
||||
ax.set_xlim((xmin, xmax))
|
||||
ax.set_ylim((ymin, ymax))
|
||||
|
||||
if plot_inducing:
|
||||
ax.plot(model.Z[:, input_1], model.Z[:, input_2], '^w')
|
||||
if plot_inducing and hasattr(model, 'Z'):
|
||||
Z = model.Z
|
||||
ax.scatter(Z[:, input_1], Z[:, input_2], c='w', s=18, marker="^", edgecolor='k', linewidth=.3, alpha=.7)
|
||||
|
||||
try:
|
||||
fig.canvas.draw()
|
||||
fig.tight_layout()
|
||||
fig.canvas.draw()
|
||||
except Exception as e:
|
||||
print("Could not invoke tight layout: {}".format(e))
|
||||
pass
|
||||
|
||||
if updates:
|
||||
fig.canvas.show()
|
||||
raw_input('Enter to continue')
|
||||
|
||||
pb.title('Magnification Factor')
|
||||
try:
|
||||
fig.canvas.draw()
|
||||
fig.canvas.show()
|
||||
except Exception as e:
|
||||
print("Could not invoke show: {}".format(e))
|
||||
#raw_input('Enter to continue')
|
||||
return view
|
||||
return ax
|
||||
|
||||
|
||||
|
|
@ -314,8 +399,8 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
|
|||
this_label = 'class %i' % i
|
||||
m = marker.next()
|
||||
index = np.nonzero(data_labels == ul)[0]
|
||||
x = model.X[index, input_1]
|
||||
y = model.X[index, input_2]
|
||||
x = X[index, input_1]
|
||||
y = X[index, input_2]
|
||||
ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
|
||||
|
||||
ax.set_xlabel('latent dimension %i' % input_1)
|
||||
|
|
@ -323,7 +408,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
|
|||
|
||||
controller = ImAnnotateController(ax,
|
||||
plot_function,
|
||||
tuple(model.X.min(0)[:, significant_dims]) + tuple(model.X.max(0)[:, significant_dims]),
|
||||
tuple(X.min(0)[:, significant_dims]) + tuple(X.max(0)[:, significant_dims]),
|
||||
resolution=resolution,
|
||||
aspect=aspect,
|
||||
cmap=get_cmap('jet'),
|
||||
|
|
|
|||
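plot_magnification above gains plot_limits handling and routes the shading through model.predict_magnification. A hedged usage sketch for a Bayesian GPLVM; the synthetic data and optimisation budget are illustrative assumptions, the keyword names follow the signature in this hunk:

import numpy as np
import GPy
from GPy.plotting.matplot_dep import dim_reduction_plots

Y = np.random.randn(60, 8)               # toy high-dimensional observations
labels = np.random.randint(0, 2, 60)     # optional class labels for colouring
m = GPy.models.BayesianGPLVM(Y, 3)
m.optimize(messages=False, max_iters=50)

ax = dim_reduction_plots.plot_magnification(m, labels=labels, resolution=40,
                                            plot_inducing=True, legend=True)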
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
try:
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
except:
|
||||
pass
|
||||
#import numpy as np
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
import Tango
|
||||
from matplotlib.textpath import TextPath
|
||||
from matplotlib.transforms import offset_copy
|
||||
|
|
|
|||
|
|
@ -9,6 +9,9 @@ class AxisEventController(object):
|
|||
def __init__(self, ax):
|
||||
self.ax = ax
|
||||
self.activate()
|
||||
def __del__(self):
|
||||
self.deactivate()
|
||||
return self
|
||||
def deactivate(self):
|
||||
for cb_class in self.ax.callbacks.callbacks.values():
|
||||
for cb_num in cb_class.keys():
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
import numpy as np
|
||||
try:
|
||||
import Tango
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
except:
|
||||
pass
|
||||
from base_plots import x_frame1D, x_frame2D
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
import numpy as np
|
||||
try:
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
from matplotlib.patches import Polygon
|
||||
from matplotlib.collections import PatchCollection
|
||||
#from matplotlib import cm
|
||||
|
|
|
|||
|
|
@ -1,25 +1,82 @@
|
|||
# Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
try:
|
||||
import Tango
|
||||
import pylab as pb
|
||||
except:
|
||||
pass
|
||||
import numpy as np
|
||||
from base_plots import gpplot, x_frame1D, x_frame2D
|
||||
from . import Tango
|
||||
from .base_plots import gpplot, x_frame1D, x_frame2D,gperrors
|
||||
from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
|
||||
from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
|
||||
from ...models.warped_gp import WarpedGP
|
||||
from scipy import sparse
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
|
||||
def plot_data(model, which_data_rows='all',
|
||||
which_data_ycols='all', visible_dims=None,
|
||||
fignum=None, ax=None, data_symbol='kx',mew=1.5):
|
||||
"""
|
||||
Plot the training data
|
||||
- For more than two input dimensions, use fixed_inputs to plot the data points with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data
|
||||
using which_data_rows and which_data_ycols.
|
||||
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
|
||||
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
:type which_data_ycols: 'all' or a list of integers
|
||||
:param visible_dims: an array specifying the input dimensions to plot (maximum two)
|
||||
:type visible_dims: a numpy array
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
"""
|
||||
#deal with optional arguments
|
||||
if which_data_rows == 'all':
|
||||
which_data_rows = slice(None)
|
||||
if which_data_ycols == 'all':
|
||||
which_data_ycols = np.arange(model.output_dim)
|
||||
|
||||
if ax is None:
|
||||
fig = plt.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
#data
|
||||
X = model.X
|
||||
Y = model.Y
|
||||
|
||||
#work out what the inputs are for plotting (1D or 2D)
|
||||
if visible_dims is None:
|
||||
visible_dims = np.arange(model.input_dim)
|
||||
assert visible_dims.size <= 2, "Visible inputs cannot be larger than two"
|
||||
free_dims = visible_dims
|
||||
plots = {}
|
||||
#one dimensional plotting
|
||||
if len(free_dims) == 1:
|
||||
|
||||
for d in which_data_ycols:
|
||||
plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=mew)
|
||||
|
||||
#2D plotting
|
||||
elif len(free_dims) == 2:
|
||||
|
||||
for d in which_data_ycols:
|
||||
plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40,
|
||||
Y[which_data_rows, d], cmap=plt.cm.jet, vmin=Y.min(), vmax=Y.max(), linewidth=0.)
|
||||
|
||||
else:
|
||||
raise NotImplementedError("Cannot define a frame with more than two input dimensions")
|
||||
return plots
|
||||
|
||||
|
||||
def plot_fit(model, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', fixed_inputs=[],
|
||||
levels=20, samples=0, fignum=None, ax=None, resolution=None,
|
||||
plot_raw=False,
|
||||
linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx',
|
||||
apply_link=False, samples_f=0, plot_uncertain_inputs=True, predict_kw=None):
|
||||
apply_link=False, samples_y=0, plot_uncertain_inputs=True, predict_kw=None, plot_training_data=True):
|
||||
"""
|
||||
Plot the posterior of the GP.
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
|
|
@ -37,25 +94,32 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
:type which_data_ycols: 'all' or a list of integers
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:param levels: for 2D plotting, the number of contour levels to use; if ax is None, a new figure is created
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot p(y*|y)
|
||||
:param samples: the number of a posteriori samples to plot p(f*|y)
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:type output: integer (first output is 0)
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param plot_raw: Whether to plot the raw function p(f|y)
|
||||
:type plot_raw: boolean
|
||||
:param linecol: color of line to plot.
|
||||
:type linecol:
|
||||
:type linecol: hex or color
|
||||
:param fillcol: color of fill
|
||||
:param levels: for 2D plotting, the number of contour levels to use; if ax is None, a new figure is created
|
||||
:param apply_link: apply the link function if plotting f (default false)
|
||||
:type fillcol: hex or color
|
||||
:param apply_link: apply the link function if plotting f (default false), as well as posterior samples if requested
|
||||
:type apply_link: boolean
|
||||
:param samples_f: the number of posteriori f samples to plot p(f*|y)
|
||||
:type samples_f: int
|
||||
:param samples_y: the number of posteriori f samples to plot p(y*|y)
|
||||
:type samples_y: int
|
||||
:param plot_uncertain_inputs: plot the uncertainty of the inputs as error bars if they have uncertainty (BGPLVM etc.)
|
||||
:type plot_uncertain_inputs: boolean
|
||||
:param predict_kw: keyword args for _raw_predict and predict functions if required
|
||||
:type predict_kw: dict
|
||||
:param plot_training_data: whether or not to plot the training points
|
||||
:type plot_training_data: boolean
|
||||
"""
|
||||
#deal with optional arguments
|
||||
if which_data_rows == 'all':
|
||||
|
|
@ -65,7 +129,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
#if len(which_data_ycols)==0:
|
||||
#raise ValueError('No data selected for plotting')
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
fig = plt.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
|
||||
|
|
@ -117,31 +181,38 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
Y_metadata = {'output_index': extra_data}
|
||||
else:
|
||||
Y_metadata['output_index'] = extra_data
|
||||
|
||||
if isinstance(model, WarpedGP):
|
||||
m, v = model.predict(Xgrid, full_cov=False, median=True, Y_metadata=Y_metadata, **predict_kw)
|
||||
#print np.concatenate((Xgrid, m), axis=1)
|
||||
else:
|
||||
m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
|
||||
lower, upper = model.predict_quantiles(Xgrid, Y_metadata=Y_metadata)
|
||||
fmu, fv = model._raw_predict(Xgrid, full_cov=False, **predict_kw)
|
||||
lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=Y_metadata)
|
||||
|
||||
|
||||
|
||||
for d in which_data_ycols:
|
||||
plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol)
|
||||
if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
|
||||
#if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
|
||||
if not plot_raw and plot_training_data:
|
||||
plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
|
||||
visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)
|
||||
|
||||
|
||||
#optionally plot some samples
|
||||
if samples: #NOTE not tested with fixed_inputs
|
||||
Ysim = model.posterior_samples(Xgrid, samples, Y_metadata=Y_metadata)
|
||||
print Ysim.shape
|
||||
print Xnew.shape
|
||||
for yi in Ysim.T:
|
||||
plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
|
||||
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
|
||||
|
||||
if samples_f: #NOTE not tested with fixed_inputs
|
||||
Fsim = model.posterior_samples_f(Xgrid, samples_f)
|
||||
Fsim = model.posterior_samples_f(Xgrid, samples)
|
||||
if apply_link:
|
||||
Fsim = model.likelihood.gp_link.transf(Fsim)
|
||||
for fi in Fsim.T:
|
||||
plots['posterior_samples_f'] = ax.plot(Xnew, fi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
|
||||
plots['posterior_samples'] = ax.plot(Xnew, fi[:,None], '#3300FF', linewidth=0.25)
|
||||
#ax.plot(Xnew, fi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
|
||||
|
||||
if samples_y: #NOTE not tested with fixed_inputs
|
||||
Ysim = model.posterior_samples(Xgrid, samples_y, Y_metadata=Y_metadata)
|
||||
for yi in Ysim.T:
|
||||
plots['posterior_samples_y'] = ax.scatter(Xnew, yi[:,None], s=5, c=Tango.colorsHex['darkBlue'], marker='o', alpha=0.5)
|
||||
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
|
||||
|
||||
|
||||
|
|
@ -206,8 +277,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
|
||||
for d in which_data_ycols:
|
||||
m_d = m[:,d].reshape(resolution, resolution).T
|
||||
plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
|
||||
if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
|
||||
plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=plt.cm.jet)
|
||||
#if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
|
||||
if not plot_raw and plot_training_data:
|
||||
plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
|
||||
|
||||
#set the limits of the plot to some sensible values
|
||||
ax.set_xlim(xmin[0], xmax[0])
|
||||
|
|
@ -272,3 +345,82 @@ def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True, X_
|
|||
return f_inputs
|
||||
else:
|
||||
return X
|
||||
|
||||
|
||||
def errorbars_trainset(model, which_data_rows='all',
|
||||
which_data_ycols='all', fixed_inputs=[],
|
||||
fignum=None, ax=None,
|
||||
linecol='red', data_symbol='kx',
|
||||
predict_kw=None, plot_training_data=True, **kwargs):
|
||||
|
||||
"""
|
||||
Plot the posterior error bars corresponding to the training data
|
||||
- For more than two input dimensions, use fixed_inputs to plot the data points with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data
|
||||
using which_data_rows and which_data_ycols.
|
||||
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
|
||||
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
:type which_data_ycols: 'all' or a list of integers
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:param plot_training_data: whether or not to plot the training points
|
||||
:type plot_training_data: boolean
|
||||
"""
|
||||
|
||||
#deal with optional arguments
|
||||
if which_data_rows == 'all':
|
||||
which_data_rows = slice(None)
|
||||
if which_data_ycols == 'all':
|
||||
which_data_ycols = np.arange(model.output_dim)
|
||||
|
||||
if ax is None:
|
||||
fig = plt.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
X = model.X
|
||||
Y = model.Y
|
||||
|
||||
if predict_kw is None:
|
||||
predict_kw = {}
|
||||
|
||||
|
||||
#work out what the inputs are for plotting (1D or 2D)
|
||||
fixed_dims = np.array([i for i,v in fixed_inputs])
|
||||
free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims)
|
||||
plots = {}
|
||||
|
||||
#one dimensional plotting
|
||||
if len(free_dims) == 1:
|
||||
|
||||
m, v = model.predict(X, full_cov=False, Y_metadata=model.Y_metadata, **predict_kw)
|
||||
fmu, fv = model._raw_predict(X, full_cov=False, **predict_kw)
|
||||
lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=model.Y_metadata)
|
||||
|
||||
for d in which_data_ycols:
|
||||
plots['gperrors'] = gperrors(X, m[:, d], lower[:, d], upper[:, d], edgecol=linecol, ax=ax, fignum=fignum, **kwargs )
|
||||
if plot_training_data:
|
||||
plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
|
||||
visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)
|
||||
|
||||
|
||||
#set the limits of the plot to some sensible values
|
||||
ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
|
||||
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
|
||||
ax.set_xlim(X[:,free_dims].min(), X[:,free_dims].max())
|
||||
ax.set_ylim(ymin, ymax)
|
||||
|
||||
|
||||
elif len(free_dims) == 2:
|
||||
raise NotImplementedError("Not implemented yet")
|
||||
|
||||
|
||||
else:
|
||||
raise NotImplementedError("Cannot define a frame with more than two input dimensions")
|
||||
return plots
|
||||
|
|
|
|||
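plot_fit now separates samples_f (latent function draws) from samples_y (noisy output draws) and accepts plot_training_data, while errorbars_trainset plots posterior error bars at the training inputs. A minimal 1D regression sketch; the module name models_plots and the toy data are assumptions, the keyword names follow the signatures in this file:

import numpy as np
import GPy
from GPy.plotting.matplot_dep import models_plots  # assumed module name for this file

X = np.random.uniform(-3., 3., (40, 1))
Y = np.sin(X) + np.random.randn(40, 1) * 0.1
m = GPy.models.GPRegression(X, Y)
m.optimize()

models_plots.plot_fit(m, samples_f=3, samples_y=20, plot_training_data=True)
models_plots.errorbars_trainset(m, linecol='red')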
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
import numpy as np
|
||||
try:
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
except:
|
||||
pass
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
|
||||
|
||||
def plot(model, ax=None, fignum=None, Z_height=None, **kwargs):
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import pylab as pb, numpy as np
|
||||
from matplotlib import pyplot as pb
import numpy as np
|
||||
|
||||
def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
|
||||
"""
|
||||
|
|
|
|||
109
GPy/testing/bgplvm_minibatch_tests.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
'''
|
||||
Created on 4 Sep 2015
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
import unittest
|
||||
import numpy as np
|
||||
import GPy
|
||||
|
||||
class BGPLVMTest(unittest.TestCase):
|
||||
|
||||
|
||||
def setUp(self):
|
||||
np.random.seed(12345)
|
||||
X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
|
||||
Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
|
||||
self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
|
||||
self.X, self.W, self.Y = X,W,Y
|
||||
self.Q = 3
|
||||
self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
|
||||
|
||||
def test_lik_comparisons_m1_s0(self):
|
||||
# Test if the different implementations give the exact same likelihood as the full model.
|
||||
# All of the following settings should give the same likelihood and gradients as the full model:
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False)
|
||||
m[:] = self.m_full[:]
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
|
||||
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_predict_missing_data(self):
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
|
||||
m[:] = self.m_full[:]
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
|
||||
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
|
||||
|
||||
self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True)
|
||||
|
||||
mu1, var1 = m.predict(m.X, full_cov=False)
|
||||
mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
|
||||
np.testing.assert_allclose(mu1, mu2)
|
||||
np.testing.assert_allclose(var1, var2)
|
||||
|
||||
mu1, var1 = m.predict(m.X.mean, full_cov=True)
|
||||
mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
|
||||
np.testing.assert_allclose(mu1, mu2)
|
||||
np.testing.assert_allclose(var1[:,:,0], var2)
|
||||
|
||||
mu1, var1 = m.predict(m.X.mean, full_cov=False)
|
||||
mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
|
||||
np.testing.assert_allclose(mu1, mu2)
|
||||
np.testing.assert_allclose(var1[:,[0]], var2)
|
||||
|
||||
def test_lik_comparisons_m0_s0(self):
|
||||
# Test if the different implementations give the exact same likelihood as the full model.
|
||||
# All of the following settings should give the same likelihood and gradients as the full model:
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=False)
|
||||
m[:] = self.m_full[:]
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
|
||||
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_lik_comparisons_m1_s1(self):
|
||||
# Test if the different implementations give the exact same likelihood as the full model.
|
||||
# All of the following settings should give the same likelihood and gradients as the full model:
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
|
||||
m[:] = self.m_full[:]
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
|
||||
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_lik_comparisons_m0_s1(self):
|
||||
# Test if the different implementations give the exact same likelihood as the full model.
|
||||
# All of the following settings should give the same likelihood and gradients as the full model:
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
|
||||
m[:] = self.m_full[:]
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
|
||||
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_gradients_missingdata(self):
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_gradients_missingdata_stochastics(self):
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1)
|
||||
assert(m.checkgrad())
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_gradients_stochastics(self):
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1)
|
||||
assert(m.checkgrad())
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_predict(self):
|
||||
# Test if the different implementations give the exact same likelihood as the full model.
|
||||
# All of the following settings should give the same likelihood and gradients as the full model:
|
||||
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
|
||||
m[:] = self.m_full[:]
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
|
||||
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
|
||||
assert(m.checkgrad())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
#import sys;sys.argv = ['', 'Test.testName']
|
||||
unittest.main()
|
||||
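The tests above only copy parameters across from the full model; they never feed Y with gaps. A sketch of how missing entries might actually be supplied, assuming (as the unused inan mask in setUp suggests) that missing values are encoded as NaN in Y:

import numpy as np
import GPy

np.random.seed(0)
X, W = np.random.normal(0, 1, (100, 6)), np.random.normal(0, 1, (6, 13))
Y = X.dot(W) + np.random.normal(0, .1, (100, 13))
inan = np.random.binomial(1, .1, Y.shape).astype(bool)
Y_missing = Y.copy()
Y_missing[inan] = np.nan    # assumption: NaNs mark the missing entries

m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
    Y_missing, 3, missing_data=True, stochastic=True, batchsize=4)
m.optimize(messages=False, max_iters=20)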
37
GPy/testing/cacher_tests.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
'''
|
||||
Created on 4 Sep 2015
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
import unittest
|
||||
from GPy.util.caching import Cacher
|
||||
from pickle import PickleError
|
||||
|
||||
|
||||
class Test(unittest.TestCase):
|
||||
def setUp(self):
|
||||
def op(x):
|
||||
return x
|
||||
self.cache = Cacher(op, 1)
|
||||
|
||||
def test_pickling(self):
|
||||
self.assertRaises(PickleError, self.cache.__getstate__)
|
||||
self.assertRaises(PickleError, self.cache.__setstate__)
|
||||
|
||||
def test_copy(self):
|
||||
tmp = self.cache.__deepcopy__()
|
||||
assert(tmp.operation is self.cache.operation)
|
||||
self.assertEqual(tmp.limit, self.cache.limit)
|
||||
|
||||
def test_reset(self):
|
||||
self.cache.reset()
|
||||
self.assertDictEqual(self.cache.cached_input_ids, {}, )
|
||||
self.assertDictEqual(self.cache.cached_outputs, {}, )
|
||||
self.assertDictEqual(self.cache.inputs_changed, {}, )
|
||||
|
||||
def test_name(self):
|
||||
assert(self.cache.__name__ == self.cache.operation.__name__)
|
||||
|
||||
if __name__ == "__main__":
|
||||
#import sys;sys.argv = ['', 'Test.testName']
|
||||
unittest.main()
|
||||
|
|
@ -2,11 +2,21 @@ import numpy as np
|
|||
import scipy as sp
|
||||
from GPy.util import choleskies
|
||||
import GPy
|
||||
from ..util.config import config
|
||||
import unittest
|
||||
|
||||
try:
|
||||
from ..util import linalg_cython
|
||||
from ..util import choleskies_cython
|
||||
config.set('cython', 'working', 'True')
|
||||
except ImportError:
|
||||
config.set('cython', 'working', 'False')
|
||||
|
||||
"""
|
||||
These tests make sure that the opure python and cython codes work the same
|
||||
These tests make sure that the pure python and cython codes work the same
|
||||
"""
|
||||
|
||||
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
|
||||
class CythonTestChols(np.testing.TestCase):
|
||||
def setUp(self):
|
||||
self.flat = np.random.randn(45,5)
|
||||
|
|
@ -20,6 +30,7 @@ class CythonTestChols(np.testing.TestCase):
|
|||
A2 = choleskies._triang_to_flat_cython(self.triang)
|
||||
np.testing.assert_allclose(A1, A2)
|
||||
|
||||
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
|
||||
class test_stationary(np.testing.TestCase):
|
||||
def setUp(self):
|
||||
self.k = GPy.kern.RBF(10)
|
||||
|
|
@ -49,17 +60,16 @@ class test_stationary(np.testing.TestCase):
|
|||
g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
|
||||
np.testing.assert_allclose(g1, g2)
|
||||
|
||||
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
|
||||
class test_choleskies_backprop(np.testing.TestCase):
|
||||
def setUp(self):
|
||||
self.dL, self.L = np.random.randn(2, 100, 100)
|
||||
a =np.random.randn(10,12)
|
||||
A = a.dot(a.T)
|
||||
self.L = GPy.util.linalg.jitchol(A)
|
||||
self.dL = np.random.randn(10,10)
|
||||
def test(self):
|
||||
r1 = GPy.util.choleskies._backprop_gradient_pure(self.dL, self.L)
|
||||
r2 = GPy.util.choleskies.choleskies_cython.backprop_gradient(self.dL, self.L)
|
||||
r1 = choleskies._backprop_gradient_pure(self.dL, self.L)
|
||||
r2 = choleskies_cython.backprop_gradient(self.dL, self.L)
|
||||
r3 = choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
|
||||
np.testing.assert_allclose(r1, r2)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
np.testing.assert_allclose(r1, r3)
|
||||
|
|
|
|||
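Both test classes above are skipped when the Cython extensions have not been built, using the same config flag the module-level try/except import sets. A short sketch of that detection pattern, purely illustrative:

from GPy.util.config import config

try:
    from GPy.util import choleskies_cython   # compiled extension, present only if built
    config.set('cython', 'working', 'True')
except ImportError:
    config.set('cython', 'working', 'False')

if config.getboolean('cython', 'working'):
    print("Cython backends available")
else:
    print("Falling back to pure-Python implementations")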
99
GPy/testing/gp_tests.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
'''
|
||||
Created on 4 Sep 2015
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
import unittest
|
||||
import numpy as np, GPy
|
||||
from GPy.core.parameterization.variational import NormalPosterior
|
||||
|
||||
class Test(unittest.TestCase):
|
||||
|
||||
|
||||
def setUp(self):
|
||||
np.random.seed(12345)
|
||||
self.N = 20
|
||||
self.N_new = 50
|
||||
self.D = 1
|
||||
self.X = np.random.uniform(-3., 3., (self.N, 1))
|
||||
self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
|
||||
self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
|
||||
|
||||
|
||||
def test_setxy_bgplvm(self):
|
||||
k = GPy.kern.RBF(1)
|
||||
m = GPy.models.BayesianGPLVM(self.Y, 2, kernel=k)
|
||||
mu, var = m.predict(m.X)
|
||||
X = m.X.copy()
|
||||
Xnew = NormalPosterior(m.X.mean[:10].copy(), m.X.variance[:10].copy())
|
||||
m.set_XY(Xnew, m.Y[:10])
|
||||
assert(m.checkgrad())
|
||||
m.set_XY(X, self.Y)
|
||||
mu2, var2 = m.predict(m.X)
|
||||
np.testing.assert_allclose(mu, mu2)
|
||||
np.testing.assert_allclose(var, var2)
|
||||
|
||||
def test_setxy_gplvm(self):
|
||||
k = GPy.kern.RBF(1)
|
||||
m = GPy.models.GPLVM(self.Y, 2, kernel=k)
|
||||
mu, var = m.predict(m.X)
|
||||
X = m.X.copy()
|
||||
Xnew = X[:10].copy()
|
||||
m.set_XY(Xnew, m.Y[:10])
|
||||
assert(m.checkgrad())
|
||||
m.set_XY(X, self.Y)
|
||||
mu2, var2 = m.predict(m.X)
|
||||
np.testing.assert_allclose(mu, mu2)
|
||||
np.testing.assert_allclose(var, var2)
|
||||
|
||||
def test_setxy_gp(self):
|
||||
k = GPy.kern.RBF(1)
|
||||
m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
|
||||
mu, var = m.predict(m.X)
|
||||
X = m.X.copy()
|
||||
m.set_XY(m.X[:10], m.Y[:10])
|
||||
assert(m.checkgrad())
|
||||
m.set_XY(X, self.Y)
|
||||
mu2, var2 = m.predict(m.X)
|
||||
np.testing.assert_allclose(mu, mu2)
|
||||
np.testing.assert_allclose(var, var2)
|
||||
|
||||
def test_mean_function(self):
|
||||
from GPy.core.parameterization.param import Param
|
||||
from GPy.core.mapping import Mapping
|
||||
class Parabola(Mapping):
|
||||
def __init__(self, variance, degree=2, name='parabola'):
|
||||
super(Parabola, self).__init__(1, 1, name)
|
||||
self.variance = Param('variance', np.ones(degree+1) * variance)
|
||||
self.degree = degree
|
||||
self.link_parameter(self.variance)
|
||||
|
||||
def f(self, X):
|
||||
p = self.variance[0] * np.ones(X.shape)
|
||||
for i in range(1, self.degree+1):
|
||||
p += self.variance[i] * X**(i)
|
||||
return p
|
||||
|
||||
def gradients_X(self, dL_dF, X):
|
||||
grad = np.zeros(X.shape)
|
||||
for i in range(1, self.degree+1):
|
||||
grad += (i) * self.variance[i] * X**(i-1)
|
||||
return grad
|
||||
|
||||
def update_gradients(self, dL_dF, X):
|
||||
for i in range(self.degree+1):
|
||||
self.variance.gradient[i] = (dL_dF * X**(i)).sum(0)
|
||||
X = np.linspace(-2, 2, 100)[:, None]
|
||||
k = GPy.kern.RBF(1)
|
||||
k.randomize()
|
||||
p = Parabola(.3)
|
||||
p.randomize()
|
||||
Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X)+np.eye(X.shape[0])*1e-8)[:,None] + np.random.normal(0, .1, (X.shape[0], 1))
|
||||
m = GPy.models.GPRegression(X, Y, mean_function=p)
|
||||
m.randomize()
|
||||
assert(m.checkgrad())
|
||||
_ = m.predict(m.X)
|
||||
|
||||
if __name__ == "__main__":
|
||||
#import sys;sys.argv = ['', 'Test.testName']
|
||||
unittest.main()
|
||||
|
|
@ -8,11 +8,12 @@ The test cases for various inference algorithms
|
|||
import unittest, itertools
|
||||
import numpy as np
|
||||
import GPy
|
||||
|
||||
#np.seterr(invalid='raise')
|
||||
|
||||
class InferenceXTestCase(unittest.TestCase):
|
||||
|
||||
def genData(self):
|
||||
np.random.seed(1)
|
||||
D1,D2,N = 12,12,50
|
||||
|
||||
x = np.linspace(0, 4 * np.pi, N)[:, None]
|
||||
|
|
|
|||
|
|
@ -6,9 +6,16 @@ import numpy as np
|
|||
import GPy
|
||||
import sys
|
||||
from GPy.core.parameterization.param import Param
|
||||
from ..util.config import config
|
||||
|
||||
verbose = 0
|
||||
|
||||
try:
|
||||
from ..util import linalg_cython
|
||||
config.set('cython', 'working', 'True')
|
||||
except ImportError:
|
||||
config.set('cython', 'working', 'False')
|
||||
|
||||
|
||||
class Kern_check_model(GPy.core.Model):
|
||||
"""
|
||||
|
|
@ -245,6 +252,11 @@ class KernelGradientTestsContinuous(unittest.TestCase):
|
|||
continuous_kerns = ['RBF', 'Linear']
|
||||
self.kernclasses = [getattr(GPy.kern, s) for s in continuous_kerns]
|
||||
|
||||
def test_MLP(self):
|
||||
k = GPy.kern.MLP(self.D,ARD=True)
|
||||
k.randomize()
|
||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||
|
||||
def test_Matern32(self):
|
||||
k = GPy.kern.Matern32(self.D)
|
||||
k.randomize()
|
||||
|
|
@ -313,6 +325,11 @@ class KernelGradientTestsContinuous(unittest.TestCase):
|
|||
k.randomize()
|
||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||
|
||||
def test_standard_periodic(self):
|
||||
k = GPy.kern.StdPeriodic(self.D, self.D-1)
|
||||
k.randomize()
|
||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||
|
||||
class KernelTestsMiscellaneous(unittest.TestCase):
|
||||
def setUp(self):
|
||||
N, D = 100, 10
|
||||
|
|
@ -366,6 +383,7 @@ class KernelTestsNonContinuous(unittest.TestCase):
|
|||
X2 = self.X2[self.X2[:,-1]!=2]
|
||||
self.assertTrue(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1))
|
||||
|
||||
@unittest.skipIf(not config.getboolean('cython', 'working'),"Cython modules have not been built on this machine")
|
||||
class Coregionalize_cython_test(unittest.TestCase):
|
||||
"""
|
||||
Make sure that the coregionalize kernel work with and without cython enabled
|
||||
|
|
@ -432,6 +450,104 @@ class KernelTestsProductWithZeroValues(unittest.TestCase):
|
|||
self.assertFalse(np.any(np.isnan(target)),
|
||||
"Gradient resulted in NaN")
|
||||
|
||||
class Kernel_Psi_statistics_GradientTests(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
from GPy.core.parameterization.variational import NormalPosterior
|
||||
N,M,Q = 100,20,3
|
||||
|
||||
X = np.random.randn(N,Q)
|
||||
X_var = np.random.rand(N,Q)+0.01
|
||||
self.Z = np.random.randn(M,Q)
|
||||
self.qX = NormalPosterior(X, X_var)
|
||||
|
||||
self.w1 = np.random.randn(N)
|
||||
self.w2 = np.random.randn(N,M)
|
||||
self.w3 = np.random.randn(M,M)
|
||||
self.w3 = self.w3+self.w3.T
|
||||
self.w3n = np.random.randn(N,M,M)
|
||||
self.w3n = self.w3n+np.swapaxes(self.w3n, 1,2)
|
||||
|
||||
def test_kernels(self):
|
||||
from GPy.kern import RBF,Linear,MLP
|
||||
Q = self.Z.shape[1]
|
||||
kernels = [RBF(Q,ARD=True), Linear(Q,ARD=True)]
|
||||
|
||||
for k in kernels:
|
||||
k.randomize()
|
||||
self._test_kernel_param(k)
|
||||
self._test_Z(k)
|
||||
self._test_qX(k)
|
||||
self._test_kernel_param(k, psi2n=True)
|
||||
self._test_Z(k, psi2n=True)
|
||||
self._test_qX(k, psi2n=True)
|
||||
|
||||
def _test_kernel_param(self, kernel, psi2n=False):
|
||||
|
||||
def f(p):
|
||||
kernel.param_array[:] = p
|
||||
psi0 = kernel.psi0(self.Z, self.qX)
|
||||
psi1 = kernel.psi1(self.Z, self.qX)
|
||||
if not psi2n:
|
||||
psi2 = kernel.psi2(self.Z, self.qX)
|
||||
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3*psi2).sum()
|
||||
else:
|
||||
psi2 = kernel.psi2n(self.Z, self.qX)
|
||||
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3n*psi2).sum()
|
||||
|
||||
def df(p):
|
||||
kernel.param_array[:] = p
|
||||
kernel.update_gradients_expectations(self.w1, self.w2, self.w3 if not psi2n else self.w3n, self.Z, self.qX)
|
||||
return kernel.gradient.copy()
|
||||
|
||||
from GPy.models import GradientChecker
|
||||
m = GradientChecker(f, df, kernel.param_array.copy())
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
def _test_Z(self, kernel, psi2n=False):
|
||||
|
||||
def f(p):
|
||||
psi0 = kernel.psi0(p, self.qX)
|
||||
psi1 = kernel.psi1(p, self.qX)
|
||||
psi2 = kernel.psi2(p, self.qX)
|
||||
if not psi2n:
|
||||
psi2 = kernel.psi2(p, self.qX)
|
||||
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3*psi2).sum()
|
||||
else:
|
||||
psi2 = kernel.psi2n(p, self.qX)
|
||||
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3n*psi2).sum()
|
||||
|
||||
def df(p):
|
||||
return kernel.gradients_Z_expectations(self.w1, self.w2, self.w3 if not psi2n else self.w3n, p, self.qX)
|
||||
|
||||
from GPy.models import GradientChecker
|
||||
m = GradientChecker(f, df, self.Z.copy())
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
def _test_qX(self, kernel, psi2n=False):
|
||||
|
||||
def f(p):
|
||||
self.qX.param_array[:] = p
|
||||
self.qX._trigger_params_changed()
|
||||
psi0 = kernel.psi0(self.Z, self.qX)
|
||||
psi1 = kernel.psi1(self.Z, self.qX)
|
||||
if not psi2n:
|
||||
psi2 = kernel.psi2(self.Z, self.qX)
|
||||
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3*psi2).sum()
|
||||
else:
|
||||
psi2 = kernel.psi2n(self.Z, self.qX)
|
||||
return (self.w1*psi0).sum() + (self.w2*psi1).sum() + (self.w3n*psi2).sum()
|
||||
|
||||
def df(p):
|
||||
self.qX.param_array[:] = p
|
||||
self.qX._trigger_params_changed()
|
||||
grad = kernel.gradients_qX_expectations(self.w1, self.w2, self.w3 if not psi2n else self.w3n, self.Z, self.qX)
|
||||
self.qX.set_gradients(grad)
|
||||
return self.qX.gradient.copy()
|
||||
|
||||
from GPy.models import GradientChecker
|
||||
m = GradientChecker(f, df, self.qX.param_array.copy())
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Running unit tests, please be (very) patient...")
|
||||
|
|
|
|||
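Kernel_Psi_statistics_GradientTests above wraps each expectation in GPy.models.GradientChecker, which compares an analytic gradient against finite differences via checkgrad(). A standalone sketch of that pattern on a toy quadratic objective (the objective itself is just an illustration):

import numpy as np
from GPy.models import GradientChecker

def f(x):
    return float(np.sum(x ** 2))   # scalar objective

def df(x):
    return 2 * x                   # analytic gradient of f

m = GradientChecker(f, df, np.random.randn(5))
assert m.checkgrad(verbose=1)      # finite differences vs. the analytic gradient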
|
|
@ -7,10 +7,8 @@ from GPy.models import GradientChecker
|
|||
import functools
|
||||
import inspect
|
||||
from GPy.likelihoods import link_functions
|
||||
from GPy.core.parameterization import Param
|
||||
from functools import partial
|
||||
#np.random.seed(300)
|
||||
#np.random.seed(4)
|
||||
fixed_seed = 7
|
||||
|
||||
#np.seterr(divide='raise')
|
||||
def dparam_partial(inst_func, *args):
|
||||
|
|
@ -105,6 +103,7 @@ class TestNoiseModels(object):
|
|||
Generic model checker
|
||||
"""
|
||||
def setUp(self):
|
||||
np.random.seed(fixed_seed)
|
||||
self.N = 15
|
||||
self.D = 3
|
||||
self.X = np.random.rand(self.N, self.D)*10
|
||||
|
|
@ -218,7 +217,8 @@ class TestNoiseModels(object):
|
|||
"constraints": [(".*variance", self.constrain_positive)]
|
||||
},
|
||||
"laplace": True,
|
||||
"ep": False # FIXME: Should be True when we have it working again
|
||||
"ep": False, # FIXME: Should be True when we have it working again
|
||||
"variational_expectations": True,
|
||||
},
|
||||
"Gaussian_log": {
|
||||
"model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var),
|
||||
|
|
@ -227,7 +227,8 @@ class TestNoiseModels(object):
|
|||
"vals": [self.var],
|
||||
"constraints": [(".*variance", self.constrain_positive)]
|
||||
},
|
||||
"laplace": True
|
||||
"laplace": True,
|
||||
"variational_expectations": True
|
||||
},
|
||||
#"Gaussian_probit": {
|
||||
#"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
|
||||
|
|
@ -252,7 +253,8 @@ class TestNoiseModels(object):
|
|||
"link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)],
|
||||
"laplace": True,
|
||||
"Y": self.binary_Y,
|
||||
"ep": False # FIXME: Should be True when we have it working again
|
||||
"ep": False, # FIXME: Should be True when we have it working again
|
||||
"variational_expectations": True
|
||||
},
|
||||
"Exponential_default": {
|
||||
"model": GPy.likelihoods.Exponential(),
|
||||
|
|
@ -347,6 +349,10 @@ class TestNoiseModels(object):
|
|||
ep = attributes["ep"]
|
||||
else:
|
||||
ep = False
|
||||
if "variational_expectations" in attributes:
|
||||
var_exp = attributes["variational_expectations"]
|
||||
else:
|
||||
var_exp = False
|
||||
|
||||
#if len(param_vals) > 1:
|
||||
#raise NotImplementedError("Cannot support multiple params in likelihood yet!")
|
||||
|
|
@ -377,6 +383,11 @@ class TestNoiseModels(object):
|
|||
if ep:
|
||||
#ep likelihood gradcheck
|
||||
yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
|
||||
if var_exp:
|
||||
#Need to specify mu and var!
|
||||
yield self.t_varexp, model, Y, Y_metadata
|
||||
yield self.t_dexp_dmu, model, Y, Y_metadata
|
||||
yield self.t_dexp_dvar, model, Y, Y_metadata
|
||||
|
||||
|
||||
self.tearDown()
|
||||
|
|
@ -603,6 +614,87 @@ class TestNoiseModels(object):
|
|||
print(m)
|
||||
assert m.checkgrad(verbose=1, step=step)
|
||||
|
||||
################
|
||||
# variational expectations #
|
||||
################
|
||||
@with_setup(setUp, tearDown)
|
||||
def t_varexp(self, model, Y, Y_metadata):
|
||||
#Test that the analytic implementation (if it exists) matches the generic gauss
|
||||
#hermite implementation
|
||||
print("\n{}".format(inspect.stack()[0][3]))
|
||||
#Make mu and var (marginal means and variances of q(f)) draws from a GP
|
||||
k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
|
||||
L = GPy.util.linalg.jitchol(k)
|
||||
mu = L.dot(np.random.randn(*Y.shape))
|
||||
#Variance must be positive
|
||||
var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
|
||||
|
||||
expectation = model.variational_expectations(Y=Y, m=mu, v=var, gh_points=None, Y_metadata=Y_metadata)[0]
|
||||
|
||||
#Implementation of gauss hermite integration
|
||||
shape = mu.shape
|
||||
gh_x, gh_w= np.polynomial.hermite.hermgauss(50)
|
||||
m,v,Y = mu.flatten(), var.flatten(), Y.flatten()
|
||||
#make a grid of points
|
||||
X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
|
||||
#evaluate the likelihood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
|
||||
# broadcast needs to be handled carefully.
|
||||
logp = model.logpdf(X, Y[:,None], Y_metadata=Y_metadata)
|
||||
#average over the grid to get derivatives of the Gaussian's parameters
#division by sqrt(pi) comes from the fact that each quadrature weight needs to be scaled by 1/sqrt(pi)
|
||||
expectation_gh = np.dot(logp, gh_w)/np.sqrt(np.pi)
|
||||
expectation_gh = expectation_gh.reshape(*shape)
|
||||
|
||||
np.testing.assert_almost_equal(expectation, expectation_gh, decimal=5)
|
||||
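# For reference, the identity t_varexp checks above (a standard Gauss-Hermite substitution;
# q(f_n) = N(m_n, v_n), nodes gh_x and weights gh_w from np.polynomial.hermite.hermgauss):
#   E_{q(f_n)}[ log p(y_n | f_n) ]
#     = \int log p(y_n | f_n) N(f_n | m_n, v_n) df_n
#     ~= (1 / sqrt(pi)) * sum_i gh_w[i] * log p(y_n | m_n + sqrt(2 * v_n) * gh_x[i])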
|
||||
@with_setup(setUp, tearDown)
|
||||
def t_dexp_dmu(self, model, Y, Y_metadata):
|
||||
print("\n{}".format(inspect.stack()[0][3]))
|
||||
#Make mu and var (marginal means and variances of q(f)) draws from a GP
|
||||
k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
|
||||
L = GPy.util.linalg.jitchol(k)
|
||||
mu = L.dot(np.random.randn(*Y.shape))
|
||||
#Variance must be positive
|
||||
var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
|
||||
expectation = functools.partial(model.variational_expectations, Y=Y, v=var, gh_points=None, Y_metadata=Y_metadata)
|
||||
|
||||
#Function to get the nth returned value
|
||||
def F(mu):
|
||||
return expectation(m=mu)[0]
|
||||
def dmu(mu):
|
||||
return expectation(m=mu)[1]
|
||||
|
||||
grad = GradientChecker(F, dmu, mu.copy(), 'm')
|
||||
|
||||
grad.randomize()
|
||||
print(grad)
|
||||
print(model)
|
||||
assert grad.checkgrad(verbose=1)
|
||||
|
||||
@with_setup(setUp, tearDown)
|
||||
def t_dexp_dvar(self, model, Y, Y_metadata):
|
||||
print("\n{}".format(inspect.stack()[0][3]))
|
||||
#Make mu and var (marginal means and variances of q(f)) draws from a GP
|
||||
k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
|
||||
L = GPy.util.linalg.jitchol(k)
|
||||
mu = L.dot(np.random.randn(*Y.shape))
|
||||
#Variance must be positive
|
||||
var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
|
||||
expectation = functools.partial(model.variational_expectations, Y=Y, m=mu, gh_points=None, Y_metadata=Y_metadata)
|
||||
|
||||
#Function to get the nth returned value
|
||||
def F(var):
|
||||
return expectation(v=var)[0]
|
||||
def dvar(var):
|
||||
return expectation(v=var)[2]
|
||||
|
||||
grad = GradientChecker(F, dvar, var.copy(), 'v')
|
||||
|
||||
self.constrain_positive('v', grad)
|
||||
#grad.randomize()
|
||||
print(grad)
|
||||
print(model)
|
||||
assert grad.checkgrad(verbose=1)
|
||||
|
||||
class LaplaceTests(unittest.TestCase):
|
||||
"""
|
||||
|
|
@ -610,6 +702,7 @@ class LaplaceTests(unittest.TestCase):
|
|||
"""
|
||||
|
||||
def setUp(self):
|
||||
np.random.seed(fixed_seed)
|
||||
self.N = 15
|
||||
self.D = 1
|
||||
self.X = np.random.rand(self.N, self.D)*10
|
||||
|
|
@ -705,7 +798,7 @@ class LaplaceTests(unittest.TestCase):
|
|||
post_mean_approx, post_var_approx, = m2.predict(X)
|
||||
|
||||
if debug:
|
||||
import pylab as pb
|
||||
from matplotlib import pyplot as pb
|
||||
pb.figure(5)
|
||||
pb.title('posterior means')
|
||||
pb.scatter(X, post_mean, c='g')
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
import numpy as np
import scipy as sp
from GPy.util.linalg import jitchol
import GPy
from ..util.linalg import jitchol,trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk

class LinalgTests(np.testing.TestCase):
def setUp(self):

@ -37,18 +36,19 @@ class LinalgTests(np.testing.TestCase):
except sp.linalg.LinAlgError:
return True

def test_einsum_ijk_jlk_to_il(self):
A = np.random.randn(50, 150, 5)
B = np.random.randn(150, 100, 5)
pure = np.einsum('ijk,jlk->il', A, B)
quick = GPy.util.linalg.ijk_jlk_to_il(A, B)
np.testing.assert_allclose(pure, quick)

def test_trace_dot(self):
N = 5
A = np.random.rand(N,N)
B = np.random.rand(N,N)
trace = np.trace(A.dot(B))
test_trace = trace_dot(A,B)
np.testing.assert_allclose(trace,test_trace,atol=1e-13)

def test_einsum_ij_jlk_to_ilk(self):
A = np.random.randn(15, 150, 5)
B = np.random.randn(150, 50, 5)
pure = np.einsum('ijk,jlk->il', A, B)
quick = GPy.util.linalg.ijk_jlk_to_il(A,B)
quick = ijk_jlk_to_il(A,B)
np.testing.assert_allclose(pure, quick)

def test_einsum_ijk_ljk_to_ilk(self):

@ -56,5 +56,5 @@ class LinalgTests(np.testing.TestCase):
B = np.random.randn(150, 20, 5)
#B = A.copy()
pure = np.einsum('ijk,ljk->ilk', A, B)
quick = GPy.util.linalg.ijk_ljk_to_ilk(A,B)
quick = ijk_ljk_to_ilk(A,B)
np.testing.assert_allclose(pure, quick)
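For reference, the contractions these helpers implement are plain einsum reductions over the trailing axis; a small NumPy sketch (independent of GPy's optimized routines) spelling them out as per-slice matrix products:

import numpy as np

A = np.random.randn(4, 6, 3)   # indices i, j, k
B = np.random.randn(6, 5, 3)   # indices j, l, k
# 'ijk,jlk->il': sum the slice products A[:, :, k] @ B[:, :, k] over k
np.testing.assert_allclose(np.einsum('ijk,jlk->il', A, B),
                           sum(A[:, :, k].dot(B[:, :, k]) for k in range(3)))

C = np.random.randn(5, 6, 3)   # indices l, j, k
# 'ijk,ljk->ilk': for each k, A[:, :, k] @ C[:, :, k].T, keeping the k axis
np.testing.assert_allclose(np.einsum('ijk,ljk->ilk', A, C),
                           np.stack([A[:, :, k].dot(C[:, :, k].T) for k in range(3)], axis=2))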
@ -1,5 +1,5 @@
import numpy as np
import scipy as sp
import scipy
from scipy.special import cbrt
from GPy.models import GradientChecker
_lim_val = np.finfo(np.float64).max

@ -79,8 +79,7 @@ class LinkFunctionTests(np.testing.TestCase):
assert np.isinf(np.exp(np.log(self.f_upper_lim)))
#Check the clipping works
np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
#Need to look at most significant figures here rather than the decimals
np.testing.assert_approx_equal(link.transf(self.f_upper_lim), _lim_val, significant=5)
self.assertTrue(np.isfinite(link.transf(self.f_upper_lim)))
self.check_overflow(link, lim_of_inf)

#Check that it would otherwise fail

@ -93,18 +92,18 @@ class LinkFunctionTests(np.testing.TestCase):
link = Log_ex_1()
lim_of_inf = _lim_val_exp

np.testing.assert_almost_equal(np.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
assert np.isinf(np.log1p(np.exp(np.log(self.f_upper_lim))))
np.testing.assert_almost_equal(scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
assert np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim))))
#Check the clipping works
np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
#Need to look at most significant figures here rather than the decimals
np.testing.assert_approx_equal(link.transf(self.f_upper_lim), np.log1p(_lim_val), significant=5)
np.testing.assert_approx_equal(link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5)
self.check_overflow(link, lim_of_inf)

#Check that it would otherwise fail
beyond_lim_of_inf = lim_of_inf + 10.0
old_err_state = np.seterr(over='ignore')
self.assertTrue(np.isinf(np.log1p(np.exp(beyond_lim_of_inf))))
self.assertTrue(np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf))))
np.seterr(**old_err_state)
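The clipping behaviour exercised above can be sketched outside GPy as well: log1p(exp(f)) becomes inf once exp(f) overflows, so the argument is clipped at the log of the largest representable float first. An illustrative stand-alone version (not GPy's actual implementation):

import numpy as np

_lim_val = np.finfo(np.float64).max
_lim_val_exp = np.log(_lim_val)      # largest f for which exp(f) is still finite

def safe_softplus(f):
    # log(1 + exp(f)) with the argument clipped so the intermediate exp never overflows
    return np.log1p(np.exp(np.minimum(f, _lim_val_exp)))

old_err_state = np.seterr(over='ignore')
assert np.isinf(np.log1p(np.exp(_lim_val_exp + 10.0)))   # naive version overflows
np.seterr(**old_err_state)
assert np.isfinite(safe_softplus(_lim_val_exp + 10.0))    # clipped version stays finite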
@ -1,6 +1,8 @@
from __future__ import print_function
import numpy as np
import scipy as sp
import GPy
import warnings

class MiscTests(np.testing.TestCase):
"""

@ -11,8 +13,15 @@ class MiscTests(np.testing.TestCase):
self._lim_val_exp = np.log(self._lim_val)

def test_safe_exp_upper(self):
assert np.exp(self._lim_val_exp + 1) == np.inf
assert GPy.util.misc.safe_exp(self._lim_val_exp + 1) < np.inf
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always') # always print
assert np.isfinite(np.exp(self._lim_val_exp))
assert np.isinf(np.exp(self._lim_val_exp + 1))
assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))

print(w)
print(len(w))
assert len(w)<=1 # should have one overflow warning

def test_safe_exp_lower(self):
assert GPy.util.misc.safe_exp(1e-10) < np.inf
@ -15,6 +15,13 @@ class MiscTests(unittest.TestCase):
self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))

def test_setXY(self):
m = GPy.models.GPRegression(self.X, self.Y)
m.set_XY(np.vstack([self.X, np.random.rand(1,self.X.shape[1])]), np.vstack([self.Y, np.random.rand(1,self.Y.shape[1])]))
m._trigger_params_changed()
self.assertTrue(m.checkgrad())
m.predict(m.X)

def test_raw_predict(self):
k = GPy.kern.RBF(1)
m = GPy.models.GPRegression(self.X, self.Y, kernel=k)

@ -36,12 +43,78 @@ class MiscTests(unittest.TestCase):
np.testing.assert_almost_equal(np.diag(K_hat)[:, None], var)
np.testing.assert_almost_equal(mu_hat, mu)

def test_normalizer(self):
k = GPy.kern.RBF(1)
Y = self.Y
mu, std = Y.mean(0), Y.std(0)
m = GPy.models.GPRegression(self.X, Y, kernel=k, normalizer=True)
m.optimize()
assert(m.checkgrad())
k = GPy.kern.RBF(1)
m2 = GPy.models.GPRegression(self.X, (Y-mu)/std, kernel=k, normalizer=False)
m2[:] = m[:]
mu1, var1 = m.predict(m.X, full_cov=True)
mu2, var2 = m2.predict(m2.X, full_cov=True)
np.testing.assert_allclose(mu1, (mu2*std)+mu)
np.testing.assert_allclose(var1, var2)
mu1, var1 = m.predict(m.X, full_cov=False)
mu2, var2 = m2.predict(m2.X, full_cov=False)
np.testing.assert_allclose(mu1, (mu2*std)+mu)
np.testing.assert_allclose(var1, var2)

q50n = m.predict_quantiles(m.X, (50,))
q50 = m2.predict_quantiles(m2.X, (50,))
np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu)
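The identity test_normalizer relies on can be stated compactly: a model built with normalizer=True should match one fitted to manually standardized targets once its predictive mean is mapped back through mean*std + mu. A smaller sketch of that check (same idea as the test above, illustrative data):

import numpy as np
import GPy

np.random.seed(0)
X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
mu, std = Y.mean(0), Y.std(0)

m_norm = GPy.models.GPRegression(X, Y, normalizer=True)              # standardizes internally
m_raw = GPy.models.GPRegression(X, (Y - mu) / std, normalizer=False)
m_raw[:] = m_norm[:]                                                  # share the hyperparameters

mean_n, _ = m_norm.predict(X)
mean_r, _ = m_raw.predict(X)
np.testing.assert_allclose(mean_n, mean_r * std + mu)                 # un-normalized mean agrees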
def check_jacobian(self):
try:
import autograd.numpy as np, autograd as ag, GPy, matplotlib.pyplot as plt
from GPy.models import GradientChecker, GPRegression
except:
raise self.skipTest("autograd not available to check gradients")
def k(X, X2, alpha=1., lengthscale=None):
if lengthscale is None:
lengthscale = np.ones(X.shape[1])
exp = 0.
for q in range(X.shape[1]):
exp += ((X[:, [q]] - X2[:, [q]].T)/lengthscale[q])**2
#exp = np.sqrt(exp)
return alpha * np.exp(-.5*exp)
dk = ag.elementwise_grad(lambda x, x2: k(x, x2, alpha=ke.variance.values, lengthscale=ke.lengthscale.values))
dkdk = ag.elementwise_grad(dk, argnum=1)

ke = GPy.kern.RBF(1, ARD=True)
#ke.randomize()
ke.variance = .2#.randomize()
ke.lengthscale[:] = .5
ke.randomize()
X = np.linspace(-1, 1, 1000)[:,None]
X2 = np.array([[0.]]).T
np.testing.assert_allclose(ke.gradients_X([[1.]], X, X), dk(X, X))
np.testing.assert_allclose(ke.gradients_XX([[1.]], X, X).sum(0), dkdk(X, X))
np.testing.assert_allclose(ke.gradients_X([[1.]], X, X2), dk(X, X2))
np.testing.assert_allclose(ke.gradients_XX([[1.]], X, X2).sum(0), dkdk(X, X2))

m = GPRegression(self.X, self.Y)
def f(x):
m.X[:] = x
return m.log_likelihood()
def df(x):
m.X[:] = x
return m.kern.gradients_X(m.grad_dict['dL_dK'], X)
def ddf(x):
m.X[:] = x
return m.kern.gradients_XX(m.grad_dict['dL_dK'], X).sum(0)
gc = GradientChecker(f, df, self.X)
gc2 = GradientChecker(df, ddf, self.X)
assert(gc.checkgrad())
assert(gc2.checkgrad())
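check_jacobian leans on autograd's elementwise_grad to build a reference gradient for the kernel; the pattern in isolation, which only runs if autograd is installed, looks roughly like this:

import autograd.numpy as anp
import autograd as ag

# elementwise_grad differentiates a function applied elementwise: d/dx sin(x) = cos(x)
df = ag.elementwise_grad(lambda x: anp.sin(x))
x = anp.linspace(-1., 1., 11)
assert anp.allclose(df(x), anp.cos(x))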
def test_sparse_raw_predict(self):
k = GPy.kern.RBF(1)
m = GPy.models.SparseGPRegression(self.X, self.Y, kernel=k)
m.randomize()
Z = m.Z[:]
X = self.X[:]

# Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
Kinv = m.posterior.woodbury_inv

@ -127,11 +200,24 @@ class MiscTests(unittest.TestCase):
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
kernel=k, missing_data=True)
assert(m.checkgrad())
mul, varl = m.predict(m.X)

k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
m2 = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
kernel=k, missing_data=True)
assert(m.checkgrad())
m2.kern.rbf.lengthscale[:] = 1e6
m2.X[:] = m.X.param_array
m2.likelihood[:] = m.likelihood[:]
m2.kern.white[:] = m.kern.white[:]
mu, var = m.predict(m.X)
np.testing.assert_allclose(mul, mu)
np.testing.assert_allclose(varl, var)

q50 = m.predict_quantiles(m.X, (50,))
np.testing.assert_allclose(mul, q50[0])

def test_likelihood_replicate_kern(self):
m = GPy.models.GPRegression(self.X, self.Y)
@ -410,8 +496,8 @@ class GradientTests(np.testing.TestCase):
self.check_model(rbf, model_type='SparseGPRegression', dimension=2)

def test_SparseGPRegression_rbf_linear_white_kern_1D(self):
''' Testing the sparse GP regression with rbf kernel on 2d data '''
rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1)
''' Testing the sparse GP regression with rbf kernel on 1d data '''
rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1) + GPy.kern.White(1, 1e-5)
self.check_model(rbflin, model_type='SparseGPRegression', dimension=1)

def test_SparseGPRegression_rbf_linear_white_kern_2D(self):

@ -419,14 +505,12 @@ class GradientTests(np.testing.TestCase):
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)

# @unittest.expectedFailure
def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
raise unittest.SkipTest("This is not implemented yet!")
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)

# @unittest.expectedFailure
def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1)

@ -443,6 +527,16 @@ class GradientTests(np.testing.TestCase):
m = GPy.models.GPLVM(Y, input_dim, kernel=k)
self.assertTrue(m.checkgrad())

def test_BCGPLVM_rbf_bias_white_kern_2D(self):
""" Testing GPLVM with rbf + bias kernel """
N, input_dim, D = 50, 1, 2
X = np.random.rand(N, input_dim)
k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
m = GPy.models.BCGPLVM(Y, input_dim, kernel=k)
self.assertTrue(m.checkgrad())

def test_GPLVM_rbf_linear_white_kern_2D(self):
""" Testing GPLVM with rbf + bias kernel """
N, input_dim, D = 50, 1, 2

@ -468,23 +562,8 @@ class GradientTests(np.testing.TestCase):
Z = np.linspace(0, 15, 4)[:, None]
kernel = GPy.kern.RBF(1)
m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, Z=Z)
# distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
# likelihood = GPy.likelihoods.EP(Y, distribution)
# m = GPy.core.SparseGP(X, likelihood, kernel, Z)
# m.ensure_default_constraints()
self.assertTrue(m.checkgrad())

@unittest.expectedFailure
def test_generalized_FITC(self):
N = 20
X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
k = GPy.kern.RBF(1) + GPy.kern.White(1)
Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
m = GPy.models.FITCClassification(X, Y, kernel=k)
m.update_likelihood_approximation()
self.assertTrue(m.checkgrad())

@unittest.expectedFailure
def test_multioutput_regression_1D(self):
X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5

@ -494,12 +573,11 @@ class GradientTests(np.testing.TestCase):
Y = np.vstack((Y1, Y2))

k1 = GPy.kern.RBF(1)
m = GPy.models.GPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1])
import ipdb;ipdb.set_trace()
m.constrain_fixed('.*rbf_var', 1.)
m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
#import ipdb;ipdb.set_trace()
#m.constrain_fixed('.*rbf_var', 1.)
self.assertTrue(m.checkgrad())

@unittest.expectedFailure
def test_multioutput_sparse_regression_1D(self):
X1 = np.random.rand(500, 1) * 8
X2 = np.random.rand(300, 1) * 5

@ -509,8 +587,7 @@ class GradientTests(np.testing.TestCase):
Y = np.vstack((Y1, Y2))

k1 = GPy.kern.RBF(1)
m = GPy.models.SparseGPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1])
m.constrain_fixed('.*rbf_var', 1.)
m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
self.assertTrue(m.checkgrad())

def test_gp_heteroscedastic_regression(self):

@ -539,6 +616,7 @@ class GradientTests(np.testing.TestCase):
self.assertTrue(m.checkgrad())

def test_gp_kronecker_gaussian(self):
np.random.seed(0)
N1, N2 = 30, 20
X1 = np.random.randn(N1, 1)
X2 = np.random.randn(N2, 1)

@ -559,16 +637,16 @@ class GradientTests(np.testing.TestCase):

m.randomize()
mm[:] = m[:]
assert np.allclose(m.log_likelihood(), mm.log_likelihood())
assert np.allclose(m.gradient, mm.gradient)
self.assertTrue(np.allclose(m.log_likelihood(), mm.log_likelihood()))
self.assertTrue(np.allclose(m.gradient, mm.gradient))
X1test = np.random.randn(100, 1)
X2test = np.random.randn(100, 1)
mean1, var1 = m.predict(X1test, X2test)
yy, xx = np.meshgrid(X2test, X1test)
Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
mean2, var2 = mm.predict(Xgrid)
assert np.allclose(mean1, mean2)
assert np.allclose(var1, var2)
self.assertTrue( np.allclose(mean1, mean2) )
self.assertTrue( np.allclose(var1, var2) )

def test_gp_VGPC(self):
num_obs = 25

@ -576,7 +654,8 @@ class GradientTests(np.testing.TestCase):
X = X[:, None]
Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
lik = GPy.likelihoods.Gaussian()
m = GPy.models.GPVariationalGaussianApproximation(X, Y, kernel=kern, likelihood=lik)
self.assertTrue(m.checkgrad())
@ -248,10 +248,16 @@ class ParameterizedTest(unittest.TestCase):
m.randomize()
self.assertEqual(m.p1, val)

def test_checkgrad(self):
assert(self.testmodel.kern.checkgrad())
assert(self.testmodel.kern.lengthscale.checkgrad())
assert(self.testmodel.likelihood.checkgrad())

def test_printing(self):
print(self.test1)
print(self.param)
print(self.test1[''])
print(self.testmodel.hierarchy_name(False))

if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.test_add_parameter']

@ -20,6 +20,8 @@ from GPy.examples.dimensionality_reduction import mrd_simulation
from GPy.core.parameterization.variational import NormalPosterior
from GPy.models.gp_regression import GPRegression
from functools import reduce
from GPy.util.caching import Cacher
from pickle import PicklingError

def toy_model():
X = np.linspace(0,1,50)[:, None]

@ -205,23 +207,6 @@ class Test(ListDictTestCase):
def _callback(self, what, which):
what.count += 1

@unittest.skip
def test_add_observer(self):
par = toy_model()
par.name = "original"
par.count = 0
par.add_observer(self, self._callback, 1)
pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
self.assertNotIn(par.observers[0], pcopy.observers)
pcopy = par.copy()
pcopy.name = "copy"
self.assertTrue(par.checkgrad())
self.assertTrue(pcopy.checkgrad())
self.assertTrue(pcopy.kern.checkgrad())
import ipdb;ipdb.set_trace()
self.assertIn(par.observers[0], pcopy.observers)
self.assertEqual(par.count, 3)
self.assertEqual(pcopy.count, 6) # 3 of each call to checkgrad

if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
1 GPy/testing/run_coverage.sh Executable file

@ -0,0 +1 @@
nosetests . --with-coverage --logging-level=INFO --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase
101 GPy/testing/rv_transformation_tests.py Normal file

@ -0,0 +1,101 @@
# Written by Ilias Bilionis
"""
Test if hyperparameters in models are properly transformed.
"""


import unittest
import numpy as np
import scipy.stats as st
import GPy


class TestModel(GPy.core.Model):
"""
A simple GPy model with one parameter.
"""
def __init__(self):
GPy.core.Model.__init__(self, 'test_model')
theta = GPy.core.Param('theta', 1.)
self.link_parameter(theta)

def log_likelihood(self):
return 0.


class RVTransformationTestCase(unittest.TestCase):

def _test_trans(self, trans):
m = TestModel()
prior = GPy.priors.LogGaussian(.5, 0.1)
m.theta.set_prior(prior)
m.theta.unconstrain()
m.theta.constrain(trans)
# The PDF of the transformed variables
p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])
# To the empirical PDF of:
theta_s = prior.rvs(100000)
phi_s = trans.finv(theta_s)
# which is essentially a kernel density estimation
kde = st.gaussian_kde(phi_s)
# We will compare the PDF here:
phi = np.linspace(phi_s.min(), phi_s.max(), 100)
# The transformed PDF of phi should be this:
pdf_phi = np.array([p_phi(p) for p in phi])
# UNCOMMENT TO SEE GRAPHICAL COMPARISON
#import matplotlib.pyplot as plt
#fig, ax = plt.subplots()
#ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
#ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
#ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
#ax.set_xlabel(r'transformed $\theta$', fontsize=16)
#ax.set_ylabel('PDF', fontsize=16)
#plt.legend(loc='best')
#plt.show(block=True)
# END OF PLOT
# The following test cannot be very accurate
self.assertTrue(np.linalg.norm(pdf_phi - kde(phi)) / np.linalg.norm(kde(phi)) <= 1e-1)
# Check the gradients at a few random points
for i in range(10):
m.theta = theta_s[i]
self.assertTrue(m.checkgrad(verbose=True))

def test_Logexp(self):
self._test_trans(GPy.constraints.Logexp())
self._test_trans(GPy.constraints.Exponent())


if __name__ == '__main__':
unittest.main()
quit()
m = TestModel()
prior = GPy.priors.LogGaussian(0., .9)
m.theta.set_prior(prior)

# The following should return the PDF in terms of the transformed quantities
p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])

# Let's look at the transformation phi = log(exp(theta - 1))
trans = GPy.constraints.Exponent()
m.theta.constrain(trans)
# Plot the transformed probability density
phi = np.linspace(-8, 8, 100)
fig, ax = plt.subplots()
# Let's draw some samples of theta and transform them so that we see
# which one is right
theta_s = prior.rvs(10000)
# Transform it to the new variables
phi_s = trans.finv(theta_s)
# And draw their histogram
ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Empirical')
# This is to be compared to the PDF of the model expressed in terms of these new
# variables
ax.plot(phi, [p_phi(p) for p in phi], label='Transformed PDF', linewidth=2)
ax.set_xlim(-3, 10)
ax.set_xlabel(r'transformed $\theta$', fontsize=16)
ax.set_ylabel('PDF', fontsize=16)
plt.legend(loc='best')
# Now let's test the gradients
m.checkgrad(verbose=True)
# And show the plot
plt.show(block=True)
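What the new test checks is the standard change-of-variables identity: if theta = t(phi) has prior density p_theta, the density seen in the transformed space is p_phi(phi) = p_theta(t(phi)) * |dt/dphi|. For a transformation like Exponent (theta = exp(phi)) this can be verified against samples with SciPy alone; a rough sketch with an illustrative lognormal prior (its parameters are not meant to reproduce GPy's LogGaussian parameterization):

import numpy as np
import scipy.stats as st

prior = st.lognorm(s=0.1, scale=np.exp(0.5))      # log(theta) ~ N(0.5, 0.1^2)

theta_s = prior.rvs(size=100000, random_state=0)
phi_s = np.log(theta_s)                            # phi = log(theta), i.e. theta = exp(phi)

phi = np.linspace(phi_s.min(), phi_s.max(), 100)
pdf_phi = prior.pdf(np.exp(phi)) * np.exp(phi)     # p_phi(phi) = p_theta(exp(phi)) * |d exp(phi)/d phi|

kde = st.gaussian_kde(phi_s)                       # empirical density of the transformed samples
rel_err = np.linalg.norm(pdf_phi - kde(phi)) / np.linalg.norm(kde(phi))
assert rel_err <= 1e-1                             # same loose tolerance as the unit test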
@ -15,5 +15,5 @@ from . import caching
from . import diag
from . import initialization
from . import multioutput
from . import linalg_gpu
from . import parallel
Some files were not shown because too many files have changed in this diff